From 7dafbab3753fcf59bc81748e5b2c5bf04e1c62c7 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 12 May 2017 09:19:55 -0700 Subject: IB/hfi1: Add functions to parse BTH/IB headers Improve code readability by adding inline functions to read specific BTH/IB fields without knowledge of byte offsets. Reviewed-by: Brian Welty Reviewed-by: Dasaratharaman Chandramouli Reviewed-by: Dennis Dalessandro Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/ib_hdrs.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 2 ++ include/rdma/rdmavt_qp.h | 2 +- 3 files changed, 87 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 5519f31f043a..c124d515f7d5 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -193,8 +193,12 @@ static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) #define IB_LNH_MASK 3 #define IB_SC_MASK 0xf #define IB_SC_SHIFT 12 +#define IB_SC5_MASK 0x10 #define IB_SL_MASK 0xf #define IB_SL_SHIFT 4 +#define IB_SL_SHIFT 4 +#define IB_LVER_MASK 0xf +#define IB_LVER_SHIFT 8 static inline u8 ib_get_lnh(struct ib_header *hdr) { @@ -206,6 +210,11 @@ static inline u8 ib_get_sc(struct ib_header *hdr) return ((be16_to_cpu(hdr->lrh[0]) >> IB_SC_SHIFT) & IB_SC_MASK); } +static inline bool ib_is_sc5(u16 sc5) +{ + return !!(sc5 & IB_SC5_MASK); +} + static inline u8 ib_get_sl(struct ib_header *hdr) { return ((be16_to_cpu(hdr->lrh[0]) >> IB_SL_SHIFT) & IB_SL_MASK); @@ -221,6 +230,27 @@ static inline u16 ib_get_slid(struct ib_header *hdr) return (be16_to_cpu(hdr->lrh[3])); } +static inline u8 ib_get_lver(struct ib_header *hdr) +{ + return (u8)((be16_to_cpu(hdr->lrh[0]) >> IB_LVER_SHIFT) & + IB_LVER_MASK); +} + +static inline u16 ib_get_len(struct ib_header *hdr) +{ + return (u16)(be16_to_cpu(hdr->lrh[2])); +} + +static inline u32 ib_get_qkey(struct ib_other_headers *ohdr) +{ + return be32_to_cpu(ohdr->u.ud.deth[0]); +} + +static inline u32 ib_get_sqpn(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->u.ud.deth[1])) & IB_QPN_MASK); +} + /* * BTH */ @@ -229,6 +259,14 @@ static inline u16 ib_get_slid(struct ib_header *hdr) #define IB_BTH_PAD_MASK 3 #define IB_BTH_PKEY_MASK 0xffff #define IB_BTH_PAD_SHIFT 20 +#define IB_BTH_A_MASK 1 +#define IB_BTH_A_SHIFT 31 +#define IB_BTH_M_MASK 1 +#define IB_BTH_M_SHIFT 22 +#define IB_BTH_SE_MASK 1 +#define IB_BTH_SE_SHIFT 23 +#define IB_BTH_TVER_MASK 0xf +#define IB_BTH_TVER_SHIFT 16 static inline u8 ib_bth_get_pad(struct ib_other_headers *ohdr) { @@ -247,4 +285,50 @@ static inline u8 ib_bth_get_opcode(struct ib_other_headers *ohdr) IB_BTH_OPCODE_MASK); } +static inline u8 ib_bth_get_ackreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[2]) >> IB_BTH_A_SHIFT) & + IB_BTH_A_MASK); +} + +static inline u8 ib_bth_get_migreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_M_SHIFT) & + IB_BTH_M_MASK); +} + +static inline u8 ib_bth_get_se(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_SE_SHIFT) & + IB_BTH_SE_MASK); +} + +static inline u32 ib_bth_get_psn(struct ib_other_headers *ohdr) +{ + return (u32)(be32_to_cpu(ohdr->bth[2])); +} + +static inline u32 ib_bth_get_qpn(struct ib_other_headers *ohdr) +{ + return (u32)((be32_to_cpu(ohdr->bth[1])) & IB_QPN_MASK); +} + +static inline u8 ib_bth_get_becn(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[1]) >> IB_BECN_SHIFT) & + IB_BECN_MASK); +} + +static inline u8 ib_bth_get_fecn(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[1]) >> IB_FECN_SHIFT) & + IB_FECN_MASK); +} + +static inline u8 ib_bth_get_tver(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_TVER_SHIFT) & + IB_BTH_TVER_MASK); +} + #endif /* IB_HDRS_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ba8314ec5768..8f1ce4e27bbd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -664,6 +664,8 @@ union rdma_network_hdr { }; }; +#define IB_QPN_MASK 0xFFFFFF + enum { IB_MULTICAST_QPN = 0xffffff }; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index be6472e5b06b..13f43b3527a8 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -396,7 +396,7 @@ struct rvt_srq { #define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) #define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) #define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) -#define RVT_QPN_MASK 0xFFFFFF +#define RVT_QPN_MASK IB_QPN_MASK /* * QPN-map pages start out as NULL, they get allocated upon -- cgit v1.2.3
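For illustration, a minimal sketch of how the new accessors are meant to be used on the receive path instead of open-coded shifts and masks. The function below is hypothetical and not part of the patch; only the ib_get_*()/ib_bth_get_*() helpers are from ib_hdrs.h:

/* Hypothetical receive-path helper built on the new accessors. */
static void example_classify_pkt(struct ib_header *hdr,
				 struct ib_other_headers *ohdr)
{
	u8 sc = ib_get_sc(hdr);		/* was (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf */
	u8 lver = ib_get_lver(hdr);
	u32 qpn = ib_bth_get_qpn(ohdr);	/* was be32_to_cpu(ohdr->bth[1]) & 0xffffff */

	if (ib_bth_get_fecn(ohdr))
		pr_debug("FECN set: qpn 0x%x psn 0x%x sc %u lver %u\n",
			 qpn, ib_bth_get_psn(ohdr), sc, lver);
}
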
From 14fe13fcd3afb96b06809f280b586be1c998332c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 12 May 2017 09:20:31 -0700 Subject: IB/rdmavt: Compress adjacent SGEs in rvt_lkey_ok() SGEs that are contiguous needlessly consume driver-dependent TX resources. The lkey validation logic is enhanced to compress the SGE that ends up in the send wqe when consecutive addresses are detected. The lkey validation API used to return 1 (success) or 0 (fail). The return value is now an -errno, 0 (compressed), or 1 (uncompressed). An additional argument is added to pass the last SGE for the compression. Loopback callers always pass a NULL to last_sge since the optimization is of little benefit in that situation. Reviewed-by: Dennis Dalessandro Signed-off-by: Brian Welty Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ruc.c | 2 +- drivers/infiniband/hw/qib/qib_ruc.c | 2 +- drivers/infiniband/sw/rdmavt/mr.c | 51 +++++++++++++++++++++++---- drivers/infiniband/sw/rdmavt/qp.c | 23 ++++++------ drivers/infiniband/sw/rdmavt/trace_mr.h | 62 +++++++++++++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/trace_tx.h | 11 +++--- include/rdma/rdma_vt.h | 3 +- 7 files changed, 130 insertions(+), 24 deletions(-) (limited to 'include')
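Before the diffs, the compression test itself can be summarized in isolation. Below is a simplified model of what rvt_sge_adjacent() checks, with made-up types; the MR-overrun handling and tracing done by the real code are omitted:

/* Fold a new (addr, length) entry into the previous one when it uses
 * the same lkey and starts exactly where the previous entry ends.
 */
struct ex_sge { u64 addr; u32 length; u32 lkey; };

static int ex_try_compress(struct ex_sge *last, const struct ex_sge *sge)
{
	if (last && last->lkey == sge->lkey &&
	    last->addr + last->length == sge->addr) {
		last->length += sge->length;	/* compressed: reuse last entry */
		return 1;
	}
	return 0;				/* caller must append a new entry */
}
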
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 9cc9c7be9dd4..476fe5da2992 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -75,7 +75,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + NULL, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index bd09de7c6e56..88d84cbf7e5a 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -59,7 +59,7 @@ static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + NULL, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index aa5f9ea318e4..ea95672d9675 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -777,24 +777,55 @@ out: return ret; } +/** + * rvt_sge_adjacent - is isge compressible + * @isge: outgoing internal SGE + * @last_sge: last outgoing SGE written + * @sge: SGE to check + * + * If adjacent, will update last_sge to add the length. + * + * Return: true if isge is adjacent to last sge + */ +static inline bool rvt_sge_adjacent(struct rvt_sge *isge, + struct rvt_sge *last_sge, + struct ib_sge *sge) +{ + if (last_sge && sge->lkey == last_sge->mr->lkey && + ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) { + if (sge->lkey) { + if (unlikely((sge->addr - last_sge->mr->user_base + + sge->length > last_sge->mr->length))) + return false; /* overrun, caller will catch */ + } else { + last_sge->length += sge->length; + } + last_sge->sge_length += sge->length; + trace_rvt_sge_adjacent(last_sge, sge); + return true; + } + return false; +} + /** * rvt_lkey_ok - check IB SGE for validity and initialize * @rkt: table containing lkey to check SGE against * @pd: protection domain * @isge: outgoing internal SGE + * @last_sge: last outgoing SGE written * @sge: SGE to check * @acc: access flags * * Check the IB SGE for validity and initialize our internal version * of it. * - * Return: 1 if valid and successful, otherwise returns 0. - * - * increments the reference count upon success - * + * Increments the reference count when a new sge is stored. * + * Return: 0 if compressed, 1 if added, otherwise returns -errno. */ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, - struct rvt_sge *isge, struct ib_sge *sge, int acc) + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc) { struct rvt_mregion *mr; unsigned n, m; @@ -804,12 +835,14 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, * We use LKEY == zero for kernel virtual addresses * (see rvt_get_dma_mr() and dma_virt_ops).
*/ - rcu_read_lock(); if (sge->lkey == 0) { struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device); if (pd->user) - goto bail; + return -EINVAL; + if (rvt_sge_adjacent(isge, last_sge, sge)) + return 0; + rcu_read_lock(); mr = rcu_dereference(dev->dma_mr); if (!mr) goto bail; @@ -824,6 +857,9 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, isge->n = 0; goto ok; } + if (rvt_sge_adjacent(isge, last_sge, sge)) + return 0; + rcu_read_lock(); mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]); if (!mr) goto bail; @@ -874,12 +910,13 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, isge->m = m; isge->n = n; ok: + trace_rvt_sge_new(isge, sge); return 1; bail_unref: rvt_put_mr(mr); bail: rcu_read_unlock(); - return 0; + return -EINVAL; } EXPORT_SYMBOL(rvt_lkey_ok); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 727e81cc2c8f..a3dd1e536860 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1646,7 +1646,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct rvt_pd *pd; struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); u8 log_pmtu; - int ret; + int ret, incr; size_t cplen; bool reserved_op; int local_ops_delayed = 0; @@ -1719,22 +1719,23 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->length = 0; j = 0; if (wr->num_sge) { + struct rvt_sge *last_sge = NULL; + acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0; for (i = 0; i < wr->num_sge; i++) { u32 length = wr->sg_list[i].length; - int ok; if (length == 0) continue; - ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], - &wr->sg_list[i], acc); - if (!ok) { - ret = -EINVAL; - goto bail_inval_free; - } + incr = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge, + &wr->sg_list[i], acc); + if (unlikely(incr < 0)) + goto bail_lkey_error; wqe->length += length; - j++; + if (incr) + last_sge = &wqe->sg_list[j]; + j += incr; } wqe->wr.num_sge = j; } @@ -1781,12 +1782,14 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; qp->s_avail--; } - trace_rvt_post_one_wr(qp, wqe); + trace_rvt_post_one_wr(qp, wqe, wr->num_sge); smp_wmb(); /* see request builders */ qp->s_head = next; return 0; +bail_lkey_error: + ret = incr; bail_inval_free: /* release mr holds */ while (j) { diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 3318a6c36373..976e482930a3 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -103,6 +103,68 @@ DEFINE_EVENT( TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), TP_ARGS(mr, m, n, v, len)); +DECLARE_EVENT_CLASS( + rvt_sge_template, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(sge->mr->pd->device)) + __field(struct rvt_mregion *, mr) + __field(struct rvt_sge *, sge) + __field(struct ib_sge *, isge) + __field(void *, vaddr) + __field(u64, ivaddr) + __field(u32, lkey) + __field(u32, sge_length) + __field(u32, length) + __field(u32, ilength) + __field(int, user) + __field(u16, m) + __field(u16, n) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(sge->mr->pd->device)); + __entry->mr = sge->mr; + __entry->sge = sge; + __entry->isge = isge; + __entry->vaddr = sge->vaddr; + __entry->ivaddr = isge->addr; + __entry->lkey = sge->mr->lkey; + __entry->sge_length = sge->sge_length; + __entry->length = sge->length; + __entry->ilength = isge->length; + __entry->m = sge->m; + __entry->n = 
sge->n; + __entry->user = ibpd_to_rvtpd(sge->mr->pd)->user; + ), + TP_printk( + "[%s] mr %p sge %p isge %p vaddr %p ivaddr %llx lkey %x sge_length %u length %u ilength %u m %u n %u user %u", + __get_str(dev), + __entry->mr, + __entry->sge, + __entry->isge, + __entry->vaddr, + __entry->ivaddr, + __entry->lkey, + __entry->sge_length, + __entry->length, + __entry->ilength, + __entry->m, + __entry->n, + __entry->user + ) +); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_adjacent, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_new, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + #endif /* __RVT_TRACE_MR_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h index a613a2223751..0ef25fc49f25 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -84,12 +84,12 @@ __print_symbolic(opcode, \ wr_opcode_name(RESERVED10)) #define POS_PRN \ -"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u" +"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u wr_num_sge %u" TRACE_EVENT( rvt_post_one_wr, - TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe), - TP_ARGS(qp, wqe), + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, int wr_num_sge), + TP_ARGS(qp, wqe, wr_num_sge), TP_STRUCT__entry( RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) __field(u64, wr_id) @@ -108,6 +108,7 @@ TRACE_EVENT( __field(int, send_flags) __field(pid_t, pid) __field(int, num_sge) + __field(int, wr_num_sge) ), TP_fast_assign( RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) @@ -127,6 +128,7 @@ TRACE_EVENT( __entry->ssn = wqe->ssn; __entry->send_flags = wqe->wr.send_flags; __entry->num_sge = wqe->wr.num_sge; + __entry->wr_num_sge = wr_num_sge; ), TP_printk( POS_PRN, @@ -146,7 +148,8 @@ TRACE_EVENT( __entry->head, __entry->last, __entry->pid, - __entry->num_sge + __entry->num_sge, + __entry->wr_num_sge ) ); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 4878aaf7bdff..d0b9f91e5f4d 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -515,7 +515,8 @@ int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, - struct rvt_sge *isge, struct ib_sge *sge, int acc); + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc); struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid, u16 lid); -- cgit v1.2.3 From cb49366f3616fdf197893c24a5b2677b8c26ce29 Mon Sep 17 00:00:00 2001 From: "Vishwanathapura, Niranjana" Date: Thu, 1 Jun 2017 17:04:02 -0700 Subject: IB/core,rdmavt,hfi1,opa-vnic: Send OPA cap_mask3 in trap Provide the ability for IB clients to modify the OPA specific capability mask and include this mask in the subsequent trap data. Reviewed-by: Niranjana Vishwanathapura Signed-off-by: Michael N.
Henry Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 7 ++----- drivers/infiniband/hw/hfi1/mad.h | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 6 +++++- drivers/infiniband/sw/rdmavt/vt.c | 9 +++++++-- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 27 ++++++++++++++++++++++++- include/rdma/ib_verbs.h | 3 ++- include/rdma/rdma_vt.h | 1 + 7 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5977673a52d4..70831ad621b0 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -260,6 +260,7 @@ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) data.issuer_lid = cpu_to_be32(lid); data.ntc_144.lid = data.issuer_lid; data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); send_trap(ibp, &data, sizeof(data)); } @@ -704,11 +705,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; pi->buffer_units = cpu_to_be32(buffer_units); - pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported | - OPA_CAP_MASK3_IsEthOnFabricSupported); - /* Driver does not support mcast/collective configuration */ - pi->opa_cap_mask &= - cpu_to_be16(~OPA_CAP_MASK3_IsAddrRangeConfigSupported); + pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7) << 3 | (HFI1_MCAST_NR & 0x7)); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index 5aa3fd1be653..a4e2506bd5ca 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -115,7 +115,7 @@ struct opa_mad_notice_attr { __be32 lid; /* LID where change occurred */ __be32 new_cap_mask; /* new capability mask */ __be16 reserved2; - __be16 cap_mask; + __be16 cap_mask3; __be16 change_flags; /* low 4 bits only */ } __packed ntc_144; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index af54d3f4696a..2d7759f0c6b4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1537,9 +1537,13 @@ static void init_ibport(struct hfi1_pportdata *ppd) /* Set the prefix to the default value (see ch. 
4.1.1) */ ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; ibp->rvp.sm_lid = 0; - /* Below should only set bits defined in OPA PortInfo.CapabilityMask */ + /* + * Below should only set bits defined in OPA PortInfo.CapabilityMask + * and PortInfo.CapabilityMask3 + */ ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported; ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 0d7c6bb551d9..64bdd442078a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -202,8 +202,13 @@ static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, return -EINVAL; rvp = rdi->ports[port_index]; - rvp->port_cap_flags |= props->set_port_cap_mask; - rvp->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_OPA_MASK_CHG) { + rvp->port_cap3_flags |= props->set_port_cap_mask; + rvp->port_cap3_flags &= ~props->clr_port_cap_mask; + } else { + rvp->port_cap_flags |= props->set_port_cap_mask; + rvp->port_cap_flags &= ~props->clr_port_cap_mask; + } if (props->set_port_cap_mask || props->clr_port_cap_mask) rdi->driver_f.cap_mask_chg(rdi, port_num); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 875694f9a7f9..32cdd7a35415 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -52,7 +52,9 @@ #include #include -#include +#include +#include +#include #include "opa_vnic_internal.h" @@ -979,6 +981,27 @@ static int vema_register(struct opa_vnic_ctrl_port *cport) return 0; } +/** + * opa_vnic_ctrl_config_dev -- This function sends a trap to the EM + * by way of ib_modify_port to indicate support for ethernet on the + * fabric. 
+ * @cport: pointer to control port + * @en: enable or disable ethernet on fabric support + */ +static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en) +{ + struct ib_port_modify pm = { 0 }; + int i; + + if (en) + pm.set_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + else + pm.clr_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + + for (i = 1; i <= cport->num_ports; i++) + ib_modify_port(cport->ibdev, i, IB_PORT_OPA_MASK_CHG, &pm); +} + /** * opa_vnic_vema_add_one -- Handle new ib device * @device: ib device pointer @@ -1007,6 +1030,7 @@ static void opa_vnic_vema_add_one(struct ib_device *device) c_info("VNIC client initialized\n"); ib_set_client_data(device, &opa_vnic_client, cport); + opa_vnic_ctrl_config_dev(cport, true); } /** @@ -1025,6 +1049,7 @@ static void opa_vnic_vema_rem_one(struct ib_device *device, return; c_info("removing VNIC client\n"); + opa_vnic_ctrl_config_dev(cport, false); vema_unregister(cport); kfree(cport); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8f1ce4e27bbd..9d4d2a74c95e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -577,7 +577,8 @@ struct ib_device_modify { enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), - IB_PORT_RESET_QKEY_CNTR = (1<<3) + IB_PORT_RESET_QKEY_CNTR = (1<<3), + IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index d0b9f91e5f4d..0f18ffd98dd7 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -75,6 +75,7 @@ struct rvt_ibport { __be64 mkey; u64 tid; u32 port_cap_flags; + u16 port_cap3_flags; u32 pma_sample_start; u32 pma_sample_interval; __be16 pma_counter_select[5]; -- cgit v1.2.3 From 250bbbdbed93ca27c9f4d445c960a87d9f7e2044 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Jun 2017 21:52:08 +0200 Subject: iio: common: st_sensors: move st_sensors_of_i2c_probe() in common code Move st_sensors_of_i2c_probe() into st_sensors_core and rename it st_sensors_of_name_probe(). That change is necessary to add device-tree support in the spi code; otherwise the rest of the autodetection will fail, since spi->modalias (and indio_dev->name) would be set from the compatible string value, which differs from the standard sensor name. Signed-off-by: Lorenzo Bianconi Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_i2c.c | 3 ++- drivers/iio/common/st_sensors/st_sensors_core.c | 31 +++++++++++++++++++++++++ drivers/iio/common/st_sensors/st_sensors_i2c.c | 29 ----------------------- drivers/iio/gyro/st_gyro_i2c.c | 3 ++- drivers/iio/magnetometer/st_magn_i2c.c | 3 ++- drivers/iio/pressure/st_pressure_i2c.c | 3 ++- include/linux/iio/common/st_sensors.h | 12 ++++++++++ include/linux/iio/common/st_sensors_i2c.h | 10 -------- 8 files changed, 51 insertions(+), 43 deletions(-) (limited to 'include')
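For context, the point of moving the helper into the core is that an SPI probe can now call it the same way the I2C probes below do. A hypothetical SPI caller might look like this (st_example_of_match and the function name are made up for illustration):

static int st_example_spi_probe(struct spi_device *spi)
{
	/* Rename the device to the canonical sensor name before the
	 * rest of the autodetection runs.
	 */
	st_sensors_of_name_probe(&spi->dev, st_example_of_match,
				 spi->modalias, sizeof(spi->modalias));
	/* ... remainder of the probe ... */
	return 0;
}
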
diff --git a/drivers/iio/accel/st_accel_i2c.c b/drivers/iio/accel/st_accel_i2c.c index 543f0ad7fd7e..ac67826135be 100644 --- a/drivers/iio/accel/st_accel_i2c.c +++ b/drivers/iio/accel/st_accel_i2c.c @@ -144,7 +144,8 @@ static int st_accel_i2c_probe(struct i2c_client *client, adata = iio_priv(indio_dev); if (client->dev.of_node) { - st_sensors_of_i2c_probe(client, st_accel_of_match); + st_sensors_of_name_probe(&client->dev, st_accel_of_match, + client->name, sizeof(client->name)); } else if (ACPI_HANDLE(&client->dev)) { ret = st_sensors_match_acpi_device(&client->dev); if ((ret < 0) || (ret >= ST_ACCEL_MAX)) diff --git a/drivers/iio/common/st_sensors/st_sensors_core.c b/drivers/iio/common/st_sensors/st_sensors_core.c index 79c8c7cd70d5..274868100fd0 100644 --- a/drivers/iio/common/st_sensors/st_sensors_core.c +++ b/drivers/iio/common/st_sensors/st_sensors_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -345,6 +346,36 @@ static struct st_sensors_platform_data *st_sensors_of_probe(struct device *dev, return pdata; } + +/** + * st_sensors_of_name_probe() - device tree probe for ST sensor name + * @dev: driver model representation of the device. + * @match: the OF match table for the device, containing compatible strings + * but also a .data field with the corresponding internal kernel name + * used by this sensor. + * @name: device name buffer reference. + * @len: device name buffer length. + * + * In effect this function matches a compatible string to an internal kernel + * name for a certain sensor device, so that the rest of the autodetection can + * rely on that name from this point on. I2C/SPI devices will be renamed + * to match the internal kernel convention.
+ */ +void st_sensors_of_name_probe(struct device *dev, + const struct of_device_id *match, + char *name, int len) +{ + const struct of_device_id *of_id; + + of_id = of_match_device(match, dev); + if (!of_id || !of_id->data) + return; + + /* The name from the OF match takes precedence if present */ + strncpy(name, of_id->data, len); + name[len - 1] = '\0'; +} +EXPORT_SYMBOL(st_sensors_of_name_probe); #else static struct st_sensors_platform_data *st_sensors_of_probe(struct device *dev, struct st_sensors_platform_data *defdata) diff --git a/drivers/iio/common/st_sensors/st_sensors_i2c.c b/drivers/iio/common/st_sensors/st_sensors_i2c.c index c83df4dbfcd7..b81e48e9f27e 100644 --- a/drivers/iio/common/st_sensors/st_sensors_i2c.c +++ b/drivers/iio/common/st_sensors/st_sensors_i2c.c @@ -79,35 +79,6 @@ void st_sensors_i2c_configure(struct iio_dev *indio_dev, } EXPORT_SYMBOL(st_sensors_i2c_configure); -#ifdef CONFIG_OF -/** - * st_sensors_of_i2c_probe() - device tree probe for ST I2C sensors - * @client: the I2C client device for the sensor - * @match: the OF match table for the device, containing compatible strings - * but also a .data field with the corresponding internal kernel name - * used by this sensor. - * - * In effect this function matches a compatible string to an internal kernel - * name for a certain sensor device, so that the rest of the autodetection can - * rely on that name from this point on. I2C client devices will be renamed - * to match the internal kernel convention. - */ -void st_sensors_of_i2c_probe(struct i2c_client *client, - const struct of_device_id *match) -{ - const struct of_device_id *of_id; - - of_id = of_match_device(match, &client->dev); - if (!of_id) - return; - - /* The name from the OF match takes precedence if present */ - strncpy(client->name, of_id->data, sizeof(client->name)); - client->name[sizeof(client->name) - 1] = '\0'; -} -EXPORT_SYMBOL(st_sensors_of_i2c_probe); -#endif - #ifdef CONFIG_ACPI int st_sensors_match_acpi_device(struct device *dev) { diff --git a/drivers/iio/gyro/st_gyro_i2c.c b/drivers/iio/gyro/st_gyro_i2c.c index 3f628746cb93..b405b82b9177 100644 --- a/drivers/iio/gyro/st_gyro_i2c.c +++ b/drivers/iio/gyro/st_gyro_i2c.c @@ -75,7 +75,8 @@ static int st_gyro_i2c_probe(struct i2c_client *client, return -ENOMEM; gdata = iio_priv(indio_dev); - st_sensors_of_i2c_probe(client, st_gyro_of_match); + st_sensors_of_name_probe(&client->dev, st_gyro_of_match, + client->name, sizeof(client->name)); st_sensors_i2c_configure(indio_dev, client, gdata); diff --git a/drivers/iio/magnetometer/st_magn_i2c.c b/drivers/iio/magnetometer/st_magn_i2c.c index 8aa37af306ed..6a6c8121ac2c 100644 --- a/drivers/iio/magnetometer/st_magn_i2c.c +++ b/drivers/iio/magnetometer/st_magn_i2c.c @@ -59,7 +59,8 @@ static int st_magn_i2c_probe(struct i2c_client *client, return -ENOMEM; mdata = iio_priv(indio_dev); - st_sensors_of_i2c_probe(client, st_magn_of_match); + st_sensors_of_name_probe(&client->dev, st_magn_of_match, + client->name, sizeof(client->name)); st_sensors_i2c_configure(indio_dev, client, mdata); diff --git a/drivers/iio/pressure/st_pressure_i2c.c b/drivers/iio/pressure/st_pressure_i2c.c index 17417a4d5a5f..7f15e927fa2b 100644 --- a/drivers/iio/pressure/st_pressure_i2c.c +++ b/drivers/iio/pressure/st_pressure_i2c.c @@ -77,7 +77,8 @@ static int st_press_i2c_probe(struct i2c_client *client, press_data = iio_priv(indio_dev); if (client->dev.of_node) { - st_sensors_of_i2c_probe(client, st_press_of_match); + st_sensors_of_name_probe(&client->dev, st_press_of_match, + 
client->name, sizeof(client->name)); } else if (ACPI_HANDLE(&client->dev)) { ret = st_sensors_match_acpi_device(&client->dev); if ((ret < 0) || (ret >= ST_PRESS_MAX)) diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 497f2b3a5a62..1f8211b6438b 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -325,4 +325,16 @@ ssize_t st_sensors_sysfs_sampling_frequency_avail(struct device *dev, ssize_t st_sensors_sysfs_scale_avail(struct device *dev, struct device_attribute *attr, char *buf); +#ifdef CONFIG_OF +void st_sensors_of_name_probe(struct device *dev, + const struct of_device_id *match, + char *name, int len); +#else +static inline void st_sensors_of_name_probe(struct device *dev, + const struct of_device_id *match, + char *name, int len) +{ +} +#endif + #endif /* ST_SENSORS_H */ diff --git a/include/linux/iio/common/st_sensors_i2c.h b/include/linux/iio/common/st_sensors_i2c.h index 254de3c7dde8..0a2c25e06d1f 100644 --- a/include/linux/iio/common/st_sensors_i2c.h +++ b/include/linux/iio/common/st_sensors_i2c.h @@ -18,16 +18,6 @@ void st_sensors_i2c_configure(struct iio_dev *indio_dev, struct i2c_client *client, struct st_sensor_data *sdata); -#ifdef CONFIG_OF -void st_sensors_of_i2c_probe(struct i2c_client *client, - const struct of_device_id *match); -#else -static inline void st_sensors_of_i2c_probe(struct i2c_client *client, - const struct of_device_id *match) -{ -} -#endif - #ifdef CONFIG_ACPI int st_sensors_match_acpi_device(struct device *dev); #else -- cgit v1.2.3 From 5ea0727b163cb5575e36397a12eade68a1f35f24 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Wed, 14 Jun 2017 18:12:01 -0700 Subject: x86/syscalls: Check address limit on user-mode return Ensure the address limit is a user-mode segment before returning to user-mode. Otherwise a process can corrupt kernel-mode memory and elevate privileges [1]. The set_fs function sets the TIF_SETFS flag to force a slow path on return. In the slow path, the address limit is checked to be USER_DS if needed. The addr_limit_user_check function is added as a cross-architecture function to check the address limit. 
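The resulting pattern can be summarized in a short model. This is a simplified sketch of the mechanism, not the patch code itself; in the patch the arming happens in x86's set_fs() and the verification in prepare_exit_to_usermode() via addr_limit_user_check():

/* set_fs() arms a thread flag; the slow user-return path verifies the
 * address limit is back to USER_DS and clears the flag again.
 */
static inline void model_set_fs(mm_segment_t fs)
{
	current->thread.addr_limit = fs;
	set_thread_flag(TIF_FSCHECK);		/* force the slow return path */
}

static inline void model_user_return_check(void)
{
	if (test_and_clear_thread_flag(TIF_FSCHECK))
		BUG_ON(!segment_eq(get_fs(), USER_DS));
}
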
[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990 Signed-off-by: Thomas Garnier Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: kernel-hardening@lists.openwall.com Cc: Catalin Marinas Cc: Will Deacon Cc: David Howells Cc: Dave Hansen Cc: Miroslav Benes Cc: Chris Metcalf Cc: Pratyush Anand Cc: Russell King Cc: Petr Mladek Cc: Rik van Riel Cc: Kees Cook Cc: Arnd Bergmann Cc: Al Viro Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: linux-arm-kernel@lists.infradead.org Cc: Will Drewry Cc: linux-api@vger.kernel.org Cc: Oleg Nesterov Cc: Andy Lutomirski Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/20170615011203.144108-1-thgarnie@google.com --- arch/x86/entry/common.c | 3 +++ arch/x86/include/asm/thread_info.h | 5 ++++- arch/x86/include/asm/uaccess.h | 7 ++++++- include/linux/syscalls.h | 16 ++++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cdefcfdd9e63..03505ffbe1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..5161da1a0fa0 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +123,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). 
Also includes TIF_NOHZ for @@ -137,7 +139,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a059aac9e937..11433f9018e2 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9b06f8..ac0cf6fb25d6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -206,6 +206,22 @@ extern struct trace_event_functions exit_syscall_print_funcs; } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#ifdef TIF_FSCHECK +/* + * Called before coming back to user-mode. Returning to user-mode with an + * address limit different than USER_DS can allow to overwrite kernel memory. + */ +static inline void addr_limit_user_check(void) +{ + + if (!test_thread_flag(TIF_FSCHECK)) + return; + + BUG_ON(!segment_eq(get_fs(), USER_DS)); + clear_thread_flag(TIF_FSCHECK); +} +#endif + asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); asmlinkage long sys_time(time_t __user *tloc); -- cgit v1.2.3 From 9b1404c24a357332cb2a6df7c4337e943a4545fd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Jul 2017 10:34:35 -0400 Subject: msgrcv(2), msgsnd(2): move compat to native Signed-off-by: Al Viro --- include/linux/msg.h | 8 -------- ipc/compat.c | 37 ------------------------------------- ipc/msg.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/msg.h b/include/linux/msg.h index f3f302f9c197..4e5ec3cbf464 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -31,12 +31,4 @@ struct msg_queue { struct list_head q_senders; }; -/* Helper routines for sys_msgsnd and sys_msgrcv */ -extern long do_msgsnd(int msqid, long mtype, void __user *mtext, - size_t msgsz, int msgflg); -extern long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, - int msgflg, - long (*msg_fill)(void __user *, struct msg_msg *, - size_t)); - #endif /* _LINUX_MSG_H */ diff --git a/ipc/compat.c b/ipc/compat.c index 00c2e3beccc8..0586687c3e31 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -34,11 +34,6 @@ #include "util.h" -struct compat_msgbuf { - compat_long_t mtype; - char mtext[1]; -}; - int get_compat_ipc64_perm(struct ipc64_perm *to, struct compat_ipc64_perm __user *from) { @@ -85,38 +80,6 @@ void to_compat_ipc_perm(struct compat_ipc_perm *to, struct ipc64_perm *from) to->seq = from->seq; } -static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) -{ - struct compat_msgbuf __user *msgp = dest; - size_t msgsz; - - if (put_user(msg->m_type, &msgp->mtype)) - return -EFAULT; - - msgsz = (bufsz > msg->m_ts) ? 
msg->m_ts : bufsz; - if (store_msg(msgp->mtext, msg, msgsz)) - return -EFAULT; - return msgsz; -} - -COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, - compat_ssize_t, msgsz, int, msgflg) -{ - struct compat_msgbuf __user *up = compat_ptr(msgp); - compat_long_t mtype; - - if (get_user(mtype, &up->mtype)) - return -EFAULT; - return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg); -} - -COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, - compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg) -{ - return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp, - msgflg, compat_do_msg_fill); -} - #ifndef COMPAT_SHMLBA #define COMPAT_SHMLBA SHMLBA #endif diff --git a/ipc/msg.c b/ipc/msg.c index 94690fb53f66..855da19c765a 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -730,7 +730,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, return 0; } -long do_msgsnd(int msqid, long mtype, void __user *mtext, +static long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg) { struct msg_queue *msq; @@ -853,6 +853,25 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); } +#ifdef CONFIG_COMPAT + +struct compat_msgbuf { + compat_long_t mtype; + char mtext[1]; +}; + +COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, + compat_ssize_t, msgsz, int, msgflg) +{ + struct compat_msgbuf __user *up = compat_ptr(msgp); + compat_long_t mtype; + + if (get_user(mtype, &up->mtype)) + return -EFAULT; + return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg); +} +#endif + static inline int convert_mode(long *msgtyp, int msgflg) { if (msgflg & MSG_COPY) @@ -949,7 +968,7 @@ static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) return found ?: ERR_PTR(-EAGAIN); } -long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, +static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { int mode; @@ -1113,6 +1132,28 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } +#ifdef CONFIG_COMPAT +static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +{ + struct compat_msgbuf __user *msgp = dest; + size_t msgsz; + + if (put_user(msg->m_type, &msgp->mtype)) + return -EFAULT; + + msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; + if (store_msg(msgp->mtext, msg, msgsz)) + return -EFAULT; + return msgsz; +} + +COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, + compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg) +{ + return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp, + msgflg, compat_do_msg_fill); +} +#endif void msg_init_ns(struct ipc_namespace *ns) { -- cgit v1.2.3 From 44ee454670122a959112caaa7aad86d8cacab1ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Jul 2017 10:50:14 -0400 Subject: semtimedop(): move compat to native ... 
and finally kill the sodding compat_convert_timespec() Signed-off-by: Al Viro --- include/linux/compat.h | 9 --------- ipc/compat.c | 10 ---------- ipc/sem.c | 44 +++++++++++++++++++++++++++++++++----------- kernel/compat.c | 23 ----------------------- 4 files changed, 33 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 5a6a109b4a50..edae425ca8c0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -171,15 +171,6 @@ extern int get_compat_itimerspec64(struct itimerspec64 *its, extern int put_compat_itimerspec64(const struct itimerspec64 *its, struct compat_itimerspec __user *uits); -/* - * This function convert a timespec if necessary and returns a *user - * space* pointer. If no conversion is necessary, it returns the - * initial pointer. NULL is a legitimate argument and will always - * output NULL. - */ -extern int compat_convert_timespec(struct timespec __user **, - const void __user *); - struct compat_iovec { compat_uptr_t iov_base; compat_size_t iov_len; diff --git a/ipc/compat.c b/ipc/compat.c index 871d07da0a52..b17bf93d7b49 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -79,13 +79,3 @@ void to_compat_ipc_perm(struct compat_ipc_perm *to, struct ipc64_perm *from) to->mode = from->mode; to->seq = from->seq; } - -COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems, - unsigned, nsops, - const struct compat_timespec __user *, timeout) -{ - struct timespec __user *ts64; - if (compat_convert_timespec(&ts64, timeout)) - return -EFAULT; - return sys_semtimedop(semid, tsems, nsops, ts64); -} diff --git a/ipc/sem.c b/ipc/sem.c index fcf064d6046a..6b832b7fa9fc 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1855,8 +1855,8 @@ out: return un; } -SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, - unsigned, nsops, const struct timespec __user *, timeout) +static long do_semtimedop(int semid, struct sembuf __user *tsops, + unsigned nsops, const struct timespec *timeout) { int error = -EINVAL; struct sem_array *sma; @@ -1887,17 +1887,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, } if (timeout) { - struct timespec _timeout; - if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { - error = -EFAULT; - goto out_free; - } - if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 || - _timeout.tv_nsec >= 1000000000L) { + if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 || + timeout->tv_nsec >= 1000000000L) { error = -EINVAL; goto out_free; } - jiffies_left = timespec_to_jiffies(&_timeout); + jiffies_left = timespec_to_jiffies(timeout); } max = 0; @@ -2112,10 +2107,37 @@ out_free: return error; } +SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, + unsigned, nsops, const struct timespec __user *, timeout) +{ + if (timeout) { + struct timespec ts; + if (copy_from_user(&ts, timeout, sizeof(*timeout))) + return -EFAULT; + return do_semtimedop(semid, tsops, nsops, &ts); + } + return do_semtimedop(semid, tsops, nsops, NULL); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems, + unsigned, nsops, + const struct compat_timespec __user *, timeout) +{ + if (timeout) { + struct timespec ts; + if (compat_get_timespec(&ts, timeout)) + return -EFAULT; + return do_semtimedop(semid, tsems, nsops, &ts); + } + return do_semtimedop(semid, tsems, nsops, NULL); +} +#endif + SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, unsigned, nsops) { - return sys_semtimedop(semid, tsops, 
nsops, NULL); + return do_semtimedop(semid, tsops, nsops, NULL); } /* If CLONE_SYSVSEM is set, establish sharing of SEM_UNDO state between diff --git a/kernel/compat.c b/kernel/compat.c index 6f0a0e723a06..772e038d04d9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -200,29 +200,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int compat_convert_timespec(struct timespec __user **kts, - const void __user *cts) -{ - struct timespec ts; - struct timespec __user *uts; - - if (!cts || COMPAT_USE_64BIT_TIME) { - *kts = (struct timespec __user *)cts; - return 0; - } - - uts = compat_alloc_user_space(sizeof(ts)); - if (!uts) - return -EFAULT; - if (compat_get_timespec(&ts, cts)) - return -EFAULT; - if (copy_to_user(uts, &ts, sizeof(ts))) - return -EFAULT; - - *kts = uts; - return 0; -} - int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { struct compat_itimerval v32; -- cgit v1.2.3 From 9d5b86ac13c573795525ecac6ed2db39ab23e2a8 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sun, 16 Jul 2017 10:28:22 -0400 Subject: fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks Since commit c69899a17ca4 "NFSv4: Update of VFS byte range lock must be atomic with the stateid update", NFSv4 has been inserting locks in rpciod worker context. The result is that the file_lock's fl_nspid is the kworker's pid instead of the original userspace pid. The fl_nspid is only used to represent the namespaced virtual pid number when displaying locks or returning from F_GETLK. There's no reason to set it for every inserted lock, since we can usually just look it up from fl_pid. So, instead of looking up and holding struct pid for every lock, let's just look up the virtual pid number from fl_pid when it is needed. That means we can remove fl_nspid entirely. The translation and presentation of fl_pid should handle the following four cases: 1 - F_GETLK on a remote file with a remote lock: In this case, the filesystem should determine the l_pid to return here. Filesystems should indicate that the fl_pid represents a non-local pid value that should not be translated by returning an fl_pid <= 0. 2 - F_GETLK on a local file with a remote lock: This should be the l_pid of the lock manager process, and translated. 3 - F_GETLK on a remote file with a local lock, and 4 - F_GETLK on a local file with a local lock: These should be the translated l_pid of the local locking process. Fuse was already doing the correct thing by translating the pid into the caller's namespace. With this change we must update fuse to translate to init's pid namespace, so that the locks API can then translate from init's pid namespace into the pid namespace of the caller. With this change, the locks API will expect that if a filesystem returns a remote pid as opposed to a local pid for F_GETLK, that remote pid will be <= 0. This signifies that the pid is remote, and the locks API will forego translating that pid into the pid namespace of the local calling process. Finally, we convert remote filesystems to present remote pids using negative numbers. Have lustre, 9p, ceph, cifs, and dlm negate the remote pid returned for F_GETLK lock requests. Since local pids will never be larger than PID_MAX_LIMIT (which is currently defined as <= 4 million), and fl_pid is an unsigned int, we should have plenty of room to represent remote pids with negative numbers if we assume that remote pid numbers are similarly limited.
If this is not the case, then we run the risk of having a remote pid returned for which there is also a corresponding local pid. This is a problem we have now, but this patch should reduce the chances of that occurring, while also returning those remote pid numbers, for whatever that may be worth. Signed-off-by: Benjamin Coddington Signed-off-by: Jeff Layton --- drivers/staging/lustre/lustre/ldlm/ldlm_flock.c | 2 +- fs/9p/vfs_file.c | 2 +- fs/ceph/locks.c | 2 +- fs/cifs/cifssmb.c | 2 +- fs/dlm/plock.c | 2 +- fs/fuse/file.c | 6 +-- fs/locks.c | 62 +++++++++++++++---------- include/linux/fs.h | 1 - 8 files changed, 45 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c index b7f28b39c7b3..abcbf075acc0 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c @@ -596,7 +596,7 @@ granted: default: getlk->fl_type = F_UNLCK; } - getlk->fl_pid = (pid_t)lock->l_policy_data.l_flock.pid; + getlk->fl_pid = -(pid_t)lock->l_policy_data.l_flock.pid; getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; } else { diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3de3b4a89d89..43c242e17132 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -288,7 +288,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; - fl->fl_pid = glock.proc_id; + fl->fl_pid = -glock.proc_id; } kfree(glock.client_id); return res; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 64ae74472046..8cd63e8123d8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -79,7 +79,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { - fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 72a53bd19865..118a63e7e221 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2522,7 +2522,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + le64_to_cpu(parm_data->length) - 1; - pLockData->fl_pid = le32_to_cpu(parm_data->pid); + pLockData->fl_pid = -le32_to_cpu(parm_data->pid); } } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d401425f602a..e631b1689228 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -367,7 +367,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, locks_init_lock(fl); fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; fl->fl_flags = FL_POSIX; - fl->fl_pid = op->info.pid; + fl->fl_pid = -op->info.pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ee4fdc3da9e..7cd692f51d1d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2101,11 +2101,11 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, fl->fl_end = ffl->end; /* - * Convert pid into the caller's pid namespace. If the pid - * does not map into the namespace fl_pid will get set to 0. 
+ * Convert pid into init's pid namespace. The locks API will + * translate it into the caller's pid namespace. */ rcu_read_lock(); - fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; diff --git a/fs/locks.c b/fs/locks.c index d7daa6c8932f..6d0949880ebd 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -137,6 +137,7 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +#define IS_REMOTELCK(fl) (fl->fl_pid <= 0) static inline bool is_remote_lock(struct file *filp) { @@ -733,7 +734,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker) static void locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { - fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); locks_insert_global_locks(fl); } @@ -743,10 +743,6 @@ locks_unlink_lock_ctx(struct file_lock *fl) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); - if (fl->fl_nspid) { - put_pid(fl->fl_nspid); - fl->fl_nspid = NULL; - } locks_wake_up_blocks(fl); } @@ -823,8 +819,6 @@ posix_test_lock(struct file *filp, struct file_lock *fl) list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { if (posix_locks_conflict(fl, cfl)) { locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - fl->fl_pid = pid_vnr(cfl->fl_nspid); goto out; } } @@ -2048,9 +2042,33 @@ int vfs_test_lock(struct file *filp, struct file_lock *fl) } EXPORT_SYMBOL_GPL(vfs_test_lock); +/** + * locks_translate_pid - translate a file_lock's fl_pid number into a namespace + * @fl: The file_lock whose fl_pid should be translated + * @ns: The namespace into which the pid should be translated + * + * Used to translate a fl_pid into a namespace virtual pid number + */ +static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) +{ + pid_t vnr; + struct pid *pid; + + if (IS_OFDLCK(fl)) + return -1; + if (IS_REMOTELCK(fl)) + return fl->fl_pid; + + rcu_read_lock(); + pid = find_pid_ns(fl->fl_pid, &init_pid_ns); + vnr = pid_nr_ns(pid, ns); + rcu_read_unlock(); + return vnr; +} + static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -2072,7 +2090,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ?
0 : fl->fl_end - fl->fl_start + 1; @@ -2584,22 +2602,16 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, { struct inode *inode = NULL; unsigned int fl_pid; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - if (fl->fl_nspid) { - struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - - /* Don't let fl_pid change based on who is reading the file */ - fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); - - /* - * If there isn't a fl_pid don't display who is waiting on - * the lock if we are called from locks_show, or if we are - * called from __show_fd_info - skip lock entirely - */ - if (fl_pid == 0) - return; - } else - fl_pid = fl->fl_pid; + fl_pid = locks_translate_pid(fl, proc_pidns); + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; if (fl->fl_file != NULL) inode = locks_inode(fl->fl_file); @@ -2674,7 +2686,7 @@ static int locks_show(struct seq_file *f, void *v) fl = hlist_entry(v, struct file_lock, fl_link); - if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + if (locks_translate_pid(fl, proc_pidns) == 0) return 0; lock_get_status(f, fl, iter->li_pos, ""); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b5d6816542b..f0b108af9b02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -999,7 +999,6 @@ struct file_lock { unsigned char fl_type; unsigned int fl_pid; int fl_link_cpu; /* what cpu's list is this on? */ - struct pid *fl_nspid; wait_queue_head_t fl_wait; struct file *fl_file; loff_t fl_start; -- cgit v1.2.3 From ccdb2d17df9f07c291d43b0aeea7c90e4c020489 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Sat, 15 Jul 2017 19:40:20 +0200 Subject: ip6: fix PMTU discovery when using /127 subnets The definition of an "anycast destination address" has been tweaked as a side-effect of commit 2647a9b07032 ("ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST"). The first address of a point-to-point /127 subnet is now considered as an anycast address. This prevents ICMPv6 errors to be returned to a sender of such a subnet and breaks PMTU discovery. 
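The change itself is a single comparison: the predicate moves from "any prefix shorter than /128" to "any prefix of /126 or shorter". A standalone restatement of that test, with addr_matches_prefix standing in for the ipv6_addr_equal() check in the real code:

static bool model_is_anycast(u32 rt6i_flags, int plen, bool addr_matches_prefix)
{
	return (rt6i_flags & RTF_ANYCAST) ||
	       (plen < 127 && addr_matches_prefix);	/* was: plen != 128 */
}
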
This can be reproduced with: ip link add name out6 type veth peer name in6 ip link add name out7 type veth peer name in7 ip link set mtu 1400 dev out7 ip link set mtu 1400 dev in7 ip netns add next-hop ip netns add next-next-hop ip link set netns next-hop dev in6 ip link set netns next-hop dev out7 ip link set netns next-next-hop dev in7 ip link set up dev out6 ip addr add 2001:db8:1::12/127 dev out6 ip netns exec next-hop ip link set up dev in6 ip netns exec next-hop ip link set up dev out7 ip netns exec next-hop ip addr add 2001:db8:1::13/127 dev in6 ip netns exec next-hop ip addr add 2001:db8:1::14/127 dev out7 ip netns exec next-hop ip route add default via 2001:db8:1::15 ip netns exec next-hop sysctl -qw net.ipv6.conf.all.forwarding=1 ip netns exec next-next-hop ip link set up dev in7 ip netns exec next-next-hop ip addr add 2001:db8:1::15/127 dev in7 ip netns exec next-next-hop ip addr add 2001:db8:1::50/128 dev in7 ip netns exec next-next-hop ip route add default via 2001:db8:1::14 ip netns exec next-next-hop sysctl -qw net.ipv6.conf.all.forwarding=1 ip route add 2001:db8:1::48/123 via 2001:db8:1::13 sleep 4 ping -M do -s 1452 -c 3 2001:db8:1::50 || true ip route get 2001:db8:1::50 Before the patch, we get: 2001:db8:1::50 from :: via 2001:db8:1::13 dev out6 src 2001:db8:1::12 metric 1024 pref medium After the patch, we get: 2001:db8:1::50 via 2001:db8:1::13 dev out6 src 2001:db8:1::12 metric 0 cache expires 578sec mtu 1400 pref medium Fixes: 2647a9b07032 ("ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST") Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 199056933dcb..907d39a42f6b 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -194,7 +194,7 @@ static inline bool ipv6_anycast_destination(const struct dst_entry *dst, struct rt6_info *rt = (struct rt6_info *)dst; return rt->rt6i_flags & RTF_ANYCAST || - (rt->rt6i_dst.plen != 128 && + (rt->rt6i_dst.plen < 127 && ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); } -- cgit v1.2.3 From 788b950c62e06b02278a0fd380e1a0667996ce3c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 16 Jul 2017 21:43:33 -0400 Subject: cgroup: distinguish local and children populated states cgrp->populated_cnt counts both local (the cgroup's populated css_sets) and subtree proper (populated children) so that it's only zero when the whole subtree, including self, is empty. This patch splits the counter into two so that local and children populated states are tracked separately. It allows finer-grained tests on the state of the hierarchy which will be used to replace css_set walking local populated test. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 13 +++++++++---- include/linux/cgroup.h | 2 +- kernel/cgroup/cgroup.c | 37 +++++++++++++++++++++---------------- 3 files changed, 31 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 09f4c7df1478..ae7bc1e70085 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -263,11 +263,16 @@ struct cgroup { /* * Each non-empty css_set associated with this cgroup contributes - * one to populated_cnt. All children with non-zero popuplated_cnt - * of their own contribute one. The count is zero iff there's no - * task in this cgroup or its subtree. + * one to nr_populated_csets. 
The counter is zero iff this cgroup + * doesn't have any tasks. + * + * All children which have non-zero nr_populated_csets and/or + * nr_populated_children of their own contribute one to + * nr_populated_children. The counter is zero iff this cgroup's + * subtree proper doesn't have any tasks. */ - int populated_cnt; + int nr_populated_csets; + int nr_populated_children; struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..308b10797a54 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -537,7 +537,7 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->populated_cnt; + return cgrp->nr_populated_csets + cgrp->nr_populated_children; } /* returns ino associated with a cgroup */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4f02b5edd82c..5fe2644cd0f3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -587,39 +587,44 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - updated populated count of a cgroup + * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->populated_cnt accordingly. The - * count is propagated towards root so that a given cgroup's populated_cnt - * is zero iff the cgroup and all its descendants don't contain any tasks. + * task or losing the last. Update @cgrp->nr_populated_* accordingly. The + * count is propagated towards root so that a given cgroup's + * nr_populated_children is zero iff none of its descendants contain any + * tasks. * - * @cgrp's interface file "cgroup.populated" is zero if - * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt - * changes from or to zero, userland is notified that the content of the - * interface file has changed. This can be used to detect when @cgrp and - * its descendants become populated or empty. + * @cgrp's interface file "cgroup.populated" is zero if both + * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and + * 1 otherwise. When the sum changes from or to zero, userland is notified + * that the content of the interface file has changed. This can be used to + * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { + struct cgroup *child = NULL; + int adj = populated ? 1 : -1; + lockdep_assert_held(&css_set_lock); do { - bool trigger; + bool was_populated = cgroup_is_populated(cgrp); - if (populated) - trigger = !cgrp->populated_cnt++; + if (!child) + cgrp->nr_populated_csets += adj; else - trigger = !--cgrp->populated_cnt; + cgrp->nr_populated_children += adj; - if (!trigger) + if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); + child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } @@ -630,7 +635,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. 
Update the - * ->populated_cnt of all associated cgroups accordingly. + * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { @@ -653,7 +658,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated) * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * - * This function automatically handles populated_cnt updates and + * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ -- cgit v1.2.3 From a38905e6aa1a758af003b80f3318196eadb86dfe Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:49 +0800 Subject: sctp: remove the typedef sctp_ipv4addr_param_t This patch is to remove the typedef sctp_ipv4addr_param_t, and replace with struct sctp_ipv4addr_param in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- net/sctp/protocol.c | 2 +- net/sctp/sm_make_chunk.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 99e866487e2f..e42095d7ce57 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -273,10 +273,10 @@ struct sctp_init_chunk { /* Section 3.3.2.1. IPv4 Address Parameter (5) */ -typedef struct sctp_ipv4addr_param { +struct sctp_ipv4addr_param { struct sctp_paramhdr param_hdr; - struct in_addr addr; -} sctp_ipv4addr_param_t; + struct in_addr addr; +}; /* Section 3.3.2.1. IPv6 Address Parameter (6) */ typedef struct sctp_ipv6addr_param { diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 989a900383b5..852556d67ae3 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -292,7 +292,7 @@ static void sctp_v4_from_addr_param(union sctp_addr *addr, static int sctp_v4_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { - int length = sizeof(sctp_ipv4addr_param_t); + int length = sizeof(struct sctp_ipv4addr_param); param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; param->v4.param_hdr.length = htons(length); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4e16b02ed832..0dc64da74d55 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3153,7 +3153,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: - if (length != sizeof(sctp_ipv4addr_param_t)) + if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. -- cgit v1.2.3 From 00987cc07e3f0f01699800cd89adf13a908cdee5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:50 +0800 Subject: sctp: remove the typedef sctp_ipv6addr_param_t This patch is to remove the typedef sctp_ipv6addr_param_t, and replace with struct sctp_ipv6addr_param in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/ipv6.c | 2 +- net/sctp/sm_make_chunk.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index e42095d7ce57..6b45c8a38642 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -279,10 +279,10 @@ struct sctp_ipv4addr_param { }; /* Section 3.3.2.1. IPv6 Address Parameter (6) */ -typedef struct sctp_ipv6addr_param { +struct sctp_ipv6addr_param { struct sctp_paramhdr param_hdr; struct in6_addr addr; -} sctp_ipv6addr_param_t; +}; /* Section 3.3.2.1 Cookie Preservative (9) */ typedef struct sctp_cookie_preserve_param { diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 2a186b201ad2..107d7c912922 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -497,7 +497,7 @@ static void sctp_v6_from_addr_param(union sctp_addr *addr, static int sctp_v6_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { - int length = sizeof(sctp_ipv6addr_param_t); + int length = sizeof(struct sctp_ipv6addr_param); param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0dc64da74d55..e5be305072b5 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3163,7 +3163,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: - if (length != sizeof(sctp_ipv6addr_param_t)) + if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != addip->addip_hdr.params) return false; -- cgit v1.2.3 From 365ddb65e77f6b99d4aba09e0d8a096aada57815 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:51 +0800 Subject: sctp: remove the typedef sctp_cookie_preserve_param_t This patch is to remove the typedef sctp_cookie_preserve_param_t, and replace with struct sctp_cookie_preserve_param in the places where it's using this typedef. It is also to fix some indents in sctp_sf_do_5_2_6_stale(). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_statefuns.c | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 6b45c8a38642..d8f9d8f8649b 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -285,10 +285,10 @@ struct sctp_ipv6addr_param { }; /* Section 3.3.2.1 Cookie Preservative (9) */ -typedef struct sctp_cookie_preserve_param { +struct sctp_cookie_preserve_param { struct sctp_paramhdr param_hdr; - __be32 lifespan_increment; -} sctp_cookie_preserve_param_t; + __be32 lifespan_increment; +}; /* Section 3.3.2.1 Host Name Address (11) */ typedef struct sctp_hostname_param { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b2a74c3823ee..ae4c48c4f657 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2336,13 +2336,12 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - struct sctp_chunk *chunk = arg; - u32 stale; - sctp_cookie_preserve_param_t bht; - sctp_errhdr_t *err; - struct sctp_chunk *reply; - struct sctp_bind_addr *bp; int attempts = asoc->init_err_counter + 1; + struct sctp_chunk *chunk = arg, *reply; + struct sctp_cookie_preserve_param bht; + struct sctp_bind_addr *bp; + sctp_errhdr_t *err; + u32 stale; if (attempts > asoc->max_init_attempts) { sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, -- cgit v1.2.3 From df9af0063f154c1a4f22a5570749d185d080cf56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:52 +0800 Subject: sctp: remove the typedef sctp_hostname_param_t Remove this typedef; there isn't even any place using it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index d8f9d8f8649b..c43e9067d41a 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -291,10 +291,10 @@ struct sctp_cookie_preserve_param { }; /* Section 3.3.2.1 Host Name Address (11) */ -typedef struct sctp_hostname_param { +struct sctp_hostname_param { struct sctp_paramhdr param_hdr; uint8_t hostname[0]; -} sctp_hostname_param_t; +}; /* Section 3.3.2.1 Supported Address Types (12) */ typedef struct sctp_supported_addrs_param { -- cgit v1.2.3 From e925d506f1a21f7fd24a8fdd3e73e0810c655de4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:53 +0800 Subject: sctp: remove the typedef sctp_supported_addrs_param_t This patch is to remove the typedef sctp_supported_addrs_param_t, and replace with struct sctp_supported_addrs_param in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_make_chunk.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index c43e9067d41a..3ca3ab7302a6 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -297,10 +297,10 @@ struct sctp_hostname_param { }; /* Section 3.3.2.1 Supported Address Types (12) */ -typedef struct sctp_supported_addrs_param { +struct sctp_supported_addrs_param { struct sctp_paramhdr param_hdr; __be16 types[0]; -} sctp_supported_addrs_param_t; +}; /* Appendix A.
ECN Capable (32768) */ typedef struct sctp_ecn_capable_param { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e5be305072b5..fb06d4fd0515 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -223,7 +223,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_sock *sp; - sctp_supported_addrs_param_t sat; + struct sctp_supported_addrs_param sat; __be16 types[2]; sctp_adaptation_ind_param_t aiparam; sctp_supported_ext_param_t ext_param; -- cgit v1.2.3 From c1dd5df39be5a98c843b9352c22c5569f84bec44 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:54 +0800 Subject: sctp: remove struct sctp_ecn_capable_param Remove it; there isn't even any place using it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 3ca3ab7302a6..75524829aa81 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -302,11 +302,6 @@ struct sctp_supported_addrs_param { __be16 types[0]; }; -/* Appendix A. ECN Capable (32768) */ -typedef struct sctp_ecn_capable_param { - struct sctp_paramhdr param_hdr; -} sctp_ecn_capable_param_t; - /* ADDIP Section 3.2.6 Adaptation Layer Indication */ typedef struct sctp_adaptation_ind_param { struct sctp_paramhdr param_hdr; -- cgit v1.2.3 From 85f6bd24ac579ef0926eb4c564ba1f3c8a7f8563 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:55 +0800 Subject: sctp: remove the typedef sctp_adaptation_ind_param_t This patch is to remove the typedef sctp_adaptation_ind_param_t, and replace with struct sctp_adaptation_ind_param in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_make_chunk.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 75524829aa81..72b87874ea76 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -303,10 +303,10 @@ struct sctp_supported_addrs_param { }; /* ADDIP Section 3.2.6 Adaptation Layer Indication */ -typedef struct sctp_adaptation_ind_param { +struct sctp_adaptation_ind_param { struct sctp_paramhdr param_hdr; __be32 adaptation_ind; -} sctp_adaptation_ind_param_t; +}; /* ADDIP Section 4.2.7 Supported Extensions Parameter */ typedef struct sctp_supported_ext_param { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index fb06d4fd0515..d5f82c2f8c84 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -225,7 +225,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, struct sctp_sock *sp; struct sctp_supported_addrs_param sat; __be16 types[2]; - sctp_adaptation_ind_param_t aiparam; + struct sctp_adaptation_ind_param aiparam; sctp_supported_ext_param_t ext_param; int num_ext = 0; __u8 extensions[3]; @@ -393,7 +393,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_cookie_param_t *cookie; int cookie_len; size_t chunksize; - sctp_adaptation_ind_param_t aiparam; + struct sctp_adaptation_ind_param aiparam; sctp_supported_ext_param_t ext_param; int num_ext = 0; __u8 extensions[3]; -- cgit v1.2.3 From 15328d9feede450d64ff77cac5d25bc734ec8b27 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:56 +0800 Subject: sctp: remove the typedef sctp_supported_ext_param_t This patch is to remove the typedef sctp_supported_ext_param_t, and replace with struct sctp_supported_ext_param in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_make_chunk.c | 22 ++++++++-------------- 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 72b87874ea76..76245685f923 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -309,10 +309,10 @@ struct sctp_adaptation_ind_param { }; /* ADDIP Section 4.2.7 Supported Extensions Parameter */ -typedef struct sctp_supported_ext_param { +struct sctp_supported_ext_param { struct sctp_paramhdr param_hdr; __u8 chunks[0]; -} sctp_supported_ext_param_t; +}; /* AUTH Section 3.1 Random */ typedef struct sctp_random_param { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d5f82c2f8c84..06dc351b6ba1 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -226,7 +226,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, struct sctp_supported_addrs_param sat; __be16 types[2]; struct sctp_adaptation_ind_param aiparam; - sctp_supported_ext_param_t ext_param; + struct sctp_supported_ext_param ext_param; int num_ext = 0; __u8 extensions[3]; struct sctp_paramhdr *auth_chunks = NULL, @@ -305,8 +305,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -348,10 +347,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; - ext_param.param_hdr.length = - htons(sizeof(sctp_supported_ext_param_t) + num_ext); - sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t), - &ext_param); + ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); + sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } @@ -394,7 +391,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, int cookie_len; size_t chunksize; struct sctp_adaptation_ind_param aiparam; - sctp_supported_ext_param_t ext_param; + struct sctp_supported_ext_param ext_param; int num_ext = 0; __u8 extensions[3]; struct sctp_paramhdr *auth_chunks = NULL, @@ -468,8 +465,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. 
*/ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -495,10 +491,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; - ext_param.param_hdr.length = - htons(sizeof(sctp_supported_ext_param_t) + num_ext); - sctp_addto_chunk(retval, sizeof(sctp_supported_ext_param_t), - &ext_param); + ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); + sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) -- cgit v1.2.3 From b02db702face3791889a4fcf06691c086648ee89 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:57 +0800 Subject: sctp: remove the typedef sctp_random_param_t This patch is to remove the typedef sctp_random_param_t, and replace with struct sctp_random_param in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- include/net/sctp/structs.h | 2 +- net/sctp/auth.c | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 76245685f923..9b1aa3907c9e 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -315,10 +315,10 @@ struct sctp_supported_ext_param { }; /* AUTH Section 3.1 Random */ -typedef struct sctp_random_param { +struct sctp_random_param { struct sctp_paramhdr param_hdr; __u8 random_val[0]; -} sctp_random_param_t; +}; /* AUTH Section 3.2 Chunk List */ typedef struct sctp_chunks_param { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 5ab29af8ca8a..f22c079fd1f1 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1556,7 +1556,7 @@ struct sctp_association { * and authenticated chunk list. All that is part of the * cookie and these are just pointers to those locations */ - sctp_random_param_t *peer_random; + struct sctp_random_param *peer_random; sctp_chunks_param_t *peer_chunks; sctp_hmac_algo_param_t *peer_hmacs; } peer; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index e001b01b0e68..0d9c63eba978 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -185,7 +185,7 @@ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, * are called the two key vectors. */ static struct sctp_auth_bytes *sctp_auth_make_key_vector( - sctp_random_param_t *random, + struct sctp_random_param *random, sctp_chunks_param_t *chunks, sctp_hmac_algo_param_t *hmacs, gfp_t gfp) @@ -226,10 +226,9 @@ static struct sctp_auth_bytes *sctp_auth_make_local_vector( gfp_t gfp) { return sctp_auth_make_key_vector( - (sctp_random_param_t *)asoc->c.auth_random, - (sctp_chunks_param_t *)asoc->c.auth_chunks, - (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs, - gfp); + (struct sctp_random_param *)asoc->c.auth_random, + (sctp_chunks_param_t *)asoc->c.auth_chunks, + (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs, gfp); } /* Make a key vector based on peer's parameters */ -- cgit v1.2.3 From a762a9d94d44980e3690f9de87b918376daa6428 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:58 +0800 Subject: sctp: remove the typedef sctp_chunks_param_t This patch is to remove the typedef sctp_chunks_param_t, and replace with struct sctp_chunks_param in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- include/net/sctp/structs.h | 2 +- net/sctp/auth.c | 4 ++-- net/sctp/endpointola.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 9b1aa3907c9e..b52def9bcfa1 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -321,10 +321,10 @@ struct sctp_random_param { }; /* AUTH Section 3.2 Chunk List */ -typedef struct sctp_chunks_param { +struct sctp_chunks_param { struct sctp_paramhdr param_hdr; __u8 chunks[0]; -} sctp_chunks_param_t; +}; /* AUTH Section 3.3 HMAC Algorithm */ typedef struct sctp_hmac_algo_param { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f22c079fd1f1..8042e6380b0e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1557,7 +1557,7 @@ struct sctp_association { * cookie and these are just pointers to those locations */ struct sctp_random_param *peer_random; - sctp_chunks_param_t *peer_chunks; + struct sctp_chunks_param *peer_chunks; sctp_hmac_algo_param_t *peer_hmacs; } peer; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 0d9c63eba978..367994d9712a 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -186,7 +186,7 @@ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, */ static struct sctp_auth_bytes *sctp_auth_make_key_vector( struct sctp_random_param *random, - sctp_chunks_param_t *chunks, + struct sctp_chunks_param *chunks, sctp_hmac_algo_param_t *hmacs, gfp_t gfp) { @@ -227,7 +227,7 @@ static struct sctp_auth_bytes *sctp_auth_make_local_vector( { return sctp_auth_make_key_vector( (struct sctp_random_param *)asoc->c.auth_random, - (sctp_chunks_param_t *)asoc->c.auth_chunks, + (struct sctp_chunks_param *)asoc->c.auth_chunks, (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs, gfp); } diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 0e86f988f836..35bf5af124fc 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -78,8 +78,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, if (!auth_hmacs) goto nomem; - auth_chunks = kzalloc(sizeof(sctp_chunks_param_t) + - SCTP_NUM_CHUNK_TYPES, gfp); + auth_chunks = kzalloc(sizeof(*auth_chunks) + + SCTP_NUM_CHUNK_TYPES, gfp); if (!auth_chunks) goto nomem; -- cgit v1.2.3 From 1474774a7f0daf9878fd9537a24714f419e744ed Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 17 Jul 2017 11:29:59 +0800 Subject: sctp: remove the typedef sctp_hmac_algo_param_t This patch is to remove the typedef sctp_hmac_algo_param_t, and replace with struct sctp_hmac_algo_param in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- include/net/sctp/structs.h | 2 +- net/sctp/auth.c | 4 ++-- net/sctp/endpointola.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b52def9bcfa1..913474dfc96c 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -327,10 +327,10 @@ struct sctp_chunks_param { }; /* AUTH Section 3.3 HMAC Algorithm */ -typedef struct sctp_hmac_algo_param { +struct sctp_hmac_algo_param { struct sctp_paramhdr param_hdr; __be16 hmac_ids[0]; -} sctp_hmac_algo_param_t; +}; /* RFC 2960. 
Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2): * The INIT ACK chunk is used to acknowledge the initiation of an SCTP diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8042e6380b0e..66cd7639b912 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1558,7 +1558,7 @@ struct sctp_association { */ struct sctp_random_param *peer_random; struct sctp_chunks_param *peer_chunks; - sctp_hmac_algo_param_t *peer_hmacs; + struct sctp_hmac_algo_param *peer_hmacs; } peer; /* State : A state variable indicating what state the diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 367994d9712a..00667c50efa7 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -187,7 +187,7 @@ static int sctp_auth_compare_vectors(struct sctp_auth_bytes *vector1, static struct sctp_auth_bytes *sctp_auth_make_key_vector( struct sctp_random_param *random, struct sctp_chunks_param *chunks, - sctp_hmac_algo_param_t *hmacs, + struct sctp_hmac_algo_param *hmacs, gfp_t gfp) { struct sctp_auth_bytes *new; @@ -228,7 +228,7 @@ static struct sctp_auth_bytes *sctp_auth_make_local_vector( return sctp_auth_make_key_vector( (struct sctp_random_param *)asoc->c.auth_random, (struct sctp_chunks_param *)asoc->c.auth_chunks, - (sctp_hmac_algo_param_t *)asoc->c.auth_hmacs, gfp); + (struct sctp_hmac_algo_param *)asoc->c.auth_hmacs, gfp); } /* Make a key vector based on peer's parameters */ diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 35bf5af124fc..3d506b2f6193 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -73,8 +73,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. */ - auth_hmacs = kzalloc(sizeof(sctp_hmac_algo_param_t) + - sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp); + auth_hmacs = kzalloc(sizeof(*auth_hmacs) + + sizeof(__u16) * SCTP_AUTH_NUM_HMACS, gfp); if (!auth_hmacs) goto nomem; -- cgit v1.2.3 From 94e92e7ac90d06e1e839e112d3ae80b2457dbdd7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 17 Jul 2017 08:45:34 +0100 Subject: vfs: Add sb_rdonly(sb) to query the MS_RDONLY flag on s_flags Add an sb_rdonly() function to query the MS_RDONLY flag on sb->s_flags preparatory to providing an SB_RDONLY flag. Signed-off-by: David Howells --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b5d6816542b..6ae137c1bdf6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1844,7 +1844,8 @@ struct super_operations { */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) -#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) +static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & MS_RDONLY; } +#define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ -- cgit v1.2.3 From e462ec50cb5fad19f6003a3d8087f4a0945dd2b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 17 Jul 2017 08:45:35 +0100 Subject: VFS: Differentiate mount flags (MS_*) from internal superblock flags Differentiate the MS_* flags passed to mount(2) from the internal flags set in the super_block's s_flags. s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. 
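As a rough mental model (an illustrative sketch, not code from this series), the translation at the mount(2) boundary amounts to a masked copy while the two sets of constants still share values:

/*
 * Illustration only: the constants mirror the SB_* values introduced
 * below, and the helper is a simplified stand-in for the masking that
 * do_mount() performs before flags reach the superblock layer.
 */
#define SB_RDONLY	1
#define SB_SYNCHRONOUS	16
#define SB_MANDLOCK	64
#define SB_DIRSYNC	128
#define SB_SILENT	32768
#define SB_POSIXACL	(1 << 16)

static unsigned int ms_flags_to_sb_flags(unsigned long ms_flags)
{
	/* Only flags meaningful to a superblock survive the crossing;
	 * MS_REMOUNT, MS_BIND, MS_REC and the like stay mount-API-only. */
	return ms_flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK |
			   SB_DIRSYNC | SB_SILENT | SB_POSIXACL);
}
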
In this patch, just the headers are altered and some kernel code where blind automated conversion isn't necessarily correct. Note that this shows up some interesting issues: (1) Some MS_* flags get translated to MNT_* flags (such as MS_NODEV -> MNT_NODEV) without passing this on to the filesystem, but some filesystems set such flags anyway. (2) The ->remount_fs() methods of some filesystems adjust the *flags argument by setting MS_* flags in it, such as MS_NOATIME - but these flags are then scrubbed by do_remount_sb() (only the occupants of MS_RMT_MASK are permitted: MS_RDONLY, MS_SYNCHRONOUS, MS_MANDLOCK, MS_I_VERSION and MS_LAZYTIME) I'm not sure what's the best way to solve all these cases. Suggested-by: Al Viro Signed-off-by: David Howells --- Documentation/filesystems/porting | 2 +- fs/namespace.c | 56 ++++++++++++++++++-------------- fs/super.c | 68 +++++++++++++++++++-------------------- include/linux/fs.h | 45 ++++++++++++++++++++------ init/do_mounts.c | 4 +-- 5 files changed, 104 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 5fb17f49f7a2..93e0a2404532 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -228,7 +228,7 @@ anything from oops to silent memory corruption. --- [mandatory] - FS_NOMOUNT is gone. If you use it - just set MS_NOUSER in flags + FS_NOMOUNT is gone. If you use it - just set SB_NOUSER in flags (see rootfs for one kind of solution and bdev/socket/pipe for another). --- diff --git a/fs/namespace.c b/fs/namespace.c index e42c9abfeaa8..c26a82cfe4fc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -971,7 +971,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); - if (flags & MS_KERNMOUNT) + if (flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; root = mount_fs(type, flags, name, data); @@ -1003,7 +1003,7 @@ vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, if (mountpoint->d_sb->s_user_ns != &init_user_ns) return ERR_PTR(-EPERM); - return vfs_kern_mount(type, MS_SUBMOUNT, name, data); + return vfs_kern_mount(type, SB_SUBMOUNT, name, data); } EXPORT_SYMBOL_GPL(vfs_submount); @@ -1535,7 +1535,7 @@ static int do_umount(struct mount *mnt, int flags) return -EPERM; down_write(&sb->s_umount); if (!sb_rdonly(sb)) - retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); + retval = do_remount_sb(sb, SB_RDONLY, NULL, 0); up_write(&sb->s_umount); return retval; } @@ -2059,7 +2059,7 @@ static void unlock_mount(struct mountpoint *where) static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { - if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) + if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->m_dentry) != @@ -2073,9 +2073,9 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) * Sanity check the flags to change_mnt_propagation. */ -static int flags_to_propagation_type(int flags) +static int flags_to_propagation_type(int ms_flags) { - int type = flags & ~(MS_REC | MS_SILENT); + int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) @@ -2089,18 +2089,18 @@ static int flags_to_propagation_type(int flags) /* * recursively change the type of the mountpoint. 
*/ -static int do_change_type(struct path *path, int flag) +static int do_change_type(struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); - int recurse = flag & MS_REC; + int recurse = ms_flags & MS_REC; int type; int err = 0; if (path->dentry != path->mnt->mnt_root) return -EINVAL; - type = flags_to_propagation_type(flag); + type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; @@ -2222,8 +2222,8 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ -static int do_remount(struct path *path, int flags, int mnt_flags, - void *data) +static int do_remount(struct path *path, int ms_flags, int sb_flags, + int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; @@ -2267,12 +2267,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, return err; down_write(&sb->s_umount); - if (flags & MS_BIND) - err = change_mount_flags(path->mnt, flags); + if (ms_flags & MS_BIND) + err = change_mount_flags(path->mnt, ms_flags); else if (!capable(CAP_SYS_ADMIN)) err = -EPERM; else - err = do_remount_sb(sb, flags, data, 0); + err = do_remount_sb(sb, sb_flags, data, 0); if (!err) { lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; @@ -2437,7 +2437,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int flags, +static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; @@ -2451,7 +2451,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (!type) return -ENODEV; - mnt = vfs_kern_mount(type, flags, name, data); + mnt = vfs_kern_mount(type, sb_flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) mnt = fs_set_subtype(mnt, fstype); @@ -2706,8 +2706,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; + unsigned int mnt_flags = 0, sb_flags; int retval = 0; - int mnt_flags = 0; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) @@ -2717,6 +2717,9 @@ long do_mount(const char *dev_name, const char __user *dir_name, if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + if (flags & MS_NOUSER) + return -EINVAL; + /* ... 
and get the mountpoint */ retval = user_path(dir_name, &path); if (retval) @@ -2726,7 +2729,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, type_page, flags, data_page); if (!retval && !may_mount()) retval = -EPERM; - if (!retval && (flags & MS_MANDLOCK) && !may_mandlock()) + if (!retval && (flags & SB_MANDLOCK) && !may_mandlock()) retval = -EPERM; if (retval) goto dput_out; @@ -2748,7 +2751,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); - if (flags & MS_RDONLY) + if (flags & SB_RDONLY) mnt_flags |= MNT_READONLY; /* The default atime for remount is preservation */ @@ -2759,12 +2762,15 @@ long do_mount(const char *dev_name, const char __user *dir_name, mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; } - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | - MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT); + sb_flags = flags & (SB_RDONLY | + SB_SYNCHRONOUS | + SB_MANDLOCK | + SB_DIRSYNC | + SB_SILENT | + SB_POSIXACL); if (flags & MS_REMOUNT) - retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, + retval = do_remount(&path, flags, sb_flags, mnt_flags, data_page); else if (flags & MS_BIND) retval = do_loopback(&path, dev_name, flags & MS_REC); @@ -2773,7 +2779,7 @@ long do_mount(const char *dev_name, const char __user *dir_name, else if (flags & MS_MOVE) retval = do_move_mount(&path, dev_name); else - retval = do_new_mount(&path, type_page, flags, mnt_flags, + retval = do_new_mount(&path, type_page, sb_flags, mnt_flags, dev_name, data_page); dput_out: path_put(&path); @@ -3223,7 +3229,7 @@ void put_mnt_ns(struct mnt_namespace *ns) struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) { struct vfsmount *mnt; - mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until diff --git a/fs/super.c b/fs/super.c index 7321958d81d8..d956e62e5866 100644 --- a/fs/super.c +++ b/fs/super.c @@ -360,7 +360,7 @@ static int grab_super(struct super_block *s) __releases(sb_lock) s->s_count++; spin_unlock(&sb_lock); down_write(&s->s_umount); - if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) { + if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } @@ -390,7 +390,7 @@ bool trylock_super(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & MS_BORN)) + sb->s_root && (sb->s_flags & SB_BORN)) return true; up_read(&sb->s_umount); } @@ -419,7 +419,7 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - sb->s_flags &= ~MS_ACTIVE; + sb->s_flags &= ~SB_ACTIVE; fsnotify_unmount_inodes(sb); cgroup_writeback_umount(); @@ -472,7 +472,7 @@ struct super_block *sget_userns(struct file_system_type *type, struct super_block *old; int err; - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && + if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -502,7 +502,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns); + s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ 
-547,11 +547,11 @@ struct super_block *sget(struct file_system_type *type, * mount through to here so always use &init_user_ns * until that changes. */ - if (flags & MS_SUBMOUNT) + if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; /* Ensure the requestor has permissions over the target filesystem */ - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return sget_userns(type, test, set, flags, user_ns, data); @@ -594,7 +594,7 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -628,7 +628,7 @@ void iterate_supers_type(struct file_system_type *type, spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -664,7 +664,7 @@ rescan: else down_write(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; if (!excl) up_read(&sb->s_umount); @@ -785,7 +785,7 @@ rescan: spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -801,13 +801,13 @@ rescan: /** * do_remount_sb - asks filesystem to change mount options. * @sb: superblock in question - * @flags: numeric part of options + * @sb_flags: revised superblock flags * @data: the rest of options * @force: whether or not to force the change * * Alters the mount options of a mounted file system. 
*/ -int do_remount_sb(struct super_block *sb, int flags, void *data, int force) +int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) { int retval; int remount_ro; @@ -816,11 +816,11 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return -EBUSY; #ifdef CONFIG_BLOCK - if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) + if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) return -EACCES; #endif - remount_ro = (flags & MS_RDONLY) && !sb_rdonly(sb); + remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { @@ -831,7 +831,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; - remount_ro = (flags & MS_RDONLY) && !sb_rdonly(sb); + remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); } } shrink_dcache_sb(sb); @@ -850,7 +850,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } if (sb->s_op->remount_fs) { - retval = sb->s_op->remount_fs(sb, &flags, data); + retval = sb->s_op->remount_fs(sb, &sb_flags, data); if (retval) { if (!force) goto cancel_readonly; @@ -859,7 +859,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) sb->s_type->name, retval); } } - sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; @@ -892,12 +892,12 @@ static void do_emergency_remount(struct work_struct *work) sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) && + if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && !sb_rdonly(sb)) { /* * What lock protects sb->s_flags?? */ - do_remount_sb(sb, MS_RDONLY, NULL, 1); + do_remount_sb(sb, SB_RDONLY, NULL, 1); } up_write(&sb->s_umount); spin_lock(&sb_lock); @@ -1023,7 +1023,7 @@ struct dentry *mount_ns(struct file_system_type *fs_type, /* Don't allow mounting unless the caller has CAP_SYS_ADMIN * over the namespace. */ - if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + if (!(flags & SB_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, @@ -1033,13 +1033,13 @@ struct dentry *mount_ns(struct file_system_type *fs_type, if (!sb->s_root) { int err; - err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + err = fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); if (err) { deactivate_locked_super(sb); return ERR_PTR(err); } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } return dget(sb->s_root); @@ -1071,7 +1071,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1089,14 +1089,14 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, + s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC, bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; if (s->s_root) { - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); error = -EBUSY; goto error_bdev; @@ -1116,13 +1116,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); goto error; } - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } @@ -1162,12 +1162,12 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, if (IS_ERR(s)) return ERR_CAST(s); - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); @@ -1188,12 +1188,12 @@ struct dentry *mount_single(struct file_system_type *fs_type, if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } else { do_remount_sb(s, flags, data, 0); } @@ -1227,7 +1227,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); - sb->s_flags |= MS_BORN; + sb->s_flags |= SB_BORN; error = security_sb_kern_mount(sb, flags, secdata); if (error) @@ -1434,7 +1434,7 @@ int freeze_super(struct super_block *sb) return -EBUSY; } - if (!(sb->s_flags & MS_BORN)) { + if (!(sb->s_flags & SB_BORN)) { up_write(&sb->s_umount); return 0; /* sic - it's "nothing to do" */ } diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ae137c1bdf6..3d6ee0c0ebb0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1269,6 +1269,33 @@ extern int send_sigurg(struct fown_struct *fown); struct mm_struct; +/* + * sb->s_flags. Note that these mirror the equivalent MS_* flags where + * represented in both. + */ +#define SB_RDONLY 1 /* Mount read-only */ +#define SB_NOSUID 2 /* Ignore suid and sgid bits */ +#define SB_NODEV 4 /* Disallow access to device special files */ +#define SB_NOEXEC 8 /* Disallow program execution */ +#define SB_SYNCHRONOUS 16 /* Writes are synced at once */ +#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC 128 /* Directory modifications are synchronous */ +#define SB_NOATIME 1024 /* Do not update access times. 
*/ +#define SB_NODIRATIME 2048 /* Do not update directory access times */ +#define SB_SILENT 32768 +#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */ +#define SB_I_VERSION (1<<23) /* Update inode I_version field */ +#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define SB_SUBMOUNT (1<<26) +#define SB_NOREMOTELOCK (1<<27) +#define SB_NOSEC (1<<28) +#define SB_BORN (1<<29) +#define SB_ACTIVE (1<<30) +#define SB_NOUSER (1<<31) + /* * Umount options */ @@ -1835,7 +1862,7 @@ struct super_operations { * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * - * Exception: MS_RDONLY is always applied to the entire file system. + * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystems flags with it mounted * with files in use. This means that all of the inodes will not have their @@ -1846,18 +1873,18 @@ struct super_operations { static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & MS_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) -#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ +#define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) -#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ +#define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) -#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) -#define IS_NOATIME(inode) __IS_FLG(inode, MS_RDONLY|MS_NOATIME) -#define IS_I_VERSION(inode) __IS_FLG(inode, MS_I_VERSION) +#define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) +#define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) +#define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) -#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) +#define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) @@ -2178,7 +2205,7 @@ static inline int __mandatory_lock(struct inode *ino) } /* - * ... and these candidates should be on MS_MANDLOCK mounted fs, + * ... 
and these candidates should be on SB_MANDLOCK mounted fs, * otherwise these will be advisory locks */ @@ -3274,7 +3301,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) static inline void inode_has_no_xattr(struct inode *inode) { - if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) + if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } diff --git a/init/do_mounts.c b/init/do_mounts.c index bf7ea36ca286..f6d4dd764a52 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -420,8 +420,8 @@ retry: #endif panic("VFS: Unable to mount root fs on %s", b); } - if (!(flags & MS_RDONLY)) { - flags |= MS_RDONLY; + if (!(flags & SB_RDONLY)) { + flags |= SB_RDONLY; goto retry; } -- cgit v1.2.3 From c698316661096e036b54448039b35e1c2c5809f0 Mon Sep 17 00:00:00 2001 From: Benson Leung Date: Mon, 17 Jul 2017 11:41:39 +0200 Subject: extcon: cros-ec: Add extcon-cros-ec driver to support display out This is the driver for the USB Type C cable detection mechanism built into the ChromeOS Embedded Controller on systems that have USB Type-C ports. At present, this allows for the presence of display out, but in future, it may also be used to notify host and device type cables and the presence of power. Signed-off-by: Benson Leung Signed-off-by: Enric Balletbo i Serra Acked-by: Chanwoo Choi Acked-by: Lee Jones Signed-off-by: Chanwoo Choi --- drivers/extcon/Kconfig | 7 + drivers/extcon/Makefile | 1 + drivers/extcon/extcon-usbc-cros-ec.c | 415 +++++++++++++++++++++++++++++++++++ include/linux/mfd/cros_ec_commands.h | 75 +++++++ 4 files changed, 498 insertions(+) create mode 100644 drivers/extcon/extcon-usbc-cros-ec.c (limited to 'include') diff --git a/drivers/extcon/Kconfig b/drivers/extcon/Kconfig index 6d50071f07d5..a7bca4207f44 100644 --- a/drivers/extcon/Kconfig +++ b/drivers/extcon/Kconfig @@ -150,4 +150,11 @@ config EXTCON_USB_GPIO Say Y here to enable GPIO based USB cable detection extcon support. Used typically if GPIO is used for USB ID pin detection. +config EXTCON_USBC_CROS_EC + tristate "ChromeOS Embedded Controller EXTCON support" + depends on MFD_CROS_EC + help + Say Y here to enable USB Type C cable detection extcon support when + using Chrome OS EC based USB Type-C ports. + endif diff --git a/drivers/extcon/Makefile b/drivers/extcon/Makefile index ecfa95804427..a73624e76193 100644 --- a/drivers/extcon/Makefile +++ b/drivers/extcon/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_EXTCON_QCOM_SPMI_MISC) += extcon-qcom-spmi-misc.o obj-$(CONFIG_EXTCON_RT8973A) += extcon-rt8973a.o obj-$(CONFIG_EXTCON_SM5502) += extcon-sm5502.o obj-$(CONFIG_EXTCON_USB_GPIO) += extcon-usb-gpio.o +obj-$(CONFIG_EXTCON_USBC_CROS_EC) += extcon-usbc-cros-ec.o diff --git a/drivers/extcon/extcon-usbc-cros-ec.c b/drivers/extcon/extcon-usbc-cros-ec.c new file mode 100644 index 000000000000..e759ed477735 --- /dev/null +++ b/drivers/extcon/extcon-usbc-cros-ec.c @@ -0,0 +1,415 @@ +/** + * drivers/extcon/extcon-usbc-cros-ec - ChromeOS Embedded Controller extcon + * + * Copyright (C) 2017 Google, Inc + * Author: Benson Leung + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cros_ec_extcon_info { + struct device *dev; + struct extcon_dev *edev; + + int port_id; + + struct cros_ec_device *ec; + + struct notifier_block notifier; + + bool dp; /* DisplayPort enabled */ + bool mux; /* SuperSpeed (usb3) enabled */ + unsigned int power_type; +}; + +static const unsigned int usb_type_c_cable[] = { + EXTCON_DISP_DP, + EXTCON_NONE, +}; + +/** + * cros_ec_pd_command() - Send a command to the EC. + * @info: pointer to struct cros_ec_extcon_info + * @command: EC command + * @version: EC command version + * @outdata: EC command output data + * @outsize: Size of outdata + * @indata: EC command input data + * @insize: Size of indata + * + * Return: 0 on success, <0 on failure. + */ +static int cros_ec_pd_command(struct cros_ec_extcon_info *info, + unsigned int command, + unsigned int version, + void *outdata, + unsigned int outsize, + void *indata, + unsigned int insize) +{ + struct cros_ec_command *msg; + int ret; + + msg = kzalloc(sizeof(*msg) + max(outsize, insize), GFP_KERNEL); + + msg->version = version; + msg->command = command; + msg->outsize = outsize; + msg->insize = insize; + + if (outsize) + memcpy(msg->data, outdata, outsize); + + ret = cros_ec_cmd_xfer_status(info->ec, msg); + if (ret >= 0 && insize) + memcpy(indata, msg->data, insize); + + kfree(msg); + return ret; +} + +/** + * cros_ec_usb_get_power_type() - Get power type info about PD device attached + * to given port. + * @info: pointer to struct cros_ec_extcon_info + * + * Return: power type on success, <0 on failure. + */ +static int cros_ec_usb_get_power_type(struct cros_ec_extcon_info *info) +{ + struct ec_params_usb_pd_power_info req; + struct ec_response_usb_pd_power_info resp; + int ret; + + req.port = info->port_id; + ret = cros_ec_pd_command(info, EC_CMD_USB_PD_POWER_INFO, 0, + &req, sizeof(req), &resp, sizeof(resp)); + if (ret < 0) + return ret; + + return resp.type; +} + +/** + * cros_ec_usb_get_pd_mux_state() - Get PD mux state for given port. + * @info: pointer to struct cros_ec_extcon_info + * + * Return: PD mux state on success, <0 on failure. + */ +static int cros_ec_usb_get_pd_mux_state(struct cros_ec_extcon_info *info) +{ + struct ec_params_usb_pd_mux_info req; + struct ec_response_usb_pd_mux_info resp; + int ret; + + req.port = info->port_id; + ret = cros_ec_pd_command(info, EC_CMD_USB_PD_MUX_INFO, 0, + &req, sizeof(req), + &resp, sizeof(resp)); + if (ret < 0) + return ret; + + return resp.flags; +} + +/** + * cros_ec_usb_get_role() - Get role info about possible PD device attached to a + * given port. + * @info: pointer to struct cros_ec_extcon_info + * @polarity: pointer to cable polarity (return value) + * + * Return: role info on success, -ENOTCONN if no cable is connected, <0 on + * failure. 
+ */ +static int cros_ec_usb_get_role(struct cros_ec_extcon_info *info, + bool *polarity) +{ + struct ec_params_usb_pd_control pd_control; + struct ec_response_usb_pd_control_v1 resp; + int ret; + + pd_control.port = info->port_id; + pd_control.role = USB_PD_CTRL_ROLE_NO_CHANGE; + pd_control.mux = USB_PD_CTRL_MUX_NO_CHANGE; + ret = cros_ec_pd_command(info, EC_CMD_USB_PD_CONTROL, 1, + &pd_control, sizeof(pd_control), + &resp, sizeof(resp)); + if (ret < 0) + return ret; + + if (!(resp.enabled & PD_CTRL_RESP_ENABLED_CONNECTED)) + return -ENOTCONN; + + *polarity = resp.polarity; + + return resp.role; +} + +/** + * cros_ec_pd_get_num_ports() - Get number of EC charge ports. + * @info: pointer to struct cros_ec_extcon_info + * + * Return: number of ports on success, <0 on failure. + */ +static int cros_ec_pd_get_num_ports(struct cros_ec_extcon_info *info) +{ + struct ec_response_usb_pd_ports resp; + int ret; + + ret = cros_ec_pd_command(info, EC_CMD_USB_PD_PORTS, + 0, NULL, 0, &resp, sizeof(resp)); + if (ret < 0) + return ret; + + return resp.num_ports; +} + +static int extcon_cros_ec_detect_cable(struct cros_ec_extcon_info *info, + bool force) +{ + struct device *dev = info->dev; + int role, power_type; + bool polarity = false; + bool dp = false; + bool mux = false; + bool hpd = false; + + power_type = cros_ec_usb_get_power_type(info); + if (power_type < 0) { + dev_err(dev, "failed getting power type err = %d\n", + power_type); + return power_type; + } + + role = cros_ec_usb_get_role(info, &polarity); + if (role < 0) { + if (role != -ENOTCONN) { + dev_err(dev, "failed getting role err = %d\n", role); + return role; + } + } else { + int pd_mux_state; + + pd_mux_state = cros_ec_usb_get_pd_mux_state(info); + if (pd_mux_state < 0) + pd_mux_state = USB_PD_MUX_USB_ENABLED; + + dp = pd_mux_state & USB_PD_MUX_DP_ENABLED; + mux = pd_mux_state & USB_PD_MUX_USB_ENABLED; + hpd = pd_mux_state & USB_PD_MUX_HPD_IRQ; + } + + if (force || info->dp != dp || info->mux != mux || + info->power_type != power_type) { + + info->dp = dp; + info->mux = mux; + info->power_type = power_type; + + extcon_set_state(info->edev, EXTCON_DISP_DP, dp); + + extcon_set_property(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_USB_TYPEC_POLARITY, + (union extcon_property_value)(int)polarity); + extcon_set_property(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_USB_SS, + (union extcon_property_value)(int)mux); + extcon_set_property(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_DISP_HPD, + (union extcon_property_value)(int)hpd); + + extcon_sync(info->edev, EXTCON_DISP_DP); + + } else if (hpd) { + extcon_set_property(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_DISP_HPD, + (union extcon_property_value)(int)hpd); + extcon_sync(info->edev, EXTCON_DISP_DP); + } + + return 0; +} + +static int extcon_cros_ec_event(struct notifier_block *nb, + unsigned long queued_during_suspend, + void *_notify) +{ + struct cros_ec_extcon_info *info; + struct cros_ec_device *ec; + u32 host_event; + + info = container_of(nb, struct cros_ec_extcon_info, notifier); + ec = info->ec; + + host_event = cros_ec_get_host_event(ec); + if (host_event & (EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU) | + EC_HOST_EVENT_MASK(EC_HOST_EVENT_USB_MUX))) { + extcon_cros_ec_detect_cable(info, false); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static int extcon_cros_ec_probe(struct platform_device *pdev) +{ + struct cros_ec_extcon_info *info; + struct cros_ec_device *ec = dev_get_drvdata(pdev->dev.parent); + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; + int 
numports, ret; + + info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->dev = dev; + info->ec = ec; + + if (np) { + u32 port; + + ret = of_property_read_u32(np, "google,usb-port-id", &port); + if (ret < 0) { + dev_err(dev, "Missing google,usb-port-id property\n"); + return ret; + } + info->port_id = port; + } else { + info->port_id = pdev->id; + } + + numports = cros_ec_pd_get_num_ports(info); + if (numports < 0) { + dev_err(dev, "failed getting number of ports! ret = %d\n", + numports); + return numports; + } + + if (info->port_id >= numports) { + dev_err(dev, "This system only supports %d ports\n", numports); + return -ENODEV; + } + + info->edev = devm_extcon_dev_allocate(dev, usb_type_c_cable); + if (IS_ERR(info->edev)) { + dev_err(dev, "failed to allocate extcon device\n"); + return -ENOMEM; + } + + ret = devm_extcon_dev_register(dev, info->edev); + if (ret < 0) { + dev_err(dev, "failed to register extcon device\n"); + return ret; + } + + extcon_set_property_capability(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_USB_TYPEC_POLARITY); + extcon_set_property_capability(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_USB_SS); + extcon_set_property_capability(info->edev, EXTCON_DISP_DP, + EXTCON_PROP_DISP_HPD); + + platform_set_drvdata(pdev, info); + + /* Get PD events from the EC */ + info->notifier.notifier_call = extcon_cros_ec_event; + ret = blocking_notifier_chain_register(&info->ec->event_notifier, + &info->notifier); + if (ret < 0) { + dev_err(dev, "failed to register notifier\n"); + return ret; + } + + /* Perform initial detection */ + ret = extcon_cros_ec_detect_cable(info, true); + if (ret < 0) { + dev_err(dev, "failed to detect initial cable state\n"); + goto unregister_notifier; + } + + return 0; + +unregister_notifier: + blocking_notifier_chain_unregister(&info->ec->event_notifier, + &info->notifier); + return ret; +} + +static int extcon_cros_ec_remove(struct platform_device *pdev) +{ + struct cros_ec_extcon_info *info = platform_get_drvdata(pdev); + + blocking_notifier_chain_unregister(&info->ec->event_notifier, + &info->notifier); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int extcon_cros_ec_suspend(struct device *dev) +{ + return 0; +} + +static int extcon_cros_ec_resume(struct device *dev) +{ + int ret; + struct cros_ec_extcon_info *info = dev_get_drvdata(dev); + + ret = extcon_cros_ec_detect_cable(info, true); + if (ret < 0) + dev_err(dev, "failed to detect cable state on resume\n"); + + return 0; +} + +static const struct dev_pm_ops extcon_cros_ec_dev_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(extcon_cros_ec_suspend, extcon_cros_ec_resume) +}; + +#define DEV_PM_OPS (&extcon_cros_ec_dev_pm_ops) +#else +#define DEV_PM_OPS NULL +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_OF +static const struct of_device_id extcon_cros_ec_of_match[] = { + { .compatible = "google,extcon-usbc-cros-ec" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, extcon_cros_ec_of_match); +#endif /* CONFIG_OF */ + +static struct platform_driver extcon_cros_ec_driver = { + .driver = { + .name = "extcon-usbc-cros-ec", + .of_match_table = of_match_ptr(extcon_cros_ec_of_match), + .pm = DEV_PM_OPS, + }, + .remove = extcon_cros_ec_remove, + .probe = extcon_cros_ec_probe, +}; + +module_platform_driver(extcon_cros_ec_driver); + +MODULE_DESCRIPTION("ChromeOS Embedded Controller extcon driver"); +MODULE_AUTHOR("Benson Leung "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h index 190c8f4afa02..2b16e95b9bb8 
100644 --- a/include/linux/mfd/cros_ec_commands.h +++ b/include/linux/mfd/cros_ec_commands.h @@ -285,6 +285,11 @@ enum host_event_code { EC_HOST_EVENT_HANG_DETECT = 20, /* Hang detect logic detected a hang and warm rebooted the AP */ EC_HOST_EVENT_HANG_REBOOT = 21, + /* PD MCU triggering host event */ + EC_HOST_EVENT_PD_MCU = 22, + + /* EC desires to change state of host-controlled USB mux */ + EC_HOST_EVENT_USB_MUX = 28, /* * The high bit of the event mask is not used as a host event code. If @@ -2905,6 +2910,76 @@ struct ec_params_usb_pd_control { uint8_t mux; } __packed; +#define PD_CTRL_RESP_ENABLED_COMMS (1 << 0) /* Communication enabled */ +#define PD_CTRL_RESP_ENABLED_CONNECTED (1 << 1) /* Device connected */ +#define PD_CTRL_RESP_ENABLED_PD_CAPABLE (1 << 2) /* Partner is PD capable */ + +struct ec_response_usb_pd_control_v1 { + uint8_t enabled; + uint8_t role; + uint8_t polarity; + char state[32]; +} __packed; + +#define EC_CMD_USB_PD_PORTS 0x102 + +struct ec_response_usb_pd_ports { + uint8_t num_ports; +} __packed; + +#define EC_CMD_USB_PD_POWER_INFO 0x103 + +#define PD_POWER_CHARGING_PORT 0xff +struct ec_params_usb_pd_power_info { + uint8_t port; +} __packed; + +enum usb_chg_type { + USB_CHG_TYPE_NONE, + USB_CHG_TYPE_PD, + USB_CHG_TYPE_C, + USB_CHG_TYPE_PROPRIETARY, + USB_CHG_TYPE_BC12_DCP, + USB_CHG_TYPE_BC12_CDP, + USB_CHG_TYPE_BC12_SDP, + USB_CHG_TYPE_OTHER, + USB_CHG_TYPE_VBUS, + USB_CHG_TYPE_UNKNOWN, +}; + +struct usb_chg_measures { + uint16_t voltage_max; + uint16_t voltage_now; + uint16_t current_max; + uint16_t current_lim; +} __packed; + +struct ec_response_usb_pd_power_info { + uint8_t role; + uint8_t type; + uint8_t dualrole; + uint8_t reserved1; + struct usb_chg_measures meas; + uint32_t max_power; +} __packed; + +/* Get info about USB-C SS muxes */ +#define EC_CMD_USB_PD_MUX_INFO 0x11a + +struct ec_params_usb_pd_mux_info { + uint8_t port; /* USB-C port number */ +} __packed; + +/* Flags representing mux state */ +#define USB_PD_MUX_USB_ENABLED (1 << 0) +#define USB_PD_MUX_DP_ENABLED (1 << 1) +#define USB_PD_MUX_POLARITY_INVERTED (1 << 2) +#define USB_PD_MUX_HPD_IRQ (1 << 3) + +struct ec_response_usb_pd_mux_info { + uint8_t flags; /* USB_PD_MUX_*-encoded USB mux state */ +} __packed; + /*****************************************************************************/ /* * Passthru commands -- cgit v1.2.3 From 450f0f6a8fb499a6ef3d10ab5c29a7c4650a9958 Mon Sep 17 00:00:00 2001 From: "oder_chiou@realtek.com" Date: Mon, 10 Jul 2017 11:14:56 +0800 Subject: ASoC: rt5663: Add the manual offset field to compensate the DC offset The patch adds manual offset fields to the devicetree to compensate for the DC offset, which differs between PCB layouts. The correct values can only be measured on real production hardware.
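For context, the probe path added by this patch consumes each property by splitting the 32-bit devicetree value across a pair of 16-bit codec registers. A minimal sketch of that pattern (the helper name is invented for illustration, and the register writes mirror the probe code shown further below; real offset values must come from board measurement):

	#include <linux/regmap.h>
	#include "rt5663.h"	/* assumed location of the RT5663_MIC_DECRO_* register indices */

	/* Sketch: apply a 32-bit manual DC offset, e.g. the value read from
	 * the "realtek,dc_offset_l_manual" property, to the left channel by
	 * writing its high and low 16-bit halves to consecutive registers. */
	static void rt5663_apply_left_dc_offset(struct regmap *regmap,
						unsigned int offset)
	{
		regmap_write(regmap, RT5663_MIC_DECRO_2, offset >> 16);
		regmap_write(regmap, RT5663_MIC_DECRO_3, offset & 0xffff);
	}
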
Signed-off-by: Oder Chiou Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/rt5663.txt | 5 ++ include/sound/rt5663.h | 20 ++++++++ sound/soc/codecs/rt5663.c | 55 ++++++++++++++++++++-- sound/soc/codecs/rt5663.h | 2 + 4 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 include/sound/rt5663.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/rt5663.txt b/Documentation/devicetree/bindings/sound/rt5663.txt index 70eaeaed2b18..f7c7cc513ee8 100644 --- a/Documentation/devicetree/bindings/sound/rt5663.txt +++ b/Documentation/devicetree/bindings/sound/rt5663.txt @@ -12,6 +12,11 @@ Required properties: Optional properties: +- "realtek,dc_offset_l_manual" +- "realtek,dc_offset_r_manual" + Based on the different PCB layout, add the manual offset value to + compensate the DC offset for each L and R channel. + Pins on the device (for linking into audio routes) for RT5663: * IN1P diff --git a/include/sound/rt5663.h b/include/sound/rt5663.h new file mode 100644 index 000000000000..be6f1b2d8fbd --- /dev/null +++ b/include/sound/rt5663.h @@ -0,0 +1,20 @@ +/* + * linux/sound/rt5663.h -- Platform data for RT5663 + * + * Copyright 2017 Realtek Semiconductor Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_SND_RT5663_H +#define __LINUX_SND_RT5663_H + +struct rt5663_platform_data { + unsigned int dc_offset_l_manual; + unsigned int dc_offset_r_manual; +}; + +#endif + diff --git a/sound/soc/codecs/rt5663.c b/sound/soc/codecs/rt5663.c index 5b3c50c641d8..963cda9c995a 100644 --- a/sound/soc/codecs/rt5663.c +++ b/sound/soc/codecs/rt5663.c @@ -40,6 +40,7 @@ enum { struct rt5663_priv { struct snd_soc_codec *codec; + struct rt5663_platform_data pdata; struct regmap *regmap; struct delayed_work jack_detect_work; struct snd_soc_jack *hs_jack; @@ -57,6 +58,10 @@ struct rt5663_priv { int jack_type; }; +static const struct reg_sequence rt5663_patch_list[] = { + { 0x002a, 0x8080 }, +}; + static const struct reg_default rt5663_v2_reg[] = { { 0x0000, 0x0000 }, { 0x0001, 0xc8c8 }, @@ -476,7 +481,7 @@ static const struct reg_default rt5663_reg[] = { { 0x0023, 0x0039 }, { 0x0026, 0xc0c0 }, { 0x0029, 0x8080 }, - { 0x002a, 0xa0a0 }, + { 0x002a, 0x8080 }, { 0x002c, 0x000c }, { 0x002d, 0x0000 }, { 0x0040, 0x0808 }, @@ -1958,15 +1963,11 @@ static const struct snd_kcontrol_new rt5663_adda_r_mix[] = { static const struct snd_kcontrol_new rt5663_sto1_dac_l_mix[] = { SOC_DAPM_SINGLE("DAC L Switch", RT5663_STO_DAC_MIXER, RT5663_M_DAC_L1_STO_L_SHIFT, 1, 1), - SOC_DAPM_SINGLE("DAC R Switch", RT5663_STO_DAC_MIXER, - RT5663_M_DAC_R1_STO_L_SHIFT, 1, 1), }; static const struct snd_kcontrol_new rt5663_sto1_dac_r_mix[] = { SOC_DAPM_SINGLE("DAC L Switch", RT5663_STO_DAC_MIXER, RT5663_M_DAC_L1_STO_R_SHIFT, 1, 1), - SOC_DAPM_SINGLE("DAC R Switch", RT5663_STO_DAC_MIXER, - RT5663_M_DAC_R1_STO_R_SHIFT, 1, 1), }; /* Out Switch */ @@ -3126,9 +3127,20 @@ static void rt5663_calibrate(struct rt5663_priv *rt5663) usleep_range(10000, 10005); } +static int rt5663_parse_dp(struct rt5663_priv *rt5663, struct device *dev) +{ + device_property_read_u32(dev, "realtek,dc_offset_l_manual", + &rt5663->pdata.dc_offset_l_manual); + device_property_read_u32(dev, "realtek,dc_offset_r_manual", + &rt5663->pdata.dc_offset_r_manual); + + return 0; +} + static int rt5663_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { + 
struct rt5663_platform_data *pdata = dev_get_platdata(&i2c->dev); struct rt5663_priv *rt5663; int ret; unsigned int val; @@ -3142,6 +3154,11 @@ static int rt5663_i2c_probe(struct i2c_client *i2c, i2c_set_clientdata(i2c, rt5663); + if (pdata) + rt5663->pdata = *pdata; + else + rt5663_parse_dp(rt5663, &i2c->dev); + regmap = devm_regmap_init_i2c(i2c, &temp_regmap); if (IS_ERR(regmap)) { ret = PTR_ERR(regmap); @@ -3190,6 +3207,34 @@ static int rt5663_i2c_probe(struct i2c_client *i2c, regmap_write(rt5663->regmap, RT5663_RESET, 0); dev_dbg(&i2c->dev, "calibrate done\n"); + switch (rt5663->codec_ver) { + case CODEC_VER_1: + break; + case CODEC_VER_0: + ret = regmap_register_patch(rt5663->regmap, rt5663_patch_list, + ARRAY_SIZE(rt5663_patch_list)); + if (ret != 0) + dev_warn(&i2c->dev, + "Failed to apply regmap patch: %d\n", ret); + break; + default: + dev_err(&i2c->dev, "%s:Unknown codec type\n", __func__); + } + + if (rt5663->pdata.dc_offset_l_manual) { + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_2, + rt5663->pdata.dc_offset_l_manual >> 16); + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_3, + rt5663->pdata.dc_offset_l_manual & 0xffff); + } + + if (rt5663->pdata.dc_offset_r_manual) { + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_5, + rt5663->pdata.dc_offset_r_manual >> 16); + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_6, + rt5663->pdata.dc_offset_r_manual & 0xffff); + } + /* GPIO1 as IRQ */ regmap_update_bits(rt5663->regmap, RT5663_GPIO_1, RT5663_GP1_PIN_MASK, RT5663_GP1_PIN_IRQ); diff --git a/sound/soc/codecs/rt5663.h b/sound/soc/codecs/rt5663.h index 4621812c94d8..c5a9b69579ad 100644 --- a/sound/soc/codecs/rt5663.h +++ b/sound/soc/codecs/rt5663.h @@ -12,6 +12,8 @@ #ifndef __RT5663_H__ #define __RT5663_H__ +#include + /* Info */ #define RT5663_RESET 0x0000 #define RT5663_VENDOR_ID 0x00fd -- cgit v1.2.3 From c54182ec0e157988f0cafd1e8d37b68ab4210f87 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Jun 2017 12:00:05 +0200 Subject: EDAC: Get rid of mci->mod_ver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is a write-only variable so get rid of it. Signed-off-by: Borislav Petkov Acked-by: Robert Richter Acked-by: Michal Simek Acked-by: Thor Thayer Acked-by: Tony Luck Cc: Mark Gross Cc: Tim Small Cc: Ranganathan Desikan Cc: "Arvind R." 
Cc: Jason Baron Cc: "Sören Brinkmann" Cc: Ralf Baechle Cc: David Daney Cc: Loc Ho Cc: linux-edac@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org --- drivers/edac/altera_edac.c | 2 -- drivers/edac/amd64_edac.c | 1 - drivers/edac/amd76x_edac.c | 2 -- drivers/edac/cpc925_edac.c | 1 - drivers/edac/e752x_edac.c | 2 -- drivers/edac/e7xxx_edac.c | 2 -- drivers/edac/ghes_edac.c | 3 --- drivers/edac/highbank_mc_edac.c | 1 - drivers/edac/i3000_edac.c | 3 --- drivers/edac/i3200_edac.c | 3 --- drivers/edac/i5000_edac.c | 1 - drivers/edac/i5100_edac.c | 1 - drivers/edac/i5400_edac.c | 1 - drivers/edac/i7300_edac.c | 1 - drivers/edac/i7core_edac.c | 1 - drivers/edac/i82443bxgx_edac.c | 3 --- drivers/edac/i82860_edac.c | 2 -- drivers/edac/i82875p_edac.c | 2 -- drivers/edac/i82975x_edac.c | 2 -- drivers/edac/ie31200_edac.c | 2 -- drivers/edac/mv64x60_edac.c | 1 - drivers/edac/ppc4xx_edac.c | 1 - drivers/edac/r82600_edac.c | 2 -- drivers/edac/sb_edac.c | 1 - drivers/edac/skx_edac.c | 3 --- drivers/edac/synopsys_edac.c | 1 - drivers/edac/thunderx_edac.c | 1 - drivers/edac/x38_edac.c | 3 --- drivers/edac/xgene_edac.c | 1 - include/linux/edac.h | 1 - 30 files changed, 51 deletions(-) (limited to 'include') diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index db75d4b614f7..fa2e5db56d24 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -38,7 +38,6 @@ #include "edac_module.h" #define EDAC_MOD_STR "altera_edac" -#define EDAC_VERSION "1" #define EDAC_DEVICE "Altera" static const struct altr_sdram_prv_data c5_data = { @@ -392,7 +391,6 @@ static int altr_sdram_probe(struct platform_device *pdev) mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = EDAC_VERSION; mci->ctl_name = dev_name(&pdev->dev); mci->scrub_mode = SCRUB_SW_SRC; mci->dev_name = dev_name(&pdev->dev); diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 3aea55698165..ac2f30295efe 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3130,7 +3130,6 @@ static void setup_mci_misc_attrs(struct mem_ctl_info *mci, mci->edac_cap = determine_edac_cap(pvt); mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = EDAC_AMD64_VERSION; mci->ctl_name = fam->ctl_name; mci->dev_name = pci_name(pvt->F3); mci->ctl_page_to_phys = NULL; diff --git a/drivers/edac/amd76x_edac.c b/drivers/edac/amd76x_edac.c index a7450275ad28..9c6e326b4c14 100644 --- a/drivers/edac/amd76x_edac.c +++ b/drivers/edac/amd76x_edac.c @@ -19,7 +19,6 @@ #include #include "edac_module.h" -#define AMD76X_REVISION " Ver: 2.0.2" #define EDAC_MOD_STR "amd76x_edac" #define amd76x_printk(level, fmt, arg...) \ @@ -263,7 +262,6 @@ static int amd76x_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_cap = ems_mode ? 
(EDAC_FLAG_EC | EDAC_FLAG_SECDED) : EDAC_FLAG_NONE; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = AMD76X_REVISION; mci->ctl_name = amd76x_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = amd76x_check; diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c index 837b62c4993d..cba8ee14a067 100644 --- a/drivers/edac/cpc925_edac.c +++ b/drivers/edac/cpc925_edac.c @@ -999,7 +999,6 @@ static int cpc925_probe(struct platform_device *pdev) mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = CPC925_EDAC_MOD_STR; - mci->mod_ver = CPC925_EDAC_REVISION; mci->ctl_name = pdev->name; if (edac_op_state == EDAC_OPSTATE_POLL) diff --git a/drivers/edac/e752x_edac.c b/drivers/edac/e752x_edac.c index 1a352cae1f52..b5de9a13ea3f 100644 --- a/drivers/edac/e752x_edac.c +++ b/drivers/edac/e752x_edac.c @@ -26,7 +26,6 @@ #include #include "edac_module.h" -#define E752X_REVISION " Ver: 2.0.2" #define EDAC_MOD_STR "e752x_edac" static int report_non_memory_errors; @@ -1303,7 +1302,6 @@ static int e752x_probe1(struct pci_dev *pdev, int dev_idx) (EDAC_FLAG_NONE | EDAC_FLAG_SECDED | EDAC_FLAG_S4ECD4ED); /* FIXME - what if different memory types are in different csrows? */ mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = E752X_REVISION; mci->pdev = &pdev->dev; edac_dbg(3, "init pvt\n"); diff --git a/drivers/edac/e7xxx_edac.c b/drivers/edac/e7xxx_edac.c index 67ef07aed923..75d7ce62b3be 100644 --- a/drivers/edac/e7xxx_edac.c +++ b/drivers/edac/e7xxx_edac.c @@ -32,7 +32,6 @@ #include #include "edac_module.h" -#define E7XXX_REVISION " Ver: 2.0.2" #define EDAC_MOD_STR "e7xxx_edac" #define e7xxx_printk(level, fmt, arg...) \ @@ -458,7 +457,6 @@ static int e7xxx_probe1(struct pci_dev *pdev, int dev_idx) EDAC_FLAG_S4ECD4ED; /* FIXME - what if different memory types are in different csrows? 
*/ mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = E7XXX_REVISION; mci->pdev = &pdev->dev; edac_dbg(3, "init pvt\n"); pvt = (struct e7xxx_pvt *)mci->pvt_info; diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 4e61a6229dd2..6f80eb65c26c 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -17,8 +17,6 @@ #include "edac_module.h" #include -#define GHES_EDAC_REVISION " Ver: 1.0.0" - struct ghes_edac_pvt { struct list_head list; struct ghes *ghes; @@ -451,7 +449,6 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "ghes_edac.c"; - mci->mod_ver = GHES_EDAC_REVISION; mci->ctl_name = "ghes_edac"; mci->dev_name = "ghes"; diff --git a/drivers/edac/highbank_mc_edac.c b/drivers/edac/highbank_mc_edac.c index 0e7e0a404d89..6092e61be605 100644 --- a/drivers/edac/highbank_mc_edac.c +++ b/drivers/edac/highbank_mc_edac.c @@ -224,7 +224,6 @@ static int highbank_mc_probe(struct platform_device *pdev) mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = pdev->dev.driver->name; - mci->mod_ver = "1"; mci->ctl_name = id->compatible; mci->dev_name = dev_name(&pdev->dev); mci->scrub_mode = SCRUB_SW_SRC; diff --git a/drivers/edac/i3000_edac.c b/drivers/edac/i3000_edac.c index 5306240570d7..8085a32ec3bd 100644 --- a/drivers/edac/i3000_edac.c +++ b/drivers/edac/i3000_edac.c @@ -16,8 +16,6 @@ #include #include "edac_module.h" -#define I3000_REVISION "1.1" - #define EDAC_MOD_STR "i3000_edac" #define I3000_RANKS 8 @@ -375,7 +373,6 @@ static int i3000_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I3000_REVISION; mci->ctl_name = i3000_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = i3000_check; diff --git a/drivers/edac/i3200_edac.c b/drivers/edac/i3200_edac.c index 77c58d201a30..d92d56cee101 100644 --- a/drivers/edac/i3200_edac.c +++ b/drivers/edac/i3200_edac.c @@ -17,8 +17,6 @@ #include -#define I3200_REVISION "1.1" - #define EDAC_MOD_STR "i3200_edac" #define PCI_DEVICE_ID_INTEL_3200_HB 0x29f0 @@ -375,7 +373,6 @@ static int i3200_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I3200_REVISION; mci->ctl_name = i3200_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = i3200_check; diff --git a/drivers/edac/i5000_edac.c b/drivers/edac/i5000_edac.c index 8f5a56e25bd2..53f24b18cd61 100644 --- a/drivers/edac/i5000_edac.c +++ b/drivers/edac/i5000_edac.c @@ -1430,7 +1430,6 @@ static int i5000_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "i5000_edac.c"; - mci->mod_ver = I5000_REVISION; mci->ctl_name = i5000_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index a8334c4acea7..b506eef6b146 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -1108,7 +1108,6 @@ static int i5100_init_one(struct pci_dev *pdev, const struct pci_device_id *id) mci->edac_ctl_cap = EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = "i5100_edac.c"; - mci->mod_ver = "not versioned"; mci->ctl_name = "i5100"; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 
cd889edc8516..6f8bcdb9256a 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -1315,7 +1315,6 @@ static int i5400_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "i5400_edac.c"; - mci->mod_ver = I5400_REVISION; mci->ctl_name = i5400_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index e391f5a716be..6b5a554ba8e4 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -1077,7 +1077,6 @@ static int i7300_init_one(struct pci_dev *pdev, const struct pci_device_id *id) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "i7300_edac.c"; - mci->mod_ver = I7300_REVISION; mci->ctl_name = i7300_devs[0].ctl_name; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index ba0fa112996f..d36cc8498084 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -2159,7 +2159,6 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "i7core_edac.c"; - mci->mod_ver = I7CORE_REVISION; mci->ctl_name = kasprintf(GFP_KERNEL, "i7 core #%d", i7core_dev->socket); mci->dev_name = pci_name(i7core_dev->pdev[0]); diff --git a/drivers/edac/i82443bxgx_edac.c b/drivers/edac/i82443bxgx_edac.c index cb61a5b7d080..a2ca929e2168 100644 --- a/drivers/edac/i82443bxgx_edac.c +++ b/drivers/edac/i82443bxgx_edac.c @@ -31,8 +31,6 @@ #include #include "edac_module.h" -#define I82443_REVISION "0.1" - #define EDAC_MOD_STR "i82443bxgx_edac" /* The 82443BX supports SDRAM, or EDO (EDO for mobile only), "Memory @@ -320,7 +318,6 @@ static int i82443bxgx_edacmc_probe1(struct pci_dev *pdev, int dev_idx) I82443BXGX_EAP_OFFSET_MBE)); mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I82443_REVISION; mci->ctl_name = "I82443BXGX"; mci->dev_name = pci_name(pdev); mci->edac_check = i82443bxgx_edacmc_check; diff --git a/drivers/edac/i82860_edac.c b/drivers/edac/i82860_edac.c index 236c813227fc..3e3a80ffb322 100644 --- a/drivers/edac/i82860_edac.c +++ b/drivers/edac/i82860_edac.c @@ -16,7 +16,6 @@ #include #include "edac_module.h" -#define I82860_REVISION " Ver: 2.0.2" #define EDAC_MOD_STR "i82860_edac" #define i82860_printk(level, fmt, arg...) \ @@ -216,7 +215,6 @@ static int i82860_probe1(struct pci_dev *pdev, int dev_idx) /* I"m not sure about this but I think that all RDRAM is SECDED */ mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I82860_REVISION; mci->ctl_name = i82860_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = i82860_check; diff --git a/drivers/edac/i82875p_edac.c b/drivers/edac/i82875p_edac.c index e286b7e74c7a..ceac925af38c 100644 --- a/drivers/edac/i82875p_edac.c +++ b/drivers/edac/i82875p_edac.c @@ -20,7 +20,6 @@ #include #include "edac_module.h" -#define I82875P_REVISION " Ver: 2.0.2" #define EDAC_MOD_STR "i82875p_edac" #define i82875p_printk(level, fmt, arg...) 
\ @@ -423,7 +422,6 @@ static int i82875p_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_UNKNOWN; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I82875P_REVISION; mci->ctl_name = i82875p_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = i82875p_check; diff --git a/drivers/edac/i82975x_edac.c b/drivers/edac/i82975x_edac.c index 9dcdab28f665..892815eaa97b 100644 --- a/drivers/edac/i82975x_edac.c +++ b/drivers/edac/i82975x_edac.c @@ -16,7 +16,6 @@ #include #include "edac_module.h" -#define I82975X_REVISION " Ver: 1.0.0" #define EDAC_MOD_STR "i82975x_edac" #define i82975x_printk(level, fmt, arg...) \ @@ -564,7 +563,6 @@ static int i82975x_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = I82975X_REVISION; mci->ctl_name = i82975x_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = i82975x_check; diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 4260579e6901..aac9b9b360b8 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -45,7 +45,6 @@ #include #include "edac_module.h" -#define IE31200_REVISION "1.0" #define EDAC_MOD_STR "ie31200_edac" #define ie31200_printk(level, fmt, arg...) \ @@ -420,7 +419,6 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_ctl_cap = EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = IE31200_REVISION; mci->ctl_name = ie31200_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = ie31200_check; diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c index d3650df94fe8..ec5d695bbb72 100644 --- a/drivers/edac/mv64x60_edac.c +++ b/drivers/edac/mv64x60_edac.c @@ -766,7 +766,6 @@ static int mv64x60_mc_err_probe(struct platform_device *pdev) mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = MV64x60_REVISION; mci->ctl_name = mv64x60_ctl_name; if (edac_op_state == EDAC_OPSTATE_POLL) diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c index e55e92590106..98d6dc7ef8e8 100644 --- a/drivers/edac/ppc4xx_edac.c +++ b/drivers/edac/ppc4xx_edac.c @@ -1063,7 +1063,6 @@ static int ppc4xx_edac_mc_init(struct mem_ctl_info *mci, /* Initialize strings */ mci->mod_name = PPC4XX_EDAC_MODULE_NAME; - mci->mod_ver = PPC4XX_EDAC_MODULE_REVISION; mci->ctl_name = ppc4xx_edac_match->compatible, mci->dev_name = np->full_name; diff --git a/drivers/edac/r82600_edac.c b/drivers/edac/r82600_edac.c index 978916625ced..851e53e122aa 100644 --- a/drivers/edac/r82600_edac.c +++ b/drivers/edac/r82600_edac.c @@ -22,7 +22,6 @@ #include #include "edac_module.h" -#define R82600_REVISION " Ver: 2.0.2" #define EDAC_MOD_STR "r82600_edac" #define r82600_printk(level, fmt, arg...) 
\ @@ -316,7 +315,6 @@ static int r82600_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = R82600_REVISION; mci->ctl_name = "R82600"; mci->dev_name = pci_name(pdev); mci->edac_check = r82600_check; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 80d860cb0746..687d0f23b9cc 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3125,7 +3125,6 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "sb_edac.c"; - mci->mod_ver = SBRIDGE_REVISION; mci->dev_name = pci_name(pdev); mci->ctl_page_to_phys = NULL; diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 64bef6c9cfb4..16dea97568a1 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -31,8 +31,6 @@ #include "edac_module.h" -#define SKX_REVISION " Ver: 1.0 " - /* * Debug macros */ @@ -473,7 +471,6 @@ static int skx_register_mci(struct skx_imc *imc) mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "skx_edac.c"; mci->dev_name = pci_name(imc->chan[0].cdev); - mci->mod_ver = SKX_REVISION; mci->ctl_page_to_phys = NULL; rc = skx_get_dimm_config(mci); diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 1c01dec78ec3..0c9c59e2b5a3 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -413,7 +413,6 @@ static int synps_edac_mc_init(struct mem_ctl_info *mci, mci->ctl_name = "synps_ddr_controller"; mci->dev_name = SYNPS_EDAC_MOD_STRING; mci->mod_name = SYNPS_EDAC_MOD_VER; - mci->mod_ver = "1"; edac_op_state = EDAC_OPSTATE_POLL; mci->edac_check = synps_edac_check; diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c index 2d352b40ae1c..c8e8b9fd4772 100644 --- a/drivers/edac/thunderx_edac.c +++ b/drivers/edac/thunderx_edac.c @@ -732,7 +732,6 @@ static int thunderx_lmc_probe(struct pci_dev *pdev, mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = "thunderx-lmc"; - mci->mod_ver = "1"; mci->ctl_name = "thunderx-lmc"; mci->dev_name = dev_name(&pdev->dev); mci->scrub_mode = SCRUB_NONE; diff --git a/drivers/edac/x38_edac.c b/drivers/edac/x38_edac.c index 03c97a4bf590..cc779f3f9e2d 100644 --- a/drivers/edac/x38_edac.c +++ b/drivers/edac/x38_edac.c @@ -18,8 +18,6 @@ #include #include "edac_module.h" -#define X38_REVISION "1.1" - #define EDAC_MOD_STR "x38_edac" #define PCI_DEVICE_ID_INTEL_X38_HB 0x29e0 @@ -357,7 +355,6 @@ static int x38_probe1(struct pci_dev *pdev, int dev_idx) mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = X38_REVISION; mci->ctl_name = x38_devs[dev_idx].ctl_name; mci->dev_name = pci_name(pdev); mci->edac_check = x38_check; diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 669246056812..e8b81d7ef61f 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -415,7 +415,6 @@ static int xgene_edac_mc_add(struct xgene_edac *edac, struct device_node *np) mci->edac_ctl_cap = EDAC_FLAG_SECDED; mci->edac_cap = EDAC_FLAG_SECDED; mci->mod_name = EDAC_MOD_STR; - mci->mod_ver = "0.1"; mci->ctl_page_to_phys = NULL; mci->scrub_cap = SCRUB_FLAG_HW_SRC; mci->scrub_mode = SCRUB_HW_SRC; diff --git a/include/linux/edac.h b/include/linux/edac.h index 8ae0f45fafd6..cd75c173fd00 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -619,7 +619,6 @@ struct mem_ctl_info { */ struct device *pdev; const char *mod_name; - const char *mod_ver; const char *ctl_name; const char 
*dev_name; void *pvt_info; -- cgit v1.2.3 From a5d31a3f81c6fb13b381951bf6163444c0257e8b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 15 Jun 2017 14:05:20 -0600 Subject: char_dev: extend dynamic allocation of majors into a higher range We've run into problems with running out of dynamically assigned char device majors, particularly on automated test systems with all-yes-configs. Roughly 40 dynamic assignments can be made with such kernels at this time while space is reserved for only 20. Currently, the kernel only prints a warning when dynamic allocation overflows the reserved region. And when this happens, drivers that have fixed assignments can randomly fail depending on the order of initialization of other drivers. Thus, adding a new char device can cause unexpected failures in completely unrelated parts of the kernel. This patch solves the problem by extending dynamic major number allocations down from 511 once the 234-254 region fills up. Fixed majors already exist above 255 so the infrastructure to support high number majors is already in place. The patch reserves an additional 128 major numbers which should hopefully last us a while. Kernels that don't require more than 20 dynamic majors assigned (which is pretty typical) should not be affected by this change. Signed-off-by: Logan Gunthorpe Cc: Linus Torvalds Cc: Alan Cox Cc: Arnd Bergmann Cc: Linus Walleij Link: https://lkml.org/lkml/2017/6/4/107 Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/devices.txt | 5 +++++ fs/char_dev.c | 41 ++++++++++++++++++++++++----------- include/linux/fs.h | 4 ++++ 3 files changed, 37 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index 6b71852dadc2..4ec843123cc3 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -3081,3 +3081,8 @@ 1 = /dev/osd1 Second OSD Device ... 255 = /dev/osd255 256th OSD Device + + 384-511 char RESERVED FOR DYNAMIC ASSIGNMENT + Character devices that request a dynamic allocation of major + number will take numbers starting from 511 and downward, + once the 234-254 range is full. diff --git a/fs/char_dev.c b/fs/char_dev.c index fb8507f521b2..c9d18362e89d 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -59,6 +59,29 @@ void chrdev_show(struct seq_file *f, off_t offset) #endif /* CONFIG_PROC_FS */ +static int find_dynamic_major(void) +{ + int i; + struct char_device_struct *cd; + + for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) { + if (chrdevs[i] == NULL) + return i; + } + + for (i = CHRDEV_MAJOR_DYN_EXT_START; + i > CHRDEV_MAJOR_DYN_EXT_END; i--) { + for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) + if (cd->major == i) + break; + + if (cd == NULL || cd->major != i) + return i; + } + + return -EBUSY; +} + /* * Register a single major with a specified minor range.
* @@ -84,22 +107,14 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, mutex_lock(&chrdevs_lock); - /* temporary */ if (major == 0) { - for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { - if (chrdevs[i] == NULL) - break; - } - - if (i < CHRDEV_MAJOR_DYN_END) - pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n", - name, i); - - if (i == 0) { - ret = -EBUSY; + ret = find_dynamic_major(); + if (ret < 0) { + pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", + name); goto out; } - major = i; + major = ret; } cd->major = major; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b5d6816542b..1773dcf1e318 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2473,6 +2473,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, #define CHRDEV_MAJOR_HASH_SIZE 255 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 +/* Marks the top and bottom of the second segment of free char majors */ +#define CHRDEV_MAJOR_DYN_EXT_START 511 +#define CHRDEV_MAJOR_DYN_EXT_END 384 + extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, -- cgit v1.2.3 From 8a932f73e5b4227bf787474b44dc70b6961d6246 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 15 Jun 2017 14:05:21 -0600 Subject: char_dev: order /proc/devices by major number Presently, the order of the char devices listed in /proc/devices is not entirely sequential. If a char device has a major number greater than CHRDEV_MAJOR_HASH_SIZE (255), it will be ordered as if its major were modulo 255. For example, 511 appears after 1. This patch cleans that up and prints each major number in the correct order, regardless of where they are stored in the hash table. In order to do this, we introduce CHRDEV_MAJOR_MAX as an artificial limit (chosen to be 511). It will then print all devices in major-number order from 0 to the maximum.
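To make the old behaviour concrete, here is a small stand-alone sketch (assuming major_to_index() is a plain modulo on CHRDEV_MAJOR_HASH_SIZE, as in fs/char_dev.c) showing why major 511 used to land in the same bucket as major 1:

	#include <stdio.h>

	#define CHRDEV_MAJOR_HASH_SIZE 255

	/* Mirrors the kernel's hash: majors wrap modulo 255. */
	static unsigned int major_to_index(unsigned int major)
	{
		return major % CHRDEV_MAJOR_HASH_SIZE;
	}

	int main(void)
	{
		printf("index(1)   = %u\n", major_to_index(1));   /* 1 */
		printf("index(511) = %u\n", major_to_index(511)); /* 1, same bucket */
		return 0;
	}
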
Signed-off-by: Logan Gunthorpe Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Alan Cox Cc: Arnd Bergmann Cc: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- fs/char_dev.c | 17 +++++++++++++---- fs/proc/devices.c | 8 ++++---- include/linux/fs.h | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/char_dev.c b/fs/char_dev.c index c9d18362e89d..ebcc8fb3fa66 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -28,6 +28,8 @@ static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); +#define CHRDEV_MAJOR_HASH_SIZE 255 + static struct char_device_struct { struct char_device_struct *next; unsigned int major; @@ -49,12 +51,12 @@ void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; - if (offset < CHRDEV_MAJOR_HASH_SIZE) { - mutex_lock(&chrdevs_lock); - for (cd = chrdevs[offset]; cd; cd = cd->next) + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { + if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); - mutex_unlock(&chrdevs_lock); } + mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ @@ -117,6 +119,13 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, major = ret; } + if (major >= CHRDEV_MAJOR_MAX) { + pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n", + name, major, CHRDEV_MAJOR_MAX); + ret = -EINVAL; + goto out; + } + cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 50493edc30e5..d196e22c4f1c 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -7,14 +7,14 @@ static int devinfo_show(struct seq_file *f, void *v) { int i = *(loff_t *) v; - if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i < CHRDEV_MAJOR_MAX) { if (i == 0) seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { - i -= CHRDEV_MAJOR_HASH_SIZE; + i -= CHRDEV_MAJOR_MAX; if (i == 0) seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); @@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v) static void *devinfo_start(struct seq_file *f, loff_t *pos) { - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX)) return pos; return NULL; } @@ -33,7 +33,7 @@ static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX)) return NULL; return pos; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 1773dcf1e318..b07433c335ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2470,7 +2470,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, #endif /* fs/char_dev.c */ -#define CHRDEV_MAJOR_HASH_SIZE 255 +#define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ -- cgit v1.2.3 From 133d55cdb2f1f9e258d6dc34594a6c565f10b3fd Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 16 Jun 2017 17:48:21 -0600 Subject: block: order /proc/devices by major number Presently, the order of the block devices listed in /proc/devices is not entirely sequential. If a block device has a major number greater than BLKDEV_MAJOR_HASH_SIZE (255), it will be ordered as if its major were modulo 255.
For example, 511 appears after 1. This patch cleans that up and prints each major number in the correct order, regardless of where they are stored in the hash table. In order to do this, we introduce BLKDEV_MAJOR_MAX as an artificial limit (chosen to be 512). It will then print all devices in major-number order from 0 to the maximum. Signed-off-by: Logan Gunthorpe Cc: Jens Axboe Cc: Jeff Layton Cc: "J. Bruce Fields" Signed-off-by: Greg Kroah-Hartman --- block/genhd.c | 18 +++++++++++++----- fs/proc/devices.c | 4 ++-- include/linux/fs.h | 4 ++-- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/block/genhd.c b/block/genhd.c index 7f520fa25d16..51c1d407d93c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -242,6 +242,7 @@ EXPORT_SYMBOL_GPL(disk_map_sector_rcu); * Can be deleted altogether. Later. * */ +#define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; @@ -259,12 +260,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - if (offset < BLKDEV_MAJOR_HASH_SIZE) { - mutex_lock(&block_class_lock); - for (dp = major_names[offset]; dp; dp = dp->next) + mutex_lock(&block_class_lock); + for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) + if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); - } + mutex_unlock(&block_class_lock); } #endif /* CONFIG_PROC_FS */ @@ -309,6 +309,14 @@ int register_blkdev(unsigned int major, const char *name) ret = major; } + if (major >= BLKDEV_MAJOR_MAX) { + pr_err("register_blkdev: major requested (%d) is greater than the maximum (%d) for %s\n", + major, BLKDEV_MAJOR_MAX, name); + + ret = -EINVAL; + goto out; + } + p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); if (p == NULL) { ret = -ENOMEM; diff --git a/fs/proc/devices.c b/fs/proc/devices.c index d196e22c4f1c..e5709343feb7 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v) static void *devinfo_start(struct seq_file *f, loff_t *pos) { - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX)) + if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return pos; return NULL; } @@ -33,7 +33,7 @@ static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX)) + if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return NULL; return pos; } diff --git a/include/linux/fs.h b/include/linux/fs.h index b07433c335ca..570dcc61fda6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2503,14 +2503,14 @@ static inline void unregister_chrdev(unsigned int major, const char *name) #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ #ifdef CONFIG_BLOCK -#define BLKDEV_MAJOR_HASH_SIZE 255 +#define BLKDEV_MAJOR_MAX 512 extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern void blkdev_show(struct seq_file *,off_t); #else -#define BLKDEV_MAJOR_HASH_SIZE 0 +#define BLKDEV_MAJOR_MAX 0 #endif extern void init_special_inode(struct inode *, umode_t, dev_t); -- cgit v1.2.3 From c2a737eb2ea5682ffe63bc08003965496d6dc088 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 29 Jun 2017 09:39:02 +0530 Subject: debugfs: Add dummy implementation of few helpers This adds (missing) dummy
implementations of debugfs_create_file_unsafe() and debugfs_create_ulong(). Signed-off-by: Viresh Kumar Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index aa86e6d8c1aa..b93efc8feecd 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -196,6 +196,14 @@ static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, return ERR_PTR(-ENODEV); } +static inline struct dentry *debugfs_create_file_unsafe(const char *name, + umode_t mode, struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_file_size(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops, @@ -289,6 +297,14 @@ static inline struct dentry *debugfs_create_u64(const char *name, umode_t mode, return ERR_PTR(-ENODEV); } +static inline struct dentry *debugfs_create_ulong(const char *name, + umode_t mode, + struct dentry *parent, + unsigned long *value) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) -- cgit v1.2.3 From 95c40f41cfaf34e1c07812e93aa4b3263f9953f3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 25 Jun 2017 12:30:25 -0700 Subject: vmbus: drop unused ring_buffer_info elements The elements ring_data_start_offset and priv_write_index are not used. Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- include/linux/hyperv.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b7d7bbec74e0..5e5f966bf37f 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -124,8 +124,6 @@ struct hv_ring_buffer_info { spinlock_t ring_lock; u32 ring_datasize; /* < ring_size */ - u32 ring_data_startoffset; - u32 priv_write_index; u32 priv_read_index; u32 cached_read_index; }; -- cgit v1.2.3 From 8dd45f2ab005a1f3301296059b23b03ec3dbf79b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 25 Jun 2017 12:30:26 -0700 Subject: vmbus: refactor hv_signal_on_read The function hv_signal_on_read was defined in hyperv.h and only used in one place in ring_buffer code. Clearer to just move it inline there. Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. 
Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/ring_buffer.c | 32 +++++++++++++++++++++++++++++-- include/linux/hyperv.h | 49 ------------------------------------------------ 2 files changed, 30 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index f29981764653..a9021f13379f 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "hyperv_vmbus.h" @@ -357,7 +358,7 @@ struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - /* set state for later hv_signal_on_read() */ + /* set state for later hv_pkt_iter_close */ rbi->cached_read_index = rbi->ring_buffer->read_index; if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) @@ -400,6 +401,8 @@ EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); void hv_pkt_iter_close(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; + u32 cur_write_sz, cached_write_sz; + u32 pending_sz; /* * Make sure all reads are done before we update the read index since @@ -409,6 +412,31 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) virt_rmb(); rbi->ring_buffer->read_index = rbi->priv_read_index; - hv_signal_on_read(channel); + /* + * Issue a full memory barrier before making the signaling decision. + * Here is the reason for having this barrier: + * If the reading of the pend_sz (in this function) + * were to be reordered and read before we commit the new read + * index (in the calling function) we could + * have a problem. If the host were to set the pending_sz after we + * have sampled pending_sz and go to sleep before we commit the + * read index, we could miss sending the interrupt. Issue a full + * memory barrier to address this. + */ + virt_mb(); + + pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); + /* If the other end is not blocked on write don't bother. */ + if (pending_sz == 0) + return; + + cur_write_sz = hv_get_bytes_to_write(rbi); + + if (cur_write_sz < pending_sz) + return; + + cached_write_sz = hv_get_cached_bytes_to_write(rbi); + if (cached_write_sz < pending_sz) + vmbus_setevent(channel); } EXPORT_SYMBOL_GPL(hv_pkt_iter_close); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 5e5f966bf37f..308e1f9706bb 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1471,55 +1471,6 @@ hv_get_ring_buffer(const struct hv_ring_buffer_info *ring_info) return ring_info->ring_buffer->buffer; } -/* - * To optimize the flow management on the send-side, - * when the sender is blocked because of lack of - * sufficient space in the ring buffer, potential the - * consumer of the ring buffer can signal the producer. - * This is controlled by the following parameters: - * - * 1. pending_send_sz: This is the size in bytes that the - * producer is trying to send. - * 2. The feature bit feat_pending_send_sz set to indicate if - * the consumer of the ring will signal when the ring - * state transitions from being full to a state where - * there is room for the producer to send the pending packet. - */ - -static inline void hv_signal_on_read(struct vmbus_channel *channel) -{ - u32 cur_write_sz, cached_write_sz; - u32 pending_sz; - struct hv_ring_buffer_info *rbi = &channel->inbound; - - /* - * Issue a full memory barrier before making the signaling decision. 
- * Here is the reason for having this barrier: - * If the reading of the pend_sz (in this function) - * were to be reordered and read before we commit the new read - * index (in the calling function) we could - * have a problem. If the host were to set the pending_sz after we - * have sampled pending_sz and go to sleep before we commit the - * read index, we could miss sending the interrupt. Issue a full - * memory barrier to address this. - */ - virt_mb(); - - pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz); - /* If the other end is not blocked on write don't bother. */ - if (pending_sz == 0) - return; - - cur_write_sz = hv_get_bytes_to_write(rbi); - - if (cur_write_sz < pending_sz) - return; - - cached_write_sz = hv_get_cached_bytes_to_write(rbi); - if (cached_write_sz < pending_sz) - vmbus_setevent(channel); -} - /* * Mask off host interrupt callback notifications */ -- cgit v1.2.3 From 05d00bc94ac27d220d8a78e365d7fa3a26dcca17 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 25 Jun 2017 12:30:27 -0700 Subject: vmbus: eliminate duplicate cached index Don't need cached read index anymore now that packet iterator is used. The iterator has the original read index until the visible read_index is updated. Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/ring_buffer.c | 17 ++++------------- include/linux/hyperv.h | 14 -------------- 2 files changed, 4 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index a9021f13379f..b0f79526b86a 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -358,9 +358,6 @@ struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - /* set state for later hv_pkt_iter_close */ - rbi->cached_read_index = rbi->ring_buffer->read_index; - if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) return NULL; @@ -388,10 +385,7 @@ __hv_pkt_iter_next(struct vmbus_channel *channel, rbi->priv_read_index -= dsize; /* more data? 
*/ - if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) - return NULL; - else - return hv_get_ring_buffer(rbi) + rbi->priv_read_index; + return hv_pkt_iter_first(channel); } EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); @@ -401,7 +395,7 @@ EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); void hv_pkt_iter_close(struct vmbus_channel *channel) { struct hv_ring_buffer_info *rbi = &channel->inbound; - u32 cur_write_sz, cached_write_sz; + u32 orig_write_sz = hv_get_bytes_to_write(rbi); u32 pending_sz; /* @@ -430,13 +424,10 @@ void hv_pkt_iter_close(struct vmbus_channel *channel) if (pending_sz == 0) return; - cur_write_sz = hv_get_bytes_to_write(rbi); - - if (cur_write_sz < pending_sz) + if (hv_get_bytes_to_write(rbi) < pending_sz) return; - cached_write_sz = hv_get_cached_bytes_to_write(rbi); - if (cached_write_sz < pending_sz) + if (orig_write_sz < pending_sz) vmbus_setevent(channel); } EXPORT_SYMBOL_GPL(hv_pkt_iter_close); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 308e1f9706bb..27db4e650b8c 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -125,7 +125,6 @@ struct hv_ring_buffer_info { u32 ring_datasize; /* < ring_size */ u32 priv_read_index; - u32 cached_read_index; }; /* @@ -178,19 +177,6 @@ static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi) return write; } -static inline u32 hv_get_cached_bytes_to_write( - const struct hv_ring_buffer_info *rbi) -{ - u32 read_loc, write_loc, dsize, write; - - dsize = rbi->ring_datasize; - read_loc = rbi->cached_read_index; - write_loc = rbi->ring_buffer->write_index; - - write = write_loc >= read_loc ? dsize - (write_loc - read_loc) : - read_loc - write_loc; - return write; -} /* * VMBUS version is 32 bit entity broken up into * two 16 bit quantities: major_number. minor_number. -- cgit v1.2.3 From b9045b9c6b79d2ea5b888a0f9e9aa156d09ba8a9 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 12 Jul 2017 13:37:00 -0500 Subject: ASoC: tlv320aic32x4: Add gpio configuration to the codec Add the ability to configure the MFP1->MFP5 registers as GPIOs. In addition, ALSA controls are added to get and set the GPIO state. Per the data sheet, each MFP can be configured as a GPIO input only, output only or either an input or output. Signed-off-by: Dan Murphy Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/tlv320aic32x4.txt | 9 + include/sound/tlv320aic32x4.h | 23 +++ sound/soc/codecs/tlv320aic32x4.c | 206 +++++++++++++++++++++ sound/soc/codecs/tlv320aic32x4.h | 3 + 4 files changed, 241 insertions(+) (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/tlv320aic32x4.txt b/Documentation/devicetree/bindings/sound/tlv320aic32x4.txt index 9651fc7503a4..ca75890f0d07 100644 --- a/Documentation/devicetree/bindings/sound/tlv320aic32x4.txt +++ b/Documentation/devicetree/bindings/sound/tlv320aic32x4.txt @@ -20,6 +20,8 @@ Optional properties: - reset-gpios: Reset-GPIO phandle with args as described in gpio/gpio.txt - clocks/clock-names: Clock named 'mclk' for the master clock of the codec. See clock/clock-bindings.txt for information about the detailed format.
+ - aic32x4-gpio-func - + - Types are defined in include/sound/tlv320aic32x4.h Example: @@ -29,4 +31,11 @@ codec: tlv320aic32x4@18 { reg = <0x18>; clocks = <&clks 201>; clock-names = "mclk"; + aic32x4-gpio-func= < + 0xff /* AIC32X4_MFPX_DEFAULT_VALUE */ + 0xff /* AIC32X4_MFPX_DEFAULT_VALUE */ + 0x04 /* MFP3 AIC32X4_MFP3_GPIO_ENABLED */ + 0xff /* AIC32X4_MFPX_DEFAULT_VALUE */ + 0x08 /* MFP5 AIC32X4_MFP5_GPIO_INPUT */ + >; }; diff --git a/include/sound/tlv320aic32x4.h b/include/sound/tlv320aic32x4.h index 24e5d991f148..22305c0ab31a 100644 --- a/include/sound/tlv320aic32x4.h +++ b/include/sound/tlv320aic32x4.h @@ -22,7 +22,30 @@ #define AIC32X4_MICPGA_ROUTE_LMIC_IN2R_10K 0x00000001 #define AIC32X4_MICPGA_ROUTE_RMIC_IN1L_10K 0x00000002 +/* GPIO API */ +#define AIC32X4_MFPX_DEFAULT_VALUE 0xff + +#define AIC32X4_MFP1_DIN_DISABLED 0 +#define AIC32X4_MFP1_DIN_ENABLED 0x2 +#define AIC32X4_MFP1_GPIO_IN 0x4 + +#define AIC32X4_MFP2_GPIO_OUT_LOW 0x0 +#define AIC32X4_MFP2_GPIO_OUT_HIGH 0x1 + +#define AIC32X4_MFP_GPIO_ENABLED 0x4 + +#define AIC32X4_MFP5_GPIO_DISABLED 0x0 +#define AIC32X4_MFP5_GPIO_INPUT 0x8 +#define AIC32X4_MFP5_GPIO_OUTPUT 0xc +#define AIC32X4_MFP5_GPIO_OUT_LOW 0x0 +#define AIC32X4_MFP5_GPIO_OUT_HIGH 0x1 + +struct aic32x4_setup_data { + unsigned int gpio_func[5]; +}; + struct aic32x4_pdata { + struct aic32x4_setup_data *setup; u32 power_cfg; u32 micpga_routing; bool swapdacs; diff --git a/sound/soc/codecs/tlv320aic32x4.c b/sound/soc/codecs/tlv320aic32x4.c index 28fdfc5ec544..300ee2890df3 100644 --- a/sound/soc/codecs/tlv320aic32x4.c +++ b/sound/soc/codecs/tlv320aic32x4.c @@ -74,6 +74,152 @@ struct aic32x4_priv { struct regulator *supply_iov; struct regulator *supply_dv; struct regulator *supply_av; + + struct aic32x4_setup_data *setup; + struct device *dev; +}; + +static int aic32x4_get_mfp1_gpio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + u8 val; + + val = snd_soc_read(codec, AIC32X4_DINCTL); + + ucontrol->value.integer.value[0] = (val & 0x01); + + return 0; +}; + +static int aic32x4_set_mfp2_gpio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + u8 val; + u8 gpio_check; + + val = snd_soc_read(codec, AIC32X4_DOUTCTL); + gpio_check = (val & AIC32X4_MFP_GPIO_ENABLED); + if (gpio_check != AIC32X4_MFP_GPIO_ENABLED) { + printk(KERN_ERR "%s: MFP2 is not configure as a GPIO output\n", + __func__); + return -EINVAL; + } + + if (ucontrol->value.integer.value[0] == (val & AIC32X4_MFP2_GPIO_OUT_HIGH)) + return 0; + + if (ucontrol->value.integer.value[0]) + val |= ucontrol->value.integer.value[0]; + else + val &= ~AIC32X4_MFP2_GPIO_OUT_HIGH; + + snd_soc_write(codec, AIC32X4_DOUTCTL, val); + + return 0; +}; + +static int aic32x4_get_mfp3_gpio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + u8 val; + + val = snd_soc_read(codec, AIC32X4_SCLKCTL); + + ucontrol->value.integer.value[0] = (val & 0x01); + + return 0; +}; + +static int aic32x4_set_mfp4_gpio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + u8 val; + u8 gpio_check; + + val = snd_soc_read(codec, AIC32X4_MISOCTL); + gpio_check = (val & AIC32X4_MFP_GPIO_ENABLED); + if (gpio_check != AIC32X4_MFP_GPIO_ENABLED) { + printk(KERN_ERR "%s: MFP4 is not configure as a GPIO output\n", + __func__); + return 
-EINVAL; + } + + if (ucontrol->value.integer.value[0] == (val & AIC32X4_MFP5_GPIO_OUT_HIGH)) + return 0; + + if (ucontrol->value.integer.value[0]) + val |= ucontrol->value.integer.value[0]; + else + val &= ~AIC32X4_MFP5_GPIO_OUT_HIGH; + + snd_soc_write(codec, AIC32X4_MISOCTL, val); + + return 0; +}; + +static int aic32x4_get_mfp5_gpio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + u8 val; + + val = snd_soc_read(codec, AIC32X4_GPIOCTL); + ucontrol->value.integer.value[0] = ((val & 0x2) >> 1); + + return 0; +}; + +static int aic32x4_set_mfp5_gpio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol); + u8 val; + u8 gpio_check; + + val = snd_soc_read(codec, AIC32X4_GPIOCTL); + gpio_check = (val & AIC32X4_MFP5_GPIO_OUTPUT); + if (gpio_check != AIC32X4_MFP5_GPIO_OUTPUT) { + printk(KERN_ERR "%s: MFP5 is not configure as a GPIO output\n", + __func__); + return -EINVAL; + } + + if (ucontrol->value.integer.value[0] == (val & 0x1)) + return 0; + + if (ucontrol->value.integer.value[0]) + val |= ucontrol->value.integer.value[0]; + else + val &= 0xfe; + + snd_soc_write(codec, AIC32X4_GPIOCTL, val); + + return 0; +}; + +static const struct snd_kcontrol_new aic32x4_mfp1[] = { + SOC_SINGLE_BOOL_EXT("MFP1 GPIO", 0, aic32x4_get_mfp1_gpio, NULL), +}; + +static const struct snd_kcontrol_new aic32x4_mfp2[] = { + SOC_SINGLE_BOOL_EXT("MFP2 GPIO", 0, NULL, aic32x4_set_mfp2_gpio), +}; + +static const struct snd_kcontrol_new aic32x4_mfp3[] = { + SOC_SINGLE_BOOL_EXT("MFP3 GPIO", 0, aic32x4_get_mfp3_gpio, NULL), +}; + +static const struct snd_kcontrol_new aic32x4_mfp4[] = { + SOC_SINGLE_BOOL_EXT("MFP4 GPIO", 0, NULL, aic32x4_set_mfp4_gpio), +}; + +static const struct snd_kcontrol_new aic32x4_mfp5[] = { + SOC_SINGLE_BOOL_EXT("MFP5 GPIO", 0, aic32x4_get_mfp5_gpio, + aic32x4_set_mfp5_gpio), }; /* 0dB min, 0.5dB steps */ @@ -734,6 +880,52 @@ static struct snd_soc_dai_driver aic32x4_dai = { .symmetric_rates = 1, }; +static void aic32x4_setup_gpios(struct snd_soc_codec *codec) +{ + struct aic32x4_priv *aic32x4 = snd_soc_codec_get_drvdata(codec); + + /* setup GPIO functions */ + /* MFP1 */ + if (aic32x4->setup->gpio_func[0] != AIC32X4_MFPX_DEFAULT_VALUE) { + snd_soc_write(codec, AIC32X4_DINCTL, + aic32x4->setup->gpio_func[0]); + snd_soc_add_codec_controls(codec, aic32x4_mfp1, + ARRAY_SIZE(aic32x4_mfp1)); + } + + /* MFP2 */ + if (aic32x4->setup->gpio_func[1] != AIC32X4_MFPX_DEFAULT_VALUE) { + snd_soc_write(codec, AIC32X4_DOUTCTL, + aic32x4->setup->gpio_func[1]); + snd_soc_add_codec_controls(codec, aic32x4_mfp2, + ARRAY_SIZE(aic32x4_mfp2)); + } + + /* MFP3 */ + if (aic32x4->setup->gpio_func[2] != AIC32X4_MFPX_DEFAULT_VALUE) { + snd_soc_write(codec, AIC32X4_SCLKCTL, + aic32x4->setup->gpio_func[2]); + snd_soc_add_codec_controls(codec, aic32x4_mfp3, + ARRAY_SIZE(aic32x4_mfp3)); + } + + /* MFP4 */ + if (aic32x4->setup->gpio_func[3] != AIC32X4_MFPX_DEFAULT_VALUE) { + snd_soc_write(codec, AIC32X4_MISOCTL, + aic32x4->setup->gpio_func[3]); + snd_soc_add_codec_controls(codec, aic32x4_mfp4, + ARRAY_SIZE(aic32x4_mfp4)); + } + + /* MFP5 */ + if (aic32x4->setup->gpio_func[4] != AIC32X4_MFPX_DEFAULT_VALUE) { + snd_soc_write(codec, AIC32X4_GPIOCTL, + aic32x4->setup->gpio_func[4]); + snd_soc_add_codec_controls(codec, aic32x4_mfp5, + ARRAY_SIZE(aic32x4_mfp5)); + } +} + static int aic32x4_codec_probe(struct snd_soc_codec *codec) { struct aic32x4_priv *aic32x4 = 
snd_soc_codec_get_drvdata(codec); @@ -746,6 +938,9 @@ static int aic32x4_codec_probe(struct snd_soc_codec *codec) snd_soc_write(codec, AIC32X4_RESET, 0x01); + if (aic32x4->setup) + aic32x4_setup_gpios(codec); + /* Power platform configuration */ if (aic32x4->power_cfg & AIC32X4_PWR_MICBIAS_2075_LDOIN) { snd_soc_write(codec, AIC32X4_MICBIAS, AIC32X4_MICBIAS_LDOIN | @@ -810,10 +1005,20 @@ static struct snd_soc_codec_driver soc_codec_dev_aic32x4 = { static int aic32x4_parse_dt(struct aic32x4_priv *aic32x4, struct device_node *np) { + struct aic32x4_setup_data *aic32x4_setup; + + aic32x4_setup = devm_kzalloc(aic32x4->dev, sizeof(*aic32x4_setup), + GFP_KERNEL); + if (!aic32x4_setup) + return -ENOMEM; + aic32x4->swapdacs = false; aic32x4->micpga_routing = 0; aic32x4->rstn_gpio = of_get_named_gpio(np, "reset-gpios", 0); + if (of_property_read_u32_array(np, "aic32x4-gpio-func", + aic32x4_setup->gpio_func, 5) >= 0) + aic32x4->setup = aic32x4_setup; return 0; } @@ -932,6 +1137,7 @@ int aic32x4_probe(struct device *dev, struct regmap *regmap) if (aic32x4 == NULL) return -ENOMEM; + aic32x4->dev = dev; dev_set_drvdata(dev, aic32x4); if (pdata) { diff --git a/sound/soc/codecs/tlv320aic32x4.h b/sound/soc/codecs/tlv320aic32x4.h index a197dd51addc..da7cec482bcb 100644 --- a/sound/soc/codecs/tlv320aic32x4.h +++ b/sound/soc/codecs/tlv320aic32x4.h @@ -44,8 +44,11 @@ int aic32x4_remove(struct device *dev); #define AIC32X4_IFACE4 31 #define AIC32X4_IFACE5 32 #define AIC32X4_IFACE6 33 +#define AIC32X4_GPIOCTL 52 #define AIC32X4_DOUTCTL 53 #define AIC32X4_DINCTL 54 +#define AIC32X4_MISOCTL 55 +#define AIC32X4_SCLKCTL 56 #define AIC32X4_DACSPB 60 #define AIC32X4_ADCSPB 61 #define AIC32X4_DACSETUP 63 -- cgit v1.2.3 From 68f6be656179d1f0399d77918b3e361dad420455 Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Wed, 14 Jun 2017 10:36:27 -0500 Subject: fpga: Add flag to indicate SPI bitstream is bit-reversed Add a flag that is passed to the write_init() callback, indicating that the SPI bitstream starts with LSB first. SPI controllers usually send data with MSB first. If an FPGA expects bitstream data as LSB first, the data must be reversed either by the SPI controller or by the driver. Alternatively the bitstream could be prepared as bit-reversed to avoid the bit-swapping while sending. This flag indicates such bit-reversed SPI bitstream. The low-level driver will deal with the flag and perform bit-reversing if needed. 
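As an illustration, a low-level SPI driver's write_init() could simply latch the flag and let its write() callback bit-reverse each word on the way out. The following is a minimal sketch, not code from this patch: the example_priv structure, the function name, and the assumption that the SPI controller cannot reorder bits itself are all hypothetical.

#include <linux/fpga/fpga-mgr.h>

struct example_priv {
	bool lsb_first;	/* bitstream arrives LSB first */
};

static int example_spi_write_init(struct fpga_manager *mgr,
				  struct fpga_image_info *info,
				  const char *buf, size_t count)
{
	struct example_priv *priv = mgr->priv;

	/* Remember the bit order; write() can then swap each 32-bit
	 * word (e.g. with the bitrev8x4() helper added later in this
	 * series) before handing data to the SPI controller.
	 */
	priv->lsb_first = !!(info->flags & FPGA_MGR_BITSTREAM_LSB_FIRST);

	return 0;
}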
Signed-off-by: Anatolij Gustschin Signed-off-by: Joshua Clayton Signed-off-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- include/linux/fpga/fpga-mgr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index b4ac24c4411d..01c348ca38b7 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -67,10 +67,12 @@ enum fpga_mgr_states { * FPGA Manager flags * FPGA_MGR_PARTIAL_RECONFIG: do partial reconfiguration if supported * FPGA_MGR_EXTERNAL_CONFIG: FPGA has been configured prior to Linux booting + * FPGA_MGR_BITSTREAM_LSB_FIRST: SPI bitstream bit order is LSB first */ #define FPGA_MGR_PARTIAL_RECONFIG BIT(0) #define FPGA_MGR_EXTERNAL_CONFIG BIT(1) #define FPGA_MGR_ENCRYPTED_BITSTREAM BIT(2) +#define FPGA_MGR_BITSTREAM_LSB_FIRST BIT(3) /** * struct fpga_image_info - information specific to a FPGA image -- cgit v1.2.3 From 3b88da4aba2549497ac8868877562f1b1913ca62 Mon Sep 17 00:00:00 2001 From: Joshua Clayton Date: Wed, 14 Jun 2017 10:36:31 -0500 Subject: lib: add bitrev8x4() Add a function to reverse bytes within a 32 bit word. Operate on a u32 rather than individual bytes. Signed-off-by: Joshua Clayton Signed-off-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- include/linux/bitrev.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index fb790b8449c1..b97be27e5a85 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -29,6 +29,8 @@ static inline u32 __bitrev32(u32 x) #endif /* CONFIG_HAVE_ARCH_BITREVERSE */ +#define __bitrev8x4(x) (__bitrev32(swab32(x))) + #define __constant_bitrev32(x) \ ({ \ u32 __x = x; \ @@ -50,6 +52,15 @@ static inline u32 __bitrev32(u32 x) __x; \ }) +#define __constant_bitrev8x4(x) \ +({ \ + u32 __x = x; \ + __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ + __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ + __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ + __x; \ +}) + #define __constant_bitrev8(x) \ ({ \ u8 __x = x; \ @@ -75,6 +86,14 @@ static inline u32 __bitrev32(u32 x) __bitrev16(__x); \ }) +#define bitrev8x4(x) \ +({ \ + u32 __x = x; \ + __builtin_constant_p(__x) ? \ + __constant_bitrev8x4(__x) : \ + __bitrev8x4(__x); \ + }) + #define bitrev8(x) \ ({ \ u8 __x = x; \ -- cgit v1.2.3 From b37fa56069ce61d97a77dadca8dd65e522db3387 Mon Sep 17 00:00:00 2001 From: Anatolij Gustschin Date: Wed, 14 Jun 2017 10:36:34 -0500 Subject: fpga: Add flag to indicate bitstream needs decompression Add a flag that is passed to the write_init() callback, indicating that the bitstream is compressed. The low-level driver will deal with the flag, or return an error, if compressed bitstreams are not supported. 
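A driver without a hardware decompression engine would typically refuse such an image up front in write_init(). A minimal sketch follows; the function name and the lack-of-decompression assumption are illustrative only, not part of this patch:

#include <linux/errno.h>
#include <linux/fpga/fpga-mgr.h>

static int example_write_init(struct fpga_manager *mgr,
			      struct fpga_image_info *info,
			      const char *buf, size_t count)
{
	/* This hypothetical device cannot decompress in hardware, so
	 * reject compressed bitstreams before any data is written.
	 */
	if (info->flags & FPGA_MGR_COMPRESSED_BITSTREAM)
		return -EINVAL;

	return 0;
}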
Signed-off-by: Anatolij Gustschin Reviewed-by: Andy Shevchenko Signed-off-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- include/linux/fpga/fpga-mgr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index 01c348ca38b7..bfa14bc023fb 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -68,11 +68,13 @@ enum fpga_mgr_states { * FPGA_MGR_PARTIAL_RECONFIG: do partial reconfiguration if supported * FPGA_MGR_EXTERNAL_CONFIG: FPGA has been configured prior to Linux booting * FPGA_MGR_BITSTREAM_LSB_FIRST: SPI bitstream bit order is LSB first + * FPGA_MGR_COMPRESSED_BITSTREAM: FPGA bitstream is compressed */ #define FPGA_MGR_PARTIAL_RECONFIG BIT(0) #define FPGA_MGR_EXTERNAL_CONFIG BIT(1) #define FPGA_MGR_ENCRYPTED_BITSTREAM BIT(2) #define FPGA_MGR_BITSTREAM_LSB_FIRST BIT(3) +#define FPGA_MGR_COMPRESSED_BITSTREAM BIT(4) /** * struct fpga_image_info - information specific to a FPGA image -- cgit v1.2.3 From 805df2966f67a6b1a228c8e580e230b6c849b41e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 23 Jun 2017 14:55:32 +0530 Subject: arch_topology: Change return type of topology_parse_cpu_capacity() to bool topology_parse_cpu_capacity() returns 1 on success and 0 on errors. Make it return bool instead of int as that suits the purpose better. Signed-off-by: Viresh Kumar Reviewed-by: Juri Lelli Tested-by: Juri Lelli Signed-off-by: Greg Kroah-Hartman --- drivers/base/arch_topology.c | 8 ++++---- include/linux/arch_topology.h | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index a3cd7c869c3e..5728e2fbb765 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -119,13 +119,13 @@ void topology_normalize_cpu_scale(void) mutex_unlock(&cpu_scale_mutex); } -int __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) +bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) { - int ret = 1; + int ret; u32 cpu_capacity; if (cap_parsing_failed) - return !ret; + return false; ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz", &cpu_capacity); @@ -137,7 +137,7 @@ int __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) if (!raw_capacity) { pr_err("cpu_capacity: failed to allocate memory for raw capacities\n"); cap_parsing_failed = true; - return 0; + return false; } } capacity_scale = max(cpu_capacity, capacity_scale); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 9af3c174c03a..716ce587247e 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -4,10 +4,12 @@ #ifndef _LINUX_ARCH_TOPOLOGY_H_ #define _LINUX_ARCH_TOPOLOGY_H_ +#include + void topology_normalize_cpu_scale(void); struct device_node; -int topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); +bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); struct sched_domain; unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu); -- cgit v1.2.3 From 27eac47b00789522ba00501b0838026e1ecb6f05 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Mon, 17 Jul 2017 11:35:54 +0200 Subject: net/unix: drop obsolete fd-recursion limits All unix sockets now account inflight FDs to the respective sender. 
This was introduced in: commit 712f4aad406bb1ed67f3f98d04c044191f0ff593 Author: willy tarreau Date: Sun Jan 10 07:54:56 2016 +0100 unix: properly account for FDs passed over unix sockets and further refined in: commit 415e3d3e90ce9e18727e8843ae343eda5a58fad6 Author: Hannes Frederic Sowa Date: Wed Feb 3 02:11:03 2016 +0100 unix: correctly track in-flight fds in sending process user_struct Hence, regardless of the stacking depth of FDs, the total number of inflight FDs is limited, and accounted. There is no known way for a local user to exceed those limits or exploit the accounting. Furthermore, the GC logic is independent of the recursion/stacking depth as well. It solely depends on the total number of inflight FDs, regardless of their layout. Lastly, the current `recursion_level' suffers a TOCTOU race, since it checks and inherits depths only at queue time. If we consider `A<-B' to mean `queue-B-on-A', the following sequence circumvents the recursion level easily: A<-B B<-C C<-D ... Y<-Z resulting in: A<-B<-C<-...<-Z With all of this in mind, lets drop the recursion limit. It has no additional security value, anymore. On the contrary, it randomly confuses message brokers that try to forward file-descriptors, since any sendmsg(2) call can fail spuriously with ETOOMANYREFS if a client maliciously modifies the FD while inflight. Cc: Alban Crequy Cc: Simon McVittie Signed-off-by: David Herrmann Reviewed-by: Tom Gundersen Signed-off-by: David S. Miller --- include/net/af_unix.h | 1 - net/unix/af_unix.c | 24 +----------------------- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 678e4d6fa317..3b3194b2fc65 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -58,7 +58,6 @@ struct unix_sock { struct list_head link; atomic_long_t inflight; spinlock_t lock; - unsigned char recursion_level; unsigned long gc_flags; #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7b52a380d710..5c53f22d62e8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1528,26 +1528,13 @@ static inline bool too_many_unix_fds(struct task_struct *p) return false; } -#define MAX_RECURSION_LEVEL 4 - static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; - unsigned char max_level = 0; if (too_many_unix_fds(current)) return -ETOOMANYREFS; - for (i = scm->fp->count - 1; i >= 0; i--) { - struct sock *sk = unix_get_socket(scm->fp->fp[i]); - - if (sk) - max_level = max(max_level, - unix_sk(sk)->recursion_level); - } - if (unlikely(max_level > MAX_RECURSION_LEVEL)) - return -ETOOMANYREFS; - /* * Need to duplicate file references for the sake of garbage * collection. 
Otherwise a socket in the fps might become a @@ -1559,7 +1546,7 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); - return max_level; + return 0; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1649,7 +1636,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct sk_buff *skb; long timeo; struct scm_cookie scm; - int max_level; int data_len = 0; int sk_locked; @@ -1701,7 +1687,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = unix_scm_to_skb(&scm, skb, true); if (err < 0) goto out_free; - max_level = err + 1; skb_put(skb, len - data_len); skb->data_len = data_len; @@ -1819,8 +1804,6 @@ restart_locked: __net_timestamp(skb); maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); - if (max_level > unix_sk(other)->recursion_level) - unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other); sock_put(other); @@ -1855,7 +1838,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int sent = 0; struct scm_cookie scm; bool fds_sent = false; - int max_level; int data_len; wait_for_unix_gc(); @@ -1905,7 +1887,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, kfree_skb(skb); goto out_err; } - max_level = err + 1; fds_sent = true; skb_put(skb, size - data_len); @@ -1925,8 +1906,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, maybe_add_creds(skb, sock, other); skb_queue_tail(&other->sk_receive_queue, skb); - if (max_level > unix_sk(other)->recursion_level) - unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2324,7 +2303,6 @@ redo: last_len = last ? last->len : 0; again: if (skb == NULL) { - unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; -- cgit v1.2.3 From b145425f269a17ed344d737f746b844dfac60c82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Jul 2017 02:56:10 -0700 Subject: inetpeer: remove AVL implementation in favor of RB tree As discussed in Faro during Netfilter Workshop 2017, RB trees can be used with RCU, using a seqlock. Note that net/rxrpc/conn_service.c is already using this. This patch converts inetpeer from AVL tree to RB tree, since it allows to remove private AVL implementation in favor of shared RB code. $ size net/ipv4/inetpeer.before net/ipv4/inetpeer.after text data bss dec hex filename 3195 40 128 3363 d23 net/ipv4/inetpeer.before 1562 24 0 1586 632 net/ipv4/inetpeer.after The same technique can be used to speed up net/netfilter/nft_set_rbtree.c (removing rwlock contention in fast path) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/inetpeer.h | 11 +- net/ipv4/inetpeer.c | 428 +++++++++++-------------------------------------- 2 files changed, 92 insertions(+), 347 deletions(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index f2a215fc78e4..950ed182f62f 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -33,18 +33,12 @@ struct inetpeer_addr { }; struct inet_peer { - /* group together avl_left,avl_right,v4daddr to speedup lookups */ - struct inet_peer __rcu *avl_left, *avl_right; + struct rb_node rb_node; struct inetpeer_addr daddr; - __u32 avl_height; u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ unsigned long rate_last; - union { - struct list_head gc_list; - struct rcu_head gc_rcu; - }; /* * Once inet_peer is queued for deletion (refcnt == 0), following field * is not available: rid @@ -55,7 +49,6 @@ struct inet_peer { atomic_t rid; /* Frag reception counter */ }; struct rcu_head rcu; - struct inet_peer *gc_next; }; /* following fields might be frequently dirtied */ @@ -64,7 +57,7 @@ struct inet_peer { }; struct inet_peer_base { - struct inet_peer __rcu *root; + struct rb_root rb_root; seqlock_t lock; int total; }; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index c5a117cc6619..337ad41bb80a 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -33,7 +33,7 @@ * also be removed if the pool is overloaded i.e. if the total amount of * entries is greater-or-equal than the threshold. * - * Node pool is organised as an AVL tree. + * Node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long living nodes in a single hash slot would significantly delay @@ -45,7 +45,7 @@ * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: - * avl_left, avl_right, avl_parent, avl_height: pool lock + * rb_node: pool lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable @@ -53,30 +53,15 @@ static struct kmem_cache *peer_cachep __read_mostly; -static LIST_HEAD(gc_list); -static const int gc_delay = 60 * HZ; -static struct delayed_work gc_work; -static DEFINE_SPINLOCK(gc_lock); - -#define node_height(x) x->avl_height - -#define peer_avl_empty ((struct inet_peer *)&peer_fake_node) -#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) -static const struct inet_peer peer_fake_node = { - .avl_left = peer_avl_empty_rcu, - .avl_right = peer_avl_empty_rcu, - .avl_height = 0 -}; - void inet_peer_base_init(struct inet_peer_base *bp) { - bp->root = peer_avl_empty_rcu; + bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_SYMBOL_GPL(inet_peer_base_init); -#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ +#define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. 
*/ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more @@ -84,53 +69,6 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ -static void inetpeer_gc_worker(struct work_struct *work) -{ - struct inet_peer *p, *n, *c; - struct list_head list; - - spin_lock_bh(&gc_lock); - list_replace_init(&gc_list, &list); - spin_unlock_bh(&gc_lock); - - if (list_empty(&list)) - return; - - list_for_each_entry_safe(p, n, &list, gc_list) { - - if (need_resched()) - cond_resched(); - - c = rcu_dereference_protected(p->avl_left, 1); - if (c != peer_avl_empty) { - list_add_tail(&c->gc_list, &list); - p->avl_left = peer_avl_empty_rcu; - } - - c = rcu_dereference_protected(p->avl_right, 1); - if (c != peer_avl_empty) { - list_add_tail(&c->gc_list, &list); - p->avl_right = peer_avl_empty_rcu; - } - - n = list_entry(p->gc_list.next, struct inet_peer, gc_list); - - if (refcount_read(&p->refcnt) == 1) { - list_del(&p->gc_list); - kmem_cache_free(peer_cachep, p); - } - } - - if (list_empty(&list)) - return; - - spin_lock_bh(&gc_lock); - list_splice(&list, &gc_list); - spin_unlock_bh(&gc_lock); - - schedule_delayed_work(&gc_work, gc_delay); -} - /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { @@ -153,225 +91,62 @@ void __init inet_initpeers(void) sizeof(struct inet_peer), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - - INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -#define rcu_deref_locked(X, BASE) \ - rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) - -/* - * Called with local BH disabled and the pool lock held. - */ -#define lookup(_daddr, _stack, _base) \ -({ \ - struct inet_peer *u; \ - struct inet_peer __rcu **v; \ - \ - stackptr = _stack; \ - *stackptr++ = &_base->root; \ - for (u = rcu_deref_locked(_base->root, _base); \ - u != peer_avl_empty;) { \ - int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ - if (cmp == 0) \ - break; \ - if (cmp == -1) \ - v = &u->avl_left; \ - else \ - v = &u->avl_right; \ - *stackptr++ = v; \ - u = rcu_deref_locked(*v, _base); \ - } \ - u; \ -}) - -/* - * Called with rcu_read_lock() - * Because we hold no lock against a writer, its quite possible we fall - * in an endless loop. - * But every pointer we follow is guaranteed to be valid thanks to RCU. 
- * We exit from this function if number of links exceeds PEER_MAXDEPTH - */ -static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) +/* Called with rcu_read_lock() or base->lock held */ +static struct inet_peer *lookup(const struct inetpeer_addr *daddr, + struct inet_peer_base *base, + unsigned int seq, + struct inet_peer *gc_stack[], + unsigned int *gc_cnt, + struct rb_node **parent_p, + struct rb_node ***pp_p) { - struct inet_peer *u = rcu_dereference(base->root); - int count = 0; + struct rb_node **pp, *parent; + struct inet_peer *p; + + pp = &base->rb_root.rb_node; + parent = NULL; + while (*pp) { + int cmp; - while (u != peer_avl_empty) { - int cmp = inetpeer_addr_cmp(daddr, &u->daddr); + parent = rcu_dereference_raw(*pp); + p = rb_entry(parent, struct inet_peer, rb_node); + cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - /* Before taking a reference, check if this entry was - * deleted (refcnt=0) - */ - if (!refcount_inc_not_zero(&u->refcnt)) { - u = NULL; - } - return u; + if (!refcount_inc_not_zero(&p->refcnt)) + break; + return p; + } + if (gc_stack) { + if (*gc_cnt < PEER_MAX_GC) + gc_stack[(*gc_cnt)++] = p; + } else if (unlikely(read_seqretry(&base->lock, seq))) { + break; } if (cmp == -1) - u = rcu_dereference(u->avl_left); + pp = &(*pp)->rb_left; else - u = rcu_dereference(u->avl_right); - if (unlikely(++count == PEER_MAXDEPTH)) - break; + pp = &(*pp)->rb_right; } + *parent_p = parent; + *pp_p = pp; return NULL; } -/* Called with local BH disabled and the pool lock held. */ -#define lookup_rightempty(start, base) \ -({ \ - struct inet_peer *u; \ - struct inet_peer __rcu **v; \ - *stackptr++ = &start->avl_left; \ - v = &start->avl_left; \ - for (u = rcu_deref_locked(*v, base); \ - u->avl_right != peer_avl_empty_rcu;) { \ - v = &u->avl_right; \ - *stackptr++ = v; \ - u = rcu_deref_locked(*v, base); \ - } \ - u; \ -}) - -/* Called with local BH disabled and the pool lock held. - * Variable names are the proof of operation correctness. - * Look into mm/map_avl.c for more detail description of the ideas. 
- */ -static void peer_avl_rebalance(struct inet_peer __rcu **stack[], - struct inet_peer __rcu ***stackend, - struct inet_peer_base *base) -{ - struct inet_peer __rcu **nodep; - struct inet_peer *node, *l, *r; - int lh, rh; - - while (stackend > stack) { - nodep = *--stackend; - node = rcu_deref_locked(*nodep, base); - l = rcu_deref_locked(node->avl_left, base); - r = rcu_deref_locked(node->avl_right, base); - lh = node_height(l); - rh = node_height(r); - if (lh > rh + 1) { /* l: RH+2 */ - struct inet_peer *ll, *lr, *lrl, *lrr; - int lrh; - ll = rcu_deref_locked(l->avl_left, base); - lr = rcu_deref_locked(l->avl_right, base); - lrh = node_height(lr); - if (lrh <= node_height(ll)) { /* ll: RH+1 */ - RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ - RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ - node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ - RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ - l->avl_height = node->avl_height + 1; - RCU_INIT_POINTER(*nodep, l); - } else { /* ll: RH, lr: RH+1 */ - lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ - lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ - RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ - RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ - node->avl_height = rh + 1; /* node: RH+1 */ - RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ - RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ - l->avl_height = rh + 1; /* l: RH+1 */ - RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ - RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ - lr->avl_height = rh + 2; - RCU_INIT_POINTER(*nodep, lr); - } - } else if (rh > lh + 1) { /* r: LH+2 */ - struct inet_peer *rr, *rl, *rlr, *rll; - int rlh; - rr = rcu_deref_locked(r->avl_right, base); - rl = rcu_deref_locked(r->avl_left, base); - rlh = node_height(rl); - if (rlh <= node_height(rr)) { /* rr: LH+1 */ - RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ - RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ - node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ - RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ - r->avl_height = node->avl_height + 1; - RCU_INIT_POINTER(*nodep, r); - } else { /* rr: RH, rl: RH+1 */ - rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ - rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ - RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ - RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ - node->avl_height = lh + 1; /* node: LH+1 */ - RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ - RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ - r->avl_height = lh + 1; /* r: LH+1 */ - RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ - RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ - rl->avl_height = lh + 2; - RCU_INIT_POINTER(*nodep, rl); - } - } else { - node->avl_height = (lh > rh ? lh : rh) + 1; - } - } -} - -/* Called with local BH disabled and the pool lock held. 
*/ -#define link_to_pool(n, base) \ -do { \ - n->avl_height = 1; \ - n->avl_left = peer_avl_empty_rcu; \ - n->avl_right = peer_avl_empty_rcu; \ - /* lockless readers can catch us now */ \ - rcu_assign_pointer(**--stackptr, n); \ - peer_avl_rebalance(stack, stackptr, base); \ -} while (0) - static void inetpeer_free_rcu(struct rcu_head *head) { kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); } -static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base, - struct inet_peer __rcu **stack[PEER_MAXDEPTH]) -{ - struct inet_peer __rcu ***stackptr, ***delp; - - if (lookup(&p->daddr, stack, base) != p) - BUG(); - delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty_rcu) { - *delp[0] = p->avl_right; - --stackptr; - } else { - /* look for a node to insert instead of p */ - struct inet_peer *t; - t = lookup_rightempty(p, base); - BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); - **--stackptr = t->avl_left; - /* t is removed, t->daddr > x->daddr for any - * x in p->avl_left subtree. - * Put t in the old place of p. */ - RCU_INIT_POINTER(*delp[0], t); - t->avl_left = p->avl_left; - t->avl_right = p->avl_right; - t->avl_height = p->avl_height; - BUG_ON(delp[1] != &p->avl_left); - delp[1] = &t->avl_left; /* was &p->avl_left */ - } - peer_avl_rebalance(stack, stackptr, base); - base->total--; - call_rcu(&p->rcu, inetpeer_free_rcu); -} - /* perform garbage collect on all items stacked during a lookup */ -static int inet_peer_gc(struct inet_peer_base *base, - struct inet_peer __rcu **stack[PEER_MAXDEPTH], - struct inet_peer __rcu ***stackptr) +static void inet_peer_gc(struct inet_peer_base *base, + struct inet_peer *gc_stack[], + unsigned int gc_cnt) { - struct inet_peer *p, *gchead = NULL; + struct inet_peer *p; __u32 delta, ttl; - int cnt = 0; + int i; if (base->total >= inet_peer_threshold) ttl = 0; /* be aggressive */ @@ -379,43 +154,38 @@ static int inet_peer_gc(struct inet_peer_base *base, ttl = inet_peer_maxttl - (inet_peer_maxttl - inet_peer_minttl) / HZ * base->total / inet_peer_threshold * HZ; - stackptr--; /* last stack slot is peer_avl_empty */ - while (stackptr > stack) { - stackptr--; - p = rcu_deref_locked(**stackptr, base); - if (refcount_read(&p->refcnt) == 1) { - smp_rmb(); - delta = (__u32)jiffies - p->dtime; - if (delta >= ttl && refcount_dec_if_one(&p->refcnt)) { - p->gc_next = gchead; - gchead = p; - } - } + for (i = 0; i < gc_cnt; i++) { + p = gc_stack[i]; + delta = (__u32)jiffies - p->dtime; + if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) + gc_stack[i] = NULL; } - while ((p = gchead) != NULL) { - gchead = p->gc_next; - cnt++; - unlink_from_pool(p, base, stack); + for (i = 0; i < gc_cnt; i++) { + p = gc_stack[i]; + if (p) { + rb_erase(&p->rb_node, &base->rb_root); + base->total--; + call_rcu(&p->rcu, inetpeer_free_rcu); + } } - return cnt; } struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create) { - struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer *p; - unsigned int sequence; - int invalidated, gccnt = 0; + struct inet_peer *p, *gc_stack[PEER_MAX_GC]; + struct rb_node **pp, *parent; + unsigned int gc_cnt, seq; + int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. 
*/ rcu_read_lock(); - sequence = read_seqbegin(&base->lock); - p = lookup_rcu(daddr, base); - invalidated = read_seqretry(&base->lock, sequence); + seq = read_seqbegin(&base->lock); + p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); + invalidated = read_seqretry(&base->lock, seq); rcu_read_unlock(); if (p) @@ -428,36 +198,31 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ + parent = NULL; write_seqlock_bh(&base->lock); -relookup: - p = lookup(daddr, stack, base); - if (p != peer_avl_empty) { - refcount_inc(&p->refcnt); - write_sequnlock_bh(&base->lock); - return p; - } - if (!gccnt) { - gccnt = inet_peer_gc(base, stack, stackptr); - if (gccnt && create) - goto relookup; - } - p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; - if (p) { - p->daddr = *daddr; - refcount_set(&p->refcnt, 2); - atomic_set(&p->rid, 0); - p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; - p->rate_tokens = 0; - /* 60*HZ is arbitrary, but chosen enough high so that the first - * calculation of tokens is at its maximum. - */ - p->rate_last = jiffies - 60*HZ; - INIT_LIST_HEAD(&p->gc_list); - /* Link the node. */ - link_to_pool(p, base); - base->total++; + gc_cnt = 0; + p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); + if (!p && create) { + p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); + if (p) { + p->daddr = *daddr; + refcount_set(&p->refcnt, 2); + atomic_set(&p->rid, 0); + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + /* 60*HZ is arbitrary, but chosen enough high so that the first + * calculation of tokens is at its maximum. + */ + p->rate_last = jiffies - 60*HZ; + + rb_link_node(&p->rb_node, parent, pp); + rb_insert_color(&p->rb_node, &base->rb_root); + base->total++; + } } + if (gc_cnt) + inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; @@ -467,8 +232,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { p->dtime = (__u32)jiffies; - smp_mb__before_atomic(); - refcount_dec(&p->refcnt); + + if (refcount_dec_and_test(&p->refcnt)) + call_rcu(&p->rcu, inetpeer_free_rcu); } EXPORT_SYMBOL_GPL(inet_putpeer); @@ -513,30 +279,16 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) } EXPORT_SYMBOL(inet_peer_xrlim_allow); -static void inetpeer_inval_rcu(struct rcu_head *head) -{ - struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); - - spin_lock_bh(&gc_lock); - list_add_tail(&p->gc_list, &gc_list); - spin_unlock_bh(&gc_lock); - - schedule_delayed_work(&gc_work, gc_delay); -} - void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *root; - - write_seqlock_bh(&base->lock); + struct inet_peer *p, *n; - root = rcu_deref_locked(base->root, base); - if (root != peer_avl_empty) { - base->root = peer_avl_empty_rcu; - base->total = 0; - call_rcu(&root->gc_rcu, inetpeer_inval_rcu); + rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { + inet_putpeer(p); + cond_resched(); } - write_sequnlock_bh(&base->lock); + base->rb_root = RB_ROOT; + base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); -- cgit v1.2.3 From 814abfabef3ceed390c10d06a0cc69a86454b6cf Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:27:07 -0700 Subject: xdp: add bpf_redirect helper function This adds support for a bpf_redirect helper function to the XDP infrastructure. For now this only supports redirecting to the egress path of a port. 
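From the program side the helper looks like the existing cls/act bpf_redirect() usage. A minimal sketch of an XDP program using it (the egress ifindex value 4 and the samples-style "bpf_helpers.h" header are assumptions for illustration, not part of this patch):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("xdp_redirect_sample")
int xdp_redirect_prog(struct xdp_md *ctx)
{
	/* flags must be zero for now; a non-zero value yields XDP_ABORTED */
	return bpf_redirect(4, 0);
}

char _license[] SEC("license") = "GPL";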
In order to support drivers handling an xdp_buff natively, this patch uses a new ndo operation, ndo_xdp_xmit(), that pushes an xdp_buff to the specified device. If the program specifies either (a) an unknown device or (b) a device that does not support the operation, a BPF warning is thrown and the XDP_ABORTED error code is returned. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 4 ++++ include/linux/netdevice.h | 6 ++++++ include/uapi/linux/bpf.h | 1 + net/core/filter.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index bfef1e5734f8..64cae7a08148 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -711,7 +711,11 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp); + void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_redirect(u32 ifindex); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 779b23595596..77f5376005e6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -66,6 +66,7 @@ struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; struct bpf_prog; +struct xdp_buff; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -1138,6 +1139,9 @@ struct xfrmdev_ops { * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); * This function is used to set or query state related to XDP on the * netdevice. See definition of enum xdp_netdev_command for details. + * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + * This function is used to submit a XDP packet for transmit on a + * netdevice. 
* */ struct net_device_ops { @@ -1323,6 +1327,8 @@ struct net_device_ops { int needed_headroom); int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp); + int (*ndo_xdp_xmit)(struct net_device *dev, + struct xdp_buff *xdp); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e99e3e6f8b37..4dbb7a3f4677 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -717,6 +717,7 @@ enum xdp_action { XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook diff --git a/net/core/filter.c b/net/core/filter.c index c7f737058d89..d606a66d1040 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2412,6 +2412,51 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) +{ + if (dev->netdev_ops->ndo_xdp_xmit) { + dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + return 0; + } + bpf_warn_invalid_xdp_redirect(dev->ifindex); + return -EOPNOTSUPP; +} + +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + bpf_warn_invalid_xdp_redirect(ri->ifindex); + return -EINVAL; + } + + return __bpf_tx_xdp(dev, xdp); +} +EXPORT_SYMBOL_GPL(xdp_do_redirect); + +BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + ri->ifindex = ifindex; + ri->flags = flags; + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_xdp_redirect_proto = { + .func = bpf_xdp_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -3011,6 +3056,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_redirect: + return &bpf_xdp_redirect_proto; default: return bpf_base_func_proto(func_id); } @@ -3310,6 +3357,11 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +void bpf_warn_invalid_xdp_redirect(u32 ifindex) +{ + WARN_ONCE(1, "Illegal XDP redirect to unsupported device ifindex(%i)\n", ifindex); +} + static bool __is_valid_sock_ops_access(int off, int size) { if (off < 0 || off >= sizeof(struct bpf_sock_ops)) -- cgit v1.2.3 From 6103aa96ec077c976e851e0b89cc2446cb76573d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:27:50 -0700 Subject: net: implement XDP_REDIRECT for xdp generic Add support for redirect to xdp generic creating a fall back for devices that do not yet have support and allowing test infrastructure using veth pairs to be built. Signed-off-by: John Fastabend Tested-by: Andy Gospodarek Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/linux/filter.h | 1 + net/core/dev.c | 22 ++++++++++++++++++++-- net/core/filter.c | 26 ++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 64cae7a08148..10df7daf5ec6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,6 +712,7 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp); void bpf_warn_invalid_xdp_action(u32 act); diff --git a/net/core/dev.c b/net/core/dev.c index a1ed7b41b3e8..9f3f4083ada5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3902,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, -off); switch (act) { + case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); /* fall through */ @@ -3956,14 +3957,27 @@ static int do_xdp_generic(struct sk_buff *skb) if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); + int err; if (act != XDP_PASS) { - if (act == XDP_TX) + switch (act) { + case XDP_REDIRECT: + err = xdp_do_generic_redirect(skb->dev, skb); + if (err) + goto out_redir; + /* fallthru to submit skb */ + case XDP_TX: generic_xdp_tx(skb, xdp_prog); + break; + } return XDP_DROP; } } return XDP_PASS; +out_redir: + trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); + kfree_skb(skb); + return XDP_DROP; } static int netif_rx_internal(struct sk_buff *skb) @@ -3977,8 +3991,12 @@ static int netif_rx_internal(struct sk_buff *skb) if (static_key_false(&generic_xdp_needed)) { int ret = do_xdp_generic(skb); + /* Consider XDP consuming the packet a success from + * the netdev point of view we do not want to count + * this as an error. + */ if (ret != XDP_PASS) - return NET_RX_DROP; + return NET_RX_SUCCESS; } #ifdef CONFIG_RPS diff --git a/net/core/filter.c b/net/core/filter.c index d606a66d1040..eeb713461c25 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2437,6 +2437,32 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_do_redirect); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + unsigned int len; + + dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + ri->ifindex = 0; + if (unlikely(!dev)) { + bpf_warn_invalid_xdp_redirect(ri->ifindex); + goto err; + } + + if (unlikely(!(dev->flags & IFF_UP))) + goto err; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len > len) + goto err; + + skb->dev = dev; + return 0; +err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); + BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); -- cgit v1.2.3 From 5acaee0a8964c9bab7775ab8bedcd1f66a2a1011 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:28:35 -0700 Subject: xdp: add trace event for xdp redirect This adds a trace event for xdp redirect which may help when debugging XDP programs that use redirect bpf commands. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- include/linux/filter.h | 4 +++- include/trace/events/xdp.h | 31 ++++++++++++++++++++++++++- net/core/filter.c | 13 +++++++---- 4 files changed, 43 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3db04736a048..38f7ff97d636 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2232,7 +2232,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, result = ixgbe_xmit_xdp_ring(adapter, xdp); break; case XDP_REDIRECT: - err = xdp_do_redirect(adapter->netdev, xdp); + err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); if (!err) result = IXGBE_XDP_TX; else diff --git a/include/linux/filter.h b/include/linux/filter.h index 10df7daf5ec6..ce8211fa91c7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -713,7 +713,9 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp); +int xdp_do_redirect(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *prog); void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_redirect(u32 ifindex); diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 1b61357d3f57..7b1eb7b4be41 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -12,7 +12,8 @@ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ - FN(TX) + FN(TX) \ + FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); @@ -48,6 +49,34 @@ TRACE_EVENT(xdp_exception, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB)) ); +TRACE_EVENT(xdp_redirect, + + TP_PROTO(const struct net_device *from, + const struct net_device *to, + const struct bpf_prog *xdp, u32 act), + + TP_ARGS(from, to, xdp, act), + + TP_STRUCT__entry( + __string(name_from, from->name) + __string(name_to, to->name) + __array(u8, prog_tag, 8) + __field(u32, act) + ), + + TP_fast_assign( + BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); + memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); + __assign_str(name_from, from->name); + __assign_str(name_to, to->name); + __entry->act = act; + ), + + TP_printk("prog=%s from=%s to=%s action=%s", + __print_hex_str(__entry->prog_tag, 8), + __get_str(name_from), __get_str(name_to), + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB)) +); #endif /* _TRACE_XDP_H */ #include diff --git a/net/core/filter.c b/net/core/filter.c index eeb713461c25..e30d38b27f21 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -55,6 +55,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -2422,18 +2423,22 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) return -EOPNOTSUPP; } -int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp) +int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct net_device *fwd; - dev = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); + fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); ri->ifindex = 0; - if (unlikely(!dev)) { + if (unlikely(!fwd)) { bpf_warn_invalid_xdp_redirect(ri->ifindex); return -EINVAL; } - return __bpf_tx_xdp(dev, xdp); + 
trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + + return __bpf_tx_xdp(fwd, xdp); } EXPORT_SYMBOL_GPL(xdp_do_redirect); -- cgit v1.2.3 From 546ac1ffb70d25b56c1126940e5ec639c4dd7413 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:28:56 -0700 Subject: bpf: add devmap, a map for storing net device references Device map (devmap) is a BPF map, primarily useful for networking applications, that uses a key to lookup a reference to a netdevice. The map provides a clean way for BPF programs to build virtual port to physical port maps. Additionally, it provides a scoping function for the redirect action itself allowing multiple optimizations. Future patches will leverage the map to provide batching at the XDP layer. Another optimization/feature, that is not yet implemented, would be to support multiple netdevices per key to support efficient multicast and broadcast support. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/devmap.c | 264 ++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 8 + tools/testing/selftests/bpf/test_maps.c | 15 ++ 6 files changed, 294 insertions(+) create mode 100644 kernel/bpf/devmap.c (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 3d137c33d664..b1e1035ca24b 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -35,3 +35,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) +#ifdef CONFIG_NET +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4dbb7a3f4677..ecbb0e7e15bc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -104,6 +104,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1e5e658f2db..48e92705be59 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,6 +2,9 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +ifeq ($(CONFIG_NET),y) +obj-$(CONFIG_BPF_SYSCALL) += devmap.o +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c new file mode 100644 index 000000000000..1a878356bd37 --- /dev/null +++ b/kernel/bpf/devmap.c @@ -0,0 +1,264 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* Devmaps primary use is as a backend map for XDP BPF helper call + * bpf_redirect_map(). 
Because XDP is mostly concerned with performance we + * spent some effort to ensure the datapath with redirect maps does not use + * any locking. This is a quick note on the details. + * + * We have three possible paths to get into the devmap control plane bpf + * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall + * will invoke an update, delete, or lookup operation. To ensure updates and + * deletes appear atomic from the datapath side xchg() is used to modify the + * netdev_map array. Then because the datapath does a lookup into the netdev_map + * array (read-only) from an RCU critical section we use call_rcu() to wait for + * an rcu grace period before free'ing the old data structures. This ensures the + * datapath always has a valid copy. However, the datapath does a "flush" + * operation that pushes any pending packets in the driver outside the RCU + * critical section. Each bpf_dtab_netdev tracks these pending operations using + * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed + * until all bits are cleared indicating outstanding flush operations have + * completed. + * + * BPF syscalls may race with BPF program calls on any of the update, delete + * or lookup operations. As noted above the xchg() operation also keep the + * netdev_map consistent in this case. From the devmap side BPF programs + * calling into these operations are the same as multiple user space threads + * making system calls. + */ +#include +#include +#include +#include +#include "percpu_freelist.h" +#include "bpf_lru_list.h" +#include "map_in_map.h" + +struct bpf_dtab_netdev { + struct net_device *dev; + int key; + struct rcu_head rcu; + struct bpf_dtab *dtab; +}; + +struct bpf_dtab { + struct bpf_map map; + struct bpf_dtab_netdev **netdev_map; +}; + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + u64 cost; + int err; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + dtab->map.map_type = attr->map_type; + dtab->map.key_size = attr->key_size; + dtab->map.value_size = attr->value_size; + dtab->map.max_entries = attr->max_entries; + dtab->map.map_flags = attr->map_flags; + + err = -ENOMEM; + + /* make sure page count doesn't overflow */ + cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_dtab; + + dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(dtab->map.pages); + if (err) + goto free_dtab; + + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *)); + if (!dtab->netdev_map) + goto free_dtab; + + return &dtab->map; + +free_dtab: + kfree(dtab); + return ERR_PTR(err); +} + +static void dev_map_free(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further reads against netdev_map. It does __not__ ensure pending + * flush operations (if any) are complete. + */ + synchronize_rcu(); + + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + dev_put(dev->dev); + kfree(dev); + } + + /* At this point bpf program is detached and all pending operations + * _must_ be complete + */ + bpf_map_area_free(dtab->netdev_map); + kfree(dtab); +} + +static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= dtab->map.max_entries) { + *next = 0; + return 0; + } + + if (index == dtab->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or + * update happens in parallel here a dev_put wont happen until after reading the + * ifindex. + */ +static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + u32 i = *(u32 *)key; + + if (i >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[i]); + return dev ? &dev->dev->ifindex : NULL; +} + +static void __dev_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_dtab_netdev *old_dev; + + old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_put(old_dev->dev); + kfree(old_dev); +} + +static int dev_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + /* Use synchronize_rcu() here to ensure any rcu critical sections + * have completed, but this does not guarantee a flush has happened + * yet. Because driver side rcu_read_lock/unlock only protects the + * running XDP program. However, for pending flush operations the + * dev and ctx are stored in another per cpu map. And additionally, + * the driver tear down ensures all soft irqs are complete before + * removing the net device in the case of dev_put equals zero. + */ + old_dev = xchg(&dtab->netdev_map[k], NULL); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + return 0; +} + +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct net *net = current->nsproxy->net_ns; + struct bpf_dtab_netdev *dev, *old_dev; + u32 i = *(u32 *)key; + u32 ifindex = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= dtab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + if (!ifindex) { + dev = NULL; + } else { + dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + if (!dev) + return -ENOMEM; + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + kfree(dev); + return -EINVAL; + } + + dev->key = i; + dev->dtab = dtab; + } + + /* Use call_rcu() here to ensure rcu critical sections have completed + * Remembering the driver side flush operation will happen before the + * net device is removed. 
+ */ + old_dev = xchg(&dtab->netdev_map[i], dev); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +const struct bpf_map_ops dev_map_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_get_next_key, + .map_lookup_elem = dev_map_lookup_elem, + .map_update_elem = dev_map_update_elem, + .map_delete_elem = dev_map_delete_elem, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..4016774d5cca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1276,6 +1276,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + /* devmap returns a pointer to a live net_device ifindex that we cannot + * allow to be modified from bpf side. So do not allow lookup elements + * for now. + */ + case BPF_MAP_TYPE_DEVMAP: + if (func_id == BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 79601c81e169..36d6ac3f0c1c 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -438,6 +438,21 @@ static void test_arraymap_percpu_many_keys(void) close(fd); } +static void test_devmap(int task, void *data) +{ + int next_key, fd; + __u32 key, value; + + fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), + 2, 0); + if (fd < 0) { + printf("Failed to create arraymap '%s'!\n", strerror(errno)); + exit(1); + } + + close(fd); +} + #define MAP_SIZE (32 * 1024) static void test_map_large(void) -- cgit v1.2.3 From 97f91a7cf04ff605845c20948b8a80e54cbd3376 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:18 -0700 Subject: bpf: add bpf_redirect_map helper routine BPF programs can use the devmap with a bpf_redirect_map() helper routine to forward packets to netdevice in map. Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/devmap.c | 12 +++++++++++ kernel/bpf/verifier.c | 4 ++++ net/core/filter.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b69e7a5869ff..d0d3281ac678 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -379,4 +379,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +/* Map specifics */ +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ecbb0e7e15bc..1106a8c4cd36 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,11 @@ union bpf_attr { * @flags: bit 0 - if set, redirect to ingress instead of egress * other bits - reserved * Return: TC_ACT_REDIRECT + * int bpf_redirect_map(key, map, flags) + * redirect to endpoint in map + * @key: index in map to lookup + * @map: fd of map to do lookup in + * @flags: -- * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -592,7 +597,8 @@ union bpf_attr { FN(get_socket_uid), \ FN(set_hash), \ FN(setsockopt), \ - FN(skb_adjust_room), + FN(skb_adjust_room), \ + FN(redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1a878356bd37..36dc13deb2e1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,6 +159,18 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + + if (key >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[key]); + return dev ? dev->dev : NULL; +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
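For illustration, a minimal XDP program driving the new helper could look
like the sketch below (samples/bpf-style map definition; the map name,
section names and index are hypothetical and not part of this patch):

    #include <uapi/linux/bpf.h>
    #include "bpf_helpers.h"

    /* DEVMAP populated from user space with <index, ifindex> pairs. */
    struct bpf_map_def SEC("maps") tx_port = {
            .type        = BPF_MAP_TYPE_DEVMAP,
            .key_size    = sizeof(__u32),
            .value_size  = sizeof(__u32),
            .max_entries = 64,
    };

    SEC("xdp_redirect_map")
    int xdp_prog(struct xdp_md *ctx)
    {
            /* XDP_REDIRECT on success, XDP_ABORTED if flags are set. */
            return bpf_redirect_map(&tx_port, 0, 0);
    }
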
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4016774d5cca..df05d65f0c87 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1312,6 +1312,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; + case BPF_FUNC_redirect_map: + if (map->map_type != BPF_MAP_TYPE_DEVMAP) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index e30d38b27f21..e93a558324b5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1779,6 +1779,7 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { struct redirect_info { u32 ifindex; u32 flags; + struct bpf_map *map; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1792,6 +1793,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; return TC_ACT_REDIRECT; } @@ -1819,6 +1821,29 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + ri->ifindex = ifindex; + ri->flags = flags; + ri->map = map; + + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_map_proto = { + .func = bpf_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2423,14 +2448,39 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) return -EOPNOTSUPP; } +int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map; + struct net_device *fwd; + int err = -EINVAL; + + ri->ifindex = 0; + ri->map = NULL; + + fwd = __dev_map_lookup_elem(map, ri->ifindex); + if (!fwd) + goto out; + + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + err = __bpf_tx_xdp(fwd, xdp); +out: + return err; +} + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct net_device *fwd; + if (ri->map) + return xdp_do_redirect_map(dev, xdp, xdp_prog); + fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); ri->ifindex = 0; + ri->map = NULL; if (unlikely(!fwd)) { bpf_warn_invalid_xdp_redirect(ri->ifindex); return -EINVAL; @@ -3089,6 +3139,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_adjust_head_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; + case BPF_FUNC_redirect_map: + return &bpf_redirect_map_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 11393cc9b9be2a1f61559e6fb9c27bc8fa20b1ff Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:40 -0700 Subject: xdp: Add batching support to redirect map For performance reasons we want to avoid updating the tail pointer in the driver tx ring as much as possible. To accomplish this we add batching support to the redirect path in XDP. This adds another ndo op "xdp_flush" that is used to inform the driver that it should bump the tail pointer on the TX ring. 
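The expected driver calling pattern, sketched here with hypothetical
example_* names, is one flush per napi poll cycle instead of one tail
pointer write per packet:

    /* Sketch only, not a real driver. xdp_do_redirect() marks the devmap
     * entry pending; a single xdp_do_flush_map() at the end of the batch
     * invokes ndo_xdp_flush() on every netdev with pending frames.
     */
    static int example_poll(struct example_ring *rx_ring, int budget)
    {
            bool flush = false;

            while (budget--) {
                    struct xdp_buff xdp;

                    /* ... build xdp from the next RX descriptor ... */
                    if (example_run_xdp(rx_ring, &xdp) == XDP_REDIRECT)
                            flush = true;
            }

            if (flush)  /* one tail bump for the whole batch */
                    xdp_do_flush_map();

            return 0;
    }
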
Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 28 ++++++++- include/linux/bpf.h | 2 + include/linux/filter.h | 7 +++ include/linux/netdevice.h | 5 +- kernel/bpf/devmap.c | 84 ++++++++++++++++++++++++++- net/core/filter.c | 55 ++++++++++++++---- 6 files changed, 166 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 38f7ff97d636..0f867dcda65f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2415,6 +2415,8 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, */ wmb(); writel(ring->next_to_use, ring->tail); + + xdp_do_flush_map(); } u64_stats_update_begin(&rx_ring->syncp); @@ -5817,6 +5819,9 @@ void ixgbe_down(struct ixgbe_adapter *adapter) usleep_range(10000, 20000); + /* synchronize_sched() needed for pending XDP buffers to drain */ + if (adapter->xdp_ring[0]) + synchronize_sched(); netif_tx_stop_all_queues(netdev); /* call carrier off first to avoid false dev_watchdog timeouts */ @@ -9850,15 +9855,31 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) if (err != IXGBE_XDP_TX) return -ENOMEM; + return 0; +} + +static void ixgbe_xdp_flush(struct net_device *dev) +{ + struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_ring *ring; + + /* Its possible the device went down between xdp xmit and flush so + * we need to ensure device is still up. + */ + if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) + return; + + ring = adapter->xdp_prog ? adapter->xdp_ring[smp_processor_id()] : NULL; + if (unlikely(!ring)) + return; + /* Force memory writes to complete before letting h/w know there * are new descriptors to fetch. */ wmb(); - - ring = adapter->xdp_ring[smp_processor_id()]; writel(ring->next_to_use, ring->tail); - return 0; + return; } static const struct net_device_ops ixgbe_netdev_ops = { @@ -9908,6 +9929,7 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_features_check = ixgbe_features_check, .ndo_xdp = ixgbe_xdp, .ndo_xdp_xmit = ixgbe_xdp_xmit, + .ndo_xdp_flush = ixgbe_xdp_flush, }; /** diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0d3281ac678..6850a760dc94 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -381,5 +381,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +void __dev_map_insert_ctx(struct bpf_map *map, u32 index); +void __dev_map_flush(struct bpf_map *map); #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index ce8211fa91c7..3323ee91c172 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,10 +712,17 @@ bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the + * same cpu context. Further for best results no more than a single map + * for the do_redirect/do_flush pair should be used. This limitation is + * because we only track one map and force a flush when the map changes. + * This does not appear to be a real limiation for existing software. 
+ */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); +void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_redirect(u32 ifindex); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 77f5376005e6..03b104908235 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1142,7 +1142,9 @@ struct xfrmdev_ops { * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * This function is used to submit a XDP packet for transmit on a * netdevice. - * + * void (*ndo_xdp_flush)(struct net_device *dev); + * This function is used to inform the driver to flush a paticular + * xpd tx queue. Must be called on same CPU as xdp_xmit. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1329,6 +1331,7 @@ struct net_device_ops { struct netdev_xdp *xdp); int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); + void (*ndo_xdp_flush)(struct net_device *dev); }; /** diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 36dc13deb2e1..b2ef04a1c86a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -53,6 +53,7 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; + unsigned long int __percpu *flush_needed; }; static struct bpf_map *dev_map_alloc(union bpf_attr *attr) @@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); if (cost >= U32_MAX - PAGE_SIZE) goto free_dtab; @@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (err) goto free_dtab; + /* A per cpu bitfield with a bit per possible net device */ + dtab->flush_needed = __alloc_percpu( + BITS_TO_LONGS(attr->max_entries) * + sizeof(unsigned long), + __alignof__(unsigned long)); + if (!dtab->flush_needed) + goto free_dtab; + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *)); if (!dtab->netdev_map) @@ -105,6 +115,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) return &dtab->map; free_dtab: + free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -112,7 +123,7 @@ free_dtab: static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + int i, cpu; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map) */ synchronize_rcu(); + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. 
+ */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + + while (!bitmap_empty(bitmap, dtab->map.max_entries)) + cpu_relax(); + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +void __dev_map_insert_ctx(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); + + __set_bit(key, bitmap); +} + struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) return dev ? dev->dev : NULL; } +/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled + * from the driver before returning from its napi->poll() routine. The poll() + * routine is called either from busy_poll context or net_rx_action signaled + * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the + * net device can be torn down. On devmap tear down we ensure the ctx bitmap + * is zeroed before completing to ensure all flush operations have completed. + */ +void __dev_map_flush(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); + u32 bit; + + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); + struct net_device *netdev; + + /* This is possible if the dev entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!dev)) + continue; + + netdev = dev->dev; + + __clear_bit(bit, bitmap); + if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush)) + continue; + + netdev->netdev_ops->ndo_xdp_flush(netdev); + } +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. @@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key) return dev ? 
&dev->dev->ifindex : NULL; } +static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev) +{ + if (old_dev->dev->netdev_ops->ndo_xdp_flush) { + struct net_device *fl = old_dev->dev; + unsigned long *bitmap; + int cpu; + + for_each_online_cpu(cpu) { + bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu); + __clear_bit(old_dev->key, bitmap); + + fl->netdev_ops->ndo_xdp_flush(old_dev->dev); + } + } +} + static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *old_dev; old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_map_flush_old(old_dev); dev_put(old_dev->dev); kfree(old_dev); } diff --git a/net/core/filter.c b/net/core/filter.c index e93a558324b5..e23aa6fa1119 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1780,6 +1780,7 @@ struct redirect_info { u32 ifindex; u32 flags; struct bpf_map *map; + struct bpf_map *map_to_flush; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2438,34 +2439,68 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) +static int __bpf_tx_xdp(struct net_device *dev, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) { - if (dev->netdev_ops->ndo_xdp_xmit) { - dev->netdev_ops->ndo_xdp_xmit(dev, xdp); - return 0; + int err; + + if (!dev->netdev_ops->ndo_xdp_xmit) { + bpf_warn_invalid_xdp_redirect(dev->ifindex); + return -EOPNOTSUPP; } - bpf_warn_invalid_xdp_redirect(dev->ifindex); - return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; + + if (map) + __dev_map_insert_ctx(map, index); + else + dev->netdev_ops->ndo_xdp_flush(dev); + + return err; } +void xdp_do_flush_map(void) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map_to_flush; + + ri->map = NULL; + ri->map_to_flush = NULL; + + if (map) + __dev_map_flush(map); +} +EXPORT_SYMBOL_GPL(xdp_do_flush_map); + int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct bpf_map *map = ri->map; + u32 index = ri->ifindex; struct net_device *fwd; int err = -EINVAL; ri->ifindex = 0; ri->map = NULL; - fwd = __dev_map_lookup_elem(map, ri->ifindex); + fwd = __dev_map_lookup_elem(map, index); if (!fwd) goto out; - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); - err = __bpf_tx_xdp(fwd, xdp); + if (ri->map_to_flush && (ri->map_to_flush != map)) + xdp_do_flush_map(); + + err = __bpf_tx_xdp(fwd, map, xdp, index); + if (likely(!err)) + ri->map_to_flush = map; + out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); return err; } @@ -2488,7 +2523,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); - return __bpf_tx_xdp(fwd, xdp); + return __bpf_tx_xdp(fwd, NULL, xdp, 0); } EXPORT_SYMBOL_GPL(xdp_do_redirect); -- cgit v1.2.3 From 2ddf71e23cc246e95af72a6deed67b4a50a7b81c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:30:02 -0700 Subject: net: add notifier hooks for devmap bpf map The BPF map devmap holds a refcnt on the net_device structure when it is in the map. We need to do this to ensure on driver unload we don't lose a dev reference. However, its not very convenient to have to manually unload the map when destroying a net device so add notifier handlers to do the cleanup automatically. 
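Previously that cleanup had to be driven from user space before the
device went away, along these lines (sketch; map_fd and the key are
illustrative values only):

    /* Drop the devmap's reference on the netdev stored at index 0 so the
     * subsequent unregister does not leak a refcnt (uses the tools/lib/bpf
     * wrapper):
     */
    __u32 key = 0;

    if (bpf_map_delete_elem(map_fd, &key))
            perror("bpf_map_delete_elem");

The notifier removes that manual step.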
But this creates a race between update/destroy BPF syscall and programs and the unregister netdev hook. Unfortunately, the best I could come up with is either to live with requiring manual removal of net devices from the map before removing the net device OR to add a mutex in devmap to ensure the map is not modified while we are removing a device. The fallout also requires that BPF programs no longer update/delete the map from the BPF program side because the mutex may sleep and this can not be done from inside an rcu critical section. This is not a real problem though because I have not come up with any use cases where this is actually useful in practice. If/when we come up with a compelling user for this we may need to revisit this. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/filter.h | 2 +- kernel/bpf/devmap.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 2 +- 3 files changed, 75 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3323ee91c172..d19ed3c15e1e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -716,7 +716,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. - * This does not appear to be a real limiation for existing software. + * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); int xdp_do_redirect(struct net_device *dev, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b2ef04a1c86a..899364d097f5 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -34,6 +34,17 @@ * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. + * + * Finally, any of the above may race with a netdev_unregister notifier. The + * unregister notifier must search for net devices in the map structure that + * contain a reference to the net device and remove them. This is a two step + * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) + * check to see if the ifindex is the same as the net_device being removed. + * Unfortunately, the xchg() operations do not protect against this. To avoid + * potentially removing incorrect objects the dev_map_list_mutex protects + * conflicting netdev unregister and BPF syscall operations. Updates and + * deletes from a BPF program (done in rcu critical section) are blocked + * because of this mutex. 
*/ #include #include @@ -54,8 +65,12 @@ struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; unsigned long int __percpu *flush_needed; + struct list_head list; }; +static DEFINE_MUTEX(dev_map_list_mutex); +static LIST_HEAD(dev_map_list); + static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; @@ -112,6 +127,9 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab->netdev_map) goto free_dtab; + mutex_lock(&dev_map_list_mutex); + list_add_tail(&dtab->list, &dev_map_list); + mutex_unlock(&dev_map_list_mutex); return &dtab->map; free_dtab: @@ -146,6 +164,11 @@ static void dev_map_free(struct bpf_map *map) cpu_relax(); } + /* Although we should no longer have datapath or bpf syscall operations + * at this point we we can still race with netdev notifier, hence the + * lock. + */ + mutex_lock(&dev_map_list_mutex); for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; @@ -160,6 +183,8 @@ static void dev_map_free(struct bpf_map *map) /* At this point bpf program is detached and all pending operations * _must_ be complete */ + list_del(&dtab->list); + mutex_unlock(&dev_map_list_mutex); free_percpu(dtab->flush_needed); bpf_map_area_free(dtab->netdev_map); kfree(dtab); @@ -296,9 +321,11 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) * the driver tear down ensures all soft irqs are complete before * removing the net device in the case of dev_put equals zero. */ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[k], NULL); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -341,9 +368,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, * Remembering the driver side flush operation will happen before the * net device is removed. */ + mutex_lock(&dev_map_list_mutex); old_dev = xchg(&dtab->netdev_map[i], dev); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); + mutex_unlock(&dev_map_list_mutex); return 0; } @@ -356,3 +385,47 @@ const struct bpf_map_ops dev_map_ops = { .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, }; + +static int dev_map_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dtab *dtab; + int i; + + switch (event) { + case NETDEV_UNREGISTER: + mutex_lock(&dev_map_list_mutex); + list_for_each_entry(dtab, &dev_map_list, list) { + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev || + dev->dev->ifindex != netdev->ifindex) + continue; + dev = xchg(&dtab->netdev_map[i], NULL); + if (dev) + call_rcu(&dev->rcu, + __dev_map_entry_free); + } + } + mutex_unlock(&dev_map_list_mutex); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block dev_map_notifier = { + .notifier_call = dev_map_notification, +}; + +static int __init dev_map_init(void) +{ + register_netdevice_notifier(&dev_map_notifier); + return 0; +} + +subsys_initcall(dev_map_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df05d65f0c87..ebe9b38ff522 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1281,7 +1281,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * for now. 
*/ case BPF_MAP_TYPE_DEVMAP: - if (func_id == BPF_FUNC_map_lookup_elem) + if (func_id != BPF_FUNC_redirect_map) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: -- cgit v1.2.3 From 880388aa3c07fdea4f9b85e35641753017b1852f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 3 Jul 2017 07:29:12 -0700 Subject: net: Remove all references to SKB_GSO_UDP. Such packets are no longer possible. Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 5 ----- net/core/filter.c | 8 ++++---- net/ipv4/af_inet.c | 12 ++---------- net/ipv4/gre_offload.c | 14 +------------- net/ipv4/udp_offload.c | 6 ++---- net/openvswitch/datapath.c | 14 -------------- net/openvswitch/flow.c | 6 +----- net/sched/act_csum.c | 6 ------ 8 files changed, 10 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 5209b5ed2a64..32fb046f2173 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -18,9 +18,6 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; break; - case VIRTIO_NET_HDR_GSO_UDP: - gso_type = SKB_GSO_UDP; - break; default: return -EINVAL; } @@ -73,8 +70,6 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; - else if (sinfo->gso_type & SKB_GSO_UDP) - hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) diff --git a/net/core/filter.c b/net/core/filter.c index e23aa6fa1119..29e690cbe820 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2051,8 +2051,8 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { - /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to - * be changed into SKB_GSO_TCPV6. + /* SKB_GSO_TCPV4 needs to be changed into + * SKB_GSO_TCPV6. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; @@ -2087,8 +2087,8 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) return ret; if (skb_is_gso(skb)) { - /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to - * be changed into SKB_GSO_TCPV4. + /* SKB_GSO_TCPV6 needs to be changed into + * SKB_GSO_TCPV4. 
*/ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76c2077c3f5b..5ce44fb7d498 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1219,10 +1219,9 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, gso_partial, encap; + bool fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; - unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1257,7 +1256,6 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ -1277,13 +1275,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (udpfrag) { - iph->frag_off = htons(offset >> 3); - if (skb->next) - iph->frag_off |= htons(IP_MF); - offset += skb->len - nhoff - ihl; - tot_len = skb->len - nhoff; - } else if (skb_is_gso(skb)) { + if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index d5cac99170b1..416bb304a281 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo, gso_partial; + bool need_csum, gso_partial; if (!skb->encapsulation) goto out; @@ -47,20 +47,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); skb->encap_hdr_csum = need_csum; - ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - features &= skb->dev->hw_enc_features; - /* The only checksum offload we care about from here on out is the - * outer one so strip the existing checksum feature flags based - * on the fact that we will be computing our checksum in software. - */ - if (ufo) { - features &= ~NETIF_F_CSUM_MASK; - if (!need_csum) - features |= NETIF_F_HW_CSUM; - } - /* segment inner packet. 
*/ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 781250151d40..4fedce3d5733 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo, gso_partial; + bool remcsum, need_csum, offload_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -61,8 +61,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; - ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && @@ -77,7 +75,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * outer one so strip the existing checksum feature flags and * instead set the flag based on our outer checksum offload value. */ - if (remcsum || ufo) { + if (remcsum) { features &= ~NETIF_F_CSUM_MASK; if (!need_csum || offload_csum) features |= NETIF_F_HW_CSUM; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 45fe8c8a884d..f6e229b51dfb 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -335,8 +335,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { - unsigned short gso_type = skb_shinfo(skb)->gso_type; - struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; @@ -347,21 +345,9 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (segs == NULL) return -EINVAL; - if (gso_type & SKB_GSO_UDP) { - /* The initial flow key extracted by ovs_flow_key_extract() - * in this case is for a first fragment, so we need to - * properly mark later fragments. - */ - later_key = *key; - later_key.ip.frag = OVS_FRAG_TYPE_LATER; - } - /* Queue all of the segments. */ skb = segs; do { - if (gso_type & SKB_GSO_UDP && skb != segs) - key = &later_key; - err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 3f76cb765e5b..597d96faca45 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -584,8 +584,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; return 0; } - if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + if (nh->frag_off & htons(IP_MF)) key->ip.frag = OVS_FRAG_TYPE_FIRST; else key->ip.frag = OVS_FRAG_TYPE_NONE; @@ -701,9 +700,6 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (key->ip.frag == OVS_FRAG_TYPE_LATER) return 0; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - key->ip.frag = OVS_FRAG_TYPE_FIRST; - /* Transport layer. 
*/ if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3317a2f579da..67afc12df88b 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -231,9 +231,6 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; - if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - return 1; - /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -287,9 +284,6 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; - if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - return 1; - /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, -- cgit v1.2.3 From d9d30adf56777c402c0027c0e6ae21f17cc0a365 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 3 Jul 2017 07:31:57 -0700 Subject: net: Kill NETIF_F_UFO and SKB_GSO_UDP. No longer used. Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 4 +--- include/linux/netdevice.h | 1 - include/linux/skbuff.h | 31 +++++++++++++++---------------- 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 1d4737cffc71..ebd273627334 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -36,7 +36,6 @@ enum { /**/NETIF_F_GSO_SHIFT, /* keep the order of SKB_GSO_* bits */ NETIF_F_TSO_BIT /* ... TCPv4 segmentation */ = NETIF_F_GSO_SHIFT, - NETIF_F_UFO_BIT, /* ... UDPv4 fragmentation */ NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */ NETIF_F_TSO_ECN_BIT, /* ... TCP ECN support */ NETIF_F_TSO_MANGLEID_BIT, /* ... IPV4 ID mangling allowed */ @@ -118,7 +117,6 @@ enum { #define NETIF_F_TSO6 __NETIF_F(TSO6) #define NETIF_F_TSO_ECN __NETIF_F(TSO_ECN) #define NETIF_F_TSO __NETIF_F(TSO) -#define NETIF_F_UFO __NETIF_F(UFO) #define NETIF_F_VLAN_CHALLENGED __NETIF_F(VLAN_CHALLENGED) #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) @@ -172,7 +170,7 @@ enum { NETIF_F_FSO) /* List of features with software fallbacks. */ -#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_UFO | \ +#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | \ NETIF_F_GSO_SCTP) /* diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03b104908235..c60351b84323 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4098,7 +4098,6 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbe29b6c9bd6..4d7a284ba3ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -463,39 +463,38 @@ enum { enum { SKB_GSO_TCPV4 = 1 << 0, - SKB_GSO_UDP = 1 << 1, /* This indicates the skb is from an untrusted source. */ - SKB_GSO_DODGY = 1 << 2, + SKB_GSO_DODGY = 1 << 1, /* This indicates the tcp segment has CWR set. 
*/ - SKB_GSO_TCP_ECN = 1 << 3, + SKB_GSO_TCP_ECN = 1 << 2, - SKB_GSO_TCP_FIXEDID = 1 << 4, + SKB_GSO_TCP_FIXEDID = 1 << 3, - SKB_GSO_TCPV6 = 1 << 5, + SKB_GSO_TCPV6 = 1 << 4, - SKB_GSO_FCOE = 1 << 6, + SKB_GSO_FCOE = 1 << 5, - SKB_GSO_GRE = 1 << 7, + SKB_GSO_GRE = 1 << 6, - SKB_GSO_GRE_CSUM = 1 << 8, + SKB_GSO_GRE_CSUM = 1 << 7, - SKB_GSO_IPXIP4 = 1 << 9, + SKB_GSO_IPXIP4 = 1 << 8, - SKB_GSO_IPXIP6 = 1 << 10, + SKB_GSO_IPXIP6 = 1 << 9, - SKB_GSO_UDP_TUNNEL = 1 << 11, + SKB_GSO_UDP_TUNNEL = 1 << 10, - SKB_GSO_UDP_TUNNEL_CSUM = 1 << 12, + SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11, - SKB_GSO_PARTIAL = 1 << 13, + SKB_GSO_PARTIAL = 1 << 12, - SKB_GSO_TUNNEL_REMCSUM = 1 << 14, + SKB_GSO_TUNNEL_REMCSUM = 1 << 13, - SKB_GSO_SCTP = 1 << 15, + SKB_GSO_SCTP = 1 << 14, - SKB_GSO_ESP = 1 << 16, + SKB_GSO_ESP = 1 << 15, }; #if BITS_PER_LONG > 32 -- cgit v1.2.3 From 06dc75ab06943fcc126a951a0680980ad5cb75c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Jul 2017 18:56:54 +0200 Subject: net: Revert "net: add function to allocate sk_buff head without data area" It was added for netlink mmap tx, there are no callers in the tree. The commit also added a check for skb->head != NULL in kfree_skb path, remove that too -- all skbs ought to have skb->head set. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ------ net/core/skbuff.c | 31 ++----------------------------- 2 files changed, 2 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4d7a284ba3ee..4093552be1de 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -944,12 +944,6 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } -struct sk_buff *__alloc_skb_head(gfp_t priority, int node); -static inline struct sk_buff *alloc_skb_head(gfp_t priority) -{ - return __alloc_skb_head(priority, -1); -} - struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b352c6bcfb31..6bc19c80c210 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -158,31 +158,6 @@ out: * */ -struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) -{ - struct sk_buff *skb; - - /* Get the HEAD */ - skb = kmem_cache_alloc_node(skbuff_head_cache, - gfp_mask & ~__GFP_DMA, node); - if (!skb) - goto out; - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! 
- */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->head = NULL; - skb->truesize = sizeof(struct sk_buff); - refcount_set(&skb->users, 1); - - skb->mac_header = (typeof(skb->mac_header))~0U; -out: - return skb; -} - /** * __alloc_skb - allocate a network buffer * @size: size to allocate @@ -663,8 +638,7 @@ void skb_release_head_state(struct sk_buff *skb) static void skb_release_all(struct sk_buff *skb) { skb_release_head_state(skb); - if (likely(skb->head)) - skb_release_data(skb); + skb_release_data(skb); } /** @@ -762,8 +736,7 @@ void consume_stateless_skb(struct sk_buff *skb) return; trace_consume_skb(skb); - if (likely(skb->head)) - skb_release_data(skb); + skb_release_data(skb); kfree_skbmem(skb); } -- cgit v1.2.3 From 46f55cffa47330b99537985a50d92945d4b34658 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 21:56:48 -0700 Subject: net: fix build error in devmap helper calls Initial patches missed case with CONFIG_BPF_SYSCALL not set. Fixes: 11393cc9b9be ("xdp: Add batching support to redirect map") Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/bpf.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6850a760dc94..6353c7474dba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -318,6 +318,12 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); + +/* Map specifics */ +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +void __dev_map_insert_ctx(struct bpf_map *map, u32 index); +void __dev_map_flush(struct bpf_map *map); + #else static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -356,6 +362,20 @@ static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } + +static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, + u32 key) +{ + return NULL; +} + +static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index) +{ +} + +static inline void __dev_map_flush(struct bpf_map *map) +{ +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ @@ -379,9 +399,4 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -/* Map specifics */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -void __dev_map_insert_ctx(struct bpf_map *map, u32 index); -void __dev_map_flush(struct bpf_map *map); - #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 3cf29931453215536916d0c4da953fce1911ced3 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 14 Jul 2017 19:38:36 +0900 Subject: LSM: Remove security_task_create() hook. Since commit a79be238600d1a03 ("selinux: Use task_alloc hook rather than task_create hook") changed to use task_alloc hook, task_create hook is no longer used. 
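A module that still carried a task_create hook can move the same check
to task_alloc; a minimal sketch (the example_* names and the policy are
placeholders, not an in-tree LSM):

    static int example_task_alloc(struct task_struct *task,
                                  unsigned long clone_flags)
    {
            /* Same clone_flags-based policy a task_create hook enforced,
             * now with the child task_struct already available.
             */
            if ((clone_flags & CLONE_NEWUSER) && !capable(CAP_SYS_ADMIN))
                    return -EPERM;
            return 0;
    }

    static struct security_hook_list example_hooks[] = {
            LSM_HOOK_INIT(task_alloc, example_task_alloc),
    };
    /* registered once via security_add_hooks() at init time */
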
Signed-off-by: Tetsuo Handa Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 7 ------- include/linux/security.h | 6 ------ kernel/fork.c | 4 ---- security/security.c | 5 ----- 4 files changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 3cc9d77c7527..575703cb17b8 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -529,11 +529,6 @@ * * Security hooks for task operations. * - * @task_create: - * Check permission before creating a child process. See the clone(2) - * manual page for definitions of the @clone_flags. - * @clone_flags contains the flags indicating what should be shared. - * Return 0 if permission is granted. * @task_alloc: * @task task being allocated. * @clone_flags contains the flags indicating what should be shared. @@ -1509,7 +1504,6 @@ union security_list_options { int (*file_receive)(struct file *file); int (*file_open)(struct file *file, const struct cred *cred); - int (*task_create)(unsigned long clone_flags); int (*task_alloc)(struct task_struct *task, unsigned long clone_flags); void (*task_free)(struct task_struct *task); int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); @@ -1784,7 +1778,6 @@ struct security_hook_heads { struct list_head file_send_sigiotask; struct list_head file_receive; struct list_head file_open; - struct list_head task_create; struct list_head task_alloc; struct list_head task_free; struct list_head cred_alloc_blank; diff --git a/include/linux/security.h b/include/linux/security.h index b6ea1dc9cc9d..458e24bea2d4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -318,7 +318,6 @@ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig); int security_file_receive(struct file *file); int security_file_open(struct file *file, const struct cred *cred); -int security_task_create(unsigned long clone_flags); int security_task_alloc(struct task_struct *task, unsigned long clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); @@ -885,11 +884,6 @@ static inline int security_file_open(struct file *file, return 0; } -static inline int security_task_create(unsigned long clone_flags) -{ - return 0; -} - static inline int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { diff --git a/kernel/fork.c b/kernel/fork.c index aa1076c5e4a9..3a13a940a6ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1568,10 +1568,6 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } - retval = security_task_create(clone_flags); - if (retval) - goto fork_out; - retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) diff --git a/security/security.c b/security/security.c index 30132378d103..55b5997e4b72 100644 --- a/security/security.c +++ b/security/security.c @@ -979,11 +979,6 @@ int security_file_open(struct file *file, const struct cred *cred) return fsnotify_perm(file, MAY_OPEN); } -int security_task_create(unsigned long clone_flags) -{ - return call_int_hook(task_create, 0, clone_flags); -} - int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { return call_int_hook(task_alloc, 0, task, clone_flags); -- cgit v1.2.3 From fc60a8b675bd9499c71716d21c238eed5092ddfc Mon Sep 17 00:00:00 2001 From: Andreas Färber Date: Sun, 9 Jul 2017 22:29:42 +0200 Subject: tty: serial: owl: Implement console driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Implement serial console driver to complement earlycon. Based on LeMaker linux-actions tree. Signed-off-by: Andreas Färber Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 4 +- drivers/tty/serial/owl-uart.c | 635 ++++++++++++++++++++++++++++++++++++++- include/uapi/linux/serial_core.h | 1 + 3 files changed, 636 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 1f096e2bb398..b788fee54249 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1689,7 +1689,7 @@ config SERIAL_MVEBU_CONSOLE Otherwise, say 'N'. config SERIAL_OWL - bool "Actions Semi Owl serial port support" + tristate "Actions Semi Owl serial port support" depends on ARCH_ACTIONS || COMPILE_TEST select SERIAL_CORE help @@ -1705,7 +1705,7 @@ config SERIAL_OWL_CONSOLE default y help Say 'Y' here if you wish to use Actions Semiconductor S500/S900 UART - as the system console. Only earlycon is implemented currently. + as the system console. endmenu diff --git a/drivers/tty/serial/owl-uart.c b/drivers/tty/serial/owl-uart.c index 1b8008797a1b..683b05478ade 100644 --- a/drivers/tty/serial/owl-uart.c +++ b/drivers/tty/serial/owl-uart.c @@ -20,6 +20,7 @@ * along with this program. If not, see . */ +#include #include #include #include @@ -28,22 +29,66 @@ #include #include #include +#include +#include + +#define OWL_UART_PORT_NUM 7 +#define OWL_UART_DEV_NAME "ttyOWL" #define OWL_UART_CTL 0x000 +#define OWL_UART_RXDAT 0x004 #define OWL_UART_TXDAT 0x008 #define OWL_UART_STAT 0x00c +#define OWL_UART_CTL_DWLS_MASK GENMASK(1, 0) +#define OWL_UART_CTL_DWLS_5BITS (0x0 << 0) +#define OWL_UART_CTL_DWLS_6BITS (0x1 << 0) +#define OWL_UART_CTL_DWLS_7BITS (0x2 << 0) +#define OWL_UART_CTL_DWLS_8BITS (0x3 << 0) +#define OWL_UART_CTL_STPS_2BITS BIT(2) +#define OWL_UART_CTL_PRS_MASK GENMASK(6, 4) +#define OWL_UART_CTL_PRS_NONE (0x0 << 4) +#define OWL_UART_CTL_PRS_ODD (0x4 << 4) +#define OWL_UART_CTL_PRS_MARK (0x5 << 4) +#define OWL_UART_CTL_PRS_EVEN (0x6 << 4) +#define OWL_UART_CTL_PRS_SPACE (0x7 << 4) +#define OWL_UART_CTL_AFE BIT(12) #define OWL_UART_CTL_TRFS_TX BIT(14) #define OWL_UART_CTL_EN BIT(15) +#define OWL_UART_CTL_RXDE BIT(16) +#define OWL_UART_CTL_TXDE BIT(17) #define OWL_UART_CTL_RXIE BIT(18) #define OWL_UART_CTL_TXIE BIT(19) +#define OWL_UART_CTL_LBEN BIT(20) #define OWL_UART_STAT_RIP BIT(0) #define OWL_UART_STAT_TIP BIT(1) +#define OWL_UART_STAT_RXER BIT(2) +#define OWL_UART_STAT_TFER BIT(3) +#define OWL_UART_STAT_RXST BIT(4) +#define OWL_UART_STAT_RFEM BIT(5) #define OWL_UART_STAT_TFFU BIT(6) -#define OWL_UART_STAT_TRFL_MASK (0x1f << 11) +#define OWL_UART_STAT_CTSS BIT(7) +#define OWL_UART_STAT_RTSS BIT(8) +#define OWL_UART_STAT_TFES BIT(10) +#define OWL_UART_STAT_TRFL_MASK GENMASK(16, 11) #define OWL_UART_STAT_UTBB BIT(17) +static struct uart_driver owl_uart_driver; + +struct owl_uart_info { + unsigned int tx_fifosize; +}; + +struct owl_uart_port { + struct uart_port port; + struct clk *clk; +}; + +#define to_owl_uart_port(prt) container_of(prt, struct owl_uart_port, prt) + +static struct owl_uart_port *owl_uart_ports[OWL_UART_PORT_NUM]; + static inline void owl_uart_write(struct uart_port *port, u32 val, unsigned int off) { writel(val, port->membase + off); @@ -54,6 +99,397 @@ static inline u32 owl_uart_read(struct uart_port *port, unsigned int off) return readl(port->membase + off); } +static void owl_uart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ + u32 ctl; + + ctl = 
owl_uart_read(port, OWL_UART_CTL); + + if (mctrl & TIOCM_LOOP) + ctl |= OWL_UART_CTL_LBEN; + else + ctl &= ~OWL_UART_CTL_LBEN; + + owl_uart_write(port, ctl, OWL_UART_CTL); +} + +static unsigned int owl_uart_get_mctrl(struct uart_port *port) +{ + unsigned int mctrl = TIOCM_CAR | TIOCM_DSR; + u32 stat, ctl; + + ctl = owl_uart_read(port, OWL_UART_CTL); + stat = owl_uart_read(port, OWL_UART_STAT); + if (stat & OWL_UART_STAT_RTSS) + mctrl |= TIOCM_RTS; + if ((stat & OWL_UART_STAT_CTSS) || !(ctl & OWL_UART_CTL_AFE)) + mctrl |= TIOCM_CTS; + return mctrl; +} + +static unsigned int owl_uart_tx_empty(struct uart_port *port) +{ + unsigned long flags; + u32 val; + unsigned int ret; + + spin_lock_irqsave(&port->lock, flags); + + val = owl_uart_read(port, OWL_UART_STAT); + ret = (val & OWL_UART_STAT_TFES) ? TIOCSER_TEMT : 0; + + spin_unlock_irqrestore(&port->lock, flags); + + return ret; +} + +static void owl_uart_stop_rx(struct uart_port *port) +{ + u32 val; + + val = owl_uart_read(port, OWL_UART_CTL); + val &= ~(OWL_UART_CTL_RXIE | OWL_UART_CTL_RXDE); + owl_uart_write(port, val, OWL_UART_CTL); + + val = owl_uart_read(port, OWL_UART_STAT); + val |= OWL_UART_STAT_RIP; + owl_uart_write(port, val, OWL_UART_STAT); +} + +static void owl_uart_stop_tx(struct uart_port *port) +{ + u32 val; + + val = owl_uart_read(port, OWL_UART_CTL); + val &= ~(OWL_UART_CTL_TXIE | OWL_UART_CTL_TXDE); + owl_uart_write(port, val, OWL_UART_CTL); + + val = owl_uart_read(port, OWL_UART_STAT); + val |= OWL_UART_STAT_TIP; + owl_uart_write(port, val, OWL_UART_STAT); +} + +static void owl_uart_start_tx(struct uart_port *port) +{ + u32 val; + + if (uart_tx_stopped(port)) { + owl_uart_stop_tx(port); + return; + } + + val = owl_uart_read(port, OWL_UART_STAT); + val |= OWL_UART_STAT_TIP; + owl_uart_write(port, val, OWL_UART_STAT); + + val = owl_uart_read(port, OWL_UART_CTL); + val |= OWL_UART_CTL_TXIE; + owl_uart_write(port, val, OWL_UART_CTL); +} + +static void owl_uart_send_chars(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + unsigned int ch; + + if (uart_tx_stopped(port)) + return; + + if (port->x_char) { + while (!(owl_uart_read(port, OWL_UART_STAT) & OWL_UART_STAT_TFFU)) + cpu_relax(); + owl_uart_write(port, port->x_char, OWL_UART_TXDAT); + port->icount.tx++; + port->x_char = 0; + } + + while (!(owl_uart_read(port, OWL_UART_STAT) & OWL_UART_STAT_TFFU)) { + if (uart_circ_empty(xmit)) + break; + + ch = xmit->buf[xmit->tail]; + owl_uart_write(port, ch, OWL_UART_TXDAT); + xmit->tail = (xmit->tail + 1) & (SERIAL_XMIT_SIZE - 1); + port->icount.tx++; + } + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + if (uart_circ_empty(xmit)) + owl_uart_stop_tx(port); +} + +static void owl_uart_receive_chars(struct uart_port *port) +{ + u32 stat, val; + + val = owl_uart_read(port, OWL_UART_CTL); + val &= ~OWL_UART_CTL_TRFS_TX; + owl_uart_write(port, val, OWL_UART_CTL); + + stat = owl_uart_read(port, OWL_UART_STAT); + while (!(stat & OWL_UART_STAT_RFEM)) { + char flag = TTY_NORMAL; + + if (stat & OWL_UART_STAT_RXER) + port->icount.overrun++; + + if (stat & OWL_UART_STAT_RXST) { + /* We are not able to distinguish the error type. 
*/ + port->icount.brk++; + port->icount.frame++; + + stat &= port->read_status_mask; + if (stat & OWL_UART_STAT_RXST) + flag = TTY_PARITY; + } else + port->icount.rx++; + + val = owl_uart_read(port, OWL_UART_RXDAT); + val &= 0xff; + + if ((stat & port->ignore_status_mask) == 0) + tty_insert_flip_char(&port->state->port, val, flag); + + stat = owl_uart_read(port, OWL_UART_STAT); + } + + spin_unlock(&port->lock); + tty_flip_buffer_push(&port->state->port); + spin_lock(&port->lock); +} + +static irqreturn_t owl_uart_irq(int irq, void *dev_id) +{ + struct uart_port *port = dev_id; + unsigned long flags; + u32 stat; + + spin_lock_irqsave(&port->lock, flags); + + stat = owl_uart_read(port, OWL_UART_STAT); + + if (stat & OWL_UART_STAT_RIP) + owl_uart_receive_chars(port); + + if (stat & OWL_UART_STAT_TIP) + owl_uart_send_chars(port); + + stat = owl_uart_read(port, OWL_UART_STAT); + stat |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP; + owl_uart_write(port, stat, OWL_UART_STAT); + + spin_unlock_irqrestore(&port->lock, flags); + + return IRQ_HANDLED; +} + +static void owl_uart_shutdown(struct uart_port *port) +{ + u32 val; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + + val = owl_uart_read(port, OWL_UART_CTL); + val &= ~(OWL_UART_CTL_TXIE | OWL_UART_CTL_RXIE + | OWL_UART_CTL_TXDE | OWL_UART_CTL_RXDE | OWL_UART_CTL_EN); + owl_uart_write(port, val, OWL_UART_CTL); + + spin_unlock_irqrestore(&port->lock, flags); + + free_irq(port->irq, port); +} + +static int owl_uart_startup(struct uart_port *port) +{ + u32 val; + unsigned long flags; + int ret; + + ret = request_irq(port->irq, owl_uart_irq, IRQF_TRIGGER_HIGH, + "owl-uart", port); + if (ret) + return ret; + + spin_lock_irqsave(&port->lock, flags); + + val = owl_uart_read(port, OWL_UART_STAT); + val |= OWL_UART_STAT_RIP | OWL_UART_STAT_TIP + | OWL_UART_STAT_RXER | OWL_UART_STAT_TFER | OWL_UART_STAT_RXST; + owl_uart_write(port, val, OWL_UART_STAT); + + val = owl_uart_read(port, OWL_UART_CTL); + val |= OWL_UART_CTL_RXIE | OWL_UART_CTL_TXIE; + val |= OWL_UART_CTL_EN; + owl_uart_write(port, val, OWL_UART_CTL); + + spin_unlock_irqrestore(&port->lock, flags); + + return 0; +} + +static void owl_uart_change_baudrate(struct owl_uart_port *owl_port, + unsigned long baud) +{ + clk_set_rate(owl_port->clk, baud * 8); +} + +static void owl_uart_set_termios(struct uart_port *port, + struct ktermios *termios, + struct ktermios *old) +{ + struct owl_uart_port *owl_port = to_owl_uart_port(port); + unsigned int baud; + u32 ctl; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + + ctl = owl_uart_read(port, OWL_UART_CTL); + + ctl &= ~OWL_UART_CTL_DWLS_MASK; + switch (termios->c_cflag & CSIZE) { + case CS5: + ctl |= OWL_UART_CTL_DWLS_5BITS; + break; + case CS6: + ctl |= OWL_UART_CTL_DWLS_6BITS; + break; + case CS7: + ctl |= OWL_UART_CTL_DWLS_7BITS; + break; + case CS8: + default: + ctl |= OWL_UART_CTL_DWLS_8BITS; + break; + } + + if (termios->c_cflag & CSTOPB) + ctl |= OWL_UART_CTL_STPS_2BITS; + else + ctl &= ~OWL_UART_CTL_STPS_2BITS; + + ctl &= ~OWL_UART_CTL_PRS_MASK; + if (termios->c_cflag & PARENB) { + if (termios->c_cflag & CMSPAR) { + if (termios->c_cflag & PARODD) + ctl |= OWL_UART_CTL_PRS_MARK; + else + ctl |= OWL_UART_CTL_PRS_SPACE; + } else if (termios->c_cflag & PARODD) + ctl |= OWL_UART_CTL_PRS_ODD; + else + ctl |= OWL_UART_CTL_PRS_EVEN; + } else + ctl |= OWL_UART_CTL_PRS_NONE; + + if (termios->c_cflag & CRTSCTS) + ctl |= OWL_UART_CTL_AFE; + else + ctl &= ~OWL_UART_CTL_AFE; + + owl_uart_write(port, ctl, OWL_UART_CTL); + + baud = 
uart_get_baud_rate(port, termios, old, 9600, 3200000); + owl_uart_change_baudrate(owl_port, baud); + + /* Don't rewrite B0 */ + if (tty_termios_baud_rate(termios)) + tty_termios_encode_baud_rate(termios, baud, baud); + + port->read_status_mask |= OWL_UART_STAT_RXER; + if (termios->c_iflag & INPCK) + port->read_status_mask |= OWL_UART_STAT_RXST; + + uart_update_timeout(port, termios->c_cflag, baud); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void owl_uart_release_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return; + + if (port->flags & UPF_IOREMAP) { + devm_release_mem_region(port->dev, port->mapbase, + resource_size(res)); + devm_iounmap(port->dev, port->membase); + port->membase = NULL; + } +} + +static int owl_uart_request_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENXIO; + + if (!devm_request_mem_region(port->dev, port->mapbase, + resource_size(res), dev_name(port->dev))) + return -EBUSY; + + if (port->flags & UPF_IOREMAP) { + port->membase = devm_ioremap_nocache(port->dev, port->mapbase, + resource_size(res)); + if (!port->membase) + return -EBUSY; + } + + return 0; +} + +static const char *owl_uart_type(struct uart_port *port) +{ + return (port->type == PORT_OWL) ? "owl-uart" : NULL; +} + +static int owl_uart_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + if (port->type != PORT_OWL) + return -EINVAL; + + if (port->irq != ser->irq) + return -EINVAL; + + return 0; +} + +static void owl_uart_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = PORT_OWL; + owl_uart_request_port(port); + } +} + +static struct uart_ops owl_uart_ops = { + .set_mctrl = owl_uart_set_mctrl, + .get_mctrl = owl_uart_get_mctrl, + .tx_empty = owl_uart_tx_empty, + .start_tx = owl_uart_start_tx, + .stop_rx = owl_uart_stop_rx, + .stop_tx = owl_uart_stop_tx, + .startup = owl_uart_startup, + .shutdown = owl_uart_shutdown, + .set_termios = owl_uart_set_termios, + .type = owl_uart_type, + .config_port = owl_uart_config_port, + .request_port = owl_uart_request_port, + .release_port = owl_uart_release_port, + .verify_port = owl_uart_verify_port, +}; + #ifdef CONFIG_SERIAL_OWL_CONSOLE static void owl_console_putchar(struct uart_port *port, int ch) @@ -110,6 +546,57 @@ static void owl_uart_port_write(struct uart_port *port, const char *s, local_irq_restore(flags); } +static void owl_uart_console_write(struct console *co, const char *s, + u_int count) +{ + struct owl_uart_port *owl_port; + + owl_port = owl_uart_ports[co->index]; + if (!owl_port) + return; + + owl_uart_port_write(&owl_port->port, s, count); +} + +static int owl_uart_console_setup(struct console *co, char *options) +{ + struct owl_uart_port *owl_port; + int baud = 115200; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + if (co->index < 0 || co->index >= OWL_UART_PORT_NUM) + return -EINVAL; + + owl_port = owl_uart_ports[co->index]; + if (!owl_port || !owl_port->port.membase) + return -ENODEV; + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + + return uart_set_options(&owl_port->port, co, baud, parity, bits, flow); +} + +static struct console owl_uart_console = { + .name = OWL_UART_DEV_NAME, + .write = owl_uart_console_write, + .device = 
uart_console_device, + .setup = owl_uart_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &owl_uart_driver, +}; + +static int __init owl_uart_console_init(void) +{ + register_console(&owl_uart_console); + + return 0; +} +console_initcall(owl_uart_console_init); + static void owl_uart_early_console_write(struct console *co, const char *s, u_int count) @@ -132,4 +619,148 @@ owl_uart_early_console_setup(struct earlycon_device *device, const char *opt) OF_EARLYCON_DECLARE(owl, "actions,owl-uart", owl_uart_early_console_setup); -#endif /* CONFIG_SERIAL_OWL_CONSOLE */ +#define OWL_UART_CONSOLE (&owl_uart_console) +#else +#define OWL_UART_CONSOLE NULL +#endif + +static struct uart_driver owl_uart_driver = { + .owner = THIS_MODULE, + .driver_name = "owl-uart", + .dev_name = OWL_UART_DEV_NAME, + .nr = OWL_UART_PORT_NUM, + .cons = OWL_UART_CONSOLE, +}; + +static const struct owl_uart_info owl_s500_info = { + .tx_fifosize = 16, +}; + +static const struct owl_uart_info owl_s900_info = { + .tx_fifosize = 32, +}; + +static const struct of_device_id owl_uart_dt_matches[] = { + { .compatible = "actions,s500-uart", .data = &owl_s500_info }, + { .compatible = "actions,s900-uart", .data = &owl_s900_info }, + { } +}; +MODULE_DEVICE_TABLE(of, owl_uart_dt_matches); + +static int owl_uart_probe(struct platform_device *pdev) +{ + const struct of_device_id *match; + const struct owl_uart_info *info = NULL; + struct resource *res_mem; + struct owl_uart_port *owl_port; + int ret, irq; + + if (pdev->dev.of_node) { + pdev->id = of_alias_get_id(pdev->dev.of_node, "serial"); + match = of_match_node(owl_uart_dt_matches, pdev->dev.of_node); + if (match) + info = match->data; + } + + if (pdev->id < 0 || pdev->id >= OWL_UART_PORT_NUM) { + dev_err(&pdev->dev, "id %d out of range\n", pdev->id); + return -EINVAL; + } + + res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res_mem) { + dev_err(&pdev->dev, "could not get mem\n"); + return -ENODEV; + } + + irq = platform_get_irq(pdev, 0); + if (irq < 0) { + dev_err(&pdev->dev, "could not get irq\n"); + return irq; + } + + if (owl_uart_ports[pdev->id]) { + dev_err(&pdev->dev, "port %d already allocated\n", pdev->id); + return -EBUSY; + } + + owl_port = devm_kzalloc(&pdev->dev, sizeof(*owl_port), GFP_KERNEL); + if (!owl_port) + return -ENOMEM; + + owl_port->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(owl_port->clk)) { + dev_err(&pdev->dev, "could not get clk\n"); + return PTR_ERR(owl_port->clk); + } + + owl_port->port.dev = &pdev->dev; + owl_port->port.line = pdev->id; + owl_port->port.type = PORT_OWL; + owl_port->port.iotype = UPIO_MEM; + owl_port->port.mapbase = res_mem->start; + owl_port->port.irq = irq; + owl_port->port.uartclk = clk_get_rate(owl_port->clk); + if (owl_port->port.uartclk == 0) { + dev_err(&pdev->dev, "clock rate is zero\n"); + return -EINVAL; + } + owl_port->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP | UPF_LOW_LATENCY; + owl_port->port.x_char = 0; + owl_port->port.fifosize = (info) ? 
info->tx_fifosize : 16; + owl_port->port.ops = &owl_uart_ops; + + owl_uart_ports[pdev->id] = owl_port; + platform_set_drvdata(pdev, owl_port); + + ret = uart_add_one_port(&owl_uart_driver, &owl_port->port); + if (ret) + owl_uart_ports[pdev->id] = NULL; + + return ret; +} + +static int owl_uart_remove(struct platform_device *pdev) +{ + struct owl_uart_port *owl_port = platform_get_drvdata(pdev); + + uart_remove_one_port(&owl_uart_driver, &owl_port->port); + owl_uart_ports[pdev->id] = NULL; + + return 0; +} + +static struct platform_driver owl_uart_platform_driver = { + .probe = owl_uart_probe, + .remove = owl_uart_remove, + .driver = { + .name = "owl-uart", + .of_match_table = owl_uart_dt_matches, + }, +}; + +static int __init owl_uart_init(void) +{ + int ret; + + ret = uart_register_driver(&owl_uart_driver); + if (ret) + return ret; + + ret = platform_driver_register(&owl_uart_platform_driver); + if (ret) + uart_unregister_driver(&owl_uart_driver); + + return ret; +} + +static void __init owl_uart_exit(void) +{ + platform_driver_unregister(&owl_uart_platform_driver); + uart_unregister_driver(&owl_uart_driver); +} + +module_init(owl_uart_init); +module_exit(owl_uart_exit); + +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index c34a2a3eeff5..38bea3217ead 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -70,6 +70,7 @@ #define PORT_CLPS711X 33 #define PORT_SA1100 34 #define PORT_UART00 35 +#define PORT_OWL 36 #define PORT_21285 37 /* Sparc type numbers. */ -- cgit v1.2.3 From 7744ccdbc16f0ac4adae21b3678af93775b3a386 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:03 -0500 Subject: x86/mm: Add Secure Memory Encryption (SME) support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Secure Memory Encryption (SME). This initial support provides a Kconfig entry to build the SME support into the kernel and defines the memory encryption mask that will be used in subsequent patches to mark pages as encrypted. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/a6c34d16caaed3bc3e2d6f0987554275bd291554.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 25 +++++++++++++++++++++++++ arch/x86/include/asm/mem_encrypt.h | 30 ++++++++++++++++++++++++++++++ arch/x86/mm/Makefile | 1 + arch/x86/mm/mem_encrypt.c | 21 +++++++++++++++++++++ include/linux/mem_encrypt.h | 35 +++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+) create mode 100644 arch/x86/include/asm/mem_encrypt.h create mode 100644 arch/x86/mm/mem_encrypt.c create mode 100644 include/linux/mem_encrypt.h (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..ba7b93d08d00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1415,6 +1415,31 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..a1057961ac46 --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,30 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..a94a7b663d5f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,3 +39,4 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..b99d469c73e7 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,21 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +unsigned long sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h new file mode 100644 index 000000000000..59769f7287e4 --- /dev/null +++ b/include/linux/mem_encrypt.h @@ -0,0 +1,35 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MEM_ENCRYPT_H__ +#define __MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + +#include + +#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +static inline bool sme_active(void) +{ + return !!sme_me_mask; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From 5868f3651fa0dff96a57f94d49247d3ef320ebe2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:05 -0500 Subject: x86/mm: Add support to enable SME in early boot processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to the early boot code to use Secure Memory Encryption (SME). Since the kernel has been loaded into memory in a decrypted state, encrypt the kernel in place and update the early pagetables with the memory encryption mask so that new pagetable entries will use memory encryption. The routines to set the encryption mask and perform the encryption are stub routines for now with functionality to be added in a later patch. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e52ad781f085224bf835b3caff9aa3aee6febccb.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 8 ++++++ arch/x86/kernel/head64.c | 53 ++++++++++++++++++++++++++++++-------- arch/x86/kernel/head_64.S | 20 ++++++++++++-- arch/x86/mm/mem_encrypt.c | 9 +++++++ include/linux/mem_encrypt.h | 5 ++++ 5 files changed, 82 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index a1057961ac46..475e34f53793 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -15,14 +15,22 @@ #ifndef __ASSEMBLY__ +#include + #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long sme_me_mask; +void __init sme_encrypt_kernel(void); +void __init sme_enable(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0UL +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(void) { } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 46c3c73e7f43..1f0ddcc9675c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -45,9 +46,10 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -68,6 +70,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -94,28 +102,30 @@ void __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = 
(pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -136,9 +146,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883df..ec5d5e90c8f1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. 
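+	 * __startup_secondary_64() returns that modifier in %rax per the C
+	 * calling convention; %rsi is saved and restored around the call
+	 * because it still holds the real mode data pointer. The addq below
+	 * then folds the mask straight into the CR3 value.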
+ */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b99d469c73e7..3ac6f99b095c 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -11,6 +11,7 @@ */ #include +#include /* * Since SME related variables are set early in the boot process they must @@ -19,3 +20,11 @@ */ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); + +void __init sme_encrypt_kernel(void) +{ +} + +void __init sme_enable(void) +{ +} diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 59769f7287e4..570f4fcff13f 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -30,6 +30,11 @@ static inline bool sme_active(void) return !!sme_me_mask; } +static inline unsigned long sme_get_me_mask(void) +{ + return sme_me_mask; +} + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From 21729f81ce8ae76a6995681d40e16f7ce8075db4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:07 -0500 Subject: x86/mm: Provide general kernel support for memory encryption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes to the existing page table macros will allow the SME support to be enabled in a simple fashion with minimal changes to files that use these macros. Since the memory encryption mask will now be part of the regular pagetable macros, we introduce two new macros (_PAGE_TABLE_NOENC and _KERNPG_TABLE_NOENC) to allow for early pagetable creation/initialization without the encryption mask before SME becomes active. Two new pgprot() macros are defined to allow setting or clearing the page encryption mask. The FIXMAP_PAGE_NOCACHE define is introduced for use with MMIO. SME does not support encryption for MMIO areas so this define removes the encryption mask from the page attribute. Two new macros are introduced (__sme_pa() / __sme_pa_nodebug()) to allow creating a physical address with the encryption mask. These are used when working with the cr3 register so that the PGD can be encrypted. The current __va() macro is updated so that the virtual address is generated based off of the physical address without the encryption mask thus allowing the same virtual address to be generated regardless of whether encryption is enabled for that physical location or not. Also, an early initialization function is added for SME. If SME is active, this function: - Updates the early_pmd_flags so that early page faults create mappings with the encryption mask. - Updates the __supported_pte_mask to include the encryption mask. - Updates the protection_map entries to include the encryption mask so that user-space allocations will automatically have the encryption mask applied. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b36e952c4c39767ae7f0a41cf5345adf27438480.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pagetable.c | 7 ++++++ arch/x86/include/asm/fixmap.h | 7 ++++++ arch/x86/include/asm/mem_encrypt.h | 13 +++++++++++ arch/x86/include/asm/page_types.h | 3 ++- arch/x86/include/asm/pgtable.h | 9 ++++++++ arch/x86/include/asm/pgtable_types.h | 45 +++++++++++++++++++++++------------- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/kernel/head64.c | 11 +++++++-- arch/x86/kernel/head_64.S | 20 ++++++++-------- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/mem_encrypt.c | 17 ++++++++++++++ arch/x86/mm/pageattr.c | 3 +++ arch/x86/mm/tlb.c | 4 ++-- include/asm-generic/pgtable.h | 12 ++++++++++ include/linux/mem_encrypt.h | 8 +++++++ 16 files changed, 133 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb..f1aa43854bed 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..d9ff226cb489 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,13 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 475e34f53793..dbae7a5a347d 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,6 +21,8 @@ extern unsigned long sme_me_mask; +void __init sme_early_init(void); + void __init sme_encrypt_kernel(void); void __init sme_enable(void); @@ -28,11 +30,22 @@ void __init sme_enable(void); #define sme_me_mask 0UL +static inline void __init sme_early_init(void) { } + static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. 
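+ * For example, load_cr3() elsewhere in this patch becomes
+ * write_cr3(__sme_pa(pgdir)), so the top-level pagetable is itself
+ * mapped encrypted whenever SME is active.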
+ */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..b98ed9d14630 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b64ea527edfb..c6452cb12c0b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include #include #include @@ -13,6 +14,12 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include @@ -38,6 +45,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..de32ca32928a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include +#include + #include #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. 
The pte's @@ -191,18 +193,29 @@ enum page_cache_mode { #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) - -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) + +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a79547e8ee0..a68f70c3debc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct vm86; #include #include #include +#include /* * We handle most unaligned accesses in hardware. 
On the other hand @@ -241,7 +242,7 @@ static inline unsigned long read_cr3_pa(void) static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f..9c4e7ba6870c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f0ddcc9675c..5cd0b72a0283 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -102,7 +102,7 @@ unsigned long __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); - pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); @@ -177,7 +177,7 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ @@ -310,6 +310,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ec5d5e90c8f1..513cbb012ecc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -351,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -366,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. 
@@ -386,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -411,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..39d4daf5e289 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. 
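 * The zero page PTEs written above now carry _PAGE_ENC as well, so the
 * KASAN shadow page is mapped with the same encryption attribute as the
 * rest of kernel memory when SME is active.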
*/ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3ac6f99b095c..f973d3dc3802 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include #include +#include /* * Since SME related variables are set early in the boot process they must @@ -21,6 +22,22 @@ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..7e2d6c0a64c4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2020,6 +2020,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2c1b8881e9d3..593d2f76a54c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -115,7 +115,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } @@ -157,7 +157,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.loaded_mm, next); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 7dfa767dc680..4d7bb98f4134 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ +/* + * No-op macros that just return the current protection value. Defined here + * because these macros can be used even if CONFIG_MMU is not defined. + */ +#ifndef pgprot_encrypted +#define pgprot_encrypted(prot) (prot) +#endif + +#ifndef pgprot_decrypted +#define pgprot_decrypted(prot) (prot) +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 570f4fcff13f..1255f09f5e42 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -35,6 +35,14 @@ static inline unsigned long sme_get_me_mask(void) return sme_me_mask; } +/* + * The __sme_set() and __sme_clr() macros are useful for adding or removing + * the encryption mask from a value (e.g. when dealing with pagetable + * entries). 
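+ * For instance, sme_early_init() earlier in this patch applies
+ * __sme_set() to early_pmd_flags and __supported_pte_mask, while
+ * __PHYSICAL_MASK is narrowed with __sme_clr() so the mask bit is
+ * never interpreted as part of a physical address.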
+ */ +#define __sme_set(x) ((unsigned long)(x) | sme_me_mask) +#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ -- cgit v1.2.3 From f88a68facd9a15b94f8c195d9d2c0b30c76c595a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:09 -0500 Subject: x86/mm: Extend early_memremap() support with additional attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add early_memremap() support to be able to specify encrypted and decrypted mappings with and without write-protection. The use of write-protection is necessary when encrypting data "in place". The write-protect attribute is considered cacheable for loads, but not stores. This implies that the hardware will never give the core a dirty line with this memtype. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/479b5832c30fae3efa7932e48f81794e86397229.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/fixmap.h | 13 +++++++++++ arch/x86/include/asm/pgtable_types.h | 8 +++++++ arch/x86/mm/ioremap.c | 44 ++++++++++++++++++++++++++++++++++++ include/asm-generic/early_ioremap.h | 2 ++ mm/early_ioremap.c | 10 ++++++++ 6 files changed, 81 insertions(+) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba7b93d08d00..8328bcb9ce8b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1440,6 +1440,10 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT If set to N, then the encryption of system memory can be activated with the mem_encrypt=on command line option. +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d9ff226cb489..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -164,6 +164,19 @@ static inline void __set_fixmap(enum fixed_addresses idx, */ #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. 
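+ * A typical (illustrative) pattern is map, access, unmap:
+ *
+ *	data = early_memremap_decrypted(paddr, len);
+ *	memcpy(buf, data, len);
+ *	early_memunmap(data, len);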
+ */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index de32ca32928a..32095af0fefb 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -161,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -189,6 +190,7 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) @@ -202,6 +204,12 @@ enum page_cache_mode { #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY | _PAGE_ENC) +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 66ddf5e8ffc8..570201bbf442 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -417,6 +417,50 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); } +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, 
__PAGE_KERNEL_NOENC_WP); +} +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ + static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 734ad4db388c..2edef8d7fa6b 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap_ro(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_prot(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 6d5717bd7197..d7d30da754ba 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -226,6 +226,16 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size) } #endif +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); +} +#endif + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) -- cgit v1.2.3 From a19d66c56af1c52b8b463bf94d21116ae8c1aa5a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:13 -0500 Subject: efi: Add an EFI table address match function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a function that will determine if a supplied physical address matches the address of an EFI table. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e1e06441d80f44776df391e0e4cb485b345b7518.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 33 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 7 +++++++ 2 files changed, 40 insertions(+) (limited to 'include') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 045d6d311bde..69d4d130e055 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -55,6 +55,25 @@ struct efi __read_mostly efi = { }; EXPORT_SYMBOL(efi); +static unsigned long *efi_tables[] = { + &efi.mps, + &efi.acpi, + &efi.acpi20, + &efi.smbios, + &efi.smbios3, + &efi.sal_systab, + &efi.boot_info, + &efi.hcdp, + &efi.uga, + &efi.uv_systab, + &efi.fw_vendor, + &efi.runtime, + &efi.config_table, + &efi.esrt, + &efi.properties_table, + &efi.mem_attr_table, +}; + static bool disable_runtime; static int __init setup_noefi(char *arg) { @@ -855,6 +874,20 @@ int efi_status_to_err(efi_status_t status) return err; } +bool efi_is_table_address(unsigned long phys_addr) +{ + unsigned int i; + + if (phys_addr == EFI_INVALID_TABLE_ADDR) + return false; + + for (i = 0; i < ARRAY_SIZE(efi_tables); i++) + if (*(efi_tables[i]) == phys_addr) + return true; + + return false; +} + #ifdef CONFIG_KEXEC static int update_efi_random_seed(struct notifier_block *nb, unsigned long code, void *unused) diff --git a/include/linux/efi.h b/include/linux/efi.h index 8269bcb8ccf7..8e24f099bd3f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1091,6 +1091,8 @@ static inline bool efi_enabled(int feature) return test_bit(feature, &efi.flags) != 0; } extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); + +extern bool efi_is_table_address(unsigned long phys_addr); #else static inline bool efi_enabled(int feature) { @@ -1104,6 +1106,11 @@ efi_capsule_pending(int *reset_type) { return false; } + +static inline bool efi_is_table_address(unsigned long phys_addr) +{ + return false; +} #endif extern int efi_status_to_err(efi_status_t status); -- cgit v1.2.3 From f99afd08a45fbbd9ce35a7624ffd1d850a1906c0 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:14 -0500 Subject: efi: Update efi_mem_type() to return an error rather than 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The efi_mem_type() function currently returns a 0, which maps to EFI_RESERVED_TYPE, if the function is unable to find a memmap entry for the supplied physical address. Returning EFI_RESERVED_TYPE implies that a memmap entry exists, when it doesn't. Instead of returning 0, change the function to return a negative error value when no memmap entry is found. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/7fbf40a9dc414d5da849e1ddcd7f7c1285e4e181.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/ia64/kernel/efi.c | 4 ++-- arch/x86/platform/efi/efi.c | 6 +++--- include/linux/efi.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 121295637d0d..81416000c5e0 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size) return 0; } -u32 +int efi_mem_type (unsigned long phys_addr) { efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); if (md) return md->type; - return 0; + return -EINVAL; } u64 diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f084d8718ac4..6217b23e85f6 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void) /* * Convenience functions to obtain memory types and attributes */ -u32 efi_mem_type(unsigned long phys_addr) +int efi_mem_type(unsigned long phys_addr) { efi_memory_desc_t *md; if (!efi_enabled(EFI_MEMMAP)) - return 0; + return -ENOTSUPP; for_each_efi_memory_desc(md) { if ((md->phys_addr <= phys_addr) && @@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr) (md->num_pages << EFI_PAGE_SHIFT)))) return md->type; } - return 0; + return -EINVAL; } static int __init arch_parse_efi_cmdline(char *str) diff --git a/include/linux/efi.h b/include/linux/efi.h index 8e24f099bd3f..4e47f78430be 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -985,7 +985,7 @@ static inline void efi_esrt_init(void) { } extern int efi_config_parse_tables(void *config_tables, int count, int sz, efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); -extern u32 efi_mem_type (unsigned long phys_addr); +extern int efi_mem_type(unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int __init efi_uart_console_only (void); -- cgit v1.2.3 From 8f716c9b5febf6ed0f5fedb7c9407cd0c25b2796 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:16 -0500 Subject: x86/mm: Add support to access boot related data in the clear MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boot data (such as EFI related data) is not encrypted when the system is booted because UEFI/BIOS does not run with SME active. In order to access this data properly it needs to be mapped decrypted. Update early_memremap() to provide an arch specific routine to modify the pagetable protection attributes before they are applied to the new mapping. This is used to remove the encryption mask for boot related data. Update memremap() to provide an arch specific routine to determine if RAM remapping is allowed. RAM remapping will cause an encrypted mapping to be generated. By preventing RAM remapping, ioremap_cache() will be used instead, which will provide a decrypted mapping of the boot related data. 
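As a minimal illustration (a sketch, not part of the patch itself; 'paddr' stands in for a hypothetical physical address taken from the boot_params.hdr.setup_data chain), a caller that knows its target was written unencrypted can now request a decrypted mapping explicitly:

	struct setup_data *data;

	/* MEMREMAP_DEC (added by this patch) forces a decrypted mapping */
	data = memremap(paddr, sizeof(*data), MEMREMAP_WB | MEMREMAP_DEC);
	if (!data)
		return -ENOMEM;
	pr_info("setup_data type %u, len %u\n", data->type, data->len);
	memunmap(data);
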
Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Matt Fleming Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/81fb6b4117a5df6b9f2eda342f81bbef4b23d2e5.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/io.h | 5 ++ arch/x86/mm/ioremap.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/io.h | 2 + kernel/memremap.c | 20 ++++-- mm/early_ioremap.c | 18 ++++- 5 files changed, 218 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7afb0e2f07f4..09c5557b1454 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -381,4 +381,9 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 570201bbf442..8986b2868944 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -21,6 +23,7 @@ #include #include #include +#include #include "physaddr.h" @@ -417,6 +420,183 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); } +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. 
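+ * Note that the memmap and systab addresses are reassembled from
+ * hi:lo halves below because boot_params stores these 64-bit EFI
+ * addresses as two 32-bit fields.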
+ */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. 
+ */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT /* Remap memory with encryption */ void __init *early_memremap_encrypted(resource_size_t phys_addr, diff --git a/include/linux/io.h b/include/linux/io.h index 2195d9ea4aaa..32e30e8fb9db 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -157,6 +157,8 @@ enum { MEMREMAP_WB = 1 << 0, MEMREMAP_WT = 1 << 1, MEMREMAP_WC = 1 << 2, + MEMREMAP_ENC = 1 << 3, + MEMREMAP_DEC = 1 << 4, }; void *memremap(resource_size_t offset, size_t size, unsigned long flags); diff --git a/kernel/memremap.c b/kernel/memremap.c index 124bed776532..9afdc434fb49 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size) } #endif -static void *try_ram_remap(resource_size_t offset, size_t size) +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) { unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) return __va(offset); + return NULL; /* fallback to arch_memremap_wb */ } @@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem @@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * the requested range is potentially in System RAM. 
*/ if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size); + addr = try_ram_remap(offset, size, flags); if (!addr) addr = arch_memremap_wb(offset, size); } diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index d7d30da754ba..b1dd4a948fc0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup); static int after_paging_init __initdata; +pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + return prot; +} + void __init __weak early_ioremap_shutdown(void) { } @@ -215,14 +222,19 @@ early_ioremap(resource_size_t phys_addr, unsigned long size) void __init * early_memremap(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, - FIXMAP_PAGE_NORMAL); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_NORMAL); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #ifdef FIXMAP_PAGE_RO void __init * early_memremap_ro(resource_size_t phys_addr, unsigned long size) { - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, + FIXMAP_PAGE_RO); + + return (__force void *)__early_ioremap(phys_addr, size, prot); } #endif -- cgit v1.2.3 From c7753208a94c73d5beb1e4bd843081d6dc7d4678 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:21 -0500 Subject: x86, swiotlb: Add memory encryption support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since DMA addresses will effectively look like 48-bit addresses when the memory encryption mask is set, SWIOTLB is needed if the DMA mask of the device performing the DMA does not support 48 bits. SWIOTLB will be initialized to create decrypted bounce buffers for use by these devices. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S.
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/aa2d29b78ae7d508db8881e46a3215231b9327a7.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dma-mapping.h | 5 ++-- arch/x86/include/asm/mem_encrypt.h | 5 ++++ arch/x86/kernel/pci-dma.c | 11 +++++--- arch/x86/kernel/pci-nommu.c | 2 +- arch/x86/kernel/pci-swiotlb.c | 15 +++++++++-- arch/x86/mm/mem_encrypt.c | 22 ++++++++++++++++ include/linux/swiotlb.h | 1 + init/main.c | 10 +++++++ lib/swiotlb.c | 54 ++++++++++++++++++++++++++++++++------ 9 files changed, 108 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 398c79889f5c..1387dafdba2d 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index ab1fe77c2f73..70e55f6b9adf 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -34,6 +34,11 @@ void __init sme_early_init(void); void __init sme_encrypt_kernel(void); void __init sme_enable(void); +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0UL diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5e16d3f29594..0accc2404b92 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -93,9 +93,12 @@ again: if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size), flag); - if (page && page_to_phys(page) + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; + if (page) { + addr = phys_to_dma(dev, page_to_phys(page)); + if (addr + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } } } /* fallback */ @@ -104,7 +107,7 @@ again: if (!page) return NULL; - addr = page_to_phys(page); + addr = phys_to_dma(dev, page_to_phys(page)); if (addr + size > dma_mask) { __free_pages(page, get_order(size)); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a6d404087fe3..4fc3cb60ea11 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t bus = page_to_phys(page) + offset; + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) return NOMMU_MAPPING_ERROR; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e23577e17cf..677077510e30 100644 --- 
a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,12 +6,14 @@ #include #include #include +#include #include #include #include #include #include + int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_swiotlb_late_init); /* - * if 4GB or more detected (and iommu=off not set) return 1 - * and set swiotlb to 1. + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. */ int __init pci_swiotlb_detect_4gb(void) { @@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void) if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. + */ + if (sme_active()) + swiotlb = 1; + return swiotlb; } IOMMU_INIT(pci_swiotlb_detect_4gb, diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0843d023da4a..a7400ec8538b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -13,11 +13,14 @@ #include #include #include +#include +#include #include #include #include #include +#include /* * Since SME related variables are set early in the boot process they must @@ -177,6 +180,25 @@ void __init sme_early_init(void) protection_map[i] = pgprot_encrypted(protection_map[i]); } +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + void __init sme_encrypt_kernel(void) { } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 4ee479f2f355..15e7160751a8 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -35,6 +35,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); +extern void __init swiotlb_update_mem_attributes(void); /* * Enumeration for sync targets diff --git a/init/main.c b/init/main.c index 052481fbe363..9789ab7fe85e 100644 --- a/init/main.c +++ b/init/main.c @@ -488,6 +488,8 @@ void __init __weak thread_stack_cache_init(void) } #endif +void __init __weak mem_encrypt_init(void) { } + /* * Set up kernel memory allocators */ @@ -641,6 +643,14 @@ asmlinkage __visible void __init start_kernel(void) */ locking_selftest(); + /* + * This needs to be called before any devices perform DMA + * operations that might use the SWIOTLB bounce buffers. It will + * mark the bounce buffers as decrypted so that their usage will + * not cause "plain-text" data to be decrypted when accessed. 
+ */ + mem_encrypt_init(); + #ifdef CONFIG_BLK_DEV_INITRD if (initrd_start && !initrd_below_start_ok && page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { diff --git a/lib/swiotlb.c b/lib/swiotlb.c index a8d74a733a38..04ac91acf193 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void) return size ? size : (IO_TLB_DEFAULT_SIZE); } +void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { } + +/* For swiotlb, clear memory encryption mask from dma addresses */ +static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev, + phys_addr_t address) +{ + return __sme_clr(phys_to_dma(hwdev, address)); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -183,6 +193,31 @@ void swiotlb_print_info(void) bytes >> 20, vstart, vend - 1); } +/* + * Early SWIOTLB allocation may be too early to allow an architecture to + * perform the desired operations. This function allows the architecture to + * call SWIOTLB when the operations are possible. It needs to be called + * before the SWIOTLB memory is used. + */ +void __init swiotlb_update_mem_attributes(void) +{ + void *vaddr; + unsigned long bytes; + + if (no_iotlb_memory || late_alloc) + return; + + vaddr = phys_to_virt(io_tlb_start); + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + swiotlb_set_mem_attributes(vaddr, bytes); + memset(vaddr, 0, bytes); + + vaddr = phys_to_virt(io_tlb_overflow_buffer); + bytes = PAGE_ALIGN(io_tlb_overflow); + swiotlb_set_mem_attributes(vaddr, bytes); + memset(vaddr, 0, bytes); +} + int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { void *v_overflow_buffer; @@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_start = virt_to_phys(tlb); io_tlb_end = io_tlb_start + bytes; + swiotlb_set_mem_attributes(tlb, bytes); memset(tlb, 0, bytes); /* @@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!v_overflow_buffer) goto cleanup2; + swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow); + memset(v_overflow_buffer, 0, io_tlb_overflow); io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); /* @@ -581,7 +619,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, return SWIOTLB_MAP_ERROR; } - start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start); return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir, attrs); } @@ -702,7 +740,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, goto err_warn; ret = phys_to_virt(paddr); - dev_addr = phys_to_dma(hwdev, paddr); + dev_addr = swiotlb_phys_to_dma(hwdev, paddr); /* Confirm address can be DMA'd by device */ if (dev_addr + size - 1 > dma_mask) { @@ -812,10 +850,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, map = map_single(dev, phys, size, dir, attrs); if (map == SWIOTLB_MAP_ERROR) { swiotlb_full(dev, size, dir, 1); - return phys_to_dma(dev, io_tlb_overflow_buffer); + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); } - dev_addr = phys_to_dma(dev, map); + dev_addr = swiotlb_phys_to_dma(dev, map); /* Ensure that the address returned is DMA'ble */ if (dma_capable(dev, dev_addr, size)) @@ -824,7 +862,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, attrs |= DMA_ATTR_SKIP_CPU_SYNC; 
swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - return phys_to_dma(dev, io_tlb_overflow_buffer); + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); } EXPORT_SYMBOL_GPL(swiotlb_map_page); @@ -958,7 +996,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, sg_dma_len(sgl) = 0; return 0; } - sg->dma_address = phys_to_dma(hwdev, map); + sg->dma_address = swiotlb_phys_to_dma(hwdev, map); } else sg->dma_address = dev_addr; sg_dma_len(sg) = sg->length; @@ -1026,7 +1064,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); + return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer)); } EXPORT_SYMBOL(swiotlb_dma_mapping_error); @@ -1039,6 +1077,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error); int swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } EXPORT_SYMBOL(swiotlb_dma_supported); -- cgit v1.2.3 From 648babb7078c6310d2af5b8aa01f086030916968 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:22 -0500 Subject: swiotlb: Add warnings for use of bounce buffers with SME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add warnings to let the user know when bounce buffers are being used for DMA when SME is active. Since the bounce buffers are not in encrypted memory, these notifications allow the user to determine an appropriate action, if necessary. Possible actions include utilizing an IOMMU, replacing the device with one that supports 64-bit DMA, or ignoring the message if the device isn't used much. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/d112564053c3f2e86ca634a8d4fa4abc0eb53a6a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- include/linux/dma-mapping.h | 13 +++++++++++++ lib/swiotlb.c | 3 +++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 843ab866e0f4..fce2369ecf82 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -10,6 +10,7 @@ #include #include #include +#include /** * List of possible attributes associated with a DMA mapping.
The semantics @@ -548,6 +549,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +static inline void dma_check_mask(struct device *dev, u64 mask) +{ + if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) + dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); +} + static inline int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -564,6 +571,9 @@ static inline int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + + dma_check_mask(dev, mask); + *dev->dma_mask = mask; return 0; } @@ -583,6 +593,9 @@ static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { if (!dma_supported(dev, mask)) return -EIO; + + dma_check_mask(dev, mask); + dev->coherent_dma_mask = mask; return 0; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 04ac91acf193..8c6c83ef57a4 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -507,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + if (sme_active()) + pr_warn_once("SME is active and system is using DMA bounce buffers\n"); + mask = dma_get_seg_boundary(hwdev); tbl_dma_addr &= mask; -- cgit v1.2.3 From bba4ed011a52d494aa7ef5e08cf226709bbf3f60 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:28 -0500 Subject: x86/mm, kexec: Allow kexec to be used with SME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide support so that kexec can be used to boot a kernel when SME is enabled. Support is needed to allocate pages for kexec without encryption. This is needed in order to be able to reboot into the kernel in the same manner as it was originally booted. Additionally, when shutting down all of the CPUs we need to be sure to flush the caches and then halt. This is needed when booting from a state where SME was not active into a state where SME is active (or vice-versa). Without these steps, it is possible for cache lines to exist for the same physical location but tagged both with and without the encryption bit. This can cause random memory corruption when caches are flushed depending on which cacheline is written last. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S.
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b95ff075db3e7cd545313f2fb609a49619a09625.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/init.h | 1 + arch/x86/include/asm/kexec.h | 8 ++++++++ arch/x86/include/asm/pgtable_types.h | 1 + arch/x86/kernel/machine_kexec_64.c | 22 +++++++++++++++++++++- arch/x86/kernel/process.c | 17 +++++++++++++++-- arch/x86/mm/ident_map.c | 12 ++++++++---- include/linux/kexec.h | 8 ++++++++ kernel/kexec_core.c | 12 +++++++++++- 8 files changed, 73 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66fee..05c4aa00cc86 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -7,6 +7,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; /* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f0..e8183acf931f 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -207,6 +207,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 32095af0fefb..830992fc5a06 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -213,6 +213,7 @@ enum page_cache_mode { #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cb0a30473c23..9cf8daacc046 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: free_transition_pgtable(image); @@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -602,3 +603,22 @@ void 
arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } + +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. + */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ca198080ea9..bd6b85fac666 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -355,6 +355,7 @@ bool xen_set_default_idle(void) return ret; } #endif + void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); - for (;;) - halt(); + for (;;) { + /* + * Use wbinvd followed by hlt to stop the processor. This + * provides support for kexec on a processor that supports + * SME. With kexec, going from SME inactive to SME active + * requires clearing cache entries so that addresses without + * the encryption bit set don't corrupt the same physical + * address that has the encryption bit set when caches are + * flushed. To achieve this a wbinvd is performed followed by + * a hlt. Even if the processor is not in the kexec/SME + * scenario this only adds a wbinvd to a halting processor. + */ + asm volatile("wbinvd; hlt" : : : "memory"); + } } /* diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..31cea988fa36 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. 
*/ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index dd056fab9e35..2b7590f5483a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -327,6 +327,14 @@ static inline void *boot_phys_to_virt(unsigned long entry) return phys_to_virt(boot_phys_to_phys(entry)); } +#ifndef arch_kexec_post_alloc_pages +static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; } +#endif + +#ifndef arch_kexec_pre_free_pages +static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } +#endif + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1ae7c41c33c1..20fef1a38602 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; - pages = alloc_pages(gfp_mask, order); + pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; @@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); + + arch_kexec_post_alloc_pages(page_address(pages), count, + gfp_mask); + + if (gfp_mask & __GFP_ZERO) + for (i = 0; i < count; i++) + clear_highpage(pages + i); } return pages; @@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page) order = page_private(page); count = 1 << order; + + arch_kexec_pre_free_pages(page_address(page), count); + for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); -- cgit v1.2.3 From 77af0ae44e4a74f58dd741babdacec32fc8042b1 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Tue, 27 Jun 2017 08:58:04 -0500 Subject: crypto: ccp - Fix some line spacing Add/remove blank lines as appropriate. Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-dev.c | 1 + include/linux/ccp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/crypto/ccp/ccp-dev.c b/drivers/crypto/ccp/ccp-dev.c index 2506b5025700..67cbb3e76888 100644 --- a/drivers/crypto/ccp/ccp-dev.c +++ b/drivers/crypto/ccp/ccp-dev.c @@ -415,6 +415,7 @@ static void ccp_do_cmd_complete(unsigned long data) struct ccp_cmd *cmd = tdata->cmd; cmd->callback(cmd->data, cmd->ret); + complete(&tdata->completion); } diff --git a/include/linux/ccp.h b/include/linux/ccp.h index 3285c944194a..c03ee844a99d 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -20,7 +20,6 @@ #include #include - struct ccp_device; struct ccp_cmd; -- cgit v1.2.3 From 66d3994c673d211a9ce5da3d7576d90c8e0cd377 Mon Sep 17 00:00:00 2001 From: Tudor-Dan Ambarus Date: Wed, 5 Jul 2017 13:07:58 +0300 Subject: crypto: kpp - add get/set_flags helpers These helpers will be used for fallbacks to kpp software implementations. 
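A rough usage sketch of these helpers (hypothetical driver code; example_kpp_sync_fallback and the fallback tfm are assumptions for illustration, not part of this patch):

/*
 * Hypothetical example: propagate request flags from a driver's kpp
 * transform to its software fallback so both behave consistently.
 * The fallback would have been allocated with crypto_alloc_kpp().
 */
static void example_kpp_sync_fallback(struct crypto_kpp *tfm,
				      struct crypto_kpp *fallback)
{
	u32 flags = crypto_kpp_get_flags(tfm) & CRYPTO_TFM_REQ_MASK;

	crypto_kpp_set_flags(fallback, flags);
}

Note that crypto_kpp_set_flags() ORs the given bits into the underlying tfm flags, mirroring crypto_tfm_set_flags().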
Signed-off-by: Tudor Ambarus Signed-off-by: Herbert Xu --- include/crypto/kpp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index 2133d17b7156..1bde0a6514fa 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -145,6 +145,16 @@ static inline struct crypto_kpp *crypto_kpp_reqtfm(struct kpp_request *req) return __crypto_kpp_tfm(req->base.tfm); } +static inline u32 crypto_kpp_get_flags(struct crypto_kpp *tfm) +{ + return crypto_tfm_get_flags(crypto_kpp_tfm(tfm)); +} + +static inline void crypto_kpp_set_flags(struct crypto_kpp *tfm, u32 flags) +{ + crypto_tfm_set_flags(crypto_kpp_tfm(tfm), flags); +} + /** * crypto_free_kpp() - free KPP tfm handle * -- cgit v1.2.3 From 720419f01832f7e697cb80480b97b2a1e96045cd Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 6 Jul 2017 09:59:14 -0500 Subject: crypto: ccp - Introduce the AMD Secure Processor device The CCP device is part of the AMD Secure Processor. In order to expand the usage of the AMD Secure Processor, create a framework that allows functional components of the AMD Secure Processor to be initialized and handled appropriately. Signed-off-by: Brijesh Singh Acked-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 6 +- drivers/crypto/ccp/Kconfig | 21 +++-- drivers/crypto/ccp/Makefile | 4 +- drivers/crypto/ccp/ccp-dev-v3.c | 4 +- drivers/crypto/ccp/ccp-dev-v5.c | 5 +- drivers/crypto/ccp/ccp-dev.c | 106 +++++++++------------- drivers/crypto/ccp/ccp-dev.h | 21 +---- drivers/crypto/ccp/ccp-pci.c | 81 +++++++++++------ drivers/crypto/ccp/ccp-platform.c | 70 ++++++++------- drivers/crypto/ccp/sp-dev.c | 182 ++++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sp-dev.h | 120 +++++++++++++++++++++++++ include/linux/ccp.h | 7 +- 12 files changed, 463 insertions(+), 164 deletions(-) create mode 100644 drivers/crypto/ccp/sp-dev.c create mode 100644 drivers/crypto/ccp/sp-dev.h (limited to 'include') diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 49397a9ae67c..5b5393f1b87a 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -540,11 +540,11 @@ config CRYPTO_DEV_ATMEL_ECC will be called atmel-ecc. config CRYPTO_DEV_CCP - bool "Support for AMD Cryptographic Coprocessor" + bool "Support for AMD Secure Processor" depends on ((X86 && PCI) || (ARM64 && (OF_ADDRESS || ACPI))) && HAS_IOMEM help - The AMD Cryptographic Coprocessor provides hardware offload support - for encryption, hashing and related operations. + The AMD Secure Processor provides support for the Cryptographic Coprocessor + (CCP) and the Platform Security Processor (PSP) devices. if CRYPTO_DEV_CCP source "drivers/crypto/ccp/Kconfig" diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig index 2238f77aa248..15b63fd3d180 100644 --- a/drivers/crypto/ccp/Kconfig +++ b/drivers/crypto/ccp/Kconfig @@ -1,22 +1,29 @@ config CRYPTO_DEV_CCP_DD - tristate "Cryptographic Coprocessor device driver" - depends on CRYPTO_DEV_CCP + tristate "Secure Processor device driver" default m + help + Provides AMD Secure Processor device driver. + If you choose 'M' here, this module will be called ccp. 
+ +config CRYPTO_DEV_SP_CCP + bool "Cryptographic Coprocessor device" + default y + depends on CRYPTO_DEV_CCP_DD select HW_RANDOM select DMA_ENGINE select DMADEVICES select CRYPTO_SHA1 select CRYPTO_SHA256 help - Provides the interface to use the AMD Cryptographic Coprocessor - which can be used to offload encryption operations such as SHA, - AES and more. If you choose 'M' here, this module will be called - ccp. + Provides the support for AMD Cryptographic Coprocessor (CCP) device + which can be used to offload encryption operations such as SHA, AES + and more. config CRYPTO_DEV_CCP_CRYPTO tristate "Encryption and hashing offload support" - depends on CRYPTO_DEV_CCP_DD default m + depends on CRYPTO_DEV_CCP_DD + depends on CRYPTO_DEV_SP_CCP select CRYPTO_HASH select CRYPTO_BLKCIPHER select CRYPTO_AUTHENC diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 59493fd3a751..d2f1b5264362 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -1,9 +1,9 @@ obj-$(CONFIG_CRYPTO_DEV_CCP_DD) += ccp.o -ccp-objs := ccp-dev.o \ +ccp-objs := sp-dev.o ccp-platform.o +ccp-$(CONFIG_CRYPTO_DEV_SP_CCP) += ccp-dev.o \ ccp-ops.o \ ccp-dev-v3.o \ ccp-dev-v5.o \ - ccp-platform.o \ ccp-dmaengine.o \ ccp-debugfs.o ccp-$(CONFIG_PCI) += ccp-pci.o diff --git a/drivers/crypto/ccp/ccp-dev-v3.c b/drivers/crypto/ccp/ccp-dev-v3.c index 52aa88ba13a5..57179034b604 100644 --- a/drivers/crypto/ccp/ccp-dev-v3.c +++ b/drivers/crypto/ccp/ccp-dev-v3.c @@ -359,8 +359,7 @@ static void ccp_irq_bh(unsigned long data) static irqreturn_t ccp_irq_handler(int irq, void *data) { - struct device *dev = data; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct ccp_device *ccp = (struct ccp_device *)data; ccp_disable_queue_interrupts(ccp); if (ccp->use_tasklet) @@ -597,6 +596,5 @@ const struct ccp_vdata ccpv3 = { .version = CCP_VERSION(3, 0), .setup = NULL, .perform = &ccp3_actions, - .bar = 2, .offset = 0x20000, }; diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c index b10d2d2075cb..8ed2b377d05f 100644 --- a/drivers/crypto/ccp/ccp-dev-v5.c +++ b/drivers/crypto/ccp/ccp-dev-v5.c @@ -769,8 +769,7 @@ static void ccp5_irq_bh(unsigned long data) static irqreturn_t ccp5_irq_handler(int irq, void *data) { - struct device *dev = data; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct ccp_device *ccp = (struct ccp_device *)data; ccp5_disable_queue_interrupts(ccp); ccp->total_interrupts++; @@ -1113,7 +1112,6 @@ const struct ccp_vdata ccpv5a = { .version = CCP_VERSION(5, 0), .setup = ccp5_config, .perform = &ccp5_actions, - .bar = 2, .offset = 0x0, }; @@ -1122,6 +1120,5 @@ const struct ccp_vdata ccpv5b = { .dma_chan_attr = DMA_PRIVATE, .setup = ccp5other_config, .perform = &ccp5_actions, - .bar = 2, .offset = 0x0, }; diff --git a/drivers/crypto/ccp/ccp-dev.c b/drivers/crypto/ccp/ccp-dev.c index 83a0ce5661ad..6b4315f8dad9 100644 --- a/drivers/crypto/ccp/ccp-dev.c +++ b/drivers/crypto/ccp/ccp-dev.c @@ -111,13 +111,6 @@ static LIST_HEAD(ccp_units); static DEFINE_SPINLOCK(ccp_rr_lock); static struct ccp_device *ccp_rr; -/* Ever-increasing value to produce unique unit numbers */ -static atomic_t ccp_unit_ordinal; -static unsigned int ccp_increment_unit_ordinal(void) -{ - return atomic_inc_return(&ccp_unit_ordinal); -} - /** * ccp_add_device - add a CCP device to the list * @@ -465,14 +458,17 @@ int ccp_cmd_queue_thread(void *data) * * @dev: device struct of the CCP */ -struct ccp_device *ccp_alloc_struct(struct device *dev) +struct ccp_device *ccp_alloc_struct(struct 
sp_device *sp) { + struct device *dev = sp->dev; struct ccp_device *ccp; ccp = devm_kzalloc(dev, sizeof(*ccp), GFP_KERNEL); if (!ccp) return NULL; ccp->dev = dev; + ccp->sp = sp; + ccp->axcache = sp->axcache; INIT_LIST_HEAD(&ccp->cmd); INIT_LIST_HEAD(&ccp->backlog); @@ -487,9 +483,8 @@ struct ccp_device *ccp_alloc_struct(struct device *dev) init_waitqueue_head(&ccp->sb_queue); init_waitqueue_head(&ccp->suspend_queue); - ccp->ord = ccp_increment_unit_ordinal(); - snprintf(ccp->name, MAX_CCP_NAME_LEN, "ccp-%u", ccp->ord); - snprintf(ccp->rngname, MAX_CCP_NAME_LEN, "ccp-%u-rng", ccp->ord); + snprintf(ccp->name, MAX_CCP_NAME_LEN, "ccp-%u", sp->ord); + snprintf(ccp->rngname, MAX_CCP_NAME_LEN, "ccp-%u-rng", sp->ord); return ccp; } @@ -540,8 +535,9 @@ bool ccp_queues_suspended(struct ccp_device *ccp) return ccp->cmd_q_count == suspended; } -int ccp_dev_suspend(struct ccp_device *ccp, pm_message_t state) +int ccp_dev_suspend(struct sp_device *sp, pm_message_t state) { + struct ccp_device *ccp = sp->ccp_data; unsigned long flags; unsigned int i; @@ -563,8 +559,9 @@ int ccp_dev_suspend(struct ccp_device *ccp, pm_message_t state) return 0; } -int ccp_dev_resume(struct ccp_device *ccp) +int ccp_dev_resume(struct sp_device *sp) { + struct ccp_device *ccp = sp->ccp_data; unsigned long flags; unsigned int i; @@ -584,71 +581,54 @@ int ccp_dev_resume(struct ccp_device *ccp) } #endif -int ccp_dev_init(struct ccp_device *ccp) +int ccp_dev_init(struct sp_device *sp) { - ccp->io_regs = ccp->io_map + ccp->vdata->offset; - - if (ccp->vdata->setup) - ccp->vdata->setup(ccp); - - return ccp->vdata->perform->init(ccp); -} + struct device *dev = sp->dev; + struct ccp_device *ccp; + int ret; -void ccp_dev_destroy(struct ccp_device *ccp) -{ + ret = -ENOMEM; + ccp = ccp_alloc_struct(sp); if (!ccp) - return; + goto e_err; + sp->ccp_data = ccp; + + ccp->vdata = (struct ccp_vdata *)sp->dev_vdata->ccp_vdata; + if (!ccp->vdata || !ccp->vdata->version) { + ret = -ENODEV; + dev_err(dev, "missing driver data\n"); + goto e_err; + } - ccp->vdata->perform->destroy(ccp); -} + ccp->get_irq = sp->get_irq; + ccp->free_irq = sp->free_irq; -static int __init ccp_mod_init(void) -{ -#ifdef CONFIG_X86 - int ret; + ccp->io_regs = sp->io_map + ccp->vdata->offset; + if (ccp->vdata->setup) + ccp->vdata->setup(ccp); - ret = ccp_pci_init(); + ret = ccp->vdata->perform->init(ccp); if (ret) - return ret; + goto e_err; - /* Don't leave the driver loaded if init failed */ - if (ccp_present() != 0) { - ccp_pci_exit(); - return -ENODEV; - } + dev_notice(dev, "ccp enabled\n"); return 0; -#endif - -#ifdef CONFIG_ARM64 - int ret; - ret = ccp_platform_init(); - if (ret) - return ret; +e_err: + sp->ccp_data = NULL; - /* Don't leave the driver loaded if init failed */ - if (ccp_present() != 0) { - ccp_platform_exit(); - return -ENODEV; - } - - return 0; -#endif + dev_notice(dev, "ccp initialization failed\n"); - return -ENODEV; + return ret; } -static void __exit ccp_mod_exit(void) +void ccp_dev_destroy(struct sp_device *sp) { -#ifdef CONFIG_X86 - ccp_pci_exit(); -#endif + struct ccp_device *ccp = sp->ccp_data; -#ifdef CONFIG_ARM64 - ccp_platform_exit(); -#endif -} + if (!ccp) + return; -module_init(ccp_mod_init); -module_exit(ccp_mod_exit); + ccp->vdata->perform->destroy(ccp); +} diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h index df2e76e212ea..ca44821a376d 100644 --- a/drivers/crypto/ccp/ccp-dev.h +++ b/drivers/crypto/ccp/ccp-dev.h @@ -27,6 +27,8 @@ #include #include +#include "sp-dev.h" + #define MAX_CCP_NAME_LEN 16 #define 
MAX_DMAPOOL_NAME_LEN 32 @@ -344,6 +346,7 @@ struct ccp_device { char rngname[MAX_CCP_NAME_LEN]; struct device *dev; + struct sp_device *sp; /* Bus specific device information */ @@ -362,7 +365,6 @@ struct ccp_device { * them. */ struct mutex req_mutex ____cacheline_aligned; - void __iomem *io_map; void __iomem *io_regs; /* Master lists that all cmds are queued on. Because there can be @@ -637,7 +639,7 @@ void ccp_del_device(struct ccp_device *ccp); extern void ccp_log_error(struct ccp_device *, int); -struct ccp_device *ccp_alloc_struct(struct device *dev); +struct ccp_device *ccp_alloc_struct(struct sp_device *sp); bool ccp_queues_suspended(struct ccp_device *ccp); int ccp_cmd_queue_thread(void *data); int ccp_trng_read(struct hwrng *rng, void *data, size_t max, bool wait); @@ -652,11 +654,6 @@ void ccp_dmaengine_unregister(struct ccp_device *ccp); void ccp5_debugfs_setup(struct ccp_device *ccp); void ccp5_debugfs_destroy(void); -int ccp_dev_init(struct ccp_device *ccp); -void ccp_dev_destroy(struct ccp_device *ccp); -int ccp_dev_suspend(struct ccp_device *ccp, pm_message_t state); -int ccp_dev_resume(struct ccp_device *ccp); - /* Structure for computation functions that are device-specific */ struct ccp_actions { int (*aes)(struct ccp_op *); @@ -674,16 +671,6 @@ struct ccp_actions { irqreturn_t (*irqhandler)(int, void *); }; -/* Structure to hold CCP version-specific values */ -struct ccp_vdata { - const unsigned int version; - const unsigned int dma_chan_attr; - void (*setup)(struct ccp_device *); - const struct ccp_actions *perform; - const unsigned int bar; - const unsigned int offset; -}; - extern const struct ccp_vdata ccpv3_platform; extern const struct ccp_vdata ccpv3; extern const struct ccp_vdata ccpv5a; diff --git a/drivers/crypto/ccp/ccp-pci.c b/drivers/crypto/ccp/ccp-pci.c index 490ad0ad2218..ab2df96c4b9d 100644 --- a/drivers/crypto/ccp/ccp-pci.c +++ b/drivers/crypto/ccp/ccp-pci.c @@ -40,7 +40,8 @@ struct ccp_pci { static int ccp_get_msix_irqs(struct ccp_device *ccp) { - struct ccp_pci *ccp_pci = ccp->dev_specific; + struct sp_device *sp = ccp->sp; + struct ccp_pci *ccp_pci = sp->dev_specific; struct device *dev = ccp->dev; struct pci_dev *pdev = to_pci_dev(dev); struct msix_entry msix_entry[MSIX_VECTORS]; @@ -58,11 +59,11 @@ static int ccp_get_msix_irqs(struct ccp_device *ccp) for (v = 0; v < ccp_pci->msix_count; v++) { /* Set the interrupt names and request the irqs */ snprintf(ccp_pci->msix[v].name, name_len, "%s-%u", - ccp->name, v); + sp->name, v); ccp_pci->msix[v].vector = msix_entry[v].vector; ret = request_irq(ccp_pci->msix[v].vector, ccp->vdata->perform->irqhandler, - 0, ccp_pci->msix[v].name, dev); + 0, ccp_pci->msix[v].name, ccp); if (ret) { dev_notice(dev, "unable to allocate MSI-X IRQ (%d)\n", ret); @@ -86,6 +87,7 @@ e_irq: static int ccp_get_msi_irq(struct ccp_device *ccp) { + struct sp_device *sp = ccp->sp; struct device *dev = ccp->dev; struct pci_dev *pdev = to_pci_dev(dev); int ret; @@ -96,7 +98,7 @@ static int ccp_get_msi_irq(struct ccp_device *ccp) ccp->irq = pdev->irq; ret = request_irq(ccp->irq, ccp->vdata->perform->irqhandler, 0, - ccp->name, dev); + sp->name, ccp); if (ret) { dev_notice(dev, "unable to allocate MSI IRQ (%d)\n", ret); goto e_msi; @@ -134,17 +136,18 @@ static int ccp_get_irqs(struct ccp_device *ccp) static void ccp_free_irqs(struct ccp_device *ccp) { - struct ccp_pci *ccp_pci = ccp->dev_specific; + struct sp_device *sp = ccp->sp; + struct ccp_pci *ccp_pci = sp->dev_specific; struct device *dev = ccp->dev; struct pci_dev *pdev = 
to_pci_dev(dev); if (ccp_pci->msix_count) { while (ccp_pci->msix_count--) free_irq(ccp_pci->msix[ccp_pci->msix_count].vector, - dev); + ccp); pci_disable_msix(pdev); } else if (ccp->irq) { - free_irq(ccp->irq, dev); + free_irq(ccp->irq, ccp); pci_disable_msi(pdev); } ccp->irq = 0; @@ -152,7 +155,7 @@ static void ccp_free_irqs(struct ccp_device *ccp) static int ccp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct ccp_device *ccp; + struct sp_device *sp; struct ccp_pci *ccp_pci; struct device *dev = &pdev->dev; void __iomem * const *iomap_table; @@ -160,23 +163,23 @@ static int ccp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) int ret; ret = -ENOMEM; - ccp = ccp_alloc_struct(dev); - if (!ccp) + sp = sp_alloc_struct(dev); + if (!sp) goto e_err; ccp_pci = devm_kzalloc(dev, sizeof(*ccp_pci), GFP_KERNEL); if (!ccp_pci) goto e_err; - ccp->dev_specific = ccp_pci; - ccp->vdata = (struct ccp_vdata *)id->driver_data; - if (!ccp->vdata || !ccp->vdata->version) { + sp->dev_specific = ccp_pci; + sp->dev_vdata = (struct sp_dev_vdata *)id->driver_data; + if (!sp->dev_vdata) { ret = -ENODEV; dev_err(dev, "missing driver data\n"); goto e_err; } - ccp->get_irq = ccp_get_irqs; - ccp->free_irq = ccp_free_irqs; + sp->get_irq = ccp_get_irqs; + sp->free_irq = ccp_free_irqs; ret = pcim_enable_device(pdev); if (ret) { @@ -198,8 +201,8 @@ static int ccp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto e_err; } - ccp->io_map = iomap_table[ccp->vdata->bar]; - if (!ccp->io_map) { + sp->io_map = iomap_table[sp->dev_vdata->bar]; + if (!sp->io_map) { dev_err(dev, "ioremap failed\n"); ret = -ENOMEM; goto e_err; @@ -217,9 +220,9 @@ static int ccp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) } } - dev_set_drvdata(dev, ccp); + dev_set_drvdata(dev, sp); - ret = ccp_dev_init(ccp); + ret = sp_init(sp); if (ret) goto e_err; @@ -235,12 +238,12 @@ e_err: static void ccp_pci_remove(struct pci_dev *pdev) { struct device *dev = &pdev->dev; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct sp_device *sp = dev_get_drvdata(dev); - if (!ccp) + if (!sp) return; - ccp_dev_destroy(ccp); + sp_destroy(sp); dev_notice(dev, "disabled\n"); } @@ -249,24 +252,44 @@ static void ccp_pci_remove(struct pci_dev *pdev) static int ccp_pci_suspend(struct pci_dev *pdev, pm_message_t state) { struct device *dev = &pdev->dev; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct sp_device *sp = dev_get_drvdata(dev); - return ccp_dev_suspend(ccp, state); + return sp_suspend(sp, state); } static int ccp_pci_resume(struct pci_dev *pdev) { struct device *dev = &pdev->dev; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct sp_device *sp = dev_get_drvdata(dev); - return ccp_dev_resume(ccp); + return sp_resume(sp); } #endif +static const struct sp_dev_vdata dev_vdata[] = { + { + .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_CCP + .ccp_vdata = &ccpv3, +#endif + }, + { + .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_CCP + .ccp_vdata = &ccpv5a, +#endif + }, + { + .bar = 2, +#ifdef CONFIG_CRYPTO_DEV_SP_CCP + .ccp_vdata = &ccpv5b, +#endif + }, +}; static const struct pci_device_id ccp_pci_table[] = { - { PCI_VDEVICE(AMD, 0x1537), (kernel_ulong_t)&ccpv3 }, - { PCI_VDEVICE(AMD, 0x1456), (kernel_ulong_t)&ccpv5a }, - { PCI_VDEVICE(AMD, 0x1468), (kernel_ulong_t)&ccpv5b }, + { PCI_VDEVICE(AMD, 0x1537), (kernel_ulong_t)&dev_vdata[0] }, + { PCI_VDEVICE(AMD, 0x1456), (kernel_ulong_t)&dev_vdata[1] }, + { PCI_VDEVICE(AMD, 0x1468), (kernel_ulong_t)&dev_vdata[2] }, /* Last entry must be zero */ { 
0, } }; diff --git a/drivers/crypto/ccp/ccp-platform.c b/drivers/crypto/ccp/ccp-platform.c index 613188c5a624..419e00ccf532 100644 --- a/drivers/crypto/ccp/ccp-platform.c +++ b/drivers/crypto/ccp/ccp-platform.c @@ -35,26 +35,26 @@ struct ccp_platform { static const struct acpi_device_id ccp_acpi_match[]; static const struct of_device_id ccp_of_match[]; -static struct ccp_vdata *ccp_get_of_version(struct platform_device *pdev) +static struct sp_dev_vdata *ccp_get_of_version(struct platform_device *pdev) { #ifdef CONFIG_OF const struct of_device_id *match; match = of_match_node(ccp_of_match, pdev->dev.of_node); if (match && match->data) - return (struct ccp_vdata *)match->data; + return (struct sp_dev_vdata *)match->data; #endif return NULL; } -static struct ccp_vdata *ccp_get_acpi_version(struct platform_device *pdev) +static struct sp_dev_vdata *ccp_get_acpi_version(struct platform_device *pdev) { #ifdef CONFIG_ACPI const struct acpi_device_id *match; match = acpi_match_device(ccp_acpi_match, &pdev->dev); if (match && match->driver_data) - return (struct ccp_vdata *)match->driver_data; + return (struct sp_dev_vdata *)match->driver_data; #endif return NULL; } @@ -73,7 +73,7 @@ static int ccp_get_irq(struct ccp_device *ccp) ccp->irq = ret; ret = request_irq(ccp->irq, ccp->vdata->perform->irqhandler, 0, - ccp->name, dev); + ccp->name, ccp); if (ret) { dev_notice(dev, "unable to allocate IRQ (%d)\n", ret); return ret; @@ -99,14 +99,12 @@ static int ccp_get_irqs(struct ccp_device *ccp) static void ccp_free_irqs(struct ccp_device *ccp) { - struct device *dev = ccp->dev; - - free_irq(ccp->irq, dev); + free_irq(ccp->irq, ccp); } static int ccp_platform_probe(struct platform_device *pdev) { - struct ccp_device *ccp; + struct sp_device *sp; struct ccp_platform *ccp_platform; struct device *dev = &pdev->dev; enum dev_dma_attr attr; @@ -114,32 +112,31 @@ static int ccp_platform_probe(struct platform_device *pdev) int ret; ret = -ENOMEM; - ccp = ccp_alloc_struct(dev); - if (!ccp) + sp = sp_alloc_struct(dev); + if (!sp) goto e_err; ccp_platform = devm_kzalloc(dev, sizeof(*ccp_platform), GFP_KERNEL); if (!ccp_platform) goto e_err; - ccp->dev_specific = ccp_platform; - ccp->vdata = pdev->dev.of_node ? ccp_get_of_version(pdev) + sp->dev_specific = ccp_platform; + sp->dev_vdata = pdev->dev.of_node ? 
ccp_get_of_version(pdev) : ccp_get_acpi_version(pdev); - if (!ccp->vdata || !ccp->vdata->version) { + if (!sp->dev_vdata) { ret = -ENODEV; dev_err(dev, "missing driver data\n"); goto e_err; } - ccp->get_irq = ccp_get_irqs; - ccp->free_irq = ccp_free_irqs; + sp->get_irq = ccp_get_irqs; + sp->free_irq = ccp_free_irqs; ior = platform_get_resource(pdev, IORESOURCE_MEM, 0); - ccp->io_map = devm_ioremap_resource(dev, ior); - if (IS_ERR(ccp->io_map)) { - ret = PTR_ERR(ccp->io_map); + sp->io_map = devm_ioremap_resource(dev, ior); + if (IS_ERR(sp->io_map)) { + ret = PTR_ERR(sp->io_map); goto e_err; } - ccp->io_regs = ccp->io_map; attr = device_get_dma_attr(dev); if (attr == DEV_DMA_NOT_SUPPORTED) { @@ -149,9 +146,9 @@ static int ccp_platform_probe(struct platform_device *pdev) ccp_platform->coherent = (attr == DEV_DMA_COHERENT); if (ccp_platform->coherent) - ccp->axcache = CACHE_WB_NO_ALLOC; + sp->axcache = CACHE_WB_NO_ALLOC; else - ccp->axcache = CACHE_NONE; + sp->axcache = CACHE_NONE; ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48)); if (ret) { @@ -159,9 +156,9 @@ static int ccp_platform_probe(struct platform_device *pdev) goto e_err; } - dev_set_drvdata(dev, ccp); + dev_set_drvdata(dev, sp); - ret = ccp_dev_init(ccp); + ret = sp_init(sp); if (ret) goto e_err; @@ -177,9 +174,9 @@ e_err: static int ccp_platform_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct sp_device *sp = dev_get_drvdata(dev); - ccp_dev_destroy(ccp); + sp_destroy(sp); dev_notice(dev, "disabled\n"); @@ -191,23 +188,32 @@ static int ccp_platform_suspend(struct platform_device *pdev, pm_message_t state) { struct device *dev = &pdev->dev; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct sp_device *sp = dev_get_drvdata(dev); - return ccp_dev_suspend(ccp, state); + return sp_suspend(sp, state); } static int ccp_platform_resume(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ccp_device *ccp = dev_get_drvdata(dev); + struct sp_device *sp = dev_get_drvdata(dev); - return ccp_dev_resume(ccp); + return sp_resume(sp); } #endif +static const struct sp_dev_vdata dev_vdata[] = { + { + .bar = 0, +#ifdef CONFIG_CRYPTO_DEV_SP_CCP + .ccp_vdata = &ccpv3_platform, +#endif + }, +}; + #ifdef CONFIG_ACPI static const struct acpi_device_id ccp_acpi_match[] = { - { "AMDI0C00", (kernel_ulong_t)&ccpv3 }, + { "AMDI0C00", (kernel_ulong_t)&dev_vdata[0] }, { }, }; MODULE_DEVICE_TABLE(acpi, ccp_acpi_match); @@ -216,7 +222,7 @@ MODULE_DEVICE_TABLE(acpi, ccp_acpi_match); #ifdef CONFIG_OF static const struct of_device_id ccp_of_match[] = { { .compatible = "amd,ccp-seattle-v1a", - .data = (const void *)&ccpv3_platform }, + .data = (const void *)&dev_vdata[0] }, { }, }; MODULE_DEVICE_TABLE(of, ccp_of_match); diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c new file mode 100644 index 000000000000..63cc74ee6d2a --- /dev/null +++ b/drivers/crypto/ccp/sp-dev.c @@ -0,0 +1,182 @@ +/* + * AMD Secure Processor driver + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * Author: Gary R Hook + * Author: Brijesh Singh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ccp-dev.h" +#include "sp-dev.h" + +MODULE_AUTHOR("Tom Lendacky "); +MODULE_AUTHOR("Gary R Hook "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.1.0"); +MODULE_DESCRIPTION("AMD Secure Processor driver"); + +/* List of SPs, SP count, read-write access lock, and access functions + * + * Lock structure: get sp_unit_lock for reading whenever we need to + * examine the SP list. + */ +static DEFINE_RWLOCK(sp_unit_lock); +static LIST_HEAD(sp_units); + +/* Ever-increasing value to produce unique unit numbers */ +static atomic_t sp_ordinal; + +static void sp_add_device(struct sp_device *sp) +{ + unsigned long flags; + + write_lock_irqsave(&sp_unit_lock, flags); + + list_add_tail(&sp->entry, &sp_units); + + write_unlock_irqrestore(&sp_unit_lock, flags); +} + +static void sp_del_device(struct sp_device *sp) +{ + unsigned long flags; + + write_lock_irqsave(&sp_unit_lock, flags); + + list_del(&sp->entry); + + write_unlock_irqrestore(&sp_unit_lock, flags); +} + +/** + * sp_alloc_struct - allocate and initialize the sp_device struct + * + * @dev: device struct of the SP + */ +struct sp_device *sp_alloc_struct(struct device *dev) +{ + struct sp_device *sp; + + sp = devm_kzalloc(dev, sizeof(*sp), GFP_KERNEL); + if (!sp) + return NULL; + + sp->dev = dev; + sp->ord = atomic_inc_return(&sp_ordinal); + snprintf(sp->name, SP_MAX_NAME_LEN, "sp-%u", sp->ord); + + return sp; +} + +int sp_init(struct sp_device *sp) +{ + sp_add_device(sp); + + if (sp->dev_vdata->ccp_vdata) + ccp_dev_init(sp); + + return 0; +} + +void sp_destroy(struct sp_device *sp) +{ + if (sp->dev_vdata->ccp_vdata) + ccp_dev_destroy(sp); + + sp_del_device(sp); +} + +#ifdef CONFIG_PM +int sp_suspend(struct sp_device *sp, pm_message_t state) +{ + int ret; + + if (sp->dev_vdata->ccp_vdata) { + ret = ccp_dev_suspend(sp, state); + if (ret) + return ret; + } + + return 0; +} + +int sp_resume(struct sp_device *sp) +{ + int ret; + + if (sp->dev_vdata->ccp_vdata) { + ret = ccp_dev_resume(sp); + if (ret) + return ret; + } + + return 0; +} +#endif + +static int __init sp_mod_init(void) +{ +#ifdef CONFIG_X86 + int ret; + + ret = ccp_pci_init(); + if (ret) + return ret; + + /* Don't leave the driver loaded if init failed */ + if (ccp_present() != 0) { + ccp_pci_exit(); + return -ENODEV; + } + + return 0; +#endif + +#ifdef CONFIG_ARM64 + int ret; + + ret = ccp_platform_init(); + if (ret) + return ret; + + /* Don't leave the driver loaded if init failed */ + if (ccp_present() != 0) { + ccp_platform_exit(); + return -ENODEV; + } + + return 0; +#endif + + return -ENODEV; +} + +static void __exit sp_mod_exit(void) +{ +#ifdef CONFIG_X86 + ccp_pci_exit(); +#endif + +#ifdef CONFIG_ARM64 + ccp_platform_exit(); +#endif +} + +module_init(sp_mod_init); +module_exit(sp_mod_exit); diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h new file mode 100644 index 000000000000..189e57eeb4fa --- /dev/null +++ b/drivers/crypto/ccp/sp-dev.h @@ -0,0 +1,120 @@ +/* + * AMD Secure Processor driver + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * Author: Gary R Hook + * Author: Brijesh Singh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __SP_DEV_H__ +#define __SP_DEV_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SP_MAX_NAME_LEN 32 + +#define CACHE_NONE 0x00 +#define CACHE_WB_NO_ALLOC 0xb7 + +/* Structure to hold CCP device data */ +struct ccp_device; +struct ccp_vdata { + const unsigned int version; + const unsigned int dma_chan_attr; + void (*setup)(struct ccp_device *); + const struct ccp_actions *perform; + const unsigned int offset; +}; +/* Structure to hold SP device data */ +struct sp_dev_vdata { + const unsigned int bar; + + const struct ccp_vdata *ccp_vdata; + void *psp_vdata; +}; + +struct sp_device { + struct list_head entry; + + struct device *dev; + + struct sp_dev_vdata *dev_vdata; + unsigned int ord; + char name[SP_MAX_NAME_LEN]; + + /* Bus specific device information */ + void *dev_specific; + + /* I/O area used for device communication. */ + void __iomem *io_map; + + /* DMA caching attribute support */ + unsigned int axcache; + + bool irq_registered; + + int (*get_irq)(struct ccp_device *ccp); + void (*free_irq)(struct ccp_device *ccp); + + void *ccp_data; + void *psp_data; +}; + +int sp_pci_init(void); +void sp_pci_exit(void); + +int sp_platform_init(void); +void sp_platform_exit(void); + +struct sp_device *sp_alloc_struct(struct device *dev); + +int sp_init(struct sp_device *sp); +void sp_destroy(struct sp_device *sp); +struct sp_device *sp_get_master(void); + +int sp_suspend(struct sp_device *sp, pm_message_t state); +int sp_resume(struct sp_device *sp); + +#ifdef CONFIG_CRYPTO_DEV_SP_CCP + +int ccp_dev_init(struct sp_device *sp); +void ccp_dev_destroy(struct sp_device *sp); + +int ccp_dev_suspend(struct sp_device *sp, pm_message_t state); +int ccp_dev_resume(struct sp_device *sp); + +#else /* !CONFIG_CRYPTO_DEV_SP_CCP */ + +static inline int ccp_dev_init(struct sp_device *sp) +{ + return 0; +} +static inline void ccp_dev_destroy(struct sp_device *sp) { } + +static inline int ccp_dev_suspend(struct sp_device *sp, pm_message_t state) +{ + return 0; +} +static inline int ccp_dev_resume(struct sp_device *sp) +{ + return 0; +} +#endif /* CONFIG_CRYPTO_DEV_SP_CCP */ + +#endif diff --git a/include/linux/ccp.h b/include/linux/ccp.h index c03ee844a99d..3fd9a6dac344 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -23,8 +23,7 @@ struct ccp_device; struct ccp_cmd; -#if defined(CONFIG_CRYPTO_DEV_CCP_DD) || \ - defined(CONFIG_CRYPTO_DEV_CCP_DD_MODULE) +#if defined(CONFIG_CRYPTO_DEV_SP_CCP) /** * ccp_present - check if a CCP device is present @@ -70,7 +69,7 @@ unsigned int ccp_version(void); */ int ccp_enqueue_cmd(struct ccp_cmd *cmd); -#else /* CONFIG_CRYPTO_DEV_CCP_DD is not enabled */ +#else /* CONFIG_CRYPTO_DEV_CCP_SP_DEV is not enabled */ static inline int ccp_present(void) { @@ -87,7 +86,7 @@ static inline int ccp_enqueue_cmd(struct ccp_cmd *cmd) return -ENODEV; } -#endif /* CONFIG_CRYPTO_DEV_CCP_DD */ +#endif /* CONFIG_CRYPTO_DEV_SP_CCP */ /***** AES engine *****/ -- cgit v1.2.3 From f384b352cbf0310fd20c379c4710408c70e769b6 Mon Sep 17 00:00:00 2001 From: Cyrille Pitchen Date: Mon, 26 Jun 2017 15:10:00 +0200 Subject: mtd: spi-nor: parse Serial Flash Discoverable Parameters (SFDP) tables This patch adds support to the JESD216 rev B standard and parses the SFDP tables to dynamically initialize the 'struct spi_nor_flash_parameter'. 
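For context (an editorial illustration, not part of the patch): the SFDP area begins with an 8-byte header holding the little-endian "SFDP" signature, the revision minor/major bytes and the 0-based count of additional parameter headers. A minimal user-space sketch of that header check, using made-up sample bytes for a JESD216 rev B compliant device:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* First 8 bytes of a hypothetical SFDP area (rev 1.6, 1 header). */
	const uint8_t hdr[8] = { 0x53, 0x46, 0x44, 0x50,
				 0x06, 0x01, 0x00, 0xff };
	uint32_t sig = hdr[0] | (hdr[1] << 8) | (hdr[2] << 16) |
		       ((uint32_t)hdr[3] << 24);

	if (sig != 0x50444653U) { /* "SFDP" stored little-endian */
		fprintf(stderr, "no SFDP signature\n");
		return 1;
	}
	printf("SFDP rev %d.%d, %d parameter header(s)\n",
	       hdr[5], hdr[4], hdr[6] + 1);
	return 0;
}

The kernel code below performs the same check on data returned by the Read SFDP (5Ah) instruction before going on to parse the Basic Flash Parameter Table.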
Signed-off-by: Cyrille Pitchen
Reviewed-by: Marek Vasut
---
 drivers/mtd/spi-nor/spi-nor.c | 775 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/mtd/spi-nor.h | 6 +
 2 files changed, 768 insertions(+), 13 deletions(-)
(limited to 'include')

diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index 1413828ff1fb..196b52f083ae 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -86,6 +87,7 @@ struct flash_info {
 * to support memory size above 128Mib.
 */
 #define NO_CHIP_ERASE BIT(12) /* Chip does not support chip erase */
+#define SPI_NOR_SKIP_SFDP BIT(13) /* Skip parsing of SFDP tables */
 };
 #define JEDEC_MFR(info) ((info)->id[0])
@@ -1380,6 +1382,16 @@ write_err:
 return ret;
 }
+/**
+ * macronix_quad_enable() - set QE bit in Status Register.
+ * @nor: pointer to a 'struct spi_nor'
+ *
+ * Set the Quad Enable (QE) bit in the Status Register.
+ *
+ * bit 6 of the Status Register is the QE bit for Macronix like QSPI memories.
+ *
+ * Return: 0 on success, -errno otherwise.
+ */
 static int macronix_quad_enable(struct spi_nor *nor)
 {
 int ret, val;
@@ -1413,22 +1425,13 @@ static int macronix_quad_enable(struct spi_nor *nor)
 * second byte will be written to the configuration register.
 * Return negative if error occurred.
 */
-static int write_sr_cr(struct spi_nor *nor, u16 val)
-{
- nor->cmd_buf[0] = val & 0xff;
- nor->cmd_buf[1] = (val >> 8);
-
- return nor->write_reg(nor, SPINOR_OP_WRSR, nor->cmd_buf, 2);
-}
-
-static int spansion_quad_enable(struct spi_nor *nor)
+static int write_sr_cr(struct spi_nor *nor, u8 *sr_cr)
 {
 int ret;
- int quad_en = CR_QUAD_EN_SPAN << 8;
 write_enable(nor);
- ret = write_sr_cr(nor, quad_en);
+ ret = nor->write_reg(nor, SPINOR_OP_WRSR, sr_cr, 2);
 if (ret < 0) {
 dev_err(nor->dev,
 "error while writing configuration register\n");
@@ -1442,6 +1445,41 @@ static int spansion_quad_enable(struct spi_nor *nor)
 return ret;
 }
+ return 0;
+}
+
+/**
+ * spansion_quad_enable() - set QE bit in Configuration Register.
+ * @nor: pointer to a 'struct spi_nor'
+ *
+ * Set the Quad Enable (QE) bit in the Configuration Register.
+ * This function is kept for legacy purposes because it has been used for a
+ * long time without anybody complaining but it should be considered as
+ * deprecated and maybe buggy.
+ * First, this function doesn't care about the previous values of the Status
+ * and Configuration Registers when it sets the QE bit (bit 1) in the
+ * Configuration Register: all other bits are cleared, which may have unwanted
+ * side effects like removing some block protections.
+ * Secondly, it uses the Read Configuration Register (35h) instruction though
+ * a few very old memories don't support this instruction. If a pull-up
+ * resistor is present on the MISO/IO1 line, we might still be able to pass the
+ * "read back" test because the QSPI memory doesn't recognize the command,
+ * so leaves the MISO/IO1 line state unchanged, hence read_cr() returns 0xFF.
+ *
+ * bit 1 of the Configuration Register is the QE bit for Spansion like QSPI
+ * memories.
+ *
+ * Return: 0 on success, -errno otherwise.
+ */ +static int spansion_quad_enable(struct spi_nor *nor) +{ + u8 sr_cr[2] = {0, CR_QUAD_EN_SPAN}; + int ret; + + ret = write_sr_cr(nor, sr_cr); + if (ret) + return ret; + /* read back and check it */ ret = read_cr(nor); if (!(ret > 0 && (ret & CR_QUAD_EN_SPAN))) { @@ -1452,6 +1490,140 @@ static int spansion_quad_enable(struct spi_nor *nor) return 0; } +/** + * spansion_no_read_cr_quad_enable() - set QE bit in Configuration Register. + * @nor: pointer to a 'struct spi_nor' + * + * Set the Quad Enable (QE) bit in the Configuration Register. + * This function should be used with QSPI memories not supporting the Read + * Configuration Register (35h) instruction. + * + * bit 1 of the Configuration Register is the QE bit for Spansion like QSPI + * memories. + * + * Return: 0 on success, -errno otherwise. + */ +static int spansion_no_read_cr_quad_enable(struct spi_nor *nor) +{ + u8 sr_cr[2]; + int ret; + + /* Keep the current value of the Status Register. */ + ret = read_sr(nor); + if (ret < 0) { + dev_err(nor->dev, "error while reading status register\n"); + return -EINVAL; + } + sr_cr[0] = ret; + sr_cr[1] = CR_QUAD_EN_SPAN; + + return write_sr_cr(nor, sr_cr); +} + +/** + * spansion_read_cr_quad_enable() - set QE bit in Configuration Register. + * @nor: pointer to a 'struct spi_nor' + * + * Set the Quad Enable (QE) bit in the Configuration Register. + * This function should be used with QSPI memories supporting the Read + * Configuration Register (35h) instruction. + * + * bit 1 of the Configuration Register is the QE bit for Spansion like QSPI + * memories. + * + * Return: 0 on success, -errno otherwise. + */ +static int spansion_read_cr_quad_enable(struct spi_nor *nor) +{ + struct device *dev = nor->dev; + u8 sr_cr[2]; + int ret; + + /* Check current Quad Enable bit value. */ + ret = read_cr(nor); + if (ret < 0) { + dev_err(dev, "error while reading configuration register\n"); + return -EINVAL; + } + + if (ret & CR_QUAD_EN_SPAN) + return 0; + + sr_cr[1] = ret | CR_QUAD_EN_SPAN; + + /* Keep the current value of the Status Register. */ + ret = read_sr(nor); + if (ret < 0) { + dev_err(dev, "error while reading status register\n"); + return -EINVAL; + } + sr_cr[0] = ret; + + ret = write_sr_cr(nor, sr_cr); + if (ret) + return ret; + + /* Read back and check it. */ + ret = read_cr(nor); + if (!(ret > 0 && (ret & CR_QUAD_EN_SPAN))) { + dev_err(nor->dev, "Spansion Quad bit not set\n"); + return -EINVAL; + } + + return 0; +} + +/** + * sr2_bit7_quad_enable() - set QE bit in Status Register 2. + * @nor: pointer to a 'struct spi_nor' + * + * Set the Quad Enable (QE) bit in the Status Register 2. + * + * This is one of the procedures to set the QE bit described in the SFDP + * (JESD216 rev B) specification but no manufacturer using this procedure has + * been identified yet, hence the name of the function. + * + * Return: 0 on success, -errno otherwise. + */ +static int sr2_bit7_quad_enable(struct spi_nor *nor) +{ + u8 sr2; + int ret; + + /* Check current Quad Enable bit value. */ + ret = nor->read_reg(nor, SPINOR_OP_RDSR2, &sr2, 1); + if (ret) + return ret; + if (sr2 & SR2_QUAD_EN_BIT7) + return 0; + + /* Update the Quad Enable bit. 
*/
+ sr2 |= SR2_QUAD_EN_BIT7;
+
+ write_enable(nor);
+
+ ret = nor->write_reg(nor, SPINOR_OP_WRSR2, &sr2, 1);
+ if (ret < 0) {
+ dev_err(nor->dev, "error while writing status register 2\n");
+ return -EINVAL;
+ }
+
+ ret = spi_nor_wait_till_ready(nor);
+ if (ret < 0) {
+ dev_err(nor->dev, "timeout while writing status register 2\n");
+ return ret;
+ }
+
+ /* Read back and check it. */
+ ret = nor->read_reg(nor, SPINOR_OP_RDSR2, &sr2, 1);
+ if (!(ret > 0 && (sr2 & SR2_QUAD_EN_BIT7))) {
+ dev_err(nor->dev, "SR2 Quad bit not set\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
 static int spi_nor_check(struct spi_nor *nor)
 {
 if (!nor->dev || !nor->read || !nor->write ||
@@ -1591,6 +1763,560 @@ spi_nor_set_pp_settings(struct spi_nor_pp_command *pp,
 pp->proto = proto;
 }
+/*
+ * Serial Flash Discoverable Parameters (SFDP) parsing.
+ */
+
+/**
+ * spi_nor_read_sfdp() - read Serial Flash Discoverable Parameters.
+ * @nor: pointer to a 'struct spi_nor'
+ * @addr: offset in the SFDP area to start reading data from
+ * @len: number of bytes to read
+ * @buf: buffer where the SFDP data are copied into
+ *
+ * Whatever the actual numbers of bytes for address and dummy cycles are
+ * for (Fast) Read commands, the Read SFDP (5Ah) instruction is always
+ * followed by a 3-byte address and 8 dummy clock cycles.
+ *
+ * Return: 0 on success, -errno otherwise.
+ */
+static int spi_nor_read_sfdp(struct spi_nor *nor, u32 addr,
+ size_t len, void *buf)
+{
+ u8 addr_width, read_opcode, read_dummy;
+ int ret;
+
+ read_opcode = nor->read_opcode;
+ addr_width = nor->addr_width;
+ read_dummy = nor->read_dummy;
+
+ nor->read_opcode = SPINOR_OP_RDSFDP;
+ nor->addr_width = 3;
+ nor->read_dummy = 8;
+
+ while (len) {
+ ret = nor->read(nor, addr, len, (u8 *)buf);
+ if (!ret || ret > len) {
+ ret = -EIO;
+ goto read_err;
+ }
+ if (ret < 0)
+ goto read_err;
+
+ buf += ret;
+ addr += ret;
+ len -= ret;
+ }
+ ret = 0;
+
+read_err:
+ nor->read_opcode = read_opcode;
+ nor->addr_width = addr_width;
+ nor->read_dummy = read_dummy;
+
+ return ret;
+}
+
+struct sfdp_parameter_header {
+ u8 id_lsb;
+ u8 minor;
+ u8 major;
+ u8 length; /* in double words */
+ u8 parameter_table_pointer[3]; /* byte address */
+ u8 id_msb;
+};
+
+#define SFDP_PARAM_HEADER_ID(p) (((p)->id_msb << 8) | (p)->id_lsb)
+#define SFDP_PARAM_HEADER_PTP(p) \
+ (((p)->parameter_table_pointer[2] << 16) | \
+ ((p)->parameter_table_pointer[1] << 8) | \
+ ((p)->parameter_table_pointer[0] << 0))
+
+#define SFDP_BFPT_ID 0xff00 /* Basic Flash Parameter Table */
+#define SFDP_SECTOR_MAP_ID 0xff81 /* Sector Map Table */
+
+#define SFDP_SIGNATURE 0x50444653U
+#define SFDP_JESD216_MAJOR 1
+#define SFDP_JESD216_MINOR 0
+#define SFDP_JESD216A_MINOR 5
+#define SFDP_JESD216B_MINOR 6
+
+struct sfdp_header {
+ u32 signature; /* 0x50444653U <=> "SFDP" */
+ u8 minor;
+ u8 major;
+ u8 nph; /* 0-based number of parameter headers */
+ u8 unused;
+
+ /* Basic Flash Parameter Table. */
+ struct sfdp_parameter_header bfpt_header;
+};
+
+/* Basic Flash Parameter Table */
+
+/*
+ * JESD216 rev B defines a Basic Flash Parameter Table of 16 DWORDs.
+ * They are indexed from 1 but C arrays are indexed from 0.
+ */
+#define BFPT_DWORD(i) ((i) - 1)
+#define BFPT_DWORD_MAX 16
+
+/* The first version of JESD216 defined only 9 DWORDs. */
+#define BFPT_DWORD_MAX_JESD216 9
+
+/* 1st DWORD.
*/ +#define BFPT_DWORD1_FAST_READ_1_1_2 BIT(16) +#define BFPT_DWORD1_ADDRESS_BYTES_MASK GENMASK(18, 17) +#define BFPT_DWORD1_ADDRESS_BYTES_3_ONLY (0x0UL << 17) +#define BFPT_DWORD1_ADDRESS_BYTES_3_OR_4 (0x1UL << 17) +#define BFPT_DWORD1_ADDRESS_BYTES_4_ONLY (0x2UL << 17) +#define BFPT_DWORD1_DTR BIT(19) +#define BFPT_DWORD1_FAST_READ_1_2_2 BIT(20) +#define BFPT_DWORD1_FAST_READ_1_4_4 BIT(21) +#define BFPT_DWORD1_FAST_READ_1_1_4 BIT(22) + +/* 5th DWORD. */ +#define BFPT_DWORD5_FAST_READ_2_2_2 BIT(0) +#define BFPT_DWORD5_FAST_READ_4_4_4 BIT(4) + +/* 11th DWORD. */ +#define BFPT_DWORD11_PAGE_SIZE_SHIFT 4 +#define BFPT_DWORD11_PAGE_SIZE_MASK GENMASK(7, 4) + +/* 15th DWORD. */ + +/* + * (from JESD216 rev B) + * Quad Enable Requirements (QER): + * - 000b: Device does not have a QE bit. Device detects 1-1-4 and 1-4-4 + * reads based on instruction. DQ3/HOLD# functions are hold during + * instruction phase. + * - 001b: QE is bit 1 of status register 2. It is set via Write Status with + * two data bytes where bit 1 of the second byte is one. + * [...] + * Writing only one byte to the status register has the side-effect of + * clearing status register 2, including the QE bit. The 100b code is + * used if writing one byte to the status register does not modify + * status register 2. + * - 010b: QE is bit 6 of status register 1. It is set via Write Status with + * one data byte where bit 6 is one. + * [...] + * - 011b: QE is bit 7 of status register 2. It is set via Write status + * register 2 instruction 3Eh with one data byte where bit 7 is one. + * [...] + * The status register 2 is read using instruction 3Fh. + * - 100b: QE is bit 1 of status register 2. It is set via Write Status with + * two data bytes where bit 1 of the second byte is one. + * [...] + * In contrast to the 001b code, writing one byte to the status + * register does not modify status register 2. + * - 101b: QE is bit 1 of status register 2. Status register 1 is read using + * Read Status instruction 05h. Status register2 is read using + * instruction 35h. QE is set via Writ Status instruction 01h with + * two data bytes where bit 1 of the second byte is one. + * [...] + */ +#define BFPT_DWORD15_QER_MASK GENMASK(22, 20) +#define BFPT_DWORD15_QER_NONE (0x0UL << 20) /* Micron */ +#define BFPT_DWORD15_QER_SR2_BIT1_BUGGY (0x1UL << 20) +#define BFPT_DWORD15_QER_SR1_BIT6 (0x2UL << 20) /* Macronix */ +#define BFPT_DWORD15_QER_SR2_BIT7 (0x3UL << 20) +#define BFPT_DWORD15_QER_SR2_BIT1_NO_RD (0x4UL << 20) +#define BFPT_DWORD15_QER_SR2_BIT1 (0x5UL << 20) /* Spansion */ + +struct sfdp_bfpt { + u32 dwords[BFPT_DWORD_MAX]; +}; + +/* Fast Read settings. */ + +static inline void +spi_nor_set_read_settings_from_bfpt(struct spi_nor_read_command *read, + u16 half, + enum spi_nor_protocol proto) +{ + read->num_mode_clocks = (half >> 5) & 0x07; + read->num_wait_states = (half >> 0) & 0x1f; + read->opcode = (half >> 8) & 0xff; + read->proto = proto; +} + +struct sfdp_bfpt_read { + /* The Fast Read x-y-z hardware capability in params->hwcaps.mask. */ + u32 hwcaps; + + /* + * The bit in BFPT DWORD tells us + * whether the Fast Read x-y-z command is supported. + */ + u32 supported_dword; + u32 supported_bit; + + /* + * The half-word at offset in BFPT DWORD + * encodes the op code, the number of mode clocks and the number of wait + * states to be used by Fast Read x-y-z command. + */ + u32 settings_dword; + u32 settings_shift; + + /* The SPI protocol for this Fast Read x-y-z command. 
*/ + enum spi_nor_protocol proto; +}; + +static const struct sfdp_bfpt_read sfdp_bfpt_reads[] = { + /* Fast Read 1-1-2 */ + { + SNOR_HWCAPS_READ_1_1_2, + BFPT_DWORD(1), BIT(16), /* Supported bit */ + BFPT_DWORD(4), 0, /* Settings */ + SNOR_PROTO_1_1_2, + }, + + /* Fast Read 1-2-2 */ + { + SNOR_HWCAPS_READ_1_2_2, + BFPT_DWORD(1), BIT(20), /* Supported bit */ + BFPT_DWORD(4), 16, /* Settings */ + SNOR_PROTO_1_2_2, + }, + + /* Fast Read 2-2-2 */ + { + SNOR_HWCAPS_READ_2_2_2, + BFPT_DWORD(5), BIT(0), /* Supported bit */ + BFPT_DWORD(6), 16, /* Settings */ + SNOR_PROTO_2_2_2, + }, + + /* Fast Read 1-1-4 */ + { + SNOR_HWCAPS_READ_1_1_4, + BFPT_DWORD(1), BIT(22), /* Supported bit */ + BFPT_DWORD(3), 16, /* Settings */ + SNOR_PROTO_1_1_4, + }, + + /* Fast Read 1-4-4 */ + { + SNOR_HWCAPS_READ_1_4_4, + BFPT_DWORD(1), BIT(21), /* Supported bit */ + BFPT_DWORD(3), 0, /* Settings */ + SNOR_PROTO_1_4_4, + }, + + /* Fast Read 4-4-4 */ + { + SNOR_HWCAPS_READ_4_4_4, + BFPT_DWORD(5), BIT(4), /* Supported bit */ + BFPT_DWORD(7), 16, /* Settings */ + SNOR_PROTO_4_4_4, + }, +}; + +struct sfdp_bfpt_erase { + /* + * The half-word at offset in DWORD encodes the + * op code and erase sector size to be used by Sector Erase commands. + */ + u32 dword; + u32 shift; +}; + +static const struct sfdp_bfpt_erase sfdp_bfpt_erases[] = { + /* Erase Type 1 in DWORD8 bits[15:0] */ + {BFPT_DWORD(8), 0}, + + /* Erase Type 2 in DWORD8 bits[31:16] */ + {BFPT_DWORD(8), 16}, + + /* Erase Type 3 in DWORD9 bits[15:0] */ + {BFPT_DWORD(9), 0}, + + /* Erase Type 4 in DWORD9 bits[31:16] */ + {BFPT_DWORD(9), 16}, +}; + +static int spi_nor_hwcaps_read2cmd(u32 hwcaps); + +/** + * spi_nor_parse_bfpt() - read and parse the Basic Flash Parameter Table. + * @nor: pointer to a 'struct spi_nor' + * @bfpt_header: pointer to the 'struct sfdp_parameter_header' describing + * the Basic Flash Parameter Table length and version + * @params: pointer to the 'struct spi_nor_flash_parameter' to be + * filled + * + * The Basic Flash Parameter Table is the main and only mandatory table as + * defined by the SFDP (JESD216) specification. + * It provides us with the total size (memory density) of the data array and + * the number of address bytes for Fast Read, Page Program and Sector Erase + * commands. + * For Fast READ commands, it also gives the number of mode clock cycles and + * wait states (regrouped in the number of dummy clock cycles) for each + * supported instruction op code. + * For Page Program, the page size is now available since JESD216 rev A, however + * the supported instruction op codes are still not provided. + * For Sector Erase commands, this table stores the supported instruction op + * codes and the associated sector sizes. + * Finally, the Quad Enable Requirements (QER) are also available since JESD216 + * rev A. The QER bits encode the manufacturer dependent procedure to be + * executed to set the Quad Enable (QE) bit in some internal register of the + * Quad SPI memory. Indeed the QE bit, when it exists, must be set before + * sending any Quad SPI command to the memory. Actually, setting the QE bit + * tells the memory to reassign its WP# and HOLD#/RESET# pins to functions IO2 + * and IO3 hence enabling 4 (Quad) I/O lines. + * + * Return: 0 on success, -errno otherwise. 
+ */ +static int spi_nor_parse_bfpt(struct spi_nor *nor, + const struct sfdp_parameter_header *bfpt_header, + struct spi_nor_flash_parameter *params) +{ + struct mtd_info *mtd = &nor->mtd; + struct sfdp_bfpt bfpt; + size_t len; + int i, cmd, err; + u32 addr; + u16 half; + + /* JESD216 Basic Flash Parameter Table length is at least 9 DWORDs. */ + if (bfpt_header->length < BFPT_DWORD_MAX_JESD216) + return -EINVAL; + + /* Read the Basic Flash Parameter Table. */ + len = min_t(size_t, sizeof(bfpt), + bfpt_header->length * sizeof(u32)); + addr = SFDP_PARAM_HEADER_PTP(bfpt_header); + memset(&bfpt, 0, sizeof(bfpt)); + err = spi_nor_read_sfdp(nor, addr, len, &bfpt); + if (err < 0) + return err; + + /* Fix endianness of the BFPT DWORDs. */ + for (i = 0; i < BFPT_DWORD_MAX; i++) + bfpt.dwords[i] = le32_to_cpu(bfpt.dwords[i]); + + /* Number of address bytes. */ + switch (bfpt.dwords[BFPT_DWORD(1)] & BFPT_DWORD1_ADDRESS_BYTES_MASK) { + case BFPT_DWORD1_ADDRESS_BYTES_3_ONLY: + nor->addr_width = 3; + break; + + case BFPT_DWORD1_ADDRESS_BYTES_4_ONLY: + nor->addr_width = 4; + break; + + default: + break; + } + + /* Flash Memory Density (in bits). */ + params->size = bfpt.dwords[BFPT_DWORD(2)]; + if (params->size & BIT(31)) { + params->size &= ~BIT(31); + params->size = 1ULL << params->size; + } else { + params->size++; + } + params->size >>= 3; /* Convert to bytes. */ + + /* Fast Read settings. */ + for (i = 0; i < ARRAY_SIZE(sfdp_bfpt_reads); i++) { + const struct sfdp_bfpt_read *rd = &sfdp_bfpt_reads[i]; + struct spi_nor_read_command *read; + + if (!(bfpt.dwords[rd->supported_dword] & rd->supported_bit)) { + params->hwcaps.mask &= ~rd->hwcaps; + continue; + } + + params->hwcaps.mask |= rd->hwcaps; + cmd = spi_nor_hwcaps_read2cmd(rd->hwcaps); + read = ¶ms->reads[cmd]; + half = bfpt.dwords[rd->settings_dword] >> rd->settings_shift; + spi_nor_set_read_settings_from_bfpt(read, half, rd->proto); + } + + /* Sector Erase settings. */ + for (i = 0; i < ARRAY_SIZE(sfdp_bfpt_erases); i++) { + const struct sfdp_bfpt_erase *er = &sfdp_bfpt_erases[i]; + u32 erasesize; + u8 opcode; + + half = bfpt.dwords[er->dword] >> er->shift; + erasesize = half & 0xff; + + /* erasesize == 0 means this Erase Type is not supported. */ + if (!erasesize) + continue; + + erasesize = 1U << erasesize; + opcode = (half >> 8) & 0xff; +#ifdef CONFIG_MTD_SPI_NOR_USE_4K_SECTORS + if (erasesize == SZ_4K) { + nor->erase_opcode = opcode; + mtd->erasesize = erasesize; + break; + } +#endif + if (!mtd->erasesize || mtd->erasesize < erasesize) { + nor->erase_opcode = opcode; + mtd->erasesize = erasesize; + } + } + + /* Stop here if not JESD216 rev A or later. */ + if (bfpt_header->length < BFPT_DWORD_MAX) + return 0; + + /* Page size: this field specifies 'N' so the page size = 2^N bytes. */ + params->page_size = bfpt.dwords[BFPT_DWORD(11)]; + params->page_size &= BFPT_DWORD11_PAGE_SIZE_MASK; + params->page_size >>= BFPT_DWORD11_PAGE_SIZE_SHIFT; + params->page_size = 1U << params->page_size; + + /* Quad Enable Requirements. 
*/
+ switch (bfpt.dwords[BFPT_DWORD(15)] & BFPT_DWORD15_QER_MASK) {
+ case BFPT_DWORD15_QER_NONE:
+ params->quad_enable = NULL;
+ break;
+
+ case BFPT_DWORD15_QER_SR2_BIT1_BUGGY:
+ case BFPT_DWORD15_QER_SR2_BIT1_NO_RD:
+ params->quad_enable = spansion_no_read_cr_quad_enable;
+ break;
+
+ case BFPT_DWORD15_QER_SR1_BIT6:
+ params->quad_enable = macronix_quad_enable;
+ break;
+
+ case BFPT_DWORD15_QER_SR2_BIT7:
+ params->quad_enable = sr2_bit7_quad_enable;
+ break;
+
+ case BFPT_DWORD15_QER_SR2_BIT1:
+ params->quad_enable = spansion_read_cr_quad_enable;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * spi_nor_parse_sfdp() - parse the Serial Flash Discoverable Parameters.
+ * @nor: pointer to a 'struct spi_nor'
+ * @params: pointer to the 'struct spi_nor_flash_parameter' to be
+ * filled
+ *
+ * The Serial Flash Discoverable Parameters are described by the JEDEC JESD216
+ * specification. This is a standard which tends to be supported by almost all
+ * (Q)SPI memory manufacturers. Those hard-coded tables allow us to learn at
+ * runtime the main parameters needed to perform basic SPI flash operations such
+ * as Fast Read, Page Program or Sector Erase commands.
+ *
+ * Return: 0 on success, -errno otherwise.
+ */
+static int spi_nor_parse_sfdp(struct spi_nor *nor,
+ struct spi_nor_flash_parameter *params)
+{
+ const struct sfdp_parameter_header *param_header, *bfpt_header;
+ struct sfdp_parameter_header *param_headers = NULL;
+ struct sfdp_header header;
+ struct device *dev = nor->dev;
+ size_t psize;
+ int i, err;
+
+ /* Get the SFDP header. */
+ err = spi_nor_read_sfdp(nor, 0, sizeof(header), &header);
+ if (err < 0)
+ return err;
+
+ /* Check the SFDP header version. */
+ if (le32_to_cpu(header.signature) != SFDP_SIGNATURE ||
+ header.major != SFDP_JESD216_MAJOR ||
+ header.minor < SFDP_JESD216_MINOR)
+ return -EINVAL;
+
+ /*
+ * Verify that the first and only mandatory parameter header is a
+ * Basic Flash Parameter Table header as specified in JESD216.
+ */
+ bfpt_header = &header.bfpt_header;
+ if (SFDP_PARAM_HEADER_ID(bfpt_header) != SFDP_BFPT_ID ||
+ bfpt_header->major != SFDP_JESD216_MAJOR)
+ return -EINVAL;
+
+ /*
+ * Allocate memory then read all parameter headers with a single
+ * Read SFDP command. These parameter headers will actually be parsed
+ * twice: a first time to get the latest revision of the basic flash
+ * parameter table, then a second time to handle the supported optional
+ * tables.
+ * Hence we read the parameter headers once and for all to reduce the
+ * processing time. Also we use kmalloc() instead of devm_kmalloc()
+ * because we don't need to keep these parameter headers: the allocated
+ * memory is always released with kfree() before exiting this function.
+ */
+ if (header.nph) {
+ psize = header.nph * sizeof(*param_headers);
+
+ param_headers = kmalloc(psize, GFP_KERNEL);
+ if (!param_headers)
+ return -ENOMEM;
+
+ err = spi_nor_read_sfdp(nor, sizeof(header),
+ psize, param_headers);
+ if (err < 0) {
+ dev_err(dev, "failed to read SFDP parameter headers\n");
+ goto exit;
+ }
+ }
+
+ /*
+ * Check other parameter headers to get the latest revision of
+ * the basic flash parameter table.
+ */ + for (i = 0; i < header.nph; i++) { + param_header = ¶m_headers[i]; + + if (SFDP_PARAM_HEADER_ID(param_header) == SFDP_BFPT_ID && + param_header->major == SFDP_JESD216_MAJOR && + (param_header->minor > bfpt_header->minor || + (param_header->minor == bfpt_header->minor && + param_header->length > bfpt_header->length))) + bfpt_header = param_header; + } + + err = spi_nor_parse_bfpt(nor, bfpt_header, params); + if (err) + goto exit; + + /* Parse other parameter headers. */ + for (i = 0; i < header.nph; i++) { + param_header = ¶m_headers[i]; + + switch (SFDP_PARAM_HEADER_ID(param_header)) { + case SFDP_SECTOR_MAP_ID: + dev_info(dev, "non-uniform erase sector maps are not supported yet.\n"); + break; + + default: + break; + } + + if (err) + goto exit; + } + +exit: + kfree(param_headers); + return err; +} + static int spi_nor_init_params(struct spi_nor *nor, const struct flash_info *info, struct spi_nor_flash_parameter *params) @@ -1646,11 +2372,28 @@ static int spi_nor_init_params(struct spi_nor *nor, break; default: + /* Kept only for backward compatibility purpose. */ params->quad_enable = spansion_quad_enable; break; } } + /* Override the parameters with data read from SFDP tables. */ + nor->addr_width = 0; + nor->mtd.erasesize = 0; + if ((info->flags & (SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ)) && + !(info->flags & SPI_NOR_SKIP_SFDP)) { + struct spi_nor_flash_parameter sfdp_params; + + memcpy(&sfdp_params, params, sizeof(sfdp_params)); + if (spi_nor_parse_sfdp(nor, &sfdp_params)) { + nor->addr_width = 0; + nor->mtd.erasesize = 0; + } else { + memcpy(params, &sfdp_params, sizeof(*params)); + } + } + return 0; } @@ -1762,6 +2505,10 @@ static int spi_nor_select_erase(struct spi_nor *nor, { struct mtd_info *mtd = &nor->mtd; + /* Do nothing if already configured from SFDP. */ + if (mtd->erasesize) + return 0; + #ifdef CONFIG_MTD_SPI_NOR_USE_4K_SECTORS /* prefer "small sector" erase if possible */ if (info->flags & SECT_4K) { @@ -1994,9 +2741,11 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, if (ret) return ret; - if (info->addr_width) + if (nor->addr_width) { + /* already configured from SFDP */ + } else if (info->addr_width) { nor->addr_width = info->addr_width; - else if (mtd->size > 0x1000000) { + } else if (mtd->size > 0x1000000) { /* enable 4-byte addressing if the device exceeds 16MiB */ nor->addr_width = 4; if (JEDEC_MFR(info) == SNOR_MFR_SPANSION || diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 55faa2f07cca..0df3638ff0b8 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -41,6 +41,8 @@ #define SPINOR_OP_WREN 0x06 /* Write enable */ #define SPINOR_OP_RDSR 0x05 /* Read status register */ #define SPINOR_OP_WRSR 0x01 /* Write status register 1 byte */ +#define SPINOR_OP_RDSR2 0x3f /* Read status register 2 */ +#define SPINOR_OP_WRSR2 0x3e /* Write status register 2 */ #define SPINOR_OP_READ 0x03 /* Read data bytes (low frequency) */ #define SPINOR_OP_READ_FAST 0x0b /* Read data bytes (high frequency) */ #define SPINOR_OP_READ_1_1_2 0x3b /* Read data bytes (Dual Output SPI) */ @@ -56,6 +58,7 @@ #define SPINOR_OP_CHIP_ERASE 0xc7 /* Erase whole flash chip */ #define SPINOR_OP_SE 0xd8 /* Sector erase (usually 64KiB) */ #define SPINOR_OP_RDID 0x9f /* Read JEDEC ID */ +#define SPINOR_OP_RDSFDP 0x5a /* Read SFDP */ #define SPINOR_OP_RDCR 0x35 /* Read configuration register */ #define SPINOR_OP_RDFSR 0x70 /* Read flag status register */ @@ -128,6 +131,9 @@ /* Configuration Register bits. 
*/ #define CR_QUAD_EN_SPAN BIT(1) /* Spansion Quad I/O */ +/* Status Register 2 bits. */ +#define SR2_QUAD_EN_BIT7 BIT(7) + /* Supported SPI protocols */ #define SNOR_PROTO_INST_MASK GENMASK(23, 16) #define SNOR_PROTO_INST_SHIFT 16 -- cgit v1.2.3 From bb789e03f2a140a80ddc7d18b7ef4d626c2a71d8 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:34 -0300 Subject: media: cec: improve transmit timeout logging Kernel logging messes up the upcoming low-level CEC monitoring support which is very time-sensitive. So change the debug level of this message but keep a counter that is shown in the debugfs status log. Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 17 +++++++++++++---- include/media/cec.h | 2 ++ 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index a311c52723a1..7bd4d7d8713c 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -394,13 +394,17 @@ int cec_thread_func(void *_adap) if (adap->transmitting && timeout) { /* - * If we timeout, then log that. This really shouldn't - * happen and is an indication of a faulty CEC adapter - * driver, or the CEC bus is in some weird state. + * If we timeout, then log that. Normally this does + * not happen and it is an indication of a faulty CEC + * adapter driver, or the CEC bus is in some weird + * state. On rare occasions it can happen if there is + * so much traffic on the bus that the adapter was + * unable to transmit for CEC_XFER_TIMEOUT_MS (2.1s). */ - dprintk(0, "%s: message %*ph timed out!\n", __func__, + dprintk(1, "%s: message %*ph timed out\n", __func__, adap->transmitting->msg.len, adap->transmitting->msg.msg); + adap->tx_timeouts++; /* Just give up on this. */ cec_data_cancel(adap->transmitting); goto unlock; @@ -1951,6 +1955,11 @@ int cec_adap_status(struct seq_file *file, void *priv) if (adap->monitor_all_cnt) seq_printf(file, "file handles in Monitor All mode: %u\n", adap->monitor_all_cnt); + if (adap->tx_timeouts) { + seq_printf(file, "transmit timeouts: %u\n", + adap->tx_timeouts); + adap->tx_timeouts = 0; + } data = adap->transmitting; if (data) seq_printf(file, "transmitting message: %*ph (reply: %02x, timeout: %ums)\n", diff --git a/include/media/cec.h b/include/media/cec.h index 56643b27e4b8..e32b0e1a81a4 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -174,6 +174,8 @@ struct cec_adapter { bool passthrough; struct cec_log_addrs log_addrs; + u32 tx_timeouts; + #ifdef CONFIG_CEC_NOTIFIER struct cec_notifier *notifier; #endif -- cgit v1.2.3 From 0861ad14c6cfc9dc8bcde44bc23a7a355167eb9f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:35 -0300 Subject: media: cec: add *_ts variants for transmit_done/received_msg Currently the transmit_(attempt_)done and received_msg functions set the timestamp themselves. For the upcoming low-level pin API we need to pass this as an argument instead. So make _ts variants that allow the caller to specify the timestamp. 
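As an illustration (a hypothetical foo_* driver, not code from this patch), an interrupt handler would capture the timestamp before doing any other work and pass it down, rather than letting the core sample a later time:

#include <linux/interrupt.h>
#include <linux/ktime.h>
#include <media/cec.h>

static irqreturn_t foo_cec_irq_handler(int irq, void *priv)
{
	struct cec_adapter *adap = priv;
	ktime_t ts = ktime_get();	/* as close to the event as possible */

	/* ... read and clear the interrupt status registers here ... */

	cec_transmit_attempt_done_ts(adap, CEC_TX_STATUS_OK, ts);
	return IRQ_HANDLED;
}

The old names are kept as static inline wrappers that simply pass ktime_get(), so existing drivers keep working unchanged.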
Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 35 +++++++++++++++++++---------------- include/media/cec.h | 32 ++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 7bd4d7d8713c..82c1633f5b92 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -471,12 +471,12 @@ unlock: /* * Called by the CEC adapter if a transmit finished. */ -void cec_transmit_done(struct cec_adapter *adap, u8 status, u8 arb_lost_cnt, - u8 nack_cnt, u8 low_drive_cnt, u8 error_cnt) +void cec_transmit_done_ts(struct cec_adapter *adap, u8 status, + u8 arb_lost_cnt, u8 nack_cnt, u8 low_drive_cnt, + u8 error_cnt, ktime_t ts) { struct cec_data *data; struct cec_msg *msg; - u64 ts = ktime_get_ns(); dprintk(2, "%s: status %02x\n", __func__, status); mutex_lock(&adap->lock); @@ -496,7 +496,7 @@ void cec_transmit_done(struct cec_adapter *adap, u8 status, u8 arb_lost_cnt, /* Drivers must fill in the status! */ WARN_ON(status == 0); - msg->tx_ts = ts; + msg->tx_ts = ktime_to_ns(ts); msg->tx_status |= status; msg->tx_arb_lost_cnt += arb_lost_cnt; msg->tx_nack_cnt += nack_cnt; @@ -559,25 +559,26 @@ wake_thread: unlock: mutex_unlock(&adap->lock); } -EXPORT_SYMBOL_GPL(cec_transmit_done); +EXPORT_SYMBOL_GPL(cec_transmit_done_ts); -void cec_transmit_attempt_done(struct cec_adapter *adap, u8 status) +void cec_transmit_attempt_done_ts(struct cec_adapter *adap, + u8 status, ktime_t ts) { switch (status) { case CEC_TX_STATUS_OK: - cec_transmit_done(adap, status, 0, 0, 0, 0); + cec_transmit_done_ts(adap, status, 0, 0, 0, 0, ts); return; case CEC_TX_STATUS_ARB_LOST: - cec_transmit_done(adap, status, 1, 0, 0, 0); + cec_transmit_done_ts(adap, status, 1, 0, 0, 0, ts); return; case CEC_TX_STATUS_NACK: - cec_transmit_done(adap, status, 0, 1, 0, 0); + cec_transmit_done_ts(adap, status, 0, 1, 0, 0, ts); return; case CEC_TX_STATUS_LOW_DRIVE: - cec_transmit_done(adap, status, 0, 0, 1, 0); + cec_transmit_done_ts(adap, status, 0, 0, 1, 0, ts); return; case CEC_TX_STATUS_ERROR: - cec_transmit_done(adap, status, 0, 0, 0, 1); + cec_transmit_done_ts(adap, status, 0, 0, 0, 1, ts); return; default: /* Should never happen */ @@ -585,7 +586,7 @@ void cec_transmit_attempt_done(struct cec_adapter *adap, u8 status) return; } } -EXPORT_SYMBOL_GPL(cec_transmit_attempt_done); +EXPORT_SYMBOL_GPL(cec_transmit_attempt_done_ts); /* * Called when waiting for a reply times out. @@ -721,7 +722,8 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg, if (msg->timeout) dprintk(2, "%s: %*ph (wait for 0x%02x%s)\n", - __func__, msg->len, msg->msg, msg->reply, !block ? ", nb" : ""); + __func__, msg->len, msg->msg, msg->reply, + !block ? ", nb" : ""); else dprintk(2, "%s: %*ph%s\n", __func__, msg->len, msg->msg, !block ? 
" (nb)" : ""); @@ -918,7 +920,8 @@ static const u8 cec_msg_size[256] = { }; /* Called by the CEC adapter if a message is received */ -void cec_received_msg(struct cec_adapter *adap, struct cec_msg *msg) +void cec_received_msg_ts(struct cec_adapter *adap, + struct cec_msg *msg, ktime_t ts) { struct cec_data *data; u8 msg_init = cec_msg_initiator(msg); @@ -946,7 +949,7 @@ void cec_received_msg(struct cec_adapter *adap, struct cec_msg *msg) cec_has_log_addr(adap, msg_init)) return; - msg->rx_ts = ktime_get_ns(); + msg->rx_ts = ktime_to_ns(ts); msg->rx_status = CEC_RX_STATUS_OK; msg->sequence = msg->reply = msg->timeout = 0; msg->tx_status = 0; @@ -1111,7 +1114,7 @@ void cec_received_msg(struct cec_adapter *adap, struct cec_msg *msg) */ cec_receive_notify(adap, msg, is_reply); } -EXPORT_SYMBOL_GPL(cec_received_msg); +EXPORT_SYMBOL_GPL(cec_received_msg_ts); /* Logical Address Handling */ diff --git a/include/media/cec.h b/include/media/cec.h index e32b0e1a81a4..e1e60dbb66c3 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -228,15 +228,39 @@ int cec_transmit_msg(struct cec_adapter *adap, struct cec_msg *msg, bool block); /* Called by the adapter */ -void cec_transmit_done(struct cec_adapter *adap, u8 status, u8 arb_lost_cnt, - u8 nack_cnt, u8 low_drive_cnt, u8 error_cnt); +void cec_transmit_done_ts(struct cec_adapter *adap, u8 status, + u8 arb_lost_cnt, u8 nack_cnt, u8 low_drive_cnt, + u8 error_cnt, ktime_t ts); + +static inline void cec_transmit_done(struct cec_adapter *adap, u8 status, + u8 arb_lost_cnt, u8 nack_cnt, + u8 low_drive_cnt, u8 error_cnt) +{ + cec_transmit_done_ts(adap, status, arb_lost_cnt, nack_cnt, + low_drive_cnt, error_cnt, ktime_get()); +} /* * Simplified version of cec_transmit_done for hardware that doesn't retry * failed transmits. So this is always just one attempt in which case * the status is sufficient. */ -void cec_transmit_attempt_done(struct cec_adapter *adap, u8 status); -void cec_received_msg(struct cec_adapter *adap, struct cec_msg *msg); +void cec_transmit_attempt_done_ts(struct cec_adapter *adap, + u8 status, ktime_t ts); + +static inline void cec_transmit_attempt_done(struct cec_adapter *adap, + u8 status) +{ + cec_transmit_attempt_done_ts(adap, status, ktime_get()); +} + +void cec_received_msg_ts(struct cec_adapter *adap, + struct cec_msg *msg, ktime_t ts); + +static inline void cec_received_msg(struct cec_adapter *adap, + struct cec_msg *msg) +{ + cec_received_msg_ts(adap, msg, ktime_get()); +} /** * cec_get_edid_phys_addr() - find and return the physical address -- cgit v1.2.3 From e6259b5f7ab74e399f838cb1abdc12f3e28668f4 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:36 -0300 Subject: media: cec: add adap_free op This is needed for CEC adapters that allocate resources that have to be freed before the cec_adapter is deleted. 
Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-core.c | 2 ++ include/media/cec.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index b516d599d6c4..2e5765344d07 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -374,6 +374,8 @@ void cec_delete_adapter(struct cec_adapter *adap) kthread_stop(adap->kthread); if (adap->kthread_config) kthread_stop(adap->kthread_config); + if (adap->ops->adap_free) + adap->ops->adap_free(adap); #ifdef CONFIG_MEDIA_CEC_RC rc_free_device(adap->rc); #endif diff --git a/include/media/cec.h b/include/media/cec.h index e1e60dbb66c3..37768203572d 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -114,6 +114,7 @@ struct cec_adap_ops { int (*adap_transmit)(struct cec_adapter *adap, u8 attempts, u32 signal_free_time, struct cec_msg *msg); void (*adap_status)(struct cec_adapter *adap, struct seq_file *file); + void (*adap_free)(struct cec_adapter *adap); /* High-level CEC message callback */ int (*received)(struct cec_adapter *adap, struct cec_msg *msg); -- cgit v1.2.3 From 6303d97873d340e89acdef12effb66f88d79836f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:38 -0300 Subject: media: linux/cec.h: add pin monitoring API support Add support for low-level CEC pin monitoring. This adds a new monitor mode, a new capability and two new events. Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/cec.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index 44579a24f95d..bba73f33c8aa 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -318,6 +318,7 @@ static inline int cec_is_unconfigured(__u16 log_addr_mask) #define CEC_MODE_FOLLOWER (0x1 << 4) #define CEC_MODE_EXCL_FOLLOWER (0x2 << 4) #define CEC_MODE_EXCL_FOLLOWER_PASSTHRU (0x3 << 4) +#define CEC_MODE_MONITOR_PIN (0xd << 4) #define CEC_MODE_MONITOR (0xe << 4) #define CEC_MODE_MONITOR_ALL (0xf << 4) #define CEC_MODE_FOLLOWER_MSK 0xf0 @@ -338,6 +339,8 @@ static inline int cec_is_unconfigured(__u16 log_addr_mask) #define CEC_CAP_MONITOR_ALL (1 << 5) /* Hardware can use CEC only if the HDMI HPD pin is high. */ #define CEC_CAP_NEEDS_HPD (1 << 6) +/* Hardware can monitor CEC pin transitions */ +#define CEC_CAP_MONITOR_PIN (1 << 7) /** * struct cec_caps - CEC capabilities structure. @@ -405,6 +408,8 @@ struct cec_log_addrs { * didn't empty the message queue in time */ #define CEC_EVENT_LOST_MSGS 2 +#define CEC_EVENT_PIN_LOW 3 +#define CEC_EVENT_PIN_HIGH 4 #define CEC_EVENT_FL_INITIAL_STATE (1 << 0) -- cgit v1.2.3 From 6b2bbb08747a56dcf4ee33606a06025eca571260 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:39 -0300 Subject: media: cec: rework the cec event handling Event handling was always fairly simplistic since there were only two events. With the addition of pin events this needed to be redesigned. The state_change and lost_msgs events are now core events with the guarantee that the last state is always available. The new pin events are a queue of events (up to 64 for each event) and the oldest event will be dropped if the application cannot keep up. Lost events are marked with a new event flag. 
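From the user-space side (a sketch under assumptions: fd is a /dev/cecX descriptor opened with O_NONBLOCK, built against headers that include this patch), draining the per-event queues and spotting overflow looks roughly like this:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/cec.h>

static void drain_events(int fd)
{
	struct cec_event ev;

	/* In non-blocking mode CEC_DQEVENT fails with EAGAIN when empty. */
	while (ioctl(fd, CEC_DQEVENT, &ev) == 0) {
		if (ev.flags & CEC_EVENT_FL_DROPPED_EVENTS)
			fprintf(stderr, "an event queue overflowed\n");
		if (ev.event == CEC_EVENT_PIN_LOW ||
		    ev.event == CEC_EVENT_PIN_HIGH)
			printf("pin %s at %llu ns\n",
			       ev.event == CEC_EVENT_PIN_HIGH ? "high" : "low",
			       (unsigned long long)ev.ts);
	}
}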
Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 128 +++++++++++++++++++++++++------------------ drivers/media/cec/cec-api.c | 48 +++++++++++----- include/media/cec.h | 14 ++++- include/uapi/linux/cec.h | 3 +- 4 files changed, 124 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 82c1633f5b92..5080d7aa6105 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -78,42 +78,62 @@ static unsigned int cec_log_addr2dev(const struct cec_adapter *adap, u8 log_addr * Queue a new event for this filehandle. If ts == 0, then set it * to the current time. * - * The two events that are currently defined do not need to keep track - * of intermediate events, so no actual queue of events is needed, - * instead just store the latest state and the total number of lost - * messages. - * - * Should new events be added in the future that require intermediate - * results to be queued as well, then a proper queue data structure is - * required. But until then, just keep it simple. + * We keep a queue of at most max_event events where max_event differs + * per event. If the queue becomes full, then drop the oldest event and + * keep track of how many events we've dropped. */ void cec_queue_event_fh(struct cec_fh *fh, const struct cec_event *new_ev, u64 ts) { - struct cec_event *ev = &fh->events[new_ev->event - 1]; + static const u8 max_events[CEC_NUM_EVENTS] = { + 1, 1, 64, 64, + }; + struct cec_event_entry *entry; + unsigned int ev_idx = new_ev->event - 1; + + if (WARN_ON(ev_idx >= ARRAY_SIZE(fh->events))) + return; if (ts == 0) ts = ktime_get_ns(); mutex_lock(&fh->lock); - if (new_ev->event == CEC_EVENT_LOST_MSGS && - fh->pending_events & (1 << new_ev->event)) { - /* - * If there is already a lost_msgs event, then just - * update the lost_msgs count. This effectively - * merges the old and new events into one. - */ - ev->lost_msgs.lost_msgs += new_ev->lost_msgs.lost_msgs; - goto unlock; - } + if (ev_idx < CEC_NUM_CORE_EVENTS) + entry = &fh->core_events[ev_idx]; + else + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (entry) { + if (new_ev->event == CEC_EVENT_LOST_MSGS && + fh->queued_events[ev_idx]) { + entry->ev.lost_msgs.lost_msgs += + new_ev->lost_msgs.lost_msgs; + goto unlock; + } + entry->ev = *new_ev; + entry->ev.ts = ts; + + if (fh->queued_events[ev_idx] < max_events[ev_idx]) { + /* Add new msg at the end of the queue */ + list_add_tail(&entry->list, &fh->events[ev_idx]); + fh->queued_events[ev_idx]++; + fh->total_queued_events++; + goto unlock; + } - /* - * Intermediate states are not interesting, so just - * overwrite any older event. - */ - *ev = *new_ev; - ev->ts = ts; - fh->pending_events |= 1 << new_ev->event; + if (ev_idx >= CEC_NUM_CORE_EVENTS) { + list_add_tail(&entry->list, &fh->events[ev_idx]); + /* drop the oldest event */ + entry = list_first_entry(&fh->events[ev_idx], + struct cec_event_entry, list); + list_del(&entry->list); + kfree(entry); + } + } + /* Mark that events were lost */ + entry = list_first_entry_or_null(&fh->events[ev_idx], + struct cec_event_entry, list); + if (entry) + entry->ev.flags |= CEC_EVENT_FL_DROPPED_EVENTS; unlock: mutex_unlock(&fh->lock); @@ -134,46 +154,50 @@ static void cec_queue_event(struct cec_adapter *adap, } /* - * Queue a new message for this filehandle. If there is no more room - * in the queue, then send the LOST_MSGS event instead. 
+ * Queue a new message for this filehandle. + * + * We keep a queue of at most CEC_MAX_MSG_RX_QUEUE_SZ messages. If the + * queue becomes full, then drop the oldest message and keep track + * of how many messages we've dropped. */ static void cec_queue_msg_fh(struct cec_fh *fh, const struct cec_msg *msg) { - static const struct cec_event ev_lost_msg = { - .ts = 0, + static const struct cec_event ev_lost_msgs = { .event = CEC_EVENT_LOST_MSGS, - .flags = 0, - { - .lost_msgs.lost_msgs = 1, - }, + .lost_msgs.lost_msgs = 1, }; struct cec_msg_entry *entry; mutex_lock(&fh->lock); entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto lost_msgs; - - entry->msg = *msg; - /* Add new msg at the end of the queue */ - list_add_tail(&entry->list, &fh->msgs); + if (entry) { + entry->msg = *msg; + /* Add new msg at the end of the queue */ + list_add_tail(&entry->list, &fh->msgs); + + if (fh->queued_msgs < CEC_MAX_MSG_RX_QUEUE_SZ) { + /* All is fine if there is enough room */ + fh->queued_msgs++; + mutex_unlock(&fh->lock); + wake_up_interruptible(&fh->wait); + return; + } - /* - * if the queue now has more than CEC_MAX_MSG_RX_QUEUE_SZ - * messages, drop the oldest one and send a lost message event. - */ - if (fh->queued_msgs == CEC_MAX_MSG_RX_QUEUE_SZ) { + /* + * if the message queue is full, then drop the oldest one and + * send a lost message event. + */ + entry = list_first_entry(&fh->msgs, struct cec_msg_entry, list); list_del(&entry->list); - goto lost_msgs; + kfree(entry); } - fh->queued_msgs++; mutex_unlock(&fh->lock); - wake_up_interruptible(&fh->wait); - return; -lost_msgs: - mutex_unlock(&fh->lock); - cec_queue_event_fh(fh, &ev_lost_msg, 0); + /* + * We lost a message, either because kmalloc failed or the queue + * was full. + */ + cec_queue_event_fh(fh, &ev_lost_msgs, ktime_get_ns()); } /* diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c index f7eb4c54a354..48bef1c718ad 100644 --- a/drivers/media/cec/cec-api.c +++ b/drivers/media/cec/cec-api.c @@ -57,7 +57,7 @@ static unsigned int cec_poll(struct file *filp, res |= POLLOUT | POLLWRNORM; if (fh->queued_msgs) res |= POLLIN | POLLRDNORM; - if (fh->pending_events) + if (fh->total_queued_events) res |= POLLPRI; poll_wait(filp, &fh->wait, poll); mutex_unlock(&adap->lock); @@ -289,15 +289,17 @@ static long cec_receive(struct cec_adapter *adap, struct cec_fh *fh, static long cec_dqevent(struct cec_adapter *adap, struct cec_fh *fh, bool block, struct cec_event __user *parg) { - struct cec_event *ev = NULL; + struct cec_event_entry *ev = NULL; u64 ts = ~0ULL; unsigned int i; + unsigned int ev_idx; long err = 0; mutex_lock(&fh->lock); - while (!fh->pending_events && block) { + while (!fh->total_queued_events && block) { mutex_unlock(&fh->lock); - err = wait_event_interruptible(fh->wait, fh->pending_events); + err = wait_event_interruptible(fh->wait, + fh->total_queued_events); if (err) return err; mutex_lock(&fh->lock); @@ -305,23 +307,29 @@ static long cec_dqevent(struct cec_adapter *adap, struct cec_fh *fh, /* Find the oldest event */ for (i = 0; i < CEC_NUM_EVENTS; i++) { - if (fh->pending_events & (1 << (i + 1)) && - fh->events[i].ts <= ts) { - ev = &fh->events[i]; - ts = ev->ts; + struct cec_event_entry *entry = + list_first_entry_or_null(&fh->events[i], + struct cec_event_entry, list); + + if (entry && entry->ev.ts <= ts) { + ev = entry; + ev_idx = i; + ts = ev->ev.ts; } } + if (!ev) { err = -EAGAIN; goto unlock; } + list_del(&ev->list); - if (copy_to_user(parg, ev, sizeof(*ev))) { + if (copy_to_user(parg, &ev->ev, 
sizeof(ev->ev))) err = -EFAULT; - goto unlock; - } - - fh->pending_events &= ~(1 << ev->event); + if (ev_idx >= CEC_NUM_CORE_EVENTS) + kfree(ev); + fh->queued_events[ev_idx]--; + fh->total_queued_events--; unlock: mutex_unlock(&fh->lock); @@ -495,6 +503,7 @@ static int cec_open(struct inode *inode, struct file *filp) .event = CEC_EVENT_STATE_CHANGE, .flags = CEC_EVENT_FL_INITIAL_STATE, }; + unsigned int i; int err; if (!fh) @@ -502,6 +511,8 @@ static int cec_open(struct inode *inode, struct file *filp) INIT_LIST_HEAD(&fh->msgs); INIT_LIST_HEAD(&fh->xfer_list); + for (i = 0; i < CEC_NUM_EVENTS; i++) + INIT_LIST_HEAD(&fh->events[i]); mutex_init(&fh->lock); init_waitqueue_head(&fh->wait); @@ -544,6 +555,7 @@ static int cec_release(struct inode *inode, struct file *filp) struct cec_devnode *devnode = cec_devnode_data(filp); struct cec_adapter *adap = to_cec_adapter(devnode); struct cec_fh *fh = filp->private_data; + unsigned int i; mutex_lock(&adap->lock); if (adap->cec_initiator == fh) @@ -585,6 +597,16 @@ static int cec_release(struct inode *inode, struct file *filp) list_del(&entry->list); kfree(entry); } + for (i = CEC_NUM_CORE_EVENTS; i < CEC_NUM_EVENTS; i++) { + while (!list_empty(&fh->events[i])) { + struct cec_event_entry *entry = + list_first_entry(&fh->events[i], + struct cec_event_entry, list); + + list_del(&entry->list); + kfree(entry); + } + } kfree(fh); cec_put_device(devnode); diff --git a/include/media/cec.h b/include/media/cec.h index 37768203572d..6cc862af74e5 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -81,7 +81,13 @@ struct cec_msg_entry { struct cec_msg msg; }; -#define CEC_NUM_EVENTS CEC_EVENT_LOST_MSGS +struct cec_event_entry { + struct list_head list; + struct cec_event ev; +}; + +#define CEC_NUM_CORE_EVENTS 2 +#define CEC_NUM_EVENTS CEC_EVENT_PIN_HIGH struct cec_fh { struct list_head list; @@ -92,9 +98,11 @@ struct cec_fh { /* Events */ wait_queue_head_t wait; - unsigned int pending_events; - struct cec_event events[CEC_NUM_EVENTS]; struct mutex lock; + struct list_head events[CEC_NUM_EVENTS]; /* queued events */ + u8 queued_events[CEC_NUM_EVENTS]; + unsigned int total_queued_events; + struct cec_event_entry core_events[CEC_NUM_CORE_EVENTS]; struct list_head msgs; /* queued messages */ unsigned int queued_msgs; }; diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index bba73f33c8aa..d87a67b0bb06 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -412,6 +412,7 @@ struct cec_log_addrs { #define CEC_EVENT_PIN_HIGH 4 #define CEC_EVENT_FL_INITIAL_STATE (1 << 0) +#define CEC_EVENT_FL_DROPPED_EVENTS (1 << 1) /** * struct cec_event_state_change - used when the CEC adapter changes state. @@ -424,7 +425,7 @@ struct cec_event_state_change { }; /** - * struct cec_event_lost_msgs - tells you how many messages were lost due. + * struct cec_event_lost_msgs - tells you how many messages were lost. * @lost_msgs: how many messages were lost. */ struct cec_event_lost_msgs { -- cgit v1.2.3 From b8d62f50b1df2571afcf47107bbbe9cd60f8aafd Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:41 -0300 Subject: media: cec: add core support for low-level CEC pin monitoring Add support for the new MONITOR_PIN mode. Add the cec_pin_event function that the CEC pin code will call to queue pin change events. 
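For illustration, user space selects the new mode with CEC_S_MODE (a sketch; fd is assumed to be an open /dev/cecX descriptor, the caller needs CAP_NET_ADMIN and the adapter must advertise CEC_CAP_MONITOR_PIN):

#include <sys/ioctl.h>
#include <linux/cec.h>

static int enable_pin_monitoring(int fd)
{
	/* Monitor modes must be paired with CEC_MODE_NO_INITIATOR. */
	__u32 mode = CEC_MODE_NO_INITIATOR | CEC_MODE_MONITOR_PIN;

	return ioctl(fd, CEC_S_MODE, &mode);
}

Once in this mode the file handle receives CEC_EVENT_PIN_LOW/HIGH events queued by cec_queue_pin_event(), as implemented below.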
Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 16 ++++++++++++++++ drivers/media/cec/cec-api.c | 15 +++++++++++++-- include/media/cec.h | 11 +++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 5080d7aa6105..b0bd466f75e2 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -153,6 +153,22 @@ static void cec_queue_event(struct cec_adapter *adap, mutex_unlock(&adap->devnode.lock); } +/* Notify userspace that the CEC pin changed state at the given time. */ +void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts) +{ + struct cec_event ev = { + .event = is_high ? CEC_EVENT_PIN_HIGH : CEC_EVENT_PIN_LOW, + }; + struct cec_fh *fh; + + mutex_lock(&adap->devnode.lock); + list_for_each_entry(fh, &adap->devnode.fhs, list) + if (fh->mode_follower == CEC_MODE_MONITOR_PIN) + cec_queue_event_fh(fh, &ev, ktime_to_ns(ts)); + mutex_unlock(&adap->devnode.lock); +} +EXPORT_SYMBOL_GPL(cec_queue_pin_event); + /* * Queue a new message for this filehandle. * diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c index 48bef1c718ad..14279958dca1 100644 --- a/drivers/media/cec/cec-api.c +++ b/drivers/media/cec/cec-api.c @@ -370,6 +370,10 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, !(adap->capabilities & CEC_CAP_MONITOR_ALL)) return -EINVAL; + if (mode_follower == CEC_MODE_MONITOR_PIN && + !(adap->capabilities & CEC_CAP_MONITOR_PIN)) + return -EINVAL; + /* Follower modes should always be able to send CEC messages */ if ((mode_initiator == CEC_MODE_NO_INITIATOR || !(adap->capabilities & CEC_CAP_TRANSMIT)) && @@ -378,11 +382,11 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, return -EINVAL; /* Monitor modes require CEC_MODE_NO_INITIATOR */ - if (mode_initiator && mode_follower >= CEC_MODE_MONITOR) + if (mode_initiator && mode_follower >= CEC_MODE_MONITOR_PIN) return -EINVAL; /* Monitor modes require CAP_NET_ADMIN */ - if (mode_follower >= CEC_MODE_MONITOR && !capable(CAP_NET_ADMIN)) + if (mode_follower >= CEC_MODE_MONITOR_PIN && !capable(CAP_NET_ADMIN)) return -EPERM; mutex_lock(&adap->lock); @@ -421,8 +425,13 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, if (fh->mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt--; + if (fh->mode_follower == CEC_MODE_MONITOR_PIN) + adap->monitor_pin_cnt--; if (mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt++; + if (mode_follower == CEC_MODE_MONITOR_PIN) { + adap->monitor_pin_cnt++; + } if (mode_follower == CEC_MODE_EXCL_FOLLOWER || mode_follower == CEC_MODE_EXCL_FOLLOWER_PASSTHRU) { adap->passthrough = @@ -566,6 +575,8 @@ static int cec_release(struct inode *inode, struct file *filp) } if (fh->mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt--; + if (fh->mode_follower == CEC_MODE_MONITOR_PIN) + adap->monitor_pin_cnt--; if (fh->mode_follower == CEC_MODE_MONITOR_ALL) cec_monitor_all_cnt_dec(adap); mutex_unlock(&adap->lock); diff --git a/include/media/cec.h b/include/media/cec.h index 6cc862af74e5..d983960b37ad 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -177,6 +177,7 @@ struct cec_adapter { bool is_configuring; bool is_configured; u32 monitor_all_cnt; + u32 monitor_pin_cnt; u32 follower_cnt; struct cec_fh *cec_follower; struct cec_fh *cec_initiator; @@ -271,6 +272,16 @@ static inline void cec_received_msg(struct cec_adapter 
*adap, cec_received_msg_ts(adap, msg, ktime_get()); } +/** + * cec_queue_pin_event() - queue a pin event with a given timestamp. + * + * @adap: pointer to the cec adapter + * @is_high: when true the pin is high, otherwise it is low + * @ts: the timestamp for this event + * + */ +void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts); + /** * cec_get_edid_phys_addr() - find and return the physical address * -- cgit v1.2.3 From ea5c8ef296681b53480ebeeffd06083bb60e693d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 11 Jul 2017 03:30:42 -0300 Subject: media: cec-pin: add low-level pin hardware support Add support for CEC hardware that relies on low-level pin polling or GPIO interrupts. One example is the Allwinner SoC. But any GPIO-based CEC implementation can use this as well. A GPIO implementation is very suitable as well for debugging: it can use interrupts to detect state changes and report it. Userspace can then verify if the bus traffic is correct. This also makes error injection possible. The disadvantage is that it is hard to get the timings right since linux isn't a hard realtime system. In general on an idle system it works quite well, but under load the timer will miss its mark every so often. The debugfs file /sys/kernel/debug/cec/cecX/status gives some statistics with respect to the timer overruns. When the adapter is unconfigured and the low-level driver supports interrupts, then the interrupt will be used to detect changes. This should be quite accurate. But when the adapter is configured a hrtimer has to be used. The hrtimer implements a state machine where for each state the code will read the bus or drive the bus and go on to the next state. It will re-arm the timer with a delay based on the next state. Signed-off-by: Hans Verkuil Reviewed-by: Maxime Ripard Signed-off-by: Mauro Carvalho Chehab --- drivers/media/Kconfig | 3 + drivers/media/cec/Makefile | 4 + drivers/media/cec/cec-api.c | 10 + drivers/media/cec/cec-pin.c | 794 ++++++++++++++++++++++++++++++++++++++++++++ include/media/cec-pin.h | 183 ++++++++++ include/media/cec.h | 4 + 6 files changed, 998 insertions(+) create mode 100644 drivers/media/cec/cec-pin.c create mode 100644 include/media/cec-pin.h (limited to 'include') diff --git a/drivers/media/Kconfig b/drivers/media/Kconfig index 55d9c2b82b7e..94d4e7759127 100644 --- a/drivers/media/Kconfig +++ b/drivers/media/Kconfig @@ -8,6 +8,9 @@ config CEC_CORE config CEC_NOTIFIER bool +config CEC_PIN + bool + menuconfig MEDIA_SUPPORT tristate "Multimedia support" depends on HAS_IOMEM diff --git a/drivers/media/cec/Makefile b/drivers/media/cec/Makefile index eaf408e64669..3353c1741961 100644 --- a/drivers/media/cec/Makefile +++ b/drivers/media/cec/Makefile @@ -4,4 +4,8 @@ ifeq ($(CONFIG_CEC_NOTIFIER),y) cec-objs += cec-notifier.o endif +ifeq ($(CONFIG_CEC_PIN),y) + cec-objs += cec-pin.o +endif + obj-$(CONFIG_CEC_CORE) += cec.o diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c index 14279958dca1..8dd16e263452 100644 --- a/drivers/media/cec/cec-api.c +++ b/drivers/media/cec/cec-api.c @@ -30,6 +30,7 @@ #include #include +#include #include "cec-priv.h" static inline struct cec_devnode *cec_devnode_data(struct file *filp) @@ -430,6 +431,15 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, if (mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt++; if (mode_follower == CEC_MODE_MONITOR_PIN) { +#ifdef CONFIG_CEC_PIN + struct cec_event ev = { + .flags = CEC_EVENT_FL_INITIAL_STATE, + }; + + ev.event = 
adap->pin->cur_value ? CEC_EVENT_PIN_HIGH : + CEC_EVENT_PIN_LOW; + cec_queue_event_fh(fh, &ev, 0); +#endif adap->monitor_pin_cnt++; } if (mode_follower == CEC_MODE_EXCL_FOLLOWER || diff --git a/drivers/media/cec/cec-pin.c b/drivers/media/cec/cec-pin.c new file mode 100644 index 000000000000..03f800e5ec1f --- /dev/null +++ b/drivers/media/cec/cec-pin.c @@ -0,0 +1,794 @@ +/* + * Copyright 2017 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +/* All timings are in microseconds */ + +/* start bit timings */ +#define CEC_TIM_START_BIT_LOW 3700 +#define CEC_TIM_START_BIT_LOW_MIN 3500 +#define CEC_TIM_START_BIT_LOW_MAX 3900 +#define CEC_TIM_START_BIT_TOTAL 4500 +#define CEC_TIM_START_BIT_TOTAL_MIN 4300 +#define CEC_TIM_START_BIT_TOTAL_MAX 4700 + +/* data bit timings */ +#define CEC_TIM_DATA_BIT_0_LOW 1500 +#define CEC_TIM_DATA_BIT_0_LOW_MIN 1300 +#define CEC_TIM_DATA_BIT_0_LOW_MAX 1700 +#define CEC_TIM_DATA_BIT_1_LOW 600 +#define CEC_TIM_DATA_BIT_1_LOW_MIN 400 +#define CEC_TIM_DATA_BIT_1_LOW_MAX 800 +#define CEC_TIM_DATA_BIT_TOTAL 2400 +#define CEC_TIM_DATA_BIT_TOTAL_MIN 2050 +#define CEC_TIM_DATA_BIT_TOTAL_MAX 2750 +/* earliest safe time to sample the bit state */ +#define CEC_TIM_DATA_BIT_SAMPLE 850 +/* earliest time the bit is back to 1 (T7 + 50) */ +#define CEC_TIM_DATA_BIT_HIGH 1750 + +/* when idle, sample once per millisecond */ +#define CEC_TIM_IDLE_SAMPLE 1000 +/* when processing the start bit, sample twice per millisecond */ +#define CEC_TIM_START_BIT_SAMPLE 500 +/* when polling for a state change, sample once every 50 microseconds */ +#define CEC_TIM_SAMPLE 50 + +#define CEC_TIM_LOW_DRIVE_ERROR (1.5 * CEC_TIM_DATA_BIT_TOTAL) + +struct cec_state { + const char * const name; + unsigned int usecs; +}; + +static const struct cec_state states[CEC_PIN_STATES] = { + { "Off", 0 }, + { "Idle", CEC_TIM_IDLE_SAMPLE }, + { "Tx Wait", CEC_TIM_SAMPLE }, + { "Tx Wait for High", CEC_TIM_IDLE_SAMPLE }, + { "Tx Start Bit Low", CEC_TIM_START_BIT_LOW }, + { "Tx Start Bit High", CEC_TIM_START_BIT_TOTAL - CEC_TIM_START_BIT_LOW }, + { "Tx Data 0 Low", CEC_TIM_DATA_BIT_0_LOW }, + { "Tx Data 0 High", CEC_TIM_DATA_BIT_TOTAL - CEC_TIM_DATA_BIT_0_LOW }, + { "Tx Data 1 Low", CEC_TIM_DATA_BIT_1_LOW }, + { "Tx Data 1 High", CEC_TIM_DATA_BIT_TOTAL - CEC_TIM_DATA_BIT_1_LOW }, + { "Tx Data 1 Pre Sample", CEC_TIM_DATA_BIT_SAMPLE - CEC_TIM_DATA_BIT_1_LOW }, + { "Tx Data 1 Post Sample", CEC_TIM_DATA_BIT_TOTAL - CEC_TIM_DATA_BIT_SAMPLE }, + { "Rx Start Bit Low", CEC_TIM_SAMPLE }, + { "Rx Start Bit High", CEC_TIM_SAMPLE }, + { "Rx Data Sample", CEC_TIM_DATA_BIT_SAMPLE }, + { "Rx Data Post Sample", CEC_TIM_DATA_BIT_HIGH - CEC_TIM_DATA_BIT_SAMPLE }, + { "Rx Data High", CEC_TIM_SAMPLE }, + { "Rx Ack Low", CEC_TIM_DATA_BIT_0_LOW }, + { "Rx Ack Low Post",
CEC_TIM_DATA_BIT_HIGH - CEC_TIM_DATA_BIT_0_LOW }, + { "Rx Ack High Post", CEC_TIM_DATA_BIT_HIGH }, + { "Rx Ack Finish", CEC_TIM_DATA_BIT_TOTAL_MIN - CEC_TIM_DATA_BIT_HIGH }, + { "Rx Low Drive", CEC_TIM_LOW_DRIVE_ERROR }, + { "Rx Irq", 0 }, +}; + +static void cec_pin_update(struct cec_pin *pin, bool v, bool force) +{ + if (!force && v == pin->cur_value) + return; + + pin->cur_value = v; + if (atomic_read(&pin->work_pin_events) < CEC_NUM_PIN_EVENTS) { + pin->work_pin_is_high[pin->work_pin_events_wr] = v; + pin->work_pin_ts[pin->work_pin_events_wr] = ktime_get(); + pin->work_pin_events_wr = + (pin->work_pin_events_wr + 1) % CEC_NUM_PIN_EVENTS; + atomic_inc(&pin->work_pin_events); + } + wake_up_interruptible(&pin->kthread_waitq); +} + +static bool cec_pin_read(struct cec_pin *pin) +{ + bool v = pin->ops->read(pin->adap); + + cec_pin_update(pin, v, false); + return v; +} + +static void cec_pin_low(struct cec_pin *pin) +{ + pin->ops->low(pin->adap); + cec_pin_update(pin, false, false); +} + +static bool cec_pin_high(struct cec_pin *pin) +{ + pin->ops->high(pin->adap); + return cec_pin_read(pin); +} + +static void cec_pin_to_idle(struct cec_pin *pin) +{ + /* + * Reset all status fields, release the bus and + * go to idle state. + */ + pin->rx_bit = pin->tx_bit = 0; + pin->rx_msg.len = 0; + memset(pin->rx_msg.msg, 0, sizeof(pin->rx_msg.msg)); + pin->state = CEC_ST_IDLE; + pin->ts = 0; +} + +/* + * Handle Transmit-related states + * + * Basic state changes when transmitting: + * + * Idle -> Tx Wait (waiting for the end of signal free time) -> + * Tx Start Bit Low -> Tx Start Bit High -> + * + * Regular data bits + EOM: + * Tx Data 0 Low -> Tx Data 0 High -> + * or: + * Tx Data 1 Low -> Tx Data 1 High -> + * + * First 4 data bits or Ack bit: + * Tx Data 0 Low -> Tx Data 0 High -> + * or: + * Tx Data 1 Low -> Tx Data 1 High -> Tx Data 1 Pre Sample -> + * Tx Data 1 Post Sample -> + * + * After the last Ack go to Idle. + * + * If it detects a Low Drive condition then: + * Tx Wait For High -> Idle + * + * If it loses arbitration, then it switches to state Rx Data Post Sample. + */ +static void cec_pin_tx_states(struct cec_pin *pin, ktime_t ts) +{ + bool v; + bool is_ack_bit, ack; + + switch (pin->state) { + case CEC_ST_TX_WAIT_FOR_HIGH: + if (cec_pin_read(pin)) + cec_pin_to_idle(pin); + break; + + case CEC_ST_TX_START_BIT_LOW: + pin->state = CEC_ST_TX_START_BIT_HIGH; + /* Generate start bit */ + cec_pin_high(pin); + break; + + case CEC_ST_TX_DATA_BIT_1_HIGH_POST_SAMPLE: + /* If the read value is 1, then all is OK */ + if (!cec_pin_read(pin)) { + /* + * It's 0, so someone detected an error and pulled the + * line low for 1.5 times the nominal bit period. 
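+ * Treat this as a Low Drive transmit error: report the failed + * transmit to userspace and wait for the bus to go high again + * before returning to idle.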
+ */ + pin->tx_msg.len = 0; + pin->work_tx_ts = ts; + pin->work_tx_status = CEC_TX_STATUS_LOW_DRIVE; + pin->state = CEC_ST_TX_WAIT_FOR_HIGH; + wake_up_interruptible(&pin->kthread_waitq); + break; + } + if (pin->tx_nacked) { + cec_pin_to_idle(pin); + pin->tx_msg.len = 0; + pin->work_tx_ts = ts; + pin->work_tx_status = CEC_TX_STATUS_NACK; + wake_up_interruptible(&pin->kthread_waitq); + break; + } + /* fall through */ + case CEC_ST_TX_DATA_BIT_0_HIGH: + case CEC_ST_TX_DATA_BIT_1_HIGH: + pin->tx_bit++; + /* fall through */ + case CEC_ST_TX_START_BIT_HIGH: + if (pin->tx_bit / 10 >= pin->tx_msg.len) { + cec_pin_to_idle(pin); + pin->tx_msg.len = 0; + pin->work_tx_ts = ts; + pin->work_tx_status = CEC_TX_STATUS_OK; + wake_up_interruptible(&pin->kthread_waitq); + break; + } + + switch (pin->tx_bit % 10) { + default: + v = pin->tx_msg.msg[pin->tx_bit / 10] & + (1 << (7 - (pin->tx_bit % 10))); + pin->state = v ? CEC_ST_TX_DATA_BIT_1_LOW : + CEC_ST_TX_DATA_BIT_0_LOW; + break; + case 8: + v = pin->tx_bit / 10 == pin->tx_msg.len - 1; + pin->state = v ? CEC_ST_TX_DATA_BIT_1_LOW : + CEC_ST_TX_DATA_BIT_0_LOW; + break; + case 9: + pin->state = CEC_ST_TX_DATA_BIT_1_LOW; + break; + } + cec_pin_low(pin); + break; + + case CEC_ST_TX_DATA_BIT_0_LOW: + case CEC_ST_TX_DATA_BIT_1_LOW: + v = pin->state == CEC_ST_TX_DATA_BIT_1_LOW; + pin->state = v ? CEC_ST_TX_DATA_BIT_1_HIGH : + CEC_ST_TX_DATA_BIT_0_HIGH; + is_ack_bit = pin->tx_bit % 10 == 9; + if (v && (pin->tx_bit < 4 || is_ack_bit)) + pin->state = CEC_ST_TX_DATA_BIT_1_HIGH_PRE_SAMPLE; + cec_pin_high(pin); + break; + + case CEC_ST_TX_DATA_BIT_1_HIGH_PRE_SAMPLE: + /* Read the CEC value at the sample time */ + v = cec_pin_read(pin); + is_ack_bit = pin->tx_bit % 10 == 9; + /* + * If v == 0 and we're within the first 4 bits + * of the initiator, then someone else started + * transmitting and we lost the arbitration + * (i.e. the logical address of the other + * transmitter has more leading 0 bits in the + * initiator). + */ + if (!v && !is_ack_bit) { + pin->tx_msg.len = 0; + pin->work_tx_ts = ts; + pin->work_tx_status = CEC_TX_STATUS_ARB_LOST; + wake_up_interruptible(&pin->kthread_waitq); + pin->rx_bit = pin->tx_bit; + pin->tx_bit = 0; + memset(pin->rx_msg.msg, 0, sizeof(pin->rx_msg.msg)); + pin->rx_msg.msg[0] = pin->tx_msg.msg[0]; + pin->rx_msg.msg[0] &= ~(1 << (7 - pin->rx_bit)); + pin->rx_msg.len = 0; + pin->state = CEC_ST_RX_DATA_POST_SAMPLE; + pin->rx_bit++; + break; + } + pin->state = CEC_ST_TX_DATA_BIT_1_HIGH_POST_SAMPLE; + if (!is_ack_bit) + break; + /* Was the message ACKed? */ + ack = cec_msg_is_broadcast(&pin->tx_msg) ? v : !v; + if (!ack) { + /* + * Note: the CEC spec is ambiguous regarding + * what action to take when a NACK appears + * before the last byte of the payload was + * transmitted: either stop transmitting + * immediately, or wait until the last byte + * was transmitted. + * + * Most CEC implementations appear to stop + * immediately, and that's what we do here + * as well. 
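+ * The NACK status itself is reported to userspace once the + * post-sample phase of the current bit period has completed.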
+ */ + pin->tx_nacked = true; + } + break; + + default: + break; + } +} + +/* + * Handle Receive-related states + * + * Basic state changes when receiving: + * + * Rx Start Bit Low -> Rx Start Bit High -> + * Regular data bits + EOM: + * Rx Data Sample -> Rx Data Post Sample -> Rx Data High -> + * Ack bit 0: + * Rx Ack Low -> Rx Ack Low Post -> Rx Data High -> + * Ack bit 1: + * Rx Ack High Post -> Rx Data High -> + * Ack bit 0 && EOM: + * Rx Ack Low -> Rx Ack Low Post -> Rx Ack Finish -> Idle + */ +static void cec_pin_rx_states(struct cec_pin *pin, ktime_t ts) +{ + s32 delta; + bool v; + bool ack; + bool bcast, for_us; + u8 dest; + + switch (pin->state) { + /* Receive states */ + case CEC_ST_RX_START_BIT_LOW: + v = cec_pin_read(pin); + if (!v) + break; + pin->state = CEC_ST_RX_START_BIT_HIGH; + delta = ktime_us_delta(ts, pin->ts); + pin->ts = ts; + /* Start bit low is too short, go back to idle */ + if (delta < CEC_TIM_START_BIT_LOW_MIN - + CEC_TIM_IDLE_SAMPLE) { + cec_pin_to_idle(pin); + } + break; + + case CEC_ST_RX_START_BIT_HIGH: + v = cec_pin_read(pin); + delta = ktime_us_delta(ts, pin->ts); + if (v && delta > CEC_TIM_START_BIT_TOTAL_MAX - + CEC_TIM_START_BIT_LOW_MIN) { + cec_pin_to_idle(pin); + break; + } + if (v) + break; + pin->state = CEC_ST_RX_DATA_SAMPLE; + pin->ts = ts; + pin->rx_eom = false; + break; + + case CEC_ST_RX_DATA_SAMPLE: + v = cec_pin_read(pin); + pin->state = CEC_ST_RX_DATA_POST_SAMPLE; + switch (pin->rx_bit % 10) { + default: + if (pin->rx_bit / 10 < CEC_MAX_MSG_SIZE) + pin->rx_msg.msg[pin->rx_bit / 10] |= + v << (7 - (pin->rx_bit % 10)); + break; + case 8: + pin->rx_eom = v; + pin->rx_msg.len = pin->rx_bit / 10 + 1; + break; + case 9: + break; + } + pin->rx_bit++; + break; + + case CEC_ST_RX_DATA_POST_SAMPLE: + pin->state = CEC_ST_RX_DATA_HIGH; + break; + + case CEC_ST_RX_DATA_HIGH: + v = cec_pin_read(pin); + delta = ktime_us_delta(ts, pin->ts); + if (v && delta > CEC_TIM_DATA_BIT_TOTAL_MAX) { + cec_pin_to_idle(pin); + break; + } + if (v) + break; + /* + * Go to low drive state when the total bit time is + * too short. + */ + if (delta < CEC_TIM_DATA_BIT_TOTAL_MIN) { + cec_pin_low(pin); + pin->state = CEC_ST_LOW_DRIVE; + break; + } + pin->ts = ts; + if (pin->rx_bit % 10 != 9) { + pin->state = CEC_ST_RX_DATA_SAMPLE; + break; + } + + dest = cec_msg_destination(&pin->rx_msg); + bcast = dest == CEC_LOG_ADDR_BROADCAST; + /* for_us == broadcast or directed to us */ + for_us = bcast || (pin->la_mask & (1 << dest)); + /* ACK bit value */ + ack = bcast ? 
1 : !for_us; + + if (ack) { + /* No need to write to the bus, just wait */ + pin->state = CEC_ST_RX_ACK_HIGH_POST; + break; + } + cec_pin_low(pin); + pin->state = CEC_ST_RX_ACK_LOW; + break; + + case CEC_ST_RX_ACK_LOW: + cec_pin_high(pin); + pin->state = CEC_ST_RX_ACK_LOW_POST; + break; + + case CEC_ST_RX_ACK_LOW_POST: + case CEC_ST_RX_ACK_HIGH_POST: + v = cec_pin_read(pin); + if (v && pin->rx_eom) { + pin->work_rx_msg = pin->rx_msg; + pin->work_rx_msg.rx_ts = ts; + wake_up_interruptible(&pin->kthread_waitq); + pin->ts = ts; + pin->state = CEC_ST_RX_ACK_FINISH; + break; + } + pin->rx_bit++; + pin->state = CEC_ST_RX_DATA_HIGH; + break; + + case CEC_ST_RX_ACK_FINISH: + cec_pin_to_idle(pin); + break; + + default: + break; + } +} + +/* + * Main timer function + * + */ +static enum hrtimer_restart cec_pin_timer(struct hrtimer *timer) +{ + struct cec_pin *pin = container_of(timer, struct cec_pin, timer); + struct cec_adapter *adap = pin->adap; + ktime_t ts; + s32 delta; + + ts = ktime_get(); + if (pin->timer_ts) { + delta = ktime_us_delta(ts, pin->timer_ts); + pin->timer_cnt++; + if (delta > 100 && pin->state != CEC_ST_IDLE) { + /* Keep track of timer overruns */ + pin->timer_sum_overrun += delta; + pin->timer_100ms_overruns++; + if (delta > 300) + pin->timer_300ms_overruns++; + if (delta > pin->timer_max_overrun) + pin->timer_max_overrun = delta; + } + } + if (adap->monitor_pin_cnt) + cec_pin_read(pin); + + if (pin->wait_usecs) { + /* + * If we are monitoring the pin, then we have to + * sample at regular intervals. + */ + if (pin->wait_usecs > 150) { + pin->wait_usecs -= 100; + pin->timer_ts = ktime_add_us(ts, 100); + hrtimer_forward_now(timer, 100000); + return HRTIMER_RESTART; + } + if (pin->wait_usecs > 100) { + pin->wait_usecs /= 2; + pin->timer_ts = ktime_add_us(ts, pin->wait_usecs); + hrtimer_forward_now(timer, pin->wait_usecs * 1000); + return HRTIMER_RESTART; + } + pin->timer_ts = ktime_add_us(ts, pin->wait_usecs); + hrtimer_forward_now(timer, pin->wait_usecs * 1000); + pin->wait_usecs = 0; + return HRTIMER_RESTART; + } + + switch (pin->state) { + /* Transmit states */ + case CEC_ST_TX_WAIT_FOR_HIGH: + case CEC_ST_TX_START_BIT_LOW: + case CEC_ST_TX_DATA_BIT_1_HIGH_POST_SAMPLE: + case CEC_ST_TX_DATA_BIT_0_HIGH: + case CEC_ST_TX_DATA_BIT_1_HIGH: + case CEC_ST_TX_START_BIT_HIGH: + case CEC_ST_TX_DATA_BIT_0_LOW: + case CEC_ST_TX_DATA_BIT_1_LOW: + case CEC_ST_TX_DATA_BIT_1_HIGH_PRE_SAMPLE: + cec_pin_tx_states(pin, ts); + break; + + /* Receive states */ + case CEC_ST_RX_START_BIT_LOW: + case CEC_ST_RX_START_BIT_HIGH: + case CEC_ST_RX_DATA_SAMPLE: + case CEC_ST_RX_DATA_POST_SAMPLE: + case CEC_ST_RX_DATA_HIGH: + case CEC_ST_RX_ACK_LOW: + case CEC_ST_RX_ACK_LOW_POST: + case CEC_ST_RX_ACK_HIGH_POST: + case CEC_ST_RX_ACK_FINISH: + cec_pin_rx_states(pin, ts); + break; + + case CEC_ST_IDLE: + case CEC_ST_TX_WAIT: + if (!cec_pin_high(pin)) { + /* Start bit, switch to receive state */ + pin->ts = ts; + pin->state = CEC_ST_RX_START_BIT_LOW; + break; + } + if (pin->ts == 0) + pin->ts = ts; + if (pin->tx_msg.len) { + /* + * Check if the bus has been free for long enough + * so we can kick off the pending transmit. 
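+ * Note that tx_signal_free_time is expressed in data bit + * periods, hence the division by CEC_TIM_DATA_BIT_TOTAL below.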
+ */ + delta = ktime_us_delta(ts, pin->ts); + if (delta / CEC_TIM_DATA_BIT_TOTAL > + pin->tx_signal_free_time) { + pin->tx_nacked = false; + pin->state = CEC_ST_TX_START_BIT_LOW; + /* Generate start bit */ + cec_pin_low(pin); + break; + } + if (delta / CEC_TIM_DATA_BIT_TOTAL > + pin->tx_signal_free_time - 1) + pin->state = CEC_ST_TX_WAIT; + break; + } + if (pin->state != CEC_ST_IDLE || pin->ops->enable_irq == NULL || + pin->enable_irq_failed || adap->is_configuring || + adap->is_configured || adap->monitor_all_cnt) + break; + /* Switch to interrupt mode */ + pin->work_enable_irq = true; + pin->state = CEC_ST_RX_IRQ; + wake_up_interruptible(&pin->kthread_waitq); + return HRTIMER_NORESTART; + + case CEC_ST_LOW_DRIVE: + cec_pin_to_idle(pin); + break; + + default: + break; + } + if (!adap->monitor_pin_cnt || states[pin->state].usecs <= 150) { + pin->wait_usecs = 0; + pin->timer_ts = ktime_add_us(ts, states[pin->state].usecs); + hrtimer_forward_now(timer, states[pin->state].usecs * 1000); + return HRTIMER_RESTART; + } + pin->wait_usecs = states[pin->state].usecs - 100; + pin->timer_ts = ktime_add_us(ts, 100); + hrtimer_forward_now(timer, 100000); + return HRTIMER_RESTART; +} + +static int cec_pin_thread_func(void *_adap) +{ + struct cec_adapter *adap = _adap; + struct cec_pin *pin = adap->pin; + + for (;;) { + wait_event_interruptible(pin->kthread_waitq, + kthread_should_stop() || + pin->work_rx_msg.len || + pin->work_tx_status || + pin->work_enable_irq || + atomic_read(&pin->work_pin_events)); + + if (pin->work_rx_msg.len) { + cec_received_msg_ts(adap, &pin->work_rx_msg, + pin->work_rx_msg.rx_ts); + pin->work_rx_msg.len = 0; + } + if (pin->work_tx_status) { + unsigned int tx_status = pin->work_tx_status; + + pin->work_tx_status = 0; + cec_transmit_attempt_done_ts(adap, tx_status, + pin->work_tx_ts); + } + while (atomic_read(&pin->work_pin_events)) { + unsigned int idx = pin->work_pin_events_rd; + + cec_queue_pin_event(adap, pin->work_pin_is_high[idx], + pin->work_pin_ts[idx]); + pin->work_pin_events_rd = (idx + 1) % CEC_NUM_PIN_EVENTS; + atomic_dec(&pin->work_pin_events); + } + if (pin->work_enable_irq) { + pin->work_enable_irq = false; + pin->enable_irq_failed = !pin->ops->enable_irq(adap); + if (pin->enable_irq_failed) { + cec_pin_to_idle(pin); + hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); + } + } + if (kthread_should_stop()) + break; + } + return 0; +} + +static int cec_pin_adap_enable(struct cec_adapter *adap, bool enable) +{ + struct cec_pin *pin = adap->pin; + + pin->enabled = enable; + if (enable) { + atomic_set(&pin->work_pin_events, 0); + pin->work_pin_events_rd = pin->work_pin_events_wr = 0; + cec_pin_read(pin); + cec_pin_to_idle(pin); + pin->tx_msg.len = 0; + pin->timer_ts = 0; + pin->work_enable_irq = false; + pin->kthread = kthread_run(cec_pin_thread_func, adap, + "cec-pin"); + if (IS_ERR(pin->kthread)) { + pr_err("cec-pin: kernel_thread() failed\n"); + return PTR_ERR(pin->kthread); + } + hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); + } else { + if (pin->ops->disable_irq) + pin->ops->disable_irq(adap); + hrtimer_cancel(&pin->timer); + kthread_stop(pin->kthread); + cec_pin_read(pin); + cec_pin_to_idle(pin); + pin->state = CEC_ST_OFF; + } + return 0; +} + +static int cec_pin_adap_log_addr(struct cec_adapter *adap, u8 log_addr) +{ + struct cec_pin *pin = adap->pin; + + if (log_addr == CEC_LOG_ADDR_INVALID) + pin->la_mask = 0; + else + pin->la_mask |= (1 << log_addr); + return 0; +} + +static int cec_pin_adap_transmit(struct cec_adapter *adap, u8 attempts, + u32 
signal_free_time, struct cec_msg *msg) +{ + struct cec_pin *pin = adap->pin; + + pin->tx_signal_free_time = signal_free_time; + pin->tx_msg = *msg; + pin->work_tx_status = 0; + pin->tx_bit = 0; + if (pin->state == CEC_ST_RX_IRQ) { + pin->work_enable_irq = false; + pin->ops->disable_irq(adap); + cec_pin_high(pin); + cec_pin_to_idle(pin); + hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); + } + return 0; +} + +static void cec_pin_adap_status(struct cec_adapter *adap, + struct seq_file *file) +{ + struct cec_pin *pin = adap->pin; + + seq_printf(file, "state: %s\n", states[pin->state].name); + seq_printf(file, "tx_bit: %d\n", pin->tx_bit); + seq_printf(file, "rx_bit: %d\n", pin->rx_bit); + seq_printf(file, "cec pin: %d\n", pin->ops->read(adap)); + seq_printf(file, "irq failed: %d\n", pin->enable_irq_failed); + if (pin->timer_100ms_overruns) { + seq_printf(file, "timer overruns > 100ms: %u of %u\n", + pin->timer_100ms_overruns, pin->timer_cnt); + seq_printf(file, "timer overruns > 300ms: %u of %u\n", + pin->timer_300ms_overruns, pin->timer_cnt); + seq_printf(file, "max timer overrun: %u usecs\n", + pin->timer_max_overrun); + seq_printf(file, "avg timer overrun: %u usecs\n", + pin->timer_sum_overrun / pin->timer_100ms_overruns); + } + pin->timer_cnt = 0; + pin->timer_100ms_overruns = 0; + pin->timer_300ms_overruns = 0; + pin->timer_max_overrun = 0; + pin->timer_sum_overrun = 0; + if (pin->ops->status) + pin->ops->status(adap, file); +} + +static int cec_pin_adap_monitor_all_enable(struct cec_adapter *adap, + bool enable) +{ + struct cec_pin *pin = adap->pin; + + pin->monitor_all = enable; + return 0; +} + +static void cec_pin_adap_free(struct cec_adapter *adap) +{ + struct cec_pin *pin = adap->pin; + + if (pin->ops->free) + pin->ops->free(adap); + adap->pin = NULL; + kfree(pin); +} + +void cec_pin_changed(struct cec_adapter *adap, bool value) +{ + struct cec_pin *pin = adap->pin; + + cec_pin_update(pin, value, false); + if (!value && (adap->is_configuring || adap->is_configured || + adap->monitor_all_cnt)) { + pin->work_enable_irq = false; + pin->ops->disable_irq(adap); + cec_pin_high(pin); + cec_pin_to_idle(pin); + hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); + } +} +EXPORT_SYMBOL_GPL(cec_pin_changed); + +static const struct cec_adap_ops cec_pin_adap_ops = { + .adap_enable = cec_pin_adap_enable, + .adap_monitor_all_enable = cec_pin_adap_monitor_all_enable, + .adap_log_addr = cec_pin_adap_log_addr, + .adap_transmit = cec_pin_adap_transmit, + .adap_status = cec_pin_adap_status, + .adap_free = cec_pin_adap_free, +}; + +struct cec_adapter *cec_pin_allocate_adapter(const struct cec_pin_ops *pin_ops, + void *priv, const char *name, u32 caps) +{ + struct cec_adapter *adap; + struct cec_pin *pin = kzalloc(sizeof(*pin), GFP_KERNEL); + + if (pin == NULL) + return ERR_PTR(-ENOMEM); + pin->ops = pin_ops; + pin->cur_value = true; + hrtimer_init(&pin->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + pin->timer.function = cec_pin_timer; + init_waitqueue_head(&pin->kthread_waitq); + + adap = cec_allocate_adapter(&cec_pin_adap_ops, priv, name, + caps | CEC_CAP_MONITOR_ALL | CEC_CAP_MONITOR_PIN, + CEC_MAX_LOG_ADDRS); + + if (PTR_ERR_OR_ZERO(adap)) { + kfree(pin); + return adap; + } + + adap->pin = pin; + pin->adap = adap; + cec_pin_update(pin, cec_pin_high(pin), true); + return adap; +} +EXPORT_SYMBOL_GPL(cec_pin_allocate_adapter); diff --git a/include/media/cec-pin.h b/include/media/cec-pin.h new file mode 100644 index 000000000000..44b82d29d480 --- /dev/null +++ b/include/media/cec-pin.h @@ -0,0 +1,183 @@ +/* 
* cec-pin.h - low-level CEC pin control + * + * Copyright 2017 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LINUX_CEC_PIN_H +#define LINUX_CEC_PIN_H + +#include +#include +#include + +enum cec_pin_state { + /* CEC is off */ + CEC_ST_OFF, + /* CEC is idle, waiting for Rx or Tx */ + CEC_ST_IDLE, + + /* Tx states */ + + /* Pending Tx, waiting for Signal Free Time to expire */ + CEC_ST_TX_WAIT, + /* Low-drive was detected, wait for bus to go high */ + CEC_ST_TX_WAIT_FOR_HIGH, + /* Drive CEC low for the start bit */ + CEC_ST_TX_START_BIT_LOW, + /* Drive CEC high for the start bit */ + CEC_ST_TX_START_BIT_HIGH, + /* Drive CEC low for the 0 bit */ + CEC_ST_TX_DATA_BIT_0_LOW, + /* Drive CEC high for the 0 bit */ + CEC_ST_TX_DATA_BIT_0_HIGH, + /* Drive CEC low for the 1 bit */ + CEC_ST_TX_DATA_BIT_1_LOW, + /* Drive CEC high for the 1 bit */ + CEC_ST_TX_DATA_BIT_1_HIGH, + /* + * Wait for start of sample time to check for Ack bit or first + * four initiator bits to check for Arbitration Lost. + */ + CEC_ST_TX_DATA_BIT_1_HIGH_PRE_SAMPLE, + /* Wait for end of bit period after sampling */ + CEC_ST_TX_DATA_BIT_1_HIGH_POST_SAMPLE, + + /* Rx states */ + + /* Start bit low detected */ + CEC_ST_RX_START_BIT_LOW, + /* Start bit high detected */ + CEC_ST_RX_START_BIT_HIGH, + /* Wait for bit sample time */ + CEC_ST_RX_DATA_SAMPLE, + /* Wait for earliest end of bit period after sampling */ + CEC_ST_RX_DATA_POST_SAMPLE, + /* Wait for CEC to go high (i.e. end of bit period) */ + CEC_ST_RX_DATA_HIGH, + /* Drive CEC low to send 0 Ack bit */ + CEC_ST_RX_ACK_LOW, + /* End of 0 Ack time, wait for earliest end of bit period */ + CEC_ST_RX_ACK_LOW_POST, + /* Wait for CEC to go high (i.e. end of bit period) */ + CEC_ST_RX_ACK_HIGH_POST, + /* Wait for earliest end of bit period and end of message */ + CEC_ST_RX_ACK_FINISH, + + /* Start low drive */ + CEC_ST_LOW_DRIVE, + /* Monitor pin using interrupts */ + CEC_ST_RX_IRQ, + + /* Total number of pin states */ + CEC_PIN_STATES +}; + +/** + * struct cec_pin_ops - low-level CEC pin operations + * @read: read the CEC pin. Return true if high, false if low. + * @low: drive the CEC pin low. + * @high: stop driving the CEC pin. The pull-up will drive the pin + * high, unless someone else is driving the pin low. + * @enable_irq: optional, enable the interrupt to detect pin voltage changes. + * @disable_irq: optional, disable the interrupt. + * @free: optional. Free any allocated resources. Called when the + * adapter is deleted. + * @status: optional, log status information. + * + * These operations are used by the cec pin framework to manipulate + * the CEC pin.
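+ * Note that the read, low and high operations are called from the + * framework's hrtimer state machine, so they should be fast and + * must not sleep.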
+ */ +struct cec_pin_ops { + bool (*read)(struct cec_adapter *adap); + void (*low)(struct cec_adapter *adap); + void (*high)(struct cec_adapter *adap); + bool (*enable_irq)(struct cec_adapter *adap); + void (*disable_irq)(struct cec_adapter *adap); + void (*free)(struct cec_adapter *adap); + void (*status)(struct cec_adapter *adap, struct seq_file *file); +}; + +#define CEC_NUM_PIN_EVENTS 128 + +struct cec_pin { + struct cec_adapter *adap; + const struct cec_pin_ops *ops; + struct task_struct *kthread; + wait_queue_head_t kthread_waitq; + struct hrtimer timer; + ktime_t ts; + unsigned int wait_usecs; + u16 la_mask; + bool enabled; + bool monitor_all; + bool cur_value; + bool rx_eom; + bool enable_irq_failed; + enum cec_pin_state state; + struct cec_msg tx_msg; + u32 tx_bit; + bool tx_nacked; + u32 tx_signal_free_time; + struct cec_msg rx_msg; + u32 rx_bit; + + struct cec_msg work_rx_msg; + u8 work_tx_status; + bool work_enable_irq; + ktime_t work_tx_ts; + atomic_t work_pin_events; + unsigned int work_pin_events_wr; + unsigned int work_pin_events_rd; + ktime_t work_pin_ts[CEC_NUM_PIN_EVENTS]; + bool work_pin_is_high[CEC_NUM_PIN_EVENTS]; + ktime_t timer_ts; + u32 timer_cnt; + u32 timer_100ms_overruns; + u32 timer_300ms_overruns; + u32 timer_max_overrun; + u32 timer_sum_overrun; +}; + +/** + * cec_pin_changed() - update pin state from interrupt + * + * @adap: pointer to the cec adapter + * @value: when true the pin is high, otherwise it is low + * + * If changes of the CEC voltage are detected via an interrupt, then + * cec_pin_changed is called from the interrupt with the new value. + */ +void cec_pin_changed(struct cec_adapter *adap, bool value); + +/** + * cec_pin_allocate_adapter() - allocate a pin-based cec adapter + * + * @pin_ops: low-level pin operations + * @priv: will be stored in adap->priv and can be used by the adapter ops. + * Use cec_get_drvdata(adap) to get the priv pointer. + * @name: the name of the CEC adapter. Note: this name will be copied. + * @caps: capabilities of the CEC adapter. This will be ORed with + * CEC_CAP_MONITOR_ALL and CEC_CAP_MONITOR_PIN. + * + * Allocate a cec adapter using the cec pin framework. + * + * Return: a pointer to the cec adapter or an error pointer + */ +struct cec_adapter *cec_pin_allocate_adapter(const struct cec_pin_ops *pin_ops, + void *priv, const char *name, u32 caps); + +#endif diff --git a/include/media/cec.h b/include/media/cec.h index d983960b37ad..f9cab1a9f912 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -61,6 +61,7 @@ struct cec_devnode { struct cec_adapter; struct cec_data; +struct cec_pin; struct cec_data { struct list_head list; @@ -189,6 +190,9 @@ struct cec_adapter { #ifdef CONFIG_CEC_NOTIFIER struct cec_notifier *notifier; #endif +#ifdef CONFIG_CEC_PIN + struct cec_pin *pin; +#endif struct dentry *cec_dir; struct dentry *status_file; -- cgit v1.2.3 From 5150418593015d280dab321da1fe3cea4ed3b693 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 13 Jul 2017 04:07:23 -0300 Subject: media: cec: move cec_register_cec_notifier to cec-notifier.h The cec_register_cec_notifier function was in media/cec.h, but it has to be in cec-notifier.h. While we are at it, also document it and add a stub function for when the notifier is disabled or the CEC core code is unreachable. Based on an earlier patch from Jose Abreu . 
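For illustration only, a rough sketch of how an adapter driver might hand a notifier to the CEC core with this declaration in place; the hdmi_dev and adap pointers here are hypothetical and error handling is omitted:

	struct cec_notifier *n = cec_notifier_get(hdmi_dev);

	if (n)
		cec_register_cec_notifier(adap, n);

Since a stub is provided when notifier support is disabled, such callers do not need #ifdef CONFIG_CEC_NOTIFIER guards.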
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/cec-notifier.h | 12 ++++++++++++ include/media/cec.h | 5 ----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/media/cec-notifier.h b/include/media/cec-notifier.h index 298f996969df..73bc98b90afc 100644 --- a/include/media/cec-notifier.h +++ b/include/media/cec-notifier.h @@ -86,6 +86,14 @@ void cec_notifier_register(struct cec_notifier *n, */ void cec_notifier_unregister(struct cec_notifier *n); +/** + * cec_register_cec_notifier - register the notifier with the cec adapter. + * @adap: the CEC adapter + * @notifier: the CEC notifier + */ +void cec_register_cec_notifier(struct cec_adapter *adap, + struct cec_notifier *notifier); + #else static inline struct cec_notifier *cec_notifier_get(struct device *dev) { @@ -116,6 +124,10 @@ static inline void cec_notifier_unregister(struct cec_notifier *n) { } +static inline void cec_register_cec_notifier(struct cec_adapter *adap, + struct cec_notifier *notifier) +{ +} #endif #endif diff --git a/include/media/cec.h b/include/media/cec.h index f9cab1a9f912..224a6e225c52 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -361,11 +361,6 @@ u16 cec_phys_addr_for_input(u16 phys_addr, u8 input); */ int cec_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port); -#ifdef CONFIG_CEC_NOTIFIER -void cec_register_cec_notifier(struct cec_adapter *adap, - struct cec_notifier *notifier); -#endif - #else static inline int cec_register_adapter(struct cec_adapter *adap, -- cgit v1.2.3 From ff60dcedbd99395c3355bc00fd302c31c96c9b92 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 15 Jun 2017 17:06:27 +0200 Subject: pinctrl: samsung: dt-bindings: Use better name for external interrupt function On Exynos, 0xf is always used as the value of the external interrupt in the pin mux function, thus a more descriptive macro name can be used. Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/pinctrl/samsung.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/pinctrl/samsung.h b/include/dt-bindings/pinctrl/samsung.h index b7aa3646208b..ceb672305f59 100644 --- a/include/dt-bindings/pinctrl/samsung.h +++ b/include/dt-bindings/pinctrl/samsung.h @@ -66,7 +66,8 @@ #define EXYNOS_PIN_FUNC_4 4 #define EXYNOS_PIN_FUNC_5 5 #define EXYNOS_PIN_FUNC_6 6 -#define EXYNOS_PIN_FUNC_F 0xf +#define EXYNOS_PIN_FUNC_EINT 0xf +#define EXYNOS_PIN_FUNC_F EXYNOS_PIN_FUNC_EINT /* Drive strengths for Exynos7 FSYS1 block */ #define EXYNOS7_FSYS1_PIN_DRV_LV1 0 -- cgit v1.2.3 From 09c7570480f7544ffbf8e6db365208b0b0c154c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Jul 2017 13:57:26 +0200 Subject: xfrm: remove flow cache After the RCU conversions, the performance degradation in forwarding tests isn't that noticeable anymore. See next patch for some numbers. A followup patch could then also remove the genid from the policies as we do not cache bundles anymore. Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/flow.h | 34 --- include/net/flowcache.h | 25 -- include/net/netns/xfrm.h | 11 - include/net/xfrm.h | 8 - net/core/Makefile | 1 - net/core/flow.c | 516 ---------------------------------------- net/ipv4/xfrm4_policy.c | 9 - net/ipv6/xfrm6_policy.c | 9 - net/key/af_key.c | 6 - net/xfrm/xfrm_device.c | 2 - net/xfrm/xfrm_policy.c | 108 +-------- net/xfrm/xfrm_user.c | 3 - security/selinux/include/xfrm.h | 4 +- 13 files changed, 2 insertions(+), 734 deletions(-) delete mode 100644 include/net/flowcache.h delete mode 100644 net/core/flow.c (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index bae198b3039e..f3dc61b29bb5 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -218,40 +218,6 @@ static inline unsigned int flow_key_size(u16 family) return 0; } -#define FLOW_DIR_IN 0 -#define FLOW_DIR_OUT 1 -#define FLOW_DIR_FWD 2 - -struct net; -struct sock; -struct flow_cache_ops; - -struct flow_cache_object { - const struct flow_cache_ops *ops; -}; - -struct flow_cache_ops { - struct flow_cache_object *(*get)(struct flow_cache_object *); - int (*check)(struct flow_cache_object *); - void (*delete)(struct flow_cache_object *); -}; - -typedef struct flow_cache_object *(*flow_resolve_t)( - struct net *net, const struct flowi *key, u16 family, - u8 dir, struct flow_cache_object *oldobj, void *ctx); - -struct flow_cache_object *flow_cache_lookup(struct net *net, - const struct flowi *key, u16 family, - u8 dir, flow_resolve_t resolver, - void *ctx); -int flow_cache_init(struct net *net); -void flow_cache_fini(struct net *net); -void flow_cache_hp_init(void); - -void flow_cache_flush(struct net *net); -void flow_cache_flush_deferred(struct net *net); -extern atomic_t flow_cache_genid; - __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6) diff --git a/include/net/flowcache.h b/include/net/flowcache.h deleted file mode 100644 index 51eb971e8973..000000000000 --- a/include/net/flowcache.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _NET_FLOWCACHE_H -#define _NET_FLOWCACHE_H - -#include -#include -#include -#include - -struct flow_cache_percpu { - struct hlist_head *hash_table; - unsigned int hash_count; - u32 hash_rnd; - int hash_rnd_recalc; - struct tasklet_struct flush_tasklet; -}; - -struct flow_cache { - u32 hash_shift; - struct flow_cache_percpu __percpu *percpu; - struct hlist_node node; - unsigned int low_watermark; - unsigned int high_watermark; - struct timer_list rnd_timer; -}; -#endif /* _NET_FLOWCACHE_H */ diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 27bb9633c69d..611521646dd4 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -6,7 +6,6 @@ #include #include #include -#include struct ctl_table_header; @@ -73,16 +72,6 @@ struct netns_xfrm { spinlock_t xfrm_state_lock; spinlock_t xfrm_policy_lock; struct mutex xfrm_cfg_mutex; - - /* flow cache part */ - struct flow_cache flow_cache_global; - atomic_t flow_cache_genid; - struct list_head flow_cache_gc_list; - atomic_t flow_cache_gc_count; - spinlock_t flow_cache_gc_lock; - struct work_struct flow_cache_gc_work; - struct work_struct flow_cache_flush_work; - struct mutex flow_flush_sem; }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c0916ab18d32..e0feba2ce76a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -563,7 +563,6 @@ struct xfrm_policy { refcount_t refcnt; struct timer_list timer; - struct flow_cache_object flo; atomic_t 
genid; u32 priority; u32 index; @@ -978,7 +977,6 @@ struct xfrm_dst { struct rt6_info rt6; } u; struct dst_entry *route; - struct flow_cache_object flo; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; @@ -1226,9 +1224,6 @@ static inline void xfrm_sk_free_policy(struct sock *sk) } } -void xfrm_garbage_collect(struct net *net); -void xfrm_garbage_collect_deferred(struct net *net); - #else static inline void xfrm_sk_free_policy(struct sock *sk) {} @@ -1263,9 +1258,6 @@ static inline int xfrm6_policy_check_reverse(struct sock *sk, int dir, { return 1; } -static inline void xfrm_garbage_collect(struct net *net) -{ -} #endif static __inline__ diff --git a/net/core/Makefile b/net/core/Makefile index 79f9479e9658..d501c4278015 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,6 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o -obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/flow.c b/net/core/flow.c deleted file mode 100644 index f7f5d1932a27..000000000000 --- a/net/core/flow.c +++ /dev/null @@ -1,516 +0,0 @@ -/* flow.c: Generic flow cache. - * - * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) - * Copyright (C) 2003 David S. Miller (davem@redhat.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct flow_cache_entry { - union { - struct hlist_node hlist; - struct list_head gc_list; - } u; - struct net *net; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - struct flow_cache_object *object; -}; - -struct flow_flush_info { - struct flow_cache *cache; - atomic_t cpuleft; - struct completion completion; -}; - -static struct kmem_cache *flow_cachep __read_mostly; - -#define flow_cache_hash_size(cache) (1U << (cache)->hash_shift) -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) - -static void flow_cache_new_hashrnd(unsigned long arg) -{ - struct flow_cache *fc = (void *) arg; - int i; - - for_each_possible_cpu(i) - per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); -} - -static int flow_entry_valid(struct flow_cache_entry *fle, - struct netns_xfrm *xfrm) -{ - if (atomic_read(&xfrm->flow_cache_genid) != fle->genid) - return 0; - if (fle->object && !fle->object->ops->check(fle->object)) - return 0; - return 1; -} - -static void flow_entry_kill(struct flow_cache_entry *fle, - struct netns_xfrm *xfrm) -{ - if (fle->object) - fle->object->ops->delete(fle->object); - kmem_cache_free(flow_cachep, fle); -} - -static void flow_cache_gc_task(struct work_struct *work) -{ - struct list_head gc_list; - struct flow_cache_entry *fce, *n; - struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, - flow_cache_gc_work); - - INIT_LIST_HEAD(&gc_list); - spin_lock_bh(&xfrm->flow_cache_gc_lock); - list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); - spin_unlock_bh(&xfrm->flow_cache_gc_lock); - - list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { - flow_entry_kill(fce, xfrm); - atomic_dec(&xfrm->flow_cache_gc_count); - } -} - -static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - unsigned int deleted, - struct list_head *gc_list, - struct 
netns_xfrm *xfrm) -{ - if (deleted) { - atomic_add(deleted, &xfrm->flow_cache_gc_count); - fcp->hash_count -= deleted; - spin_lock_bh(&xfrm->flow_cache_gc_lock); - list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); - spin_unlock_bh(&xfrm->flow_cache_gc_lock); - schedule_work(&xfrm->flow_cache_gc_work); - } -} - -static void __flow_cache_shrink(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - unsigned int shrink_to) -{ - struct flow_cache_entry *fle; - struct hlist_node *tmp; - LIST_HEAD(gc_list); - unsigned int deleted = 0; - struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, - flow_cache_global); - unsigned int i; - - for (i = 0; i < flow_cache_hash_size(fc); i++) { - unsigned int saved = 0; - - hlist_for_each_entry_safe(fle, tmp, - &fcp->hash_table[i], u.hlist) { - if (saved < shrink_to && - flow_entry_valid(fle, xfrm)) { - saved++; - } else { - deleted++; - hlist_del(&fle->u.hlist); - list_add_tail(&fle->u.gc_list, &gc_list); - } - } - } - - flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); -} - -static void flow_cache_shrink(struct flow_cache *fc, - struct flow_cache_percpu *fcp) -{ - unsigned int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - - __flow_cache_shrink(fc, fcp, shrink_to); -} - -static void flow_new_hash_rnd(struct flow_cache *fc, - struct flow_cache_percpu *fcp) -{ - get_random_bytes(&fcp->hash_rnd, sizeof(u32)); - fcp->hash_rnd_recalc = 0; - __flow_cache_shrink(fc, fcp, 0); -} - -static u32 flow_hash_code(struct flow_cache *fc, - struct flow_cache_percpu *fcp, - const struct flowi *key, - unsigned int keysize) -{ - const u32 *k = (const u32 *) key; - const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); - - return jhash2(k, length, fcp->hash_rnd) - & (flow_cache_hash_size(fc) - 1); -} - -/* I hear what you're saying, use memcmp. But memcmp cannot make - * important assumptions that we can here, such as alignment. - */ -static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, - unsigned int keysize) -{ - const flow_compare_t *k1, *k1_lim, *k2; - - k1 = (const flow_compare_t *) key1; - k1_lim = k1 + keysize; - - k2 = (const flow_compare_t *) key2; - - do { - if (*k1++ != *k2++) - return 1; - } while (k1 < k1_lim); - - return 0; -} - -struct flow_cache_object * -flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver, void *ctx) -{ - struct flow_cache *fc = &net->xfrm.flow_cache_global; - struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle, *tfle; - struct flow_cache_object *flo; - unsigned int keysize; - unsigned int hash; - - local_bh_disable(); - fcp = this_cpu_ptr(fc->percpu); - - fle = NULL; - flo = NULL; - - keysize = flow_key_size(family); - if (!keysize) - goto nocache; - - /* Packet really early in init? Making flow_cache_init a - * pre-smp initcall would solve this. 
--RR */ - if (!fcp->hash_table) - goto nocache; - - if (fcp->hash_rnd_recalc) - flow_new_hash_rnd(fc, fcp); - - hash = flow_hash_code(fc, fcp, key, keysize); - hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) { - if (tfle->net == net && - tfle->family == family && - tfle->dir == dir && - flow_key_compare(key, &tfle->key, keysize) == 0) { - fle = tfle; - break; - } - } - - if (unlikely(!fle)) { - if (fcp->hash_count > fc->high_watermark) - flow_cache_shrink(fc, fcp); - - if (atomic_read(&net->xfrm.flow_cache_gc_count) > - 2 * num_online_cpus() * fc->high_watermark) { - flo = ERR_PTR(-ENOBUFS); - goto ret_object; - } - - fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); - if (fle) { - fle->net = net; - fle->family = family; - fle->dir = dir; - memcpy(&fle->key, key, keysize * sizeof(flow_compare_t)); - fle->object = NULL; - hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); - fcp->hash_count++; - } - } else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) { - flo = fle->object; - if (!flo) - goto ret_object; - flo = flo->ops->get(flo); - if (flo) - goto ret_object; - } else if (fle->object) { - flo = fle->object; - flo->ops->delete(flo); - fle->object = NULL; - } - -nocache: - flo = NULL; - if (fle) { - flo = fle->object; - fle->object = NULL; - } - flo = resolver(net, key, family, dir, flo, ctx); - if (fle) { - fle->genid = atomic_read(&net->xfrm.flow_cache_genid); - if (!IS_ERR(flo)) - fle->object = flo; - else - fle->genid--; - } else { - if (!IS_ERR_OR_NULL(flo)) - flo->ops->delete(flo); - } -ret_object: - local_bh_enable(); - return flo; -} -EXPORT_SYMBOL(flow_cache_lookup); - -static void flow_cache_flush_tasklet(unsigned long data) -{ - struct flow_flush_info *info = (void *)data; - struct flow_cache *fc = info->cache; - struct flow_cache_percpu *fcp; - struct flow_cache_entry *fle; - struct hlist_node *tmp; - LIST_HEAD(gc_list); - unsigned int deleted = 0; - struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, - flow_cache_global); - unsigned int i; - - fcp = this_cpu_ptr(fc->percpu); - for (i = 0; i < flow_cache_hash_size(fc); i++) { - hlist_for_each_entry_safe(fle, tmp, - &fcp->hash_table[i], u.hlist) { - if (flow_entry_valid(fle, xfrm)) - continue; - - deleted++; - hlist_del(&fle->u.hlist); - list_add_tail(&fle->u.gc_list, &gc_list); - } - } - - flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm); - - if (atomic_dec_and_test(&info->cpuleft)) - complete(&info->completion); -} - -/* - * Return whether a cpu needs flushing. Conservatively, we assume - * the presence of any entries means the core may require flushing, - * since the flow_cache_ops.check() function may assume it's running - * on the same core as the per-cpu cache component. - */ -static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) -{ - struct flow_cache_percpu *fcp; - unsigned int i; - - fcp = per_cpu_ptr(fc->percpu, cpu); - for (i = 0; i < flow_cache_hash_size(fc); i++) - if (!hlist_empty(&fcp->hash_table[i])) - return 0; - return 1; -} - -static void flow_cache_flush_per_cpu(void *data) -{ - struct flow_flush_info *info = data; - struct tasklet_struct *tasklet; - - tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet; - tasklet->data = (unsigned long)info; - tasklet_schedule(tasklet); -} - -void flow_cache_flush(struct net *net) -{ - struct flow_flush_info info; - cpumask_var_t mask; - int i, self; - - /* Track which cpus need flushing to avoid disturbing all cores. 
*/ - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return; - cpumask_clear(mask); - - /* Don't want cpus going down or up during this. */ - get_online_cpus(); - mutex_lock(&net->xfrm.flow_flush_sem); - info.cache = &net->xfrm.flow_cache_global; - for_each_online_cpu(i) - if (!flow_cache_percpu_empty(info.cache, i)) - cpumask_set_cpu(i, mask); - atomic_set(&info.cpuleft, cpumask_weight(mask)); - if (atomic_read(&info.cpuleft) == 0) - goto done; - - init_completion(&info.completion); - - local_bh_disable(); - self = cpumask_test_and_clear_cpu(smp_processor_id(), mask); - on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0); - if (self) - flow_cache_flush_tasklet((unsigned long)&info); - local_bh_enable(); - - wait_for_completion(&info.completion); - -done: - mutex_unlock(&net->xfrm.flow_flush_sem); - put_online_cpus(); - free_cpumask_var(mask); -} - -static void flow_cache_flush_task(struct work_struct *work) -{ - struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm, - flow_cache_flush_work); - struct net *net = container_of(xfrm, struct net, xfrm); - - flow_cache_flush(net); -} - -void flow_cache_flush_deferred(struct net *net) -{ - schedule_work(&net->xfrm.flow_cache_flush_work); -} - -static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) -{ - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - unsigned int sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); - - if (!fcp->hash_table) { - fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); - if (!fcp->hash_table) { - pr_err("NET: failed to allocate flow cache sz %u\n", sz); - return -ENOMEM; - } - fcp->hash_rnd_recalc = 1; - fcp->hash_count = 0; - tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); - } - return 0; -} - -static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node) -{ - struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node); - - return flow_cache_cpu_prepare(fc, cpu); -} - -static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node) -{ - struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node); - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - - __flow_cache_shrink(fc, fcp, 0); - return 0; -} - -int flow_cache_init(struct net *net) -{ - int i; - struct flow_cache *fc = &net->xfrm.flow_cache_global; - - if (!flow_cachep) - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, NULL); - spin_lock_init(&net->xfrm.flow_cache_gc_lock); - INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list); - INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); - INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); - mutex_init(&net->xfrm.flow_flush_sem); - atomic_set(&net->xfrm.flow_cache_gc_count, 0); - - fc->hash_shift = 10; - fc->low_watermark = 2 * flow_cache_hash_size(fc); - fc->high_watermark = 4 * flow_cache_hash_size(fc); - - fc->percpu = alloc_percpu(struct flow_cache_percpu); - if (!fc->percpu) - return -ENOMEM; - - if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node)) - goto err; - - setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, - (unsigned long) fc); - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); - - return 0; - -err: - for_each_possible_cpu(i) { - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); - kfree(fcp->hash_table); - fcp->hash_table = NULL; - } - - free_percpu(fc->percpu); - fc->percpu = NULL; - - return -ENOMEM; -} -EXPORT_SYMBOL(flow_cache_init); - 
-void flow_cache_fini(struct net *net) -{ - int i; - struct flow_cache *fc = &net->xfrm.flow_cache_global; - - del_timer_sync(&fc->rnd_timer); - - cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node); - - for_each_possible_cpu(i) { - struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); - kfree(fcp->hash_table); - fcp->hash_table = NULL; - } - - free_percpu(fc->percpu); - fc->percpu = NULL; -} -EXPORT_SYMBOL(flow_cache_fini); - -void __init flow_cache_hp_init(void) -{ - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_NET_FLOW_PREPARE, - "net/flow:prepare", - flow_cache_cpu_up_prep, - flow_cache_cpu_dead); - WARN_ON(ret < 0); -} diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 19455a5fc328..4aefb149fe0a 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -213,14 +213,6 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) fl4->flowi4_tos = iph->tos; } -static inline int xfrm4_garbage_collect(struct dst_ops *ops) -{ - struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); - - xfrm_garbage_collect_deferred(net); - return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); -} - static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -259,7 +251,6 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, - .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, .cow_metrics = dst_cow_metrics_generic, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ae30dc4973e8..f44b25a48478 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -214,14 +214,6 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) } } -static inline int xfrm6_garbage_collect(struct dst_ops *ops) -{ - struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); - - xfrm_garbage_collect_deferred(net); - return dst_entries_get_fast(ops) > ops->gc_thresh * 2; -} - static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu) { @@ -279,7 +271,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, - .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, .redirect = xfrm6_redirect, .cow_metrics = dst_cow_metrics_generic, diff --git a/net/key/af_key.c b/net/key/af_key.c index ca9d3ae665e7..10d7133e4fe9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2398,8 +2398,6 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa out: xfrm_pol_put(xp); - if (err == 0) - xfrm_garbage_collect(net); return err; } @@ -2650,8 +2648,6 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_ out: xfrm_pol_put(xp); - if (delete && err == 0) - xfrm_garbage_collect(net); return err; } @@ -2751,8 +2747,6 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sad int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); - if (!err) - xfrm_garbage_collect(net); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5f7e8bfa0c2d..1f9a079e08b0 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -175,8 +175,6 @@ static int xfrm_dev_down(struct net_device *dev) 
if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); - xfrm_garbage_collect(dev_net(dev)); - return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 145d2395f3c0..0f1db4c18b22 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -246,36 +246,6 @@ expired: xfrm_pol_put(xp); } -static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo) -{ - struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); - - if (unlikely(pol->walk.dead)) - flo = NULL; - else - xfrm_pol_hold(pol); - - return flo; -} - -static int xfrm_policy_flo_check(struct flow_cache_object *flo) -{ - struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); - - return !pol->walk.dead; -} - -static void xfrm_policy_flo_delete(struct flow_cache_object *flo) -{ - xfrm_pol_put(container_of(flo, struct xfrm_policy, flo)); -} - -static const struct flow_cache_ops xfrm_policy_fc_ops = { - .get = xfrm_policy_flo_get, - .check = xfrm_policy_flo_check, - .delete = xfrm_policy_flo_delete, -}; - /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. */ @@ -298,7 +268,6 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) (unsigned long)policy); setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process, (unsigned long)policy); - policy->flo.ops = &xfrm_policy_fc_ops; } return policy; } @@ -798,7 +767,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) else hlist_add_head(&policy->bydst, chain); __xfrm_policy_link(policy, dir); - atomic_inc(&net->xfrm.flow_cache_genid); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) @@ -1490,58 +1458,6 @@ static int xfrm_get_tos(const struct flowi *fl, int family) return tos; } -static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo) -{ - struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); - struct dst_entry *dst = &xdst->u.dst; - - if (xdst->route == NULL) { - /* Dummy bundle - if it has xfrms we were not - * able to build bundle as template resolution failed. - * It means we need to try again resolving. 
*/ - if (xdst->num_xfrms > 0) - return NULL; - } else if (dst->flags & DST_XFRM_QUEUE) { - return NULL; - } else { - /* Real bundle */ - if (stale_bundle(dst)) - return NULL; - } - - dst_hold(dst); - return flo; -} - -static int xfrm_bundle_flo_check(struct flow_cache_object *flo) -{ - struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); - struct dst_entry *dst = &xdst->u.dst; - - if (!xdst->route) - return 0; - if (stale_bundle(dst)) - return 0; - - return 1; -} - -static void xfrm_bundle_flo_delete(struct flow_cache_object *flo) -{ - struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); - struct dst_entry *dst = &xdst->u.dst; - - /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ - dst->obsolete = DST_OBSOLETE_DEAD; - dst_release_immediate(dst); -} - -static const struct flow_cache_ops xfrm_bundle_fc_ops = { - .get = xfrm_bundle_flo_get, - .check = xfrm_bundle_flo_check, - .delete = xfrm_bundle_flo_delete, -}; - static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); @@ -1569,7 +1485,6 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) struct dst_entry *dst = &xdst->u.dst; memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); - xdst->flo.ops = &xfrm_bundle_fc_ops; } else xdst = ERR_PTR(-ENOBUFS); @@ -2521,11 +2436,9 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * notice. That's what we are validating here via the * stale_bundle() check. * - * When an xdst is removed from flow cache, DST_OBSOLETE_DEAD will - * be marked on it. * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. - * Both will force stable_bundle() to fail on any xdst bundle with + * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. 
*/ if (dst->obsolete < 0 && !stale_bundle(dst)) @@ -2565,18 +2478,6 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) return dst; } -void xfrm_garbage_collect(struct net *net) -{ - flow_cache_flush(net); -} -EXPORT_SYMBOL(xfrm_garbage_collect); - -void xfrm_garbage_collect_deferred(struct net *net) -{ - flow_cache_flush_deferred(net); -} -EXPORT_SYMBOL(xfrm_garbage_collect_deferred); - static void xfrm_init_pmtu(struct dst_entry *dst) { do { @@ -2914,14 +2815,9 @@ static int __net_init xfrm_net_init(struct net *net) rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; - rv = flow_cache_init(net); - if (rv < 0) - goto out; return 0; -out: - xfrm_sysctl_fini(net); out_sysctl: xfrm_policy_fini(net); out_policy: @@ -2934,7 +2830,6 @@ out_statistics: static void __net_exit xfrm_net_exit(struct net *net) { - flow_cache_fini(net); xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); @@ -2948,7 +2843,6 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { - flow_cache_hp_init(); register_pernet_subsys(&xfrm_net_ops); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2be4c6af008a..1b539b7dcfab 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1815,8 +1815,6 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, out: xfrm_pol_put(xp); - if (delete && err == 0) - xfrm_garbage_collect(net); return err; } @@ -2027,7 +2025,6 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; return err; } - xfrm_garbage_collect(net); c.data.type = type; c.event = nlh->nlmsg_type; diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 1450f85b946d..36a7ce9e11ff 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -47,10 +47,8 @@ static inline void selinux_xfrm_notify_policyload(void) struct net *net; rtnl_lock(); - for_each_net(net) { - atomic_inc(&net->xfrm.flow_cache_genid); + for_each_net(net) rt_genid_bump_all(net); - } rtnl_unlock(); } #else -- cgit v1.2.3 From ec30d78c14a813db39a647b6a348b4286ba4abf5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Jul 2017 13:57:27 +0200 Subject: xfrm: add xdst pcpu cache Retain the last used xfrm_dst in a pcpu cache. On next request, reuse this dst if the policies are the same. The cache will not help with strict RR workloads as there is no hit. The cache packet-path part is reasonably small, the notifier part is needed so we do not add long hangs when a device is dismantled but some pcpu xdst still holds a reference, there are also calls to the flush operation when userspace deletes SAs so modules can be removed (there is no hit). We need to run the dst_release on the correct cpu to avoid races with packet path. This is done by adding a work_struct for each cpu and then doing the actual test/release on each affected cpu via schedule_work_on(). Test results using 4 network namespaces and null encryption:

ns1 -> ns2 -> ns3 -> ns4
netperf -> xfrm/null enc -> xfrm/null dec -> netserver

what            TCP_STREAM  UDP_STREAM  UDP_RR
Flow cache:     14644.61    294.35      327231.64
No flow cache:  14349.81    242.64      202301.72
Pcpu cache:     14629.70    292.21      205595.22

UDP tests used 64-byte packets, tests ran for one minute each, each value is the average over ten iterations. 'Flow cache' is 'net-next', 'No flow cache' is net-next plus this series but without this patch. Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/xfrm.h | 1 + net/xfrm/xfrm_device.c | 2 + net/xfrm/xfrm_policy.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_state.c | 5 +- 4 files changed, 132 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0feba2ce76a..afb4929d7232 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -317,6 +317,7 @@ int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int fam void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c); +void xfrm_policy_cache_flush(void); void km_state_notify(struct xfrm_state *x, const struct km_event *c); struct xfrm_tmpl; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 1f9a079e08b0..5cd7a244e88d 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -153,6 +153,7 @@ static int xfrm_dev_register(struct net_device *dev) static int xfrm_dev_unregister(struct net_device *dev) { + xfrm_policy_cache_flush(); return NOTIFY_DONE; } @@ -175,6 +176,7 @@ static int xfrm_dev_down(struct net_device *dev) if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); + xfrm_policy_cache_flush(); return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0f1db4c18b22..06c3bf7ab86b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,8 @@ struct xfrm_flo { u8 flags; }; +static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst); +static struct work_struct *xfrm_pcpu_work __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; @@ -972,6 +975,8 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) } if (!cnt) err = -ESRCH; + else + xfrm_policy_cache_flush(); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; @@ -1700,6 +1705,102 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family, } +static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old) +{ + this_cpu_write(xfrm_last_dst, xdst); + if (old) + dst_release(&old->u.dst); +} + +static void __xfrm_pcpu_work_fn(void) +{ + struct xfrm_dst *old; + + old = this_cpu_read(xfrm_last_dst); + if (old && !xfrm_bundle_ok(old)) + xfrm_last_dst_update(NULL, old); +} + +static void xfrm_pcpu_work_fn(struct work_struct *work) +{ + local_bh_disable(); + rcu_read_lock(); + __xfrm_pcpu_work_fn(); + rcu_read_unlock(); + local_bh_enable(); +} + +void xfrm_policy_cache_flush(void) +{ + struct xfrm_dst *old; + bool found = 0; + int cpu; + + local_bh_disable(); + rcu_read_lock(); + for_each_possible_cpu(cpu) { + old = per_cpu(xfrm_last_dst, cpu); + if (old && !xfrm_bundle_ok(old)) { + if (smp_processor_id() == cpu) { + __xfrm_pcpu_work_fn(); + continue; + } + found = true; + break; + } + } + + rcu_read_unlock(); + local_bh_enable(); + + if (!found) + return; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + bool bundle_release; + + rcu_read_lock(); + old = per_cpu(xfrm_last_dst, cpu); + bundle_release = old && !xfrm_bundle_ok(old); + rcu_read_unlock(); + + if (!bundle_release) + continue; + + if (cpu_online(cpu)) { + schedule_work_on(cpu, &xfrm_pcpu_work[cpu]); + continue; + } + + rcu_read_lock(); + old = per_cpu(xfrm_last_dst, cpu); + if (old && !xfrm_bundle_ok(old)) { + 
per_cpu(xfrm_last_dst, cpu) = NULL; + dst_release(&old->u.dst); + } + rcu_read_unlock(); + } + + put_online_cpus(); +} + +static bool xfrm_pol_dead(struct xfrm_dst *xdst) +{ + unsigned int num_pols = xdst->num_pols; + unsigned int pol_dead = 0, i; + + for (i = 0; i < num_pols; i++) + pol_dead |= xdst->pols[i]->walk.dead; + + /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */ + if (pol_dead) + xdst->u.dst.obsolete = DST_OBSOLETE_DEAD; + + return pol_dead; +} + static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, @@ -1707,10 +1808,22 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct xfrm_dst *xdst, *old; struct dst_entry *dst; - struct xfrm_dst *xdst; int err; + xdst = this_cpu_read(xfrm_last_dst); + if (xdst && + xdst->u.dst.dev == dst_orig->dev && + xdst->num_pols == num_pols && + !xfrm_pol_dead(xdst) && + memcmp(xdst->pols, pols, + sizeof(struct xfrm_policy *) * num_pols) == 0) { + dst_hold(&xdst->u.dst); + return xdst; + } + + old = xdst; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { @@ -1731,6 +1844,9 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); + atomic_set(&xdst->u.dst.__refcnt, 2); + xfrm_last_dst_update(xdst, old); + return xdst; } @@ -2843,6 +2959,15 @@ static struct pernet_operations __net_initdata xfrm_net_ops = { void __init xfrm_init(void) { + int i; + + xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work), + GFP_KERNEL); + BUG_ON(!xfrm_pcpu_work); + + for (i = 0; i < NR_CPUS; i++) + INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); + register_pernet_subsys(&xfrm_net_ops); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 6c0956d10db6..82cbbce69b79 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -724,9 +724,10 @@ restart: } } } - if (cnt) + if (cnt) { err = 0; - + xfrm_policy_cache_flush(); + } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; -- cgit v1.2.3 From 7375ae3a0b79ea072f4c672039f08f5db633b9e1 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:34 -0500 Subject: compiler-gcc.h: Introduce __nostackprotector function attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a new function attribute, __nostackprotector, that can be used to turn off stack protection on a per-function basis. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S.
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/0576fd5c74440ad0250f16ac6609ecf587812456.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 2 ++ include/linux/compiler.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cd4bbe8242bd..c28cedde973f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -166,6 +166,8 @@ #if GCC_VERSION >= 40100 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) + +#define __nostackprotector __attribute__((__optimize__("no-stack-protector"))) #endif #if GCC_VERSION >= 40300 diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 219f82f3ec1a..3f8c88e29a46 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -470,6 +470,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s #define __visible #endif +#ifndef __nostackprotector +# define __nostackprotector +#endif + /* * Assume alignment of return value. */ -- cgit v1.2.3 From eb0baf8a0d9259d168523b8e7c436b55ade7c546 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Tue, 18 Jul 2017 20:13:09 +0800 Subject: perf/core: Define the common branch type classification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is often useful to know the branch types while analyzing branch data. For example, a call is very different from a conditional branch. Currently we have to look it up in the binary, but the binary may not be available later, and even when it is available the user has to spend extra time on it. It is much more convenient for the user to check the branch type directly in perf report. Perf already has support for disassembling the branch instruction to get the x86 branch type. To keep the kernel and userspace consistent and make the classification more common, the patch adds the common branch type classification in perf_event.h. The patch only defines a minimum but most common set of branch types.

PERF_BR_UNKNOWN   : unknown
PERF_BR_COND      : conditional
PERF_BR_UNCOND    : unconditional
PERF_BR_IND       : indirect
PERF_BR_CALL      : function call
PERF_BR_IND_CALL  : indirect function call
PERF_BR_RET       : function return
PERF_BR_SYSCALL   : syscall
PERF_BR_SYSRET    : syscall return
PERF_BR_COND_CALL : conditional function call
PERF_BR_COND_RET  : conditional function return

The patch also adds a new field type (4 bits) in perf_branch_entry to record the branch type. Since disassembling the branch instruction incurs some overhead, a new PERF_SAMPLE_BRANCH_TYPE_SAVE is introduced to indicate whether the branch instruction needs to be disassembled and the branch type recorded. (A short illustrative sketch of the new layout follows the change log below.)

Change log:

v10: Not changed.
v9: Not changed.
v8: Change PERF_BR_NONE to PERF_BR_UNKNOWN. No other change.
v7: Just keep the most common branch types. Others are removed.
v6: Not changed.
v5: Not changed. The v5 patch series just changes the userspace.
v4: Compared to the previous version, the major changes are:
    1. Remove the PERF_BR_JCC_FWD/PERF_BR_JCC_BWD; they will be computed later in userspace.
    2. Remove the "cross" field in perf_branch_entry. The cross page computing will be done later in userspace.
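[Editor's note: the following is a minimal, self-contained userspace sketch, not part of the patch. It mirrors the perf_branch_entry bitfield layout and the PERF_BR_* values added above in local copies, purely to show how the new 4-bit type field is decoded; real consumers should use the definitions from <linux/perf_event.h> once this header lands.]

/*
 * Illustrative sketch only: local mirror of the perf_branch_entry
 * layout added by this patch. Real code should include
 * <linux/perf_event.h> instead of redefining these.
 */
#include <stdio.h>

enum {
	BR_UNKNOWN, BR_COND, BR_UNCOND, BR_IND, BR_CALL, BR_IND_CALL,
	BR_RET, BR_SYSCALL, BR_SYSRET, BR_COND_CALL, BR_COND_RET, BR_MAX,
};

static const char * const br_name[BR_MAX] = {
	"unknown", "conditional", "unconditional", "indirect",
	"call", "indirect call", "return", "syscall", "syscall return",
	"conditional call", "conditional return",
};

struct branch_entry {			/* mirrors struct perf_branch_entry */
	unsigned long long from;
	unsigned long long to;
	unsigned long long mispred:1,	/* target mispredicted */
			   predicted:1,	/* target predicted */
			   in_tx:1,	/* in transaction */
			   abort:1,	/* transaction abort */
			   cycles:16,	/* cycle count to last branch */
			   type:4,	/* branch type (new field) */
			   reserved:40;
};

int main(void)
{
	struct branch_entry e = {
		.from = 0x400100, .to = 0x400200,
		.cycles = 12, .type = BR_COND_CALL,
	};

	printf("0x%llx -> 0x%llx: %s, %u cycles\n", e.from, e.to,
	       e.type < BR_MAX ? br_name[e.type] : "invalid",
	       (unsigned int)e.cycles);
	return 0;
}

Four bits leave room for 16 branch types while only 11 (PERF_BR_MAX) are defined, so the classification can grow without immediately eating into the remaining reserved bits.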
Signed-off-by: Yao Jin Acked-by: Jiri Olsa Acked-by: Michael Ellerman Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Andi Kleen Cc: Kan Liang Link: http://lkml.kernel.org/r/1500379995-6449-2-git-send-email-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++++++++- tools/include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b1c0b187acfe..642db5fa3286 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -198,9 +200,30 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1015,6 +1038,7 @@ union perf_mem_data_src { * in_tx: running in a hardware transaction * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) + * type: branch type */ struct perf_branch_entry { __u64 from; @@ -1024,7 +1048,8 @@ struct perf_branch_entry { in_tx:1, /* in transaction */ abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ - reserved:44; + type:4, /* branch type */ + reserved:40; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index b1c0b187acfe..642db5fa3286 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -174,6 +174,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -198,9 +200,30 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + 
PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1015,6 +1038,7 @@ union perf_mem_data_src { * in_tx: running in a hardware transaction * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) + * type: branch type */ struct perf_branch_entry { __u64 from; @@ -1024,7 +1048,8 @@ struct perf_branch_entry { in_tx:1, /* in transaction */ abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ - reserved:44; + type:4, /* branch type */ + reserved:40; }; #endif /* _UAPI_LINUX_PERF_EVENT_H */ -- cgit v1.2.3 From 17c82e206d2a3cd876b64921c59116f1ecdce6ad Mon Sep 17 00:00:00 2001 From: Vivek Gautam Date: Mon, 22 May 2017 16:53:25 +0530 Subject: reset: Add APIs to manage array of resets Many devices may want to request a bunch of resets and control them. So it's better to manage them as an array. Add APIs to _get() an array of reset_control, reusing the _assert(), _deassert(), and _reset() APIs for single reset controls. Since reset controls already may control multiple reset lines with a single hardware bit, from the user perspective, reset control arrays are not at all different from single reset controls. Note that these APIs don't guarantee that the reset lines managed in the array are handled in any particular order. Cc: Felipe Balbi Cc: Jon Hunter Signed-off-by: Vivek Gautam [p.zabel@pengutronix.de: changed API to hide reset control arrays behind struct reset_control] Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/reset.h | 68 ++++++++++++++++ 2 files changed, 278 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 0090784ff410..c8fb4426b218 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -43,10 +43,23 @@ struct reset_control { unsigned int id; struct kref refcnt; bool shared; + bool array; atomic_t deassert_count; atomic_t triggered_count; }; +/** + * struct reset_control_array - an array of reset controls + * @base: reset control for compatibility with reset control API functions + * @num_rstcs: number of reset controls + * @rstc: array of reset controls + */ +struct reset_control_array { + struct reset_control base; + unsigned int num_rstcs; + struct reset_control *rstc[]; +}; + /** * of_reset_simple_xlate - translate reset_spec to the reset line number * @rcdev: a pointer to the reset controller device @@ -135,6 +148,65 @@ int devm_reset_controller_register(struct device *dev, } EXPORT_SYMBOL_GPL(devm_reset_controller_register); +static inline struct reset_control_array * +rstc_to_array(struct reset_control *rstc) { + return container_of(rstc, struct reset_control_array, base); +} + +static int reset_control_array_reset(struct reset_control_array *resets) +{ + int ret, i; + + for (i = 0; i < resets->num_rstcs; i++) { + ret = reset_control_reset(resets->rstc[i]); + if (ret) + return ret; + } + + return 0; +} + +static int reset_control_array_assert(struct reset_control_array *resets) +{ + int ret, i; + + for (i = 0; i < resets->num_rstcs; i++) { + ret = reset_control_assert(resets->rstc[i]); + if (ret) + goto err; + } + + 
return 0; + +err: + while (i--) + reset_control_deassert(resets->rstc[i]); + return ret; +} + +static int reset_control_array_deassert(struct reset_control_array *resets) +{ + int ret, i; + + for (i = 0; i < resets->num_rstcs; i++) { + ret = reset_control_deassert(resets->rstc[i]); + if (ret) + goto err; + } + + return 0; + +err: + while (i--) + reset_control_assert(resets->rstc[i]); + return ret; +} + +static inline bool reset_control_is_array(struct reset_control *rstc) +{ + return rstc->array; +} + /** * reset_control_reset - reset the controlled device * @rstc: reset controller @@ -158,6 +230,9 @@ int reset_control_reset(struct reset_control *rstc) if (WARN_ON(IS_ERR(rstc))) return -EINVAL; + if (reset_control_is_array(rstc)) + return reset_control_array_reset(rstc_to_array(rstc)); + if (!rstc->rcdev->ops->reset) return -ENOTSUPP; @@ -202,6 +277,9 @@ int reset_control_assert(struct reset_control *rstc) if (WARN_ON(IS_ERR(rstc))) return -EINVAL; + if (reset_control_is_array(rstc)) + return reset_control_array_assert(rstc_to_array(rstc)); + if (!rstc->rcdev->ops->assert) return -ENOTSUPP; @@ -240,6 +318,9 @@ int reset_control_deassert(struct reset_control *rstc) if (WARN_ON(IS_ERR(rstc))) return -EINVAL; + if (reset_control_is_array(rstc)) + return reset_control_array_deassert(rstc_to_array(rstc)); + if (!rstc->rcdev->ops->deassert) return -ENOTSUPP; @@ -266,7 +347,7 @@ int reset_control_status(struct reset_control *rstc) if (!rstc) return 0; - if (WARN_ON(IS_ERR(rstc))) + if (WARN_ON(IS_ERR(rstc)) || reset_control_is_array(rstc)) return -EINVAL; if (rstc->rcdev->ops->status) @@ -404,6 +485,16 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id, } EXPORT_SYMBOL_GPL(__reset_control_get); +static void reset_control_array_put(struct reset_control_array *resets) +{ + int i; + + mutex_lock(&reset_list_mutex); + for (i = 0; i < resets->num_rstcs; i++) + __reset_control_put_internal(resets->rstc[i]); + mutex_unlock(&reset_list_mutex); +} + /** * reset_control_put - free the reset controller * @rstc: reset controller @@ -413,6 +504,11 @@ void reset_control_put(struct reset_control *rstc) if (IS_ERR_OR_NULL(rstc)) return; + if (reset_control_is_array(rstc)) { + reset_control_array_put(rstc_to_array(rstc)); + return; + } + mutex_lock(&reset_list_mutex); __reset_control_put_internal(rstc); mutex_unlock(&reset_list_mutex); @@ -472,3 +568,116 @@ int device_reset(struct device *dev) return ret; } EXPORT_SYMBOL_GPL(device_reset); + +/** + * APIs to manage an array of reset controls. + */ +/** + * of_reset_control_get_count - Count number of resets available with a device + * + * @node: device node that contains 'resets'. + * + * Returns positive reset count on success, or error number on failure and + * on count being zero. + */ +static int of_reset_control_get_count(struct device_node *node) +{ + int count; + + if (!node) + return -EINVAL; + + count = of_count_phandle_with_args(node, "resets", "#reset-cells"); + if (count == 0) + count = -ENOENT; + + return count; +} + +/** + * of_reset_control_array_get - Get a list of reset controls using + * device node. 
+ * + * @np: device node for the device that requests the reset controls array + * @shared: whether reset controls are shared or not + * @optional: whether it is optional to get the reset controls + * + * Returns pointer to allocated reset_control_array on success or + * error on failure + */ +struct reset_control * +of_reset_control_array_get(struct device_node *np, bool shared, bool optional) +{ + struct reset_control_array *resets; + struct reset_control *rstc; + int num, i; + + num = of_reset_control_get_count(np); + if (num < 0) + return optional ? NULL : ERR_PTR(num); + + resets = kzalloc(sizeof(*resets) + sizeof(resets->rstc[0]) * num, + GFP_KERNEL); + if (!resets) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < num; i++) { + rstc = __of_reset_control_get(np, NULL, i, shared, optional); + if (IS_ERR(rstc)) + goto err_rst; + resets->rstc[i] = rstc; + } + resets->num_rstcs = num; + resets->base.array = true; + + return &resets->base; + +err_rst: + mutex_lock(&reset_list_mutex); + while (--i >= 0) + __reset_control_put_internal(resets->rstc[i]); + mutex_unlock(&reset_list_mutex); + + kfree(resets); + + return rstc; +} +EXPORT_SYMBOL_GPL(of_reset_control_array_get); + +/** + * devm_reset_control_array_get - Resource managed reset control array get + * + * @dev: device that requests the list of reset controls + * @shared: whether reset controls are shared or not + * @optional: whether it is optional to get the reset controls + * + * The reset control array APIs are intended for a list of resets + * that just have to be asserted or deasserted, without any + * requirements on the order. + * + * Returns pointer to allocated reset_control_array on success or + * error on failure + */ +struct reset_control * +devm_reset_control_array_get(struct device *dev, bool shared, bool optional) +{ + struct reset_control **devres; + struct reset_control *rstc; + + devres = devres_alloc(devm_reset_control_release, sizeof(*devres), + GFP_KERNEL); + if (!devres) + return ERR_PTR(-ENOMEM); + + rstc = of_reset_control_array_get(dev->of_node, shared, optional); + if (IS_ERR(rstc)) { + devres_free(devres); + return rstc; + } + + *devres = rstc; + devres_add(dev, devres); + + return rstc; +} +EXPORT_SYMBOL_GPL(devm_reset_control_array_get); diff --git a/include/linux/reset.h b/include/linux/reset.h index 13d8681210d5..56463f37f3e6 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -25,6 +25,11 @@ struct reset_control *__devm_reset_control_get(struct device *dev, int __must_check device_reset(struct device *dev); +struct reset_control *devm_reset_control_array_get(struct device *dev, + bool shared, bool optional); +struct reset_control *of_reset_control_array_get(struct device_node *np, + bool shared, bool optional); + static inline int device_reset_optional(struct device *dev) { return device_reset(dev); @@ -89,6 +94,18 @@ static inline struct reset_control *__devm_reset_control_get( return optional ? NULL : ERR_PTR(-ENOTSUPP); } +static inline struct reset_control * +devm_reset_control_array_get(struct device *dev, bool shared, bool optional) +{ + return optional ? NULL : ERR_PTR(-ENOTSUPP); +} + +static inline struct reset_control * +of_reset_control_array_get(struct device_node *np, bool shared, bool optional) +{ + return optional ? 
NULL : ERR_PTR(-ENOTSUPP); +} + #endif /* CONFIG_RESET_CONTROLLER */ /** @@ -374,4 +391,55 @@ static inline struct reset_control *devm_reset_control_get_by_index( { return devm_reset_control_get_exclusive_by_index(dev, index); } + +/* + * APIs to manage a list of reset controllers + */ +static inline struct reset_control * +devm_reset_control_array_get_exclusive(struct device *dev) +{ + return devm_reset_control_array_get(dev, false, false); +} + +static inline struct reset_control * +devm_reset_control_array_get_shared(struct device *dev) +{ + return devm_reset_control_array_get(dev, true, false); +} + +static inline struct reset_control * +devm_reset_control_array_get_optional_exclusive(struct device *dev) +{ + return devm_reset_control_array_get(dev, false, true); +} + +static inline struct reset_control * +devm_reset_control_array_get_optional_shared(struct device *dev) +{ + return devm_reset_control_array_get(dev, true, true); +} + +static inline struct reset_control * +of_reset_control_array_get_exclusive(struct device_node *node) +{ + return of_reset_control_array_get(node, false, false); +} + +static inline struct reset_control * +of_reset_control_array_get_shared(struct device_node *node) +{ + return of_reset_control_array_get(node, true, false); +} + +static inline struct reset_control * +of_reset_control_array_get_optional_exclusive(struct device_node *node) +{ + return of_reset_control_array_get(node, false, true); +} + +static inline struct reset_control * +of_reset_control_array_get_optional_shared(struct device_node *node) +{ + return of_reset_control_array_get(node, true, true); +} #endif -- cgit v1.2.3 From ddc9e69b9dc23d4c0d8ed829575327746ea77a04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 18 Jul 2017 20:34:17 +0300 Subject: ASoC: rt5677: Hide platform data in the module sources There is no user of legacy platform data. Remove separate header and hide its content inside module sources. Signed-off-by: Andy Shevchenko Signed-off-by: Mark Brown --- include/sound/rt5677.h | 45 --------------------------------------------- sound/soc/codecs/rt5677.c | 29 +++++++++++++++++------------ sound/soc/codecs/rt5677.h | 30 +++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 58 deletions(-) delete mode 100644 include/sound/rt5677.h (limited to 'include') diff --git a/include/sound/rt5677.h b/include/sound/rt5677.h deleted file mode 100644 index a6207043ac3c..000000000000 --- a/include/sound/rt5677.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * linux/sound/rt5677.h -- Platform data for RT5677 - * - * Copyright 2013 Realtek Semiconductor Corp. - * Author: Oder Chiou - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#ifndef __LINUX_SND_RT5677_H -#define __LINUX_SND_RT5677_H - -enum rt5677_dmic2_clk { - RT5677_DMIC_CLK1 = 0, - RT5677_DMIC_CLK2 = 1, -}; - - -struct rt5677_platform_data { - /* IN1/IN2/LOUT1/LOUT2/LOUT3 can optionally be differential */ - bool in1_diff; - bool in2_diff; - bool lout1_diff; - bool lout2_diff; - bool lout3_diff; - /* DMIC2 clock source selection */ - enum rt5677_dmic2_clk dmic2_clk_pin; - - /* configures GPIO, 0 - floating, 1 - pulldown, 2 - pullup */ - u8 gpio_config[6]; - - /* jd1 can select 0 ~ 3 as OFF, GPIO1, GPIO2 and GPIO3 respectively */ - unsigned int jd1_gpio; - /* jd2 and jd3 can select 0 ~ 3 as - OFF, GPIO4, GPIO5 and GPIO6 respectively */ - unsigned int jd2_gpio; - unsigned int jd3_gpio; - - /* Set MICBIAS1 VDD 1v8 or 3v3 */ - bool micbias1_vdd_3v3; -}; - -#endif diff --git a/sound/soc/codecs/rt5677.c b/sound/soc/codecs/rt5677.c index 36e530a36c82..0f9642c2198a 100644 --- a/sound/soc/codecs/rt5677.c +++ b/sound/soc/codecs/rt5677.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -5019,25 +5020,21 @@ static const struct regmap_config rt5677_regmap = { }; static const struct i2c_device_id rt5677_i2c_id[] = { - { "rt5677", RT5677 }, - { "rt5676", RT5676 }, { } }; MODULE_DEVICE_TABLE(i2c, rt5677_i2c_id); static const struct of_device_id rt5677_of_match[] = { - { .compatible = "realtek,rt5677", }, + { .compatible = "realtek,rt5677", RT5677 }, { } }; MODULE_DEVICE_TABLE(of, rt5677_of_match); -#ifdef CONFIG_ACPI static const struct acpi_device_id rt5677_acpi_match[] = { { "RT5677CE", RT5677 }, { } }; MODULE_DEVICE_TABLE(acpi, rt5677_acpi_match); -#endif static void rt5677_read_acpi_properties(struct rt5677_priv *rt5677, struct device *dev) @@ -5147,7 +5144,6 @@ static void rt5677_free_irq(struct i2c_client *i2c) static int rt5677_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { - struct rt5677_platform_data *pdata = dev_get_platdata(&i2c->dev); struct rt5677_priv *rt5677; int ret; unsigned int val; @@ -5159,16 +5155,25 @@ static int rt5677_i2c_probe(struct i2c_client *i2c, i2c_set_clientdata(i2c, rt5677); - rt5677->type = id->driver_data; + if (i2c->dev.of_node) { + const struct of_device_id *match_id; + + match_id = of_match_device(rt5677_of_match, &i2c->dev); + if (match_id) + rt5677->type = (enum rt5677_type)match_id->data; - if (pdata) - rt5677->pdata = *pdata; - else if (i2c->dev.of_node) rt5677_read_device_properties(rt5677, &i2c->dev); - else if (ACPI_HANDLE(&i2c->dev)) + } else if (ACPI_HANDLE(&i2c->dev)) { + const struct acpi_device_id *acpi_id; + + acpi_id = acpi_match_device(rt5677_acpi_match, &i2c->dev); + if (acpi_id) + rt5677->type = (enum rt5677_type)acpi_id->driver_data; + rt5677_read_acpi_properties(rt5677, &i2c->dev); - else + } else { return -EINVAL; + } /* pow-ldo2 and reset are optional. The codec pins may be statically * connected on the board without gpios. 
If the gpio device property diff --git a/sound/soc/codecs/rt5677.h b/sound/soc/codecs/rt5677.h index d46855a42c40..97239973edc4 100644 --- a/sound/soc/codecs/rt5677.h +++ b/sound/soc/codecs/rt5677.h @@ -12,7 +12,6 @@ #ifndef __RT5677_H__ #define __RT5677_H__ -#include #include #include @@ -1761,6 +1760,35 @@ enum { RT5677_I2S4_SOURCE = (0x1 << 18), }; +enum rt5677_dmic2_clk { + RT5677_DMIC_CLK1 = 0, + RT5677_DMIC_CLK2 = 1, +}; + +struct rt5677_platform_data { + /* IN1/IN2/LOUT1/LOUT2/LOUT3 can optionally be differential */ + bool in1_diff; + bool in2_diff; + bool lout1_diff; + bool lout2_diff; + bool lout3_diff; + /* DMIC2 clock source selection */ + enum rt5677_dmic2_clk dmic2_clk_pin; + + /* configures GPIO, 0 - floating, 1 - pulldown, 2 - pullup */ + u8 gpio_config[6]; + + /* jd1 can select 0 ~ 3 as OFF, GPIO1, GPIO2 and GPIO3 respectively */ + unsigned int jd1_gpio; + /* jd2 and jd3 can select 0 ~ 3 as + OFF, GPIO4, GPIO5 and GPIO6 respectively */ + unsigned int jd2_gpio; + unsigned int jd3_gpio; + + /* Set MICBIAS1 VDD 1v8 or 3v3 */ + bool micbias1_vdd_3v3; +}; + struct rt5677_priv { struct snd_soc_codec *codec; struct rt5677_platform_data pdata; -- cgit v1.2.3 From 97bbdf02d905a0a48cb3c953ea352d7f4994643c Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 25 Feb 2015 14:39:11 -0500 Subject: media: v4l: Add support for CSI-1 and CCP2 busses CCP2 and CSI-1 are older, single data lane serial busses. [mchehab@s-opensource.com: don't use spaces for indentation] Signed-off-by: Sakari Ailus Signed-off-by: Pavel Machek Reviewed-by: Sebastian Reichel Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/pxa_camera.c | 3 ++ drivers/media/platform/soc_camera/soc_mediabus.c | 3 ++ drivers/media/v4l2-core/v4l2-fwnode.c | 58 +++++++++++++++++++----- include/media/v4l2-fwnode.h | 19 ++++++++ include/media/v4l2-mediabus.h | 4 ++ 5 files changed, 76 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/pxa_camera.c b/drivers/media/platform/pxa_camera.c index 5c3b7cf3638b..3898a5cd8664 100644 --- a/drivers/media/platform/pxa_camera.c +++ b/drivers/media/platform/pxa_camera.c @@ -638,6 +638,9 @@ static unsigned int pxa_mbus_config_compatible(const struct v4l2_mbus_config *cf mipi_clock = common_flags & (V4L2_MBUS_CSI2_NONCONTINUOUS_CLOCK | V4L2_MBUS_CSI2_CONTINUOUS_CLOCK); return (!mipi_lanes || !mipi_clock) ?
0 : common_flags; + default: + __WARN(); + return -EINVAL; } return 0; } diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index d71dd3913cd9..ca755a4832fc 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -154,6 +154,31 @@ static void v4l2_fwnode_endpoint_parse_parallel_bus( } +void v4l2_fwnode_endpoint_parse_csi1_bus(struct fwnode_handle *fwnode, + struct v4l2_fwnode_endpoint *vep, + u32 bus_type) +{ + struct v4l2_fwnode_bus_mipi_csi1 *bus = &vep->bus.mipi_csi1; + u32 v; + + if (!fwnode_property_read_u32(fwnode, "clock-inv", &v)) + bus->clock_inv = v; + + if (!fwnode_property_read_u32(fwnode, "strobe", &v)) + bus->strobe = v; + + if (!fwnode_property_read_u32(fwnode, "data-lanes", &v)) + bus->data_lane = v; + + if (!fwnode_property_read_u32(fwnode, "clock-lanes", &v)) + bus->clock_lane = v; + + if (bus_type == V4L2_FWNODE_BUS_TYPE_CCP2) + vep->bus_type = V4L2_MBUS_CCP2; + else + vep->bus_type = V4L2_MBUS_CSI1; +} + /** * v4l2_fwnode_endpoint_parse() - parse all fwnode node properties * @fwnode: pointer to the endpoint's fwnode handle @@ -187,17 +212,28 @@ int v4l2_fwnode_endpoint_parse(struct fwnode_handle *fwnode, fwnode_property_read_u32(fwnode, "bus-type", &bus_type); - rval = v4l2_fwnode_endpoint_parse_csi2_bus(fwnode, vep); - if (rval) - return rval; - /* - * Parse the parallel video bus properties only if none - * of the MIPI CSI-2 specific properties were found. - */ - if (vep->bus.mipi_csi2.flags == 0) - v4l2_fwnode_endpoint_parse_parallel_bus(fwnode, vep); - - return 0; + switch (bus_type) { + case V4L2_FWNODE_BUS_TYPE_GUESS: + rval = v4l2_fwnode_endpoint_parse_csi2_bus(fwnode, vep); + if (rval) + return rval; + /* + * Parse the parallel video bus properties only if none + * of the MIPI CSI-2 specific properties were found. 
+ */ + if (vep->bus.mipi_csi2.flags == 0) + v4l2_fwnode_endpoint_parse_parallel_bus(fwnode, vep); + + return 0; + case V4L2_FWNODE_BUS_TYPE_CCP2: + case V4L2_FWNODE_BUS_TYPE_CSI1: + v4l2_fwnode_endpoint_parse_csi1_bus(fwnode, vep, bus_type); + + return 0; + default: + pr_warn("unsupported bus type %u\n", bus_type); + return -EINVAL; + } } EXPORT_SYMBOL_GPL(v4l2_fwnode_endpoint_parse); diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index ecc1233a873e..29ae22bbbbaf 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -55,6 +55,24 @@ struct v4l2_fwnode_bus_parallel { unsigned char data_shift; }; +/** + * struct v4l2_fwnode_bus_mipi_csi1 - CSI-1/CCP2 data bus structure + * @clock_inv: polarity of clock/strobe signal + * false - not inverted, true - inverted + * @strobe: false - data/clock, true - data/strobe + * @lane_polarity: the polarities of the clock (index 0) and data lanes + index (1) + * @data_lane: the number of the data lane + * @clock_lane: the number of the clock lane + */ +struct v4l2_fwnode_bus_mipi_csi1 { + bool clock_inv; + bool strobe; + bool lane_polarity[2]; + unsigned char data_lane; + unsigned char clock_lane; +}; + /** * struct v4l2_fwnode_endpoint - the endpoint data structure + * @base: fwnode endpoint of the v4l2_fwnode @@ -72,6 +90,7 @@ struct v4l2_fwnode_endpoint { enum v4l2_mbus_type bus_type; union { struct v4l2_fwnode_bus_parallel parallel; + struct v4l2_fwnode_bus_mipi_csi1 mipi_csi1; struct v4l2_fwnode_bus_mipi_csi2 mipi_csi2; } bus; u64 *link_frequencies; diff --git a/include/media/v4l2-mediabus.h b/include/media/v4l2-mediabus.h index 34cc99e093ef..315c167a95dc 100644 --- a/include/media/v4l2-mediabus.h +++ b/include/media/v4l2-mediabus.h @@ -69,11 +69,15 @@ * @V4L2_MBUS_PARALLEL: parallel interface with hsync and vsync * @V4L2_MBUS_BT656: parallel interface with embedded synchronisation, can * also be used for BT.1120 + * @V4L2_MBUS_CSI1: MIPI CSI-1 serial interface + * @V4L2_MBUS_CCP2: CCP2 (Compact Camera Port 2) * @V4L2_MBUS_CSI2: MIPI CSI-2 serial interface */ enum v4l2_mbus_type { V4L2_MBUS_PARALLEL, V4L2_MBUS_BT656, + V4L2_MBUS_CSI1, + V4L2_MBUS_CCP2, V4L2_MBUS_CSI2, }; -- cgit v1.2.3 From bb4d991a28cc86a2dfbeefeff32911ca9f779c18 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 19 Jul 2017 15:41:26 -0700 Subject: tcp: adjust tail loss probe timeout This patch adjusts the timeout formula used to schedule the TCP loss probe (TLP). The previous formula uses 2*SRTT, or 1.5*RTT + DelayACKMax if only one packet is in flight. It keeps a lower bound of 10 msec, which is too large for short RTT connections (e.g. within a data-center). The new formula is 2*RTT + (inflight == 1 ? 200ms : 2 ticks), which performs better for short and fast connections. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S.
Miller --- include/net/tcp.h | 3 +-- net/ipv4/tcp_output.c | 17 ++++++++++------- net/ipv4/tcp_recovery.c | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 70483296157f..4f056ea79df2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -139,6 +139,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the @@ -150,8 +151,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ -#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */ - #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4e985dea1dd2..886d874775df 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2377,7 +2377,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 timeout, tlp_time_stamp, rto_time_stamp; - u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { @@ -2406,15 +2405,19 @@ bool tcp_schedule_loss_probe(struct sock *sk) tcp_send_head(sk)) return false; - /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + /* Probe timeout is 2*rtt. Add minimum RTO to account * for delayed ack when there's one outstanding packet. If no RTT * sample is available then probe after TCP_TIMEOUT_INIT. */ - timeout = rtt << 1 ? : TCP_TIMEOUT_INIT; - if (tp->packets_out == 1) - timeout = max_t(u32, timeout, - (rtt + (rtt >> 1) + TCP_DELACK_MAX)); - timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + if (tp->srtt_us) { + timeout = usecs_to_jiffies(tp->srtt_us >> 2); + if (tp->packets_out == 1) + timeout += TCP_RTO_MIN; + else + timeout += TCP_TIMEOUT_MIN; + } else { + timeout = TCP_TIMEOUT_INIT; + } /* If RTO is shorter, just schedule TLP in its place. */ tlp_time_stamp = tcp_jiffies32 + timeout; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index fe9a493d0208..449cd914d58e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -113,7 +113,7 @@ void tcp_rack_mark_lost(struct sock *sk) tp->rack.advanced = 0; tcp_rack_detect_loss(sk, &timeout); if (timeout) { - timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, timeout, inet_csk(sk)->icsk_rto); } -- cgit v1.2.3 From e7d53ad3239de636ea478fc003d3652b49b8e593 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 18 Jul 2017 16:23:56 -0400 Subject: net: dsa: unexport dsa_is_port_initialized The dsa_is_port_initialized helper is only used by dsa_switch_resume and dsa_switch_suspend, if CONFIG_PM_SLEEP is enabled. Make it static to dsa.c. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 5 ----- net/dsa/dsa.c | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 58969b9a090c..88da272d20d0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -256,11 +256,6 @@ static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); } -static inline bool dsa_is_port_initialized(struct dsa_switch *ds, int p) -{ - return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; -} - static inline u8 dsa_upstream_port(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 416ac4ef9ba9..a55e2e4087a4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -220,6 +220,11 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, } #ifdef CONFIG_PM_SLEEP +static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) +{ + return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; +} + int dsa_switch_suspend(struct dsa_switch *ds) { int i, ret = 0; -- cgit v1.2.3 From 7051b88a35c7dde5705923833117e14f9cc17d92 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 18 Jul 2017 15:59:27 -0700 Subject: net: make dev_close and related functions void There is no useful return value from dev_close. All paths return 0. Change dev_close and helper functions to void. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- net/core/dev.c | 26 +++++++++++--------------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c60351b84323..614642eb7eb7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2432,8 +2432,8 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int dev_open(struct net_device *dev); -int dev_close(struct net_device *dev); -int dev_close_many(struct list_head *head, bool unlink); +void dev_close(struct net_device *dev); +void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 467420eda02e..d1b9c9b6c970 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1413,7 +1413,7 @@ int dev_open(struct net_device *dev) } EXPORT_SYMBOL(dev_open); -static int __dev_close_many(struct list_head *head) +static void __dev_close_many(struct list_head *head) { struct net_device *dev; @@ -1455,23 +1455,18 @@ static int __dev_close_many(struct list_head *head) dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } - - return 0; } -static int __dev_close(struct net_device *dev) +static void __dev_close(struct net_device *dev) { - int retval; LIST_HEAD(single); list_add(&dev->close_list, &single); - retval = __dev_close_many(&single); + __dev_close_many(&single); list_del(&single); - - return retval; } -int dev_close_many(struct list_head *head, bool unlink) +void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1488,8 +1483,6 @@ int dev_close_many(struct list_head *head, bool unlink) if (unlink) list_del_init(&dev->close_list); } - - return 0; } EXPORT_SYMBOL(dev_close_many); @@ 
-1502,7 +1495,7 @@ EXPORT_SYMBOL(dev_close_many); * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ -int dev_close(struct net_device *dev) +void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1511,7 +1504,6 @@ int dev_close(struct net_device *dev) dev_close_many(&single, true); list_del(&single); } - return 0; } EXPORT_SYMBOL(dev_close); @@ -6725,8 +6717,12 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) */ ret = 0; - if ((old_flags ^ flags) & IFF_UP) - ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + if ((old_flags ^ flags) & IFF_UP) { + if (old_flags & IFF_UP) + __dev_close(dev); + else + ret = __dev_open(dev); + } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; -- cgit v1.2.3 From e0be864f14240cb1bd92247a672796239d6ef2fa Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Wed, 19 Jul 2017 21:45:11 +0300 Subject: ARC: reset: introduce HSDKv1 reset driver The HSDK v1 periphery IPs can be reset by accessing some registers from the CGU block. The list of available reset lines is documented in the DT bindings. Signed-off-by: Eugeniy Paltsev Signed-off-by: Philipp Zabel --- MAINTAINERS | 7 ++ drivers/reset/Kconfig | 6 ++ drivers/reset/Makefile | 1 + drivers/reset/reset-hsdk-v1.c | 137 +++++++++++++++++++++++++ include/dt-bindings/reset/snps,hsdk-v1-reset.h | 17 +++ 5 files changed, 168 insertions(+) create mode 100644 drivers/reset/reset-hsdk-v1.c create mode 100644 include/dt-bindings/reset/snps,hsdk-v1-reset.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 205d3977ac46..57853844969b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11662,6 +11662,13 @@ L: linux-mmc@vger.kernel.org S: Maintained F: drivers/mmc/host/dw_mmc* +SYNOPSYS HSDK RESET CONTROLLER DRIVER +M: Eugeniy Paltsev +S: Supported +F: drivers/reset/reset-hsdk-v1.c +F: include/dt-bindings/reset/snps,hsdk-v1-reset.h +F: Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt + SYSTEM TRACE MODULE CLASS M: Alexander Shishkin S: Maintained diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index 608c071e4bbf..c4c4a4cbcf49 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -41,6 +41,12 @@ config RESET_GEMINI help This enables the reset controller driver for Cortina Systems Gemini. +config RESET_HSDK_V1 + bool "HSDK v1 Reset Driver" + default n + help + This enables the reset controller driver for HSDK v1. + config RESET_IMX7 bool "i.MX7 Reset Driver" if COMPILE_TEST default SOC_IMX7D diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile index 7081f9da2599..9373040efc63 100644 --- a/drivers/reset/Makefile +++ b/drivers/reset/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_RESET_A10SR) += reset-a10sr.o obj-$(CONFIG_RESET_ATH79) += reset-ath79.o obj-$(CONFIG_RESET_BERLIN) += reset-berlin.o obj-$(CONFIG_RESET_GEMINI) += reset-gemini.o +obj-$(CONFIG_RESET_HSDK_V1) += reset-hsdk-v1.o obj-$(CONFIG_RESET_IMX7) += reset-imx7.o obj-$(CONFIG_RESET_LPC18XX) += reset-lpc18xx.o obj-$(CONFIG_RESET_MESON) += reset-meson.o diff --git a/drivers/reset/reset-hsdk-v1.c b/drivers/reset/reset-hsdk-v1.c new file mode 100644 index 000000000000..bca13e4bf622 --- /dev/null +++ b/drivers/reset/reset-hsdk-v1.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2017 Synopsys. + * + * Synopsys HSDKv1 SDP reset driver. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. 
This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define to_hsdkv1_rst(p) container_of((p), struct hsdkv1_rst, rcdev) + +struct hsdkv1_rst { + void __iomem *regs_ctl; + void __iomem *regs_rst; + spinlock_t lock; + struct reset_controller_dev rcdev; +}; + +static const u32 rst_map[] = { + BIT(16), /* APB_RST */ + BIT(17), /* AXI_RST */ + BIT(18), /* ETH_RST */ + BIT(19), /* USB_RST */ + BIT(20), /* SDIO_RST */ + BIT(21), /* HDMI_RST */ + BIT(22), /* GFX_RST */ + BIT(25), /* DMAC_RST */ + BIT(31), /* EBI_RST */ +}; + +#define HSDK_MAX_RESETS ARRAY_SIZE(rst_map) + +#define CGU_SYS_RST_CTRL 0x0 +#define CGU_IP_SW_RESET 0x0 +#define CGU_IP_SW_RESET_DELAY_SHIFT 16 +#define CGU_IP_SW_RESET_DELAY_MASK GENMASK(31, CGU_IP_SW_RESET_DELAY_SHIFT) +#define CGU_IP_SW_RESET_DELAY 0 +#define CGU_IP_SW_RESET_RESET BIT(0) +#define SW_RESET_TIMEOUT 10000 + +static void hsdkv1_reset_config(struct hsdkv1_rst *rst, unsigned long id) +{ + writel(rst_map[id], rst->regs_ctl + CGU_SYS_RST_CTRL); +} + +static int hsdkv1_reset_do(struct hsdkv1_rst *rst) +{ + u32 reg; + + reg = readl(rst->regs_rst + CGU_IP_SW_RESET); + reg &= ~CGU_IP_SW_RESET_DELAY_MASK; + reg |= CGU_IP_SW_RESET_DELAY << CGU_IP_SW_RESET_DELAY_SHIFT; + reg |= CGU_IP_SW_RESET_RESET; + writel(reg, rst->regs_rst + CGU_IP_SW_RESET); + + /* wait till reset bit is back to 0 */ + return readl_poll_timeout_atomic(rst->regs_rst + CGU_IP_SW_RESET, reg, + !(reg & CGU_IP_SW_RESET_RESET), 5, SW_RESET_TIMEOUT); +} + +static int hsdkv1_reset_reset(struct reset_controller_dev *rcdev, + unsigned long id) +{ + struct hsdkv1_rst *rst = to_hsdkv1_rst(rcdev); + unsigned long flags; + int ret; + + spin_lock_irqsave(&rst->lock, flags); + hsdkv1_reset_config(rst, id); + ret = hsdkv1_reset_do(rst); + spin_unlock_irqrestore(&rst->lock, flags); + + return ret; +} + +static const struct reset_control_ops hsdkv1_reset_ops = { + .reset = hsdkv1_reset_reset, +}; + +static int hsdkv1_reset_probe(struct platform_device *pdev) +{ + struct hsdkv1_rst *rst; + struct resource *mem; + + rst = devm_kzalloc(&pdev->dev, sizeof(*rst), GFP_KERNEL); + if (!rst) + return -ENOMEM; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rst->regs_ctl = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(rst->regs_ctl)) + return PTR_ERR(rst->regs_ctl); + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 1); + rst->regs_rst = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(rst->regs_rst)) + return PTR_ERR(rst->regs_rst); + + spin_lock_init(&rst->lock); + + rst->rcdev.owner = THIS_MODULE; + rst->rcdev.ops = &hsdkv1_reset_ops; + rst->rcdev.of_node = pdev->dev.of_node; + rst->rcdev.nr_resets = HSDK_MAX_RESETS; + rst->rcdev.of_reset_n_cells = 1; + + return reset_controller_register(&rst->rcdev); +} + +static const struct of_device_id hsdkv1_reset_dt_match[] = { + { .compatible = "snps,hsdk-v1.0-reset" }, + { }, +}; + +static struct platform_driver hsdkv1_reset_driver = { + .probe = hsdkv1_reset_probe, + .driver = { + .name = "hsdk-v1.0-reset", + .of_match_table = hsdkv1_reset_dt_match, + }, +}; +builtin_platform_driver(hsdkv1_reset_driver); + +MODULE_AUTHOR("Eugeniy Paltsev "); +MODULE_DESCRIPTION("Synopsys HSDKv1 SDP reset driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/dt-bindings/reset/snps,hsdk-v1-reset.h b/include/dt-bindings/reset/snps,hsdk-v1-reset.h new file mode 100644 index 000000000000..d898c89b7123 --- /dev/null +++ 
b/include/dt-bindings/reset/snps,hsdk-v1-reset.h @@ -0,0 +1,17 @@ +/** + * This header provides index for the HSDK v1 reset controller. + */ +#ifndef _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1 +#define _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1 + +#define HSDK_V1_APB_RESET 0 +#define HSDK_V1_AXI_RESET 1 +#define HSDK_V1_ETH_RESET 2 +#define HSDK_V1_USB_RESET 3 +#define HSDK_V1_SDIO_RESET 4 +#define HSDK_V1_HDMI_RESET 5 +#define HSDK_V1_GFX_RESET 6 +#define HSDK_V1_DMAC_RESET 7 +#define HSDK_V1_EBI_RESET 8 + +#endif /*_DT_BINDINGS_RESET_CONTROLLER_HSDK_V1*/ -- cgit v1.2.3 From f1923010b03c3c41c332dd11b93022d049da3104 Mon Sep 17 00:00:00 2001 From: Todor Tomov Date: Fri, 7 Jul 2017 04:48:47 -0400 Subject: media: v4l2-mediabus: Add helper functions Add helper functions for mbus to/from mplane pixel format conversion. Signed-off-by: Todor Tomov Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-mediabus.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/media/v4l2-mediabus.h b/include/media/v4l2-mediabus.h index 315c167a95dc..93f8afcb7a22 100644 --- a/include/media/v4l2-mediabus.h +++ b/include/media/v4l2-mediabus.h @@ -117,4 +117,30 @@ static inline void v4l2_fill_mbus_format(struct v4l2_mbus_framefmt *mbus_fmt, mbus_fmt->code = code; } +static inline void v4l2_fill_pix_format_mplane( + struct v4l2_pix_format_mplane *pix_mp_fmt, + const struct v4l2_mbus_framefmt *mbus_fmt) +{ + pix_mp_fmt->width = mbus_fmt->width; + pix_mp_fmt->height = mbus_fmt->height; + pix_mp_fmt->field = mbus_fmt->field; + pix_mp_fmt->colorspace = mbus_fmt->colorspace; + pix_mp_fmt->ycbcr_enc = mbus_fmt->ycbcr_enc; + pix_mp_fmt->quantization = mbus_fmt->quantization; + pix_mp_fmt->xfer_func = mbus_fmt->xfer_func; +} + +static inline void v4l2_fill_mbus_format_mplane( + struct v4l2_mbus_framefmt *mbus_fmt, + const struct v4l2_pix_format_mplane *pix_mp_fmt) +{ + mbus_fmt->width = pix_mp_fmt->width; + mbus_fmt->height = pix_mp_fmt->height; + mbus_fmt->field = pix_mp_fmt->field; + mbus_fmt->colorspace = pix_mp_fmt->colorspace; + mbus_fmt->ycbcr_enc = pix_mp_fmt->ycbcr_enc; + mbus_fmt->quantization = pix_mp_fmt->quantization; + mbus_fmt->xfer_func = pix_mp_fmt->xfer_func; +} + #endif -- cgit v1.2.3 From a2b426267c56773201f968fdb5eda6ab9ae94e34 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 29 Apr 2017 14:12:15 -0500 Subject: userns,pidns: Verify the userns for new pid namespaces It is pointless and confusing to allow a pid namespace hierarchy and the user namespace hierarchy to get out of sync. The owner of a child pid namespace should be the owner of the parent pid namespace or a descendant of the owner of the parent pid namespace. Otherwise it is possible to construct scenarios where a process has a capability over a parent pid namespace but does not have the capability over a child pid namespace. Which confusingly makes permission checks non-transitive. It requires use of setns into a pid namespace (but not into a user namespace) to create such a scenario. Add the function in_userns to help in making this determination. v2: Optimized in_userns by using level as suggested by: Kirill Tkhai Ref: 49f4d8b93ccf ("pidns: Capture the user namespace and filter ns_last_pid") Signed-off-by: "Eric W. 
Biederman" --- include/linux/user_namespace.h | 9 ++++++++- kernel/pid_namespace.c | 4 ++++ kernel/user_namespace.c | 20 ++++++++++++-------- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 32354b4b4b2b..4005877bb8b6 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -112,8 +112,9 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); +extern bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); - struct ns_common *ns_get_owner(struct ns_common *ns); #else @@ -144,6 +145,12 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns) return true; } +static inline bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child) +{ + return true; +} + static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 74a5a7255b4d..4918314893bc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns int i; int err; + err = -EINVAL; + if (!in_userns(parent_pid_ns->user_ns, user_ns)) + goto out; + err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2f735cbe05e8..c490f1e4313b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns) } /* - * Returns true if @ns is the same namespace as or a descendant of - * @target_ns. + * Returns true if @child is the same namespace or a descendant of + * @ancestor. */ +bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child) +{ + const struct user_namespace *ns; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return (ns == ancestor); +} + bool current_in_userns(const struct user_namespace *target_ns) { - struct user_namespace *ns; - for (ns = current_user_ns(); ns; ns = ns->parent) { - if (ns == target_ns) - return true; - } - return false; + return in_userns(target_ns, current_user_ns()); } static inline struct user_namespace *to_user_ns(struct ns_common *ns) -- cgit v1.2.3 From 0c2021c047baa75f88996f2c732ecbac3e4fba29 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Jul 2017 15:22:51 +0800 Subject: ACPICA: IORT: Update SMMU models for revision C ACPICA commit d00a4eb86e64bb4fa70f57ab5e5ca0a4ca2ad8ef IORT revision C has been published with a number of new SMMU implementation identifiers; define them. Link: https://github.com/acpica/acpica/commit/d00a4eb8 Signed-off-by: Robin Murphy Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. 
Wysocki --- include/acpi/actbl2.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 707dda74c272..5829eedbf3ff 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -664,7 +664,7 @@ struct acpi_ibft_target { * IORT - IO Remapping Table * * Conforms to "IO Remapping Table System Software on ARM Platforms", - * Document number: ARM DEN 0049B, October 2015 + * Document number: ARM DEN 0049C, May 2017 * ******************************************************************************/ @@ -779,6 +779,8 @@ struct acpi_iort_smmu { #define ACPI_IORT_SMMU_V2 0x00000001 /* Generic SMMUv2 */ #define ACPI_IORT_SMMU_CORELINK_MMU400 0x00000002 /* ARM Corelink MMU-400 */ #define ACPI_IORT_SMMU_CORELINK_MMU500 0x00000003 /* ARM Corelink MMU-500 */ +#define ACPI_IORT_SMMU_CORELINK_MMU401 0x00000004 /* ARM Corelink MMU-401 */ +#define ACPI_IORT_SMMU_CAVIUM_THUNDERX 0x00000005 /* Cavium thunder_x SMMUv2 */ /* Masks for Flags field above */ @@ -799,13 +801,19 @@ struct acpi_iort_smmu_v3 { u32 flags; u32 reserved; u64 vatos_address; - u32 model; /* O: generic SMMUv3 */ + u32 model; u32 event_gsiv; u32 pri_gsiv; u32 gerr_gsiv; u32 sync_gsiv; }; +/* Values for Model field above */ + +#define ACPI_IORT_SMMU_V3_GENERIC 0x00000000 /* Generic SMMUv3 */ +#define ACPI_IORT_SMMU_V3_HISILICON_HI161X 0x00000001 /* hi_silicon Hi161x SMMUv3 */ +#define ACPI_IORT_SMMU_V3_CAVIUM_CN99XX 0x00000002 /* Cavium CN99xx SMMUv3 */ + /* Masks for Flags field above */ #define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE (1) -- cgit v1.2.3 From 99e597adf60e1a6f83ec746db8cab225b16a838b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Jul 2017 15:23:08 +0800 Subject: Back port of "ACPICA: Use designated initializers" ACPICA commit 47538f5f0773c0820d8f552e20f6e77104290c01 The following commit was not correctly linuxized from its ACPICA form (see link #1 for reference): Commit: 3d867f6c5fd6535cdeceef3170e5e84e5dd80fc1 Subject: ACPICA: Use designated initializers This breaks the linuxize process. This patch is the linuxized back port of the upstreamed ACPICA commit (see link #2 for reference). Link: https://github.com/acpica/acpica/pull/248/ [#1] Link: https://github.com/acpica/acpica/commit/47538f5f [#2] Signed-off-by: Kees Cook Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/hwxfsleep.c | 9 ++++++--- include/acpi/platform/acenv.h | 9 +++++---- include/acpi/platform/aclinux.h | 3 +++ 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c index 7ef13934968f..e5c095ca6083 100644 --- a/drivers/acpi/acpica/hwxfsleep.c +++ b/drivers/acpi/acpica/hwxfsleep.c @@ -72,13 +72,16 @@ static acpi_status acpi_hw_sleep_dispatch(u8 sleep_state, u32 function_id); static struct acpi_sleep_functions acpi_sleep_dispatch[] = { {ACPI_STRUCT_INIT(legacy_function, ACPI_HW_OPTIONAL_FUNCTION(acpi_hw_legacy_sleep)), - ACPI_STRUCT_INIT(extended_function, acpi_hw_extended_sleep) }, + ACPI_STRUCT_INIT(extended_function, + acpi_hw_extended_sleep)}, {ACPI_STRUCT_INIT(legacy_function, ACPI_HW_OPTIONAL_FUNCTION(acpi_hw_legacy_wake_prep)), - ACPI_STRUCT_INIT(extended_function, acpi_hw_extended_wake_prep) }, + ACPI_STRUCT_INIT(extended_function, + acpi_hw_extended_wake_prep)}, {ACPI_STRUCT_INIT(legacy_function, ACPI_HW_OPTIONAL_FUNCTION(acpi_hw_legacy_wake)), - ACPI_STRUCT_INIT(extended_function, acpi_hw_extended_wake) } + ACPI_STRUCT_INIT(extended_function, + acpi_hw_extended_wake)} }; /* diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h index 912563c66948..043fd559de6e 100644 --- a/include/acpi/platform/acenv.h +++ b/include/acpi/platform/acenv.h @@ -288,6 +288,11 @@ #define ACPI_INLINE #endif +/* Use ordered initialization if compiler doesn't support designated. */ +#ifndef ACPI_STRUCT_INIT +#define ACPI_STRUCT_INIT(field, value) value +#endif + /* * Configurable calling conventions: * @@ -382,8 +387,4 @@ #define ACPI_INIT_FUNCTION #endif -#ifndef ACPI_STRUCT_INIT -#define ACPI_STRUCT_INIT(field, value) value -#endif - #endif /* __ACENV_H__ */ diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 047f13865608..afd95f28c8f0 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -178,6 +178,9 @@ #define ACPI_MSG_BIOS_ERROR KERN_ERR "ACPI BIOS Error (bug): " #define ACPI_MSG_BIOS_WARNING KERN_WARNING "ACPI BIOS Warning (bug): " +/* + * Linux wants to use designated initializers for function pointer structs. + */ #define ACPI_STRUCT_INIT(field, value) .field = value #else /* !__KERNEL__ */ -- cgit v1.2.3 From 023e2ee16c51da7f6a9455ac936e4fb00295f47a Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Mon, 10 Jul 2017 15:23:45 +0800 Subject: ACPICA: Tables: Change table duplication check to be related to acpi_gbl_verify_table_checksum ACPICA commit 3d837b5d4b1033942b4d91c7d3801a09c3157918 acpi_gbl_verify_table_checksum is used to avoid validating (mapping) an entire table in the OS boot stage. The second "Reload" check in acpi_tb_install_standard_table() serves the same purpose. So this patch combines them, using a renamed acpi_gbl_enable_table_validation flag. Lv Zheng. Link: https://github.com/acpica/acpica/commit/3d837b5d Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/acpica/tbdata.c | 4 ++-- drivers/acpi/acpica/tbinstal.c | 2 +- drivers/acpi/acpica/tbxface.c | 8 ++++++++ drivers/acpi/bus.c | 3 --- drivers/acpi/tables.c | 4 ++-- include/acpi/acpixf.h | 15 ++++++++------- 6 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c index 7056ca01e875..24d99711b2a4 100644 --- a/drivers/acpi/acpica/tbdata.c +++ b/drivers/acpi/acpica/tbdata.c @@ -338,7 +338,7 @@ void acpi_tb_invalidate_table(struct acpi_table_desc *table_desc) acpi_status acpi_tb_validate_temp_table(struct acpi_table_desc *table_desc) { - if (!table_desc->pointer && !acpi_gbl_verify_table_checksum) { + if (!table_desc->pointer && !acpi_gbl_enable_table_validation) { /* * Only validates the header of the table. * Note that Length contains the size of the mapping after invoking @@ -394,7 +394,7 @@ acpi_tb_verify_temp_table(struct acpi_table_desc *table_desc, char *signature) /* Verify the checksum */ - if (acpi_gbl_verify_table_checksum) { + if (acpi_gbl_enable_table_validation) { status = acpi_tb_verify_checksum(table_desc->pointer, table_desc->length); diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index 9d212967fad8..f7bc362bd3c5 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -221,7 +221,7 @@ acpi_tb_install_standard_table(acpi_physical_address address, (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); - if (reload) { + if (acpi_gbl_enable_table_validation) { /* Check if table is already registered */ diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index 18508b2c0c05..38c01049afd5 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -194,6 +194,14 @@ acpi_status ACPI_INIT_FUNCTION acpi_reallocate_root_table(void) } } + if (!acpi_gbl_enable_table_validation) { + /* + * Now it's safe to do full table validation. We can do deferred + * table initilization here once the flag is set. + */ + acpi_gbl_enable_table_validation = TRUE; + } + acpi_gbl_root_table_list.flags |= ACPI_ROOT_ALLOW_RESIZE; status = acpi_tb_resize_root_table_list(); diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index af74b420ec83..59f2f96fdb7e 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -995,9 +995,6 @@ void __init acpi_early_init(void) printk(KERN_INFO PREFIX "Core revision %08x\n", ACPI_CA_VERSION); - /* It's safe to verify table checksums during late stage */ - acpi_gbl_verify_table_checksum = TRUE; - /* enable workarounds, unless strict ACPI spec. 
compliance */ if (!acpi_strict) acpi_gbl_enable_interpreter_slack = TRUE; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index ff425390bfa8..80ce2a7d224b 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -740,10 +740,10 @@ int __init acpi_table_init(void) if (acpi_verify_table_checksum) { pr_info("Early table checksum verification enabled\n"); - acpi_gbl_verify_table_checksum = TRUE; + acpi_gbl_enable_table_validation = TRUE; } else { pr_info("Early table checksum verification disabled\n"); - acpi_gbl_verify_table_checksum = FALSE; + acpi_gbl_enable_table_validation = FALSE; } status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0); diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index a59c44c3edd8..12dac84c98c9 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -160,13 +160,14 @@ ACPI_INIT_GLOBAL(u8, acpi_gbl_create_osi_method, TRUE); ACPI_INIT_GLOBAL(u8, acpi_gbl_use_default_register_widths, TRUE); /* - * Whether or not to verify the table checksum before installation. Set - * this to TRUE to verify the table checksum before install it to the table - * manager. Note that enabling this option causes errors to happen in some - * OSPMs during early initialization stages. Default behavior is to do such - * verification. - */ -ACPI_INIT_GLOBAL(u8, acpi_gbl_verify_table_checksum, TRUE); + * Whether or not to validate (map) an entire table to verify + * checksum/duplication in early stage before install. Set this to TRUE to + * allow early table validation before install it to the table manager. + * Note that enabling this option causes errors to happen in some OSPMs + * during early initialization stages. Default behavior is to allow such + * validation. + */ +ACPI_INIT_GLOBAL(u8, acpi_gbl_enable_table_validation, TRUE); /* * Optionally enable output from the AML Debug Object. -- cgit v1.2.3 From 19df56bdf0833a8ad9dfe056931696b9e0e30463 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Mon, 10 Jul 2017 15:23:56 +0800 Subject: ACPICA: Tables: Add deferred table verification support ACPICA commit 2dd6c151d5d5e76dacba8f7db9e259fc72982d17 ACPICA commit ffddee6638aced83be18b8bc88569586c1a43e03 This patch allows tables that were not verified in the early stage to be verified in acpi_reallocate_root_table(). This is useful for OSPMs like Linux, where tables cannot be verified in the early stage due to early ioremap limitations on some architectures. Reported by Hans de Goede, fixed by Lv Zheng. Link: https://github.com/acpica/acpica/commit/2dd6c151 Link: https://github.com/acpica/acpica/commit/ffddee66 Reported-by: Hans de Goede Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/tbdata.c | 34 ++++++++++++++++++++++++++++------ drivers/acpi/acpica/tbxface.c | 27 ++++++++++++++++++++++----- include/acpi/actbl.h | 1 + 3 files changed, 51 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c index 577361b05ea9..b19a2f0ea331 100644 --- a/drivers/acpi/acpica/tbdata.c +++ b/drivers/acpi/acpica/tbdata.c @@ -432,6 +432,15 @@ acpi_tb_check_duplication(struct acpi_table_desc *table_desc, u32 *table_index) /* Check if table is already registered */ for (i = 0; i < acpi_gbl_root_table_list.current_table_count; ++i) { + + /* Do not compare with unverified tables */ + + if (! + (acpi_gbl_root_table_list.tables[i]. + flags & ACPI_TABLE_IS_VERIFIED)) { + continue; + } + /* * Check for a table match on the entire table length, * not just the header.
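To make the deferred flow concrete, here is a compact, self-contained C model of the behavior this changelog describes. It is an illustrative sketch only: ACPI_TABLE_IS_VERIFIED and the validation flag mirror the real ACPICA symbols touched by this patch, while the structures and helper names (table_desc, checksum_ok, verify_deferred_tables) are simplified stand-ins, not ACPICA code.

/*
 * Sketch of deferred table verification. Early in boot the validation
 * flag is clear, so installation skips the full mapping and checksum
 * test; a later pass (modeling acpi_reallocate_root_table()) sets the
 * flag and verifies every table that was deferred.
 */
#include <stdbool.h>
#include <stddef.h>

#define ACPI_TABLE_IS_VERIFIED 4	/* matches the new actbl.h flag */

struct table_desc {
	const unsigned char *data;
	size_t length;
	unsigned int flags;
};

static bool enable_table_validation;	/* models acpi_gbl_enable_table_validation */

/* ACPI checksum rule: all bytes of a table must sum to zero (mod 256). */
static bool checksum_ok(const struct table_desc *t)
{
	unsigned char sum = 0;
	size_t i;

	for (i = 0; i < t->length; i++)
		sum += t->data[i];
	return sum == 0;
}

static bool verify_table(struct table_desc *t)
{
	if (!checksum_ok(t))
		return false;	/* the real code would uninstall the table here */
	t->flags |= ACPI_TABLE_IS_VERIFIED;
	return true;
}

/* Early boot: verification is simply skipped while the flag is clear. */
static void install_table(struct table_desc *t)
{
	if (enable_table_validation)
		verify_table(t);
}

/* Late stage: verify everything that was deferred at install time. */
static void verify_deferred_tables(struct table_desc *tables, size_t count)
{
	size_t i;

	enable_table_validation = true;
	for (i = 0; i < count; i++)
		if (!(tables[i].flags & ACPI_TABLE_IS_VERIFIED))
			verify_table(&tables[i]);
}

This also explains the hunk above: the duplication check skips any descriptor whose ACPI_TABLE_IS_VERIFIED bit is still clear, so a table that has not yet been mapped and checksummed can never produce a false duplicate match.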
@@ -483,6 +492,8 @@ acpi_tb_check_duplication(struct acpi_table_desc *table_desc, u32 *table_index) * * DESCRIPTION: This function is called to validate and verify the table, the * returned table descriptor is in "VALIDATED" state. + * Note that 'TableIndex' is required to be set to !NULL to + * enable duplication check. * *****************************************************************************/ @@ -554,6 +565,8 @@ acpi_tb_verify_temp_table(struct acpi_table_desc *table_desc, goto invalidate_and_exit; } } + + table_desc->flags |= ACPI_TABLE_IS_VERIFIED; } return_ACPI_STATUS(status); @@ -579,6 +592,8 @@ acpi_status acpi_tb_resize_root_table_list(void) { struct acpi_table_desc *tables; u32 table_count; + u32 current_table_count, max_table_count; + u32 i; ACPI_FUNCTION_TRACE(tb_resize_root_table_list); @@ -598,8 +613,8 @@ acpi_status acpi_tb_resize_root_table_list(void) table_count = acpi_gbl_root_table_list.current_table_count; } - tables = ACPI_ALLOCATE_ZEROED(((acpi_size)table_count + - ACPI_ROOT_TABLE_SIZE_INCREMENT) * + max_table_count = table_count + ACPI_ROOT_TABLE_SIZE_INCREMENT; + tables = ACPI_ALLOCATE_ZEROED(((acpi_size)max_table_count) * sizeof(struct acpi_table_desc)); if (!tables) { ACPI_ERROR((AE_INFO, @@ -609,9 +624,16 @@ acpi_status acpi_tb_resize_root_table_list(void) /* Copy and free the previous table array */ + current_table_count = 0; if (acpi_gbl_root_table_list.tables) { - memcpy(tables, acpi_gbl_root_table_list.tables, - (acpi_size)table_count * sizeof(struct acpi_table_desc)); + for (i = 0; i < table_count; i++) { + if (acpi_gbl_root_table_list.tables[i].address) { + memcpy(tables + current_table_count, + acpi_gbl_root_table_list.tables + i, + sizeof(struct acpi_table_desc)); + current_table_count++; + } + } if (acpi_gbl_root_table_list.flags & ACPI_ROOT_ORIGIN_ALLOCATED) { ACPI_FREE(acpi_gbl_root_table_list.tables); @@ -619,8 +641,8 @@ acpi_status acpi_tb_resize_root_table_list(void) } acpi_gbl_root_table_list.tables = tables; - acpi_gbl_root_table_list.max_table_count = - table_count + ACPI_ROOT_TABLE_SIZE_INCREMENT; + acpi_gbl_root_table_list.max_table_count = max_table_count; + acpi_gbl_root_table_list.current_table_count = current_table_count; acpi_gbl_root_table_list.flags |= ACPI_ROOT_ORIGIN_ALLOCATED; return_ACPI_STATUS(AE_OK); diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index 38c01049afd5..26ad596c973e 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -167,7 +167,8 @@ ACPI_EXPORT_SYMBOL_INIT(acpi_initialize_tables) acpi_status ACPI_INIT_FUNCTION acpi_reallocate_root_table(void) { acpi_status status; - u32 i; + struct acpi_table_desc *table_desc; + u32 i, j; ACPI_FUNCTION_TRACE(acpi_reallocate_root_table); @@ -179,6 +180,8 @@ acpi_status ACPI_INIT_FUNCTION acpi_reallocate_root_table(void) return_ACPI_STATUS(AE_SUPPORT); } + (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); + /* * Ensure OS early boot logic, which is required by some hosts. If the * table state is reported to be wrong, developers should fix the @@ -186,11 +189,11 @@ acpi_status ACPI_INIT_FUNCTION acpi_reallocate_root_table(void) * early stage. */ for (i = 0; i < acpi_gbl_root_table_list.current_table_count; ++i) { - if (acpi_gbl_root_table_list.tables[i].pointer) { + table_desc = &acpi_gbl_root_table_list.tables[i]; + if (table_desc->pointer) { ACPI_ERROR((AE_INFO, "Table [%4.4s] is not invalidated during early boot stage", - acpi_gbl_root_table_list.tables[i]. 
- signature.ascii)); + table_desc->signature.ascii)); } } @@ -200,11 +203,25 @@ acpi_status ACPI_INIT_FUNCTION acpi_reallocate_root_table(void) * table initilization here once the flag is set. */ acpi_gbl_enable_table_validation = TRUE; + for (i = 0; i < acpi_gbl_root_table_list.current_table_count; + ++i) { + table_desc = &acpi_gbl_root_table_list.tables[i]; + if (!(table_desc->flags & ACPI_TABLE_IS_VERIFIED)) { + status = + acpi_tb_verify_temp_table(table_desc, NULL, + &j); + if (ACPI_FAILURE(status)) { + acpi_tb_uninstall_table(table_desc); + } + } + } } acpi_gbl_root_table_list.flags |= ACPI_ROOT_ALLOW_RESIZE; - status = acpi_tb_resize_root_table_list(); + acpi_gbl_root_table_list.flags |= ACPI_ROOT_ORIGIN_ALLOCATED; + + (void)acpi_ut_release_mutex(ACPI_MTX_TABLES); return_ACPI_STATUS(status); } diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index bdc55c0da19c..89509b86cb54 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -394,6 +394,7 @@ struct acpi_table_desc { #define ACPI_TABLE_ORIGIN_INTERNAL_PHYSICAL (1) /* Physical address, internally mapped */ #define ACPI_TABLE_ORIGIN_INTERNAL_VIRTUAL (2) /* Virtual address, internallly allocated */ #define ACPI_TABLE_ORIGIN_MASK (3) +#define ACPI_TABLE_IS_VERIFIED (4) #define ACPI_TABLE_IS_LOADED (8) /* -- cgit v1.2.3 From c944230064eb65e4fa018d86779b4fd200b1d7e7 Mon Sep 17 00:00:00 2001 From: Ganapatrao Kulkarni Date: Mon, 10 Jul 2017 15:24:15 +0800 Subject: ACPICA: iasl: Update to IORT SMMUv3 disassembling ACPICA commit 8cadc4fb500e2aa52241e367c87a0f95d9760c58 ARM IORT specification has provision to define Proximity domain in SMMUv3 IORT table. Adding required changes to decode Proximity domain of SMMUv3 IORT table. Link: https://github.com/acpica/acpica/commit/8cadc4fb Signed-off-by: Ganapatrao Kulkarni Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 5829eedbf3ff..d568948d3a82 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -806,6 +806,9 @@ struct acpi_iort_smmu_v3 { u32 pri_gsiv; u32 gerr_gsiv; u32 sync_gsiv; + u8 pxm; + u8 reserved1; + u16 reserved2; }; /* Values for Model field above */ @@ -818,6 +821,7 @@ struct acpi_iort_smmu_v3 { #define ACPI_IORT_SMMU_V3_COHACC_OVERRIDE (1) #define ACPI_IORT_SMMU_V3_HTTU_OVERRIDE (1<<1) +#define ACPI_IORT_SMMU_V3_PXM_VALID (1<<3) /******************************************************************************* * -- cgit v1.2.3 From ab539eaa50c6a97f1d6287c5ce3ee75155cb10b8 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 10 Jul 2017 15:24:27 +0800 Subject: ACPICA: Update version to 20170629 ACPICA commit 7271c1c54c095c06ed9e7d28641f2356da840038 Version 20170629 Link: https://github.com/acpica/acpica/commit/7271c1c5 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 12dac84c98c9..06b87adc97ef 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -46,7 +46,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20170531 +#define ACPI_CA_VERSION 0x20170629 #include #include -- cgit v1.2.3 From d75cf0144f150272be806b69b4e62553ba07ea1b Mon Sep 17 00:00:00 2001 From: Prabhakar Lad Date: Thu, 20 Jul 2017 04:56:31 -0400 Subject: media: platform: davinci: drop VPFE_CMD_S_CCDC_RAW_PARAMS Drop the VPFE_CMD_S_CCDC_RAW_PARAMS ioctl from dm355/dm644x for the following reasons: - This ioctl was never part of a public API and was only defined in a kernel header. - The function set_params constantly mixes up pointers and phys_addr_t numbers. - This is part of a 'VPFE_CMD_S_CCDC_RAW_PARAMS' ioctl command that is described as an 'experimental ioctl that will change in future kernels'. - The code to allocate the table never gets called after we copy_from_user the user input over the kernel settings, and then compare them for inequality. - We then go on to use an address provided by user space as both the __user pointer for input and pass it through phys_to_virt to come up with a kernel pointer to copy the data to. This looks like a trivially exploitable root hole. Signed-off-by: Lad, Prabhakar Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/davinci/ccdc_hw_device.h | 10 -- drivers/media/platform/davinci/dm355_ccdc.c | 92 +-------------- drivers/media/platform/davinci/dm644x_ccdc.c | 151 +----------------------- drivers/media/platform/davinci/vpfe_capture.c | 75 ------------ include/media/davinci/dm644x_ccdc.h | 12 -- include/media/davinci/vpfe_capture.h | 10 -- 6 files changed, 4 insertions(+), 346 deletions(-) (limited to 'include') diff --git a/drivers/media/platform/davinci/ccdc_hw_device.h b/drivers/media/platform/davinci/ccdc_hw_device.h index 8f6688a7a111..f1b521045d64 100644 --- a/drivers/media/platform/davinci/ccdc_hw_device.h +++ b/drivers/media/platform/davinci/ccdc_hw_device.h @@ -42,16 +42,6 @@ struct ccdc_hw_ops { int (*set_hw_if_params) (struct vpfe_hw_if_param *param); /* get interface parameters */ int (*get_hw_if_params) (struct vpfe_hw_if_param *param); - /* - * Pointer to function to set parameters. Used - * for implementing VPFE_S_CCDC_PARAMS - */ - int (*set_params) (void *params); - /* - * Pointer to function to get parameter. Used - * for implementing VPFE_G_CCDC_PARAMS - */ - int (*get_params) (void *params); /* Pointer to function to configure ccdc */ int (*configure) (void); diff --git a/drivers/media/platform/davinci/dm355_ccdc.c b/drivers/media/platform/davinci/dm355_ccdc.c index 73db166dc338..6d492dc4c3a9 100644 --- a/drivers/media/platform/davinci/dm355_ccdc.c +++ b/drivers/media/platform/davinci/dm355_ccdc.c @@ -17,12 +17,7 @@ * This module is for configuring DM355 CCD controller of VPFE to capture * Raw yuv or Bayer RGB data from a decoder. CCDC has several modules * such as Defect Pixel Correction, Color Space Conversion etc to - * pre-process the Bayer RGB data, before writing it to SDRAM. This - * module also allows application to configure individual - * module parameters through VPFE_CMD_S_CCDC_RAW_PARAMS IOCTL. - * To do so, application include dm355_ccdc.h and vpfe_capture.h header - * files.
The setparams() API is called by vpfe_capture driver - * to configure module parameters + * pre-process the Bayer RGB data, before writing it to SDRAM. * * TODO: 1) Raw bayer parameter settings and bayer capture * 2) Split module parameter structure to module specific ioctl structs @@ -260,90 +255,6 @@ static void ccdc_setwin(struct v4l2_rect *image_win, dev_dbg(ccdc_cfg.dev, "\nEnd of ccdc_setwin..."); } -static int validate_ccdc_param(struct ccdc_config_params_raw *ccdcparam) -{ - if (ccdcparam->datasft < CCDC_DATA_NO_SHIFT || - ccdcparam->datasft > CCDC_DATA_SHIFT_6BIT) { - dev_dbg(ccdc_cfg.dev, "Invalid value of data shift\n"); - return -EINVAL; - } - - if (ccdcparam->mfilt1 < CCDC_NO_MEDIAN_FILTER1 || - ccdcparam->mfilt1 > CCDC_MEDIAN_FILTER1) { - dev_dbg(ccdc_cfg.dev, "Invalid value of median filter1\n"); - return -EINVAL; - } - - if (ccdcparam->mfilt2 < CCDC_NO_MEDIAN_FILTER2 || - ccdcparam->mfilt2 > CCDC_MEDIAN_FILTER2) { - dev_dbg(ccdc_cfg.dev, "Invalid value of median filter2\n"); - return -EINVAL; - } - - if ((ccdcparam->med_filt_thres < 0) || - (ccdcparam->med_filt_thres > CCDC_MED_FILT_THRESH)) { - dev_dbg(ccdc_cfg.dev, - "Invalid value of median filter threshold\n"); - return -EINVAL; - } - - if (ccdcparam->data_sz < CCDC_DATA_16BITS || - ccdcparam->data_sz > CCDC_DATA_8BITS) { - dev_dbg(ccdc_cfg.dev, "Invalid value of data size\n"); - return -EINVAL; - } - - if (ccdcparam->alaw.enable) { - if (ccdcparam->alaw.gamma_wd < CCDC_GAMMA_BITS_13_4 || - ccdcparam->alaw.gamma_wd > CCDC_GAMMA_BITS_09_0) { - dev_dbg(ccdc_cfg.dev, "Invalid value of ALAW\n"); - return -EINVAL; - } - } - - if (ccdcparam->blk_clamp.b_clamp_enable) { - if (ccdcparam->blk_clamp.sample_pixel < CCDC_SAMPLE_1PIXELS || - ccdcparam->blk_clamp.sample_pixel > CCDC_SAMPLE_16PIXELS) { - dev_dbg(ccdc_cfg.dev, - "Invalid value of sample pixel\n"); - return -EINVAL; - } - if (ccdcparam->blk_clamp.sample_ln < CCDC_SAMPLE_1LINES || - ccdcparam->blk_clamp.sample_ln > CCDC_SAMPLE_16LINES) { - dev_dbg(ccdc_cfg.dev, - "Invalid value of sample lines\n"); - return -EINVAL; - } - } - return 0; -} - -/* Parameter operations */ -static int ccdc_set_params(void __user *params) -{ - struct ccdc_config_params_raw ccdc_raw_params; - int x; - - /* only raw module parameters can be set through the IOCTL */ - if (ccdc_cfg.if_type != VPFE_RAW_BAYER) - return -EINVAL; - - x = copy_from_user(&ccdc_raw_params, params, sizeof(ccdc_raw_params)); - if (x) { - dev_dbg(ccdc_cfg.dev, "ccdc_set_params: error in copying ccdcparams, %d\n", - x); - return -EFAULT; - } - - if (!validate_ccdc_param(&ccdc_raw_params)) { - memcpy(&ccdc_cfg.bayer.config_params, - &ccdc_raw_params, - sizeof(ccdc_raw_params)); - return 0; - } - return -EINVAL; -} - /* This function will configure CCDC for YCbCr video capture */ static void ccdc_config_ycbcr(void) { @@ -939,7 +850,6 @@ static struct ccdc_hw_device ccdc_hw_dev = { .enable = ccdc_enable, .enable_out_to_sdram = ccdc_enable_output_to_sdram, .set_hw_if_params = ccdc_set_hw_if_params, - .set_params = ccdc_set_params, .configure = ccdc_configure, .set_buftype = ccdc_set_buftype, .get_buftype = ccdc_get_buftype, diff --git a/drivers/media/platform/davinci/dm644x_ccdc.c b/drivers/media/platform/davinci/dm644x_ccdc.c index 740fbc7a8c14..3b2d8a9317b8 100644 --- a/drivers/media/platform/davinci/dm644x_ccdc.c +++ b/drivers/media/platform/davinci/dm644x_ccdc.c @@ -17,13 +17,9 @@ * This module is for configuring CCD controller of DM6446 VPFE to capture * Raw yuv or Bayer RGB data from a decoder. 
CCDC has several modules * such as Defect Pixel Correction, Color Space Conversion etc to - * pre-process the Raw Bayer RGB data, before writing it to SDRAM. This - * module also allows application to configure individual - * module parameters through VPFE_CMD_S_CCDC_RAW_PARAMS IOCTL. - * To do so, application includes dm644x_ccdc.h and vpfe_capture.h header - * files. The setparams() API is called by vpfe_capture driver - * to configure module parameters. This file is named DM644x so that other - * variants such DM6443 may be supported using the same module. + * pre-process the Raw Bayer RGB data, before writing it to SDRAM. + * This file is named DM644x so that other variants such DM6443 + * may be supported using the same module. * * TODO: Test Raw bayer parameter settings and bayer capture * Split module parameter structure to module specific ioctl structs @@ -216,96 +212,8 @@ static void ccdc_readregs(void) dev_notice(ccdc_cfg.dev, "\nReading 0x%x to VERT_LINES...\n", val); } -static int validate_ccdc_param(struct ccdc_config_params_raw *ccdcparam) -{ - if (ccdcparam->alaw.enable) { - u8 max_gamma = ccdc_gamma_width_max_bit(ccdcparam->alaw.gamma_wd); - u8 max_data = ccdc_data_size_max_bit(ccdcparam->data_sz); - - if ((ccdcparam->alaw.gamma_wd > CCDC_GAMMA_BITS_09_0) || - (ccdcparam->alaw.gamma_wd < CCDC_GAMMA_BITS_15_6) || - (max_gamma > max_data)) { - dev_dbg(ccdc_cfg.dev, "\nInvalid data line select"); - return -1; - } - } - return 0; -} - -static int ccdc_update_raw_params(struct ccdc_config_params_raw *raw_params) -{ - struct ccdc_config_params_raw *config_params = - &ccdc_cfg.bayer.config_params; - unsigned int *fpc_virtaddr = NULL; - unsigned int *fpc_physaddr = NULL; - - memcpy(config_params, raw_params, sizeof(*raw_params)); - /* - * allocate memory for fault pixel table and copy the user - * values to the table - */ - if (!config_params->fault_pxl.enable) - return 0; - - fpc_physaddr = (unsigned int *)config_params->fault_pxl.fpc_table_addr; - fpc_virtaddr = (unsigned int *)phys_to_virt( - (unsigned long)fpc_physaddr); - /* - * Allocate memory for FPC table if current - * FPC table buffer is not big enough to - * accommodate FPC Number requested - */ - if (raw_params->fault_pxl.fp_num != config_params->fault_pxl.fp_num) { - if (fpc_physaddr != NULL) { - free_pages((unsigned long)fpc_virtaddr, - get_order - (config_params->fault_pxl.fp_num * - FP_NUM_BYTES)); - } - - /* Allocate memory for FPC table */ - fpc_virtaddr = - (unsigned int *)__get_free_pages(GFP_KERNEL | GFP_DMA, - get_order(raw_params-> - fault_pxl.fp_num * - FP_NUM_BYTES)); - - if (fpc_virtaddr == NULL) { - dev_dbg(ccdc_cfg.dev, - "\nUnable to allocate memory for FPC"); - return -EFAULT; - } - fpc_physaddr = - (unsigned int *)virt_to_phys((void *)fpc_virtaddr); - } - - /* Copy number of fault pixels and FPC table */ - config_params->fault_pxl.fp_num = raw_params->fault_pxl.fp_num; - if (copy_from_user(fpc_virtaddr, - (void __user *)raw_params->fault_pxl.fpc_table_addr, - config_params->fault_pxl.fp_num * FP_NUM_BYTES)) { - dev_dbg(ccdc_cfg.dev, "\n copy_from_user failed"); - return -EFAULT; - } - config_params->fault_pxl.fpc_table_addr = (unsigned long)fpc_physaddr; - return 0; -} - static int ccdc_close(struct device *dev) { - struct ccdc_config_params_raw *config_params = - &ccdc_cfg.bayer.config_params; - unsigned int *fpc_physaddr = NULL, *fpc_virtaddr = NULL; - - fpc_physaddr = (unsigned int *)config_params->fault_pxl.fpc_table_addr; - - if (fpc_physaddr != NULL) { - fpc_virtaddr = (unsigned int *) - 
phys_to_virt((unsigned long)fpc_physaddr); - free_pages((unsigned long)fpc_virtaddr, - get_order(config_params->fault_pxl.fp_num * - FP_NUM_BYTES)); - } return 0; } @@ -339,29 +247,6 @@ static void ccdc_sbl_reset(void) vpss_clear_wbl_overflow(VPSS_PCR_CCDC_WBL_O); } -/* Parameter operations */ -static int ccdc_set_params(void __user *params) -{ - struct ccdc_config_params_raw ccdc_raw_params; - int x; - - if (ccdc_cfg.if_type != VPFE_RAW_BAYER) - return -EINVAL; - - x = copy_from_user(&ccdc_raw_params, params, sizeof(ccdc_raw_params)); - if (x) { - dev_dbg(ccdc_cfg.dev, "ccdc_set_params: error in copyingccdc params, %d\n", - x); - return -EFAULT; - } - - if (!validate_ccdc_param(&ccdc_raw_params)) { - if (!ccdc_update_raw_params(&ccdc_raw_params)) - return 0; - } - return -EINVAL; -} - /* * ccdc_config_ycbcr() * This function will configure CCDC for YCbCr video capture @@ -489,32 +374,6 @@ static void ccdc_config_black_compense(struct ccdc_black_compensation *bcomp) regw(val, CCDC_BLKCMP); } -static void ccdc_config_fpc(struct ccdc_fault_pixel *fpc) -{ - u32 val; - - /* Initially disable FPC */ - val = CCDC_FPC_DISABLE; - regw(val, CCDC_FPC); - - if (!fpc->enable) - return; - - /* Configure Fault pixel if needed */ - regw(fpc->fpc_table_addr, CCDC_FPC_ADDR); - dev_dbg(ccdc_cfg.dev, "\nWriting 0x%lx to FPC_ADDR...\n", - (fpc->fpc_table_addr)); - /* Write the FPC params with FPC disable */ - val = fpc->fp_num & CCDC_FPC_FPC_NUM_MASK; - regw(val, CCDC_FPC); - - dev_dbg(ccdc_cfg.dev, "\nWriting 0x%x to FPC...\n", val); - /* read the FPC register */ - val = regr(CCDC_FPC) | CCDC_FPC_ENABLE; - regw(val, CCDC_FPC); - dev_dbg(ccdc_cfg.dev, "\nWriting 0x%x to FPC...\n", val); -} - /* * ccdc_config_raw() * This function will configure CCDC for Raw capture mode @@ -569,9 +428,6 @@ static void ccdc_config_raw(void) /* Configure Black level compensation */ ccdc_config_black_compense(&config_params->blk_comp); - /* Configure Fault Pixel Correction */ - ccdc_config_fpc(&config_params->fault_pxl); - /* If data size is 8 bit then pack the data */ if ((config_params->data_sz == CCDC_DATA_8BITS) || config_params->alaw.enable) @@ -929,7 +785,6 @@ static struct ccdc_hw_device ccdc_hw_dev = { .reset = ccdc_sbl_reset, .enable = ccdc_enable, .set_hw_if_params = ccdc_set_hw_if_params, - .set_params = ccdc_set_params, .configure = ccdc_configure, .set_buftype = ccdc_set_buftype, .get_buftype = ccdc_get_buftype, diff --git a/drivers/media/platform/davinci/vpfe_capture.c b/drivers/media/platform/davinci/vpfe_capture.c index 1831bf5ccca5..b1bf4a7e8eb7 100644 --- a/drivers/media/platform/davinci/vpfe_capture.c +++ b/drivers/media/platform/davinci/vpfe_capture.c @@ -280,45 +280,6 @@ void vpfe_unregister_ccdc_device(struct ccdc_hw_device *dev) } EXPORT_SYMBOL(vpfe_unregister_ccdc_device); -/* - * vpfe_get_ccdc_image_format - Get image parameters based on CCDC settings - */ -static int vpfe_get_ccdc_image_format(struct vpfe_device *vpfe_dev, - struct v4l2_format *f) -{ - struct v4l2_rect image_win; - enum ccdc_buftype buf_type; - enum ccdc_frmfmt frm_fmt; - - memset(f, 0, sizeof(*f)); - f->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; - ccdc_dev->hw_ops.get_image_window(&image_win); - f->fmt.pix.width = image_win.width; - f->fmt.pix.height = image_win.height; - f->fmt.pix.bytesperline = ccdc_dev->hw_ops.get_line_length(); - f->fmt.pix.sizeimage = f->fmt.pix.bytesperline * - f->fmt.pix.height; - buf_type = ccdc_dev->hw_ops.get_buftype(); - f->fmt.pix.pixelformat = ccdc_dev->hw_ops.get_pixel_format(); - frm_fmt = 
ccdc_dev->hw_ops.get_frame_format(); - if (frm_fmt == CCDC_FRMFMT_PROGRESSIVE) - f->fmt.pix.field = V4L2_FIELD_NONE; - else if (frm_fmt == CCDC_FRMFMT_INTERLACED) { - if (buf_type == CCDC_BUFTYPE_FLD_INTERLEAVED) - f->fmt.pix.field = V4L2_FIELD_INTERLACED; - else if (buf_type == CCDC_BUFTYPE_FLD_SEPARATED) - f->fmt.pix.field = V4L2_FIELD_SEQ_TB; - else { - v4l2_err(&vpfe_dev->v4l2_dev, "Invalid buf_type\n"); - return -EINVAL; - } - } else { - v4l2_err(&vpfe_dev->v4l2_dev, "Invalid frm_fmt\n"); - return -EINVAL; - } - return 0; -} - /* * vpfe_config_ccdc_image_format() * For a pix format, configure ccdc to setup the capture @@ -1697,41 +1658,6 @@ unlock_out: return ret; } - -static long vpfe_param_handler(struct file *file, void *priv, - bool valid_prio, unsigned int cmd, void *param) -{ - struct vpfe_device *vpfe_dev = video_drvdata(file); - int ret; - - v4l2_dbg(2, debug, &vpfe_dev->v4l2_dev, "vpfe_param_handler\n"); - - if (vpfe_dev->started) { - /* only allowed if streaming is not started */ - v4l2_dbg(1, debug, &vpfe_dev->v4l2_dev, - "device already started\n"); - return -EBUSY; - } - - ret = mutex_lock_interruptible(&vpfe_dev->lock); - if (ret) - return ret; - - switch (cmd) { - case VPFE_CMD_S_CCDC_RAW_PARAMS: - ret = -EINVAL; - v4l2_warn(&vpfe_dev->v4l2_dev, - "VPFE_CMD_S_CCDC_RAW_PARAMS not supported\n"); - break; - default: - ret = -ENOTTY; - } -unlock_out: - mutex_unlock(&vpfe_dev->lock); - return ret; -} - - /* vpfe capture ioctl operations */ static const struct v4l2_ioctl_ops vpfe_ioctl_ops = { .vidioc_querycap = vpfe_querycap, @@ -1754,7 +1680,6 @@ static const struct v4l2_ioctl_ops vpfe_ioctl_ops = { .vidioc_cropcap = vpfe_cropcap, .vidioc_g_selection = vpfe_g_selection, .vidioc_s_selection = vpfe_s_selection, - .vidioc_default = vpfe_param_handler, }; static struct vpfe_device *vpfe_initialize(void) diff --git a/include/media/davinci/dm644x_ccdc.h b/include/media/davinci/dm644x_ccdc.h index 7c909da29d43..6ea2ce241851 100644 --- a/include/media/davinci/dm644x_ccdc.h +++ b/include/media/davinci/dm644x_ccdc.h @@ -103,16 +103,6 @@ struct ccdc_black_compensation { char gb; }; -/* structure for fault pixel correction */ -struct ccdc_fault_pixel { - /* Enable or Disable fault pixel correction */ - unsigned char enable; - /* Number of fault pixel */ - unsigned short fp_num; - /* Address of fault pixel table */ - unsigned long fpc_table_addr; -}; - /* Structure for CCDC configuration parameters for raw capture mode passed * by application */ @@ -125,8 +115,6 @@ struct ccdc_config_params_raw { struct ccdc_black_clamp blk_clamp; /* Structure for Black Compensation */ struct ccdc_black_compensation blk_comp; - /* Structure for Fault Pixel Module Configuration */ - struct ccdc_fault_pixel fault_pxl; }; diff --git a/include/media/davinci/vpfe_capture.h b/include/media/davinci/vpfe_capture.h index 8e1a4d88daa0..f003533602d0 100644 --- a/include/media/davinci/vpfe_capture.h +++ b/include/media/davinci/vpfe_capture.h @@ -183,14 +183,4 @@ struct vpfe_config_params { }; #endif /* End of __KERNEL__ */ -/** - * VPFE_CMD_S_CCDC_RAW_PARAMS - EXPERIMENTAL IOCTL to set raw capture params - * This can be used to configure modules such as defect pixel correction, - * color space conversion, culling etc. This is an experimental ioctl that - * will change in future kernels. So use this ioctl with care ! 
- TODO: This is to be split into multiple ioctls and also explore the - * possibility of extending the v4l2 api to include this - **/ -#define VPFE_CMD_S_CCDC_RAW_PARAMS _IOW('V', BASE_VIDIOC_PRIVATE + 1, \ - void *) #endif /* _DAVINCI_VPFE_H */ -- cgit v1.2.3 From 4ee236219f6da367093b503b52fcceb609397c6f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 26 Jun 2017 14:07:54 -0400 Subject: media: v4l2-fwnode: suppress a warning at OF parsing logic smatch produces this warning: drivers/media/v4l2-core/v4l2-fwnode.c:76 v4l2_fwnode_endpoint_parse_csi_bus() error: buffer overflow 'array' 5 <= u16max That's because, in theory, the routine might have been called with some value already set at bus->num_data_lanes. That's not currently the case, but it is better to silence this warning and make the code more robust against future changes that might introduce such a bug. While here, simplify the code a little bit by reading the lane property arrays only once. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-fwnode.c | 32 ++++++++++++++------------------ include/media/v4l2-fwnode.h | 6 ++++-- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index ca755a4832fc..d07c54efaa99 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -48,10 +48,9 @@ static int v4l2_fwnode_endpoint_parse_csi2_bus(struct fwnode_handle *fwnode, rval = fwnode_property_read_u32_array(fwnode, "data-lanes", NULL, 0); if (rval > 0) { - u32 array[ARRAY_SIZE(bus->data_lanes)]; + u32 array[MAX_DATA_LANES + 1]; - bus->num_data_lanes = - min_t(int, ARRAY_SIZE(bus->data_lanes), rval); + bus->num_data_lanes = min_t(int, MAX_DATA_LANES, rval); fwnode_property_read_u32_array(fwnode, "data-lanes", array, bus->num_data_lanes); @@ -64,24 +63,21 @@ static int v4l2_fwnode_endpoint_parse_csi2_bus(struct fwnode_handle *fwnode, bus->data_lanes[i] = array[i]; } - } - rval = fwnode_property_read_u32_array(fwnode, "lane-polarities", NULL, - 0); - if (rval > 0) { - u32 array[ARRAY_SIZE(bus->lane_polarities)]; + rval = fwnode_property_read_u32_array(fwnode, + "lane-polarities", array, + 1 + bus->num_data_lanes); + if (rval > 0) { + if (rval != 1 + bus->num_data_lanes /* clock + data */) { + pr_warn("invalid number of lane-polarities entries (need %u, got %u)\n", + 1 + bus->num_data_lanes, rval); + return -EINVAL; + } - if (rval < 1 + bus->num_data_lanes /* clock + data */) { - pr_warn("too few lane-polarities entries (need %u, got %u)\n", - 1 + bus->num_data_lanes, rval); - return -EINVAL; - } - fwnode_property_read_u32_array(fwnode, "lane-polarities", array, - 1 + bus->num_data_lanes); - - for (i = 0; i < 1 + bus->num_data_lanes; i++) - bus->lane_polarities[i] = array[i]; + for (i = 0; i < 1 + bus->num_data_lanes; i++) + bus->lane_polarities[i] = array[i]; + } } if (!fwnode_property_read_u32(fwnode, "clock-lanes", &v)) { diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index 29ae22bbbbaf..b373c43f65e8 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -26,6 +26,8 @@ struct fwnode_handle; +#define MAX_DATA_LANES 4 + /** * struct v4l2_fwnode_bus_mipi_csi2 - MIPI CSI-2 bus data structure * @flags: media bus (V4L2_MBUS_*) flags * @@ -37,10 +39,10 @@ struct v4l2_fwnode_bus_mipi_csi2 { unsigned int flags; - unsigned char data_lanes[4]; + unsigned char data_lanes[MAX_DATA_LANES]; unsigned char clock_lane; unsigned short
num_data_lanes; - bool lane_polarities[5]; + bool lane_polarities[MAX_DATA_LANES + 1]; }; /** -- cgit v1.2.3 From 8bf3560f6c66832baaae6eba43da552550ed82c9 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Fri, 14 Jul 2017 15:02:41 +0800 Subject: dt-bindings: power: add RK3366 SoCs header for power-domain According to a description from TRM, add all the power domains. Signed-off-by: Elaine Zhang Signed-off-by: Heiko Stuebner --- include/dt-bindings/power/rk3366-power.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 include/dt-bindings/power/rk3366-power.h (limited to 'include') diff --git a/include/dt-bindings/power/rk3366-power.h b/include/dt-bindings/power/rk3366-power.h new file mode 100644 index 000000000000..223a3dce049a --- /dev/null +++ b/include/dt-bindings/power/rk3366-power.h @@ -0,0 +1,24 @@ +#ifndef __DT_BINDINGS_POWER_RK3366_POWER_H__ +#define __DT_BINDINGS_POWER_RK3366_POWER_H__ + +/* VD_CORE */ +#define RK3366_PD_A53_0 0 +#define RK3366_PD_A53_1 1 +#define RK3366_PD_A53_2 2 +#define RK3366_PD_A53_3 3 + +/* VD_LOGIC */ +#define RK3366_PD_BUS 4 +#define RK3366_PD_PERI 5 +#define RK3366_PD_VIO 6 +#define RK3366_PD_VIDEO 7 +#define RK3366_PD_RKVDEC 8 +#define RK3366_PD_WIFIBT 9 +#define RK3366_PD_VPU 10 +#define RK3366_PD_GPU 11 +#define RK3366_PD_ALIVE 12 + +/* VD_PMU */ +#define RK3366_PD_PMU 13 + +#endif -- cgit v1.2.3 From 6538b02d210f52ef2a2e67d59fcb58be98451fbd Mon Sep 17 00:00:00 2001 From: Todor Tomov Date: Mon, 3 Jul 2017 08:08:11 -0400 Subject: media: Make parameter of media_entity_remote_pad() const The local pad parameter in media_entity_remote_pad() is not modified. Make that explicit by adding a const modifier. Signed-off-by: Todor Tomov Acked-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/media-entity.c | 2 +- include/media/media-entity.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/media/media-entity.c b/drivers/media/media-entity.c index dd0f0ead9516..2ace0410d277 100644 --- a/drivers/media/media-entity.c +++ b/drivers/media/media-entity.c @@ -917,7 +917,7 @@ media_entity_find_link(struct media_pad *source, struct media_pad *sink) } EXPORT_SYMBOL_GPL(media_entity_find_link); -struct media_pad *media_entity_remote_pad(struct media_pad *pad) +struct media_pad *media_entity_remote_pad(const struct media_pad *pad) { struct media_link *link; diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 754182d29668..222d379960b7 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -805,7 +805,7 @@ struct media_link *media_entity_find_link(struct media_pad *source, * Return: returns a pointer to the pad at the remote end of the first found * enabled link, or %NULL if no enabled link has been found. */ -struct media_pad *media_entity_remote_pad(struct media_pad *pad); +struct media_pad *media_entity_remote_pad(const struct media_pad *pad); /** * media_entity_get - Get a reference to the parent module -- cgit v1.2.3 From 727f8914477e4642c7d1ff381667cdc4178b40c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jul 2017 10:39:26 +0100 Subject: rxrpc: Expose UAPI definitions to userspace Move UAPI definitions from the internal header and place them in a UAPI header file so that userspace can make use of them. 
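To illustrate what this export enables, a userspace client can now take these definitions straight from <linux/rxrpc.h>. The sketch below follows the client setup described in the kernel's AF_RXRPC documentation; the service ID, address and port are placeholders, error handling is trimmed, and since the header as moved still uses kernel integer types (u16 and friends), a real build may need the sanitized exported headers.

/* Hypothetical AF_RXRPC client bootstrap; addresses are examples only. */
#include <linux/rxrpc.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct sockaddr_rxrpc srx;
	int fd;

	fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&srx, 0, sizeof(srx));
	srx.srx_family = AF_RXRPC;
	srx.srx_service = 52;				/* placeholder service ID */
	srx.transport_type = SOCK_DGRAM;
	srx.transport_len = sizeof(srx.transport.sin);
	srx.transport.sin.sin_family = AF_INET;
	srx.transport.sin.sin_port = htons(7000);	/* placeholder port */
	inet_pton(AF_INET, "192.0.2.1", &srx.transport.sin.sin_addr);

	if (connect(fd, (struct sockaddr *)&srx, sizeof(srx)) < 0) {
		perror("connect");
		return 1;
	}

	/* Calls are then issued with sendmsg()/recvmsg(), tagged through the
	 * RXRPC_USER_CALL_ID control message defined in the same header. */
	return 0;
}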
Signed-off-by: David Howells --- include/linux/rxrpc.h | 79 --------------------------------------------- include/uapi/linux/rxrpc.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 79 deletions(-) delete mode 100644 include/linux/rxrpc.h create mode 100644 include/uapi/linux/rxrpc.h (limited to 'include') diff --git a/include/linux/rxrpc.h b/include/linux/rxrpc.h deleted file mode 100644 index 7343f71783dc..000000000000 --- a/include/linux/rxrpc.h +++ /dev/null @@ -1,79 +0,0 @@ -/* AF_RXRPC parameters - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _LINUX_RXRPC_H -#define _LINUX_RXRPC_H - -#include -#include - -/* - * RxRPC socket address - */ -struct sockaddr_rxrpc { - sa_family_t srx_family; /* address family */ - u16 srx_service; /* service desired */ - u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ - u16 transport_len; /* length of transport address */ - union { - sa_family_t family; /* transport address family */ - struct sockaddr_in sin; /* IPv4 transport address */ - struct sockaddr_in6 sin6; /* IPv6 transport address */ - } transport; -}; - -/* - * RxRPC socket options - */ -#define RXRPC_SECURITY_KEY 1 /* [clnt] set client security key */ -#define RXRPC_SECURITY_KEYRING 2 /* [srvr] set ring of server security keys */ -#define RXRPC_EXCLUSIVE_CONNECTION 3 /* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */ -#define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ -#define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ -#define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ - -/* - * RxRPC control messages - * - If neither abort or accept are specified, the message is a data message. 
- * - terminal messages mean that a user call ID tag can be recycled - * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() - */ -enum rxrpc_cmsg_type { - RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ - RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ - RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ - RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ - RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ - RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ - RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_ACCEPT = 9, /* s-: [Service] accept request */ - RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ - RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ - RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ - RXRPC__SUPPORTED -}; - -/* - * RxRPC security levels - */ -#define RXRPC_SECURITY_PLAIN 0 /* plain secure-checksummed packets only */ -#define RXRPC_SECURITY_AUTH 1 /* authenticated packets */ -#define RXRPC_SECURITY_ENCRYPT 2 /* encrypted packets */ - -/* - * RxRPC security indices - */ -#define RXRPC_SECURITY_NONE 0 /* no security protocol */ -#define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ -#define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ -#define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ - -#endif /* _LINUX_RXRPC_H */ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h new file mode 100644 index 000000000000..08e2fb9c70ae --- /dev/null +++ b/include/uapi/linux/rxrpc.h @@ -0,0 +1,80 @@ +/* Types and definitions for AF_RXRPC. + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_RXRPC_H +#define _UAPI_LINUX_RXRPC_H + +#include +#include +#include + +/* + * RxRPC socket address + */ +struct sockaddr_rxrpc { + sa_family_t srx_family; /* address family */ + u16 srx_service; /* service desired */ + u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ + u16 transport_len; /* length of transport address */ + union { + sa_family_t family; /* transport address family */ + struct sockaddr_in sin; /* IPv4 transport address */ + struct sockaddr_in6 sin6; /* IPv6 transport address */ + } transport; +}; + +/* + * RxRPC socket options + */ +#define RXRPC_SECURITY_KEY 1 /* [clnt] set client security key */ +#define RXRPC_SECURITY_KEYRING 2 /* [srvr] set ring of server security keys */ +#define RXRPC_EXCLUSIVE_CONNECTION 3 /* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */ +#define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ +#define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ +#define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ + +/* + * RxRPC control messages + * - If neither abort or accept are specified, the message is a data message. 
+ * - terminal messages mean that a user call ID tag can be recycled + * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() + */ +enum rxrpc_cmsg_type { + RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ + RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ + RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ + RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ + RXRPC_ACCEPT = 9, /* s-: [Service] accept request */ + RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ + RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ + RXRPC__SUPPORTED +}; + +/* + * RxRPC security levels + */ +#define RXRPC_SECURITY_PLAIN 0 /* plain secure-checksummed packets only */ +#define RXRPC_SECURITY_AUTH 1 /* authenticated packets */ +#define RXRPC_SECURITY_ENCRYPT 2 /* encrypted packets */ + +/* + * RxRPC security indices + */ +#define RXRPC_SECURITY_NONE 0 /* no security protocol */ +#define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ +#define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ +#define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ + +#endif /* _UAPI_LINUX_RXRPC_H */ -- cgit v1.2.3 From ddc6c70f07bb1f6dd39a2c6c430f7b4fa95199c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jul 2017 10:07:10 +0100 Subject: rxrpc: Move the packet.h include file into net/rxrpc/ Move the protocol description header file into net/rxrpc/ and rename it to protocol.h. It's no longer necessary to expose it as packets are no longer exposed to kernel services (such as AFS) that use the facility. The abort codes are transferred to the UAPI header instead as we pass these back to userspace and also to kernel services. Signed-off-by: David Howells --- fs/afs/misc.c | 1 - fs/afs/rxrpc.c | 1 - include/rxrpc/packet.h | 235 --------------------------------------------- include/uapi/linux/rxrpc.h | 44 +++++++++ net/rxrpc/ar-internal.h | 2 +- net/rxrpc/protocol.h | 190 ++++++++++++++++++++++++++++++++++++ 6 files changed, 235 insertions(+), 238 deletions(-) delete mode 100644 include/rxrpc/packet.h create mode 100644 net/rxrpc/protocol.h (limited to 'include') diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 100b207efc9e..c05f1f1c0d41 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "internal.h" #include "afs_fs.h" diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 02781e78ffb6..10743043d431 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -14,7 +14,6 @@ #include #include -#include #include "internal.h" #include "afs_cm.h" diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h deleted file mode 100644 index a2dcfb850b9f..000000000000 --- a/include/rxrpc/packet.h +++ /dev/null @@ -1,235 +0,0 @@ -/* packet.h: Rx packet layout and definitions - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _LINUX_RXRPC_PACKET_H -#define _LINUX_RXRPC_PACKET_H - -typedef u32 rxrpc_seq_t; /* Rx message sequence number */ -typedef u32 rxrpc_serial_t; /* Rx message serial number */ -typedef __be32 rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */ -typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ - -/*****************************************************************************/ -/* - * on-the-wire Rx packet header - * - all multibyte fields should be in network byte order - */ -struct rxrpc_wire_header { - __be32 epoch; /* client boot timestamp */ -#define RXRPC_RANDOM_EPOCH 0x80000000 /* Random if set, date-based if not */ - - __be32 cid; /* connection and channel ID */ -#define RXRPC_MAXCALLS 4 /* max active calls per conn */ -#define RXRPC_CHANNELMASK (RXRPC_MAXCALLS-1) /* mask for channel ID */ -#define RXRPC_CIDMASK (~RXRPC_CHANNELMASK) /* mask for connection ID */ -#define RXRPC_CIDSHIFT ilog2(RXRPC_MAXCALLS) /* shift for connection ID */ -#define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ - - __be32 callNumber; /* call ID (0 for connection-level packets) */ - __be32 seq; /* sequence number of pkt in call stream */ - __be32 serial; /* serial number of pkt sent to network */ - - uint8_t type; /* packet type */ -#define RXRPC_PACKET_TYPE_DATA 1 /* data */ -#define RXRPC_PACKET_TYPE_ACK 2 /* ACK */ -#define RXRPC_PACKET_TYPE_BUSY 3 /* call reject */ -#define RXRPC_PACKET_TYPE_ABORT 4 /* call/connection abort */ -#define RXRPC_PACKET_TYPE_ACKALL 5 /* ACK all outstanding packets on call */ -#define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ -#define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ -#define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ -#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ -#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ - - uint8_t flags; /* packet flags */ -#define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ -#define RXRPC_REQUEST_ACK 0x02 /* request an unconditional ACK of this packet */ -#define RXRPC_LAST_PACKET 0x04 /* the last packet from this side for this call */ -#define RXRPC_MORE_PACKETS 0x08 /* more packets to come */ -#define RXRPC_JUMBO_PACKET 0x20 /* [DATA] this is a jumbo packet */ -#define RXRPC_SLOW_START_OK 0x20 /* [ACK] slow start supported */ - - uint8_t userStatus; /* app-layer defined status */ -#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01 /* AuriStor service upgrade request */ - - uint8_t securityIndex; /* security protocol ID */ - union { - __be16 _rsvd; /* reserved */ - __be16 cksum; /* kerberos security checksum */ - }; - __be16 serviceId; /* service ID */ - -} __packed; - -#define RXRPC_SUPPORTED_PACKET_TYPES ( \ - (1 << RXRPC_PACKET_TYPE_DATA) | \ - (1 << RXRPC_PACKET_TYPE_ACK) | \ - (1 << RXRPC_PACKET_TYPE_BUSY) | \ - (1 << RXRPC_PACKET_TYPE_ABORT) | \ - (1 << RXRPC_PACKET_TYPE_ACKALL) | \ - (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ - (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ - /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ - (1 << RXRPC_PACKET_TYPE_VERSION)) - -/*****************************************************************************/ -/* - * jumbo packet secondary header - * - can be mapped to read header by: - * - new_serial = serial + 1 - * - new_seq = seq + 1 - * - new_flags = j_flags - * - new__rsvd = j__rsvd - * - duplicating all other fields - */ -struct rxrpc_jumbo_header { - uint8_t flags; /* packet flags (as per 
rxrpc_header) */ - uint8_t pad; - union { - __be16 _rsvd; /* reserved */ - __be16 cksum; /* kerberos security checksum */ - }; -}; - -#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ -#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) - -/*****************************************************************************/ -/* - * on-the-wire Rx ACK packet data payload - * - all multibyte fields should be in network byte order - */ -struct rxrpc_ackpacket { - __be16 bufferSpace; /* number of packet buffers available */ - __be16 maxSkew; /* diff between serno being ACK'd and highest serial no - * received */ - __be32 firstPacket; /* sequence no of first ACK'd packet in attached list */ - __be32 previousPacket; /* sequence no of previous packet received */ - __be32 serial; /* serial no of packet that prompted this ACK */ - - uint8_t reason; /* reason for ACK */ -#define RXRPC_ACK_REQUESTED 1 /* ACK was requested on packet */ -#define RXRPC_ACK_DUPLICATE 2 /* duplicate packet received */ -#define RXRPC_ACK_OUT_OF_SEQUENCE 3 /* out of sequence packet received */ -#define RXRPC_ACK_EXCEEDS_WINDOW 4 /* packet received beyond end of ACK window */ -#define RXRPC_ACK_NOSPACE 5 /* packet discarded due to lack of buffer space */ -#define RXRPC_ACK_PING 6 /* keep alive ACK */ -#define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ -#define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ -#define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ -#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ - - uint8_t nAcks; /* number of ACKs */ -#define RXRPC_MAXACKS 255 - - uint8_t acks[0]; /* list of ACK/NAKs */ -#define RXRPC_ACK_TYPE_NACK 0 -#define RXRPC_ACK_TYPE_ACK 1 - -} __packed; - -/* Some ACKs refer to specific packets and some are general and can be updated. 
*/ -#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ - (1 << RXRPC_ACK_PING_RESPONSE) | \ - (1 << RXRPC_ACK_DELAY) | \ - (1 << RXRPC_ACK_IDLE)) - - -/* - * ACK packets can have a further piece of information tagged on the end - */ -struct rxrpc_ackinfo { - __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ - __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ - __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ - __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ -}; - -/*****************************************************************************/ -/* - * Kerberos security type-2 challenge packet - */ -struct rxkad_challenge { - __be32 version; /* version of this challenge type */ - __be32 nonce; /* encrypted random number */ - __be32 min_level; /* minimum security level */ - __be32 __padding; /* padding to 8-byte boundary */ -} __packed; - -/*****************************************************************************/ -/* - * Kerberos security type-2 response packet - */ -struct rxkad_response { - __be32 version; /* version of this response type */ - __be32 __pad; - - /* encrypted bit of the response */ - struct { - __be32 epoch; /* current epoch */ - __be32 cid; /* parent connection ID */ - __be32 checksum; /* checksum */ - __be32 securityIndex; /* security type */ - __be32 call_id[4]; /* encrypted call IDs */ - __be32 inc_nonce; /* challenge nonce + 1 */ - __be32 level; /* desired level */ - } encrypted; - - __be32 kvno; /* Kerberos key version number */ - __be32 ticket_len; /* Kerberos ticket length */ -} __packed; - -/*****************************************************************************/ -/* - * RxRPC-level abort codes - */ -#define RX_CALL_DEAD -1 /* call/conn has been inactive and is shut down */ -#define RX_INVALID_OPERATION -2 /* invalid operation requested / attempted */ -#define RX_CALL_TIMEOUT -3 /* call timeout exceeded */ -#define RX_EOF -4 /* unexpected end of data on read op */ -#define RX_PROTOCOL_ERROR -5 /* low-level protocol error */ -#define RX_USER_ABORT -6 /* generic user abort */ -#define RX_ADDRINUSE -7 /* UDP port in use */ -#define RX_DEBUGI_BADTYPE -8 /* bad debugging packet type */ - -/* - * (un)marshalling abort codes (rxgen) - */ -#define RXGEN_CC_MARSHAL -450 -#define RXGEN_CC_UNMARSHAL -451 -#define RXGEN_SS_MARSHAL -452 -#define RXGEN_SS_UNMARSHAL -453 -#define RXGEN_DECODE -454 -#define RXGEN_OPCODE -455 -#define RXGEN_SS_XDRFREE -456 -#define RXGEN_CC_XDRFREE -457 - -/* - * Rx kerberos security abort codes - * - unfortunately we have no generalised security abort codes to say things - * like "unsupported security", so we have to use these instead and hope the - * other side understands - */ -#define RXKADINCONSISTENCY 19270400 /* security module structure inconsistent */ -#define RXKADPACKETSHORT 19270401 /* packet too short for security challenge */ -#define RXKADLEVELFAIL 19270402 /* security level negotiation failed */ -#define RXKADTICKETLEN 19270403 /* ticket length too short or too long */ -#define RXKADOUTOFSEQUENCE 19270404 /* packet had bad sequence number */ -#define RXKADNOAUTH 19270405 /* caller not authorised */ -#define RXKADBADKEY 19270406 /* illegal key: bad parity or weak */ -#define RXKADBADTICKET 19270407 /* security object was passed a bad ticket */ -#define RXKADUNKNOWNKEY 19270408 /* ticket contained unknown key version number */ -#define RXKADEXPIRED 19270409 /* authentication expired */ -#define RXKADSEALEDINCON 19270410 /* sealed data inconsistent */ -#define 
RXKADDATALEN 19270411 /* user data too long */ -#define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ - -#endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 08e2fb9c70ae..9656aad8f8f7 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -77,4 +77,48 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +/* + * RxRPC-level abort codes + */ +#define RX_CALL_DEAD -1 /* call/conn has been inactive and is shut down */ +#define RX_INVALID_OPERATION -2 /* invalid operation requested / attempted */ +#define RX_CALL_TIMEOUT -3 /* call timeout exceeded */ +#define RX_EOF -4 /* unexpected end of data on read op */ +#define RX_PROTOCOL_ERROR -5 /* low-level protocol error */ +#define RX_USER_ABORT -6 /* generic user abort */ +#define RX_ADDRINUSE -7 /* UDP port in use */ +#define RX_DEBUGI_BADTYPE -8 /* bad debugging packet type */ + +/* + * (un)marshalling abort codes (rxgen) + */ +#define RXGEN_CC_MARSHAL -450 +#define RXGEN_CC_UNMARSHAL -451 +#define RXGEN_SS_MARSHAL -452 +#define RXGEN_SS_UNMARSHAL -453 +#define RXGEN_DECODE -454 +#define RXGEN_OPCODE -455 +#define RXGEN_SS_XDRFREE -456 +#define RXGEN_CC_XDRFREE -457 + +/* + * Rx kerberos security abort codes + * - unfortunately we have no generalised security abort codes to say things + * like "unsupported security", so we have to use these instead and hope the + * other side understands + */ +#define RXKADINCONSISTENCY 19270400 /* security module structure inconsistent */ +#define RXKADPACKETSHORT 19270401 /* packet too short for security challenge */ +#define RXKADLEVELFAIL 19270402 /* security level negotiation failed */ +#define RXKADTICKETLEN 19270403 /* ticket length too short or too long */ +#define RXKADOUTOFSEQUENCE 19270404 /* packet had bad sequence number */ +#define RXKADNOAUTH 19270405 /* caller not authorised */ +#define RXKADBADKEY 19270406 /* illegal key: bad parity or weak */ +#define RXKADBADTICKET 19270407 /* security object was passed a bad ticket */ +#define RXKADUNKNOWNKEY 19270408 /* ticket contained unknown key version number */ +#define RXKADEXPIRED 19270409 /* authentication expired */ +#define RXKADSEALEDINCON 19270410 /* sealed data inconsistent */ +#define RXKADDATALEN 19270411 /* user data too long */ +#define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ + #endif /* _UAPI_LINUX_RXRPC_H */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 69b97339ff9d..8c0db9b3e4ab 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include "protocol.h" #if 0 #define CHECK_SLAB_OKAY(X) \ diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h new file mode 100644 index 000000000000..4bddcf3face3 --- /dev/null +++ b/net/rxrpc/protocol.h @@ -0,0 +1,190 @@ +/* packet.h: Rx packet layout and definitions + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_RXRPC_PACKET_H +#define _LINUX_RXRPC_PACKET_H + +typedef u32 rxrpc_seq_t; /* Rx message sequence number */ +typedef u32 rxrpc_serial_t; /* Rx message serial number */ +typedef __be32 rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */ +typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ + +/*****************************************************************************/ +/* + * on-the-wire Rx packet header + * - all multibyte fields should be in network byte order + */ +struct rxrpc_wire_header { + __be32 epoch; /* client boot timestamp */ +#define RXRPC_RANDOM_EPOCH 0x80000000 /* Random if set, date-based if not */ + + __be32 cid; /* connection and channel ID */ +#define RXRPC_MAXCALLS 4 /* max active calls per conn */ +#define RXRPC_CHANNELMASK (RXRPC_MAXCALLS-1) /* mask for channel ID */ +#define RXRPC_CIDMASK (~RXRPC_CHANNELMASK) /* mask for connection ID */ +#define RXRPC_CIDSHIFT ilog2(RXRPC_MAXCALLS) /* shift for connection ID */ +#define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ + + __be32 callNumber; /* call ID (0 for connection-level packets) */ + __be32 seq; /* sequence number of pkt in call stream */ + __be32 serial; /* serial number of pkt sent to network */ + + uint8_t type; /* packet type */ +#define RXRPC_PACKET_TYPE_DATA 1 /* data */ +#define RXRPC_PACKET_TYPE_ACK 2 /* ACK */ +#define RXRPC_PACKET_TYPE_BUSY 3 /* call reject */ +#define RXRPC_PACKET_TYPE_ABORT 4 /* call/connection abort */ +#define RXRPC_PACKET_TYPE_ACKALL 5 /* ACK all outstanding packets on call */ +#define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ +#define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection security response (CLNT->SRVR) */ +#define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ +#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ +#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ + + uint8_t flags; /* packet flags */ +#define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ +#define RXRPC_REQUEST_ACK 0x02 /* request an unconditional ACK of this packet */ +#define RXRPC_LAST_PACKET 0x04 /* the last packet from this side for this call */ +#define RXRPC_MORE_PACKETS 0x08 /* more packets to come */ +#define RXRPC_JUMBO_PACKET 0x20 /* [DATA] this is a jumbo packet */ +#define RXRPC_SLOW_START_OK 0x20 /* [ACK] slow start supported */ + + uint8_t userStatus; /* app-layer defined status */ +#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01 /* AuriStor service upgrade request */ + + uint8_t securityIndex; /* security protocol ID */ + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; + __be16 serviceId; /* service ID */ + +} __packed; + +#define RXRPC_SUPPORTED_PACKET_TYPES ( \ + (1 << RXRPC_PACKET_TYPE_DATA) | \ + (1 << RXRPC_PACKET_TYPE_ACK) | \ + (1 << RXRPC_PACKET_TYPE_BUSY) | \ + (1 << RXRPC_PACKET_TYPE_ABORT) | \ + (1 << RXRPC_PACKET_TYPE_ACKALL) | \ + (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ + (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ + /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ + (1 << RXRPC_PACKET_TYPE_VERSION)) + +/*****************************************************************************/ +/* + * jumbo packet secondary header + * - can be mapped to read header by: + * - new_serial = serial + 1 + * - new_seq = seq + 1 + * - new_flags = j_flags + * - new__rsvd = j__rsvd + * - duplicating all other fields + */ +struct rxrpc_jumbo_header { + uint8_t flags; /* packet flags (as per 
rxrpc_header) */ + uint8_t pad; + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; +}; + +#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ +#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) + +/*****************************************************************************/ +/* + * on-the-wire Rx ACK packet data payload + * - all multibyte fields should be in network byte order + */ +struct rxrpc_ackpacket { + __be16 bufferSpace; /* number of packet buffers available */ + __be16 maxSkew; /* diff between serno being ACK'd and highest serial no + * received */ + __be32 firstPacket; /* sequence no of first ACK'd packet in attached list */ + __be32 previousPacket; /* sequence no of previous packet received */ + __be32 serial; /* serial no of packet that prompted this ACK */ + + uint8_t reason; /* reason for ACK */ +#define RXRPC_ACK_REQUESTED 1 /* ACK was requested on packet */ +#define RXRPC_ACK_DUPLICATE 2 /* duplicate packet received */ +#define RXRPC_ACK_OUT_OF_SEQUENCE 3 /* out of sequence packet received */ +#define RXRPC_ACK_EXCEEDS_WINDOW 4 /* packet received beyond end of ACK window */ +#define RXRPC_ACK_NOSPACE 5 /* packet discarded due to lack of buffer space */ +#define RXRPC_ACK_PING 6 /* keep alive ACK */ +#define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ +#define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ +#define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ +#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ + + uint8_t nAcks; /* number of ACKs */ +#define RXRPC_MAXACKS 255 + + uint8_t acks[0]; /* list of ACK/NAKs */ +#define RXRPC_ACK_TYPE_NACK 0 +#define RXRPC_ACK_TYPE_ACK 1 + +} __packed; + +/* Some ACKs refer to specific packets and some are general and can be updated. 
*/ +#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ + (1 << RXRPC_ACK_PING_RESPONSE) | \ + (1 << RXRPC_ACK_DELAY) | \ + (1 << RXRPC_ACK_IDLE)) + + +/* + * ACK packets can have a further piece of information tagged on the end + */ +struct rxrpc_ackinfo { + __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ + __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ + __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ + __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ +}; + +/*****************************************************************************/ +/* + * Kerberos security type-2 challenge packet + */ +struct rxkad_challenge { + __be32 version; /* version of this challenge type */ + __be32 nonce; /* encrypted random number */ + __be32 min_level; /* minimum security level */ + __be32 __padding; /* padding to 8-byte boundary */ +} __packed; + +/*****************************************************************************/ +/* + * Kerberos security type-2 response packet + */ +struct rxkad_response { + __be32 version; /* version of this response type */ + __be32 __pad; + + /* encrypted bit of the response */ + struct { + __be32 epoch; /* current epoch */ + __be32 cid; /* parent connection ID */ + __be32 checksum; /* checksum */ + __be32 securityIndex; /* security type */ + __be32 call_id[4]; /* encrypted call IDs */ + __be32 inc_nonce; /* challenge nonce + 1 */ + __be32 level; /* desired level */ + } encrypted; + + __be32 kvno; /* Kerberos key version number */ + __be32 ticket_len; /* Kerberos ticket length */ +} __packed; + +#endif /* _LINUX_RXRPC_PACKET_H */ -- cgit v1.2.3 From 8382a11b2782465a0edec5c8b99b93087ad1f66c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 21 Jul 2017 08:03:24 -0400 Subject: media: v4l2-fwnode: fix a Sphinx warning The kernel-doc tag is wrong there, causing this warning: ./include/media/v4l2-fwnode.h:66: warning: bad line: index (1) Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-fwnode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index b373c43f65e8..cb34dcb0bb65 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -63,7 +63,7 @@ struct v4l2_fwnode_bus_parallel { * false - not inverted, true - inverted * @strobe: false - data/clock, true - data/strobe * @lane_polarity: the polarities of the clock (index 0) and data lanes - index (1) + * index (1) * @data_lane: the number of the data lane * @clock_lane: the number of the clock lane */ -- cgit v1.2.3 From bc2fb7ed089ffd16d26e1d95b898a37d2b37d201 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:01 -0400 Subject: cgroup: add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS css_task_iter currently always walks all tasks. With the scheduled cgroup v2 thread support, the iterator would need to handle multiple types of iteration. As a preparation, add @flags to css_task_iter_start() and implement CSS_TASK_ITER_PROCS. If the flag is not specified, it walks all tasks as before. When asserted, the iterator only walks the group leaders. For now, the only user of the flag is cgroup v2 "cgroup.procs" file which no longer needs to skip non-leader tasks in cgroup_procs_next(). 
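For illustration only, a minimal sketch of how a caller would use the new flag; count_css_procs() is a hypothetical helper, not part of this patch. Passing 0 as @flags keeps the old walk-every-task behaviour, which is how the existing callers are converted below:

        static int count_css_procs(struct cgroup_subsys_state *css)
        {
                struct css_task_iter it;
                struct task_struct *task;
                int nr = 0;

                /* with CSS_TASK_ITER_PROCS, css_task_iter_next() only
                 * returns thread group leaders */
                css_task_iter_start(css, CSS_TASK_ITER_PROCS, &it);
                while ((task = css_task_iter_next(&it)))
                        nr++;
                css_task_iter_end(&it);

                return nr;
        }
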
Note that cgroup v1 "cgroup.procs" can't use the group leader walk as v1 "cgroup.procs" doesn't mean "list all thread group leaders in the cgroup" but "list all thread group id's with any threads in the cgroup". While at it, update cgroup_procs_show() to use task_pid_vnr() instead of task_tgid_vnr(). As the iteration guarantees that the function only sees group leaders, this doesn't change the output and will allow sharing the function for thread iteration. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 +++++- kernel/cgroup/cgroup-v1.c | 6 +++--- kernel/cgroup/cgroup.c | 24 ++++++++++++++---------- kernel/cgroup/cpuset.c | 6 +++--- kernel/cgroup/freezer.c | 6 +++--- mm/memcontrol.c | 2 +- net/core/netclassid_cgroup.c | 2 +- 7 files changed, 30 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 308b10797a54..cae5831ae650 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -36,9 +36,13 @@ #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +/* walk only threadgroup leaders */ +#define CSS_TASK_ITER_PROCS (1U << 0) + /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; + unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; @@ -129,7 +133,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 60f72475863e..167aaab04bf9 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -121,7 +121,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->self, &it); + css_task_iter_start(&from->self, 0, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); @@ -373,7 +373,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; @@ -749,7 +749,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } rcu_read_unlock(); - css_task_iter_start(&cgrp->self, &it); + css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e3bda0752501..3c5a37a9a892 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,7 @@ static void css_task_iter_advance(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); WARN_ON_ONCE(!l); +repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. 
After ->mg_tasks, we move onto the @@ -3657,11 +3658,18 @@ static void css_task_iter_advance(struct css_task_iter *it) css_task_iter_advance_css_set(it); else it->task_pos = l; + + /* if PROCS, skip over tasks which aren't group leaders */ + if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && + !thread_group_leader(list_entry(it->task_pos, struct task_struct, + cg_list))) + goto repeat; } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of + * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call @@ -3669,7 +3677,7 @@ static void css_task_iter_advance(struct css_task_iter *it) * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ -void css_task_iter_start(struct cgroup_subsys_state *css, +void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ @@ -3680,6 +3688,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, spin_lock_irq(&css_set_lock); it->ss = css->ss; + it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; @@ -3753,13 +3762,8 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; - struct task_struct *task; - - do { - task = css_task_iter_next(it); - } while (task && !thread_group_leader(task)); - return task; + return css_task_iter_next(it); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) @@ -3780,10 +3784,10 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, it); + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); } return cgroup_procs_next(s, NULL, NULL); @@ -3791,7 +3795,7 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) static int cgroup_procs_show(struct seq_file *s, void *v) { - seq_printf(s, "%d\n", task_tgid_vnr(v)); + seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..252d70c9a49b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -861,7 +861,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); @@ -1091,7 +1091,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. 
*/ - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1284,7 +1284,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 1b72d56edce5..08236798d173 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -268,7 +268,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) rcu_read_unlock(); /* are all tasks frozen? */ - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task)) { @@ -320,7 +320,7 @@ static void freeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); @@ -331,7 +331,7 @@ static void unfreeze_cgroup(struct freezer *freezer) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&freezer->css, &it); + css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..2b2f071f914b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -917,7 +917,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&iter->css, &it); + css_task_iter_start(&iter->css, 0, &it); while (!ret && (task = css_task_iter_next(&it))) ret = fn(task, arg); css_task_iter_end(&it); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 029a61ac6cdd..5e4f04004a49 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -100,7 +100,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, cs->classid = (u32)value; - css_task_iter_start(css, &it); + css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { task_lock(p); iterate_fd(p->files, 0, update_classid_sock, -- cgit v1.2.3 From 454000adaa2a7420df6e56a42f22726d05872a3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:02 -0400 Subject: cgroup: introduce cgroup->dom_cgrp and threaded css_set handling cgroup v2 is in the process of growing thread granularity support. A threaded subtree is composed of a thread root and threaded cgroups which are proper members of the subtree. The root cgroup of the subtree serves as the domain cgroup to which the processes (as opposed to threads / tasks) of the subtree conceptually belong and domain-level resource consumptions not tied to any specific task are charged. Inside the subtree, threads won't be subject to process granularity or no-internal-task constraint and can be distributed arbitrarily across the subtree. This patch introduces cgroup->dom_cgrp along with threaded css_set handling. * cgroup->dom_cgrp points to self for normal and thread roots. For proper thread subtree members, points to the dom_cgrp (the thread root). * css_set->dom_cset points to self for normal and thread roots. If threaded, points to the css_set which belongs to the cgrp->dom_cgrp. 
The dom_cgrp serves as the resource domain and keeps the matching csses available. The dom_cset holds those csses and makes them easily accessible. * All threaded csets are linked on their dom_csets to enable iteration of all threaded tasks. * cgroup->nr_threaded_children keeps track of the number of threaded children. This patch adds the above but doesn't actually use them yet. The following patches will build on top. v4: ->nr_threaded_children added. v3: ->proc_cgrp/cset renamed to ->dom_cgrp/cset. Updated for the new enable-threaded-per-cgroup behavior. v2: Added cgroup_is_threaded() helper. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 33 +++++++++++++++++++--- include/linux/cgroup.h | 3 +- kernel/cgroup/cgroup.c | 69 +++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 97 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ae7bc1e70085..651c4363c85e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -172,6 +172,14 @@ struct css_set { /* reference count */ refcount_t refcount; + /* + * For a domain cgroup, the following points to self. If threaded, + * to the matching cset of the nearest domain ancestor. The + * dom_cset provides access to the domain cgroup and its csses to + * which domain level resource consumptions should be charged. + */ + struct css_set *dom_cset; + /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; @@ -200,6 +208,10 @@ struct css_set { */ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; + /* all threaded csets whose ->dom_cset points to this cset */ + struct list_head threaded_csets; + struct list_head threaded_csets_node; + /* * List running through all cgroup groups in the same hash * slot. Protected by css_set_lock @@ -267,12 +279,16 @@ struct cgroup { * doesn't have any tasks. * * All children which have non-zero nr_populated_csets and/or - * nr_populated_children of their own contribute one to - * nr_populated_children. The counter is zero iff this cgroup's - * subtree proper doesn't have any tasks. + * nr_populated_children of their own contribute one to either + * nr_populated_domain_children or nr_populated_threaded_children + * depending on their type. Each counter is zero iff all cgroups + * of the type in the subtree proper don't have any tasks. */ int nr_populated_csets; - int nr_populated_children; + int nr_populated_domain_children; + int nr_populated_threaded_children; + + int nr_threaded_children; /* # of live threaded child cgroups */ struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ @@ -310,6 +326,15 @@ struct cgroup { */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; + /* + * If !threaded, self. If threaded, it points to the nearest + * domain ancestor. Inside a threaded subtree, cgroups are exempt + * from process granularity and no-internal-task constraint. + * Domain level resource consumptions which aren't tied to a + * specific task are charged to the dom_cgrp. + */ + struct cgroup *dom_cgrp; + /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. 
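To make the new linkage concrete, here is a hedged sketch (illustrative only, not part of this patch) of how the fields added above tie a domain css_set to its threaded css_sets; walk_threaded_csets() is a hypothetical helper and assumes css_set_lock is held:

        static void walk_threaded_csets(struct css_set *dcset)
        {
                struct css_set *tcset;

                lockdep_assert_held(&css_set_lock);

                /* every threaded cset hangs off its domain cset's
                 * ->threaded_csets list and points back via ->dom_cset */
                list_for_each_entry(tcset, &dcset->threaded_csets,
                                    threaded_csets_node)
                        WARN_ON_ONCE(tcset->dom_cset != dcset);
        }
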
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cae5831ae650..b7dd23040cd5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -541,7 +541,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { - return cgrp->nr_populated_csets + cgrp->nr_populated_children; + return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3c5a37a9a892..c7e1c243b77d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -330,6 +330,11 @@ static bool cgroup_has_tasks(struct cgroup *cgrp) return cgrp->nr_populated_csets; } +static bool cgroup_is_threaded(struct cgroup *cgrp) +{ + return cgrp->dom_cgrp != cgrp; +} + /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { @@ -565,9 +570,11 @@ EXPORT_SYMBOL_GPL(of_css); */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), + .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), + .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), @@ -575,6 +582,11 @@ struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +static bool css_set_threaded(struct css_set *cset) +{ + return cset->dom_cset != cset; +} + /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set @@ -618,10 +630,14 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) do { bool was_populated = cgroup_is_populated(cgrp); - if (!child) + if (!child) { cgrp->nr_populated_csets += adj; - else - cgrp->nr_populated_children += adj; + } else { + if (cgroup_is_threaded(child)) + cgrp->nr_populated_threaded_children += adj; + else + cgrp->nr_populated_domain_children += adj; + } if (was_populated == cgroup_is_populated(cgrp)) break; @@ -747,6 +763,8 @@ void put_css_set_locked(struct css_set *cset) if (!refcount_dec_and_test(&cset->refcount)) return; + WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); + /* This css_set is dead. unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); @@ -763,6 +781,11 @@ void put_css_set_locked(struct css_set *cset) kfree(link); } + if (css_set_threaded(cset)) { + list_del(&cset->threaded_csets_node); + put_css_set_locked(cset->dom_cset); + } + kfree_rcu(cset, rcu_head); } @@ -781,6 +804,7 @@ static bool compare_css_sets(struct css_set *cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { + struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* @@ -791,6 +815,16 @@ static bool compare_css_sets(struct css_set *cset, if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; + + /* @cset's domain should match the default cgroup's */ + if (cgroup_on_dfl(new_cgrp)) + new_dfl_cgrp = new_cgrp; + else + new_dfl_cgrp = old_cset->dfl_cgrp; + + if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) + return false; + /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. 
As different cgroups may @@ -998,9 +1032,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, } refcount_set(&cset->refcount, 1); + cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); + INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_preload_node); @@ -1038,6 +1074,28 @@ static struct css_set *find_css_set(struct css_set *old_cset, spin_unlock_irq(&css_set_lock); + /* + * If @cset should be threaded, look up the matching dom_cset and + * link them up. We first fully initialize @cset then look for the + * dom_cset. It's simpler this way and safe as @cset is guaranteed + * to stay empty until we return. + */ + if (cgroup_is_threaded(cset->dfl_cgrp)) { + struct css_set *dcset; + + dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); + if (!dcset) { + put_css_set(cset); + return NULL; + } + + spin_lock_irq(&css_set_lock); + cset->dom_cset = dcset; + list_add_tail(&cset->threaded_csets_node, + &dcset->threaded_csets); + spin_unlock_irq(&css_set_lock); + } + return cset; } @@ -1680,6 +1738,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; + cgrp->dom_cgrp = cgrp; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -4408,6 +4467,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { + struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4452,6 +4512,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); + if (parent && cgroup_is_threaded(cgrp)) + parent->nr_threaded_children--; + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ -- cgit v1.2.3 From 450ee0c1feed657894e0b4bdd48f3974af9d394c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 May 2017 09:34:03 -0400 Subject: cgroup: implement CSS_TASK_ITER_THREADED cgroup v2 is in the process of growing thread granularity support. Once thread mode is enabled, the root cgroup of the subtree serves as the dom_cgrp to which the processes of the subtree conceptually belong and domain-level resource consumptions not tied to any specific task are charged. In the subtree, threads won't be subject to process granularity or no-internal-task constraint and can be distributed arbitrarily across the subtree. This patch implements a new task iterator flag CSS_TASK_ITER_THREADED, which, when used on a dom_cgrp, makes the iteration include the tasks on all the associated threaded css_sets. "cgroup.procs" read path is updated to use it so that reading the file on a dom_cgrp lists all processes. This will also be used by controller implementations which need to walk processes or tasks at the resource domain level. Task iteration is implemented nested in css_set iteration. If CSS_TASK_ITER_THREADED is specified, after walking tasks of each !threaded css_set, all the associated threaded css_sets are visited before moving onto the next !threaded css_set. v2: ->cur_pcset renamed to ->cur_dcset. Updated for the new enable-threaded-per-cgroup behavior. 
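As a hedged sketch of the intended usage (walk_domain_procs() is a hypothetical helper, not added by this patch), combining the new flag with CSS_TASK_ITER_PROCS on a domain css visits every process of the whole threaded subtree, mirroring what the "cgroup.procs" read path below does:

        static void walk_domain_procs(struct cgroup_subsys_state *css)
        {
                struct css_task_iter it;
                struct task_struct *task;

                /* visit each process of the domain, including processes
                 * whose threads sit in descendant threaded cgroups */
                css_task_iter_start(css, CSS_TASK_ITER_PROCS |
                                    CSS_TASK_ITER_THREADED, &it);
                while ((task = css_task_iter_next(&it)))
                        pr_debug("proc %d\n", task_pid_vnr(task));
                css_task_iter_end(&it);
        }
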
Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 ++++ kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b7dd23040cd5..79faa6467f76 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -38,6 +38,8 @@ /* walk only threadgroup leaders */ #define CSS_TASK_ITER_PROCS (1U << 0) +/* walk all threaded css_sets in the domain */ +#define CSS_TASK_ITER_THREADED (1U << 1) /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { @@ -47,11 +49,15 @@ struct css_task_iter { struct list_head *cset_pos; struct list_head *cset_head; + struct list_head *tcset_pos; + struct list_head *tcset_head; + struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; struct css_set *cur_cset; + struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c7e1c243b77d..a1d59af274a9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3629,6 +3629,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css) return ret; } +static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) +{ + struct list_head *l; + struct cgrp_cset_link *link; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + /* find the next threaded cset */ + if (it->tcset_pos) { + l = it->tcset_pos->next; + + if (l != it->tcset_head) { + it->tcset_pos = l; + return container_of(l, struct css_set, + threaded_csets_node); + } + + it->tcset_pos = NULL; + } + + /* find the next cset */ + l = it->cset_pos; + l = l->next; + if (l == it->cset_head) { + it->cset_pos = NULL; + return NULL; + } + + if (it->ss) { + cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } + + it->cset_pos = l; + + /* initialize threaded css_set walking */ + if (it->flags & CSS_TASK_ITER_THREADED) { + if (it->cur_dcset) + put_css_set_locked(it->cur_dcset); + it->cur_dcset = cset; + get_css_set(cset); + + it->tcset_head = &cset->threaded_csets; + it->tcset_pos = &cset->threaded_csets; + } + + return cset; +} + /** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance @@ -3637,32 +3689,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css) */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { - struct list_head *l = it->cset_pos; - struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { - l = l->next; - if (l == it->cset_head) { - it->cset_pos = NULL; + cset = css_task_iter_next_css_set(it); + if (!cset) { it->task_pos = NULL; return; } - - if (it->ss) { - cset = container_of(l, struct css_set, - e_cset_node[it->ss->id]); - } else { - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } } while (!css_set_populated(cset)); - it->cset_pos = l; - if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; else @@ -3805,6 +3844,9 @@ void css_task_iter_end(struct css_task_iter *it) spin_unlock_irq(&css_set_lock); } + if (it->cur_dcset) + put_css_set(it->cur_dcset); + if (it->cur_task) put_task_struct(it->cur_task); } @@ -3830,6 +3872,7 @@ static void *cgroup_procs_start(struct 
seq_file *s, loff_t *pos) struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct css_task_iter *it = of->priv; + unsigned iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED; /* * When a seq_file is seeked, it's always traversed sequentially @@ -3843,10 +3886,10 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) if (!it) return ERR_PTR(-ENOMEM); of->priv = it; - css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } else if (!(*pos)++) { css_task_iter_end(it); - css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS, it); + css_task_iter_start(&cgrp->self, iter_flags, it); } return cgroup_procs_next(s, NULL, NULL); -- cgit v1.2.3 From 8cfd8147df67e741d93b8783a3ea8f3c74f93a0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 21 Jul 2017 11:14:51 -0400 Subject: cgroup: implement cgroup v2 thread support This patch implements cgroup v2 thread support. The goal of the thread mode is supporting hierarchical accounting and control at thread granularity while staying inside the resource domain model which allows coordination across different resource controllers and handling of anonymous resource consumptions. A cgroup is always created as a domain and can be made threaded by writing to the "cgroup.type" file. When a cgroup becomes threaded, it becomes a member of a threaded subtree which is anchored at the closest ancestor which isn't threaded. The threads of the processes which are in a threaded subtree can be placed anywhere without being restricted by process granularity or no-internal-process constraint. Note that the threads aren't allowed to escape to a different threaded subtree. To be used inside a threaded subtree, a controller should explicitly support threaded mode and be able to handle internal competition in the way which is appropriate for the resource. The root of a threaded subtree, the nearest ancestor which isn't threaded, is called the threaded domain and serves as the resource domain for the whole subtree. This is the last cgroup where domain controllers are operational and where all the domain-level resource consumptions in the subtree are accounted. This allows threaded controllers to operate at thread granularity when requested while staying inside the scope of system-level resource distribution. As the root cgroup is exempt from the no-internal-process constraint, it can serve as both a threaded domain and a parent to normal cgroups, so, unlike non-root cgroups, the root cgroup can have both domain and threaded children. Internally, in a threaded subtree, each css_set has its ->dom_cset pointing to a matching css_set which belongs to the threaded domain. This ensures that thread root level cgroup_subsys_state for all threaded controllers are readily accessible for domain-level operations. This patch enables threaded mode for the pids and perf_events controllers. Neither has to worry about domain-level resource consumptions and it's enough to simply set the flag. For more details on the interface and behavior of the thread mode, please refer to the section 2-2-2 in Documentation/cgroup-v2.txt added by this patch. v5: - Dropped silly no-op ->dom_cgrp init from cgroup_create(). Spotted by Waiman. - Documentation updated as suggested by Waiman. - cgroup.type content slightly reformatted. - Mark the debug controller threaded. v4: - Updated to the general idea of marking specific cgroups domain/threaded as suggested by PeterZ. 
v3: - Dropped "join" and always make mixed children join the parent's threaded subtree. v2: - After discussions with Waiman, support for mixed thread mode is added. This should address the issue that Peter pointed out where any nesting should be avoided for thread subtrees while coexisting with other domain cgroups. - Enabling / disabling thread mode now piggy backs on the existing control mask update mechanism. - Bug fixes and cleanup. Signed-off-by: Tejun Heo Cc: Waiman Long Cc: Peter Zijlstra --- Documentation/cgroup-v2.txt | 185 +++++++++++++++++++-- include/linux/cgroup-defs.h | 12 ++ kernel/cgroup/cgroup-internal.h | 2 +- kernel/cgroup/cgroup-v1.c | 5 +- kernel/cgroup/cgroup.c | 355 +++++++++++++++++++++++++++++++++++++--- kernel/cgroup/debug.c | 1 + kernel/cgroup/pids.c | 1 + kernel/events/core.c | 1 + 8 files changed, 522 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index f01f831a3b11..cb9ea281ab72 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -18,7 +18,9 @@ v1 is available under Documentation/cgroup-v1/. 1-2. What is cgroup? 2. Basic Operations 2-1. Mounting - 2-2. Organizing Processes + 2-2. Organizing Processes and Threads + 2-2-1. Processes + 2-2-2. Threads 2-3. [Un]populated Notification 2-4. Controlling Controllers 2-4-1. Enabling and Disabling @@ -167,8 +169,11 @@ cgroup v2 currently supports the following mount options. Delegation section for details. -Organizing Processes --------------------- +Organizing Processes and Threads +-------------------------------- + +Processes +~~~~~~~~~ Initially, only the root cgroup exists to which all processes belong. A child cgroup can be created by creating a sub-directory:: @@ -219,6 +224,104 @@ is removed subsequently, " (deleted)" is appended to the path:: 0::/test-cgroup/test-cgroup-nested (deleted) +Threads +~~~~~~~ + +cgroup v2 supports thread granularity for a subset of controllers to +support use cases requiring hierarchical resource distribution across +the threads of a group of processes. By default, all threads of a +process belong to the same cgroup, which also serves as the resource +domain to host resource consumptions which are not specific to a +process or thread. The thread mode allows threads to be spread across +a subtree while still maintaining the common resource domain for them. + +Controllers which support thread mode are called threaded controllers. +The ones which don't are called domain controllers. + +Marking a cgroup threaded makes it join the resource domain of its +parent as a threaded cgroup. The parent may be another threaded +cgroup whose resource domain is further up in the hierarchy. The root +of a threaded subtree, that is, the nearest ancestor which is not +threaded, is called threaded domain or thread root interchangeably and +serves as the resource domain for the entire subtree. + +Inside a threaded subtree, threads of a process can be put in +different cgroups and are not subject to the no internal process +constraint - threaded controllers can be enabled on non-leaf cgroups +whether they have threads in them or not. + +As the threaded domain cgroup hosts all the domain resource +consumptions of the subtree, it is considered to have internal +resource consumptions whether there are processes in it or not and +can't have populated child cgroups which aren't threaded. 
Because the +root cgroup is not subject to no internal process constraint, it can +serve both as a threaded domain and a parent to domain cgroups. + +The current operation mode or type of the cgroup is shown in the +"cgroup.type" file which indicates whether the cgroup is a normal +domain, a domain which is serving as the domain of a threaded subtree, +or a threaded cgroup. + +On creation, a cgroup is always a domain cgroup and can be made +threaded by writing "threaded" to the "cgroup.type" file. The +operation is single direction:: + + # echo threaded > cgroup.type + +Once threaded, the cgroup can't be made a domain again. To enable the +thread mode, the following conditions must be met. + +- As the cgroup will join the parent's resource domain. The parent + must either be a valid (threaded) domain or a threaded cgroup. + +- The cgroup must be empty. No enabled controllers, child cgroups or + processes. + +Topology-wise, a cgroup can be in an invalid state. Please consider +the following toplogy:: + + A (threaded domain) - B (threaded) - C (domain, just created) + +C is created as a domain but isn't connected to a parent which can +host child domains. C can't be used until it is turned into a +threaded cgroup. "cgroup.type" file will report "domain (invalid)" in +these cases. Operations which fail due to invalid topology use +EOPNOTSUPP as the errno. + +A domain cgroup is turned into a threaded domain when one of its child +cgroup becomes threaded or threaded controllers are enabled in the +"cgroup.subtree_control" file while there are processes in the cgroup. +A threaded domain reverts to a normal domain when the conditions +clear. + +When read, "cgroup.threads" contains the list of the thread IDs of all +threads in the cgroup. Except that the operations are per-thread +instead of per-process, "cgroup.threads" has the same format and +behaves the same way as "cgroup.procs". While "cgroup.threads" can be +written to in any cgroup, as it can only move threads inside the same +threaded domain, its operations are confined inside each threaded +subtree. + +The threaded domain cgroup serves as the resource domain for the whole +subtree, and, while the threads can be scattered across the subtree, +all the processes are considered to be in the threaded domain cgroup. +"cgroup.procs" in a threaded domain cgroup contains the PIDs of all +processes in the subtree and is not readable in the subtree proper. +However, "cgroup.procs" can be written to from anywhere in the subtree +to migrate all threads of the matching process to the cgroup. + +Only threaded controllers can be enabled in a threaded subtree. When +a threaded controller is enabled inside a threaded subtree, it only +accounts for and controls resource consumptions associated with the +threads in the cgroup and its descendants. All consumptions which +aren't tied to a specific thread belong to the threaded domain cgroup. + +Because a threaded subtree is exempt from no internal process +constraint, a threaded controller must be able to handle competition +between threads in a non-leaf cgroup and its child cgroups. Each +threaded controller defines how such competitions are handled. + + [Un]populated Notification -------------------------- @@ -302,15 +405,15 @@ disabled if one or more children have it enabled. No Internal Process Constraint ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Non-root cgroups can only distribute resources to their children when -they don't have any processes of their own. 
In other words, only -cgroups which don't contain any processes can have controllers enabled -in their "cgroup.subtree_control" files. +Non-root cgroups can distribute domain resources to their children +only when they don't have any processes of their own. In other words, +only domain cgroups which don't contain any processes can have domain +controllers enabled in their "cgroup.subtree_control" files. -This guarantees that, when a controller is looking at the part of the -hierarchy which has it enabled, processes are always only on the -leaves. This rules out situations where child cgroups compete against -internal processes of the parent. +This guarantees that, when a domain controller is looking at the part +of the hierarchy which has it enabled, processes are always only on +the leaves. This rules out situations where child cgroups compete +against internal processes of the parent. The root cgroup is exempt from this restriction. Root contains processes and anonymous resource consumption which can't be associated @@ -334,10 +437,10 @@ Model of Delegation ~~~~~~~~~~~~~~~~~~~ A cgroup can be delegated in two ways. First, to a less privileged -user by granting write access of the directory and its "cgroup.procs" -and "cgroup.subtree_control" files to the user. Second, if the -"nsdelegate" mount option is set, automatically to a cgroup namespace -on namespace creation. +user by granting write access of the directory and its "cgroup.procs", +"cgroup.threads" and "cgroup.subtree_control" files to the user. +Second, if the "nsdelegate" mount option is set, automatically to a +cgroup namespace on namespace creation. Because the resource control interface files in a given directory control the distribution of the parent's resources, the delegatee @@ -644,6 +747,29 @@ Core Interface Files All cgroup core files are prefixed with "cgroup." + cgroup.type + + A read-write single value file which exists on non-root + cgroups. + + When read, it indicates the current type of the cgroup, which + can be one of the following values. + + - "domain" : A normal valid domain cgroup. + + - "domain threaded" : A threaded domain cgroup which is + serving as the root of a threaded subtree. + + - "domain invalid" : A cgroup which is in an invalid state. + It can't be populated or have controllers enabled. It may + be allowed to become a threaded cgroup. + + - "threaded" : A threaded cgroup which is a member of a + threaded subtree. + + A cgroup can be turned into a threaded cgroup by writing + "threaded" to this file. + cgroup.procs A read-write new-line separated values file which exists on all cgroups. @@ -666,6 +792,35 @@ All cgroup core files are prefixed with "cgroup." When delegating a sub-hierarchy, write access to this file should be granted along with the containing directory. + In a threaded cgroup, reading this file fails with EOPNOTSUPP + as all the processes belong to the thread root. Writing is + supported and moves every thread of the process to the cgroup. + + cgroup.threads + A read-write new-line separated values file which exists on + all cgroups. + + When read, it lists the TIDs of all threads which belong to + the cgroup one-per-line. The TIDs are not ordered and the + same TID may show up more than once if the thread got moved to + another cgroup and then back or the TID got recycled while + reading. + + A TID can be written to migrate the thread associated with the + TID to the cgroup. The writer should match all of the + following conditions. 
+ + - It must have write access to the "cgroup.threads" file. + + - The cgroup that the thread is currently in must be in the + same resource domain as the destination cgroup. + + - It must have write access to the "cgroup.procs" file of the + common ancestor of the source and destination cgroups. + + When delegating a sub-hierarchy, write access to this file + should be granted along with the containing directory. + cgroup.controllers A read-only space separated values file which exists on all cgroups. diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 651c4363c85e..9d741959f218 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -521,6 +521,18 @@ struct cgroup_subsys { */ bool implicit_on_dfl:1; + /* + * If %true, the controller supports threaded mode on the default + * hierarchy. In a threaded subtree, both process granularity and + * no-internal-process constraint are ignored and threaded + * controllers should be able to handle that. + * + * Note that as an implicit controller is automatically enabled on + * all cgroups on the default hierarchy, it should also be + * threaded. implicit && !threaded is not supported. + */ + bool threaded:1; + /* + * If %false, this subsystem is properly hierarchical - * configuration, resource accounting and restriction on a parent diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 0e81c6109e91..f10eb19ddf04 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -170,7 +170,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, struct cgroup_namespace *ns); -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp); +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 167aaab04bf9..f0e8601b13cb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -99,8 +99,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (cgroup_on_dfl(to)) return -EINVAL; - if (!cgroup_may_migrate_to(to)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(to); + if (ret) + return ret; mutex_lock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a1d59af274a9..c396e701c206 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; +/* some controllers can be threaded on the default hierarchy */ +static u16 cgrp_dfl_threaded_ss_mask; + /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; @@ -335,14 +338,93 @@ static bool cgroup_is_threaded(struct cgroup *cgrp) return cgrp->dom_cgrp != cgrp; } +/* can @cgrp host both domain and threaded children? */ +static bool cgroup_is_mixable(struct cgroup *cgrp) +{ + /* + * Root isn't under domain level resource control exempting it from + * the no-internal-process constraint, so it can serve as a thread + * root and a parent of resource domains at the same time. + */ + return !cgroup_parent(cgrp); +} + +/* can @cgrp become a thread root? 
should always be true for a thread root */ +static bool cgroup_can_be_thread_root(struct cgroup *cgrp) +{ + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return true; + + /* domain roots can't be nested under threaded */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* can only have either domain or threaded children */ + if (cgrp->nr_populated_domain_children) + return false; + + /* and no domain controllers can be enabled */ + if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return false; + + return true; +} + +/* is @cgrp root of a threaded subtree? */ +static bool cgroup_is_thread_root(struct cgroup *cgrp) +{ + /* thread root should be a domain */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* a domain w/ threaded children is a thread root */ + if (cgrp->nr_threaded_children) + return true; + + /* + * A domain which has tasks and explicit threaded controllers + * enabled is a thread root. + */ + if (cgroup_has_tasks(cgrp) && + (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) + return true; + + return false; +} + +/* a domain which isn't connected to the root w/o breakage can't be used */ +static bool cgroup_is_valid_domain(struct cgroup *cgrp) +{ + /* the cgroup itself can be a thread root */ + if (cgroup_is_threaded(cgrp)) + return false; + + /* but the ancestors can't be unless mixable */ + while ((cgrp = cgroup_parent(cgrp))) { + if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) + return false; + if (cgroup_is_threaded(cgrp)) + return false; + } + + return true; +} + /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; - if (parent) - return parent->subtree_control; + if (parent) { + u16 ss_mask = parent->subtree_control; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | @@ -355,8 +437,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); - if (parent) - return parent->subtree_ss_mask; + if (parent) { + u16 ss_mask = parent->subtree_ss_mask; + + /* threaded cgroups can only have threaded controllers */ + if (cgroup_is_threaded(cgrp)) + ss_mask &= cgrp_dfl_threaded_ss_mask; + return ss_mask; + } return cgrp->root->subsys_mask; } @@ -2237,17 +2325,40 @@ out_release_tset: } /** - * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * - * On the default hierarchy, except for the root, subtree_control must be - * zero for migration destination cgroups with tasks so that child cgroups - * don't compete against tasks. + * On the default hierarchy, except for the mixable, (possible) thread root + * and threaded cgroups, subtree_control must be zero for migration + * destination cgroups with tasks so that child cgroups don't compete + * against tasks. 
*/ -bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { - return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || - !dst_cgrp->subtree_control; + /* v1 doesn't have any restriction */ + if (!cgroup_on_dfl(dst_cgrp)) + return 0; + + /* verify @dst_cgrp can host resources */ + if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(dst_cgrp)) + return 0; + + /* + * If @dst_cgrp is already or can become a thread root or is + * threaded, it doesn't matter. + */ + if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) + return 0; + + /* apply no-internal-process constraint */ + if (dst_cgrp->subtree_control) + return -EBUSY; + + return 0; } /** @@ -2452,8 +2563,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, struct task_struct *task; int ret; - if (!cgroup_may_migrate_to(dst_cgrp)) - return -EBUSY; + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -2881,6 +2993,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) cgroup_apply_control_disable(cgrp); } +static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) +{ + u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; + + /* if nothing is getting enabled, nothing to worry about */ + if (!enable) + return 0; + + /* can @cgrp host any resources? */ + if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) + return -EOPNOTSUPP; + + /* mixables don't care */ + if (cgroup_is_mixable(cgrp)) + return 0; + + if (domain_enable) { + /* can't enable domain controllers inside a thread subtree */ + if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return -EOPNOTSUPP; + } else { + /* + * Threaded controllers can handle internal competitions + * and are always allowed inside a (prospective) thread + * subtree. + */ + if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) + return 0; + } + + /* + * Controllers can't be enabled for a cgroup with tasks to avoid + * child cgroups competing against tasks. + */ + if (cgroup_has_tasks(cgrp)) + return -EBUSY; + + return 0; +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -2956,14 +3108,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. 
- */ - if (enable && cgroup_parent(cgrp) && cgroup_has_tasks(cgrp)) { - ret = -EBUSY; + ret = cgroup_vet_subtree_control_enable(cgrp, enable); + if (ret) goto out_unlock; - } /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -2982,6 +3129,84 @@ out_unlock: return ret ?: nbytes; } +static int cgroup_enable_threaded(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup *dom_cgrp = parent->dom_cgrp; + int ret; + + lockdep_assert_held(&cgroup_mutex); + + /* noop if already threaded */ + if (cgroup_is_threaded(cgrp)) + return 0; + + /* we're joining the parent's domain, ensure its validity */ + if (!cgroup_is_valid_domain(dom_cgrp) || + !cgroup_can_be_thread_root(dom_cgrp)) + return -EOPNOTSUPP; + + /* + * Allow enabling thread mode only on empty cgroups to avoid + * implicit migrations and recursive operations. + */ + if (cgroup_has_tasks(cgrp) || css_has_online_children(&cgrp->self)) + return -EBUSY; + + /* + * The following shouldn't cause actual migrations and should + * always succeed. + */ + cgroup_save_control(cgrp); + + cgrp->dom_cgrp = dom_cgrp; + ret = cgroup_apply_control(cgrp); + if (!ret) + parent->nr_threaded_children++; + else + cgrp->dom_cgrp = cgrp; + + cgroup_finalize_control(cgrp, ret); + return ret; +} + +static int cgroup_type_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + if (cgroup_is_threaded(cgrp)) + seq_puts(seq, "threaded\n"); + else if (!cgroup_is_valid_domain(cgrp)) + seq_puts(seq, "domain invalid\n"); + else if (cgroup_is_thread_root(cgrp)) + seq_puts(seq, "domain threaded\n"); + else + seq_puts(seq, "domain\n"); + + return 0; +} + +static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int ret; + + /* only switching to threaded mode is supported */ + if (strcmp(strstrip(buf), "threaded")) + return -EINVAL; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* threaded can only be enabled */ + ret = cgroup_enable_threaded(cgrp); + + cgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", @@ -3867,12 +4092,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) return css_task_iter_next(it); } -static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, + unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct css_task_iter *it = of->priv; - unsigned iter_flags = CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED; /* * When a seq_file is seeked, it's always traversed sequentially @@ -3895,6 +4120,23 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) return cgroup_procs_next(s, NULL, NULL); } +static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) +{ + struct cgroup *cgrp = seq_css(s)->cgroup; + + /* + * All processes of a threaded subtree belong to the domain cgroup + * of the subtree. Only threads can be distributed across the + * subtree. Reject reads on cgroup.procs in the subtree proper. + * They're always empty anyway. 
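+ * + * Illustrative example, names assumed: with domain cgroup D and threaded child D/T, reading D/T/cgroup.procs fails with EOPNOTSUPP, while D/cgroup.procs still lists every process of the subtree and D/T/cgroup.threads lists the threads placed in D/T.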
+ */ + if (cgroup_is_threaded(cgrp)) + return ERR_PTR(-EOPNOTSUPP); + + return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | + CSS_TASK_ITER_THREADED); +} + static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); @@ -3974,8 +4216,63 @@ out_unlock: return ret ?: nbytes; } +static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) +{ + return __cgroup_procs_start(s, pos, 0); +} + +static ssize_t cgroup_threads_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *src_cgrp, *dst_cgrp; + struct task_struct *task; + ssize_t ret; + + buf = strstrip(buf); + + dst_cgrp = cgroup_kn_lock_live(of->kn, false); + if (!dst_cgrp) + return -ENODEV; + + task = cgroup_procs_write_start(buf, false); + ret = PTR_ERR_OR_ZERO(task); + if (ret) + goto out_unlock; + + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* thread migrations follow the cgroup.procs delegation rule */ + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb); + if (ret) + goto out_finish; + + /* and must be contained in the same domain */ + ret = -EOPNOTSUPP; + if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) + goto out_finish; + + ret = cgroup_attach_task(dst_cgrp, task, false); + +out_finish: + cgroup_procs_write_finish(task); +out_unlock: + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { + { + .name = "cgroup.type", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_type_show, + .write = cgroup_type_write, + }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, @@ -3986,6 +4283,14 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, + { + .name = "cgroup.threads", + .release = cgroup_procs_release, + .seq_start = cgroup_threads_start, + .seq_next = cgroup_procs_next, + .seq_show = cgroup_procs_show, + .write = cgroup_threads_write, + }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, @@ -4753,11 +5058,17 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; + /* implicit controllers must be threaded too */ + WARN_ON(ss->implicit_on_dfl && !ss->threaded); + if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; + if (ss->threaded) + cgrp_dfl_threaded_ss_mask |= 1 << ss->id; + if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index dac46af22782..787a242fa69d 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -352,6 +352,7 @@ static int __init enable_cgroup_debug(char *str) { debug_cgrp_subsys.dfl_cftypes = debug_files; debug_cgrp_subsys.implicit_on_dfl = true; + debug_cgrp_subsys.threaded = true; return 1; } __setup("cgroup_debug", enable_cgroup_debug); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 2237201d66d5..9829c67ebc0a 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -345,4 +345,5 @@ struct cgroup_subsys pids_cgrp_subsys = { .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, + .threaded = true, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1538df9b2b65..ec78247da310 100644 --- a/kernel/events/core.c 
+++ b/kernel/events/core.c @@ -11210,5 +11210,6 @@ struct cgroup_subsys perf_event_cgrp_subsys = { * controller is not mounted on a legacy hierarchy. */ .implicit_on_dfl = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From d16645054d2f55e4011c9725ddf2dbe6177e942a Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Fri, 21 Jul 2017 09:38:36 -0700 Subject: dt-bindings: Drop k2g genpd device ID macros Commit 7cc119f29b19 ("dt-bindings: Add TI SCI PM Domains") introduced a number of K2G_DEV_x macros to represent each device ID available on the K2G platform for use by the genpd, clock, and reset drivers. Rather than use these macros, which are only used in the device tree for property values and not actually used by the drivers, let's just use the device ID number directly in the device tree to avoid macro bloat. Acked-by: Rob Herring Signed-off-by: Dave Gerlach Signed-off-by: Santosh Shilimkar --- .../devicetree/bindings/soc/ti/sci-pm-domain.txt | 5 +- include/dt-bindings/genpd/k2g.h | 90 ---------------------- 2 files changed, 3 insertions(+), 92 deletions(-) delete mode 100644 include/dt-bindings/genpd/k2g.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt b/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt index c705db07d820..66e6265fb0aa 100644 --- a/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt +++ b/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt @@ -46,12 +46,13 @@ Required Properties: - power-domains: phandle pointing to the corresponding PM domain node and an ID representing the device. -See dt-bindings/genpd/k2g.h for the list of valid identifiers for k2g. +See http://processors.wiki.ti.com/index.php/TISCI#66AK2G02_Data for the list +of valid identifiers for k2g. Example (K2G): -------------------- uart0: serial@02530c00 { compatible = "ns16550a"; ... - power-domains = <&k2g_pds K2G_DEV_UART0>; + power-domains = <&k2g_pds 0x002c>; }; diff --git a/include/dt-bindings/genpd/k2g.h b/include/dt-bindings/genpd/k2g.h deleted file mode 100644 index 1f31f17e19eb..000000000000 --- a/include/dt-bindings/genpd/k2g.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * TI K2G SoC Device definitions - * - * Copyright (C) 2015-2017 Texas Instruments Incorporated - http://www.ti.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#ifndef _DT_BINDINGS_GENPD_K2G_H -#define _DT_BINDINGS_GENPD_K2G_H - -/* Documented in http://processors.wiki.ti.com/index.php/TISCI */ - -#define K2G_DEV_PMMC0 0x0000 -#define K2G_DEV_MLB0 0x0001 -#define K2G_DEV_DSS0 0x0002 -#define K2G_DEV_MCBSP0 0x0003 -#define K2G_DEV_MCASP0 0x0004 -#define K2G_DEV_MCASP1 0x0005 -#define K2G_DEV_MCASP2 0x0006 -#define K2G_DEV_DCAN0 0x0008 -#define K2G_DEV_DCAN1 0x0009 -#define K2G_DEV_EMIF0 0x000a -#define K2G_DEV_MMCHS0 0x000b -#define K2G_DEV_MMCHS1 0x000c -#define K2G_DEV_GPMC0 0x000d -#define K2G_DEV_ELM0 0x000e -#define K2G_DEV_SPI0 0x0010 -#define K2G_DEV_SPI1 0x0011 -#define K2G_DEV_SPI2 0x0012 -#define K2G_DEV_SPI3 0x0013 -#define K2G_DEV_ICSS0 0x0014 -#define K2G_DEV_ICSS1 0x0015 -#define K2G_DEV_USB0 0x0016 -#define K2G_DEV_USB1 0x0017 -#define K2G_DEV_NSS0 0x0018 -#define K2G_DEV_PCIE0 0x0019 -#define K2G_DEV_GPIO0 0x001b -#define K2G_DEV_GPIO1 0x001c -#define K2G_DEV_TIMER64_0 0x001d -#define K2G_DEV_TIMER64_1 0x001e -#define K2G_DEV_TIMER64_2 0x001f -#define K2G_DEV_TIMER64_3 0x0020 -#define K2G_DEV_TIMER64_4 0x0021 -#define K2G_DEV_TIMER64_5 0x0022 -#define K2G_DEV_TIMER64_6 0x0023 -#define K2G_DEV_MSGMGR0 0x0025 -#define K2G_DEV_BOOTCFG0 0x0026 -#define K2G_DEV_ARM_BOOTROM0 0x0027 -#define K2G_DEV_DSP_BOOTROM0 0x0029 -#define K2G_DEV_DEBUGSS0 0x002b -#define K2G_DEV_UART0 0x002c -#define K2G_DEV_UART1 0x002d -#define K2G_DEV_UART2 0x002e -#define K2G_DEV_EHRPWM0 0x002f -#define K2G_DEV_EHRPWM1 0x0030 -#define K2G_DEV_EHRPWM2 0x0031 -#define K2G_DEV_EHRPWM3 0x0032 -#define K2G_DEV_EHRPWM4 0x0033 -#define K2G_DEV_EHRPWM5 0x0034 -#define K2G_DEV_EQEP0 0x0035 -#define K2G_DEV_EQEP1 0x0036 -#define K2G_DEV_EQEP2 0x0037 -#define K2G_DEV_ECAP0 0x0038 -#define K2G_DEV_ECAP1 0x0039 -#define K2G_DEV_I2C0 0x003a -#define K2G_DEV_I2C1 0x003b -#define K2G_DEV_I2C2 0x003c -#define K2G_DEV_EDMA0 0x003f -#define K2G_DEV_SEMAPHORE0 0x0040 -#define K2G_DEV_INTC0 0x0041 -#define K2G_DEV_GIC0 0x0042 -#define K2G_DEV_QSPI0 0x0043 -#define K2G_DEV_ARM_64B_COUNTER0 0x0044 -#define K2G_DEV_TETRIS0 0x0045 -#define K2G_DEV_CGEM0 0x0046 -#define K2G_DEV_MSMC0 0x0047 -#define K2G_DEV_CBASS0 0x0049 -#define K2G_DEV_BOARD0 0x004c -#define K2G_DEV_EDMA1 0x004f - -#endif -- cgit v1.2.3 From e8e3edb95ce6a146bc774b6cfad3553f4383edc8 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Mon, 29 May 2017 08:38:41 -0300 Subject: mtd: create per-device and module-scope debugfs entries Several MTD devices are using debugfs entries created in the root. This commit provides the means for a standardized subtree, creating one "mtd" entry at root, and one entry per device inside it, named after the device. The tree is registered in add_mtd_device, and released in del_mtd_device. Devices docg3, mtdswap and nandsim were updated to use this subtree instead of custom ones, and their entries were prefixed with the drivers' names. Signed-off-by: Mario J. 
Rugiero Acked-by: Boris Brezillon Signed-off-by: Brian Norris --- drivers/mtd/devices/docg3.c | 49 ++++++++++++++++----------------------------- drivers/mtd/devices/docg3.h | 2 -- drivers/mtd/mtdcore.c | 16 +++++++++++++++ drivers/mtd/mtdswap.c | 18 ++++------------- drivers/mtd/nand/nandsim.c | 48 +++++++++++--------------------------------- include/linux/mtd/mtd.h | 10 +++++++++ 6 files changed, 59 insertions(+), 84 deletions(-) (limited to 'include') diff --git a/drivers/mtd/devices/docg3.c b/drivers/mtd/devices/docg3.c index b833e6cc684c..84b16133554b 100644 --- a/drivers/mtd/devices/docg3.c +++ b/drivers/mtd/devices/docg3.c @@ -1809,37 +1809,22 @@ static int dbg_protection_show(struct seq_file *s, void *p) } DEBUGFS_RO_ATTR(protection, dbg_protection_show); -static int __init doc_dbg_register(struct docg3 *docg3) -{ - struct dentry *root, *entry; - - root = debugfs_create_dir("docg3", NULL); - if (!root) - return -ENOMEM; - - entry = debugfs_create_file("flashcontrol", S_IRUSR, root, docg3, - &flashcontrol_fops); - if (entry) - entry = debugfs_create_file("asic_mode", S_IRUSR, root, - docg3, &asic_mode_fops); - if (entry) - entry = debugfs_create_file("device_id", S_IRUSR, root, - docg3, &device_id_fops); - if (entry) - entry = debugfs_create_file("protection", S_IRUSR, root, - docg3, &protection_fops); - if (entry) { - docg3->debugfs_root = root; - return 0; - } else { - debugfs_remove_recursive(root); - return -ENOMEM; - } -} - -static void doc_dbg_unregister(struct docg3 *docg3) +static void __init doc_dbg_register(struct mtd_info *floor) { - debugfs_remove_recursive(docg3->debugfs_root); + struct dentry *root = floor->dbg.dfs_dir; + struct docg3 *docg3 = floor->priv; + + if (IS_ERR_OR_NULL(root)) + return; + + debugfs_create_file("docg3_flashcontrol", S_IRUSR, root, docg3, + &flashcontrol_fops); + debugfs_create_file("docg3_asic_mode", S_IRUSR, root, docg3, + &asic_mode_fops); + debugfs_create_file("docg3_device_id", S_IRUSR, root, docg3, + &device_id_fops); + debugfs_create_file("docg3_protection", S_IRUSR, root, docg3, + &protection_fops); } /** @@ -2114,6 +2099,8 @@ static int __init docg3_probe(struct platform_device *pdev) 0); if (ret) goto err_probe; + + doc_dbg_register(cascade->floors[floor]); } ret = doc_register_sysfs(pdev, cascade); @@ -2121,7 +2108,6 @@ static int __init docg3_probe(struct platform_device *pdev) goto err_probe; platform_set_drvdata(pdev, cascade); - doc_dbg_register(cascade->floors[0]->priv); return 0; notfound: @@ -2148,7 +2134,6 @@ static int docg3_release(struct platform_device *pdev) int floor; doc_unregister_sysfs(pdev, cascade); - doc_dbg_unregister(docg3); for (floor = 0; floor < DOC_MAX_NBFLOORS; floor++) if (cascade->floors[floor]) doc_release_device(cascade->floors[floor]); diff --git a/drivers/mtd/devices/docg3.h b/drivers/mtd/devices/docg3.h index 19fb93f96a3a..e99946575398 100644 --- a/drivers/mtd/devices/docg3.h +++ b/drivers/mtd/devices/docg3.h @@ -299,7 +299,6 @@ struct docg3_cascade { * @oob_autoecc: if 1, use only bytes 0-7, 15, and fill the others with HW ECC * if 0, use all the 16 bytes. * @oob_write_buf: prepared OOB for next page_write - * @debugfs_root: debugfs root node */ struct docg3 { struct device *dev; @@ -312,7 +311,6 @@ struct docg3 { loff_t oob_write_ofs; int oob_autoecc; u8 oob_write_buf[DOC_LAYOUT_OOB_SIZE]; - struct dentry *debugfs_root; }; #define doc_err(fmt, arg...) 
dev_err(docg3->dev, (fmt), ## arg) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 956382cea256..f872a99501ed 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -477,6 +478,8 @@ int mtd_pairing_groups(struct mtd_info *mtd) } EXPORT_SYMBOL_GPL(mtd_pairing_groups); +static struct dentry *dfs_dir_mtd; + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -552,6 +555,14 @@ int add_mtd_device(struct mtd_info *mtd) if (error) goto fail_added; + if (!IS_ERR_OR_NULL(dfs_dir_mtd)) { + mtd->dbg.dfs_dir = debugfs_create_dir(dev_name(&mtd->dev), dfs_dir_mtd); + if (IS_ERR_OR_NULL(mtd->dbg.dfs_dir)) { + pr_debug("mtd device %s won't show data in debugfs\n", + dev_name(&mtd->dev)); + } + } + device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, "mtd%dro", i); @@ -594,6 +605,8 @@ int del_mtd_device(struct mtd_info *mtd) mutex_lock(&mtd_table_mutex); + debugfs_remove_recursive(mtd->dbg.dfs_dir); + if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; @@ -1811,6 +1824,8 @@ static int __init init_mtd(void) if (ret) goto out_procfs; + dfs_dir_mtd = debugfs_create_dir("mtd", NULL); + return 0; out_procfs: @@ -1826,6 +1841,7 @@ err_reg: static void __exit cleanup_mtd(void) { + debugfs_remove_recursive(dfs_dir_mtd); cleanup_mtdchar(); if (proc_mtd) remove_proc_entry("mtd", NULL); diff --git a/drivers/mtd/mtdswap.c b/drivers/mtd/mtdswap.c index f12879a3d4ff..6b17932fe557 100644 --- a/drivers/mtd/mtdswap.c +++ b/drivers/mtd/mtdswap.c @@ -138,8 +138,6 @@ struct mtdswap_dev { char *page_buf; char *oob_buf; - - struct dentry *debugfs_root; }; struct mtdswap_oobdata { @@ -1318,26 +1316,19 @@ static int mtdswap_add_debugfs(struct mtdswap_dev *d) struct gendisk *gd = d->mbd_dev->disk; struct device *dev = disk_to_dev(gd); - struct dentry *root; + struct dentry *root = d->mtd->dbg.dfs_dir; struct dentry *dent; - root = debugfs_create_dir(gd->disk_name, NULL); - if (IS_ERR(root)) + if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - if (!root) { - dev_err(dev, "failed to initialize debugfs\n"); + if (IS_ERR_OR_NULL(root)) return -1; - } - - d->debugfs_root = root; - dent = debugfs_create_file("stats", S_IRUSR, root, d, + dent = debugfs_create_file("mtdswap_stats", S_IRUSR, root, d, &mtdswap_fops); if (!dent) { dev_err(d->dev, "debugfs_create_file failed\n"); - debugfs_remove_recursive(root); - d->debugfs_root = NULL; return -1; } @@ -1540,7 +1531,6 @@ static void mtdswap_remove_dev(struct mtd_blktrans_dev *dev) { struct mtdswap_dev *d = MTDSWAP_MBD_TO_MTDSWAP(dev); - debugfs_remove_recursive(d->debugfs_root); del_mtd_blktrans_dev(dev); mtdswap_cleanup(d); kfree(d); diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 03a0d057bf2f..1ddf0b73c246 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -287,11 +287,6 @@ MODULE_PARM_DESC(bch, "Enable BCH ecc and set how many bits should " /* Maximum page cache pages needed to read or write a NAND page to the cache_file */ #define NS_MAX_HELD_PAGES 16 -struct nandsim_debug_info { - struct dentry *dfs_root; - struct dentry *dfs_wear_report; -}; - /* * A union to represent flash memory contents and flash buffer. 
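 * The same backing storage can then be addressed either byte-wise or as 16-bit words, matching the width of the simulated flash bus.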
*/ @@ -370,8 +365,6 @@ struct nandsim { void *file_buf; struct page *held_pages[NS_MAX_HELD_PAGES]; int held_cnt; - - struct nandsim_debug_info dbg; }; /* @@ -524,39 +517,23 @@ static const struct file_operations dfs_fops = { */ static int nandsim_debugfs_create(struct nandsim *dev) { - struct nandsim_debug_info *dbg = &dev->dbg; + struct dentry *root = nsmtd->dbg.dfs_dir; struct dentry *dent; if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; - dent = debugfs_create_dir("nandsim", NULL); - if (!dent) { - NS_ERR("cannot create \"nandsim\" debugfs directory\n"); - return -ENODEV; - } - dbg->dfs_root = dent; + if (IS_ERR_OR_NULL(root)) + return -1; - dent = debugfs_create_file("wear_report", S_IRUSR, - dbg->dfs_root, dev, &dfs_fops); - if (!dent) - goto out_remove; - dbg->dfs_wear_report = dent; + dent = debugfs_create_file("nandsim_wear_report", S_IRUSR, + root, dev, &dfs_fops); + if (IS_ERR_OR_NULL(dent)) { + NS_ERR("cannot create \"nandsim_wear_report\" debugfs entry\n"); + return -1; + } return 0; - -out_remove: - debugfs_remove_recursive(dbg->dfs_root); - return -ENODEV; -} - -/** - * nandsim_debugfs_remove - destroy all debugfs files - */ -static void nandsim_debugfs_remove(struct nandsim *ns) -{ - if (IS_ENABLED(CONFIG_DEBUG_FS)) - debugfs_remove_recursive(ns->dbg.dfs_root); } /* @@ -2352,9 +2329,6 @@ static int __init ns_init_module(void) if ((retval = setup_wear_reporting(nsmtd)) != 0) goto err_exit; - if ((retval = nandsim_debugfs_create(nand)) != 0) - goto err_exit; - if ((retval = init_nandsim(nsmtd)) != 0) goto err_exit; @@ -2370,6 +2344,9 @@ static int __init ns_init_module(void) if (retval != 0) goto err_exit; + if ((retval = nandsim_debugfs_create(nand)) != 0) + goto err_exit; + return 0; err_exit: @@ -2395,7 +2372,6 @@ static void __exit ns_cleanup_module(void) struct nandsim *ns = nand_get_controller_data(chip); int i; - nandsim_debugfs_remove(ns); free_nandsim(ns); /* Free nandsim private resources */ nand_release(nsmtd); /* Unregister driver */ for (i = 0;i < ARRAY_SIZE(ns->partitions); ++i) diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index f8a2ef239c60..6cd0f6b7658b 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -206,6 +206,15 @@ struct mtd_pairing_scheme { struct module; /* only needed for owner field in mtd_info */ +/** + * struct mtd_debug_info - debugging information for an MTD device. + * + * @dfs_dir: direntry object of the MTD device debugfs directory + */ +struct mtd_debug_info { + struct dentry *dfs_dir; +}; + struct mtd_info { u_char type; uint32_t flags; @@ -346,6 +355,7 @@ struct mtd_info { struct module *owner; struct device dev; int usecount; + struct mtd_debug_info dbg; }; int mtd_ooblayout_ecc(struct mtd_info *mtd, int section, -- cgit v1.2.3 From b81b729164445cd94c4dd6fc4d6f10487434df4a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 14:39:30 +0300 Subject: ACPI: Use IS_ERR_OR_NULL() instead of non-NULL check in is_acpi_data_node() The is_acpi_data_node() function takes a struct fwnode_handle pointer as its argument. The validity of the pointer is first checked. Extend the check to cover error values as is done by similar is_acpi_node() and is_acpi_device_node() functions. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpi_bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 68bc6be447fd..7569123475b3 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -414,7 +414,7 @@ static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwno static inline bool is_acpi_data_node(struct fwnode_handle *fwnode) { - return fwnode && fwnode->type == FWNODE_ACPI_DATA; + return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_ACPI_DATA; } static inline struct acpi_data_node *to_acpi_data_node(struct fwnode_handle *fwnode) -- cgit v1.2.3 From db3e50f3234ba1a477413f56a9e5800a73dca786 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 14:39:31 +0300 Subject: device property: Get rid of struct fwnode_handle type field Instead of relying on the struct fwnode_handle type field, define fwnode_operations structs for all separate types of fwnodes. To find out the type, compare the ops field to the relevant ops structs. This change has two benefits: 1. it avoids adding the type field to each and every instance of struct fwnode_handle, thus saving memory, and 2. it makes the ops field the single factor that defines both the type of the fwnode and the implementation of its operations, decreasing the possibility of bugs when developing code dealing with fwnode internals. Suggested-by: Rob Herring Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 47 ++++++++++++++++++++++++++++------------------- drivers/acpi/scan.c | 3 +-- drivers/base/property.c | 5 +++-- drivers/of/property.c | 1 + include/acpi/acpi_bus.h | 20 ++++++++++++++++---- include/linux/acpi.h | 8 ++------ include/linux/fwnode.h | 11 ----------- include/linux/irqdomain.h | 4 +++- include/linux/of.h | 3 +-- kernel/irq/irqdomain.c | 10 +++++----- 10 files changed, 60 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 917c789f953d..cb6a3b38ded2 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -56,8 +56,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc, return false; dn->name = link->package.elements[0].string.pointer; - dn->fwnode.type = FWNODE_ACPI_DATA; - dn->fwnode.ops = &acpi_fwnode_ops; + dn->fwnode.ops = &acpi_data_fwnode_ops; dn->parent = parent; INIT_LIST_HEAD(&dn->data.subnodes); @@ -469,10 +468,10 @@ EXPORT_SYMBOL_GPL(acpi_dev_get_property); static struct acpi_device_data *acpi_device_data_of_node(struct fwnode_handle *fwnode) { - if (fwnode->type == FWNODE_ACPI) { + if (is_acpi_device_node(fwnode)) { struct acpi_device *adev = to_acpi_device_node(fwnode); return &adev->data; - } else if (fwnode->type == FWNODE_ACPI_DATA) { + } else if (is_acpi_data_node(fwnode)) { struct acpi_data_node *dn = to_acpi_data_node(fwnode); return &dn->data; } @@ -903,7 +902,7 @@ struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, struct acpi_device *adev = to_acpi_device_node(fwnode); struct list_head *head, *next; - if (!child || child->type == FWNODE_ACPI) { + if (!child || is_acpi_device_node(child)) { if (adev) head = &adev->children; else @@ -927,7 +926,7 @@ struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, } nondev: - if (!child || child->type == FWNODE_ACPI_DATA) { + if (!child || is_acpi_data_node(child)) { struct acpi_data_node *data = to_acpi_data_node(fwnode);
struct acpi_data_node *dn; @@ -1223,16 +1222,26 @@ static int acpi_fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, return 0; } -const struct fwnode_operations acpi_fwnode_ops = { - .device_is_available = acpi_fwnode_device_is_available, - .property_present = acpi_fwnode_property_present, - .property_read_int_array = acpi_fwnode_property_read_int_array, - .property_read_string_array = acpi_fwnode_property_read_string_array, - .get_parent = acpi_node_get_parent, - .get_next_child_node = acpi_get_next_subnode, - .get_named_child_node = acpi_fwnode_get_named_child_node, - .graph_get_next_endpoint = acpi_fwnode_graph_get_next_endpoint, - .graph_get_remote_endpoint = acpi_fwnode_graph_get_remote_endpoint, - .graph_get_port_parent = acpi_node_get_parent, - .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, -}; +#define DECLARE_ACPI_FWNODE_OPS(ops) \ + const struct fwnode_operations ops = { \ + .device_is_available = acpi_fwnode_device_is_available, \ + .property_present = acpi_fwnode_property_present, \ + .property_read_int_array = \ + acpi_fwnode_property_read_int_array, \ + .property_read_string_array = \ + acpi_fwnode_property_read_string_array, \ + .get_parent = acpi_node_get_parent, \ + .get_next_child_node = acpi_get_next_subnode, \ + .get_named_child_node = acpi_fwnode_get_named_child_node, \ + .graph_get_next_endpoint = \ + acpi_fwnode_graph_get_next_endpoint, \ + .graph_get_remote_endpoint = \ + acpi_fwnode_graph_get_remote_endpoint, \ + .graph_get_port_parent = acpi_node_get_parent, \ + .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \ + }; \ + EXPORT_SYMBOL_GPL(ops) + +DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); +DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); +const struct fwnode_operations acpi_static_fwnode_ops; diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 33897298f03e..943536c9a2a8 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1467,8 +1467,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle, device->device_type = type; device->handle = handle; device->parent = acpi_bus_get_parent(handle); - device->fwnode.type = FWNODE_ACPI; - device->fwnode.ops = &acpi_fwnode_ops; + device->fwnode.ops = &acpi_device_fwnode_ops; acpi_set_device_status(device, sta); acpi_device_get_busid(device); acpi_set_pnp_ids(handle, &device->pnp, type); diff --git a/drivers/base/property.c b/drivers/base/property.c index edf02c1b5845..857e4d39add6 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -25,9 +25,11 @@ struct property_set { const struct property_entry *properties; }; +static const struct fwnode_operations pset_fwnode_ops; + static inline bool is_pset_node(struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_PDATA; + return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &pset_fwnode_ops; } static inline struct property_set *to_pset_node(struct fwnode_handle *fwnode) @@ -900,7 +902,6 @@ int device_add_properties(struct device *dev, if (IS_ERR(p)) return PTR_ERR(p); - p->fwnode.type = FWNODE_PDATA; p->fwnode.ops = &pset_fwnode_ops; set_secondary_fwnode(dev, &p->fwnode); return 0; diff --git a/drivers/of/property.c b/drivers/of/property.c index eda50b4be934..2d5988820405 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -952,3 +952,4 @@ const struct fwnode_operations of_fwnode_ops = { .graph_get_port_parent = of_fwnode_graph_get_port_parent, .graph_parse_endpoint = of_fwnode_graph_parse_endpoint, }; +EXPORT_SYMBOL_GPL(of_fwnode_ops); diff --git 
a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 7569123475b3..91b1e58e5189 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -395,15 +395,21 @@ struct acpi_data_node { struct completion kobj_done; }; +extern const struct fwnode_operations acpi_device_fwnode_ops; +extern const struct fwnode_operations acpi_data_fwnode_ops; +extern const struct fwnode_operations acpi_static_fwnode_ops; + static inline bool is_acpi_node(struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && (fwnode->type == FWNODE_ACPI - || fwnode->type == FWNODE_ACPI_DATA); + return !IS_ERR_OR_NULL(fwnode) && + (fwnode->ops == &acpi_device_fwnode_ops + || fwnode->ops == &acpi_data_fwnode_ops); } static inline bool is_acpi_device_node(struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_ACPI; + return !IS_ERR_OR_NULL(fwnode) && + fwnode->ops == &acpi_device_fwnode_ops; } static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwnode) @@ -414,7 +420,7 @@ static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwno static inline bool is_acpi_data_node(struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_ACPI_DATA; + return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; } static inline struct acpi_data_node *to_acpi_data_node(struct fwnode_handle *fwnode) @@ -423,6 +429,12 @@ static inline struct acpi_data_node *to_acpi_data_node(struct fwnode_handle *fwn container_of(fwnode, struct acpi_data_node, fwnode) : NULL; } +static inline bool is_acpi_static_node(struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && + fwnode->ops == &acpi_static_fwnode_ops; +} + static inline bool acpi_data_node_match(struct fwnode_handle *fwnode, const char *name) { diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c749eef1daa1..71b763f0bee9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -57,9 +57,6 @@ static inline acpi_handle acpi_device_handle(struct acpi_device *adev) acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) - -extern const struct fwnode_operations acpi_fwnode_ops; - static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) { struct fwnode_handle *fwnode; @@ -68,15 +65,14 @@ static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) if (!fwnode) return NULL; - fwnode->type = FWNODE_ACPI_STATIC; - fwnode->ops = &acpi_fwnode_ops; + fwnode->ops = &acpi_static_fwnode_ops; return fwnode; } static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode) { - if (WARN_ON(!fwnode || fwnode->type != FWNODE_ACPI_STATIC)) + if (WARN_ON(!is_acpi_static_node(fwnode))) return; kfree(fwnode); diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 50893a1646cf..c5dbc48b55dd 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -14,20 +14,9 @@ #include -enum fwnode_type { - FWNODE_INVALID = 0, - FWNODE_OF, - FWNODE_ACPI, - FWNODE_ACPI_DATA, - FWNODE_ACPI_STATIC, - FWNODE_PDATA, - FWNODE_IRQCHIP -}; - struct fwnode_operations; struct fwnode_handle { - enum fwnode_type type; struct fwnode_handle *secondary; const struct fwnode_operations *ops; }; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index cac77a5c5555..d24273840b79 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -265,9 +265,11 @@ static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) return node ? 
&node->fwnode : NULL; } +extern const struct fwnode_operations irqchip_fwnode_ops; + static inline bool is_fwnode_irqchip(struct fwnode_handle *fwnode) { - return fwnode && fwnode->type == FWNODE_IRQCHIP; + return fwnode && fwnode->ops == &irqchip_fwnode_ops; } extern void irq_domain_update_bus_token(struct irq_domain *domain, diff --git a/include/linux/of.h b/include/linux/of.h index 4a8a70916237..cfc34117fc92 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -104,7 +104,6 @@ extern const struct fwnode_operations of_fwnode_ops; static inline void of_node_init(struct device_node *node) { kobject_init(&node->kobj, &of_node_ktype); - node->fwnode.type = FWNODE_OF; node->fwnode.ops = &of_fwnode_ops; } @@ -152,7 +151,7 @@ void of_core_init(void); static inline bool is_of_node(const struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && fwnode->type == FWNODE_OF; + return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &of_fwnode_ops; } #define to_of_node(__fwnode) \ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f1f251479aa6..e064fd1390f1 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -41,6 +41,8 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif +const struct fwnode_operations irqchip_fwnode_ops; + /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain @@ -86,7 +88,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; fwid->data = data; - fwid->fwnode.type = FWNODE_IRQCHIP; + fwid->fwnode.ops = &irqchip_fwnode_ops; return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); @@ -193,10 +195,8 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, } if (!domain->name) { - if (fwnode) { - pr_err("Invalid fwnode type (%d) for irqdomain\n", - fwnode->type); - } + if (fwnode) + pr_err("Invalid fwnode type for irqdomain\n"); domain->name = kasprintf(GFP_KERNEL, "unknown-%d", atomic_inc_return(&unknown_domains)); if (!domain->name) { -- cgit v1.2.3 From 8b9d6802583a1ef6977e4b059f9fa848e6882253 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 14:39:33 +0300 Subject: ACPI: Constify acpi_bus helper functions, switch to macros Constify arguments to is_acpi_node(), is_acpi_device_node(), is_acpi_static_node() and acpi_data_node_match(). Make to_acpi_device_node() and to_acpi_data_node() macros that can cope with const and non-const arguments. Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpi_bus.h | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 91b1e58e5189..89bc2132429d 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -399,43 +399,51 @@ extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; -static inline bool is_acpi_node(struct fwnode_handle *fwnode) +static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && (fwnode->ops == &acpi_device_fwnode_ops || fwnode->ops == &acpi_data_fwnode_ops); } -static inline bool is_acpi_device_node(struct fwnode_handle *fwnode) +static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_device_fwnode_ops; } -static inline struct acpi_device *to_acpi_device_node(struct fwnode_handle *fwnode) -{ - return is_acpi_device_node(fwnode) ? - container_of(fwnode, struct acpi_device, fwnode) : NULL; -} - -static inline bool is_acpi_data_node(struct fwnode_handle *fwnode) +#define to_acpi_device_node(__fwnode) \ + ({ \ + typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ + \ + is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ + container_of(__to_acpi_device_node_fwnode, \ + struct acpi_device, fwnode) : \ + NULL; \ + }) + +static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; } -static inline struct acpi_data_node *to_acpi_data_node(struct fwnode_handle *fwnode) -{ - return is_acpi_data_node(fwnode) ? - container_of(fwnode, struct acpi_data_node, fwnode) : NULL; -} - -static inline bool is_acpi_static_node(struct fwnode_handle *fwnode) +#define to_acpi_data_node(__fwnode) \ + ({ \ + typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ + \ + is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ + container_of(__to_acpi_data_node_fwnode, \ + struct acpi_data_node, fwnode) : \ + NULL; \ + }) + +static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } -static inline bool acpi_data_node_match(struct fwnode_handle *fwnode, +static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return is_acpi_data_node(fwnode) ? -- cgit v1.2.3 From 99a85464693faa9b6829cb753b328c2e4434d94b Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 14:39:34 +0300 Subject: ACPI: Constify internal fwnode arguments Constify internal ACPI fwnode arguments in preparation for the same in fwnode API. Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/property.c | 34 ++++++++++++++++++---------------- include/linux/acpi.h | 40 ++++++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 04c60a25ee87..043bfcacee66 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -19,7 +19,7 @@ #include "internal.h" -static int acpi_data_get_property_array(struct acpi_device_data *data, +static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj); @@ -417,7 +417,7 @@ void acpi_free_properties(struct acpi_device *adev) * %-EINVAL if the property doesn't exist, * %-EPROTO if the property value type doesn't match @type. */ -static int acpi_data_get_property(struct acpi_device_data *data, +static int acpi_data_get_property(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { @@ -459,20 +459,21 @@ static int acpi_data_get_property(struct acpi_device_data *data, * @type: Expected property type. * @obj: Location to store the property value (if not %NULL). */ -int acpi_dev_get_property(struct acpi_device *adev, const char *name, +int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return adev ? acpi_data_get_property(&adev->data, name, type, obj) : -EINVAL; } EXPORT_SYMBOL_GPL(acpi_dev_get_property); -static struct acpi_device_data *acpi_device_data_of_node(struct fwnode_handle *fwnode) +static const struct acpi_device_data * +acpi_device_data_of_node(const struct fwnode_handle *fwnode) { if (is_acpi_device_node(fwnode)) { - struct acpi_device *adev = to_acpi_device_node(fwnode); + const struct acpi_device *adev = to_acpi_device_node(fwnode); return &adev->data; } else if (is_acpi_data_node(fwnode)) { - struct acpi_data_node *dn = to_acpi_data_node(fwnode); + const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return &dn->data; } return NULL; @@ -484,8 +485,8 @@ static struct acpi_device_data *acpi_device_data_of_node(struct fwnode_handle *f * @propname: Name of the property. * @valptr: Location to store a pointer to the property value (if not %NULL). */ -int acpi_node_prop_get(struct fwnode_handle *fwnode, const char *propname, - void **valptr) +int acpi_node_prop_get(const struct fwnode_handle *fwnode, + const char *propname, void **valptr) { return acpi_data_get_property(acpi_device_data_of_node(fwnode), propname, ACPI_TYPE_ANY, @@ -511,7 +512,7 @@ int acpi_node_prop_get(struct fwnode_handle *fwnode, const char *propname, * %-EPROTO if the property is not a package or the type of its elements * doesn't match @type. */ -static int acpi_data_get_property_array(struct acpi_device_data *data, +static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) @@ -571,13 +572,13 @@ static int acpi_data_get_property_array(struct acpi_device_data *data, * * Return: %0 on success, negative error code on failure. 
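 * * Illustrative call (property name assumed for the example), using the acpi_node_get_property_reference() wrapper which passes MAX_ACPI_REFERENCE_ARGS for @num_args: * * struct acpi_reference_args args; * int ret = acpi_node_get_property_reference(fwnode, "endpoint", 0, &args); * * On success, args.adev points to the referenced device and args.args holds any trailing integer arguments.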
*/ -int __acpi_node_get_property_reference(struct fwnode_handle *fwnode, +int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *propname, size_t index, size_t num_args, struct acpi_reference_args *args) { const union acpi_object *element, *end; const union acpi_object *obj; - struct acpi_device_data *data; + const struct acpi_device_data *data; struct acpi_device *device; int ret, idx = 0; @@ -673,7 +674,7 @@ int __acpi_node_get_property_reference(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); -static int acpi_data_prop_read_single(struct acpi_device_data *data, +static int acpi_data_prop_read_single(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val) { @@ -812,7 +813,7 @@ static int acpi_copy_property_array_string(const union acpi_object *items, return nval; } -static int acpi_data_prop_read(struct acpi_device_data *data, +static int acpi_data_prop_read(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) @@ -866,7 +867,7 @@ static int acpi_data_prop_read(struct acpi_device_data *data, return ret; } -int acpi_dev_prop_read(struct acpi_device *adev, const char *propname, +int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { return adev ? acpi_data_prop_read(&adev->data, propname, proptype, val, nval) : -EINVAL; @@ -884,8 +885,9 @@ int acpi_dev_prop_read(struct acpi_device *adev, const char *propname, * of the property. Otherwise, read at most @nval values to the array at the * location pointed to by @val. */ -int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname, - enum dev_prop_type proptype, void *val, size_t nval) +int acpi_node_prop_read(const struct fwnode_handle *fwnode, + const char *propname, enum dev_prop_type proptype, + void *val, size_t nval) { return acpi_data_prop_read(acpi_device_data_of_node(fwnode), propname, proptype, val, nval); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 71b763f0bee9..4d9fb610f114 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1003,13 +1003,14 @@ struct acpi_reference_args { }; #ifdef CONFIG_ACPI -int acpi_dev_get_property(struct acpi_device *adev, const char *name, +int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj); -int __acpi_node_get_property_reference(struct fwnode_handle *fwnode, +int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct acpi_reference_args *args); -static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, +static inline int acpi_node_get_property_reference( + const struct fwnode_handle *fwnode, const char *name, size_t index, struct acpi_reference_args *args) { @@ -1017,13 +1018,15 @@ static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, MAX_ACPI_REFERENCE_ARGS, args); } -int acpi_node_prop_get(struct fwnode_handle *fwnode, const char *propname, +int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -int acpi_dev_prop_read_single(struct acpi_device *adev, const char *propname, - enum dev_prop_type proptype, void *val); -int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname, - enum dev_prop_type proptype, void *val, size_t nval); -int 
acpi_dev_prop_read(struct acpi_device *adev, const char *propname, +int acpi_dev_prop_read_single(struct acpi_device *adev, + const char *propname, enum dev_prop_type proptype, + void *val); +int acpi_node_prop_read(const struct fwnode_handle *fwnode, + const char *propname, enum dev_prop_type proptype, + void *val, size_t nval); +int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val, size_t nval); struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, @@ -1100,35 +1103,36 @@ static inline int acpi_dev_get_property(struct acpi_device *adev, } static inline int -__acpi_node_get_property_reference(struct fwnode_handle *fwnode, +__acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct acpi_reference_args *args) { return -ENXIO; } -static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, - const char *name, size_t index, - struct acpi_reference_args *args) +static inline int +acpi_node_get_property_reference(const struct fwnode_handle *fwnode, + const char *name, size_t index, + struct acpi_reference_args *args) { return -ENXIO; } -static inline int acpi_node_prop_get(struct fwnode_handle *fwnode, +static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return -ENXIO; } -static inline int acpi_dev_prop_get(struct acpi_device *adev, +static inline int acpi_dev_prop_get(const struct acpi_device *adev, const char *propname, void **valptr) { return -ENXIO; } -static inline int acpi_dev_prop_read_single(struct acpi_device *adev, +static inline int acpi_dev_prop_read_single(const struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val) @@ -1136,7 +1140,7 @@ static inline int acpi_dev_prop_read_single(struct acpi_device *adev, return -ENXIO; } -static inline int acpi_node_prop_read(struct fwnode_handle *fwnode, +static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) @@ -1144,7 +1148,7 @@ static inline int acpi_node_prop_read(struct fwnode_handle *fwnode, return -ENXIO; } -static inline int acpi_dev_prop_read(struct acpi_device *adev, +static inline int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) -- cgit v1.2.3 From 37ba983cfb47cc7b353146422c437468fcb29c61 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 14:39:36 +0300 Subject: device property: Constify fwnode property API Make fwnode arguments to the fwnode property API const. Signed-off-by: Sakari Ailus Reviewed-by: Rob Herring Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 52 +++++++++++++++++++++++---------------- drivers/base/property.c | 64 ++++++++++++++++++++++++++---------------------- drivers/of/property.c | 34 +++++++++++++------------ include/linux/acpi.h | 20 ++++++++------- include/linux/fwnode.h | 26 +++++++++++--------- include/linux/property.h | 63 ++++++++++++++++++++++++----------------------- 6 files changed, 142 insertions(+), 117 deletions(-) (limited to 'include') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 043bfcacee66..f8d60051efb8 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -898,7 +898,7 @@ int acpi_node_prop_read(const struct fwnode_handle *fwnode, * @fwnode: Firmware node to find the next child node for. 
* @child: Handle to one of the device's child nodes or a null handle. */ -struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, +struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { const struct acpi_device *adev = to_acpi_device_node(fwnode); @@ -967,7 +967,7 @@ struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, * Returns parent node of an ACPI device or data firmware node or %NULL if * not available. */ -struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode) +struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode) { if (is_acpi_data_node(fwnode)) { /* All data nodes have parent pointer so just return that */ @@ -996,8 +996,8 @@ struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode) * %NULL if there is no next endpoint, ERR_PTR() in case of error. In case * of success the next endpoint is returned. */ -struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, - struct fwnode_handle *prev) +struct fwnode_handle *acpi_graph_get_next_endpoint( + const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *port = NULL; struct fwnode_handle *endpoint; @@ -1044,7 +1044,8 @@ struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, * the child node on success, NULL otherwise. */ static struct fwnode_handle *acpi_graph_get_child_prop_value( - struct fwnode_handle *fwnode, const char *prop_name, unsigned int val) + const struct fwnode_handle *fwnode, const char *prop_name, + unsigned int val) { struct fwnode_handle *child; @@ -1073,17 +1074,18 @@ static struct fwnode_handle *acpi_graph_get_child_prop_value( * fields requested by the caller. Returns %0 in case of success and * negative errno otherwise. 
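 * * Illustrative call, variable names assumed for the example: * * struct fwnode_handle *remote, *port, *endpoint; * int ret = acpi_graph_get_remote_endpoint(fwnode, &remote, &port, &endpoint);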
*/ -int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, +int acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode, struct fwnode_handle **parent, struct fwnode_handle **port, struct fwnode_handle **endpoint) { + struct fwnode_handle *fwnode; unsigned int port_nr, endpoint_nr; struct acpi_reference_args args; int ret; memset(&args, 0, sizeof(args)); - ret = acpi_node_get_property_reference(fwnode, "remote-endpoint", 0, + ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0, &args); if (ret) return ret; @@ -1125,7 +1127,7 @@ int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, return 0; } -static bool acpi_fwnode_device_is_available(struct fwnode_handle *fwnode) +static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (!is_acpi_device_node(fwnode)) return false; @@ -1133,16 +1135,17 @@ static bool acpi_fwnode_device_is_available(struct fwnode_handle *fwnode) return acpi_device_is_present(to_acpi_device_node(fwnode)); } -static bool acpi_fwnode_property_present(struct fwnode_handle *fwnode, +static bool acpi_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return !acpi_node_prop_get(fwnode, propname, NULL); } -static int acpi_fwnode_property_read_int_array(struct fwnode_handle *fwnode, - const char *propname, - unsigned int elem_size, - void *val, size_t nval) +static int +acpi_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, + const char *propname, + unsigned int elem_size, void *val, + size_t nval) { enum dev_prop_type type; @@ -1166,16 +1169,17 @@ static int acpi_fwnode_property_read_int_array(struct fwnode_handle *fwnode, return acpi_node_prop_read(fwnode, propname, type, val, nval); } -static int acpi_fwnode_property_read_string_array(struct fwnode_handle *fwnode, - const char *propname, - const char **val, size_t nval) +static int +acpi_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, + const char *propname, const char **val, + size_t nval) { return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING, val, nval); } static struct fwnode_handle * -acpi_fwnode_get_named_child_node(struct fwnode_handle *fwnode, +acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct fwnode_handle *child; @@ -1192,7 +1196,7 @@ acpi_fwnode_get_named_child_node(struct fwnode_handle *fwnode, } static struct fwnode_handle * -acpi_fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode, +acpi_fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { struct fwnode_handle *endpoint; @@ -1205,7 +1209,7 @@ acpi_fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode, } static struct fwnode_handle * -acpi_fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode) +acpi_fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint = NULL; @@ -1214,7 +1218,13 @@ acpi_fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode) return endpoint; } -static int acpi_fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, +static struct fwnode_handle * +acpi_fwnode_get_parent(struct fwnode_handle *fwnode) +{ + return acpi_node_get_parent(fwnode); +} + +static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode); @@ -1242,7 +1252,7 @@ static int acpi_fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, 
acpi_fwnode_graph_get_next_endpoint, \ .graph_get_remote_endpoint = \ acpi_fwnode_graph_get_remote_endpoint, \ - .graph_get_port_parent = acpi_node_get_parent, \ + .graph_get_port_parent = acpi_fwnode_get_parent, \ .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \ }; \ EXPORT_SYMBOL_GPL(ops) diff --git a/drivers/base/property.c b/drivers/base/property.c index 8fde824ad418..673e2353a2fb 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -193,18 +193,18 @@ struct fwnode_handle *dev_fwnode(struct device *dev) } EXPORT_SYMBOL_GPL(dev_fwnode); -static bool pset_fwnode_property_present(struct fwnode_handle *fwnode, +static bool pset_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return !!pset_prop_get(to_pset_node(fwnode), propname); } -static int pset_fwnode_read_int_array(struct fwnode_handle *fwnode, +static int pset_fwnode_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { - struct property_set *node = to_pset_node(fwnode); + const struct property_set *node = to_pset_node(fwnode); if (!val) return pset_prop_count_elems_of_size(node, propname, elem_size); @@ -223,9 +223,10 @@ static int pset_fwnode_read_int_array(struct fwnode_handle *fwnode, return -ENXIO; } -static int pset_fwnode_property_read_string_array(struct fwnode_handle *fwnode, - const char *propname, - const char **val, size_t nval) +static int +pset_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, + const char *propname, + const char **val, size_t nval) { return pset_prop_read_string_array(to_pset_node(fwnode), propname, val, nval); @@ -255,7 +256,8 @@ EXPORT_SYMBOL_GPL(device_property_present); * @fwnode: Firmware node whose property to check * @propname: Name of the property */ -bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname) +bool fwnode_property_present(const struct fwnode_handle *fwnode, + const char *propname) { bool ret; @@ -437,7 +439,7 @@ int device_property_match_string(struct device *dev, const char *propname, } EXPORT_SYMBOL_GPL(device_property_match_string); -static int fwnode_property_read_int_array(struct fwnode_handle *fwnode, +static int fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) @@ -473,7 +475,7 @@ static int fwnode_property_read_int_array(struct fwnode_handle *fwnode, * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ -int fwnode_property_read_u8_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u8), @@ -499,7 +501,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_u8_array); * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ -int fwnode_property_read_u16_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u16), @@ -525,7 +527,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_u16_array); * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. 
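 * * Illustrative use, property name assumed for the example: * * u32 vals[3]; * int err = fwnode_property_read_u32_array(fwnode, "example-values", vals, ARRAY_SIZE(vals));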
*/ -int fwnode_property_read_u32_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u32), @@ -551,7 +553,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_u32_array); * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ -int fwnode_property_read_u64_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval) { return fwnode_property_read_int_array(fwnode, propname, sizeof(u64), @@ -577,7 +579,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_u64_array); * %-EOVERFLOW if the size of the property is not as expected, * %-ENXIO if no suitable firmware interface is present. */ -int fwnode_property_read_string_array(struct fwnode_handle *fwnode, +int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { @@ -609,7 +611,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_string_array); * %-EPROTO or %-EILSEQ if the property is not a string, * %-ENXIO if no suitable firmware interface is present. */ -int fwnode_property_read_string(struct fwnode_handle *fwnode, +int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val) { int ret = fwnode_property_read_string_array(fwnode, propname, val, 1); @@ -633,7 +635,7 @@ EXPORT_SYMBOL_GPL(fwnode_property_read_string); * %-EPROTO if the property is not an array of strings, * %-ENXIO if no suitable firmware interface is present. */ -int fwnode_property_match_string(struct fwnode_handle *fwnode, +int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string) { const char **values; @@ -940,7 +942,7 @@ EXPORT_SYMBOL_GPL(fwnode_get_next_parent); * Return parent firmware node of the given node if possible or %NULL if no * parent was available. */ -struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode) +struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, get_parent); } @@ -951,8 +953,9 @@ EXPORT_SYMBOL_GPL(fwnode_get_parent); * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the node's child nodes or a %NULL handle. */ -struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode, - struct fwnode_handle *child) +struct fwnode_handle * +fwnode_get_next_child_node(const struct fwnode_handle *fwnode, + struct fwnode_handle *child) { return fwnode_call_ptr_op(fwnode, get_next_child_node, child); } @@ -983,8 +986,9 @@ EXPORT_SYMBOL_GPL(device_get_next_child_node); * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. */ -struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode, - const char *childname) +struct fwnode_handle * +fwnode_get_named_child_node(const struct fwnode_handle *fwnode, + const char *childname) { return fwnode_call_ptr_op(fwnode, get_named_child_node, childname); } @@ -1030,7 +1034,7 @@ EXPORT_SYMBOL_GPL(fwnode_handle_put); * fwnode_device_is_available - check if a device is available for use * @fwnode: Pointer to the fwnode of the device. 
*/ -bool fwnode_device_is_available(struct fwnode_handle *fwnode) +bool fwnode_device_is_available(const struct fwnode_handle *fwnode) { return fwnode_call_bool_op(fwnode, device_is_available); } @@ -1168,7 +1172,7 @@ EXPORT_SYMBOL(device_get_mac_address); * are available. */ struct fwnode_handle * -fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode, +fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return fwnode_call_ptr_op(fwnode, graph_get_next_endpoint, prev); @@ -1182,7 +1186,7 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); * Return: the firmware node of the device the @endpoint belongs to. */ struct fwnode_handle * -fwnode_graph_get_port_parent(struct fwnode_handle *endpoint) +fwnode_graph_get_port_parent(const struct fwnode_handle *endpoint) { struct fwnode_handle *port, *parent; @@ -1202,7 +1206,7 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_port_parent); * Extracts firmware node of a remote device the @fwnode points to. */ struct fwnode_handle * -fwnode_graph_get_remote_port_parent(struct fwnode_handle *fwnode) +fwnode_graph_get_remote_port_parent(const struct fwnode_handle *fwnode) { struct fwnode_handle *endpoint, *parent; @@ -1221,7 +1225,8 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); * * Extracts firmware node of a remote port the @fwnode points to. */ -struct fwnode_handle *fwnode_graph_get_remote_port(struct fwnode_handle *fwnode) +struct fwnode_handle * +fwnode_graph_get_remote_port(const struct fwnode_handle *fwnode) { return fwnode_get_next_parent(fwnode_graph_get_remote_endpoint(fwnode)); } @@ -1234,7 +1239,7 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); * Extracts firmware node of a remote endpoint the @fwnode points to. */ struct fwnode_handle * -fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode) +fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return fwnode_call_ptr_op(fwnode, graph_get_remote_endpoint); } @@ -1249,8 +1254,9 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); * Return: Remote fwnode handle associated with remote endpoint node linked * to @node. Use fwnode_node_put() on it when done. */ -struct fwnode_handle *fwnode_graph_get_remote_node(struct fwnode_handle *fwnode, - u32 port_id, u32 endpoint_id) +struct fwnode_handle * +fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port_id, + u32 endpoint_id) { struct fwnode_handle *endpoint = NULL; @@ -1286,7 +1292,7 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_node); * information in @endpoint. The caller must hold a reference to * @fwnode. 
*/ -int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, +int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { memset(endpoint, 0, sizeof(*endpoint)); diff --git a/drivers/of/property.c b/drivers/of/property.c index 2d5988820405..ae46a6f0ea36 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -815,23 +815,23 @@ static void of_fwnode_put(struct fwnode_handle *fwnode) of_node_put(to_of_node(fwnode)); } -static bool of_fwnode_device_is_available(struct fwnode_handle *fwnode) +static bool of_fwnode_device_is_available(const struct fwnode_handle *fwnode) { return of_device_is_available(to_of_node(fwnode)); } -static bool of_fwnode_property_present(struct fwnode_handle *fwnode, +static bool of_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return of_property_read_bool(to_of_node(fwnode), propname); } -static int of_fwnode_property_read_int_array(struct fwnode_handle *fwnode, +static int of_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { - struct device_node *node = to_of_node(fwnode); + const struct device_node *node = to_of_node(fwnode); if (!val) return of_property_count_elems_of_size(node, propname, @@ -851,24 +851,26 @@ static int of_fwnode_property_read_int_array(struct fwnode_handle *fwnode, return -ENXIO; } -static int of_fwnode_property_read_string_array(struct fwnode_handle *fwnode, - const char *propname, - const char **val, size_t nval) +static int +of_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, + const char *propname, const char **val, + size_t nval) { - struct device_node *node = to_of_node(fwnode); + const struct device_node *node = to_of_node(fwnode); return val ? 
of_property_read_string_array(node, propname, val, nval) : of_property_count_strings(node, propname); } -static struct fwnode_handle *of_fwnode_get_parent(struct fwnode_handle *fwnode) +static struct fwnode_handle * +of_fwnode_get_parent(const struct fwnode_handle *fwnode) { return of_fwnode_handle(of_get_parent(to_of_node(fwnode))); } static struct fwnode_handle * -of_fwnode_get_next_child_node(struct fwnode_handle *fwnode, +of_fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return of_fwnode_handle(of_get_next_available_child(to_of_node(fwnode), @@ -876,10 +878,10 @@ of_fwnode_get_next_child_node(struct fwnode_handle *fwnode, } static struct fwnode_handle * -of_fwnode_get_named_child_node(struct fwnode_handle *fwnode, +of_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { - struct device_node *node = to_of_node(fwnode); + const struct device_node *node = to_of_node(fwnode); struct device_node *child; for_each_available_child_of_node(node, child) @@ -890,7 +892,7 @@ of_fwnode_get_named_child_node(struct fwnode_handle *fwnode, } static struct fwnode_handle * -of_fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode, +of_fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return of_fwnode_handle(of_graph_get_next_endpoint(to_of_node(fwnode), @@ -898,7 +900,7 @@ of_fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode, } static struct fwnode_handle * -of_fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode) +of_fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return of_fwnode_handle(of_parse_phandle(to_of_node(fwnode), "remote-endpoint", 0)); @@ -921,10 +923,10 @@ of_fwnode_graph_get_port_parent(struct fwnode_handle *fwnode) return of_fwnode_handle(of_get_next_parent(np)); } -static int of_fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, +static int of_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { - struct device_node *node = to_of_node(fwnode); + const struct device_node *node = to_of_node(fwnode); struct device_node *port_node = of_get_parent(node); endpoint->local_fwnode = fwnode; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d9fb610f114..8b9edf87b98d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1029,13 +1029,14 @@ int acpi_node_prop_read(const struct fwnode_handle *fwnode, int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val, size_t nval); -struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, +struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); -struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode); +struct fwnode_handle *acpi_node_get_parent(const struct fwnode_handle *fwnode); -struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, - struct fwnode_handle *prev); -int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, +struct fwnode_handle * +acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, + struct fwnode_handle *prev); +int acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle **remote, struct fwnode_handle **port, struct fwnode_handle **endpoint); @@ -1157,26 +1158,27 @@ static inline int acpi_dev_prop_read(const struct acpi_device *adev, } static inline struct fwnode_handle 
* -acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child) +acpi_get_next_subnode(const struct fwnode_handle *fwnode, + struct fwnode_handle *child) { return NULL; } static inline struct fwnode_handle * -acpi_node_get_parent(struct fwnode_handle *fwnode) +acpi_node_get_parent(const struct fwnode_handle *fwnode) { return NULL; } static inline struct fwnode_handle * -acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, +acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return ERR_PTR(-ENXIO); } static inline int -acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, +acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle **remote, struct fwnode_handle **port, struct fwnode_handle **endpoint) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index c5dbc48b55dd..7b50ee4edcfc 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -55,30 +55,32 @@ struct fwnode_endpoint { struct fwnode_operations { void (*get)(struct fwnode_handle *fwnode); void (*put)(struct fwnode_handle *fwnode); - bool (*device_is_available)(struct fwnode_handle *fwnode); - bool (*property_present)(struct fwnode_handle *fwnode, + bool (*device_is_available)(const struct fwnode_handle *fwnode); + bool (*property_present)(const struct fwnode_handle *fwnode, const char *propname); - int (*property_read_int_array)(struct fwnode_handle *fwnode, + int (*property_read_int_array)(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval); - int (*property_read_string_array)(struct fwnode_handle *fwnode_handle, - const char *propname, - const char **val, size_t nval); - struct fwnode_handle *(*get_parent)(struct fwnode_handle *fwnode); + int + (*property_read_string_array)(const struct fwnode_handle *fwnode_handle, + const char *propname, const char **val, + size_t nval); + struct fwnode_handle *(*get_parent)(const struct fwnode_handle *fwnode); struct fwnode_handle * - (*get_next_child_node)(struct fwnode_handle *fwnode, + (*get_next_child_node)(const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle * - (*get_named_child_node)(struct fwnode_handle *fwnode, const char *name); + (*get_named_child_node)(const struct fwnode_handle *fwnode, + const char *name); struct fwnode_handle * - (*graph_get_next_endpoint)(struct fwnode_handle *fwnode, + (*graph_get_next_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_handle *prev); struct fwnode_handle * - (*graph_get_remote_endpoint)(struct fwnode_handle *fwnode); + (*graph_get_remote_endpoint)(const struct fwnode_handle *fwnode); struct fwnode_handle * (*graph_get_port_parent)(struct fwnode_handle *fwnode); - int (*graph_parse_endpoint)(struct fwnode_handle *fwnode, + int (*graph_parse_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); }; diff --git a/include/linux/property.h b/include/linux/property.h index 7e77039e6b81..edff3f89e755 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -51,46 +51,48 @@ int device_property_read_string(struct device *dev, const char *propname, int device_property_match_string(struct device *dev, const char *propname, const char *string); -bool fwnode_device_is_available(struct fwnode_handle *fwnode); -bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname); -int fwnode_property_read_u8_array(struct fwnode_handle *fwnode, +bool fwnode_device_is_available(const 
struct fwnode_handle *fwnode); +bool fwnode_property_present(const struct fwnode_handle *fwnode, + const char *propname); +int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval); -int fwnode_property_read_u16_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u16_array(const struct fwnode_handle *fwnode, const char *propname, u16 *val, size_t nval); -int fwnode_property_read_u32_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u32_array(const struct fwnode_handle *fwnode, const char *propname, u32 *val, size_t nval); -int fwnode_property_read_u64_array(struct fwnode_handle *fwnode, +int fwnode_property_read_u64_array(const struct fwnode_handle *fwnode, const char *propname, u64 *val, size_t nval); -int fwnode_property_read_string_array(struct fwnode_handle *fwnode, +int fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval); -int fwnode_property_read_string(struct fwnode_handle *fwnode, +int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val); -int fwnode_property_match_string(struct fwnode_handle *fwnode, +int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string); -struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode); -struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); -struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode, - struct fwnode_handle *child); +struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_parent( + struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_child_node( + const struct fwnode_handle *fwnode, struct fwnode_handle *child); #define fwnode_for_each_child_node(fwnode, child) \ for (child = fwnode_get_next_child_node(fwnode, NULL); child; \ child = fwnode_get_next_child_node(fwnode, child)) -struct fwnode_handle *device_get_next_child_node(struct device *dev, - struct fwnode_handle *child); +struct fwnode_handle *device_get_next_child_node( + struct device *dev, struct fwnode_handle *child); #define device_for_each_child_node(dev, child) \ for (child = device_get_next_child_node(dev, NULL); child; \ child = device_get_next_child_node(dev, child)) -struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode, - const char *childname); +struct fwnode_handle *fwnode_get_named_child_node( + const struct fwnode_handle *fwnode, const char *childname); struct fwnode_handle *device_get_named_child_node(struct device *dev, const char *childname); @@ -129,31 +131,31 @@ static inline int device_property_read_u64(struct device *dev, return device_property_read_u64_array(dev, propname, val, 1); } -static inline bool fwnode_property_read_bool(struct fwnode_handle *fwnode, +static inline bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, const char *propname) { return fwnode_property_present(fwnode, propname); } -static inline int fwnode_property_read_u8(struct fwnode_handle *fwnode, +static inline int fwnode_property_read_u8(const struct fwnode_handle *fwnode, const char *propname, u8 *val) { return fwnode_property_read_u8_array(fwnode, propname, val, 1); } -static inline int fwnode_property_read_u16(struct fwnode_handle *fwnode, +static inline int fwnode_property_read_u16(const struct fwnode_handle *fwnode, const char *propname, u16 *val) { return 
fwnode_property_read_u16_array(fwnode, propname, val, 1); } -static inline int fwnode_property_read_u32(struct fwnode_handle *fwnode, +static inline int fwnode_property_read_u32(const struct fwnode_handle *fwnode, const char *propname, u32 *val) { return fwnode_property_read_u32_array(fwnode, propname, val, 1); } -static inline int fwnode_property_read_u64(struct fwnode_handle *fwnode, +static inline int fwnode_property_read_u64(const struct fwnode_handle *fwnode, const char *propname, u64 *val) { return fwnode_property_read_u64_array(fwnode, propname, val, 1); @@ -274,19 +276,20 @@ int device_get_phy_mode(struct device *dev); void *device_get_mac_address(struct device *dev, char *addr, int alen); struct fwnode_handle *fwnode_graph_get_next_endpoint( - struct fwnode_handle *fwnode, struct fwnode_handle *prev); + const struct fwnode_handle *fwnode, struct fwnode_handle *prev); struct fwnode_handle * -fwnode_graph_get_port_parent(struct fwnode_handle *fwnode); +fwnode_graph_get_port_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_port_parent( - struct fwnode_handle *fwnode); + const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_port( - struct fwnode_handle *fwnode); + const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_endpoint( - struct fwnode_handle *fwnode); -struct fwnode_handle *fwnode_graph_get_remote_node(struct fwnode_handle *fwnode, - u32 port, u32 endpoint); + const struct fwnode_handle *fwnode); +struct fwnode_handle * +fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port, + u32 endpoint); -int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, +int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint); #endif /* _LINUX_PROPERTY_H_ */ -- cgit v1.2.3 From 3e3119d3088f41106f3581d39e7694a50ca3fc02 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 21 Jul 2017 15:11:49 +0300 Subject: device property: Introduce fwnode_property_get_reference_args The new fwnode_property_get_reference_args() interface amends the fwnode property API with the functionality of both of_parse_phandle_with_args() and __acpi_node_get_property_reference(). The semantics is slightly different: the cells property is ignored on ACPI as the number of arguments can be explicitly obtained from the firmware interface. Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/property.c | 27 +++++++++++++++++++++++++++ drivers/base/property.c | 28 ++++++++++++++++++++++++++++ drivers/of/property.c | 31 +++++++++++++++++++++++++++++++ include/linux/fwnode.h | 19 +++++++++++++++++++ include/linux/property.h | 4 ++++ 5 files changed, 109 insertions(+) (limited to 'include') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index f8d60051efb8..681a84312dee 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1195,6 +1195,32 @@ acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, return NULL; } +static int +acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, + const char *prop, const char *nargs_prop, + unsigned int args_count, unsigned int index, + struct fwnode_reference_args *args) +{ + struct acpi_reference_args acpi_args; + unsigned int i; + int ret; + + ret = __acpi_node_get_property_reference(fwnode, prop, index, + args_count, &acpi_args); + if (ret < 0) + return ret; + if (!args) + return 0; + + args->nargs = acpi_args.nargs; + args->fwnode = acpi_fwnode_handle(acpi_args.adev); + + for (i = 0; i < NR_FWNODE_REFERENCE_ARGS; i++) + args->args[i] = i < acpi_args.nargs ? acpi_args.args[i] : 0; + + return 0; +} + static struct fwnode_handle * acpi_fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) @@ -1248,6 +1274,7 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ + .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_fwnode_graph_get_next_endpoint, \ .graph_get_remote_endpoint = \ diff --git a/drivers/base/property.c b/drivers/base/property.c index 673e2353a2fb..d0b65bbe7e15 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -665,6 +665,34 @@ out: } EXPORT_SYMBOL_GPL(fwnode_property_match_string); +/** + * fwnode_property_get_reference_args() - Find a reference with arguments + * @fwnode: Firmware node where to look for the reference + * @prop: The name of the property + * @nargs_prop: The name of the property telling the number of + * arguments in the referred node. NULL if @nargs is known, + * otherwise @nargs is ignored. Only relevant on OF. + * @nargs: Number of arguments. Ignored if @nargs_prop is non-NULL. + * @index: Index of the reference, from zero onwards. + * @args: Result structure with reference and integer arguments. + * + * Obtain a reference based on a named property in an fwnode, with + * integer arguments. + * + * Caller is responsible to call fwnode_handle_put() on the returned + * args->fwnode pointer. 
+ * + */ +int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, + const char *prop, const char *nargs_prop, + unsigned int nargs, unsigned int index, + struct fwnode_reference_args *args) +{ + return fwnode_call_int_op(fwnode, get_reference_args, prop, nargs_prop, + nargs, index, args); +} +EXPORT_SYMBOL_GPL(fwnode_property_get_reference_args); + static int property_copy_string_array(struct property_entry *dst, const struct property_entry *src) { diff --git a/drivers/of/property.c b/drivers/of/property.c index ae46a6f0ea36..3868400972b8 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -891,6 +891,36 @@ of_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, return NULL; } +static int +of_fwnode_get_reference_args(const struct fwnode_handle *fwnode, + const char *prop, const char *nargs_prop, + unsigned int nargs, unsigned int index, + struct fwnode_reference_args *args) +{ + struct of_phandle_args of_args; + unsigned int i; + int ret; + + if (nargs_prop) + ret = of_parse_phandle_with_args(to_of_node(fwnode), prop, + nargs_prop, index, &of_args); + else + ret = of_parse_phandle_with_fixed_args(to_of_node(fwnode), prop, + nargs, index, &of_args); + if (ret < 0) + return ret; + if (!args) + return 0; + + args->nargs = of_args.args_count; + args->fwnode = of_fwnode_handle(of_args.np); + + for (i = 0; i < NR_FWNODE_REFERENCE_ARGS; i++) + args->args[i] = i < of_args.args_count ? of_args.args[i] : 0; + + return 0; +} + static struct fwnode_handle * of_fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) @@ -949,6 +979,7 @@ const struct fwnode_operations of_fwnode_ops = { .get_parent = of_fwnode_get_parent, .get_next_child_node = of_fwnode_get_next_child_node, .get_named_child_node = of_fwnode_get_named_child_node, + .get_reference_args = of_fwnode_get_reference_args, .graph_get_next_endpoint = of_fwnode_graph_get_next_endpoint, .graph_get_remote_endpoint = of_fwnode_graph_get_remote_endpoint, .graph_get_port_parent = of_fwnode_graph_get_port_parent, diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 7b50ee4edcfc..0c35b6caf0f6 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -33,6 +33,20 @@ struct fwnode_endpoint { const struct fwnode_handle *local_fwnode; }; +#define NR_FWNODE_REFERENCE_ARGS 8 + +/** + * struct fwnode_reference_args - Fwnode reference with additional arguments + * @fwnode:- A reference to the base fwnode + * @nargs: Number of elements in @args array + * @args: Integer arguments on the fwnode + */ +struct fwnode_reference_args { + struct fwnode_handle *fwnode; + unsigned int nargs; + unsigned int args[NR_FWNODE_REFERENCE_ARGS]; +}; + /** * struct fwnode_operations - Operations for fwnode interface * @get: Get a reference to an fwnode. @@ -46,6 +60,7 @@ struct fwnode_endpoint { * @get_parent: Return the parent of an fwnode. * @get_next_child_node: Return the next child node in an iteration. * @get_named_child_node: Return a child node with a given name. + * @get_reference_args: Return a reference pointed to by a property, with args * @graph_get_next_endpoint: Return an endpoint node in an iteration. * @graph_get_remote_endpoint: Return the remote endpoint node of a local * endpoint node. 
@@ -73,6 +88,10 @@ struct fwnode_operations { struct fwnode_handle * (*get_named_child_node)(const struct fwnode_handle *fwnode, const char *name); + int (*get_reference_args)(const struct fwnode_handle *fwnode, + const char *prop, const char *nargs_prop, + unsigned int nargs, unsigned int index, + struct fwnode_reference_args *args); struct fwnode_handle * (*graph_get_next_endpoint)(const struct fwnode_handle *fwnode, struct fwnode_handle *prev); diff --git a/include/linux/property.h b/include/linux/property.h index edff3f89e755..6bebee13c5e0 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -73,6 +73,10 @@ int fwnode_property_read_string(const struct fwnode_handle *fwnode, const char *propname, const char **val); int fwnode_property_match_string(const struct fwnode_handle *fwnode, const char *propname, const char *string); +int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, + const char *prop, const char *nargs_prop, + unsigned int nargs, unsigned int index, + struct fwnode_reference_args *args); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent( -- cgit v1.2.3 From 2d045036322c29b69c22f06530f1130338d06373 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Jul 2017 15:42:41 +0530 Subject: cpufreq: governor: Drop min_sampling_rate The cpufreq core and governors aren't supposed to set a limit on how fast we want to try changing the frequency. This is currently done for the legacy governors with the help of min_sampling_rate. At worst, we may end up setting the sampling rate to a value lower than the rate at which the frequency can be changed, and then one of the CPUs in the policy will only be changing the frequency forever. But that is something for the user to decide and there is no need to have special handling for such cases in the core. Leave it for the user to figure out. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/cpufreq.rst | 8 -------- drivers/cpufreq/cpufreq_conservative.c | 6 ------ drivers/cpufreq/cpufreq_governor.c | 10 ++-------- drivers/cpufreq/cpufreq_governor.h | 1 - drivers/cpufreq/cpufreq_ondemand.c | 12 ------------ include/linux/cpufreq.h | 2 -- 6 files changed, 2 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index 463cf7e73db8..2eb3bf62393e 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -471,14 +471,6 @@ This governor exposes the following tunables: # echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000))` > ondemand/sampling_rate - -``min_sampling_rate`` - The minimum value of ``sampling_rate``. - - Equal to 10000 (10 ms) if :c:macro:`CONFIG_NO_HZ_COMMON` and - :c:data:`tick_nohz_active` are both set or to 20 times the value of - :c:data:`jiffies` in microseconds otherwise. - ``up_threshold`` If the estimated CPU load is above this value (in percent), the governor will set the frequency to the maximum value allowed for the policy.
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 88220ff3e1c2..f20f20a77d4d 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -246,7 +246,6 @@ gov_show_one_common(sampling_rate); gov_show_one_common(sampling_down_factor); gov_show_one_common(up_threshold); gov_show_one_common(ignore_nice_load); -gov_show_one_common(min_sampling_rate); gov_show_one(cs, down_threshold); gov_show_one(cs, freq_step); @@ -254,12 +253,10 @@ gov_attr_rw(sampling_rate); gov_attr_rw(sampling_down_factor); gov_attr_rw(up_threshold); gov_attr_rw(ignore_nice_load); -gov_attr_ro(min_sampling_rate); gov_attr_rw(down_threshold); gov_attr_rw(freq_step); static struct attribute *cs_attributes[] = { - &min_sampling_rate.attr, &sampling_rate.attr, &sampling_down_factor.attr, &up_threshold.attr, @@ -297,10 +294,7 @@ static int cs_init(struct dbs_data *dbs_data) dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; dbs_data->ignore_nice_load = 0; - dbs_data->tuners = tuners; - dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * - jiffies_to_usecs(10); return 0; } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 47e24b5384b3..858081f9c3d7 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -47,14 +47,11 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, { struct dbs_data *dbs_data = to_dbs_data(attr_set); struct policy_dbs_info *policy_dbs; - unsigned int rate; int ret; - ret = sscanf(buf, "%u", &rate); + ret = sscanf(buf, "%u", &dbs_data->sampling_rate); if (ret != 1) return -EINVAL; - dbs_data->sampling_rate = max(rate, dbs_data->min_sampling_rate); - /* * We are operating under dbs_data->mutex and so the list and its * entries can't be freed concurrently. 
@@ -437,10 +434,7 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) latency = 1; /* Bring kernel and HW constraints together */ - dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, - MIN_LATENCY_MULTIPLIER * latency); - dbs_data->sampling_rate = max(dbs_data->min_sampling_rate, - LATENCY_MULTIPLIER * latency); + dbs_data->sampling_rate = LATENCY_MULTIPLIER * latency; if (!have_governor_per_policy()) gov->gdbs_data = dbs_data; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 0236ec2cd654..95f207eb820e 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -41,7 +41,6 @@ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; struct dbs_data { struct gov_attr_set attr_set; void *tuners; - unsigned int min_sampling_rate; unsigned int ignore_nice_load; unsigned int sampling_rate; unsigned int sampling_down_factor; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 3937acf7e026..6b423eebfd5d 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -319,7 +319,6 @@ gov_show_one_common(sampling_rate); gov_show_one_common(up_threshold); gov_show_one_common(sampling_down_factor); gov_show_one_common(ignore_nice_load); -gov_show_one_common(min_sampling_rate); gov_show_one_common(io_is_busy); gov_show_one(od, powersave_bias); @@ -329,10 +328,8 @@ gov_attr_rw(up_threshold); gov_attr_rw(sampling_down_factor); gov_attr_rw(ignore_nice_load); gov_attr_rw(powersave_bias); -gov_attr_ro(min_sampling_rate); static struct attribute *od_attributes[] = { - &min_sampling_rate.attr, &sampling_rate.attr, &up_threshold.attr, &sampling_down_factor.attr, @@ -373,17 +370,8 @@ static int od_init(struct dbs_data *dbs_data) if (idle_time != -1ULL) { /* Idle micro accounting is supported. Use finer thresholds */ dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; - /* - * In nohz/micro accounting case we set the minimum frequency - * not depending on HZ, but fixed (very low). - */ - dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; } else { dbs_data->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; - - /* For correct statistics, we need 10 ticks for each measure */ - dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * - jiffies_to_usecs(10); } dbs_data->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index f10a9b3761cd..02aec384cab9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -491,9 +491,7 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, * For CPUs with transition latency > 10ms (mostly drivers with CPUFREQ_ETERNAL) * the ondemand governor will not work. All times here are in us (microseconds). */ -#define MIN_SAMPLING_RATE_RATIO (2) #define LATENCY_MULTIPLIER (1000) -#define MIN_LATENCY_MULTIPLIER (20) #define TRANSITION_LATENCY_LIMIT (10 * 1000 * 1000) struct cpufreq_governor { -- cgit v1.2.3 From aa7519af450d3c62a057aece24877c34562fa25a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Jul 2017 15:42:42 +0530 Subject: cpufreq: Use transition_delay_us for legacy governors as well The policy->transition_delay_us field is used only by the schedutil governor currently, and this field describes how fast the driver wants the cpufreq governor to change CPUs frequency. It should rather be a common thing across all governors, as it doesn't have any schedutil dependency here. 
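To make the shared fallback concrete before the diff: the new helper prefers a driver-supplied transition_delay_us, then derives a delay from cpuinfo.transition_latency, and finally falls back to a bare default. The following is a condensed sketch of the logic, mirroring the cpufreq.c hunk below rather than replacing it:

    unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy)
    {
            unsigned int latency;

            if (policy->transition_delay_us)        /* driver-provided value wins */
                    return policy->transition_delay_us;

            /* transition_latency is in ns; scale to us, then apply the multiplier */
            latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
            if (latency)
                    return latency * LATENCY_MULTIPLIER;

            return LATENCY_MULTIPLIER;              /* default, 1000 us */
    }

For instance, a driver reporting 20000 ns (20 us) of transition latency ends up with 20 * 1000 = 20000 us between frequency updates.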
Create a new helper cpufreq_policy_transition_delay_us() to get the transition delay across all governors. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 15 +++++++++++++++ drivers/cpufreq/cpufreq_governor.c | 9 +-------- include/linux/cpufreq.h | 1 + kernel/sched/cpufreq_schedutil.c | 11 +---------- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 9bf97a366029..c426d21822f7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -524,6 +524,21 @@ unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_resolve_freq); +unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) +{ + unsigned int latency; + + if (policy->transition_delay_us) + return policy->transition_delay_us; + + latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (latency) + return latency * LATENCY_MULTIPLIER; + + return LATENCY_MULTIPLIER; +} +EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); + /********************************************************************* * SYSFS INTERFACE * *********************************************************************/ diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 858081f9c3d7..eed069ecfd5e 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -389,7 +389,6 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) struct dbs_governor *gov = dbs_governor_of(policy); struct dbs_data *dbs_data; struct policy_dbs_info *policy_dbs; - unsigned int latency; int ret = 0; /* State should be equivalent to EXIT */ @@ -428,13 +427,7 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy) if (ret) goto free_policy_dbs_info; - /* policy latency is in ns. 
Convert it to us first */ - latency = policy->cpuinfo.transition_latency / 1000; - if (latency == 0) - latency = 1; - - /* Bring kernel and HW constraints together */ - dbs_data->sampling_rate = LATENCY_MULTIPLIER * latency; + dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy); if (!have_governor_per_policy()) gov->gdbs_data = dbs_data; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 02aec384cab9..aaadfc543f63 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -523,6 +523,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int relation); unsigned int cpufreq_driver_resolve_freq(struct cpufreq_policy *policy, unsigned int target_freq); +unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy); int cpufreq_register_governor(struct cpufreq_governor *governor); void cpufreq_unregister_governor(struct cpufreq_governor *governor); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 29a397067ffa..89c4dd9777bb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -528,16 +528,7 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - if (policy->transition_delay_us) { - tunables->rate_limit_us = policy->transition_delay_us; - } else { - unsigned int lat; - - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; - } + tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); policy->governor_data = sg_policy; sg_policy->tunables = tunables; -- cgit v1.2.3 From bd8c9ba3b1e2037af5af4e48aea1087212838179 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 17 Jul 2017 17:19:25 -0700 Subject: PM / suspend: Export pm_suspend_target_state Have the core suspend/resume framework store the system-wide suspend state (suspend_state_t) we are about to enter, and expose it to drivers via pm_suspend_target_state so they can retrieve it. The state is assigned in suspend_devices_and_enter(). This is useful for platform-specific drivers that may need to take a slightly different suspend/resume path based on the system's suspend/resume state being entered. Signed-off-by: Florian Fainelli Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- include/linux/suspend.h | 1 + kernel/power/suspend.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0b1cf32edfd7..2159f6841768 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -427,6 +427,7 @@ extern int unregister_pm_notifier(struct notifier_block *nb); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; extern unsigned int pm_wakeup_irq; +extern suspend_state_t pm_suspend_target_state; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 3ecf275d7e44..d0c0b96c2383 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -47,6 +47,8 @@ const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; +suspend_state_t pm_suspend_target_state; +EXPORT_SYMBOL_GPL(pm_suspend_target_state); unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -456,6 +458,8 @@ int suspend_devices_and_enter(suspend_state_t state) if (!sleep_state_supported(state)) return -ENOSYS; + pm_suspend_target_state = state; + error = platform_suspend_begin(state); if (error) goto Close; @@ -485,6 +489,7 @@ int suspend_devices_and_enter(suspend_state_t state) Close: platform_resume_end(state); + pm_suspend_target_state = PM_SUSPEND_ON; return error; Recover_platform: -- cgit v1.2.3 From 8d8b2441db9647890251538f60b75a4e45fdef8d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 19 Jul 2017 02:38:44 +0200 Subject: PM / sleep: Do not print debug messages by default Debug messages from the system suspend/hibernation infrastructure can fill up the entire kernel log buffer in some cases and anyway they are only useful for debugging. They depend on CONFIG_PM_DEBUG, but that is set as a rule as some generally useful diagnostic facilities depend on it too. For this reason, avoid printing those messages by default, but make it possible to turn them on as needed with the help of a new sysfs attribute under /sys/power/. Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 12 ++++++++ drivers/base/power/main.c | 12 +++----- include/linux/suspend.h | 6 ++++ kernel/power/hibernate.c | 24 ++++++++-------- kernel/power/main.c | 52 +++++++++++++++++++++++++++++++++++ kernel/power/suspend.c | 10 +++---- 6 files changed, 91 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index f523e5a3ac33..713cab1d5f12 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -273,3 +273,15 @@ Description: This output is useful for system wakeup diagnostics of spurious wakeup interrupts. + +What: /sys/power/pm_debug_messages +Date: July 2017 +Contact: Rafael J. Wysocki +Description: + The /sys/power/pm_debug_messages file controls the printing + of debug messages from the system suspend/hibernation + infrastructure to the kernel log. + + Writing a "1" to this file enables the debug messages and + writing a "0" (default) to it disables them. Reads from + this file return the current value.
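A minimal usage sketch of the new gated logging (illustrative only; pm_pr_dbg() is defined later in this patch and is aimed at the suspend/hibernation core, and the caller below is invented):

    /* Messages reach the log only after: echo 1 > /sys/power/pm_debug_messages */
    static int example_prepare_step(suspend_state_t state)
    {
            pm_pr_dbg("preparing to enter sleep state %d\n", state);
            return 0;
    }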
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index c99f8730de82..4f5fc212d684 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -418,7 +418,6 @@ static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, dev_name(dev), pm_verb(state.event), info, error); } -#ifdef CONFIG_PM_DEBUG static void dpm_show_time(ktime_t starttime, pm_message_t state, const char *info) { @@ -432,14 +431,11 @@ static void dpm_show_time(ktime_t starttime, pm_message_t state, usecs = usecs64; if (usecs == 0) usecs = 1; - pr_info("PM: %s%s%s of devices complete after %ld.%03ld msecs\n", - info ?: "", info ? " " : "", pm_verb(state.event), - usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); + + pm_pr_dbg("%s%s%s of devices complete after %ld.%03ld msecs\n", + info ?: "", info ? " " : "", pm_verb(state.event), + usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } -#else -static inline void dpm_show_time(ktime_t starttime, pm_message_t state, - const char *info) {} -#endif /* CONFIG_PM_DEBUG */ static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2159f6841768..707e4cdf21c2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -492,8 +492,14 @@ static inline void unlock_system_sleep(void) {} #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; +extern __printf(1, 2) void pm_pr_dbg(const char *fmt, ...); #else #define pm_print_times_enabled (false) + +#include + +#define pm_pr_dbg(fmt, ...) \ + no_printk(KERN_DEBUG fmt, ##__VA_ARGS__) #endif #ifdef CONFIG_PM_AUTOSLEEP diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e1914c7b85b1..e19ee179d211 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -651,7 +651,7 @@ static int load_image_and_restore(void) int error; unsigned int flags; - pr_debug("Loading hibernation image.\n"); + pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); @@ -681,7 +681,7 @@ int hibernate(void) bool snapshot_test = false; if (!hibernation_available()) { - pr_debug("Hibernation not available.\n"); + pm_pr_dbg("Hibernation not available.\n"); return -EPERM; } @@ -727,7 +727,7 @@ int hibernate(void) else flags |= SF_CRC32_MODE; - pr_debug("Writing image.\n"); + pm_pr_dbg("Writing image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { @@ -739,7 +739,7 @@ int hibernate(void) in_suspend = 0; pm_restore_gfp_mask(); } else { - pr_debug("Image restored successfully.\n"); + pm_pr_dbg("Image restored successfully.\n"); } Free_bitmaps: @@ -747,7 +747,7 @@ int hibernate(void) Thaw: unlock_device_hotplug(); if (snapshot_test) { - pr_debug("Checking hibernation image\n"); + pm_pr_dbg("Checking hibernation image\n"); error = swsusp_check(); if (!error) error = load_image_and_restore(); @@ -811,7 +811,7 @@ static int software_resume(void) goto Unlock; } - pr_debug("Checking hibernation image partition %s\n", resume_file); + pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); if (resume_delay) { pr_info("Waiting %dsec before reading resume device ...\n", @@ -853,10 +853,10 @@ static int software_resume(void) } Check_image: - pr_debug("Hibernation image partition %d:%d present\n", + pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("Looking for hibernation image.\n"); + pm_pr_dbg("Looking for hibernation 
image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -875,7 +875,7 @@ static int software_resume(void) goto Close_Finish; } - pr_debug("Preparing processes for restore.\n"); + pm_pr_dbg("Preparing processes for restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; @@ -888,7 +888,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("Hibernation image not present or could not be loaded.\n"); + pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: swsusp_close(FMODE_READ); @@ -1012,8 +1012,8 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, error = -EINVAL; if (!error) - pr_debug("Hibernation mode set to '%s'\n", - hibernation_modes[mode]); + pm_pr_dbg("Hibernation mode set to '%s'\n", + hibernation_modes[mode]); unlock_system_sleep(); return error ? error : n; } diff --git a/kernel/power/main.c b/kernel/power/main.c index 42bd800a6755..5ce00902c7e3 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -361,6 +361,57 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, power_attr_ro(pm_wakeup_irq); +static bool pm_debug_messages_on __read_mostly; + +static ssize_t pm_debug_messages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", pm_debug_messages_on); +} + +static ssize_t pm_debug_messages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_debug_messages_on = !!val; + return n; +} + +power_attr(pm_debug_messages); + +/** + * pm_pr_dbg - Print a suspend debug message to the kernel log. + * @fmt: Message format. + * + * The message will be emitted if enabled through the pm_debug_messages + * sysfs attribute. + */ +void pm_pr_dbg(const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + + if (!pm_debug_messages_on) + return; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_DEBUG "PM: %pV", &vaf); + + va_end(args); +} + #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ @@ -697,6 +748,7 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP_DEBUG &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, + &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d0c0b96c2383..4f10773322fa 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -106,7 +106,7 @@ static void freeze_enter(void) static void s2idle_loop(void) { - pr_debug("PM: suspend-to-idle\n"); + pm_pr_dbg("suspend-to-idle\n"); do { freeze_enter(); @@ -124,7 +124,7 @@ static void s2idle_loop(void) pm_wakeup_clear(false); } while (!dpm_suspend_noirq(PMSG_SUSPEND)); - pr_debug("PM: resume from suspend-to-idle\n"); + pm_pr_dbg("resume from suspend-to-idle\n"); } void freeze_wake(void) @@ -547,7 +547,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif - pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); + pm_pr_dbg("Preparing system for sleep (%s)\n", pm_states[state]); pm_suspend_clear_flags(); error = suspend_prepare(state); if (error) @@ -557,13 +557,13 @@ static int enter_state(suspend_state_t state) goto Finish; trace_suspend_resume(TPS("suspend_enter"), state, false); - pr_debug("PM: Suspending system (%s)\n", pm_states[state]); + pm_pr_dbg("Suspending system (%s)\n", pm_states[state]); pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); pm_restore_gfp_mask(); Finish: - pr_debug("PM: Finishing wakeup.\n"); + pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: mutex_unlock(&pm_mutex); -- cgit v1.2.3 From 1455cf8dbfd06aa7651dcfccbadb7a093944ca65 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 19 Jul 2017 17:24:30 -0700 Subject: driver core: emit uevents when device is bound to a driver There are certain touch controllers that may come up in either normal (application) or boot mode, depending on whether firmware/configuration is corrupted when they are powered on. In boot mode the kernel does not create an input device instance (because it does not necessarily know the characteristics of the input device in question). A number of other controllers do not store firmware in non-volatile memory, and they similarly need to have firmware loaded before an input device instance is created. There are also other types of devices with similar behavior. There is a desire to be able to trigger firmware loading via udev, but it has to happen only when a driver is bound to a physical device (i2c or spi). These udev actions cannot use ADD events, as those happen too early, so we are introducing BIND and UNBIND events that are emitted at the right moment. Also, many drivers create additional driver-specific device attributes when binding to the device, to provide userspace with additional controls. The new events allow userspace to adjust these driver-specific attributes without worrying that they are not there yet.
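As an illustration of the intended userspace side, a rule along the following lines could trigger firmware loading at bind time (this is hypothetical udev configuration, not part of this patch, and the driver name and helper path are invented):

    ACTION=="bind", SUBSYSTEM=="i2c", DRIVER=="example-touch", RUN+="/usr/lib/example/load-touch-fw.sh"

With ADD this would fire before the driver is attached; with the new BIND event it fires exactly when the device-driver binding is in place.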
Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 4 ++++ include/linux/kobject.h | 2 ++ lib/kobject_uevent.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 4882f06d12df..c17fefc77345 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -259,6 +259,8 @@ static void driver_bound(struct device *dev) if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_BOUND_DRIVER, dev); + + kobject_uevent(&dev->kobj, KOBJ_BIND); } static int driver_sysfs_add(struct device *dev) @@ -848,6 +850,8 @@ static void __device_release_driver(struct device *dev, struct device *parent) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_UNBOUND_DRIVER, dev); + + kobject_uevent(&dev->kobj, KOBJ_UNBIND); } } diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..12f5ddccb97c 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -57,6 +57,8 @@ enum kobject_action { KOBJ_MOVE, KOBJ_ONLINE, KOBJ_OFFLINE, + KOBJ_BIND, + KOBJ_UNBIND, KOBJ_MAX }; diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9a2b811966eb..4682e8545b5c 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -50,6 +50,8 @@ static const char *kobject_actions[] = { [KOBJ_MOVE] = "move", [KOBJ_ONLINE] = "online", [KOBJ_OFFLINE] = "offline", + [KOBJ_BIND] = "bind", + [KOBJ_UNBIND] = "unbind", }; /** -- cgit v1.2.3 From a7670d425b75f9e44b7d4d0aea04f4a6d5f34291 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 19 Jul 2017 17:24:31 -0700 Subject: driver core: make device_{add|remove}_groups() public Many drivers create additional driver-specific device attributes when binding to the device. To avoid having them call the sysfs API directly, let's export these helpers.
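A minimal usage sketch of the now-exported helpers (driver and group names are invented for illustration; foo_group is assumed to be a populated struct attribute_group):

    /* Publish extra sysfs groups for as long as the driver is bound. */
    static const struct attribute_group *foo_groups[] = { &foo_group, NULL };

    static int foo_probe(struct device *dev)
    {
            return device_add_groups(dev, foo_groups);
    }

    static void foo_remove(struct device *dev)
    {
            device_remove_groups(dev, foo_groups);
    }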
Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 5 ----- drivers/base/core.c | 2 ++ include/linux/device.h | 5 +++++ 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/base/base.h b/drivers/base/base.h index e19b1008e5fb..539432a14b5c 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -126,11 +126,6 @@ extern int driver_add_groups(struct device_driver *drv, extern void driver_remove_groups(struct device_driver *drv, const struct attribute_group **groups); -extern int device_add_groups(struct device *dev, - const struct attribute_group **groups); -extern void device_remove_groups(struct device *dev, - const struct attribute_group **groups); - extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); diff --git a/drivers/base/core.c b/drivers/base/core.c index bbecaf9293be..14f8cf5c8b05 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1026,12 +1026,14 @@ int device_add_groups(struct device *dev, const struct attribute_group **groups) { return sysfs_create_groups(&dev->kobj, groups); } +EXPORT_SYMBOL_GPL(device_add_groups); void device_remove_groups(struct device *dev, const struct attribute_group **groups) { sysfs_remove_groups(&dev->kobj, groups); } +EXPORT_SYMBOL_GPL(device_remove_groups); static int device_add_attrs(struct device *dev) { diff --git a/include/linux/device.h b/include/linux/device.h index 9ef518af5515..10cf209a4e82 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1200,6 +1200,11 @@ struct device *device_create_with_groups(struct class *cls, const char *fmt, ...); extern void device_destroy(struct class *cls, dev_t devt); +extern int __must_check device_add_groups(struct device *dev, + const struct attribute_group **groups); +extern void device_remove_groups(struct device *dev, + const struct attribute_group **groups); + /* * Platform "fixup" functions - allow the platform to have their say * about devices and actions that the general device layer doesn't -- cgit v1.2.3 From e323b2dddc1ce7ea7f032c986c012a04d5977dc8 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 19 Jul 2017 17:24:32 -0700 Subject: driver core: add device_{add|remove}_group() helpers We have helpers that work with NULL terminated array of groups, but many drivers only create a single supplemental group, and do not want to declare a group array. Let's provide them with helpers working with a single group. 
Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 10cf209a4e82..7698a513b35e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1205,6 +1205,22 @@ extern int __must_check device_add_groups(struct device *dev, extern void device_remove_groups(struct device *dev, const struct attribute_group **groups); +static inline int __must_check device_add_group(struct device *dev, + const struct attribute_group *grp) +{ + const struct attribute_group *groups[] = { grp, NULL }; + + return device_add_groups(dev, groups); +} + +static inline void device_remove_group(struct device *dev, + const struct attribute_group *grp) +{ + const struct attribute_group *groups[] = { grp, NULL }; + + return device_remove_groups(dev, groups); +} + /* * Platform "fixup" functions - allow the platform to have their say * about devices and actions that the general device layer doesn't -- cgit v1.2.3 From 57b8ff070f9897b22e4f80fda775a489fc797008 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 19 Jul 2017 17:24:33 -0700 Subject: driver core: add devm_device_add_group() and friends Many drivers create additional driver-specific device attributes when binding to the device, and providing a managed version of device_add_group() will simplify unbinding and error handling in the probe path for such drivers. Without a managed version, driver writers either have to mix manual and managed resources, which is prone to errors, or open-code it by providing a wrapper around device_add_group() and using it with devm_add_action() or devm_add_action_or_reset(). Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 9 ++++ 2 files changed, 139 insertions(+) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index 14f8cf5c8b05..09723532725d 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1035,6 +1035,136 @@ void device_remove_groups(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_groups); +union device_attr_group_devres { + const struct attribute_group *group; + const struct attribute_group **groups; +}; + +static int devm_attr_group_match(struct device *dev, void *res, void *data) +{ + return ((union device_attr_group_devres *)res)->group == data; +} + +static void devm_attr_group_remove(struct device *dev, void *res) +{ + union device_attr_group_devres *devres = res; + const struct attribute_group *group = devres->group; + + dev_dbg(dev, "%s: removing group %p\n", __func__, group); + sysfs_remove_group(&dev->kobj, group); +} + +static void devm_attr_groups_remove(struct device *dev, void *res) +{ + union device_attr_group_devres *devres = res; + const struct attribute_group **groups = devres->groups; + + dev_dbg(dev, "%s: removing groups %p\n", __func__, groups); + sysfs_remove_groups(&dev->kobj, groups); +} + +/** + * devm_device_add_group - given a device, create a managed attribute group + * @dev: The device to create the group for + * @grp: The attribute group to create + * + * This function creates a group for the first time. It will explicitly + * warn and error if any of the attribute files being created already exist. + * + * Returns 0 on success or error code on failure.
+ */ +int devm_device_add_group(struct device *dev, const struct attribute_group *grp) +{ + union device_attr_group_devres *devres; + int error; + + devres = devres_alloc(devm_attr_group_remove, + sizeof(*devres), GFP_KERNEL); + if (!devres) + return -ENOMEM; + + error = sysfs_create_group(&dev->kobj, grp); + if (error) { + devres_free(devres); + return error; + } + + devres->group = grp; + devres_add(dev, devres); + return 0; +} +EXPORT_SYMBOL_GPL(devm_device_add_group); + +/** + * devm_device_remove_group: remove a managed group from a device + * @dev: device to remove the group from + * @grp: group to remove + * + * This function removes a group of attributes from a device. The attributes + * previously have to have been created for this group, otherwise it will fail. + */ +void devm_device_remove_group(struct device *dev, + const struct attribute_group *grp) +{ + WARN_ON(devres_release(dev, devm_attr_group_remove, + devm_attr_group_match, + /* cast away const */ (void *)grp)); +} +EXPORT_SYMBOL_GPL(devm_device_remove_group); + +/** + * devm_device_add_groups - create a bunch of managed attribute groups + * @dev: The device to create the group for + * @groups: The attribute groups to create, NULL terminated + * + * This function creates a bunch of managed attribute groups. If an error + * occurs when creating a group, all previously created groups will be + * removed, unwinding everything back to the original state when this + * function was called. It will explicitly warn and error if any of the + * attribute files being created already exist. + * + * Returns 0 on success or error code from sysfs_create_group on failure. + */ +int devm_device_add_groups(struct device *dev, + const struct attribute_group **groups) +{ + union device_attr_group_devres *devres; + int error; + + devres = devres_alloc(devm_attr_groups_remove, + sizeof(*devres), GFP_KERNEL); + if (!devres) + return -ENOMEM; + + error = sysfs_create_groups(&dev->kobj, groups); + if (error) { + devres_free(devres); + return error; + } + + devres->groups = groups; + devres_add(dev, devres); + return 0; +} +EXPORT_SYMBOL_GPL(devm_device_add_groups); + +/** + * devm_device_remove_groups - remove a list of managed groups + * + * @dev: The device for the groups to be removed from + * @groups: NULL terminated list of groups to be removed + * + * If groups is not NULL, remove the specified groups from the device. 
+ */
+void devm_device_remove_groups(struct device *dev,
+			       const struct attribute_group **groups)
+{
+	WARN_ON(devres_release(dev, devm_attr_groups_remove,
+			       devm_attr_group_match,
+			       /* cast away const */ (void *)groups));
+}
+EXPORT_SYMBOL_GPL(devm_device_remove_groups);
+
 static int device_add_attrs(struct device *dev)
 {
 	struct class *class = dev->class;
diff --git a/include/linux/device.h b/include/linux/device.h
index 7698a513b35e..f52288c24734 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1221,6 +1221,15 @@ static inline void device_remove_group(struct device *dev,
 	return device_remove_groups(dev, groups);
 }
 
+extern int __must_check devm_device_add_groups(struct device *dev,
+					const struct attribute_group **groups);
+extern void devm_device_remove_groups(struct device *dev,
+				      const struct attribute_group **groups);
+extern int __must_check devm_device_add_group(struct device *dev,
+					const struct attribute_group *grp);
+extern void devm_device_remove_group(struct device *dev,
+				     const struct attribute_group *grp);
+
 /*
  * Platform "fixup" functions - allow the platform to have their say
  * about devices and actions that the general device layer doesn't
-- cgit v1.2.3

From cb08e0353c249a27aed10c6f60a13871ae449d33 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Sun, 23 Jul 2017 00:03:43 +0200
Subject: PM / timekeeping: Print debug messages when requested

The messages printed by tk_debug_account_sleep_time() are basically
useful for system sleep debugging, so print them only when the other
debug messages from the core suspend/hibernate code are enabled.

While at it, make it clear that the messages from
tk_debug_account_sleep_time() are about timekeeping suspend duration,
because in general timekeeping may be suspended and resumed multiple
times during one system suspend-resume cycle.

Signed-off-by: Rafael J. Wysocki
---
 include/linux/suspend.h         | 10 ++++++++--
 kernel/power/main.c             | 10 +++++++---
 kernel/time/timekeeping_debug.c |  5 +++--
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 707e4cdf21c2..97e394feabdb 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -492,16 +492,22 @@ static inline void unlock_system_sleep(void) {}
 
 #ifdef CONFIG_PM_SLEEP_DEBUG
 extern bool pm_print_times_enabled;
-extern __printf(1, 2) void pm_pr_dbg(const char *fmt, ...);
+extern __printf(2, 3) void __pm_pr_dbg(bool defer, const char *fmt, ...);
 #else
 #define pm_print_times_enabled	(false)
 
 #include <linux/printk.h>
 
-#define pm_pr_dbg(fmt, ...) \
+#define __pm_pr_dbg(defer, fmt, ...) \
 	no_printk(KERN_DEBUG fmt, ##__VA_ARGS__)
 #endif
 
+#define pm_pr_dbg(fmt, ...) \
+	__pm_pr_dbg(false, fmt, ##__VA_ARGS__)
+
+#define pm_deferred_pr_dbg(fmt, ...) \
+	__pm_pr_dbg(true, fmt, ##__VA_ARGS__)
+
 #ifdef CONFIG_PM_AUTOSLEEP
 
 /* kernel/power/autosleep.c */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 5ce00902c7e3..b7876eaf83f3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -388,13 +388,14 @@ static ssize_t pm_debug_messages_store(struct kobject *kobj,
 
 power_attr(pm_debug_messages);
 
 /**
- * pm_pr_dbg - Print a suspend debug message to the kernel log.
+ * __pm_pr_dbg - Print a suspend debug message to the kernel log.
+ * @defer: Whether or not to use printk_deferred() to print the message.
  * @fmt: Message format.
  *
  * The message will be emitted if enabled through the pm_debug_messages
  * sysfs attribute.
  */
-void pm_pr_dbg(const char *fmt, ...)
+void __pm_pr_dbg(bool defer, const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
@@ -407,7 +408,10 @@ void pm_pr_dbg(const char *fmt, ...)
 	vaf.fmt = fmt;
 	vaf.va = &args;
 
-	printk(KERN_DEBUG "PM: %pV", &vaf);
+	if (defer)
+		printk_deferred(KERN_DEBUG "PM: %pV", &vaf);
+	else
+		printk(KERN_DEBUG "PM: %pV", &vaf);
 
 	va_end(args);
 }
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 38bc4d2208e8..0754cadfa9e6 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <linux/suspend.h>
 #include
 
 #include "timekeeping_internal.h"
@@ -75,7 +76,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
 	int bin = min(fls(t->tv_sec), NUM_BINS-1);
 
 	sleep_time_bin[bin]++;
-	printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
-			(s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
+	pm_deferred_pr_dbg("Timekeeping suspended for %lld.%03lu seconds\n",
+			   (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
 }
-- cgit v1.2.3

From c2327da06b33d8e1093ce2c28f395bc500d1b0d3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 30 Jun 2017 20:09:32 +0300
Subject: x86/io: Remove mem*io() duplications

The generic header defines memset_io(), memcpy_fromio() and
memcpy_toio(). Reuse them from the generic header and remove the x86
copies. Move the descriptions to the generic header as well.

Signed-off-by: Andy Shevchenko
Cc: Baolin Wang
Cc: Linus Torvalds
Cc: Mika Westerberg
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-i2c@vger.kernel.org
Cc: wsa@the-dreams.de
Link: http://lkml.kernel.org/r/20170630170934.83028-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/io.h | 45 ---------------------------------------------
 include/asm-generic/io.h  | 24 ++++++++++++++++++++++++
 2 files changed, 24 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 0558d81c177e..252434b00fdb 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -220,51 +220,6 @@ extern void set_iounmap_nonlazy(void);
  */
 #define xlate_dev_kmem_ptr(p)	p
 
-/**
- * memset_io	Set a range of I/O memory to a constant value
- * @addr:	The beginning of the I/O-memory range to set
- * @val:	The value to set the memory to
- * @count:	The number of bytes to set
- *
- * Set a range of I/O memory to a given value.
- */
-static inline void
-memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
-{
-	memset((void __force *)addr, val, count);
-}
-#define memset_io(dst,c,count) memset_io(dst,c,count)
-
-/**
- * memcpy_fromio	Copy a block of data from I/O memory
- * @dst:	The (RAM) destination for the copy
- * @src:	The (I/O memory) source for the data
- * @count:	The number of bytes to copy
- *
- * Copy a block of data from I/O memory.
- */
-static inline void
-memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
-{
-	memcpy(dst, (const void __force *)src, count);
-}
-#define memcpy_fromio(to,from,count) memcpy_fromio(to,from,count)
-
-/**
- * memcpy_toio	Copy a block of data into I/O memory
- * @dst:	The (I/O memory) destination for the copy
- * @src:	The (RAM) source for the data
- * @count:	The number of bytes to copy
- *
- * Copy a block of data to I/O memory.
- */
-static inline void
-memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
-{
-	memcpy((void __force *)dst, src, count);
-}
-#define memcpy_toio(to,from,count) memcpy_toio(to,from,count)
-
 /*
  * ISA space is 'always mapped' on a typical x86 system, no need to
  * explicitly ioremap() it. The fact that the ISA IO space is mapped
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ef015eb3403..395afc829409 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -954,6 +954,14 @@ static inline void *bus_to_virt(unsigned long address)
 
 #ifndef memset_io
 #define memset_io memset_io
+/**
+ * memset_io	Set a range of I/O memory to a constant value
+ * @addr:	The beginning of the I/O-memory range to set
+ * @val:	The value to set the memory to
+ * @count:	The number of bytes to set
+ *
+ * Set a range of I/O memory to a given value.
+ */
 static inline void memset_io(volatile void __iomem *addr, int value,
 			     size_t size)
 {
@@ -963,6 +971,14 @@ static inline void memset_io(volatile void __iomem *addr, int value,
 
 #ifndef memcpy_fromio
 #define memcpy_fromio memcpy_fromio
+/**
+ * memcpy_fromio	Copy a block of data from I/O memory
+ * @dst:	The (RAM) destination for the copy
+ * @src:	The (I/O memory) source for the data
+ * @count:	The number of bytes to copy
+ *
+ * Copy a block of data from I/O memory.
+ */
 static inline void memcpy_fromio(void *buffer,
 				 const volatile void __iomem *addr,
 				 size_t size)
@@ -973,6 +989,14 @@ static inline void memcpy_fromio(void *buffer,
 
 #ifndef memcpy_toio
 #define memcpy_toio memcpy_toio
+/**
+ * memcpy_toio	Copy a block of data into I/O memory
+ * @dst:	The (I/O memory) destination for the copy
+ * @src:	The (RAM) source for the data
+ * @count:	The number of bytes to copy
+ *
+ * Copy a block of data to I/O memory.
+ */
 static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
 			       size_t size)
 {
-- cgit v1.2.3

From eabc2a7c49c01fc97ff8c764ef7d74276b904af6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Fri, 30 Jun 2017 20:09:33 +0300
Subject: x86/io: Remove xlate_dev_kmem_ptr() duplication

The generic header defines xlate_dev_kmem_ptr(). Reuse it from the
generic header and remove the x86 copy. Move the description to the
generic header as well.

Signed-off-by: Andy Shevchenko
Cc: Baolin Wang
Cc: Linus Torvalds
Cc: Mika Westerberg
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: intel-gfx@lists.freedesktop.org
Cc: linux-i2c@vger.kernel.org
Cc: wsa@the-dreams.de
Link: http://lkml.kernel.org/r/20170630170934.83028-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/io.h | 5 -----
 include/asm-generic/io.h  | 3 +++
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 252434b00fdb..b3bba2f87e18 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -215,11 +215,6 @@ extern void set_iounmap_nonlazy(void);
 
 #include <asm-generic/iomap.h>
 
-/*
- * Convert a virtual cached pointer to an uncached pointer
- */
-#define xlate_dev_kmem_ptr(p)	p
-
 /*
  * ISA space is 'always mapped' on a typical x86 system, no need to
  * explicitly ioremap() it.
The fact that the ISA IO space is mapped
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 395afc829409..b4531e3b2120 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -915,6 +915,9 @@ extern void ioport_unmap(void __iomem *p);
 #endif /* CONFIG_GENERIC_IOMAP */
 #endif /* CONFIG_HAS_IOPORT_MAP */
 
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
 #ifndef xlate_dev_kmem_ptr
 #define xlate_dev_kmem_ptr xlate_dev_kmem_ptr
 static inline void *xlate_dev_kmem_ptr(void *addr)
-- cgit v1.2.3

From 9f08ea848117ab521efcfd3e004d8e1a0edc640c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Tue, 18 Jul 2017 20:18:09 +0200
Subject: netfilter: nf_tables: keep chain counters away from hot path

These chain counters are only used by the iptables-compat tool, which
allows users to use the x_tables extensions from the existing nf_tables
framework. This patch makes nf_tables ~5% faster for the general use
case, i.e. native nft users, where no chain counters are used at all.

Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nf_tables_core.h |  2 ++
 net/netfilter/nf_tables_api.c          | 11 +++--------
 net/netfilter/nf_tables_core.c         | 26 ++++++++++++++++++--------
 3 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 8f690effec37..424684c33771 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -49,6 +49,8 @@ struct nft_payload_set {
 };
 
 extern const struct nft_expr_ops nft_payload_fast_ops;
+
+extern struct static_key_false nft_counters_enabled;
 extern struct static_key_false nft_trace_enabled;
 
 #endif /* _NET_NF_TABLES_CORE_H */
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7843efa33c59..7fbf0070aba1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1240,6 +1240,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
 
 		module_put(basechain->type->owner);
 		free_percpu(basechain->stats);
+		if (basechain->stats)
+			static_branch_dec(&nft_counters_enabled);
 		if (basechain->ops[0].dev != NULL)
 			dev_put(basechain->ops[0].dev);
 		kfree(basechain);
@@ -1504,14 +1506,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 				return PTR_ERR(stats);
 			}
 			basechain->stats = stats;
-		} else {
-			stats = netdev_alloc_pcpu_stats(struct nft_stats);
-			if (stats == NULL) {
-				nft_chain_release_hook(&hook);
-				kfree(basechain);
-				return -ENOMEM;
-			}
-			rcu_assign_pointer(basechain->stats, stats);
+			static_branch_inc(&nft_counters_enabled);
 		}
 
 		hookfn = hook.type->hooks[hook.num];
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 65dbeadcb118..c5bab08b0d73 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -114,6 +114,22 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
 	return true;
 }
 
+DEFINE_STATIC_KEY_FALSE(nft_counters_enabled);
+
+static noinline void nft_update_chain_stats(const struct nft_chain *chain,
+					    const struct nft_pktinfo *pkt)
+{
+	struct nft_stats *stats;
+
+	local_bh_disable();
+	stats = this_cpu_ptr(rcu_dereference(nft_base_chain(chain)->stats));
+	u64_stats_update_begin(&stats->syncp);
+	stats->pkts++;
+	stats->bytes += pkt->skb->len;
+	u64_stats_update_end(&stats->syncp);
+	local_bh_enable();
+}
+
 struct nft_jumpstack {
 	const struct nft_chain	*chain;
 	const struct nft_rule	*rule;
@@ -130,7 +146,6 @@ nft_do_chain(struct nft_pktinfo *pkt, void
*priv)
 	struct nft_regs regs;
 	unsigned int stackptr = 0;
 	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
-	struct nft_stats *stats;
 	int rulenum;
 	unsigned int gencursor = nft_genmask_cur(net);
 	struct nft_traceinfo info;
@@ -220,13 +235,8 @@ next_rule:
 		nft_trace_packet(&info, basechain, NULL, -1,
 				 NFT_TRACETYPE_POLICY);
 
-	rcu_read_lock_bh();
-	stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats));
-	u64_stats_update_begin(&stats->syncp);
-	stats->pkts++;
-	stats->bytes += pkt->skb->len;
-	u64_stats_update_end(&stats->syncp);
-	rcu_read_unlock_bh();
+	if (static_branch_unlikely(&nft_counters_enabled))
+		nft_update_chain_stats(basechain, pkt);
 
 	return nft_base_chain(basechain)->policy;
 }
-- cgit v1.2.3

From 784b4e612d42a2b7578d7fab2ed78940e10536bc Mon Sep 17 00:00:00 2001
From: Phil Sutter
Date: Wed, 19 Jul 2017 16:32:23 +0200
Subject: netfilter: nf_tables: Attach process info to NFT_MSG_NEWGEN notifications

This is helpful for 'nft monitor' to track which process caused a given
change to the ruleset.

Signed-off-by: Phil Sutter
Signed-off-by: Pablo Neira Ayuso
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 net/netfilter/nf_tables_api.c            | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 683f6f88fcac..6f0a950e21c3 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1221,6 +1221,8 @@ enum nft_objref_attributes {
 enum nft_gen_attributes {
 	NFTA_GEN_UNSPEC,
 	NFTA_GEN_ID,
+	NFTA_GEN_PROC_PID,
+	NFTA_GEN_PROC_NAME,
 	__NFTA_GEN_MAX
 };
 #define NFTA_GEN_MAX		(__NFTA_GEN_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 7fbf0070aba1..b77ad0813564 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4657,6 +4657,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 {
 	struct nlmsghdr *nlh;
 	struct nfgenmsg *nfmsg;
+	char buf[TASK_COMM_LEN];
 	int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0);
@@ -4668,7 +4669,9 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 	nfmsg->version = NFNETLINK_V0;
 	nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
 
-	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)))
+	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) ||
+	    nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
+	    nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
-- cgit v1.2.3

From d41861942fc55c14b6280d9568a0d0112037f065 Mon Sep 17 00:00:00 2001
From: Yuval Shaia
Date: Wed, 14 Jun 2017 23:13:34 +0300
Subject: IB/core: Add generic function to extract IB speed from netdev

The logic of retrieving the netdev speed from a net_device and
translating it to an IB speed is implemented in the rxe, usnic and
bnxt drivers. Define a new function that merges them all.
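As a usage sketch (editor's illustration, not part of the patch), a
provider's query_port() callback can then derive the active speed and
width in one call; foo_query_port() below is hypothetical:

static int foo_query_port(struct ib_device *ibdev, u8 port_num,
			  struct ib_port_attr *attr)
{
	int err;

	/*
	 * ib_get_eth_speed() maps the netdev's ethtool link speed to an
	 * IB speed/width pair; it returns 0 on success or a negative
	 * errno (e.g. -ENODEV when no netdev is available).
	 */
	err = ib_get_eth_speed(ibdev, port_num, &attr->active_speed,
			       &attr->active_width);
	if (err)
		return err;

	/* Fill in the remaining port attributes as usual. */
	attr->max_mtu = IB_MTU_4096;
	return 0;
}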
Signed-off-by: Yuval Shaia Reviewed-by: Christian Benvenuti Reviewed-by: Selvin Xavier Reviewed-by: Moni Shoua Signed-off-by: Doug Ledford --- drivers/infiniband/core/roce_gid_mgmt.c | 2 + drivers/infiniband/core/verbs.c | 55 ++++++++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 49 ++----------------------- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 31 +++------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 53 +++------------------------ include/rdma/ib_verbs.h | 1 + 6 files changed, 73 insertions(+), 118 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 94a9eefb3cfc..90e3889b7fbe 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -44,6 +44,8 @@ static struct workqueue_struct *gid_cache_wq; +static struct workqueue_struct *gid_cache_wq; + enum gid_op_type { GID_DEL = 0, GID_ADD diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index fb98ed67d5bc..40de69bf07cd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1302,6 +1302,61 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + if (!dev->get_netdev) + return -EOPNOTSUPP; + + netdev = dev->get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } + + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5dc6e7ce3ab9..b10e1a6dce84 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -223,50 +223,6 @@ int bnxt_re_modify_device(struct ib_device *ibdev, return 0; } -static void __to_ib_speed_width(struct net_device *netdev, u8 *speed, u8 *width) -{ - struct ethtool_link_ksettings lksettings; - u32 espeed; - - if (netdev->ethtool_ops && netdev->ethtool_ops->get_link_ksettings) { - memset(&lksettings, 0, sizeof(lksettings)); - rtnl_lock(); - netdev->ethtool_ops->get_link_ksettings(netdev, &lksettings); - rtnl_unlock(); - espeed = lksettings.base.speed; - } else { - espeed = SPEED_UNKNOWN; - } - switch (espeed) { - case SPEED_1000: - *speed = IB_SPEED_SDR; - *width = IB_WIDTH_1X; - break; - case SPEED_10000: - *speed = IB_SPEED_QDR; - *width = 
IB_WIDTH_1X;
-		break;
-	case SPEED_20000:
-		*speed = IB_SPEED_DDR;
-		*width = IB_WIDTH_4X;
-		break;
-	case SPEED_25000:
-		*speed = IB_SPEED_EDR;
-		*width = IB_WIDTH_1X;
-		break;
-	case SPEED_40000:
-		*speed = IB_SPEED_QDR;
-		*width = IB_WIDTH_4X;
-		break;
-	case SPEED_50000:
-		break;
-	default:
-		*speed = IB_SPEED_SDR;
-		*width = IB_WIDTH_1X;
-		break;
-	}
-}
-
 /* Port */
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 		       struct ib_port_attr *port_attr)
@@ -308,8 +264,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 	 * IB stack to avoid race in the NETDEV_UNREG path
 	 */
 	if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
-		__to_ib_speed_width(rdev->netdev, &port_attr->active_speed,
-				    &port_attr->active_width);
+		if (ib_get_eth_speed(ibdev, port_num, &port_attr->active_speed,
+				     &port_attr->active_width))
+			return -EINVAL;
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index f9dc1e80c3b7..e5f57dd49980 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -226,27 +226,6 @@ static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp)
 	spin_unlock(&vf->lock);
 }
 
-static void eth_speed_to_ib_speed(int speed, u8 *active_speed,
-				  u8 *active_width)
-{
-	if (speed <= 10000) {
-		*active_width = IB_WIDTH_1X;
-		*active_speed = IB_SPEED_FDR10;
-	} else if (speed <= 20000) {
-		*active_width = IB_WIDTH_4X;
-		*active_speed = IB_SPEED_DDR;
-	} else if (speed <= 30000) {
-		*active_width = IB_WIDTH_4X;
-		*active_speed = IB_SPEED_QDR;
-	} else if (speed <= 40000) {
-		*active_width = IB_WIDTH_4X;
-		*active_speed = IB_SPEED_FDR10;
-	} else {
-		*active_width = IB_WIDTH_4X;
-		*active_speed = IB_SPEED_EDR;
-	}
-}
-
 static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd)
 {
 	if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN ||
@@ -326,12 +305,16 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
 			struct ib_port_attr *props)
 {
 	struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
-	struct ethtool_link_ksettings cmd;
 
 	usnic_dbg("\n");
 
 	mutex_lock(&us_ibdev->usdev_lock);
-	__ethtool_get_link_ksettings(us_ibdev->netdev, &cmd);
+	if (ib_get_eth_speed(ibdev, port, &props->active_speed,
+			     &props->active_width)) {
+		mutex_unlock(&us_ibdev->usdev_lock);
+		return -EINVAL;
+	}
+
 	/* props being zeroed by the caller, avoid zeroing it here */
 
 	props->lid = 0;
@@ -355,8 +338,6 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
 	props->pkey_tbl_len = 1;
 	props->bad_pkey_cntr = 0;
 	props->qkey_viol_cntr = 0;
-	eth_speed_to_ib_speed(cmd.base.speed, &props->active_speed,
-			      &props->active_width);
 	props->max_mtu = IB_MTU_4096;
 	props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu);
 	/* Userspace will adjust for hdrs */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index af90a7d42b96..e6c10e43a6d7 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -51,40 +51,16 @@ static int rxe_query_device(struct ib_device *dev,
 	return 0;
 }
 
-static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed,
-				      u8 *active_width)
-{
-	if (speed <= 1000) {
-		*active_width = IB_WIDTH_1X;
-		*active_speed = IB_SPEED_SDR;
-	} else if (speed <= 10000) {
-		*active_width = IB_WIDTH_1X;
-		*active_speed = IB_SPEED_FDR10;
-	} else if (speed <= 20000) {
-		*active_width = IB_WIDTH_4X;
-		*active_speed = IB_SPEED_DDR;
-	} else if (speed <= 30000) {
-		*active_width = IB_WIDTH_4X;
-		*active_speed =
IB_SPEED_QDR; - } else if (speed <= 40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - static int rxe_query_port(struct ib_device *dev, u8 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(dev); struct rxe_port *port; - u32 speed; + int rc = -EINVAL; if (unlikely(port_num != 1)) { pr_warn("invalid port_number %d\n", port_num); - goto err1; + goto out; } port = &rxe->port; @@ -93,29 +69,12 @@ static int rxe_query_port(struct ib_device *dev, *attr = port->attr; mutex_lock(&rxe->usdev_lock); - if (rxe->ndev->ethtool_ops->get_link_ksettings) { - struct ethtool_link_ksettings ks; - - rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks); - speed = ks.base.speed; - } else if (rxe->ndev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd; - - rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd); - speed = cmd.speed; - } else { - pr_warn("%s speed is unknown, defaulting to 1000\n", - rxe->ndev->name); - speed = 1000; - } - rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, - &attr->active_width); + rc = ib_get_eth_speed(dev, port_num, &attr->active_speed, + &attr->active_width); mutex_unlock(&rxe->usdev_lock); - return 0; - -err1: - return -EINVAL; +out: + return rc; } static int rxe_query_gid(struct ib_device *device, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..68d947dac9a2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3555,6 +3555,7 @@ void ib_drain_qp(struct ib_qp *qp); int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { -- cgit v1.2.3 From bded747bb432bc5f7ad6d84ea747368b70ed9df2 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 30 May 2017 09:42:53 +0300 Subject: net/mlx5: Add raw ethernet local loopback firmware command Add support for raw ethernet local loopback firmware command. Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 62 +++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 11 +++-- include/linux/mlx5/vport.h | 3 +- 3 files changed, 72 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 5abfec1c3399..d653b0025b13 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -897,6 +897,68 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); +enum { + UC_LOCAL_LB, + MC_LOCAL_LB +}; + +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + mlx5_core_dbg(mdev, "%s local_lb\n", enable ? 
"enable" : "disable"); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_mc_local_lb, !enable); + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_uc_local_lb, !enable); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_update_local_lb); + +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int value; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + if (err) + goto out; + + value = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_mc_local_lb) << MC_LOCAL_LB; + + value |= MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_uc_local_lb) << UC_LOCAL_LB; + + *status = !value; + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_query_local_lb); + enum mlx5_vport_roce_state { MLX5_VPORT_ROCE_DISABLED = 0, MLX5_VPORT_ROCE_ENABLED = 1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 87869c04849a..57c75e8b3c19 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1016,7 +1016,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; - u8 reserved_at_3e1[0xa]; + u8 disable_local_lb[0x1]; + u8 reserved_at_3e2[0x9]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; @@ -2562,7 +2563,9 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_nic_vport_context_bits { u8 reserved_at_0[0x5]; u8 min_wqe_inline_mode[0x3]; - u8 reserved_at_8[0x17]; + u8 reserved_at_8[0x15]; + u8 disable_mc_local_lb[0x1]; + u8 disable_uc_local_lb[0x1]; u8 roce_en[0x1]; u8 arm_change_event[0x1]; @@ -5229,7 +5232,9 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_at_0[0x16]; + u8 reserved_at_0[0x14]; + u8 disable_uc_local_lb[0x1]; + u8 disable_mc_local_lb[0x1]; u8 node_guid[0x1]; u8 port_guid[0x1]; u8 min_inline[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 656c70b65dd2..aaa0bb9e7655 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -114,5 +114,6 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, int vf, struct mlx5_hca_vport_context *req); - +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable); +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 4a2da0b8c0782816f3ae6846ae7942fcbb0f8172 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 30 May 2017 10:05:15 +0300 Subject: IB/mlx5: Add debug control parameters for congestion control This patch adds debug control parameters for congestion control which can be read or written through debugfs. They are for reaction point and notification point nodes. 
These control parameters are as below:

+------------------------------+-----------------------------------------+
| Name                         | Description                             |
|------------------------------+-----------------------------------------|
|rp_clamp_tgt_rate             | When set target rate is updated to      |
|                              | current rate                            |
|------------------------------+-----------------------------------------|
|rp_clamp_tgt_rate_ati         | When set update target rate based on    |
|                              | timer as well                           |
|------------------------------+-----------------------------------------|
|rp_time_reset                 | time between rate increase if no        |
|                              | CNP is received unit in usec            |
|------------------------------+-----------------------------------------|
|rp_byte_reset                 | Number of bytes between rate increase   |
|                              | if no CNP is received                   |
|------------------------------+-----------------------------------------|
|rp_threshold                  | Threshold for reaction point rate       |
|                              | control                                 |
|------------------------------+-----------------------------------------|
|rp_ai_rate                    | Rate for target rate, unit in Mbps      |
|------------------------------+-----------------------------------------|
|rp_hai_rate                   | Rate for hyper increase state           |
|                              | unit in Mbps                            |
|------------------------------+-----------------------------------------|
|rp_min_dec_fac                | Minimum factor by which the current     |
|                              | transmit rate can be changed when       |
|                              | processing a CNP, unit is percentage    |
|------------------------------+-----------------------------------------|
|rp_min_rate                   | Minimum value for rate limit,           |
|                              | unit in Mbps                            |
|------------------------------+-----------------------------------------|
|rp_rate_to_set_on_first_cnp   | Rate that is set when first CNP is      |
|                              | received, unit is Mbps                  |
|------------------------------+-----------------------------------------|
|rp_dce_tcp_g                  | Used to calculate alpha                 |
|------------------------------+-----------------------------------------|
|rp_dce_tcp_rtt                | Time between updates of alpha value,    |
|                              | unit is usec                            |
|------------------------------+-----------------------------------------|
|rp_rate_reduce_monitor_period | Minimum time between consecutive rate   |
|                              | reductions                              |
|------------------------------+-----------------------------------------|
|rp_initial_alpha_value        | Initial value of alpha                  |
|------------------------------+-----------------------------------------|
|rp_gd                         | When CNP is received, flow rate is      |
|                              | reduced based on gd, rp_gd is given as  |
|                              | log2(rp_gd)                             |
|------------------------------+-----------------------------------------|
|np_cnp_dscp                   | dscp code point for generated cnp       |
|------------------------------+-----------------------------------------|
|np_cnp_prio_mode              | cnp priority mode                       |
|------------------------------+-----------------------------------------|
|np_cnp_prio                   | 802.1p priority for generated cnp       |
+------------------------------+-----------------------------------------+

Signed-off-by: Parav Pandit
Reviewed-by: Daniel Jurgens
Reviewed-by: Eli Cohen
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/mlx5/Makefile  |   2 +-
 drivers/infiniband/hw/mlx5/cmd.c     |  20 ++
 drivers/infiniband/hw/mlx5/cmd.h     |   4 +
 drivers/infiniband/hw/mlx5/cong.c    | 421 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx5/main.c    |   9 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  37 +++
 include/linux/mlx5/mlx5_ifc.h        |   3 +-
 7 files changed, 493 insertions(+), 3 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx5/cong.c

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/Makefile
b/drivers/infiniband/hw/mlx5/Makefile index 90ad2adc752f..bc6299697dda 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 18d5e1db93ed..470995fa38d2 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -57,3 +57,23 @@ int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, MLX5_SET(query_cong_statistics_in, in, clear, reset); return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); } + +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { }; + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; + + return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index fa09228193a6..af4c24596274 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -39,4 +39,8 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, bool reset, void *out, int out_size); +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size); +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, + void *in, int in_size); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c new file mode 100644 index 000000000000..2d32b519bb61 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "mlx5_ib.h" +#include "cmd.h" + +enum mlx5_ib_cong_node_type { + MLX5_IB_RROCE_ECN_RP = 1, + MLX5_IB_RROCE_ECN_NP = 2, +}; + +static const char * const mlx5_ib_dbg_cc_name[] = { + "rp_clamp_tgt_rate", + "rp_clamp_tgt_rate_ati", + "rp_time_reset", + "rp_byte_reset", + "rp_threshold", + "rp_ai_rate", + "rp_hai_rate", + "rp_min_dec_fac", + "rp_min_rate", + "rp_rate_to_set_on_first_cnp", + "rp_dce_tcp_g", + "rp_dce_tcp_rtt", + "rp_rate_reduce_monitor_period", + "rp_initial_alpha_value", + "rp_gd", + "np_cnp_dscp", + "np_cnp_prio_mode", + "np_cnp_prio", +}; + +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) +#define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) +#define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) +#define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_AI_RATE_ATTR BIT(7) +#define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) +#define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) +#define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) +#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) +#define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) +#define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) +#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) +#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) +#define MLX5_IB_RP_GD_ATTR BIT(16) + +#define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) +#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) + +static enum mlx5_ib_cong_node_type +mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) +{ + if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && + param_offset <= MLX5_IB_DBG_CC_RP_GD) + return MLX5_IB_RROCE_ECN_RP; + else + return MLX5_IB_RROCE_ECN_NP; +} + +static u32 mlx5_get_cc_param_val(void *field, int offset) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate); + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc); + case MLX5_IB_DBG_CC_RP_TIME_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset); + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset); + case MLX5_IB_DBG_CC_RP_THRESHOLD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_threshold); + case MLX5_IB_DBG_CC_RP_AI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_HAI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate); + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac); + case MLX5_IB_DBG_CC_RP_MIN_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate); + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp); + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g); + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt); + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period); + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + return 
MLX5_GET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value); + case MLX5_IB_DBG_CC_RP_GD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_gd); + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_dscp); + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_prio_mode); + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_802p_prio); + default: + return 0; + } +} + +static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, + u32 var, u32 *attr_mask) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate, var); + break; + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc, var); + break; + case MLX5_IB_DBG_CC_RP_TIME_RESET: + *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset, var); + break; + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset, var); + break; + case MLX5_IB_DBG_CC_RP_THRESHOLD: + *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_threshold, var); + break; + case MLX5_IB_DBG_CC_RP_AI_RATE: + *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_HAI_RATE: + *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_RATE: + *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period, var); + break; + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value, var); + break; + case MLX5_IB_DBG_CC_RP_GD: + *attr_mask |= MLX5_IB_RP_GD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_gd, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, 
field, cnp_prio_mode, 0); + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); + break; + } +} + +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); + void *out; + void *field; + int err; + enum mlx5_ib_cong_node_type node; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + node = mlx5_ib_param_to_node(offset); + + err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen); + if (err) + goto free; + + field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); + *var = mlx5_get_cc_param_val(field, offset); + +free: + kvfree(out); + return err; +} + +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); + void *in; + void *field; + enum mlx5_ib_cong_node_type node; + u32 attr_mask = 0; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + + node = mlx5_ib_param_to_node(offset); + MLX5_SET(modify_cong_params_in, in, cong_protocol, node); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); + mlx5_ib_set_cc_param_mask_val(field, offset, var, &attr_mask); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, + attr_mask); + + err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen); + kvfree(in); + return err; +} + +static ssize_t set_param(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + char lbuf[11] = { }; + u32 var; + int ret; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &var)) + return -EINVAL; + + ret = mlx5_ib_set_cc_params(param->dev, offset, var); + return ret ? 
ret : count; +} + +static ssize_t get_param(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + u32 var = 0; + int ret; + char lbuf[11]; + + if (*pos) + return 0; + + ret = mlx5_ib_get_cc_params(param->dev, offset, &var); + if (ret) + return ret; + + ret = snprintf(lbuf, sizeof(lbuf), "%d\n", var); + if (ret < 0) + return ret; + + if (copy_to_user(buf, lbuf, ret)) + return -EFAULT; + + *pos += ret; + return ret; +} + +static const struct file_operations dbg_cc_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = set_param, + .read = get_param, +}; + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root || + !dev->dbg_cc_params || + !dev->dbg_cc_params->root) + return; + + debugfs_remove_recursive(dev->dbg_cc_params->root); + kfree(dev->dbg_cc_params); + dev->dbg_cc_params = NULL; +} + +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + int i; + + if (!mlx5_debugfs_root) + goto out; + + if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) || + !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) + goto out; + + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); + if (!dbg_cc_params) + goto out; + + dev->dbg_cc_params = dbg_cc_params; + + dbg_cc_params->root = debugfs_create_dir("cc_params", + dev->mdev->priv.dbg_root); + if (!dbg_cc_params->root) + goto err; + + for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + dbg_cc_params->params[i].offset = i; + dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].dentry = + debugfs_create_file(mlx5_ib_dbg_cc_name[i], + 0600, dbg_cc_params->root, + &dbg_cc_params->params[i], + &dbg_cc_fops); + if (!dbg_cc_params->params[i].dentry) + goto err; + } +out: return 0; + +err: + mlx5_ib_warn(dev, "cong debugfs failure\n"); + mlx5_ib_cleanup_cong_debugfs(dev); + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. 
+ */ + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bfd9117da10c..a903728627f1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3838,9 +3838,13 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_odp; } + err = mlx5_ib_init_cong_debugfs(dev); + if (err) + goto err_cnt; + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); if (!dev->mdev->priv.uar) - goto err_cnt; + goto err_cong; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) @@ -3889,6 +3893,8 @@ err_uar_page: mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); err_cnt: + mlx5_ib_cleanup_cong_debugfs(dev); +err_cong: if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); @@ -3923,6 +3929,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); mlx5_put_uars_page(dev->mdev, mdev->priv.uar); + mlx5_ib_cleanup_cong_debugfs(dev); if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); destroy_umrc_res(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5fb4547bbcb8..f0682f383b52 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -619,6 +619,39 @@ struct mlx5_roce { enum ib_port_state last_port_state; }; +struct mlx5_ib_dbg_param { + int offset; + struct mlx5_ib_dev *dev; + struct dentry *dentry; +}; + +enum mlx5_ib_dbg_cc_types { + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE, + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI, + MLX5_IB_DBG_CC_RP_TIME_RESET, + MLX5_IB_DBG_CC_RP_BYTE_RESET, + MLX5_IB_DBG_CC_RP_THRESHOLD, + MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_HAI_RATE, + MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, + MLX5_IB_DBG_CC_RP_MIN_RATE, + MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP, + MLX5_IB_DBG_CC_RP_DCE_TCP_G, + MLX5_IB_DBG_CC_RP_DCE_TCP_RTT, + MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, + MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, + MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_CNP_DSCP, + MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, + MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_MAX, +}; + +struct mlx5_ib_dbg_cc_params { + struct dentry *root; + struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -655,6 +688,7 @@ struct mlx5_ib_dev { struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; /* protect the user_td */ struct mutex lb_mutex; @@ -909,6 +943,9 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev); + /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 57c75e8b3c19..3e697f672de5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1188,7 +1188,8 @@ struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_at_c0[0x12]; u8 cnp_dscp[0x6]; - u8 reserved_at_d8[0x5]; + u8 reserved_at_d8[0x4]; + u8 cnp_prio_mode[0x1]; u8 cnp_802p_prio[0x3]; u8 reserved_at_e0[0x720]; -- cgit v1.2.3 From 7ecf6d8ff154e6f7471ee537a400d43a5f3b1c57 Mon Sep 17 
00:00:00 2001 From: Bodong Wang Date: Tue, 30 May 2017 10:18:24 +0300 Subject: IB/mlx5: Restore IB guid/policy for virtual functions When a user sets port_guid, node_guid or policy of an IB virtual function, save this information in "struct mlx5_vf_context". This information will be restored later when pci_resume is called. To make sure this works, one can use aer-inject to generate PCI errors on mlx5 devices and verify if relevant fields are restored after PCI resume. Signed-off-by: Bodong Wang Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/ib_virt.c | 9 ++++++ drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 42 +++++++++++++++++++++++++ include/linux/mlx5/driver.h | 17 +++++----- 3 files changed, 61 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c index c1b9de800fe5..649a3364f838 100644 --- a/drivers/infiniband/hw/mlx5/ib_virt.c +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -96,6 +96,7 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -109,6 +110,8 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, } in->field_select = MLX5_HCA_VPORT_SEL_STATE_POLICY; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].policy = in->policy; out: kfree(in); @@ -151,6 +154,7 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -160,6 +164,8 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; in->node_guid = guid; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].node_guid = guid; kfree(in); return err; } @@ -169,6 +175,7 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -178,6 +185,8 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; in->port_guid = guid; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].port_guid = guid; kfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index bcdf7779c48d..090b29e05a6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -32,6 +32,7 @@ #include #include +#include #include "mlx5_core.h" #ifdef CONFIG_MLX5_CORE_EN #include "eswitch.h" @@ -44,6 +45,38 @@ bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) return !!sriov->num_vfs; } +static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf) +{ + struct mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_hca_vport_context *in; + int err = 0; + + /* 
Restore sriov guid and policy settings */ + if (sriov->vfs_ctx[vf].node_guid || + sriov->vfs_ctx[vf].port_guid || + sriov->vfs_ctx[vf].policy != MLX5_POLICY_INVALID) { + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->node_guid = sriov->vfs_ctx[vf].node_guid; + in->port_guid = sriov->vfs_ctx[vf].port_guid; + in->policy = sriov->vfs_ctx[vf].policy; + in->field_select = + !!(in->port_guid) * MLX5_HCA_VPORT_SEL_PORT_GUID | + !!(in->node_guid) * MLX5_HCA_VPORT_SEL_NODE_GUID | + !!(in->policy) * MLX5_HCA_VPORT_SEL_STATE_POLICY; + + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in); + if (err) + mlx5_core_warn(dev, "modify vport context failed, unable to restore VF %d settings\n", vf); + + kfree(in); + } + + return err; +} + static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; @@ -74,6 +107,15 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) } sriov->vfs_ctx[vf].enabled = 1; sriov->enabled_vfs++; + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) { + err = sriov_restore_guids(dev, vf); + if (err) { + mlx5_core_warn(dev, + "failed to restore VF %d settings, err %d\n", + vf, err); + continue; + } + } mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..54221be5f69e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -162,6 +162,13 @@ enum dbg_rsc_type { MLX5_DBG_RSC_CQ, }; +enum port_state_policy { + MLX5_POLICY_DOWN = 0, + MLX5_POLICY_UP = 1, + MLX5_POLICY_FOLLOW = 2, + MLX5_POLICY_INVALID = 0xffffffff +}; + struct mlx5_field_desc { struct dentry *dent; int i; @@ -525,6 +532,9 @@ struct mlx5_mkey_table { struct mlx5_vf_context { int enabled; + u64 port_guid; + u64 node_guid; + enum port_state_policy policy; }; struct mlx5_core_sriov { @@ -842,13 +852,6 @@ struct mlx5_pas { u8 log_sz; }; -enum port_state_policy { - MLX5_POLICY_DOWN = 0, - MLX5_POLICY_UP = 1, - MLX5_POLICY_FOLLOW = 2, - MLX5_POLICY_INVALID = 0xffffffff -}; - enum phy_port_state { MLX5_AAA_111 }; -- cgit v1.2.3 From 7d9336d80b0b35d3537d37ff35e08dcb425073ed Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:10 +0300 Subject: IB/core: Introduce delay drop for a WQ A work queue created with IB_WQ_FLAGS_DELAY_DROP won't cause packet drops when it has no receive WQEs; instead, the device waits until receive WQEs are posted, or for a period of time that the device was configured with. It includes: * Add a new creation flag to enable delay drop functionality in a WQ. * A new capability, IB_RAW_PACKET_CAP_DELAY_DROP, which reports the device's ability to delay packet drops when there are no receive WQEs. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..4a1444456af7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1546,6 +1546,10 @@ enum ib_raw_packet_caps { IB_RAW_PACKET_CAP_SCATTER_FCS = (1 << 1), /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = (1 << 2), + /* When a packet is received for an RQ with no receive WQEs, the + * packet processing is delayed.
+ */ + IB_RAW_PACKET_CAP_DELAY_DROP = (1 << 3), }; enum ib_wq_type { @@ -1574,6 +1578,7 @@ struct ib_wq { enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, IB_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_WQ_FLAGS_DELAY_DROP = 1 << 2, }; struct ib_wq_init_attr { -- cgit v1.2.3 From c1e0bfc1312d0e06bdb24e6e4e7e10b0b4313ec6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:11 +0300 Subject: net/mlx5: Introduce set delay drop command Add support for the SET_DELAY_DROP command. This command will be used in downstream patches for delayed packet drop. The timeout value is indicated by the delay_drop_timeout field. Packet processing will be delayed until the timeout value passes or until more WQEs are posted. Setting this value to 0 disables the feature. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 14 ++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 25 ++++++++++++++++++++++++- include/linux/mlx5/qp.h | 3 +++ 3 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 340f281c9801..db9e665ab104 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -242,6 +242,20 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec) +{ + u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {0}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop); + struct mbox_info { u32 *in; u32 *out; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3e697f672de5..40c05d29d5bb 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -200,6 +200,7 @@ enum { MLX5_CMD_OP_QUERY_SQ = 0x907, MLX5_CMD_OP_CREATE_RQ = 0x908, MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS = 0x910, MLX5_CMD_OP_DESTROY_RQ = 0x90a, MLX5_CMD_OP_QUERY_RQ = 0x90b, MLX5_CMD_OP_CREATE_RMP = 0x90c, @@ -840,7 +841,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 retransmission_q_counters[0x1]; u8 reserved_at_183[0x1]; u8 modify_rq_counter_set_id[0x1]; - u8 reserved_at_185[0x1]; + u8 rq_delay_drop[0x1]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; @@ -5853,6 +5854,28 @@ struct mlx5_ifc_destroy_rq_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_set_delay_drop_params_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 delay_drop_timeout[0x10]; +}; + +struct mlx5_ifc_set_delay_drop_params_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + struct mlx5_ifc_destroy_rmp_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6f41270d80c0..fff4ec13f620 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -561,6 +561,9 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct
mlx5_core_qp *qp, u32 *out, int outlen); +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec); + int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); void mlx5_init_qp_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 246ac9814c5b2c0e9916dca5fbf8d6a40245fad1 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:12 +0300 Subject: net/mlx5: Introduce general notification event When the delay drop timeout expires, the firmware raises a general notification event with the DELAY_DROP_TIMEOUT subtype. In addition, the feature is disabled, so the driver has to reactivate the timeout. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 23 +++++++++++++++++++++ include/linux/mlx5/device.h | 5 +++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 3 ++- 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d2b912..849417425811 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -161,6 +161,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE"; case MLX5_EVENT_TYPE_FPGA_ERROR: return "MLX5_EVENT_TYPE_FPGA_ERROR"; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + return "MLX5_EVENT_TYPE_GENERAL_EVENT"; default: return "Unrecognized event"; } @@ -378,6 +380,20 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif +static void general_event_handler(struct mlx5_core_dev *dev, + struct mlx5_eqe *eqe) +{ + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: + if (dev->event) + dev->event(dev, MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, 0); + break; + default: + mlx5_core_dbg(dev, "General event with unrecognized subtype: sub_type %d\n", + eqe->sub_type); + } +} + static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) { struct mlx5_eq *eq = eq_ptr; @@ -486,6 +502,9 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) mlx5_fpga_event(dev, eqe->type, &eqe->data.raw); break; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + general_event_handler(dev, eqe); + break; default: mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", eqe->type, eq->eqn); @@ -693,6 +712,10 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) mlx5_core_is_pf(dev)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE); + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH && + MLX5_CAP_GEN(dev, general_notification_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_GENERAL_EVENT); + if (MLX5_CAP_GEN(dev, port_module_event)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT); else diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f31a0b5377e1..a47b9ab9f2c9 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,6 +290,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, MLX5_EVENT_TYPE_PORT_MODULE_EVENT = 0x16, MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, + MLX5_EVENT_TYPE_GENERAL_EVENT = 0x22, MLX5_EVENT_TYPE_PPS_EVENT = 0x25, MLX5_EVENT_TYPE_DB_BF_CONGESTION = 0x1a, @@ -304,6 +305,10 @@ enum mlx5_event { MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, }; +enum { + MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, +}; + enum {
MLX5_PORT_CHANGE_SUBTYPE_DOWN = 1, MLX5_PORT_CHANGE_SUBTYPE_ACTIVE = 4, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 54221be5f69e..758ef40f9316 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -192,6 +192,7 @@ enum mlx5_dev_event { MLX5_DEV_EVENT_GUID_CHANGE, MLX5_DEV_EVENT_CLIENT_REREG, MLX5_DEV_EVENT_PPS, + MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, }; enum mlx5_port_status { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 40c05d29d5bb..4bc57647fad8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -874,7 +874,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_tc[0x4]; u8 reserved_at_1d0[0x1]; u8 dcbx[0x1]; - u8 reserved_at_1d2[0x3]; + u8 general_notification_event[0x1]; + u8 reserved_at_1d3[0x2]; u8 fpga[0x1]; u8 rol_s[0x1]; u8 rol_g[0x1]; -- cgit v1.2.3 From 03404e8ae652e02a5e3388224836cef53d7a0988 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:13 +0300 Subject: IB/mlx5: Add support for dropless RQ RQs that were configured for "delay drop" will prevent packet drops when their WQEs are depleted. Marking an RQ as drop-less is done by setting delay_drop_en in the RQ context using the CREATE_RQ command. Since this feature is globally activated/deactivated by issuing the SET_DELAY_DROP command for all the marked RQs, we activate/deactivate it according to the number of RQs with 'delay_drop' enabled. When the timeout expires, the feature is deactivated; therefore the driver handles the delay drop timeout event and reactivates it. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 60 +++++++++++++++++++++++++++++++++--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 19 ++++++++++++ drivers/infiniband/hw/mlx5/qp.c | 37 ++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 2 +- 4 files changed, 113 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a903728627f1..ad4b12decc23 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -697,6 +697,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_TSO; } + if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && + MLX5_CAP_GEN(dev->mdev, general_notification_event)) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { /* Legacy bit to support old userspace libraries */ @@ -2752,6 +2756,24 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } +static void delay_drop_handler(struct work_struct *work) +{ + int err; + struct mlx5_ib_delay_drop *delay_drop = + container_of(work, struct mlx5_ib_delay_drop, + delay_drop_work); + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, + delay_drop->timeout); + if (err) { + mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", + delay_drop->timeout); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -2804,8 +2826,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param;
break; + case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: + schedule_work(&ibdev->delay_drop.delay_drop_work); + goto out; default: - return; + goto out; } ibev.device = &ibdev->ib_dev; @@ -2813,7 +2838,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (port < 1 || port > ibdev->num_ports) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); - return; + goto out; } if (ibdev->ib_active) @@ -2821,6 +2846,9 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (fatal) ibdev->ib_active = false; + +out: + return; } static int set_has_smi_cap(struct mlx5_ib_dev *dev) @@ -3623,6 +3651,26 @@ mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, return netdev; } +static void cancel_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); +} + +static void init_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; @@ -3862,11 +3910,13 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_dev; + init_delay_drop(dev); + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) - goto err_umrc; + goto err_delay_drop; } if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && @@ -3877,7 +3927,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) return dev; -err_umrc: +err_delay_drop: + cancel_delay_drop(dev); destroy_umrc_res(dev); err_dev: @@ -3924,6 +3975,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) struct mlx5_ib_dev *dev = context; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); + cancel_delay_drop(dev); mlx5_remove_netdev_notifier(dev); ib_unregister_device(&dev->ib_dev); mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f0682f383b52..097f12dc65a6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -247,6 +247,10 @@ struct mlx5_ib_wq { void *qend; }; +enum mlx5_ib_wq_flags { + MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, +}; + struct mlx5_ib_rwq { struct ib_wq ibwq; struct mlx5_core_qp core_qp; @@ -264,6 +268,7 @@ struct mlx5_ib_rwq { u32 wqe_count; u32 wqe_shift; int wq_sig; + u32 create_flags; /* Use enum mlx5_ib_wq_flags */ }; enum { @@ -652,6 +657,19 @@ struct mlx5_ib_dbg_cc_params { struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; }; +enum { + MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100, +}; + +struct mlx5_ib_delay_drop { + struct mlx5_ib_dev *dev; + struct work_struct delay_drop_work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 timeout; + bool activate; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -688,6 +706,7 @@ struct mlx5_ib_dev { struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_delay_drop delay_drop; struct mlx5_ib_dbg_cc_params *dbg_cc_params; /* protect the user_td */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 
0889ff367c86..939553d5c25f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4597,6 +4597,24 @@ static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) } } +static int set_delay_drop(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->delay_drop.lock); + if (dev->delay_drop.activate) + goto out; + + err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + if (err) + goto out; + + dev->delay_drop.activate = true; +out: + mutex_unlock(&dev->delay_drop.lock); + return err; +} + static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, struct ib_wq_init_attr *init_attr) { @@ -4651,9 +4669,28 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, } MLX5_SET(rqc, rqc, scatter_fcs, 1); } + if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + if (!(dev->ib_dev.attrs.raw_packet_caps & + IB_RAW_PACKET_CAP_DELAY_DROP)) { + mlx5_ib_dbg(dev, "Delay drop is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, delay_drop_en, 1); + } rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + err = set_delay_drop(dev); + if (err) { + mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", + err); + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + } else { + rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; + } + } out: kvfree(in); return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4bc57647fad8..f350688a12fa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2519,7 +2519,7 @@ enum { struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; - u8 reserved_at_1[0x1]; + u8 delay_drop_en[0x1]; u8 scatter_fcs[0x1]; u8 vsd[0x1]; u8 mem_rq_type[0x4]; -- cgit v1.2.3 From be1d325a335840a86c133a56c6a911c368bac0fd Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Mon, 12 Jun 2017 11:14:03 +0300 Subject: IB/core: Set RoCEv2 MGID according to spec The RoCEv2 Annex states that for RoCEv2 over IPv4, the corresponding IPv4 address is encoded into the GID according to the following rule: GID = ::ffff:<IPv4 address> Remove the 0xff0e prefix for RoCEv2 packets with IPv4, leave it zeroed, and change rdma_is_multicast_addr() to account for the new logic.
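To make the mapping concrete, here is a minimal illustrative sketch (not part of the patch; the helper name is hypothetical) of how an IPv4-mapped MGID following the rule above can be composed:

	static void build_rocev2_ipv4_mgid(__be32 ipv4, union ib_gid *mgid)
	{
		/* ::ffff:<IPv4 address>: bytes 0..9 zeroed, bytes 10..11 set
		 * to 0xff, and the IPv4 address in the last four bytes.
		 */
		memset(mgid->raw, 0, 10);
		mgid->raw[10] = 0xff;
		mgid->raw[11] = 0xff;
		memcpy(&mgid->raw[12], &ipv4, sizeof(ipv4));
	}

This matches the IPv4 branch of cma_iboe_set_mgid() in the diff below, where the 0xff/0x0e multicast prefix is written only for IB-type GIDs.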
Signed-off-by: Noa Osherovich Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 13 +++++++------ drivers/infiniband/core/verbs.c | 10 ++++++---- include/rdma/ib_addr.h | 8 +++++++- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0eb393237ba2..a8c2f0ccd225 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3998,7 +3998,8 @@ static void iboe_mcast_work_handler(struct work_struct *work) kfree(mw); } -static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; @@ -4008,8 +4009,8 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = 0xff; - mgid->raw[1] = 0x0e; + mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; + mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4050,7 +4051,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, goto out1; } - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) @@ -4066,8 +4069,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 802bdc397a57..30fdc3ae1bbd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1613,8 +1613,9 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - !is_valid_mcast_lid(qp, lid)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); @@ -1630,8 +1631,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - !is_valid_mcast_lid(qp, lid)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index b73a14edc85e..7aca12188ef3 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -304,7 +304,13 @@ static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) static inline int rdma_is_multicast_addr(struct in6_addr *addr) { - return addr->s6_addr[0] == 0xff; + u32 ipv4_addr; + + if 
(addr->s6_addr[0] == 0xff) + return 1; + + memcpy(&ipv4_addr, addr->s6_addr + 12, 4); + return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) -- cgit v1.2.3 From 02984cc7b3d62418bd72abacaf875c3a9eccdb66 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Jun 2017 16:15:06 +0300 Subject: IB/core: Enable QP creation with a given source QP number Enable QP creation with a given source QP number. The created QP will use the source QPN as its wire QP number. This comes as a pre-patch for downstream patches in this series to allow user space applications to accelerate traffic which is typically handled by the IPoIB ULP. Signed-off-by: Yishai Hadas Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4a1444456af7..76a74c783c50 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1059,6 +1059,7 @@ enum ib_qp_create_flags { /* FREE = 1 << 7, */ IB_QP_CREATE_SCATTER_FCS = 1 << 8, IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_QP_CREATE_SOURCE_QPN = 1 << 10, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -1086,6 +1087,7 @@ struct ib_qp_init_attr { */ u8 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; + u32 source_qpn; }; struct ib_qp_open_attr { -- cgit v1.2.3 From 2dee0e545894c23b1a2cc2019ac87dffb42e5984 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Jun 2017 16:15:07 +0300 Subject: IB/uverbs: Enable QP creation with a given source QP number Enable QP creation with a given source QP number; the created QP will use the source QPN as its wire QP number. To create such a QP, the user application must hold the CAP_NET_RAW capability. This comes as a pre-patch for downstream patches in this series to allow user space applications to accelerate traffic which is typically handled by the IPoIB ULP.
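A minimal usage sketch of the new flag (for illustration only; pd, cq and wire_qpn are placeholders supplied by the caller, and error handling is omitted):

	struct ib_qp_init_attr init_attr = {
		.qp_type      = IB_QPT_UD,
		.send_cq      = cq,
		.recv_cq      = cq,
		.create_flags = IB_QP_CREATE_SOURCE_QPN,
		.source_qpn   = wire_qpn,	/* QPN to stamp on the wire */
		.cap = {
			.max_send_wr  = 64,
			.max_recv_wr  = 64,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
	};
	struct ib_qp *qp = ib_create_qp(pd, &init_attr);

Traffic sent on this QP is expected to carry wire_qpn as its source QP number instead of the kernel-assigned QPN.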
Signed-off-by: Yishai Hadas Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 17 ++++++++++++++--- include/uapi/rdma/ib_user_verbs.h | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533a0203..60535c754db3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1383,8 +1383,9 @@ static int create_qp(struct ib_uverbs_file *file, attr.rwq_ind_tbl = ind_tbl; } - if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + - sizeof(cmd->reserved1)) && cmd->reserved1) { + if (cmd_sz > sizeof(*cmd) && + !ib_is_udata_cleared(ucore, sizeof(*cmd), + cmd_sz - sizeof(*cmd))) { ret = -EOPNOTSUPP; goto err_put; } @@ -1482,11 +1483,21 @@ static int create_qp(struct ib_uverbs_file *file, IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | IB_QP_CREATE_SCATTER_FCS | - IB_QP_CREATE_CVLAN_STRIPPING)) { + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN)) { ret = -EINVAL; goto err_put; } + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + buf = (void *)cmd + sizeof(*cmd); if (cmd_sz > sizeof(*cmd)) if (!(buf[0] == 0 && !memcmp(buf, buf + 1, diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 270c350bedc6..63656d2e8705 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -578,7 +578,7 @@ struct ib_uverbs_ex_create_qp { __u32 comp_mask; __u32 create_flags; __u32 rwq_ind_tbl_handle; - __u32 reserved1; + __u32 source_qpn; }; struct ib_uverbs_open_qp { -- cgit v1.2.3 From 4ce749bd94f697772ac2be4fb7e7a92726e94bfb Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Jun 2017 16:15:10 +0300 Subject: net/mlx5: Report enhanced capabilities for IPoIB Report the 'ipoib_enhanced_offloads' capabilities from the core layer; they will be used in the next patch in this series.
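As a hedged sketch of how a consumer might read these capabilities once they are fetched, using the MLX5_CAP_IPOIB_ENHANCED accessor added in the diff below (the specific field queried, csum_cap, is an assumption for illustration):

	/* Guard on the general bit first, then read a field from the
	 * enhanced-offloads capability group via the new accessor.
	 */
	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
	    MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
		pr_debug("enhanced IPoIB checksum offload available\n");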
Signed-off-by: Yishai Hadas Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++++++ include/linux/mlx5/device.h | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index fa33d59ab485..2c71557d1cee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -120,6 +120,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_IPOIB_ENHANCED_OFFLOADS); + if (err) + return err; + } + if (MLX5_CAP_GEN(dev, pg)) { err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); if (err) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a47b9ab9f2c9..c13d71deaeca 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -973,7 +973,7 @@ enum mlx5_cap_type { MLX5_CAP_ATOMIC, MLX5_CAP_ROCE, MLX5_CAP_IPOIB_OFFLOADS, - MLX5_CAP_EOIB_OFFLOADS, + MLX5_CAP_IPOIB_ENHANCED_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, MLX5_CAP_ESWITCH, @@ -1016,6 +1016,10 @@ enum mlx5_mcam_feature_groups { MLX5_GET(per_protocol_networking_offload_caps,\ mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap) +#define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->caps.hca_cur[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS], cap) + #define MLX5_CAP_ROCE(mdev, cap) \ MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap) -- cgit v1.2.3 From 3fffc82ad6c78fcc9d5d4eca089f00db14ab0358 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 12 Jun 2017 10:36:16 +0300 Subject: IB/mlx5: Fix existence check for extended address vector The extended address vector flag is the highest bit in a be32 variable, but it was compared with the lowest. This patch fixes the endianness of that check and removes the define, since an equivalent one is already declared.
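For clarity, a small sketch of the endianness rule behind the fix in the diff that follows: the host-order flag constant must be converted to the field's byte order before masking, not the other way around.

	__be32 dqp_dct = av->dqp_dct;	/* on-the-wire field, big endian */

	/* Correct: convert the host-order constant once, mask the raw field. */
	bool is_ext = dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV);

	/* The buggy variant applied be32_to_cpu() to the constant instead,
	 * which only "works" on big-endian CPUs where the two coincide.
	 */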
Fixes: 17d2f88f92ce ("IB/mlx5: Add ODP atomics support") Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/linux/mlx5/qp.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index ae0746754008..3d701c7a4c91 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -939,7 +939,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (qp->ibqp.qp_type != IB_QPT_RC) { av = *wqe; - if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) *wqe += sizeof(struct mlx5_av); else *wqe += sizeof(struct mlx5_base_av); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index fff4ec13f620..66d19b611fe4 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -212,7 +212,6 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_OPCODE_MASK 0xff #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 -#define MLX5_WQE_AV_EXT 0x80000000 enum { MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, -- cgit v1.2.3 From 58dcb60a226ee48cc66c96c27b751f06ec2bc5a9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 19 Jun 2017 07:19:37 +0300 Subject: IB/mlx5: Expose extended error counters This patch adds the requester- and responder-side error counters below, which are exposed through the hardware counters interface and are supported as part of the query Q counters command extension.
+---------------------------+-------------------------------------+
| Name                      | Description                         |
|---------------------------+-------------------------------------|
| resp_local_length_error   | Number of times responder detected  |
|                           | local length errors                 |
|---------------------------+-------------------------------------|
| resp_cqe_error            | Number of CQEs completed with error |
|                           | at responder                        |
|---------------------------+-------------------------------------|
| req_cqe_error             | Number of CQEs completed with error |
|                           | at requester                        |
|---------------------------+-------------------------------------|
| req_remote_invalid_request| Number of times requester detected  |
|                           | remote invalid request error        |
|---------------------------+-------------------------------------|
| req_remote_access_error   | Number of times requester detected  |
|                           | remote access error                 |
|---------------------------+-------------------------------------|
| resp_remote_access_error  | Number of times responder detected  |
|                           | remote access error                 |
|---------------------------+-------------------------------------|
| resp_cqe_flush_error      | Number of CQEs completed with flush |
|                           | error at responder                  |
|---------------------------+-------------------------------------|
| req_cqe_flush_error       | Number of CQEs completed with flush |
|                           | error at requester                  |
+---------------------------+-------------------------------------+
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 22 ++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 44 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2757d445b042..9dd9759459fb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3432,6
+3432,17 @@ static const struct mlx5_ib_counter cong_cnts[] = { INIT_CONG_COUNTER(np_cnp_sent), }; +static const struct mlx5_ib_counter extended_err_cnts[] = { + INIT_Q_COUNTER(resp_local_length_error), + INIT_Q_COUNTER(resp_cqe_error), + INIT_Q_COUNTER(req_cqe_error), + INIT_Q_COUNTER(req_remote_invalid_request), + INIT_Q_COUNTER(req_remote_access_errors), + INIT_Q_COUNTER(resp_remote_access_errors), + INIT_Q_COUNTER(resp_cqe_flush_error), + INIT_Q_COUNTER(req_cqe_flush_error), +}; + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { unsigned int i; @@ -3456,6 +3467,10 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) num_counters += ARRAY_SIZE(retrans_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) + num_counters += ARRAY_SIZE(extended_err_cnts); + cnts->num_q_counters = num_counters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { @@ -3505,6 +3520,13 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, } } + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { + for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + names[j] = extended_err_cnts[i].name; + offsets[j] = extended_err_cnts[i].offset; + } + } + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { names[j] = cong_cnts[i].name; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f350688a12fa..5bae70eb25af 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -858,7 +858,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pcam_reg[0x1]; u8 local_ca_ack_delay[0x5]; u8 port_module_event[0x1]; - u8 reserved_at_1b1[0x1]; + u8 enhanced_error_q_counters[0x1]; u8 ports_check[0x1]; u8 reserved_at_1b3[0x1]; u8 disable_link_up[0x1]; @@ -3953,7 +3953,47 @@ struct mlx5_ifc_query_q_counter_out_bits { u8 local_ack_timeout_err[0x20]; - u8 reserved_at_320[0x4e0]; + u8 reserved_at_320[0xa0]; + + u8 resp_local_length_error[0x20]; + + u8 req_local_length_error[0x20]; + + u8 resp_local_qp_error[0x20]; + + u8 local_operation_error[0x20]; + + u8 resp_local_protection[0x20]; + + u8 req_local_protection[0x20]; + + u8 resp_cqe_error[0x20]; + + u8 req_cqe_error[0x20]; + + u8 req_mw_binding[0x20]; + + u8 req_bad_response[0x20]; + + u8 req_remote_invalid_request[0x20]; + + u8 resp_remote_invalid_request[0x20]; + + u8 req_remote_access_errors[0x20]; + + u8 resp_remote_access_errors[0x20]; + + u8 req_remote_operation_errors[0x20]; + + u8 req_transport_retries_exceeded[0x20]; + + u8 cq_overflow[0x20]; + + u8 resp_cqe_flush_error[0x20]; + + u8 req_cqe_flush_error[0x20]; + + u8 reserved_at_620[0x1e0]; }; struct mlx5_ifc_query_q_counter_in_bits { -- cgit v1.2.3 From ea30b966f7dd6bcfb20c98a7f99608c7bb10bfac Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 21 Jun 2017 09:26:28 +0300 Subject: IB/mlx4: Add inline-receive support When inline-receive is enabled, the HCA may write received data into the receive WQE. Inline-receive is enabled by setting its matching bit in the QP context; each single-packet message with a payload not exceeding the receive WQE size will be delivered to the WQE. The completion report will indicate that the payload was placed in the WQE. It includes: 1) Return the maximum inline-receive size supported by the hardware in the vendor data part of query_device. 2) Enable the feature when requested by the vendor data input.
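To illustrate the stride math used by this patch (a sketch with made-up numbers, mirroring the set_rq_size() change in the diff below): the RQ WQE stride must cover both the scatter list and the requested inline-receive payload.

	u32 wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);

	/* e.g. max_gs = 2 gives wqe_size = 32; with inl_recv_sz = 64 the
	 * shift becomes ilog2(64) = 6, i.e. a 64-byte stride, so an inline
	 * payload of up to 64 bytes fits in the WQE.
	 */
	qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz));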
Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 7 +++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 +++ drivers/infiniband/hw/mlx4/qp.c | 32 +++++++++++++++++++++++++------- include/uapi/rdma/mlx4-abi.h | 3 ++- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d1b43cbbfea7..e8c290edb1e1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -563,6 +563,13 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, } } + if (uhw->outlen >= resp.response_length + + sizeof(resp.max_inl_recv_sz)) { + resp.response_length += sizeof(resp.max_inl_recv_sz); + resp.max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + } + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 9db82e67e959..a6337f3161cf 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -318,6 +318,7 @@ struct mlx4_ib_qp { u8 sq_no_prefetch; u8 state; int mlx_type; + u32 inl_recv_sz; struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; @@ -623,6 +624,8 @@ struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; __u64 hca_core_clock_offset; + __u32 max_inl_recv_sz; + __u32 reserved; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 75c0e6c5dd56..d1caa39a3943 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -377,7 +377,8 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - int is_user, int has_rq, struct mlx4_ib_qp *qp) + int is_user, int has_rq, struct mlx4_ib_qp *qp, + u32 inl_recv_sz) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || @@ -385,18 +386,24 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, return -EINVAL; if (!has_rq) { - if (cap->max_recv_wr) + if (cap->max_recv_wr || inl_recv_sz) return -EINVAL; qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { + u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + u32 wqe_size; + /* HW requires >= 1 RQ entry with >= 1 gather entry */ - if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge || + inl_recv_sz > max_inl_recv_sz)) return -EINVAL; qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); - qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz)); } /* leave userspace return values as they were, so as not to break ABI */ @@ -719,9 +726,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); - if (err) - goto err; if (pd->uobject) { 
struct mlx4_ib_create_qp ucmd; @@ -731,6 +735,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; } + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, ucmd.inl_recv_sz); + if (err) + goto err; + + qp->inl_recv_sz = ucmd.inl_recv_sz; qp->sq_no_prefetch = ucmd.sq_no_prefetch; err = set_user_sq_size(dev, qp, &ucmd); @@ -760,6 +770,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_mtt; } } else { + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, 0); + if (err) + goto err; + qp->sq_no_prefetch = 0; if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) @@ -1651,6 +1666,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } + if (qp->inl_recv_sz) + context->param3 |= cpu_to_be32(1 << 25); + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (ibqp->qp_type == IB_QPT_RAW_PACKET) diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index af431752655c..bf3bdba2f326 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -101,7 +101,8 @@ struct mlx4_ib_create_qp { __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; - __u8 reserved[5]; + __u32 inl_recv_sz; + __u8 reserved; }; #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From f3301870161ca293cd14b20a802c5646da02407f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 21 Jun 2017 09:29:36 +0300 Subject: (IB, net)/mlx4: Add resource utilization support Add visibility into the resource usage of QPs, CQs and counters used by virtual functions. This feature will give the PF administrator more data while debugging VF status. Usage info was added to the ALLOC_RES command to notify the PF whether the resource being reserved or allocated for the VF will be used by the kernel driver or by user verbs. The QP, CQ and counter reservation and allocation functions were updated with an additional usage parameter.
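A short sketch of the encoding this series uses (mirroring the helpers in the diffs below): the 2-bit usage value rides in bits 31:30 of the ALLOC_RES in_modifier, next to the resource type, and callers simply pass the usage through the updated APIs.

	/* How the usage value is packed into the command modifier: */
	u32 in_modifier = RES_QP | (((u32)MLX4_RES_USAGE_DRIVER & 3) << 30);

	/* Typical caller-side change, e.g. for a counter: */
	err = mlx4_counter_alloc(dev, &idx, MLX4_RES_USAGE_DRIVER);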
Signed-off-by: Moshe Shemesh Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/cq.c | 2 ++ drivers/infiniband/hw/mlx4/main.c | 6 ++++-- drivers/infiniband/hw/mlx4/qp.c | 13 +++++++++---- drivers/net/ethernet/mellanox/mlx4/cq.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/main.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx4/qp.c | 5 +++-- include/linux/mlx4/device.h | 12 ++++++++++-- 11 files changed, 45 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index ff931c580557..95382faa7ad1 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -218,6 +218,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_mtt; uar = &to_mucontext(context)->uar; + cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS; } else { err = mlx4_db_alloc(dev->dev, &cq->db, 1); if (err) @@ -233,6 +234,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_db; uar = &dev->priv_uar; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; } if (dev->eq_table) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index e8c290edb1e1..0944e224c0df 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2779,7 +2779,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) allocated = 0; if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { - err = mlx4_counter_alloc(ibdev->dev, &counter_index); + err = mlx4_counter_alloc(ibdev->dev, &counter_index, + MLX4_RES_USAGE_DRIVER); /* if failed to allocate a new counter, use default */ if (err) counter_index = @@ -2834,7 +2835,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, - &ibdev->steer_qpn_base, 0); + &ibdev->steer_qpn_base, 0, + MLX4_RES_USAGE_DRIVER); if (err) goto err_counter; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index d1caa39a3943..247b9132e9de 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -769,6 +769,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; } + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; } else { err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp, 0); @@ -841,6 +842,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, err = -ENOMEM; goto err_wrid; } + qp->mqp.usage = MLX4_RES_USAGE_DRIVER; } if (sqpn) { @@ -860,13 +862,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, (init_attr->cap.max_send_wr ? MLX4_RESERVE_ETH_BF_QP : 0) | (init_attr->cap.max_recv_wr ? 
- MLX4_RESERVE_A0_QP : 0)); + MLX4_RESERVE_A0_QP : 0), + qp->mqp.usage); else if (qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); else err = mlx4_qp_reserve_range(dev->dev, 1, 1, - &qpn, 0); + &qpn, 0, qp->mqp.usage); if (err) goto err_proxy; } @@ -1218,7 +1221,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, if (udata) return ERR_PTR(-EINVAL); if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { - int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0); + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, + 1, 1, &sqpn, 0, + MLX4_RES_USAGE_DRIVER); if (res) return ERR_PTR(res); @@ -1581,7 +1586,7 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK)) return 0; - err = mlx4_counter_alloc(dev->dev, &tmp_idx); + err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index c56a511b918e..72eb50cd5ecd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -241,13 +241,14 @@ err_out: return err; } -static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn, u8 usage) { + u32 in_modifier = RES_CQ | (((u32)usage & 3) << 30); u64 out_param; int err; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ, + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) @@ -303,7 +304,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, cq->vector = vector; - err = mlx4_cq_alloc_icm(dev, &cq->cqn); + err = mlx4_cq_alloc_icm(dev, &cq->cqn, cq->usage); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 85fe17e4dcfb..f849eec21824 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -140,6 +140,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, (cq->type == RX && priv->hwtstamp_config.rx_filter)) timestamp_en = 1; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq, cq->vector, 0, timestamp_en); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 3a291fc1780a..e3e6d9fa69fd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -651,7 +651,8 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv) return 0; } - err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP); + err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); if (err) { en_err(priv, "Failed to reserve qp for mac registration\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 436f7689a032..ad1ffd5857cb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1081,7 +1081,8 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) u32 qpn; err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, - MLX4_RESERVE_A0_QP); + MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); if (err) { 
en_err(priv, "Failed reserving drop qpn\n"); return err; @@ -1127,7 +1128,8 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) flags = priv->rx_ring_num == 1 ? MLX4_RESERVE_A0_QP : 0; err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, priv->rx_ring_num, - &rss_map->base_qpn, flags); + &rss_map->base_qpn, flags, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 73faa3d77921..a81db2582555 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -105,7 +105,8 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, (unsigned long long) ring->sp_wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, - MLX4_RESERVE_ETH_BF_QP); + MLX4_RESERVE_ETH_BF_QP, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_hwq_res; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a27c9c13a36e..fb2591d0e735 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2475,7 +2475,7 @@ static int mlx4_allocate_default_counters(struct mlx4_dev *dev) priv->def_counter[port] = -1; for (port = 0; port < dev->caps.num_ports; port++) { - err = mlx4_counter_alloc(dev, &idx); + err = mlx4_counter_alloc(dev, &idx, MLX4_RES_USAGE_DRIVER); if (!err || err == -ENOSPC) { priv->def_counter[port] = idx; @@ -2517,13 +2517,14 @@ int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) return 0; } -int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage) { + u32 in_modifier = RES_COUNTER | (((u32)usage & 3) << 30); u64 out_param; int err; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 26747212526b..5e5b4475b85e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -245,8 +245,9 @@ int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, } int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, - int *base, u8 flags) + int *base, u8 flags, u8 usage) { + u32 in_modifier = RES_QP | (((u32)usage & 3) << 30); u64 in_param = 0; u64 out_param; int err; @@ -258,7 +259,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt); set_param_h(&in_param, align); err = mlx4_cmd_imm(dev, in_param, &out_param, - RES_QP, RES_OP_RESERVE, + in_modifier, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index aad5d81dfb44..3607da001ad3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -428,6 +428,12 @@ enum mlx4_steer_type { MLX4_NUM_STEERS }; +enum mlx4_resource_usage { + MLX4_RES_USAGE_NONE, + MLX4_RES_USAGE_DRIVER, + MLX4_RES_USAGE_USER_VERBS, +}; + enum { MLX4_NUM_FEXCH = 64 * 1024, }; @@ -748,6 +754,7 @@ struct mlx4_cq { } tasklet_ctx; int reset_notify_added; struct list_head reset_notify; + u8 usage; }; struct mlx4_qp { @@ -757,6 +764,7 @@ 
struct mlx4_qp { atomic_t refcount; struct completion free; + u8 usage; }; struct mlx4_srq { @@ -1120,7 +1128,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, unsigned vector, int collapsed, int timestamp_en); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, - int *base, u8 flags); + int *base, u8 flags, u8 usage); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); @@ -1417,7 +1425,7 @@ int mlx4_get_phys_port_id(struct mlx4_dev *dev); int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); -int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage); void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port); -- cgit v1.2.3 From 400b1ebcfe31279895f1baa8ecaa390d9a4a0eef Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 4 Jul 2017 16:24:24 +0300 Subject: IB/mlx4: Add support for WQ related verbs Support the create/modify/destroy WQ verbs. The base IB object for enabling RSS functionality is a WQ (i.e. ib_wq). This patch implements the related WQ verbs: create, modify and destroy. In downstream patches the WQ will be used as part of an indirection table (i.e. ib_rwq_ind_table) to enable RSS QP creation. Notes: ConnectX-3 hardware requires a list of consecutive WQNs as receive descriptor queues for the RSS QP. Hence, the driver manages lists of consecutive ranges per context, which the user must respect. Destroying a WQ does not return its WQN to its range for reuse. However, destroying all WQs from the same range releases the range and, in turn, releases its WQNs for reuse. Since the WQ is not a natural object in the hardware, the driver implements the WQ with a hardware QP. As such, the WQ inherits its port from its RSS QP parent upon its RST->INIT transition, and by that time its state is applied to the hardware.
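A hedged sketch of the expected creation pattern (pd, cq, n and MAX_WQS are placeholders, and error checking is omitted): because the RSS parent needs consecutive WQNs, the N WQs feeding one indirection table should be created back to back, so that they are carved from a single reserved range as implemented below.

	struct ib_wq_init_attr wq_attr = {
		.wq_type = IB_WQT_RQ,
		.max_wr  = 64,
		.max_sge = 1,
		.cq      = cq,
	};
	struct ib_wq *wq[MAX_WQS];
	int i;

	for (i = 0; i < n; i++)
		wq[i] = ib_create_wq(pd, &wq_attr); /* WQNs expected consecutive */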
Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 24 ++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 25 +- drivers/infiniband/hw/mlx4/qp.c | 505 +++++++++++++++++++++++++++++++---- include/uapi/rdma/mlx4-abi.h | 14 + 4 files changed, 511 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 0944e224c0df..7c6f929ebd3e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -81,6 +81,8 @@ static const char mlx4_ib_version[] = DRV_VERSION "\n"; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); +static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, + u8 port_num); static struct workqueue_struct *wq; @@ -552,6 +554,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->timestamp_mask = 0xFFFFFFFFFFFFULL; props->max_ah = INT_MAX; + if ((dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || + mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET)) + props->max_wq_type_rq = props->max_qp; + if (!mlx4_is_slave(dev->dev)) err = mlx4_get_internal_clock_params(dev->dev, &clock_params); @@ -1076,6 +1083,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); + INIT_LIST_HEAD(&context->wqn_ranges_list); + mutex_init(&context->wqn_ranges_mutex); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else @@ -2720,6 +2730,20 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + ((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) == + IB_LINK_LAYER_ETHERNET))) { + ibdev->ib_dev.create_wq = mlx4_ib_create_wq; + ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; + ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ); + } + if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index a6337f3161cf..ba4e78c064d2 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -88,6 +88,8 @@ struct mlx4_ib_ucontext { struct list_head db_page_list; struct mutex db_page_mutex; struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; + struct list_head wqn_ranges_list; + struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ }; struct mlx4_ib_pd { @@ -289,8 +291,19 @@ struct mlx4_roce_smac_vlan_info { int update_vid; }; +struct mlx4_wqn_range { + int base_wqn; + int size; + int refcount; + bool dirty; + struct list_head list; +}; + struct mlx4_ib_qp { - struct ib_qp ibqp; + union { + struct ib_qp ibqp; + struct ib_wq ibwq; + }; struct mlx4_qp mqp; struct mlx4_buf buf; @@ -329,6 +342,9 @@ struct mlx4_ib_qp { struct list_head cq_recv_list; struct list_head cq_send_list; struct counter_index *counter_index; + 
struct mlx4_wqn_range *wqn_range; + /* Number of RSS QP parents that uses this WQ */ + u32 rss_usecnt; }; struct mlx4_ib_srq { @@ -893,4 +909,11 @@ void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_wq(struct ib_wq *wq); +int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 247b9132e9de..65e4ec549368 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -116,6 +116,11 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), }; +enum mlx4_ib_source_type { + MLX4_IB_QP_SRC = 0, + MLX4_IB_RWQ_SRC = 1, +}; + static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); @@ -330,6 +335,12 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) } } +static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n", + type, qp->qpn); +} + static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* @@ -639,7 +650,91 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, qp->counter_index = NULL; } +/* + * This function allocates a WQN from a range which is consecutive and aligned + * to its size. In case the range is full, then it creates a new range and + * allocates WQN from it. The new range will be used for following allocations. + */ +static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, int range_size, int *wqn) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + int err = 0; + + mutex_lock(&context->wqn_ranges_mutex); + + range = list_first_entry_or_null(&context->wqn_ranges_list, + struct mlx4_wqn_range, list); + + if (!range || (range->refcount == range->size) || range->dirty) { + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + err = -ENOMEM; + goto out; + } + + err = mlx4_qp_reserve_range(dev->dev, range_size, + range_size, &range->base_wqn, 0, + qp->mqp.usage); + if (err) { + kfree(range); + goto out; + } + + range->size = range_size; + list_add(&range->list, &context->wqn_ranges_list); + } else if (range_size != 1) { + /* + * Requesting a new range (>1) when last range is still open, is + * not valid. + */ + err = -EINVAL; + goto out; + } + + qp->wqn_range = range; + + *wqn = range->base_wqn + range->refcount; + + range->refcount++; + +out: + mutex_unlock(&context->wqn_ranges_mutex); + + return err; +} + +static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, bool dirty_release) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + + mutex_lock(&context->wqn_ranges_mutex); + + range = qp->wqn_range; + + range->refcount--; + if (!range->refcount) { + mlx4_qp_release_range(dev->dev, range->base_wqn, + range->size); + list_del(&range->list); + kfree(range); + } else if (dirty_release) { + /* + * A range which one of its WQNs is destroyed, won't be able to be + * reused for further WQN allocations. + * The next created WQ will allocate a new range. 
+ */ + range->dirty = 1; + } + + mutex_unlock(&context->wqn_ranges_mutex); +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + enum mlx4_ib_source_type src, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) @@ -652,6 +747,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; struct mlx4_ib_cq *mcq; unsigned long flags; + int range_size = 0; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { @@ -728,27 +824,69 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (pd->uobject) { - struct mlx4_ib_create_qp ucmd; + union { + struct mlx4_ib_create_qp qp; + struct mlx4_ib_create_wq wq; + } ucmd; + size_t copy_len; + + copy_len = (src == MLX4_IB_QP_SRC) ? + sizeof(struct mlx4_ib_create_qp) : + min(sizeof(struct mlx4_ib_create_wq), udata->inlen); - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + if (ib_copy_from_udata(&ucmd, udata, copy_len)) { err = -EFAULT; goto err; } - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, - qp_has_rq(init_attr), qp, ucmd.inl_recv_sz); - if (err) - goto err; + if (src == MLX4_IB_RWQ_SRC) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved1 || + ucmd.wq.reserved[0] || ucmd.wq.reserved[1] || + ucmd.wq.reserved[2]) { + pr_debug("user command isn't supported\n"); + err = -EOPNOTSUPP; + goto err; + } - qp->inl_recv_sz = ucmd.inl_recv_sz; - qp->sq_no_prefetch = ucmd.sq_no_prefetch; + if (ucmd.wq.log_range_size > + ilog2(dev->dev->caps.max_rss_tbl_sz)) { + pr_debug("WQN range size must be equal or smaller than %d\n", + dev->dev->caps.max_rss_tbl_sz); + err = -EOPNOTSUPP; + goto err; + } + range_size = 1 << ucmd.wq.log_range_size; + } else { + qp->inl_recv_sz = ucmd.qp.inl_recv_sz; + } - err = set_user_sq_size(dev, qp, &ucmd); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, qp->inl_recv_sz); if (err) goto err; - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); + if (src == MLX4_IB_QP_SRC) { + qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, + (struct mlx4_ib_create_qp *) + &ucmd); + if (err) + goto err; + } else { + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + /* Allocated buffer expects to have at least that SQ + * size. + */ + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } + + qp->umem = ib_umem_get(pd->uobject->context, + (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : + ucmd.wq.buf_addr, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; @@ -765,7 +903,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), - ucmd.db_addr, &qp->db); + (src == MLX4_IB_QP_SRC) ? 
ucmd.qp.db_addr : + ucmd.wq.db_addr, &qp->db); if (err) goto err_mtt; } @@ -853,6 +992,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_wrid; } } + } else if (src == MLX4_IB_RWQ_SRC) { + err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp, + range_size, &qpn); + if (err) + goto err_wrid; } else { /* Raw packet QPNs may not have bits 6,7 set in their qp_num; * otherwise, the WQE BlueFlame setup flow wrongly causes @@ -891,7 +1035,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, */ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - qp->mqp.event = mlx4_ib_qp_event; + qp->mqp.event = (src == MLX4_IB_QP_SRC) ? mlx4_ib_qp_event : + mlx4_ib_wq_event; + if (!*caller_qp) *caller_qp = qp; @@ -918,6 +1064,9 @@ err_qpn: if (!sqpn) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext(pd->uobject->context), + qp, 0); else mlx4_qp_release_range(dev->dev, qpn, 1); } @@ -1016,7 +1165,7 @@ static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) return to_mpd(qp->ibqp.pd); } -static void get_cqs(struct mlx4_ib_qp *qp, +static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) { switch (qp->ibqp.qp_type) { @@ -1029,14 +1178,16 @@ static void get_cqs(struct mlx4_ib_qp *qp, *recv_cq = *send_cq; break; default: - *send_cq = to_mcq(qp->ibqp.send_cq); - *recv_cq = to_mcq(qp->ibqp.recv_cq); + *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) : + to_mcq(qp->ibwq.cq); + *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) : + *recv_cq; break; } } static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, - int is_user) + enum mlx4_ib_source_type src, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; unsigned long flags; @@ -1069,7 +1220,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, } } - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp, src, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); @@ -1095,6 +1246,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext( + qp->ibwq.uobject->context), qp, 1); else mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); } @@ -1102,9 +1256,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { - if (qp->rq.wqe_cnt) - mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), - &qp->db); + if (qp->rq.wqe_cnt) { + struct mlx4_ib_ucontext *mcontext = !src ? 
+ to_mucontext(qp->ibqp.uobject->context) : + to_mucontext(qp->ibwq.uobject->context); + mlx4_ib_db_unmap_user(mcontext, &qp->db); + } ib_umem_release(qp->umem); } else { kvfree(qp->sq.wrid); @@ -1200,8 +1357,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, /* fall through */ case IB_QPT_UD: { - err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, 0, &qp); if (err) { kfree(qp); return ERR_PTR(err); @@ -1231,8 +1388,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, sqpn = get_sqp_num(to_mdev(pd->device), init_attr); } - err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - sqpn, &qp); + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, sqpn, &qp); if (err) return ERR_PTR(err); @@ -1303,7 +1460,7 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) mlx4_ib_free_qp_counter(dev, mqp); pd = get_pd(mqp); - destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -1626,12 +1783,15 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type) } } -static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, +static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { - struct mlx4_ib_dev *dev = to_mdev(ibqp->device); - struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct ib_uobject *ibuobject; + struct ib_srq *ibsrq; + enum ib_qp_type qp_type; + struct mlx4_ib_dev *dev; + struct mlx4_ib_qp *qp; struct mlx4_ib_pd *pd; struct mlx4_ib_cq *send_cq, *recv_cq; struct mlx4_qp_context *context; @@ -1641,6 +1801,28 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int err = -EINVAL; int counter_index; + if (src_type == MLX4_IB_RWQ_SRC) { + struct ib_wq *ibwq; + + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); + } else { + struct ib_qp *ibqp; + + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); + } + /* APM is not supported under RoCE */ if (attr_mask & IB_QP_ALT_PATH && rdma_port_get_link_layer(&dev->ib_dev, qp->port) == @@ -1674,11 +1856,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (qp->inl_recv_sz) context->param3 |= cpu_to_be32(1 << 25); - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; - else if (ibqp->qp_type == IB_QPT_RAW_PACKET) + else if (qp_type == IB_QPT_RAW_PACKET) context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; - else if (ibqp->qp_type == IB_QPT_UD) { + else if (qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) context->mtu_msgmax = (IB_MTU_4096 << 5) | ilog2(dev->dev->caps.max_gso_sz); @@ -1708,14 +1890,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); - if (ibqp->qp_type == IB_QPT_RAW_PACKET) + if (qp_type == IB_QPT_RAW_PACKET) context->param3 |= cpu_to_be32(1 << 30); } - if (qp->ibqp.uobject) + if (ibuobject) 
context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, - to_mucontext(ibqp->uobject->context)->uar.index)); + to_mucontext(ibuobject->context) + ->uar.index)); else context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); @@ -1759,7 +1942,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, steer_qp = 1; } - if (ibqp->qp_type == IB_QPT_GSI) { + if (qp_type == IB_QPT_GSI) { enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; u8 qpc_roce_mode = gid_type_to_qpc(gid_type); @@ -1776,7 +1959,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_AV) { - u8 port_num = mlx4_is_bonded(to_mdev(ibqp->device)->dev) ? 1 : + u8 port_num = mlx4_is_bonded(dev->dev) ? 1 : attr_mask & IB_QP_PORT ? attr->port_num : qp->port; union ib_gid gid; struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB}; @@ -1791,7 +1974,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int index = rdma_ah_read_grh(&attr->ah_attr)->sgid_index; - status = ib_get_cached_gid(ibqp->device, port_num, + status = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &gid_attr); if (!status && !memcmp(&gid, &zgid, sizeof(gid))) status = -ENOENT; @@ -1848,15 +2031,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - pd = get_pd(qp); - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp, src_type, &send_cq, &recv_cq); context->pd = cpu_to_be32(pd->pdn); context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ - if (!qp->ibqp.uobject) + if (!ibuobject) context->params1 |= cpu_to_be32(1 << 11); if (attr_mask & IB_QP_RNR_RETRY) { @@ -1891,7 +2073,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } - if (ibqp->srq) + if (ibsrq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); if (attr_mask & IB_QP_MIN_RNR_TIMER) { @@ -1922,17 +2104,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_Q_KEY; } - if (ibqp->srq) - context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); + if (ibsrq) + context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibsrq)->msrq.srqn); - if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (qp->rq.wqe_cnt && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && - (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || - ibqp->qp_type == IB_QPT_UD || - ibqp->qp_type == IB_QPT_RAW_PACKET)) { + (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI || + qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type & @@ -1965,7 +2149,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + if (qp_type == IB_QPT_RAW_PACKET) { context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | MLX4_IB_LINK_TYPE_ETH; if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { @@ -1975,7 +2159,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { 
int is_eth = rdma_port_get_link_layer( &dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET; @@ -1985,14 +2169,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; else sqd_event = 0; - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) context->rlkey_roce_mode |= (1 << 4); /* @@ -2001,7 +2186,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * headroom is stamped so that the hardware doesn't start * processing stale work requests. */ - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { struct mlx4_wqe_ctrl_seg *ctrl; int i; @@ -2058,9 +2245,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET) { - if (!ibqp->uobject) { + if (!ibuobject) { mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq) : NULL); + ibsrq ? to_msrq(ibsrq) : NULL); if (send_cq != recv_cq) mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); @@ -2265,7 +2452,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, + cur_state, new_state); if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; @@ -3550,3 +3738,208 @@ out: return err; } +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev; + struct ib_qp_init_attr ib_qp_init_attr; + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_wq ucmd; + int err, required_cmd_sz; + + if (!(udata && pd->uobject)) + return ERR_PTR(-EINVAL); + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + dev = to_mdev(pd->device); + + if (init_attr->wq_type != IB_WQT_RQ) { + pr_debug("unsupported wq type %d\n", init_attr->wq_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("unsupported create_flags %u\n", + init_attr->create_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr)); + ib_qp_init_attr.qp_context = init_attr->wq_context; + ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET; + ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr; + ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge; + ib_qp_init_attr.recv_cq = init_attr->cq; + ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */ + + err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr, + udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibwq.event_handler = init_attr->event_handler; + qp->ibwq.wq_num = qp->mqp.qpn; + qp->ibwq.state = IB_WQS_RESET; + + return &qp->ibwq; +} + +static 
int ib_wq2qp_state(enum ib_wq_state state) +{ + switch (state) { + case IB_WQS_RESET: + return IB_QPS_RESET; + case IB_WQS_RDY: + return IB_QPS_RTR; + default: + return IB_QPS_ERR; + } +} + +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + enum ib_qp_state qp_cur_state; + enum ib_qp_state qp_new_state; + int attr_mask; + int err; + + /* ib_qp.state represents the WQ HW state while ib_wq.state represents + * the WQ logic state. + */ + qp_cur_state = qp->state; + qp_new_state = ib_wq2qp_state(new_state); + + if (ib_wq2qp_state(new_state) == qp_cur_state) + return 0; + + if (new_state == IB_WQS_RDY) { + struct ib_qp_attr attr = {}; + + attr.port_num = qp->port; + attr_mask = IB_QP_PORT; + + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr, + attr_mask, IB_QPS_RESET, IB_QPS_INIT); + if (err) { + pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n", + ibwq->wq_num); + return err; + } + + qp_cur_state = IB_QPS_INIT; + } + + attr_mask = 0; + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask, + qp_cur_state, qp_new_state); + + if (err && (qp_cur_state == IB_QPS_INIT)) { + qp_new_state = IB_QPS_RESET; + if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, + attr_mask, IB_QPS_INIT, IB_QPS_RESET)) { + pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n", + ibwq->wq_num); + qp_new_state = IB_QPS_INIT; + } + } + + qp->state = qp_new_state; + + return err; +} + +int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + struct mlx4_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + enum ib_wq_state cur_state, new_state; + int err = 0; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + if (wq_attr_mask & IB_WQ_FLAGS) + return -EOPNOTSUPP; + + cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state : + ibwq->state; + new_state = wq_attr_mask & IB_WQ_STATE ? wq_attr->wq_state : cur_state; + + if (cur_state < IB_WQS_RESET || cur_state > IB_WQS_ERR || + new_state < IB_WQS_RESET || new_state > IB_WQS_ERR) + return -EINVAL; + + if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR)) + return -EINVAL; + + if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) + return -EINVAL; + + /* Can update HW state only if a RSS QP has already associated to this + * WQ, so we can apply its port on the WQ. 
+ */ + if (qp->rss_usecnt) + err = _mlx4_ib_modify_wq(ibwq, new_state); + + if (!err) + ibwq->state = new_state; + + return err; +} + +int mlx4_ib_destroy_wq(struct ib_wq *ibwq) +{ + struct mlx4_ib_dev *dev = to_mdev(ibwq->device); + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + + if (qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1); + + kfree(qp); + + return 0; +} diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index bf3bdba2f326..c9702a5f0bda 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -105,4 +105,18 @@ struct mlx4_ib_create_qp { __u8 reserved; }; +struct mlx4_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u8 log_range_size; + __u8 reserved[3]; + __u32 comp_mask; + __u32 reserved1; +}; + +struct mlx4_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From b8d46ca035060e70f5f0da849d86720752d5aa17 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 4 Jul 2017 16:24:25 +0300 Subject: IB/mlx4: Add support for WQ indirection table related verbs To enable RSS functionality, the IB indirection table object (i.e. ib_rwq_ind_table) should be used. This patch implements the related verbs, namely creating and destroying an indirection table. In downstream patches the indirection table will be used as part of RSS QP creation. Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 12 +++++-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 6 ++++ drivers/infiniband/hw/mlx4/qp.c | 70 ++++++++++++++++++++++++++++++++++++ include/uapi/rdma/mlx4-abi.h | 4 +++ 4 files changed, 89 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 7c6f929ebd3e..b42234571a8c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2738,10 +2738,16 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.create_wq = mlx4_ib_create_wq; ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.create_rwq_ind_table = + mlx4_ib_create_rwq_ind_table; + ibdev->ib_dev.destroy_rwq_ind_table = + mlx4_ib_destroy_rwq_ind_table; ibdev->ib_dev.uverbs_ex_cmd_mask |= - (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ); + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); } if (!mlx4_is_slave(ibdev->dev)) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index ba4e78c064d2..85525bc400a0 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -916,4 +916,10 @@ int mlx4_ib_destroy_wq(struct ib_wq *wq); int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index
65e4ec549368..519919d15474 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -3943,3 +3943,73 @@ int mlx4_ib_destroy_wq(struct ib_wq *ibwq) return 0; } + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_rwq_ind_table *rwq_ind_table; + struct mlx4_ib_create_rwq_ind_tbl_resp resp = {}; + unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size; + unsigned int base_wqn; + size_t min_resp_len; + int i; + int err; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (ind_tbl_size > + device->attrs.rss_caps.max_rwq_indirection_table_size) { + pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n", + ind_tbl_size, + device->attrs.rss_caps.max_rwq_indirection_table_size); + return ERR_PTR(-EINVAL); + } + + base_wqn = init_attr->ind_tbl[0]->wq_num; + + if (base_wqn % ind_tbl_size) { + pr_debug("WQN=0x%x isn't aligned with indirection table size\n", + base_wqn); + return ERR_PTR(-EINVAL); + } + + for (i = 1; i < ind_tbl_size; i++) { + if (++base_wqn != init_attr->ind_tbl[i]->wq_num) { + pr_debug("indirection table's WQNs aren't consecutive\n"); + return ERR_PTR(-EINVAL); + } + } + + rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL); + if (!rwq_ind_table) + return ERR_PTR(-ENOMEM); + + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err; + } + + return rwq_ind_table; + +err: + kfree(rwq_ind_table); + return ERR_PTR(err); +} + +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + kfree(ib_rwq_ind_tbl); + return 0; +} diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index c9702a5f0bda..5591d955ba00 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -119,4 +119,8 @@ struct mlx4_ib_modify_wq { __u32 reserved; }; +struct mlx4_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From 3078f5f1bd8b6c8aef77b8ef4d49671fa6eb058e Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 4 Jul 2017 16:24:26 +0300 Subject: IB/mlx4: Add support for RSS QP Add support to work with an RSS QP by using an indirection table object upon QP creation. Other related QP verbs (e.g. modify/destroy/query) were updated as well for that QP mode. Notes: - The RX hash properties are supplied as driver private data. - The RSS QP port is used on the associated WQs in its indirection table. Applying different ports during WQ lifetime is not allowed. - The expected RSS QP flow is: create, modify(RST->INIT), modify(INIT->RTR), destroy.
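From the user's side, that flow corresponds roughly to the generic verbs API sketched below. This is an illustrative fragment, not part of the patch: it assumes the standard libibverbs definitions, elides all error handling and the later RST->INIT->RTR transitions, and the consecutive-WQN requirement is satisfied by the provider library via the log_range_size ABI field shown earlier, not by anything visible here.

#include <infiniband/verbs.h>

/* Hedged sketch: build a 2-entry RSS QP over two receive WQs. */
static struct ibv_qp *create_rss_qp(struct ibv_context *ctx, struct ibv_pd *pd,
				    struct ibv_cq *cq, uint8_t *toeplitz_key)
{
	struct ibv_wq *wqs[2];
	struct ibv_wq_init_attr wq_attr = {
		.wq_type = IBV_WQT_RQ,
		.max_wr	 = 64,
		.max_sge = 1,
		.pd	 = pd,
		.cq	 = cq,
	};
	int i;

	for (i = 0; i < 2; i++)
		wqs[i] = ibv_create_wq(ctx, &wq_attr);	/* error checks elided */

	struct ibv_rwq_ind_table_init_attr ind_attr = {
		.log_ind_tbl_size = 1,	/* table of 1 << 1 == 2 WQs */
		.ind_tbl	  = wqs,
	};
	struct ibv_rwq_ind_table *ind_tbl =
		ibv_create_rwq_ind_table(ctx, &ind_attr);

	struct ibv_qp_init_attr_ex attr = {
		.qp_type     = IBV_QPT_RAW_PACKET,
		.comp_mask   = IBV_QP_INIT_ATTR_PD |
			       IBV_QP_INIT_ATTR_IND_TABLE |
			       IBV_QP_INIT_ATTR_RX_HASH,
		.pd	     = pd,
		.rwq_ind_tbl = ind_tbl,
		.rx_hash_conf = {
			.rx_hash_function    = IBV_RX_HASH_FUNC_TOEPLITZ,
			.rx_hash_key_len     = 40,
			.rx_hash_key	     = toeplitz_key,
			.rx_hash_fields_mask = IBV_RX_HASH_SRC_IPV4 |
					       IBV_RX_HASH_DST_IPV4,
		},
	};

	/* Afterwards the QP is moved RST->INIT (with a port) and INIT->RTR. */
	return ibv_create_qp_ex(ctx, &attr);
}

Note that an RSS QP carries no send side and no receive queue of its own, which is why the sketch sets neither send_cq/recv_cq nor cap; traffic lands on the WQs selected by the hash.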
Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mlx4_ib.h | 8 + drivers/infiniband/hw/mlx4/qp.c | 453 +++++++++++++++++++++++++++++++++-- include/uapi/rdma/mlx4-abi.h | 33 +++ 3 files changed, 472 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 85525bc400a0..1fa19820355a 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -46,6 +46,7 @@ #include #include +#include #define MLX4_IB_DRV_NAME "mlx4_ib" @@ -299,6 +300,12 @@ struct mlx4_wqn_range { struct list_head list; }; +struct mlx4_ib_rss { + unsigned int base_qpn_tbl_sz; + u8 flags; + u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; +}; + struct mlx4_ib_qp { union { struct ib_qp ibqp; @@ -345,6 +352,7 @@ struct mlx4_ib_qp { struct mlx4_wqn_range *wqn_range; /* Number of RSS QP parents that uses this WQ */ u32 rss_usecnt; + struct mlx4_ib_rss *rss_ctx; }; struct mlx4_ib_srq { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 519919d15474..e42acfb20588 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -53,6 +53,7 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state); enum { MLX4_IB_ACK_REQ_FREQ = 8, @@ -650,6 +651,212 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, qp->counter_index = NULL; } +static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd) +{ + rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num | + (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24); + + if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) && + (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) { + memcpy(rss_ctx->rss_key, ucmd->rx_hash_key, + MLX4_EN_RSS_KEY_SIZE); + } else { + pr_debug("RX Hash function is not supported\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + rss_ctx->flags = MLX4_RSS_IPV4; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + rss_ctx->flags |= MLX4_RSS_IPV6; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + pr_debug("RX Hash fields_mask for UDP is not supported\n"); + return (-EOPNOTSUPP); + } + + if (rss_ctx->flags & MLX4_RSS_IPV4) { + rss_ctx->flags |= MLX4_RSS_UDP_IPV4; + } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + rss_ctx->flags |= MLX4_RSS_UDP_IPV6; + } else { + pr_debug("RX Hash fields_mask 
is not supported - UDP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + if (rss_ctx->flags & MLX4_RSS_IPV4) { + rss_ctx->flags |= MLX4_RSS_TCP_IPV4; + } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + rss_ctx->flags |= MLX4_RSS_TCP_IPV6; + } else { + pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + return 0; +} + +static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd, + struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage); + if (err) + return err; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + mutex_init(&qp->mutex); + + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_ETHERTYPE; + qp->state = IB_QPS_RESET; + + /* Set dummy send resources to be compatible with HV and PRM */ + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE; + qp->mtt = (to_mqp( + (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt; + + qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL); + if (!qp->rss_ctx) { + err = -ENOMEM; + goto err_qp_alloc; + } + + err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd); + if (err) + goto err; + + return 0; + +err: + kfree(qp->rss_ctx); + +err_qp_alloc: + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + +err_qpn: + mlx4_qp_release_range(dev->dev, qpn, 1); + return err; +} + +static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + int err; + + if (!udata) { + pr_debug("RSS QP with NULL udata\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + pr_debug("copy failed\n"); + return ERR_PTR(-EFAULT); + } + + if (ucmd.comp_mask || ucmd.reserved1) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + pr_debug("RSS QP with unsupported QP type %d\n", + init_attr->qp_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("RSS QP doesn't 
support create flags\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->send_cq || init_attr->cap.max_send_wr) { + pr_debug("RSS QP with unsupported send attributes\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + + return &qp->ibqp; +} + /* * This function allocates a WQN from a range which is consecutive and aligned * to its size. In case the range is full, then it creates a new range and @@ -1186,6 +1393,36 @@ static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, } } +static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (qp->state != IB_QPS_RESET) { + int i; + + for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size); + i++) { + struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + del_gid_entries(qp); + kfree(qp->rss_ctx); +} + static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, int is_user) { @@ -1303,6 +1540,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; u16 xrcdn = 0; + if (init_attr->rwq_ind_tbl) + return _mlx4_ib_create_qp_rss(pd, init_attr, udata); + /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. @@ -1444,7 +1684,6 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); - struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); @@ -1459,8 +1698,14 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) if (mqp->counter_index) mlx4_ib_free_qp_counter(dev, mqp); - pd = get_pd(mqp); - destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + if (qp->rwq_ind_tbl) { + destroy_qp_rss(dev, mqp); + } else { + struct mlx4_ib_pd *pd; + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + } if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -1783,12 +2028,116 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type) } } +/* + * Go over all RSS QP's childes (WQs) and apply their HW state according to + * their logic state if the RSS QP is the first RSS QP associated for the WQ. + */ +static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) +{ + int i; + int err; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + /* Mlx4_ib restrictions: + * WQ's is associated to a port according to the RSS QP it is + * associates to. + * In case the WQ is associated to a different port by another + * RSS QP, return a failure. 
+ */ + if ((wq->rss_usecnt > 0) && (wq->port != port_num)) { + err = -EINVAL; + mutex_unlock(&wq->mutex); + break; + } + wq->port = port_num; + if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) { + err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY); + if (err) { + mutex_unlock(&wq->mutex); + break; + } + } + wq->rss_usecnt++; + + mutex_unlock(&wq->mutex); + } + + if (i && err) { + int j; + + for (j = (i - 1); j >= 0; j--) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[j]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && + (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=0x%06x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + } + + return err; +} + +static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) +{ + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=%x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } +} + +static void fill_qp_rss_context(struct mlx4_qp_context *context, + struct mlx4_ib_qp *qp) +{ + struct mlx4_rss_context *rss_context; + + rss_context = (void *)context + offsetof(struct mlx4_qp_context, + pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + + rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz); + rss_context->default_qpn = + cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff); + if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6)) + rss_context->base_qpn_udp = rss_context->default_qpn; + rss_context->flags = qp->rss_ctx->flags; + /* Currently support just toeplitz */ + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + + memcpy(rss_context->rss_key, qp->rss_ctx->rss_key, + MLX4_EN_RSS_KEY_SIZE); +} + static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct ib_uobject *ibuobject; struct ib_srq *ibsrq; + struct ib_rwq_ind_table *rwq_ind_tbl; enum ib_qp_type qp_type; struct mlx4_ib_dev *dev; struct mlx4_ib_qp *qp; @@ -1804,23 +2153,25 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, if (src_type == MLX4_IB_RWQ_SRC) { struct ib_wq *ibwq; - ibwq = (struct ib_wq *)src; - ibuobject = ibwq->uobject; - ibsrq = NULL; - qp_type = IB_QPT_RAW_PACKET; - qp = to_mqp((struct ib_qp *)ibwq); - dev = to_mdev(ibwq->device); - pd = to_mpd(ibwq->pd); + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + rwq_ind_tbl = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); } else { struct ib_qp *ibqp; - ibqp = (struct ib_qp *)src; - ibuobject = ibqp->uobject; - ibsrq = ibqp->srq; - qp_type = ibqp->qp_type; - qp = to_mqp(ibqp); - dev = to_mdev(ibqp->device); - pd = get_pd(qp); + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + rwq_ind_tbl = ibqp->rwq_ind_tbl; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); } /* APM is not supported under RoCE */ @@ -1836,6 +2187,11 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, context->flags = 
cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + if (rwq_ind_tbl) { + fill_qp_rss_context(context, qp); + context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); + } + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); else { @@ -1876,9 +2232,11 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, ilog2(dev->dev->caps.max_msg_sz); } - if (qp->rq.wqe_cnt) - context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; - context->rq_size_stride |= qp->rq.wqe_shift - 4; + if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */ + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + } if (qp->sq.wqe_cnt) context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; @@ -2031,8 +2389,14 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - get_cqs(qp, src_type, &send_cq, &recv_cq); - context->pd = cpu_to_be32(pd->pdn); + context->pd = cpu_to_be32(pd->pdn); + + if (!rwq_ind_tbl) { + get_cqs(qp, src_type, &send_cq, &recv_cq); + } else { /* Set dummy CQs to be compatible with HV and PRM */ + send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq); + recv_cq = send_cq; + } context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); @@ -2358,6 +2722,11 @@ out: return err; } +enum { + MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE | + IB_QP_PORT), +}; + static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -2388,6 +2757,27 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } + if (ibqp->rwq_ind_tbl) { + if (!(((cur_state == IB_QPS_RESET) && + (new_state == IB_QPS_INIT)) || + ((cur_state == IB_QPS_INIT) && + (new_state == IB_QPS_RTR)))) { + pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n", + ibqp->qp_num, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + + if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) { + pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n", + ibqp->qp_num, attr_mask, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + } + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { if ((ibqp->qp_type == IB_QPT_RC) || @@ -2452,9 +2842,18 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } + if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) { + err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num); + if (err) + goto out; + } + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, cur_state, new_state); + if (ibqp->rwq_ind_tbl && err) + bring_down_rss_rwqs(ibqp->rwq_ind_tbl); + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; @@ -3643,6 +4042,9 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr int mlx4_state; int err = 0; + if (ibqp->rwq_ind_tbl) + return -EOPNOTSUPP; + mutex_lock(&qp->mutex); if (qp->state == IB_QPS_RESET) { @@ -3917,6 +4319,11 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) return -EINVAL; + /* Need to protect against the parent RSS which also may modify WQ + * state. 
+ */ + mutex_lock(&qp->mutex); + /* Can update HW state only if a RSS QP has already associated to this * WQ, so we can apply its port on the WQ. */ @@ -3926,6 +4333,8 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, if (!err) ibwq->state = new_state; + mutex_unlock(&qp->mutex); + return err; } diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 5591d955ba00..d915cab37ec3 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -95,6 +95,16 @@ struct mlx4_ib_create_srq_resp { __u32 reserved; }; +struct mlx4_ib_create_qp_rss { + __u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 rx_key_len; + __u8 reserved[6]; + __u8 rx_hash_key[40]; + __u32 comp_mask; + __u32 reserved1; +}; + struct mlx4_ib_create_qp { __u64 buf_addr; __u64 db_addr; @@ -123,4 +133,27 @@ struct mlx4_ib_create_rwq_ind_tbl_resp { __u32 response_length; __u32 reserved; }; + +/* RX Hash function flags */ +enum mlx4_ib_rx_hash_function_flags { + MLX4_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + */ +enum mlx4_ib_rx_hash_fields { + MLX4_IB_RX_HASH_SRC_IPV4 = 1 << 0, + MLX4_IB_RX_HASH_DST_IPV4 = 1 << 1, + MLX4_IB_RX_HASH_SRC_IPV6 = 1 << 2, + MLX4_IB_RX_HASH_DST_IPV6 = 1 << 3, + MLX4_IB_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX4_IB_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX4_IB_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX4_IB_RX_HASH_DST_PORT_UDP = 1 << 7 +}; + #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From d08477aa975e97f1dc64c0ae59cebf98520456ce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 29 Jun 2017 09:28:50 -0500 Subject: fcntl: Don't use ambiguous SIG_POLL si_codes We have a weird and problematic intersection of features that when they all come together result in ambiguous siginfo values, that we can not support properly. - Supporting fcntl(F_SETSIG,...) with arbitrary valid signals. - Using positive values for POLL_IN, POLL_OUT, POLL_MSG, ..., etc that imply they are signal specific si_codes and using the aforementioned arbitrary signal to deliver them. - Supporting injection of arbitrary siginfo values for debugging and checkpoint/restore. The result is that, just looking at siginfo, si_codes of 1 to 6 are ambiguous. Each could either be a signal specific si_code or a generic si_code. For most of the kernel this is a non-issue but for sending signals with siginfo it is impossible to play back the kernel signals and get the same result. Strictly speaking, when the si_code was changed from SI_SIGIO to POLL_IN and friends between 2.2 and 2.4 this functionality was not ambiguous, as only real time signals were supported. Before 2.4 was released the kernel began supporting siginfo with non realtime signals so they could give details of why the signal was sent. The result is that if F_SETSIG is set to one of the signals with signal specific si_codes then user space can not know why the signal was sent. I grepped through a bunch of userspace programs using debian code search to get a feel for how often people choose a signal that results in an ambiguous si_code. I only found one program doing so, and it was using SIGCHLD to test the F_SETSIG functionality and did not appear to be a real world usage. Therefore the ambiguity does not appear to be a real world problem in practice.
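To make the ambiguity concrete, the sketch below (illustrative only, not part of the patch) wires up the F_SETSIG mechanism. With a realtime signal, as shown, the delivered si_code of POLL_IN is unambiguous; had SIGCHLD been passed to F_SETSIG instead, the same numeric si_code (1) would collide with CLD_EXITED. Note that printf() in a signal handler is not async-signal-safe and is used here only for brevity.

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_io(int sig, siginfo_t *si, void *uctx)
{
	(void)uctx;
	/* On SIGRTMIN, si_code == POLL_IN is unambiguous; on SIGCHLD the
	 * same value would also mean CLD_EXITED. */
	printf("sig=%d si_code=%d si_band=%ld si_fd=%d\n",
	       sig, si->si_code, si->si_band, si->si_fd);
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = on_io, .sa_flags = SA_SIGINFO };
	int flags;

	sigemptyset(&sa.sa_mask);
	sigaction(SIGRTMIN, &sa, NULL);
	fcntl(STDIN_FILENO, F_SETOWN, getpid());
	fcntl(STDIN_FILENO, F_SETSIG, SIGRTMIN);	/* request queued siginfo */
	flags = fcntl(STDIN_FILENO, F_GETFL);
	fcntl(STDIN_FILENO, F_SETFL, flags | O_ASYNC);
	pause();	/* type something: the handler reports the I/O event */
	return 0;
}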
Remove the ambiguity while introducing the smallest chance of breakage by changing the si_code to SI_SIGIO when signals with signal specific si_codes are targeted. Fixes: v2.3.40 -- Added support for queueing non-rt signals Fixes: v2.3.21 -- Changed the si_code from SI_SIGIO Signed-off-by: "Eric W. Biederman" --- fs/fcntl.c | 13 ++++++++++++- include/linux/signal.h | 8 ++++++++ include/uapi/asm-generic/siginfo.h | 4 ++-- 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/fcntl.c b/fs/fcntl.c index 3b01b646e528..0491da3b28c3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -741,10 +741,21 @@ static void send_sigio_to_task(struct task_struct *p, si.si_signo = signum; si.si_errno = 0; si.si_code = reason; + /* + * Posix definies POLL_IN and friends to be signal + * specific si_codes for SIG_POLL. Linux extended + * these si_codes to other signals in a way that is + * ambiguous if other signals also have signal + * specific si_codes. In that case use SI_SIGIO instead + * to remove the ambiguity. + */ + if (sig_specific_sicodes(signum)) + si.si_code = SI_SIGIO; + /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ - BUG_ON((reason & __SI_MASK) != __SI_POLL); + BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else diff --git a/include/linux/signal.h b/include/linux/signal.h index e2678b5dbb21..c97cc20369c0 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -380,10 +380,18 @@ int unhandled_signal(struct task_struct *tsk, int sig); rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) +#define SIG_SPECIFIC_SICODES_MASK (\ + rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ + rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ + rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ + rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ + SIGEMT_MASK ) + #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) +#define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 9c4eca6b374a..9e956ea94d57 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -184,7 +184,7 @@ typedef struct siginfo { #define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */ #define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */ #define SI_ASYNCIO -4 /* sent by AIO completion */ -#define SI_SIGIO -5 /* sent by queued SIGIO */ +#define SI_SIGIO __SI_CODE(__SI_POLL,-5) /* sent by queued SIGIO */ #define SI_TKILL -6 /* sent by tkill system call */ #define SI_DETHREAD -7 /* sent by execve() killing subsidiary threads */ @@ -259,7 +259,7 @@ typedef struct siginfo { #define NSIGCHLD 6 /* - * SIGPOLL si_codes + * SIGPOLL (or any other signal without signal specific si_codes) si_codes */ #define POLL_IN (__SI_POLL|1) /* data input available */ #define POLL_OUT (__SI_POLL|2) /* output buffers available */ -- cgit v1.2.3 From cc731525f26af85a1c3537da41e0abd1d35e0bdb Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Sun, 16 Jul 2017 22:36:59 -0500 Subject: signal: Remove kernel interal si_code magic struct siginfo is a union and the kernel since 2.4 has been hiding a union tag in the high 16bits of si_code using the values: __SI_KILL __SI_TIMER __SI_POLL __SI_FAULT __SI_CHLD __SI_RT __SI_MESGQ __SI_SYS While this looks plausible on the surface, in practice this situation has not worked well. - Injected positive signals are not copied to user space properly unless they have these magic high bits set. - Injected positive signals are not reported properly by signalfd unless they have these magic high bits set. - These kernel internal values leaked to userspace via ptrace_peek_siginfo - It was possible to inject these kernel internal values and cause the the kernel to misbehave. - Kernel developers got confused and expected these kernel internal values in userspace in kernel self tests. - Kernel developers got confused and set si_code to __SI_FAULT which is SI_USER in userspace which causes userspace to think an ordinary user sent the signal and that it was not kernel generated. - The values make it impossible to reorganize the code to transform siginfo_copy_to_user into a plain copy_to_user. As si_code must be massaged before being passed to userspace. So remove these kernel internal si codes and make the kernel code simpler and more maintainable. To replace these kernel internal magic si_codes introduce the helper function siginfo_layout, that takes a signal number and an si_code and computes which union member of siginfo is being used. Have siginfo_layout return an enumeration so that gcc will have enough information to warn if a switch statement does not handle all of union members. A couple of architectures have a messed up ABI that defines signal specific duplications of SI_USER which causes more special cases in siginfo_layout than I would like. The good news is only problem architectures pay the cost. Update all of the code that used the previous magic __SI_ values to use the new SIL_ values and to call siginfo_layout to get those values. Escept where not all of the cases are handled remove the defaults in the switch statements so that if a new case is missed in the future the lack will show up at compile time. Modify the code that copies siginfo si_code to userspace to just copy the value and not cast si_code to a short first. The high bits are no longer used to hold a magic union member. Fixup the siginfo header files to stop including the __SI_ values in their constants and for the headers that were missing it to properly update the number of si_codes for each signal type. The fixes to copy_siginfo_from_user32 implementations has the interesting property that several of them perviously should never have worked as the __SI_ values they depended up where kernel internal. With that dependency gone those implementations should work much better. The idea of not passing the __SI_ values out to userspace and then not reinserting them has been tested with criu and criu worked without changes. Ref: 2.4.0-test1 Signed-off-by: "Eric W. 
Biederman" --- arch/alpha/include/uapi/asm/siginfo.h | 4 +- arch/arm64/kernel/signal32.c | 23 +++---- arch/blackfin/include/uapi/asm/siginfo.h | 30 +++++--- arch/frv/include/uapi/asm/siginfo.h | 2 +- arch/ia64/include/uapi/asm/siginfo.h | 20 +++--- arch/ia64/kernel/signal.c | 17 +++-- arch/mips/include/uapi/asm/siginfo.h | 6 +- arch/mips/kernel/signal32.c | 19 +++-- arch/parisc/kernel/signal32.c | 31 ++++----- arch/powerpc/kernel/signal_32.c | 20 +++--- arch/s390/kernel/compat_signal.c | 32 ++++----- arch/sparc/include/uapi/asm/siginfo.h | 4 +- arch/sparc/kernel/signal32.c | 16 ++--- arch/tile/include/uapi/asm/siginfo.h | 4 +- arch/tile/kernel/compat_signal.c | 18 +++-- arch/tile/kernel/traps.c | 2 +- arch/x86/kernel/signal_compat.c | 21 +++--- fs/signalfd.c | 22 +++--- include/linux/signal.h | 14 ++++ include/uapi/asm-generic/siginfo.h | 115 +++++++++++++------------------ kernel/exit.c | 4 +- kernel/ptrace.c | 6 +- kernel/signal.c | 72 ++++++++++++++----- 23 files changed, 257 insertions(+), 245 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/siginfo.h b/arch/alpha/include/uapi/asm/siginfo.h index 972f547d9e41..70494d1d8f29 100644 --- a/arch/alpha/include/uapi/asm/siginfo.h +++ b/arch/alpha/include/uapi/asm/siginfo.h @@ -10,14 +10,14 @@ * SIGFPE si_codes */ #ifdef __KERNEL__ -#define FPE_FIXME (__SI_FAULT|0) /* Broken dup of SI_USER */ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ #endif /* __KERNEL__ */ /* * SIGTRAP si_codes */ #ifdef __KERNEL__ -#define TRAP_FIXME (__SI_FAULT|0) /* Broken dup of SI_USER */ +#define TRAP_FIXME 0 /* Broken dup of SI_USER */ #endif /* __KERNEL__ */ #endif diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index c747a0fc5d7d..9b95a935c21d 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -142,25 +142,25 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else switch (from->si_code & __SI_MASK) { - case __SI_KILL: + else switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_TIMER: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr, &to->si_addr); #ifdef BUS_MCEERR_AO @@ -173,29 +173,24 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ: /* But this is */ + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_SYS: + case SIL_SYS: err |= __put_user((compat_uptr_t)(unsigned long) from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; - default: /* this is just in case for now ... */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; } return err; } diff --git a/arch/blackfin/include/uapi/asm/siginfo.h b/arch/blackfin/include/uapi/asm/siginfo.h index c72f4e6e386f..79dfe3979123 100644 --- a/arch/blackfin/include/uapi/asm/siginfo.h +++ b/arch/blackfin/include/uapi/asm/siginfo.h @@ -14,28 +14,36 @@ #define si_uid16 _sifields._kill._uid -#define ILL_ILLPARAOP (__SI_FAULT|2) /* illegal opcode combine ********** */ -#define ILL_ILLEXCPT (__SI_FAULT|4) /* unrecoverable exception ********** */ -#define ILL_CPLB_VI (__SI_FAULT|9) /* D/I CPLB protect violation ******** */ -#define ILL_CPLB_MISS (__SI_FAULT|10) /* D/I CPLB miss ******** */ -#define ILL_CPLB_MULHIT (__SI_FAULT|11) /* D/I CPLB multiple hit ******** */ +#define ILL_ILLPARAOP 2 /* illegal opcode combine ********** */ +#define ILL_ILLEXCPT 4 /* unrecoverable exception ********** */ +#define ILL_CPLB_VI 9 /* D/I CPLB protect violation ******** */ +#define ILL_CPLB_MISS 10 /* D/I CPLB miss ******** */ +#define ILL_CPLB_MULHIT 11 /* D/I CPLB multiple hit ******** */ +#undef NSIGILL +#define NSIGILL 11 /* * SIGBUS si_codes */ -#define BUS_OPFETCH (__SI_FAULT|4) /* error from instruction fetch ******** */ +#define BUS_OPFETCH 4 /* error from instruction fetch ******** */ +#undef NSIGBUS +#define NSIGBUS 4 /* * SIGTRAP si_codes */ -#define TRAP_STEP (__SI_FAULT|1) /* single-step breakpoint************* */ -#define TRAP_TRACEFLOW (__SI_FAULT|2) /* trace buffer overflow ************* */ -#define TRAP_WATCHPT (__SI_FAULT|3) /* watchpoint match ************* */ -#define TRAP_ILLTRAP (__SI_FAULT|4) /* illegal trap ************* */ +#define TRAP_STEP 1 /* single-step breakpoint************* */ +#define TRAP_TRACEFLOW 2 /* trace buffer overflow ************* */ +#define TRAP_WATCHPT 3 /* watchpoint match ************* */ +#define TRAP_ILLTRAP 4 /* illegal trap ************* */ +#undef NSIGTRAP +#define NSIGTRAP 4 /* * SIGSEGV si_codes */ -#define SEGV_STACKFLOW (__SI_FAULT|3) /* stack overflow */ +#define SEGV_STACKFLOW 3 /* stack overflow */ +#undef NSIGSEGV +#define NSIGSEGV 3 #endif /* _UAPI_BFIN_SIGINFO_H */ diff --git a/arch/frv/include/uapi/asm/siginfo.h b/arch/frv/include/uapi/asm/siginfo.h index d3fd1ca45653..f55d9e0e9068 100644 --- a/arch/frv/include/uapi/asm/siginfo.h +++ b/arch/frv/include/uapi/asm/siginfo.h @@ -4,7 +4,7 @@ #include #include -#define FPE_MDAOVF (__SI_FAULT|9) /* media overflow */ +#define FPE_MDAOVF 9 /* media overflow */ #undef NSIGFPE #define NSIGFPE 9 diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h index 3282f8b992fc..33389fc36f23 100644 --- a/arch/ia64/include/uapi/asm/siginfo.h +++ b/arch/ia64/include/uapi/asm/siginfo.h @@ -98,9 +98,9 @@ typedef struct siginfo { /* * SIGILL si_codes */ -#define ILL_BADIADDR (__SI_FAULT|9) /* unimplemented instruction address */ -#define __ILL_BREAK (__SI_FAULT|10) /* illegal break */ -#define __ILL_BNDMOD (__SI_FAULT|11) /* bundle-update (modification) in progress */ +#define ILL_BADIADDR 9 /* 
unimplemented instruction address */ +#define __ILL_BREAK 10 /* illegal break */ +#define __ILL_BNDMOD 11 /* bundle-update (modification) in progress */ #undef NSIGILL #define NSIGILL 11 @@ -108,20 +108,20 @@ typedef struct siginfo { * SIGFPE si_codes */ #ifdef __KERNEL__ -#define FPE_FIXME (__SI_FAULT|0) /* Broken dup of SI_USER */ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ #endif /* __KERNEL__ */ -#define __FPE_DECOVF (__SI_FAULT|9) /* decimal overflow */ -#define __FPE_DECDIV (__SI_FAULT|10) /* decimal division by zero */ -#define __FPE_DECERR (__SI_FAULT|11) /* packed decimal error */ -#define __FPE_INVASC (__SI_FAULT|12) /* invalid ASCII digit */ -#define __FPE_INVDEC (__SI_FAULT|13) /* invalid decimal digit */ +#define __FPE_DECOVF 9 /* decimal overflow */ +#define __FPE_DECDIV 10 /* decimal division by zero */ +#define __FPE_DECERR 11 /* packed decimal error */ +#define __FPE_INVASC 12 /* invalid ASCII digit */ +#define __FPE_INVDEC 13 /* invalid decimal digit */ #undef NSIGFPE #define NSIGFPE 13 /* * SIGSEGV si_codes */ -#define __SEGV_PSTKOVF (__SI_FAULT|4) /* paragraph stack overflow */ +#define __SEGV_PSTKOVF 4 /* paragraph stack overflow */ #undef NSIGSEGV #define NSIGSEGV 4 diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 5db52c6813c4..6146d53b6ad7 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -124,31 +124,30 @@ copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + err |= __put_user(from->si_code, &to->si_code); + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: err |= __put_user(from->si_flags, &to->si_flags); err |= __put_user(from->si_isr, &to->si_isr); - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_addr, &to->si_addr); err |= __put_user(from->si_imm, &to->si_imm); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_ptr, &to->si_ptr); break; - case __SI_RT >> 16: /* Not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_ptr, &to->si_ptr); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); - default: + case SIL_KILL: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_pid, &to->si_pid); break; diff --git a/arch/mips/include/uapi/asm/siginfo.h b/arch/mips/include/uapi/asm/siginfo.h index 9becfd102132..22a86d84a504 100644 --- a/arch/mips/include/uapi/asm/siginfo.h +++ b/arch/mips/include/uapi/asm/siginfo.h @@ -120,14 +120,14 @@ typedef struct siginfo { #undef SI_TIMER #undef SI_MESGQ #define SI_ASYNCIO -2 /* sent by AIO completion */ -#define SI_TIMER __SI_CODE(__SI_TIMER, -3) /* sent by timer expiration */ -#define SI_MESGQ __SI_CODE(__SI_MESGQ, -4) /* sent by real time mesq state change */ +#define SI_TIMER -3 /* sent by timer expiration */ +#define SI_MESGQ -4 /* sent by real time mesq state change */ /* * SIGFPE si_codes */ #ifdef __KERNEL__ -#define FPE_FIXME (__SI_FAULT|0) /* Broken dup of SI_USER */ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ #endif /* __KERNEL__ */ #endif /* _UAPI_ASM_SIGINFO_H */ diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 84165f2b31ff..cf5c7c05e5a3 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -93,38 +93,37 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_TIMER >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); - default: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned long)from->si_addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __copy_to_user(&to->si_call_addr, &from->si_call_addr, sizeof(compat_uptr_t)); err |= __put_user(from->si_syscall, &to->si_syscall); diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index 70aaabb8b3cb..9e0cb6a577d6 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -290,25 +290,25 @@ copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from) if (to->si_code < 0) err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (to->si_code >> 16) { - case __SI_CHLD >> 16: + switch (siginfo_layout(to->si_signo, to->si_code)) { + case SIL_CHLD: err |= __get_user(to->si_utime, &from->si_utime); err |= __get_user(to->si_stime, &from->si_stime); err |= __get_user(to->si_status, &from->si_status); default: + case SIL_KILL: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __get_user(addr, &from->si_addr); to->si_addr = compat_ptr(addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); err |= __get_user(to->si_int, &from->si_int); @@ -337,41 +337,40 @@ copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_CHLD >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); - default: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: addr = ptr_to_compat(from->si_addr); err |= __put_user(addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); val = (compat_int_t)from->si_int; err |= __put_user(val, &to->si_int); break; - case __SI_RT >> 16: /* Not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_pid, &to->si_pid); val = (compat_int_t)from->si_int; err |= __put_user(val, &to->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __put_user(ptr_to_compat(from->si_call_addr), &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 97bb1385e771..92fb1c8dbbd8 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -913,42 +913,40 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) */ err = __put_user(s->si_signo, &d->si_signo); err |= __put_user(s->si_errno, &d->si_errno); - err |= __put_user((short)s->si_code, &d->si_code); + err |= __put_user(s->si_code, &d->si_code); if (s->si_code < 0) err |= __copy_to_user(&d->_sifields._pad, &s->_sifields._pad, SI_PAD_SIZE32); - else switch(s->si_code >> 16) { - case __SI_CHLD >> 16: + else switch(siginfo_layout(s->si_signo, s->si_code)) { + case SIL_CHLD: err |= __put_user(s->si_pid, &d->si_pid); err |= __put_user(s->si_uid, &d->si_uid); err |= __put_user(s->si_utime, &d->si_utime); err |= __put_user(s->si_stime, &d->si_stime); err |= __put_user(s->si_status, &d->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned int)(unsigned long)s->si_addr, &d->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(s->si_band, &d->si_band); err |= __put_user(s->si_fd, &d->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(s->si_tid, &d->si_tid); err |= __put_user(s->si_overrun, &d->si_overrun); err |= __put_user(s->si_int, &d->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __put_user(ptr_to_compat(s->si_call_addr), &d->si_call_addr); err |= __put_user(s->si_syscall, &d->si_syscall); err |= __put_user(s->si_arch, &d->si_arch); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(s->si_int, &d->si_int); /* fallthrough */ - case __SI_KILL >> 16: - default: + case SIL_KILL: err |= __put_user(s->si_pid, &d->si_pid); err |= __put_user(s->si_uid, &d->si_uid); break; diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index c620049c61f2..f549c4657376 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -75,35 +75,34 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_RT >> 16: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_RT: err |= __put_user(from->si_int, &to->si_int); /* fallthrough */ - case __SI_KILL >> 16: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned long) from->si_addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); @@ -127,32 +126,31 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) if (to->si_code < 0) err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (to->si_code >> 16) { - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + switch (siginfo_layout(to->si_signo, to->si_code)) { + case SIL_RT: err |= __get_user(to->si_int, &from->si_int); /* fallthrough */ - case __SI_KILL >> 16: + case SIL_KILL: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); err |= __get_user(to->si_utime, &from->si_utime); err |= __get_user(to->si_stime, &from->si_stime); err |= __get_user(to->si_status, &from->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __get_user(tmp, &from->si_addr); to->si_addr = (void __force __user *) (u64) (tmp & PSW32_ADDR_INSN); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __get_user(to->si_tid, &from->si_tid); err |= __get_user(to->si_overrun, &from->si_overrun); err |= __get_user(to->si_int, &from->si_int); diff --git a/arch/sparc/include/uapi/asm/siginfo.h b/arch/sparc/include/uapi/asm/siginfo.h index da2126e0c536..157f46fe374f 100644 --- a/arch/sparc/include/uapi/asm/siginfo.h +++ b/arch/sparc/include/uapi/asm/siginfo.h @@ -20,13 +20,13 @@ * SIGFPE si_codes */ #ifdef __KERNEL__ -#define FPE_FIXME (__SI_FAULT|0) /* Broken dup of SI_USER */ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ #endif /* __KERNEL__ */ /* * SIGEMT si_codes */ -#define EMT_TAGOVF (__SI_FAULT|1) /* tag overflow */ +#define EMT_TAGOVF 1 /* tag overflow */ #define NSIGEMT 1 #endif /* _UAPI__SPARC_SIGINFO_H */ diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index b4096bb665b2..0e4c08c45a37 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -85,34 +85,34 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) at the same time. 
*/ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_TIMER >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); default: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user(from->si_trapno, &to->si_trapno); err |= __put_user((unsigned long)from->si_addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); diff --git a/arch/tile/include/uapi/asm/siginfo.h b/arch/tile/include/uapi/asm/siginfo.h index 56d661bb010b..e83f931aa1f0 100644 --- a/arch/tile/include/uapi/asm/siginfo.h +++ b/arch/tile/include/uapi/asm/siginfo.h @@ -26,8 +26,8 @@ /* * Additional Tile-specific SIGILL si_codes */ -#define ILL_DBLFLT (__SI_FAULT|9) /* double fault */ -#define ILL_HARDWALL (__SI_FAULT|10) /* user networks hardwall violation */ +#define ILL_DBLFLT 9 /* double fault */ +#define ILL_HARDWALL 10 /* user networks hardwall violation */ #undef NSIGILL #define NSIGILL 10 diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index 0e863f1ee08c..971d87a1d8cf 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -64,7 +64,7 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr 3 ints plus the relevant union member. 
*/ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) { err |= __put_user(from->si_pid, &to->si_pid); @@ -77,28 +77,26 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr */ err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); /* FALL THROUGH */ default: - case __SI_KILL >> 16: + case SIL_KILL: err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); break; diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 54804866f238..9b08c6055f15 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -188,7 +188,7 @@ static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep) /* Make it the requested signal. */ *sigp = sig; - *codep = code | __SI_FAULT; + *codep = code; return 1; } diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 71beb28600d4..ab9feb5887b1 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -129,7 +129,7 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, 3 ints plus the relevant union member. 
*/ put_user_ex(from->si_signo, &to->si_signo); put_user_ex(from->si_errno, &to->si_errno); - put_user_ex((short)from->si_code, &to->si_code); + put_user_ex(from->si_code, &to->si_code); if (from->si_code < 0) { put_user_ex(from->si_pid, &to->si_pid); @@ -142,8 +142,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, */ put_user_ex(from->_sifields._pad[0], &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: if (from->si_signo == SIGBUS && (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) @@ -160,11 +160,11 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, put_user_ex(from->si_pkey, &to->si_pkey); } break; - case __SI_SYS >> 16: + case SIL_SYS: put_user_ex(from->si_syscall, &to->si_syscall); put_user_ex(from->si_arch, &to->si_arch); break; - case __SI_CHLD >> 16: + case SIL_CHLD: if (!x32_ABI) { put_user_ex(from->si_utime, &to->si_utime); put_user_ex(from->si_stime, &to->si_stime); @@ -174,21 +174,18 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, } put_user_ex(from->si_status, &to->si_status); /* FALL THROUGH */ - default: - case __SI_KILL >> 16: + case SIL_KILL: put_user_ex(from->si_uid, &to->si_uid); break; - case __SI_POLL >> 16: + case SIL_POLL: put_user_ex(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: put_user_ex(from->si_overrun, &to->si_overrun); put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: + case SIL_RT: put_user_ex(from->si_uid, &to->si_uid); put_user_ex(from->si_int, &to->si_int); break; diff --git a/fs/signalfd.c b/fs/signalfd.c index 593b022ac11b..d2c434112f42 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -95,23 +95,23 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, */ err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo); err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno); - err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code); - switch (kinfo->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(kinfo->si_code, &uinfo->ssi_code); + switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) { + case SIL_KILL: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); break; - case __SI_TIMER: + case SIL_TIMER: err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(kinfo->si_band, &uinfo->ssi_band); err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); @@ -128,20 +128,14 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, &uinfo->ssi_addr_lsb); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); err |= __put_user(kinfo->si_status, &uinfo->ssi_status); err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime); err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime); break; - case __SI_RT: /* This is not generated by 
the kernel as of now. */ - case __SI_MESGQ: /* But this is */ - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); - break; + case SIL_RT: default: /* * This case catches also the signals queued by sigqueue(). diff --git a/include/linux/signal.h b/include/linux/signal.h index c97cc20369c0..38564e3e54c7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -21,6 +21,20 @@ static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); +enum siginfo_layout { + SIL_KILL, + SIL_TIMER, + SIL_POLL, + SIL_FAULT, + SIL_CHLD, + SIL_RT, +#ifdef __ARCH_SIGSYS + SIL_SYS, +#endif +}; + +enum siginfo_layout siginfo_layout(int sig, int si_code); + /* * Define some primitives to manipulate sigset_t. */ diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 9e956ea94d57..e5aa6794cea4 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -151,29 +151,6 @@ typedef struct siginfo { #define si_arch _sifields._sigsys._arch #endif -#ifdef __KERNEL__ -#define __SI_MASK 0xffff0000u -#define __SI_KILL (0 << 16) -#define __SI_TIMER (1 << 16) -#define __SI_POLL (2 << 16) -#define __SI_FAULT (3 << 16) -#define __SI_CHLD (4 << 16) -#define __SI_RT (5 << 16) -#define __SI_MESGQ (6 << 16) -#define __SI_SYS (7 << 16) -#define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) -#else /* __KERNEL__ */ -#define __SI_KILL 0 -#define __SI_TIMER 0 -#define __SI_POLL 0 -#define __SI_FAULT 0 -#define __SI_CHLD 0 -#define __SI_RT 0 -#define __SI_MESGQ 0 -#define __SI_SYS 0 -#define __SI_CODE(T,N) (N) -#endif /* __KERNEL__ */ - /* * si_code values * Digital reserves positive values for kernel-generated signals. 
@@ -181,10 +158,10 @@ typedef struct siginfo { #define SI_USER 0 /* sent by kill, sigsend, raise */ #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ #define SI_QUEUE -1 /* sent by sigqueue */ -#define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */ -#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */ +#define SI_TIMER -2 /* sent by timer expiration */ +#define SI_MESGQ -3 /* sent by real time mesq state change */ #define SI_ASYNCIO -4 /* sent by AIO completion */ -#define SI_SIGIO __SI_CODE(__SI_POLL,-5) /* sent by queued SIGIO */ +#define SI_SIGIO -5 /* sent by queued SIGIO */ #define SI_TKILL -6 /* sent by tkill system call */ #define SI_DETHREAD -7 /* sent by execve() killing subsidiary threads */ @@ -194,86 +171,86 @@ typedef struct siginfo { /* * SIGILL si_codes */ -#define ILL_ILLOPC (__SI_FAULT|1) /* illegal opcode */ -#define ILL_ILLOPN (__SI_FAULT|2) /* illegal operand */ -#define ILL_ILLADR (__SI_FAULT|3) /* illegal addressing mode */ -#define ILL_ILLTRP (__SI_FAULT|4) /* illegal trap */ -#define ILL_PRVOPC (__SI_FAULT|5) /* privileged opcode */ -#define ILL_PRVREG (__SI_FAULT|6) /* privileged register */ -#define ILL_COPROC (__SI_FAULT|7) /* coprocessor error */ -#define ILL_BADSTK (__SI_FAULT|8) /* internal stack error */ +#define ILL_ILLOPC 1 /* illegal opcode */ +#define ILL_ILLOPN 2 /* illegal operand */ +#define ILL_ILLADR 3 /* illegal addressing mode */ +#define ILL_ILLTRP 4 /* illegal trap */ +#define ILL_PRVOPC 5 /* privileged opcode */ +#define ILL_PRVREG 6 /* privileged register */ +#define ILL_COPROC 7 /* coprocessor error */ +#define ILL_BADSTK 8 /* internal stack error */ #define NSIGILL 8 /* * SIGFPE si_codes */ -#define FPE_INTDIV (__SI_FAULT|1) /* integer divide by zero */ -#define FPE_INTOVF (__SI_FAULT|2) /* integer overflow */ -#define FPE_FLTDIV (__SI_FAULT|3) /* floating point divide by zero */ -#define FPE_FLTOVF (__SI_FAULT|4) /* floating point overflow */ -#define FPE_FLTUND (__SI_FAULT|5) /* floating point underflow */ -#define FPE_FLTRES (__SI_FAULT|6) /* floating point inexact result */ -#define FPE_FLTINV (__SI_FAULT|7) /* floating point invalid operation */ -#define FPE_FLTSUB (__SI_FAULT|8) /* subscript out of range */ +#define FPE_INTDIV 1 /* integer divide by zero */ +#define FPE_INTOVF 2 /* integer overflow */ +#define FPE_FLTDIV 3 /* floating point divide by zero */ +#define FPE_FLTOVF 4 /* floating point overflow */ +#define FPE_FLTUND 5 /* floating point underflow */ +#define FPE_FLTRES 6 /* floating point inexact result */ +#define FPE_FLTINV 7 /* floating point invalid operation */ +#define FPE_FLTSUB 8 /* subscript out of range */ #define NSIGFPE 8 /* * SIGSEGV si_codes */ -#define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ -#define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ -#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ -#define SEGV_PKUERR (__SI_FAULT|4) /* failed protection key checks */ +#define SEGV_MAPERR 1 /* address not mapped to object */ +#define SEGV_ACCERR 2 /* invalid permissions for mapped object */ +#define SEGV_BNDERR 3 /* failed address bound checks */ +#define SEGV_PKUERR 4 /* failed protection key checks */ #define NSIGSEGV 4 /* * SIGBUS si_codes */ -#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */ -#define BUS_ADRERR (__SI_FAULT|2) /* non-existent physical address */ -#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */ +#define BUS_ADRALN 1 /* invalid 
address alignment */ +#define BUS_ADRERR 2 /* non-existent physical address */ +#define BUS_OBJERR 3 /* object specific hardware error */ /* hardware memory error consumed on a machine check: action required */ -#define BUS_MCEERR_AR (__SI_FAULT|4) +#define BUS_MCEERR_AR 4 /* hardware memory error detected in process but not consumed: action optional*/ -#define BUS_MCEERR_AO (__SI_FAULT|5) +#define BUS_MCEERR_AO 5 #define NSIGBUS 5 /* * SIGTRAP si_codes */ -#define TRAP_BRKPT (__SI_FAULT|1) /* process breakpoint */ -#define TRAP_TRACE (__SI_FAULT|2) /* process trace trap */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint/watchpoint */ +#define TRAP_BRKPT 1 /* process breakpoint */ +#define TRAP_TRACE 2 /* process trace trap */ +#define TRAP_BRANCH 3 /* process taken branch trap */ +#define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */ #define NSIGTRAP 4 /* * SIGCHLD si_codes */ -#define CLD_EXITED (__SI_CHLD|1) /* child has exited */ -#define CLD_KILLED (__SI_CHLD|2) /* child was killed */ -#define CLD_DUMPED (__SI_CHLD|3) /* child terminated abnormally */ -#define CLD_TRAPPED (__SI_CHLD|4) /* traced child has trapped */ -#define CLD_STOPPED (__SI_CHLD|5) /* child has stopped */ -#define CLD_CONTINUED (__SI_CHLD|6) /* stopped child has continued */ +#define CLD_EXITED 1 /* child has exited */ +#define CLD_KILLED 2 /* child was killed */ +#define CLD_DUMPED 3 /* child terminated abnormally */ +#define CLD_TRAPPED 4 /* traced child has trapped */ +#define CLD_STOPPED 5 /* child has stopped */ +#define CLD_CONTINUED 6 /* stopped child has continued */ #define NSIGCHLD 6 /* * SIGPOLL (or any other signal without signal specific si_codes) si_codes */ -#define POLL_IN (__SI_POLL|1) /* data input available */ -#define POLL_OUT (__SI_POLL|2) /* output buffers available */ -#define POLL_MSG (__SI_POLL|3) /* input message available */ -#define POLL_ERR (__SI_POLL|4) /* i/o error */ -#define POLL_PRI (__SI_POLL|5) /* high priority input available */ -#define POLL_HUP (__SI_POLL|6) /* device disconnected */ +#define POLL_IN 1 /* data input available */ +#define POLL_OUT 2 /* output buffers available */ +#define POLL_MSG 3 /* input message available */ +#define POLL_ERR 4 /* i/o error */ +#define POLL_PRI 5 /* high priority input available */ +#define POLL_HUP 6 /* device disconnected */ #define NSIGPOLL 6 /* * SIGSYS si_codes */ -#define SYS_SECCOMP (__SI_SYS|1) /* seccomp triggered */ -#define NSIGSYS 1 +#define SYS_SECCOMP 1 /* seccomp triggered */ +#define NSIGSYS 1 /* * sigevent definitions diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..c8f23613df5b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1616,7 +1616,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); @@ -1742,7 +1742,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, 
&infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 60f356d91060..84b1367935e4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - if (copy_siginfo_to_user32(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user32(uinfo, &info)) { ret = -EFAULT; break; } @@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, { siginfo_t __user *uinfo = (siginfo_t __user *) data; - if (copy_siginfo_to_user(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user(uinfo, &info)) { ret = -EFAULT; break; } diff --git a/kernel/signal.c b/kernel/signal.c index caed9133ae52..6bd53c8189f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2682,6 +2682,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, } #endif +enum siginfo_layout siginfo_layout(int sig, int si_code) +{ + enum siginfo_layout layout = SIL_KILL; + if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { + static const struct { + unsigned char limit, layout; + } filter[] = { + [SIGILL] = { NSIGILL, SIL_FAULT }, + [SIGFPE] = { NSIGFPE, SIL_FAULT }, + [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, + [SIGBUS] = { NSIGBUS, SIL_FAULT }, + [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, +#if defined(SIGEMT) && defined(NSIGEMT) + [SIGEMT] = { NSIGEMT, SIL_FAULT }, +#endif + [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, + [SIGPOLL] = { NSIGPOLL, SIL_POLL }, +#ifdef __ARCH_SIGSYS + [SIGSYS] = { NSIGSYS, SIL_SYS }, +#endif + }; + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + layout = filter[sig].layout; + else if (si_code <= NSIGPOLL) + layout = SIL_POLL; + } else { + if (si_code == SI_TIMER) + layout = SIL_TIMER; + else if (si_code == SI_SIGIO) + layout = SIL_POLL; + else if (si_code < 0) + layout = SIL_RT; + /* Tests to support buggy kernel ABIs */ +#ifdef TRAP_FIXME + if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) + layout = SIL_FAULT; +#endif +#ifdef FPE_FIXME + if ((sig == SIGFPE) && (si_code == FPE_FIXME)) + layout = SIL_FAULT; +#endif + } + return layout; +} + #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) @@ -2704,22 +2749,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(from->si_code, &to->si_code); + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); + case SIL_TIMER: + /* Unreached SI_TIMER is negative */ break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user(from->si_addr, &to->si_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); @@ -2744,30 +2787,25 @@ int
copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_pkey, &to->si_pkey); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; #ifdef __ARCH_SIGSYS - case __SI_SYS: + case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; #endif - default: /* this is just in case for now ... */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; } return err; } -- cgit v1.2.3
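As a sketch of the compile-time safety net described above (not code from this patch; info is a siginfo_t assumed in scope): because siginfo_layout() returns an enum, a switch statement without a default arm lets gcc's -Wswitch warning, enabled by -Wall, flag any layout a caller forgets to handle.

	switch (siginfo_layout(info->si_signo, info->si_code)) {
	case SIL_KILL:
	case SIL_TIMER:
	case SIL_POLL:
	case SIL_FAULT:
	case SIL_CHLD:
	case SIL_RT:
		/* ... copy the matching union members ... */
		break;
	/* Omitting SIL_SYS (present when __ARCH_SIGSYS is defined) yields:
	 *   warning: enumeration value 'SIL_SYS' not handled in switch
	 */
	}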
From 6c9a58e84e59a2fc937798b623f72ae4b436547d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 18 Jul 2017 18:04:17 +0300 Subject: ACPI / boot: Correct address space of __acpi_map_table() Sparse complains about wrong address space used in __acpi_map_table() and in __acpi_unmap_table(). arch/x86/kernel/acpi/boot.c:127:29: warning: incorrect type in return expression (different address spaces) arch/x86/kernel/acpi/boot.c:127:29: expected char * arch/x86/kernel/acpi/boot.c:127:29: got void [noderef] <asn:2>* arch/x86/kernel/acpi/boot.c:135:23: warning: incorrect type in argument 1 (different address spaces) arch/x86/kernel/acpi/boot.c:135:23: expected void [noderef] <asn:2>*addr arch/x86/kernel/acpi/boot.c:135:23: got char *map Correct the address space to match the types of the returned and passed parameters. Reviewed-by: Hanjun Guo Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- arch/arm64/kernel/acpi.c | 4 ++-- arch/ia64/kernel/acpi.c | 6 +++--- arch/x86/kernel/acpi/boot.c | 4 ++-- include/linux/acpi.h | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e25c11e727fe..b3162715ed78 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -95,7 +95,7 @@ static int __init dt_scan_depth1_nodes(unsigned long node, * __acpi_map_table() will be called before page_init(), so early_ioremap() * or early_memremap() should be called here for ACPI table mapping. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!size) return NULL; @@ -103,7 +103,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 7508c306aa9e..1d29b2f8726b 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -159,12 +159,12 @@ int acpi_request_vector(u32 int_type) return vector; } -char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { - return __va(phys_addr); + return __va(phys); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 27f0240773b3..666dcc3e14e1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -118,7 +118,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { * This is just a simple wrapper around early_ioremap(), * with sanity checks for phys == 0 and size == 0. */ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!phys || !size) @@ -127,7 +127,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) return early_ioremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c749eef1daa1..7443af1d16e7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -228,8 +228,8 @@ struct acpi_subtable_proc { int count; }; -char * __acpi_map_table (unsigned long phys_addr, unsigned long size); -void __acpi_unmap_table(char *map, unsigned long size); +void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); +void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); void acpi_boot_table_init (void); -- cgit v1.2.3
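A short sketch of what the annotation buys (the caller and the phys variable are hypothetical, not code from this patch): sparse now tracks the mapping as an __iomem pointer, so mixing it with plain pointers or dereferencing it directly is flagged, steering callers toward __iomem-aware accessors.

	struct acpi_table_header hdr;
	void __iomem *map;

	map = __acpi_map_table(phys, sizeof(hdr));
	if (map) {
		memcpy_fromio(&hdr, map, sizeof(hdr));	/* no plain dereference */
		__acpi_unmap_table(map, sizeof(hdr));
	}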
From d764a122cc7af7ab1c40c08745f0fcd33cc2f7db Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 21 Jul 2017 12:49:28 +0200 Subject: net: add new netdevice feature for offload of RX port for UDP tunnels This adds a new netdevice feature, so that the offloading of RX port for UDP tunnels can be disabled by the administrator on some netdevices, using the "rx-udp_tunnel-port-offload" feature in ethtool. This feature is set for all devices that provide ndo_udp_tunnel_add. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ net/core/dev.c | 6 ++++++ net/core/ethtool.c | 1 + 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index ebd273627334..dc8b4896b77b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -75,6 +75,7 @@ enum { NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ + NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ /* * Add your fresh new feature above and remember to update @@ -138,6 +139,7 @@ enum { #define NETIF_F_HW_TC __NETIF_F(HW_TC) #define NETIF_F_HW_ESP __NETIF_F(HW_ESP) #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) +#define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/net/core/dev.c b/net/core/dev.c index 509af6ce8831..9081134adc0d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7530,6 +7530,12 @@ int register_netdevice(struct net_device *dev) */ dev->hw_features |= NETIF_F_SOFT_FEATURES; dev->features |= NETIF_F_SOFT_FEATURES; + + if (dev->netdev_ops->ndo_udp_tunnel_add) { + dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; + dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; + } + dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 78408ab77a10..b987bc475fc8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -105,6 +105,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_TC_BIT] = "hw-tc-offload", [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", + [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", }; static const char -- cgit v1.2.3
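For illustration, a sketch of how a core path can honor the new bit before programming a port into a NIC; it mirrors the check the next patch in this series adds on the removal path (dev and ti are assumed to be set up as in net/ipv4/udp_tunnel.c). The administrator-facing switch is the feature string registered above, e.g. ethtool -K eth0 rx-udp_tunnel-port-offload off (interface name hypothetical).

	if (!dev->netdev_ops->ndo_udp_tunnel_add ||
	    !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
		return;		/* no offload, or disabled by the admin */

	dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);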
From 296d8ee37c50f139d934bdefbab85509b2e4a525 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 21 Jul 2017 12:49:30 +0200 Subject: net: add infrastructure to un-offload UDP tunnel port This adds a new NETDEV_UDP_TUNNEL_DROP_INFO event, similar to NETDEV_UDP_TUNNEL_PUSH_INFO, to signal that offloaded UDP tunnel ports should be removed. This also adds udp_tunnel_drop_rx_port(), which calls ndo_udp_tunnel_del. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/net/udp_tunnel.h | 8 ++++++++ net/ipv4/udp_tunnel.c | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 614642eb7eb7..3a3cdc1b1f31 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2317,6 +2317,7 @@ struct netdev_lag_lower_state_info { #define NETDEV_PRECHANGEUPPER 0x001A #define NETDEV_CHANGELOWERSTATE 0x001B #define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C +#define NETDEV_UDP_TUNNEL_DROP_INFO 0x001D #define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E int register_netdevice_notifier(struct notifier_block *nb); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 02c5be037451..10cce0dd4450 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -115,6 +115,8 @@ struct udp_tunnel_info { /* Notify network devices of offloadable types */ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type); +void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type); void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); @@ -124,6 +126,12 @@ static inline void udp_tunnel_get_rx_info(struct net_device *dev) call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); } +static inline void udp_tunnel_drop_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); +} + /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 0d3f14cdc524..6539ff15e9a3 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -94,6 +94,24 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); +void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, + unsigned short type) +{ + struct sock *sk = sock->sk; + struct udp_tunnel_info ti; + + if (!dev->netdev_ops->ndo_udp_tunnel_del || + !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + + ti.type = type; + ti.sa_family = sk->sk_family; + ti.port = inet_sk(sk)->inet_sport; + + dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); +} +EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); + /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) { -- cgit v1.2.3
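For illustration, a sketch of how a tunnel driver's netdevice notifier can now treat the two events symmetrically (the tunnel_offload_rx_ports() helper is hypothetical):

	static int tunnel_netdevice_event(struct notifier_block *nb,
					  unsigned long event, void *ptr)
	{
		struct net_device *dev = netdev_notifier_info_to_dev(ptr);

		if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
			tunnel_offload_rx_ports(dev, true);	/* re-add ports */
		else if (event == NETDEV_UDP_TUNNEL_DROP_INFO)
			tunnel_offload_rx_ports(dev, false);	/* remove ports */

		return NOTIFY_DONE;
	}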
From afece3ab9a3640f9c1e064f56c8cdd4d783f6144 Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Fri, 14 Jul 2017 13:10:15 -0400 Subject: PM / Domains: Add time accounting to various genpd states This patch adds support for calculating the time a generic power domain spends in the on state and in each of its idle states. Signed-off-by: Thara Gopinath Acked-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 32 ++++++++++++++++++++++++++++++++ include/linux/pm_domain.h | 3 +++ 2 files changed, 35 insertions(+) (limited to 'include') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 60303aa28587..431488914982 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -209,6 +209,34 @@ static void genpd_sd_counter_inc(struct generic_pm_domain *genpd) smp_mb__after_atomic(); } +#ifdef CONFIG_DEBUG_FS +static void genpd_update_accounting(struct generic_pm_domain *genpd) +{ + ktime_t delta, now; + + now = ktime_get(); + delta = ktime_sub(now, genpd->accounting_time); + + /* + * If genpd->status is active, it means we are just + * out of off and so update the idle time and vice + * versa. + */ + if (genpd->status == GPD_STATE_ACTIVE) { + int state_idx = genpd->state_idx; + + genpd->states[state_idx].idle_time = + ktime_add(genpd->states[state_idx].idle_time, delta); + } else { + genpd->on_time = ktime_add(genpd->on_time, delta); + } + + genpd->accounting_time = now; +} +#else +static inline void genpd_update_accounting(struct generic_pm_domain *genpd) {} +#endif + static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) { unsigned int state_idx = genpd->state_idx; @@ -361,6 +389,7 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, } genpd->status = GPD_STATE_POWER_OFF; + genpd_update_accounting(genpd); list_for_each_entry(link, &genpd->slave_links, slave_node) { genpd_sd_counter_dec(link->master); @@ -413,6 +442,8 @@ static int genpd_power_on(struct generic_pm_domain *genpd, unsigned int depth) goto err; genpd->status = GPD_STATE_ACTIVE; + genpd_update_accounting(genpd); + return 0; err: @@ -1540,6 +1571,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, genpd->max_off_time_changed = true; genpd->provider = NULL; genpd->has_provider = false; + genpd->accounting_time = ktime_get(); genpd->domain.ops.runtime_suspend = genpd_runtime_suspend; genpd->domain.ops.runtime_resume = genpd_runtime_resume; genpd->domain.ops.prepare = pm_genpd_prepare; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 41004d97cefa..84f423d5633e 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -43,6 +43,7 @@ struct genpd_power_state { s64 power_on_latency_ns; s64 residency_ns; struct fwnode_handle *fwnode; + ktime_t idle_time; }; struct genpd_lock_ops; @@ -78,6 +79,8 @@ struct generic_pm_domain { unsigned int state_count; /* number of states */ unsigned int state_idx; /* state that genpd will go to when off */ void *free; /* Free the state that was allocated for default */ + ktime_t on_time; + ktime_t accounting_time; const struct genpd_lock_ops *lock_ops; union { struct mutex mlock; -- cgit v1.2.3 From 786f41fb6b008ea4341355b99083a38853a5311d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 02:09:22 +0200 Subject: PM / core: Split dpm_suspend_noirq() and dpm_resume_noirq() Move the disabling and enabling of device interrupts, as well as the cpuidle_pause() and cpuidle_resume() invocations made during the "noirq" stages of system suspend, into separate functions to allow the core suspend-to-idle code to be optimized (later). The only functional difference this makes is that debug facilities and diagnostic tools will not include the above operations in the "noirq" device suspend/resume duration measurements. Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/main.c | 67 ++++++++++++++++++++++++++++++++--------------- include/linux/pm.h | 4 +++ 2 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 4f5fc212d684..9b194e9f49a9 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -598,14 +598,7 @@ static void async_resume_noirq(void *data, async_cookie_t cookie) put_device(dev); } -/** - * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. - * @state: PM transition of the system being carried out. - * - * Call the "noirq" resume handlers for all devices in dpm_noirq_list and - * enable device drivers to receive interrupts. - */ -void dpm_resume_noirq(pm_message_t state) +void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); @@ -651,10 +644,27 @@ void dpm_resume_noirq(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, "noirq"); + trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); +} + +void dpm_noirq_end(void) +{ resume_device_irqs(); device_wakeup_disarm_wake_irqs(); cpuidle_resume(); - trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); +} + +/** + * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. + * @state: PM transition of the system being carried out. + * + * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and + * allow device drivers' interrupt handlers to be called. + */ +void dpm_resume_noirq(pm_message_t state) +{ + dpm_noirq_resume_devices(state); + dpm_noirq_end(); } /** @@ -1154,22 +1164,19 @@ static int device_suspend_noirq(struct device *dev) return __device_suspend_noirq(dev, pm_transition, false); } -/** - * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. - * @state: PM transition of the system being carried out. - * - * Prevent device drivers from receiving interrupts and call the "noirq" suspend - * handlers for all non-sysdev devices. - */ -int dpm_suspend_noirq(pm_message_t state) +void dpm_noirq_begin(void) +{ + cpuidle_pause(); + device_wakeup_arm_wake_irqs(); + suspend_device_irqs(); +} + +int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); - cpuidle_pause(); - device_wakeup_arm_wake_irqs(); - suspend_device_irqs(); mutex_lock(&dpm_list_mtx); pm_transition = state; async_error = 0; @@ -1204,7 +1211,6 @@ int dpm_suspend_noirq(pm_message_t state) if (error) { suspend_stats.failed_suspend_noirq++; dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); - dpm_resume_noirq(resume_event(state)); } else { dpm_show_time(starttime, state, "noirq"); } @@ -1212,6 +1218,25 @@ int dpm_suspend_noirq(pm_message_t state) return error; } +/** + * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. + * @state: PM transition of the system being carried out. + * + * Prevent device drivers' interrupt handlers from being called and invoke + * "noirq" suspend callbacks for all non-sysdev devices. + */ +int dpm_suspend_noirq(pm_message_t state) +{ + int ret; + + dpm_noirq_begin(); + ret = dpm_noirq_suspend_devices(state); + if (ret) + dpm_resume_noirq(resume_event(state)); + + return ret; +} + /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. 
diff --git a/include/linux/pm.h b/include/linux/pm.h index b8b4df09fd8f..47ded8aa8a5d 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -689,6 +689,8 @@ struct dev_pm_domain { extern void device_pm_lock(void); extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); +extern void dpm_noirq_resume_devices(pm_message_t state); +extern void dpm_noirq_end(void); extern void dpm_resume_noirq(pm_message_t state); extern void dpm_resume_early(pm_message_t state); extern void dpm_resume(pm_message_t state); @@ -697,6 +699,8 @@ extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); +extern void dpm_noirq_begin(void); +extern int dpm_noirq_suspend_devices(pm_message_t state); extern int dpm_suspend_noirq(pm_message_t state); extern int dpm_suspend_late(pm_message_t state); extern int dpm_suspend(pm_message_t state); -- cgit v1.2.3 From cb1844c47279fb59129f8a021a0b09bcf2011ad7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:26 +0800 Subject: sctp: remove the typedef sctp_initack_chunk_t This patch is to remove the typedef sctp_initack_chunk_t, and replace with struct sctp_initack_chunk in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 5 ++++- net/sctp/sm_statefuns.c | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 913474dfc96c..05c2099be537 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -336,7 +336,10 @@ struct sctp_hmac_algo_param { * The INIT ACK chunk is used to acknowledge the initiation of an SCTP * association. */ -typedef struct sctp_init_chunk sctp_initack_chunk_t; +struct sctp_initack_chunk { + struct sctp_chunkhdr chunk_hdr; + struct sctp_inithdr init_hdr; +}; /* Section 3.3.3.1 State Cookie (7) */ typedef struct sctp_cookie_param { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index ae4c48c4f657..6568fc395901 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -518,7 +518,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, return sctp_sf_violation_chunk(net, ep, asoc, type, arg, commands); /* Make sure that the INIT-ACK chunk has a valid length */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_initack_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_initack_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); /* Grab the INIT header. */ @@ -4453,11 +4453,10 @@ static sctp_disposition_t sctp_sf_abort_violation( /* Treat INIT-ACK as a special case during COOKIE-WAIT. 
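One detail worth noting in the hunk below: sizeof(variable), here sizeof(*initack), is preferred over sizeof(type) because the expression stays correct even if the variable's type is later changed. Side by side:

/* Tied to the type name; silently wrong after a retype: */
sctp_chunk_length_valid(chunk, sizeof(struct sctp_initack_chunk));
/* Follows the variable's type automatically: */
sctp_chunk_length_valid(chunk, sizeof(*initack));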
*/ if (chunk->chunk_hdr->type == SCTP_CID_INIT_ACK && !asoc->peer.i.init_tag) { - sctp_initack_chunk_t *initack; + struct sctp_initack_chunk *initack; - initack = (sctp_initack_chunk_t *)chunk->chunk_hdr; - if (!sctp_chunk_length_valid(chunk, - sizeof(sctp_initack_chunk_t))) + initack = (struct sctp_initack_chunk *)chunk->chunk_hdr; + if (!sctp_chunk_length_valid(chunk, sizeof(*initack))) abort->chunk_hdr->flags |= SCTP_CHUNK_FLAG_T; else { unsigned int inittag; @@ -6106,9 +6105,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, switch (chunk->chunk_hdr->type) { case SCTP_CID_INIT_ACK: { - sctp_initack_chunk_t *initack; + struct sctp_initack_chunk *initack; - initack = (sctp_initack_chunk_t *)chunk->chunk_hdr; + initack = (struct sctp_initack_chunk *)chunk->chunk_hdr; vtag = ntohl(initack->init_hdr.init_tag); break; } -- cgit v1.2.3 From f48ef4c7f7979e8e658b7e038a82f096ab292d70 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:27 +0800 Subject: sctp: remove the typedef sctp_cookie_param_t This patch is to remove the typedef sctp_cookie_param_t, and replace with struct sctp_cookie_param in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_make_chunk.c | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 05c2099be537..9e77abda2111 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -342,10 +342,10 @@ struct sctp_initack_chunk { }; /* Section 3.3.3.1 State Cookie (7) */ -typedef struct sctp_cookie_param { +struct sctp_cookie_param { struct sctp_paramhdr p; __u8 body[0]; -} sctp_cookie_param_t; +}; /* Section 3.3.3.1 Unrecognized Parameters (8) */ typedef struct sctp_unrecognized_param { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0b36e96cb0df..163004e7047c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -69,7 +69,8 @@ static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); -static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, +static struct sctp_cookie_param *sctp_pack_cookie( + const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, @@ -387,7 +388,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, union sctp_params addrs; struct sctp_sock *sp; int addrs_len; - sctp_cookie_param_t *cookie; + struct sctp_cookie_param *cookie; int cookie_len; size_t chunksize; struct sctp_adaptation_ind_param aiparam; @@ -1595,14 +1596,15 @@ nodata: /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. 
*/ -static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const struct sctp_chunk *init_chunk, - int *cookie_len, - const __u8 *raw_addrs, int addrs_len) +static struct sctp_cookie_param *sctp_pack_cookie( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *init_chunk, + int *cookie_len, + const __u8 *raw_addrs, int addrs_len) { - sctp_cookie_param_t *retval; struct sctp_signed_cookie *cookie; + struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including -- cgit v1.2.3 From 62e6b7e4ee244f8043c169c049d3b2c2c798cd60 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:28 +0800 Subject: sctp: remove the typedef sctp_unrecognized_param_t This patch is to remove the typedef sctp_unrecognized_param_t, and replace with struct sctp_unrecognized_param in the places where it's using this typedef. It is also to fix some indents in sctp_sf_do_unexpected_init() and sctp_sf_do_5_1B_init(). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_statefuns.c | 18 +++++++----------- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 9e77abda2111..c323b3e3ecdb 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -348,10 +348,10 @@ struct sctp_cookie_param { }; /* Section 3.3.3.1 Unrecognized Parameters (8) */ -typedef struct sctp_unrecognized_param { +struct sctp_unrecognized_param { struct sctp_paramhdr param_hdr; struct sctp_paramhdr unrecognized; -} sctp_unrecognized_param_t; +}; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 6568fc395901..7f852392f56a 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -306,12 +306,10 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - struct sctp_chunk *chunk = arg; - struct sctp_chunk *repl; + struct sctp_chunk *chunk = arg, *repl, *err_chunk; + struct sctp_unrecognized_param *unk_param; struct sctp_association *new_asoc; - struct sctp_chunk *err_chunk; struct sctp_packet *packet; - sctp_unrecognized_param_t *unk_param; int len; /* 6.10 Bundling @@ -435,7 +433,7 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, * construct the parameters in INIT ACK by copying the * ERROR causes over. */ - unk_param = (sctp_unrecognized_param_t *) + unk_param = (struct sctp_unrecognized_param *) ((__u8 *)(err_chunk->chunk_hdr) + sizeof(struct sctp_chunkhdr)); /* Replace the cause code with the "Unrecognized parameter" @@ -1419,13 +1417,11 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( const sctp_subtype_t type, void *arg, sctp_cmd_seq_t *commands) { - sctp_disposition_t retval; - struct sctp_chunk *chunk = arg; - struct sctp_chunk *repl; + struct sctp_chunk *chunk = arg, *repl, *err_chunk; + struct sctp_unrecognized_param *unk_param; struct sctp_association *new_asoc; - struct sctp_chunk *err_chunk; struct sctp_packet *packet; - sctp_unrecognized_param_t *unk_param; + sctp_disposition_t retval; int len; /* 6.10 Bundling @@ -1555,7 +1551,7 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( * construct the parameters in INIT ACK by copying the * ERROR causes over. 
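The unk_param computation in the hunk that follows steps over the fixed chunk header of the bundled ERROR chunk and reinterprets the error causes behind it as unrecognized-parameter entries. The cast in isolation, assuming err_chunk has already been length-checked:

struct sctp_unrecognized_param *unk_param;

/* The variable-length causes start right after the fixed chunk header. */
unk_param = (struct sctp_unrecognized_param *)
	((__u8 *)err_chunk->chunk_hdr + sizeof(struct sctp_chunkhdr));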
*/ - unk_param = (sctp_unrecognized_param_t *) + unk_param = (struct sctp_unrecognized_param *) ((__u8 *)(err_chunk->chunk_hdr) + sizeof(struct sctp_chunkhdr)); /* Replace the cause code with the "Unrecognized parameter" -- cgit v1.2.3 From fe9a0fe7210d803adb3d5817da029fe39b8a4133 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:29 +0800 Subject: sctp: remove the typedef sctp_gap_ack_block_t This patch is to remove the typedef sctp_gap_ack_block_t, and replace with struct sctp_gap_ack_block in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index c323b3e3ecdb..b84b8e8340da 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -363,16 +363,16 @@ struct sctp_unrecognized_param { * subsequences of DATA chunks as represented by their TSNs. */ -typedef struct sctp_gap_ack_block { +struct sctp_gap_ack_block { __be16 start; __be16 end; -} sctp_gap_ack_block_t; +}; typedef __be32 sctp_dup_tsn_t; typedef union { - sctp_gap_ack_block_t gab; - sctp_dup_tsn_t dup; + struct sctp_gap_ack_block gab; + sctp_dup_tsn_t dup; } sctp_sack_variable_t; typedef struct sctp_sackhdr { -- cgit v1.2.3 From 9b41515636563ae76e730dbcb97fd303b94ed7d9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:30 +0800 Subject: sctp: remove the typedef sctp_dup_tsn_t This patch is to remove the typedef sctp_dup_tsn_t, and replace with __be32 in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b84b8e8340da..8faf74eff63d 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -368,11 +368,9 @@ struct sctp_gap_ack_block { __be16 end; }; -typedef __be32 sctp_dup_tsn_t; - typedef union { struct sctp_gap_ack_block gab; - sctp_dup_tsn_t dup; + __be32 dup; } sctp_sack_variable_t; typedef struct sctp_sackhdr { -- cgit v1.2.3 From afd93b7be6e24731d82d9fd84b8a5ea73a68214b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:31 +0800 Subject: sctp: remove the typedef sctp_sack_variable_t This patch is to remove the typedef sctp_sack_variable_t, and replace with union sctp_sack_variable in the places where it's using this typedef. It is also to fix some indents in sctp_acked(). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 6 +++--- net/sctp/outqueue.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 8faf74eff63d..8df6ac53f05b 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -368,17 +368,17 @@ struct sctp_gap_ack_block { __be16 end; }; -typedef union { +union sctp_sack_variable { struct sctp_gap_ack_block gab; __be32 dup; -} sctp_sack_variable_t; +}; typedef struct sctp_sackhdr { __be32 cum_tsn_ack; __be32 a_rwnd; __be16 num_gap_ack_blocks; __be16 num_dup_tsns; - sctp_sack_variable_t variable[0]; + union sctp_sack_variable variable[0]; } sctp_sackhdr_t; typedef struct sctp_sack_chunk { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index e8762702a313..d2a8adfd4a6f 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1197,7 +1197,7 @@ sctp_flush_out: static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) { - sctp_sack_variable_t *frags; + union sctp_sack_variable *frags; __u16 unack_data; int i; @@ -1224,7 +1224,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk) struct sctp_transport *transport; struct sctp_chunk *tchunk = NULL; struct list_head *lchunk, *transport_list, *temp; - sctp_sack_variable_t *frags = sack->variable; + union sctp_sack_variable *frags = sack->variable; __u32 sack_ctsn, ctsn, tsn; __u32 highest_tsn, highest_new_tsn; __u32 sack_a_rwnd; @@ -1736,10 +1736,10 @@ static void sctp_mark_missing(struct sctp_outq *q, /* Is the given TSN acked by this packet? */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { - int i; - sctp_sack_variable_t *frags; - __u16 tsn_offset, blocks; __u32 ctsn = ntohl(sack->cum_tsn_ack); + union sctp_sack_variable *frags; + __u16 tsn_offset, blocks; + int i; if (TSN_lte(tsn, ctsn)) goto pass; -- cgit v1.2.3 From 787310859d8d1a72545db2343fb3ac8f765b0f35 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:32 +0800 Subject: sctp: remove the typedef sctp_sackhdr_t This patch is to remove the typedef sctp_sackhdr_t, and replace with struct sctp_sackhdr in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 6 +++--- include/net/sctp/command.h | 4 ++-- net/sctp/sm_statefuns.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 8df6ac53f05b..a2e43129d11a 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -373,17 +373,17 @@ union sctp_sack_variable { __be32 dup; }; -typedef struct sctp_sackhdr { +struct sctp_sackhdr { __be32 cum_tsn_ack; __be32 a_rwnd; __be16 num_gap_ack_blocks; __be16 num_dup_tsns; union sctp_sack_variable variable[0]; -} sctp_sackhdr_t; +}; typedef struct sctp_sack_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_sackhdr_t sack_hdr; + struct sctp_sackhdr sack_hdr; } sctp_sack_chunk_t; diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index d4679e7a5ed5..1d5f6ff3f440 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -135,7 +135,7 @@ typedef union { struct sctp_init_chunk *init; struct sctp_ulpevent *ulpevent; struct sctp_packet *packet; - sctp_sackhdr_t *sackh; + struct sctp_sackhdr *sackh; struct sctp_datamsg *msg; } sctp_arg_t; @@ -176,7 +176,7 @@ SCTP_ARG_CONSTRUCTOR(BA, struct sctp_bind_addr *, bp) SCTP_ARG_CONSTRUCTOR(PEER_INIT, struct sctp_init_chunk *, init) SCTP_ARG_CONSTRUCTOR(ULPEVENT, struct sctp_ulpevent *, ulpevent) SCTP_ARG_CONSTRUCTOR(PACKET, struct sctp_packet *, packet) -SCTP_ARG_CONSTRUCTOR(SACKH, sctp_sackhdr_t *, sackh) +SCTP_ARG_CONSTRUCTOR(SACKH, struct sctp_sackhdr *, sackh) SCTP_ARG_CONSTRUCTOR(DATAMSG, struct sctp_datamsg *, msg) static inline sctp_arg_t SCTP_FORCE(void) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7f852392f56a..c09dfe6ebac2 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3187,7 +3187,7 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - sctp_sackhdr_t *sackh; + struct sctp_sackhdr *sackh; __u32 ctsn; if (!sctp_vtag_verify(chunk, asoc)) -- cgit v1.2.3 From d4d6c61489e7e4a8944360312e572988889558a8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:33 +0800 Subject: sctp: remove the typedef sctp_sack_chunk_t This patch is to remove the typedef sctp_sack_chunk_t, and replace with struct sctp_sack_chunk in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/chunk.c | 2 +- net/sctp/sm_statefuns.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a2e43129d11a..48f6560dd880 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -381,10 +381,10 @@ struct sctp_sackhdr { union sctp_sack_variable variable[0]; }; -typedef struct sctp_sack_chunk { +struct sctp_sack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_sackhdr sack_hdr; -} sctp_sack_chunk_t; +}; /* RFC 2960. Section 3.3.5 Heartbeat Request (HEARTBEAT) (4): diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 1323d41e68b8..681b181e7ae3 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -221,7 +221,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) - first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); + first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. 
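Taken together, the SACK typedef removals above leave a plainly typed wire layout: the variable-length tail of struct sctp_sackhdr is a run of gap-ack blocks followed by duplicate TSNs. A sketch of walking it (hypothetical helper; assumes the num_* fields were already validated against the chunk length):

static void example_walk_sack(struct sctp_sackhdr *sack)
{
	union sctp_sack_variable *frags = sack->variable;
	int gabs = ntohs(sack->num_gap_ack_blocks);
	int dups = ntohs(sack->num_dup_tsns);
	int i;

	for (i = 0; i < gabs; i++)	/* gap-ack blocks come first... */
		pr_debug("gap %u-%u\n", ntohs(frags[i].gab.start),
			 ntohs(frags[i].gab.end));
	for (i = 0; i < dups; i++)	/* ...then the duplicate TSNs */
		pr_debug("dup tsn %u\n", ntohl(frags[gabs + i].dup));
}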
*/ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c09dfe6ebac2..08ebe8cd96c7 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3194,7 +3194,7 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the SACK chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_sack_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_sack_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4515,7 +4515,7 @@ nomem: * Handle a protocol violation when the chunk length is invalid. * "Invalid" length is identified as smaller than the minimal length a * given chunk can be. For example, a SACK chunk has invalid length - * if its length is set to be smaller than the size of sctp_sack_chunk_t. + * if its length is set to be smaller than the size of struct sctp_sack_chunk. * * We inform the other end by sending an ABORT with a Protocol Violation * error code. -- cgit v1.2.3 From 4d2dcdf4e04c938b266f06f271000e4b0f3a288f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:34 +0800 Subject: sctp: remove the typedef sctp_heartbeathdr_t This patch is to remove the typedef sctp_heartbeathdr_t, and replace with struct sctp_heartbeathdr in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_statefuns.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 48f6560dd880..6e26b86770f1 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -394,13 +394,13 @@ struct sctp_sack_chunk { * the present association. */ -typedef struct sctp_heartbeathdr { +struct sctp_heartbeathdr { struct sctp_paramhdr info; -} sctp_heartbeathdr_t; +}; typedef struct sctp_heartbeat_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_heartbeathdr_t hb_hdr; + struct sctp_heartbeathdr hb_hdr; } sctp_heartbeat_chunk_t; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 08ebe8cd96c7..32ac90b22654 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1096,7 +1096,7 @@ sctp_disposition_t sctp_sf_beat_8_3(struct net *net, * respond with a HEARTBEAT ACK that contains the Heartbeat * Information field copied from the received HEARTBEAT chunk. */ - chunk->subh.hb_hdr = (sctp_heartbeathdr_t *)chunk->skb->data; + chunk->subh.hb_hdr = (struct sctp_heartbeathdr *)chunk->skb->data; param_hdr = (struct sctp_paramhdr *)chunk->subh.hb_hdr; paylen = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); -- cgit v1.2.3 From 38c00f7482281801d6f7fe410c7a4b61ae25218e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:35 +0800 Subject: sctp: remove the typedef sctp_heartbeat_chunk_t This patch is to remove the typedef sctp_heartbeat_chunk_t, and replace with struct sctp_heartbeat_chunk in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_statefuns.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 6e26b86770f1..bfda7c66960c 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -398,10 +398,10 @@ struct sctp_heartbeathdr { struct sctp_paramhdr info; }; -typedef struct sctp_heartbeat_chunk { +struct sctp_heartbeat_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_heartbeathdr hb_hdr; -} sctp_heartbeat_chunk_t; +}; /* For the abort and shutdown ACK we must carry the init tag in the diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 32ac90b22654..7bbee085b476 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1088,7 +1088,8 @@ sctp_disposition_t sctp_sf_beat_8_3(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the HEARTBEAT chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_heartbeat_chunk_t))) + if (!sctp_chunk_length_valid(chunk, + sizeof(struct sctp_heartbeat_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); -- cgit v1.2.3 From 441ae65ae000dd325a609306bd0cd850df50cfc4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 23 Jul 2017 09:34:36 +0800 Subject: sctp: remove the typedef sctp_abort_chunk_t This patch is to remove the typedef sctp_abort_chunk_t, and replace with struct sctp_abort_chunk in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_statefuns.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index bfda7c66960c..ba007163acfd 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -408,9 +408,9 @@ struct sctp_heartbeat_chunk { * common header. Just the common header is all that is needed with a * chunk descriptor. */ -typedef struct sctp_abort_chunk { +struct sctp_abort_chunk { struct sctp_chunkhdr uh; -} sctp_abort_chunk_t; +}; /* For the graceful shutdown we must carry the tag (in common header) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7bbee085b476..dc0c2c4188d8 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2164,7 +2164,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* ADD-IP: Special case for ABORT chunks @@ -2206,7 +2206,7 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* ADD-IP: Special case for ABORT chunks @@ -2470,7 +2470,7 @@ sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, * as we do not know its true length. So, to be safe, discard the * packet. 
*/ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* ADD-IP: Special case for ABORT chunks @@ -2546,7 +2546,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, * as we do not know its true length. So, to be safe, discard the * packet. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_abort_chunk))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* See if we have an error cause code in the chunk. */ -- cgit v1.2.3 From 4e3274705324c695f56b3e1c194cd99b76eeea0f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Jun 2017 16:02:24 -0700 Subject: init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Back in the dim distant past, the task_struct structure's RCU-related fields optionally included those needed for CONFIG_RCU_BOOST, even in CONFIG_PREEMPT_RCU builds. The INIT_TASK_RCU_TREE_PREEMPT() macro was used to provide initializers for those optional CONFIG_RCU_BOOST fields. However, the CONFIG_RCU_BOOST fields are now included unconditionally in CONFIG_PREEMPT_RCU builds, so there is no longer any need for the INIT_TASK_RCU_TREE_PREEMPT() macro. This commit therefore removes it in favor of initializing the ->rcu_blocked_node field directly in the INIT_TASK_RCU_PREEMPT() macro. Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a2f6707e9fc0..0e849715e5be 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -125,18 +125,12 @@ extern struct group_info init_groups; #define INIT_IDS #endif -#ifdef CONFIG_PREEMPT_RCU -#define INIT_TASK_RCU_TREE_PREEMPT() \ - .rcu_blocked_node = NULL, -#else -#define INIT_TASK_RCU_TREE_PREEMPT(tsk) -#endif #ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special.s = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() + .rcu_blocked_node = NULL, #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif -- cgit v1.2.3 From 825c5bd2fd47d30148db15fc121216c483682b01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 May 2017 16:16:40 -0700 Subject: srcu: Move rcu_scheduler_starting() from Tiny RCU to Tiny SRCU Other than lockdep support, Tiny RCU has no need for the scheduler status. However, Tiny SRCU will need this to control boot-time behavior independent of lockdep. Therefore, this commit moves rcu_scheduler_starting() from kernel/rcu/tiny_plugin.h to kernel/rcu/srcutiny.c. This in turn allows the complete removal of kernel/rcu/tiny_plugin.h. Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate.h | 1 + include/linux/rcutiny.h | 8 +++----- kernel/rcu/srcutiny.c | 8 ++++++++ kernel/rcu/tiny.c | 2 -- kernel/rcu/tiny_plugin.h | 47 ----------------------------------------------- 5 files changed, 12 insertions(+), 54 deletions(-) delete mode 100644 kernel/rcu/tiny_plugin.h (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..55c29e1650b9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -105,6 +105,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); +extern int rcu_scheduler_active __read_mostly; void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5becbbccb998..b3dbf9502fd0 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -116,13 +116,11 @@ static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void exit_rcu(void) { } - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -extern int rcu_scheduler_active __read_mostly; +#ifdef CONFIG_SRCU void rcu_scheduler_starting(void); -#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ +#else /* #ifndef CONFIG_SRCU */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ +#endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_is_watching(void) { return true; } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@ #include "rcu_segcblist.h" #include "rcu.h" +int rcu_scheduler_active __read_mostly; + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp) destroy_rcu_head_on_stack(&rs.head); } EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics. */ +void __init rcu_scheduler_starting(void) +{ + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; -#include "tiny_plugin.h" - void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. McKenney - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) - ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ -- cgit v1.2.3 From 0d8a1e831e21d955af68f4ae5658b58b7ec25557 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Jun 2017 17:06:38 -0700 Subject: srcu: Make process_srcu() be static The function process_srcu() is not invoked outside of srcutree.c, so this commit makes it static and drops the EXPORT_SYMBOL_GPL(). Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 2 -- kernel/rcu/srcutree.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42973f787e7e..a026a9493bde 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -104,8 +104,6 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -void process_srcu(struct work_struct *work); - #define __SRCU_STRUCT_INIT(name) \ { \ .sda = &name##_srcu_data, \ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..94bd6ed43ea3 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work); /* * Initialize SRCU combining tree. Note that statically allocated @@ -1194,7 +1195,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* * This is the work-queue function that handles SRCU grace periods. */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -1203,7 +1204,6 @@ void process_srcu(struct work_struct *work) srcu_advance_state(sp); srcu_reschedule(sp, srcu_get_delay(sp)); } -EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, -- cgit v1.2.3 From 115a1a5285664f1931c30457081b4ae1e648f1f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 May 2017 13:31:03 -0700 Subject: rcutorture: Move SRCU status printing to SRCU implementations This commit gets rid of some ugly #ifdefs in rcutorture.c by moving the SRCU status printing to the SRCU implementations. Signed-off-by: Paul E. 
McKenney --- include/linux/srcutiny.h | 13 +++++++++++++ include/linux/srcutree.h | 1 + kernel/rcu/rcutorture.c | 39 +-------------------------------------- kernel/rcu/srcutree.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index cfbfc540cafc..261471f407a5 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -87,4 +87,17 @@ static inline void srcu_barrier(struct srcu_struct *sp) synchronize_srcu(sp); } +/* Defined here to avoid size increase for non-torture kernels. */ +static inline void srcu_torture_stats_print(struct srcu_struct *sp, + char *tt, char *tf) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", + tt, tf, idx, + READ_ONCE(sp->srcu_lock_nesting[!idx]), + READ_ONCE(sp->srcu_lock_nesting[idx])); +} + #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42973f787e7e..7886356bdc84 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -141,5 +141,6 @@ void process_srcu(struct work_struct *work); void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..aedc8f2ad955 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -561,44 +561,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int __maybe_unused cpu; - int idx; - -#ifdef CONFIG_TREE_SRCU - idx = srcu_ctlp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *counts; - - counts = per_cpu_ptr(srcu_ctlp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); - } - pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) - idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", - torture_type, TORTURE_FLAG, idx, - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif + srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG); } static void srcu_torture_synchronize_expedited(void) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..8f6fd11c338a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1217,6 +1217,40 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ + int cpu; + int idx; + + idx = sp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *counts; + + counts = per_cpu_ptr(sp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. 
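With both SRCU flavors now providing the helper, a caller no longer needs build-specific #ifdefs; rcutorture's stats hook reduces to a single call. A sketch with hypothetical tag strings:

/* Prints Tree or Tiny per-CPU counts, whichever this kernel was built with. */
srcu_torture_stats_print(srcu_ctlp, "srcu", "-torture:");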
+ */ + smp_rmb(); + + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + } + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); + static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); -- cgit v1.2.3 From b3c983142d4584c9d506b1ed31b65f4292b4aea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 6 Jun 2017 16:39:00 -0700 Subject: rcutorture: Place event-traced strings into trace buffer Strings used in event tracing need to be specially handled, for example, being copied to the trace buffer instead of being pointed to by the trace buffer. Although the TPS() macro can be used to "launder" pointed-to strings, this might not be all that effective within a loadable module. This commit therefore copies rcutorture's strings to the trace buffer. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- include/trace/events/rcu.h | 7 +++++-- kernel/rcu/rcutorture.c | 2 +- tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 91dc089d65b7..e91ae1f2290d 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -703,6 +703,7 @@ TRACE_EVENT(rcu_batch_end, * at the beginning and end of the read, respectively. Note that the * callback address can be NULL. */ +#define RCUTORTURENAME_LEN 8 TRACE_EVENT(rcu_torture_read, TP_PROTO(const char *rcutorturename, struct rcu_head *rhp, @@ -711,7 +712,7 @@ TRACE_EVENT(rcu_torture_read, TP_ARGS(rcutorturename, rhp, secs, c_old, c), TP_STRUCT__entry( - __field(const char *, rcutorturename) + __field(char, rcutorturename[RCUTORTURENAME_LEN]) __field(struct rcu_head *, rhp) __field(unsigned long, secs) __field(unsigned long, c_old) @@ -719,7 +720,9 @@ TRACE_EVENT(rcu_torture_read, ), TP_fast_assign( - __entry->rcutorturename = rcutorturename; + strncpy(__entry->rcutorturename, rcutorturename, + RCUTORTURENAME_LEN); + __entry->rcutorturename[RCUTORTURENAME_LEN - 1] = 0; __entry->rhp = rhp; __entry->secs = secs; __entry->c_old = c_old; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8d59c82bec0b..b48d2107f176 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -496,7 +496,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, - .name = "rcu_busted" + .name = "busted" }; /* diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot index 6804f9dcfc1b..be7728db42fd 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_busted +rcutorture.torture_type=busted -- cgit v1.2.3 From e42e24c3cc072088756d84ef07b492ac2a3ae2e5 Mon Sep 17 00:00:00 2001 From: Matvejchikov Ilya Date: Mon, 24 Jul 2017 16:02:12 +0400 Subject: tcp: remove redundant argument from tcp_rcv_established() The last (4th) argument of tcp_rcv_established() is redundant as it always equals skb->len and the skb itself is always passed as the 2nd argument. There is no reason to have it. Signed-off-by: Ilya V. Matveychikov Signed-off-by: David S.
Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 3 ++- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_probe.c | 5 +++-- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4f056ea79df2..12d68335acd4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -362,7 +362,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len); + const struct tcphdr *th); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2920e0cb09f8..adc3f3e9468c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5358,8 +5358,9 @@ discard: * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; struct tcp_sock *tp = tcp_sk(sk); tcp_mstamp_refresh(tp); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a20e7f03d5f7..3a19ea28339f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1458,7 +1458,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); return 0; } diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index f6c50af24a64..697f4c67b2e3 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -105,8 +105,9 @@ static inline int tcp_probe_avail(void) * Note: arguments must match tcp_rcv_established()! */ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th, unsigned int len) + const struct tcphdr *th) { + unsigned int len = skb->len; const struct tcp_sock *tp = tcp_sk(sk); const struct inet_sock *inet = inet_sk(sk); @@ -145,7 +146,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, BUG(); } - p->length = skb->len; + p->length = len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; p->snd_cwnd = tp->snd_cwnd; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690d62d6..90a32576c3d0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1296,7 +1296,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); if (opt_skb) goto ipv6_pktoptions; return 0; -- cgit v1.2.3 From aa5d1b81500e6059190f18fe25a7617682321910 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 24 Jul 2017 11:35:48 -0700 Subject: x86/asm: Add ASM_UNREACHABLE This creates an unreachable annotation in asm for CONFIG_STACK_VALIDATION=y. While here, adjust earlier uses of \t\n into \n\t. Suggested-by: Josh Poimboeuf Signed-off-by: Kees Cook Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: David S. Miller Cc: Davidlohr Bueso Cc: Elena Reshetova Cc: Eric Biggers Cc: Eric W. Biederman Cc: Greg KH Cc: Hans Liljestrand Cc: James Bottomley Cc: Jann Horn Cc: Linus Torvalds Cc: Manfred Spraul Cc: Peter Zijlstra Cc: Rik van Riel Cc: Serge E. 
Hallyn Cc: Thomas Gleixner Cc: arozansk@redhat.com Cc: axboe@kernel.dk Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1500921349-10803-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 13 +++++++++---- include/linux/compiler.h | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index cd4bbe8242bd..179375b2d862 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -203,11 +203,16 @@ #ifdef CONFIG_STACK_VALIDATION #define annotate_unreachable() ({ \ - asm("%c0:\t\n" \ - ".pushsection .discard.unreachable\t\n" \ - ".long %c0b - .\t\n" \ - ".popsection\t\n" : : "i" (__LINE__)); \ + asm("%c0:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__LINE__)); \ }) +#define ASM_UNREACHABLE \ + "999:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" #else #define annotate_unreachable() #endif diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 219f82f3ec1a..641f5912d75f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -185,6 +185,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* Unreachable code */ +#ifndef ASM_UNREACHABLE +# define ASM_UNREACHABLE +#endif #ifndef unreachable # define unreachable() do { } while (1) #endif -- cgit v1.2.3 From 885dcd709ba9120b9935415b8b0f9d1b94e5826b Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Wed, 19 Jul 2017 03:06:34 +0530 Subject: powerpc/perf: Add nest IMC PMU support Add support to register Nest In-Memory Collection PMU counters. Patch adds a new device file called "imc-pmu.c" under powerpc/perf folder to contain all the device PMU functions. Device tree parser code added to parse the PMU events information and create sysfs event attributes for the PMU. Cpumask attribute added along with Cpu hotplug online/offline functions specific for nest PMU. A new state "CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE" added for the cpu hotplug callbacks. Error handle path frees the memory and unregisters the CPU hotplug callbacks. Signed-off-by: Anju T Sudhakar Signed-off-by: Hemant Kumar Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/Makefile | 1 + arch/powerpc/perf/imc-pmu.c | 749 ++++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-imc.c | 5 + include/linux/cpuhotplug.h | 1 + 4 files changed, 756 insertions(+) create mode 100644 arch/powerpc/perf/imc-pmu.c (limited to 'include') diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index 4d606b99a5cb..3f3a5ce66495 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -8,6 +8,7 @@ obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ isa207-common.o power8-pmu.o power9-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o +obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c new file mode 100644 index 000000000000..4543faa1bb0d --- /dev/null +++ b/arch/powerpc/perf/imc-pmu.c @@ -0,0 +1,749 @@ +/* + * In-Memory Collection (IMC) Performance Monitor counter support. + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. 
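Returning briefly to the ASM_UNREACHABLE macro added above: since it expands to a string literal, it is meant to be pasted directly into an asm() template, where it plants a local label that is recorded in the .discard.unreachable section for objtool. A hedged, illustrative use (not taken from the patch):

/* Tell objtool that control flow cannot continue past the trapping instruction. */
asm volatile("ud2\n\t" ASM_UNREACHABLE);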
+ * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ +#include +#include +#include +#include +#include +#include +#include + +/* Nest IMC data structures and variables */ + +/* + * Used to avoid races in counting the nest-pmu units during hotplug + * register and unregister + */ +static DEFINE_MUTEX(nest_init_lock); +static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); +static struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS]; +static cpumask_t nest_imc_cpumask; +struct imc_pmu_ref *nest_imc_refc; +static int nest_pmus; + +struct imc_pmu *imc_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct imc_pmu, pmu); +} + +PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(offset, "config:0-31"); +PMU_FORMAT_ATTR(rvalue, "config:32"); +PMU_FORMAT_ATTR(mode, "config:33-40"); +static struct attribute *imc_format_attrs[] = { + &format_attr_event.attr, + &format_attr_offset.attr, + &format_attr_rvalue.attr, + &format_attr_mode.attr, + NULL, +}; + +static struct attribute_group imc_format_group = { + .name = "format", + .attrs = imc_format_attrs, +}; + +/* Get the cpumask printed to a buffer "buf" */ +static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu); + cpumask_t *active_mask; + + /* Subsequent patch will add more pmu types here */ + switch(imc_pmu->domain){ + case IMC_DOMAIN_NEST: + active_mask = &nest_imc_cpumask; + break; + default: + return 0; + } + + return cpumap_print_to_pagebuf(true, buf, active_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL); + +static struct attribute *imc_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group imc_pmu_cpumask_attr_group = { + .attrs = imc_pmu_cpumask_attrs, +}; + +/* device_str_attr_create : Populate event "name" and string "str" in attribute */ +static struct attribute *device_str_attr_create(const char *name, const char *str) +{ + struct perf_pmu_events_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return NULL; + sysfs_attr_init(&attr->attr.attr); + + attr->event_str = str; + attr->attr.attr.name = name; + attr->attr.attr.mode = 0444; + attr->attr.show = perf_event_sysfs_show; + + return &attr->attr.attr; +} + +struct imc_events *imc_parse_event(struct device_node *np, const char *scale, + const char *unit, const char *prefix, u32 base) +{ + struct imc_events *event; + const char *s; + u32 reg; + + event = kzalloc(sizeof(struct imc_events), GFP_KERNEL); + if (!event) + return NULL; + + if (of_property_read_u32(np, "reg", &reg)) + goto error; + /* Add the base_reg value to the "reg" */ + event->value = base + reg; + + if (of_property_read_string(np, "event-name", &s)) + goto error; + + event->name = kasprintf(GFP_KERNEL, "%s%s", prefix, s); + if (!event->name) + goto error; + + if (of_property_read_string(np, "scale", &s)) + s = scale; + + if (s) { + event->scale = kstrdup(s, GFP_KERNEL); + if (!event->scale) + goto error; + } + + if (of_property_read_string(np, "unit", &s)) + s = unit; + + if (s) { + event->unit = kstrdup(s, GFP_KERNEL); + if (!event->unit) + goto error; +
} + + return event; +error: + kfree(event->unit); + kfree(event->scale); + kfree(event->name); + kfree(event); + + return NULL; +} + +/* + * update_events_in_group: Update the "events" information in an attr_group + * and assign the attr_group to the pmu "pmu". + */ +static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) +{ + struct attribute_group *attr_group; + struct attribute **attrs, *dev_str; + struct device_node *np, *pmu_events; + struct imc_events *ev; + u32 handle, base_reg; + int i=0, j=0, ct; + const char *prefix, *g_scale, *g_unit; + const char *ev_val_str, *ev_scale_str, *ev_unit_str; + + if (!of_property_read_u32(node, "events", &handle)) + pmu_events = of_find_node_by_phandle(handle); + else + return 0; + + /* Did not find any node with a given phandle */ + if (!pmu_events) + return 0; + + /* Get a count of number of child nodes */ + ct = of_get_child_count(pmu_events); + + /* Get the event prefix */ + if (of_property_read_string(node, "events-prefix", &prefix)) + return 0; + + /* Get a global unit and scale data if available */ + if (of_property_read_string(node, "scale", &g_scale)) + g_scale = NULL; + + if (of_property_read_string(node, "unit", &g_unit)) + g_unit = NULL; + + /* "reg" property gives out the base offset of the counters data */ + of_property_read_u32(node, "reg", &base_reg); + + /* Allocate memory for the events */ + pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL); + if (!pmu->events) + return -ENOMEM; + + ct = 0; + /* Parse the events and update the struct */ + for_each_child_of_node(pmu_events, np) { + ev = imc_parse_event(np, g_scale, g_unit, prefix, base_reg); + if (ev) + pmu->events[ct++] = ev; + } + + /* Allocate memory for attribute group */ + attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + /* + * Allocate memory for attributes. + * Since we have count of events for this pmu, we also allocate + * memory for the scale and unit attribute for now. + * "ct" has the total event structs added from the events-parent node. + * So allocate three times the "ct" (this includes event, event_scale and + * event_unit). 
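For reference, imc_parse_event() and update_events_in_group() above consume a device-tree shape along these lines; the fragment is invented for illustration (names, offsets, and values are not from the patch):

nest_mcs_events: nest-mcs-events {
	event@98 {
		event-name = "PM_MCS_UP_128B_DATA_XFER";
		reg = <0x98>;
		scale = "1.2207e-4";
		unit = "MiB";
	};
};

mcs0 {
	events-prefix = "PM_MCS0_";
	reg = <0x118>;
	events = <&nest_mcs_events>;
};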
+ */ + attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL); + if (!attrs) { + kfree(attr_group); + kfree(pmu->events); + return -ENOMEM; + } + + attr_group->name = "events"; + attr_group->attrs = attrs; + do { + ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i]->value); + dev_str = device_str_attr_create(pmu->events[i]->name, ev_val_str); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + if (pmu->events[i]->scale) { + ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale",pmu->events[i]->name); + dev_str = device_str_attr_create(ev_scale_str, pmu->events[i]->scale); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + } + + if (pmu->events[i]->unit) { + ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit",pmu->events[i]->name); + dev_str = device_str_attr_create(ev_unit_str, pmu->events[i]->unit); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + } + } while (++i < ct); + + /* Save the event attribute */ + pmu->attr_groups[IMC_EVENT_ATTR] = attr_group; + + kfree(pmu->events); + return 0; +} + +/* get_nest_pmu_ref: Return the imc_pmu_ref struct for the given node */ +static struct imc_pmu_ref *get_nest_pmu_ref(int cpu) +{ + return per_cpu(local_nest_imc_refc, cpu); +} + +static void nest_change_cpu_context(int old_cpu, int new_cpu) +{ + struct imc_pmu **pn = per_nest_pmu_arr; + int i; + + if (old_cpu < 0 || new_cpu < 0) + return; + + for (i = 0; *pn && i < IMC_MAX_PMUS; i++, pn++) + perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu); +} + +static int ppc_nest_imc_cpu_offline(unsigned int cpu) +{ + int nid, target = -1; + const struct cpumask *l_cpumask; + struct imc_pmu_ref *ref; + + /* + * Check in the designated list for this cpu. Don't bother + * if not one of them. + */ + if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask)) + return 0; + + /* + * Now that this cpu is one of the designated, + * find a next cpu a) which is online and b) in same chip. + */ + nid = cpu_to_node(cpu); + l_cpumask = cpumask_of_node(nid); + target = cpumask_any_but(l_cpumask, cpu); + + /* + * Update the cpumask with the target cpu and + * migrate the context if needed + */ + if (target >= 0 && target < nr_cpu_ids) { + cpumask_set_cpu(target, &nest_imc_cpumask); + nest_change_cpu_context(cpu, target); + } else { + opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + /* + * If this is the last cpu in this chip, then skip the reference + * count mutex lock and make the reference count on this chip zero. + */ + ref = get_nest_pmu_ref(cpu); + if (!ref) + return -EINVAL; + + ref->refc = 0; + } + return 0; +} + +static int ppc_nest_imc_cpu_online(unsigned int cpu) +{ + const struct cpumask *l_cpumask; + static struct cpumask tmp_mask; + int res; + + /* Get the cpumask of this node */ + l_cpumask = cpumask_of_node(cpu_to_node(cpu)); + + /* + * If this is not the first online CPU on this node, then + * just return. + */ + if (cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask)) + return 0; + + /* + * If this is the first online cpu on this node + * disable the nest counters by making an OPAL call.
+ */ + res = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + if (res) + return res; + + /* Make this CPU the designated target for counter collection */ + cpumask_set_cpu(cpu, &nest_imc_cpumask); + return 0; +} + +static int nest_pmu_cpumask_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, + "perf/powerpc/imc:online", + ppc_nest_imc_cpu_online, + ppc_nest_imc_cpu_offline); +} + +static void nest_imc_counters_release(struct perf_event *event) +{ + int rc, node_id; + struct imc_pmu_ref *ref; + + if (event->cpu < 0) + return; + + node_id = cpu_to_node(event->cpu); + + /* + * See if we need to disable the nest PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * enable or disable the nest counters. + */ + ref = get_nest_pmu_ref(event->cpu); + if (!ref) + return; + + /* Take the mutex lock for this node and then decrement the reference count */ + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&nest_imc_refc[node_id].lock); + pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id); + return; + } + } else if (ref->refc < 0) { + WARN(1, "nest-imc: Invalid event reference count\n"); + ref->refc = 0; + } + mutex_unlock(&ref->lock); +} + +static int nest_imc_event_init(struct perf_event *event) +{ + int chip_id, rc, node_id; + u32 l_config, config = event->attr.config; + struct imc_mem_info *pcni; + struct imc_pmu *pmu; + struct imc_pmu_ref *ref; + bool flag = false; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + pmu = imc_event_to_pmu(event); + + /* Sanity check for config (event offset) */ + if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size) + return -EINVAL; + + /* + * Nest HW counter memory resides in a per-chip reserve-memory (HOMER). + * Get the base memory address for this cpu. + */ + chip_id = topology_physical_package_id(event->cpu); + pcni = pmu->mem_info; + do { + if (pcni->id == chip_id) { + flag = true; + break; + } + pcni++; + } while (pcni); + + if (!flag) + return -ENODEV; + + /* + * Add the event offset to the base address. + */ + l_config = config & IMC_EVENT_OFFSET_MASK; + event->hw.event_base = (u64)pcni->vbase + l_config; + node_id = cpu_to_node(event->cpu); + + /* + * Get the imc_pmu_ref struct for this node. + * Take the mutex lock and then increment the count of nest pmu events + * inited.
+ */ + ref = get_nest_pmu_ref(event->cpu); + if (!ref) + return -EINVAL; + + mutex_lock(&ref->lock); + if (ref->refc == 0) { + rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("nest-imc: Unable to start the counters for node %d\n", + node_id); + return rc; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + event->destroy = nest_imc_counters_release; + return 0; +} + +static u64 *get_event_base_addr(struct perf_event *event) +{ + /* + * Subsequent patch will add code to detect caller imc pmu + * and return accordingly. + */ + return (u64 *)event->hw.event_base; +} + +static u64 imc_read_counter(struct perf_event *event) +{ + u64 *addr, data; + + /* + * In-Memory Collection (IMC) counters are free-flowing counters. + * So we take a snapshot of the counter value on enable and save it + * to calculate the delta at a later stage to present the event counter + * value. + */ + addr = get_event_base_addr(event); + data = be64_to_cpu(READ_ONCE(*addr)); + local64_set(&event->hw.prev_count, data); + + return data; +} + +static void imc_event_update(struct perf_event *event) +{ + u64 counter_prev, counter_new, final_count; + + counter_prev = local64_read(&event->hw.prev_count); + counter_new = imc_read_counter(event); + final_count = counter_new - counter_prev; + + /* Update the delta to the event count */ + local64_add(final_count, &event->count); +} + +static void imc_event_start(struct perf_event *event, int flags) +{ + /* + * In Memory Counters are free-flowing counters. HW or the microcode + * keeps adding to the counter offset in memory. To get the event + * counter value, we snapshot the value here and calculate the + * delta at a later point. + */ + imc_read_counter(event); +} + +static void imc_event_stop(struct perf_event *event, int flags) +{ + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); +} + +static int imc_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + imc_event_start(event, flags); + + return 0; +} + +/* update_pmu_ops : Populate the appropriate operations for "pmu" */ +static int update_pmu_ops(struct imc_pmu *pmu) +{ + pmu->pmu.task_ctx_nr = perf_invalid_context; + pmu->pmu.add = imc_event_add; + pmu->pmu.del = imc_event_stop; + pmu->pmu.start = imc_event_start; + pmu->pmu.stop = imc_event_stop; + pmu->pmu.read = imc_event_update; + pmu->pmu.attr_groups = pmu->attr_groups; + pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group; + + /* Subsequent patch will add more pmu types here */ + switch (pmu->domain) { + case IMC_DOMAIN_NEST: + pmu->pmu.event_init = nest_imc_event_init; + pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; + break; + default: + break; + } + + return 0; +} + +/* init_nest_pmu_ref: Initialize the imc_pmu_ref struct for all the nodes */ +static int init_nest_pmu_ref(void) +{ + int nid, i, cpu; + + nest_imc_refc = kcalloc(num_possible_nodes(), sizeof(*nest_imc_refc), + GFP_KERNEL); + + if (!nest_imc_refc) + return -ENOMEM; + + i = 0; + for_each_node(nid) { + /* + * Mutex lock to avoid races while tracking the number of + * sessions using the chip's nest pmu units. + */ + mutex_init(&nest_imc_refc[i].lock); + + /* + * Loop to init the "id" with the node_id. Variable "i" is + * initialized to 0 and used as an index into the array. "i" will + * not run off the end of the array since for_each_node() loops + * over N_POSSIBLE nodes only.
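+ * Note that node ids themselves need not be contiguous, which is why + * the id is recorded in each entry instead of using nid as the array + * index directly.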
+ */ + nest_imc_refc[i++].id = nid; + } + + /* + * Loop to init the per_cpu "local_nest_imc_refc" with the proper + * "nest_imc_refc" index. This makes get_nest_pmu_ref() a lot simpler. + */ + for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + for_each_online_node(i) { + if (nest_imc_refc[i].id == nid) { + per_cpu(local_nest_imc_refc, cpu) = &nest_imc_refc[i]; + break; + } + } + } + return 0; +} + +/* + * Common function to unregister cpu hotplug callback and + * free the memory. + * TODO: Need to handle pmu unregistering, which will be + * done in a follow-up series. + */ +static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) +{ + if (pmu_ptr->domain == IMC_DOMAIN_NEST) { + mutex_lock(&nest_init_lock); + if (nest_pmus == 1) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); + kfree(nest_imc_refc); + } + + if (nest_pmus > 0) + nest_pmus--; + mutex_unlock(&nest_init_lock); + } + + /* Only free the attr_groups which are dynamically allocated */ + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); + kfree(pmu_ptr); + return; +} + + +/* + * imc_mem_init : Function to support memory allocation for core imc. + */ +static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, + int pmu_index) +{ + const char *s; + + if (of_property_read_string(parent, "name", &s)) + return -ENODEV; + + /* Subsequent patch will add more pmu types here */ + switch (pmu_ptr->domain) { + case IMC_DOMAIN_NEST: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + /* Needed for hotplug/migration */ + per_nest_pmu_arr[pmu_index] = pmu_ptr; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * init_imc_pmu : Setup and register the IMC pmu device. + * + * @parent: Device tree unit node + * @pmu_ptr: memory allocated for this pmu + * @pmu_idx: Count of nest pmus registered + * + * init_imc_pmu() sets up the pmu cpumask and registers a cpu hotplug callback. + * Handles failure cases and accordingly frees memory. + */ +int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx) +{ + int ret; + + ret = imc_mem_init(pmu_ptr, parent, pmu_idx); + if (ret) + goto err_free; + + /* Subsequent patch will add more pmu types here */ + switch (pmu_ptr->domain) { + case IMC_DOMAIN_NEST: + /* + * Nest imc pmus need only one cpu per chip; we initialize the + * cpumask for the first nest imc pmu and use the same for the + * rest. To handle the cpuhotplug callback unregister, we track + * the number of nest pmus in "nest_pmus". + */ + mutex_lock(&nest_init_lock); + if (nest_pmus == 0) { + ret = init_nest_pmu_ref(); + if (ret) { + mutex_unlock(&nest_init_lock); + goto err_free; + } + /* Register for cpu hotplug notification.
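+ * The online/offline callbacks above keep one designated cpu per + * chip in nest_imc_cpumask and migrate the perf context to a + * sibling when that cpu goes offline.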
*/ + ret = nest_pmu_cpumask_init(); + if (ret) { + mutex_unlock(&nest_init_lock); + goto err_free; + } + } + nest_pmus++; + mutex_unlock(&nest_init_lock); + break; + default: + return -1; /* Unknown domain */ + } + + ret = update_events_in_group(parent, pmu_ptr); + if (ret) + goto err_free; + + ret = update_pmu_ops(pmu_ptr); + if (ret) + goto err_free; + + ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1); + if (ret) + goto err_free; + + pr_info("%s performance monitor hardware support registered\n", + pmu_ptr->pmu.name); + + return 0; + +err_free: + imc_common_cpuhp_mem_free(pmu_ptr); + return ret; +} diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index f57a6fbd3f57..b903bf5e6006 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -108,6 +108,11 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) } } + /* Function to register IMC pmu */ + ret = init_imc_pmu(parent, pmu_ptr, pmu_index); + if (ret) + pr_err("IMC PMU %s Register failed\n", pmu_ptr->pmu.name); + return 0; free_pmu: diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b56573bf440d..0853a14b1fa1 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -139,6 +139,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, + CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v1.2.3 From 39a846db1d574a498511ffccd75223a35cdcb059 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Wed, 19 Jul 2017 03:06:35 +0530 Subject: powerpc/perf: Add core IMC PMU support Add support to register Core In-Memory Collection PMU counters. Patch adds core IMC specific data structures, along with memory init functions and CPU hotplug support. Signed-off-by: Anju T Sudhakar Signed-off-by: Hemant Kumar Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 303 +++++++++++++++++++++++++++++++++++++++++++- include/linux/cpuhotplug.h | 1 + 2 files changed, 300 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 4543faa1bb0d..482f8d6d5e65 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -31,6 +31,12 @@ static cpumask_t nest_imc_cpumask; struct imc_pmu_ref *nest_imc_refc; static int nest_pmus; +/* Core IMC data structures and variables */ + +static cpumask_t core_imc_cpumask; +struct imc_pmu_ref *core_imc_refc; +static struct imc_pmu *core_imc_pmu; + struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -62,11 +68,13 @@ static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu); cpumask_t *active_mask; - /* Subsequenct patch will add more pmu types here */ switch(imc_pmu->domain){ case IMC_DOMAIN_NEST: active_mask = &nest_imc_cpumask; break; + case IMC_DOMAIN_CORE: + active_mask = &core_imc_cpumask; + break; default: return 0; } @@ -486,6 +494,240 @@ static int nest_imc_event_init(struct perf_event *event) return 0; } +/* + * core_imc_mem_init : Initializes memory for the current core. + * + * Uses alloc_pages_node() and uses the returned address as an argument to + * an opal call to configure the pdbar. 
The address sent as an argument is + * converted to a physical address before the opal call is made. This is the + * base address at which the core imc counters are populated. + */ +static int core_imc_mem_init(int cpu, int size) +{ + int phys_id, rc = 0, core_id = (cpu / threads_per_core); + struct imc_mem_info *mem_info; + + /* + * alloc_pages_node() will allocate memory for the core in the + * local node only. + */ + phys_id = topology_physical_package_id(cpu); + mem_info = &core_imc_pmu->mem_info[core_id]; + mem_info->id = core_id; + + /* We need only vbase for core counters */ + mem_info->vbase = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, + get_order(size))); + if (!mem_info->vbase) + return -ENOMEM; + + /* Init the mutex */ + core_imc_refc[core_id].id = core_id; + mutex_init(&core_imc_refc[core_id].lock); + + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE, + __pa((void *)mem_info->vbase), + get_hard_smp_processor_id(cpu)); + if (rc) { + free_pages((u64)mem_info->vbase, get_order(size)); + mem_info->vbase = NULL; + } + + return rc; +} + +static bool is_core_imc_mem_inited(int cpu) +{ + struct imc_mem_info *mem_info; + int core_id = (cpu / threads_per_core); + + mem_info = &core_imc_pmu->mem_info[core_id]; + if (!mem_info->vbase) + return false; + + return true; +} + +static int ppc_core_imc_cpu_online(unsigned int cpu) +{ + const struct cpumask *l_cpumask; + static struct cpumask tmp_mask; + int ret = 0; + + /* Get the cpumask for this core */ + l_cpumask = cpu_sibling_mask(cpu); + + /* If a cpu for this core is already set, then, don't do anything */ + if (cpumask_and(&tmp_mask, l_cpumask, &core_imc_cpumask)) + return 0; + + if (!is_core_imc_mem_inited(cpu)) { + ret = core_imc_mem_init(cpu, core_imc_pmu->counter_mem_size); + if (ret) { + pr_info("core_imc memory allocation for cpu %d failed\n", cpu); + return ret; + } + } + + /* set the cpu in the mask */ + cpumask_set_cpu(cpu, &core_imc_cpumask); + return 0; +} + +static int ppc_core_imc_cpu_offline(unsigned int cpu) +{ + unsigned int ncpu, core_id; + struct imc_pmu_ref *ref; + + /* + * clear this cpu out of the mask, if not present in the mask, + * don't bother doing anything. + */ + if (!cpumask_test_and_clear_cpu(cpu, &core_imc_cpumask)) + return 0; + + /* Find any online cpu in that core except the current "cpu" */ + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + + if (ncpu >= 0 && ncpu < nr_cpu_ids) { + cpumask_set_cpu(ncpu, &core_imc_cpumask); + perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu); + } else { + /* + * If this is the last cpu in this core, then skip taking the + * reference count mutex lock for this core and directly zero + * "refc" for this core. + */ + opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(cpu)); + core_id = cpu / threads_per_core; + ref = &core_imc_refc[core_id]; + if (!ref) + return -EINVAL; + + ref->refc = 0; + } + return 0; +} + +static int core_imc_pmu_cpumask_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, + "perf/powerpc/imc_core:online", + ppc_core_imc_cpu_online, + ppc_core_imc_cpu_offline); +} + +static void core_imc_counters_release(struct perf_event *event) +{ + int rc, core_id; + struct imc_pmu_ref *ref; + + if (event->cpu < 0) + return; + /* + * See if we need to disable the IMC PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task enabling + * or disabling the core counters.
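+ * This mirrors nest_imc_counters_release() above, with the reference + * count kept per core (core_imc_refc[core_id]) rather than per node.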
+ */ + core_id = event->cpu / threads_per_core; + + /* Take the mutex lock and decrement the reference count for this core */ + ref = &core_imc_refc[core_id]; + if (!ref) + return; + + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("IMC: Unable to stop the counters for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + WARN(1, "core-imc: Invalid event reference count\n"); + ref->refc = 0; + } + mutex_unlock(&ref->lock); +} + +static int core_imc_event_init(struct perf_event *event) +{ + int core_id, rc; + u64 config = event->attr.config; + struct imc_mem_info *pcmi; + struct imc_pmu *pmu; + struct imc_pmu_ref *ref; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + event->hw.idx = -1; + pmu = imc_event_to_pmu(event); + + /* Sanity check for config (event offset) */ + if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size) + return -EINVAL; + + if (!is_core_imc_mem_inited(event->cpu)) + return -ENODEV; + + core_id = event->cpu / threads_per_core; + pcmi = &core_imc_pmu->mem_info[core_id]; + if (!pcmi->vbase) + return -ENODEV; + + /* Get the core_imc mutex for this core */ + ref = &core_imc_refc[core_id]; + if (!ref) + return -EINVAL; + + /* + * Core pmu units are enabled only when they are in use. + * See if this is triggered for the first time. + * If yes, take the mutex lock and enable the core counters. + * If not, just increment the count in the core_imc_refc struct.
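+ * (The counter memory and pdbar for this core were already set up in + * core_imc_mem_init(); the start call below then enables the engine.)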
+ */ + mutex_lock(&ref->lock); + if (ref->refc == 0) { + rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("core-imc: Unable to start the counters for core %d\n", + core_id); + return rc; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); + event->destroy = core_imc_counters_release; + return 0; +} + static u64 * get_event_base_addr(struct perf_event *event) { /* @@ -564,12 +806,15 @@ static int update_pmu_ops(struct imc_pmu *pmu) pmu->pmu.attr_groups = pmu->attr_groups; pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group; - /* Subsequenct patch will add more pmu types here */ switch (pmu->domain) { case IMC_DOMAIN_NEST: pmu->pmu.event_init = nest_imc_event_init; pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; break; + case IMC_DOMAIN_CORE: + pmu->pmu.event_init = core_imc_event_init; + pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; + break; default: break; } @@ -621,6 +866,22 @@ static int init_nest_pmu_ref(void) return 0; } +static void cleanup_all_core_imc_memory(void) +{ + int i, nr_cores = num_present_cpus() / threads_per_core; + struct imc_mem_info *ptr = core_imc_pmu->mem_info; + int size = core_imc_pmu->counter_mem_size; + + /* mem_info will never be NULL */ + for (i = 0; i < nr_cores; i++) { + if (ptr[i].vbase) + free_pages((u64)ptr->vbase, get_order(size)); + } + + kfree(ptr); + kfree(core_imc_refc); +} + /* * Common function to unregister cpu hotplug callback and * free the memory. @@ -641,6 +902,12 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) mutex_unlock(&nest_init_lock); } + /* Free core_imc memory */ + if (pmu_ptr->domain == IMC_DOMAIN_CORE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE); + cleanup_all_core_imc_memory(); + } + /* Only free the attr_groups which are dynamically allocated */ kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); @@ -656,11 +923,11 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, int pmu_index) { const char *s; + int nr_cores; if (of_property_read_string(parent, "name", &s)) return -ENODEV; - /* Subsequenct patch will add more pmu types here */ switch (pmu_ptr->domain) { case IMC_DOMAIN_NEST: /* Update the pmu name */ @@ -671,6 +938,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, /* Needed for hotplug/migration */ per_nest_pmu_arr[pmu_index] = pmu_ptr; break; + case IMC_DOMAIN_CORE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + nr_cores = num_present_cpus() / threads_per_core; + pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), + GFP_KERNEL); + + if (!pmu_ptr->mem_info) + return -ENOMEM; + + core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + + if (!core_imc_refc) + return -ENOMEM; + + core_imc_pmu = pmu_ptr; + break; default: return -EINVAL; } @@ -696,7 +984,6 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id if (ret) goto err_free; - /* Subsequenct patch will add more pmu types here */ switch (pmu_ptr->domain) { case IMC_DOMAIN_NEST: /* @@ -721,6 +1008,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id } nest_pmus++; mutex_unlock(&nest_init_lock); + break; + case IMC_DOMAIN_CORE: + ret = 
core_imc_pmu_cpumask_init(); + if (ret) { + cleanup_all_core_imc_memory(); + return ret; + } + break; default: return -1; /* Unknown domain */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0853a14b1fa1..1be505db0090 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -140,6 +140,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, + CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v1.2.3 From f74c89bd80fb3f1328fdf4a44eeba793cdce4222 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Wed, 19 Jul 2017 03:06:36 +0530 Subject: powerpc/perf: Add thread IMC PMU support Add support to register Thread In-Memory Collection PMU counters. Patch adds thread IMC specific data structures, along with memory init functions and CPU hotplug support. Signed-off-by: Anju T Sudhakar Signed-off-by: Hemant Kumar Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 270 +++++++++++++++++++++++++++++++++++++++++++- include/linux/cpuhotplug.h | 1 + 2 files changed, 267 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 482f8d6d5e65..46cd912af060 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -37,6 +37,12 @@ static cpumask_t core_imc_cpumask; struct imc_pmu_ref *core_imc_refc; static struct imc_pmu *core_imc_pmu; +/* Thread IMC data structures and variables */ + +static DEFINE_PER_CPU(u64 *, thread_imc_mem); +static struct imc_pmu *thread_imc_pmu; +static int thread_imc_mem_size; + struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -728,15 +734,188 @@ static int core_imc_event_init(struct perf_event *event) return 0; } -static u64 * get_event_base_addr(struct perf_event *event) +/* + * Allocates a page of memory for each of the online cpus, and write the + * physical base address of that page to the LDBAR for that cpu. + * + * LDBAR Register Layout: + * + * 0 4 8 12 16 20 24 28 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * | | [ ] [ Counter Address [8:50] + * | * Mode | + * | * PB Scope + * * Enable/Disable + * + * 32 36 40 44 48 52 56 60 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * Counter Address [8:50] ] + * + */ +static int thread_imc_mem_alloc(int cpu_id, int size) +{ + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + int phys_id = topology_physical_package_id(cpu_id); + + if (!local_mem) { + /* + * This case could happen only once at start, since we dont + * free the memory in cpu offline path. 
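+ * (ppc_thread_imc_cpu_offline() below only clears LDBAR; the page is + * reused when the cpu comes back online.)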
+ */ + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, + get_order(size))); + if (!local_mem) + return -ENOMEM; + + per_cpu(thread_imc_mem, cpu_id) = local_mem; + } + + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + + mtspr(SPRN_LDBAR, ldbar_value); + return 0; +} + +static int ppc_thread_imc_cpu_online(unsigned int cpu) { + return thread_imc_mem_alloc(cpu, thread_imc_mem_size); +} + +static int ppc_thread_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int thread_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + "perf/powerpc/imc_thread:online", + ppc_thread_imc_cpu_online, + ppc_thread_imc_cpu_offline); +} + +void thread_imc_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + int core_id; + struct imc_pmu_ref *ref; + + if (!is_core_imc_mem_inited(smp_processor_id())) + return; + + core_id = smp_processor_id() / threads_per_core; /* - * Subsequent patch will add code to detect caller imc pmu - * and return accordingly. + * imc pmus are enabled only when they are in use. + * See if this is triggered for the first time. + * If yes, take the mutex lock and enable the counters. + * If not, just increment the count in ref count struct. */ + ref = &core_imc_refc[core_id]; + if (!ref) + return; + + if (sched_in) { + mutex_lock(&ref->lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("thread-imc: Unable to start the counter for core %d\n", + core_id); + return; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + } else { + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("thread-imc: Unable to stop the counters for core %d\n", + core_id); + return; + } + } else if (ref->refc < 0) { + ref->refc = 0; + } + mutex_unlock(&ref->lock); + } + + return; +} + +static int thread_imc_event_init(struct perf_event *event) +{ + u32 config = event->attr.config; + struct task_struct *target; + struct imc_pmu *pmu; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + event->hw.idx = -1; + pmu = imc_event_to_pmu(event); + + /* Sanity check for config offset */ + if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size) + return -EINVAL; + + target = event->hw.target; + if (!target) + return -EINVAL; + + event->pmu->task_ctx_nr = perf_sw_context; + return 0; +} + +static bool is_thread_imc_pmu(struct perf_event *event) +{ + if (!strncmp(event->pmu->name, "thread_imc", strlen("thread_imc"))) + return true; + + return false; +} + +static u64 *get_event_base_addr(struct perf_event *event) +{ + u64 addr; + + if (is_thread_imc_pmu(event)) { + addr = (u64)per_cpu(thread_imc_mem, smp_processor_id()); + return (u64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK)); + } + + return (u64 *)event->hw.event_base; +} + +static void thread_imc_pmu_start_txn(struct pmu *pmu, + unsigned int txn_flags) +{ + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); +} + +static void thread_imc_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +static int thread_imc_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static u64 
imc_read_counter(struct perf_event *event) { u64 *addr, data; @@ -794,6 +973,26 @@ static int imc_event_add(struct perf_event *event, int flags) return 0; } +static int thread_imc_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + imc_event_start(event, flags); + + /* Enable the sched_task to start the engine */ + perf_sched_cb_inc(event->ctx->pmu); + return 0; +} + +static void thread_imc_event_del(struct perf_event *event, int flags) +{ + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); + perf_sched_cb_dec(event->ctx->pmu); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -815,6 +1014,15 @@ static int update_pmu_ops(struct imc_pmu *pmu) pmu->pmu.event_init = core_imc_event_init; pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; break; + case IMC_DOMAIN_THREAD: + pmu->pmu.event_init = thread_imc_event_init; + pmu->pmu.sched_task = thread_imc_pmu_sched_task; + pmu->pmu.add = thread_imc_event_add; + pmu->pmu.del = thread_imc_event_del; + pmu->pmu.start_txn = thread_imc_pmu_start_txn; + pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn; + pmu->pmu.commit_txn = thread_imc_pmu_commit_txn; + break; default: break; } @@ -882,6 +1090,31 @@ static void cleanup_all_core_imc_memory(void) kfree(core_imc_refc); } +static void thread_imc_ldbar_disable(void *dummy) +{ + /* + * By Zeroing LDBAR, we disable thread-imc + * updates. + */ + mtspr(SPRN_LDBAR, 0); +} + +void thread_imc_disable(void) +{ + on_each_cpu(thread_imc_ldbar_disable, NULL, 1); +} + +static void cleanup_all_thread_imc_memory(void) +{ + int i, order = get_order(thread_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(thread_imc_mem, i)) + free_pages((u64)per_cpu(thread_imc_mem, i), order); + + } +} + /* * Common function to unregister cpu hotplug callback and * free the memory. 
@@ -908,6 +1141,12 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cleanup_all_core_imc_memory(); } + /* Free thread_imc memory */ + if (pmu_ptr->domain == IMC_DOMAIN_THREAD) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); + cleanup_all_thread_imc_memory(); + } + /* Only free the attr_groups which are dynamically allocated */ kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); @@ -923,7 +1162,7 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, int pmu_index) { const char *s; - int nr_cores; + int nr_cores, cpu, res; if (of_property_read_string(parent, "name", &s)) return -ENODEV; @@ -959,6 +1198,21 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, core_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_THREAD: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + thread_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size); + if (res) + return res; + } + + thread_imc_pmu = pmu_ptr; + break; default: return -EINVAL; } @@ -1016,6 +1270,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id return ret; } + break; + case IMC_DOMAIN_THREAD: + ret = thread_imc_cpu_init(); + if (ret) { + cleanup_all_thread_imc_memory(); + return ret; + } + break; default: return -1; /* Unknown domain */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 1be505db0090..1bc7dcfbf7b3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -141,6 +141,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, + CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v1.2.3 From 67bd22c09ac1b54b5f42445783d2dc8e011f7a71 Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Wed, 19 Jul 2017 12:04:07 -0500 Subject: power: supply: bq27xxx: move platform driver code into bq27xxx_battery_hdq.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the BQ27xxx driver was originally written the w1 subsystem only allowed device drivers for w1 attached devices to live in the w1 subsystem. Kernel driver subsystems expect that the driver for a device live in the directory of the subsystem for which it implements functionality, not in the directory of the bus that it is attached. To work around this, the BQ27xxx driver was implemented as a platform device driver and the interface driver would instantiate this device from within the w1 directory, then pass a w1 read callback as platform data. As we can now have the w1 interface driver in the power/supply directory (like we do already with the i2c interface driver) we can remove this middle-layer platform driver. Signed-off-by: Andrew F. 
Davis Acked-by: Pali Rohár Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 104 ----------------------- drivers/power/supply/bq27xxx_battery_hdq.c | 130 +++++++++++++++++------------ include/linux/power/bq27xxx_battery.h | 17 ---- 3 files changed, 75 insertions(+), 176 deletions(-) (limited to 'include') diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index ed44439d0112..079083b31293 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1938,110 +1938,6 @@ void bq27xxx_battery_teardown(struct bq27xxx_device_info *di) } EXPORT_SYMBOL_GPL(bq27xxx_battery_teardown); -static int bq27xxx_battery_platform_read(struct bq27xxx_device_info *di, u8 reg, - bool single) -{ - struct device *dev = di->dev; - struct bq27xxx_platform_data *pdata = dev->platform_data; - unsigned int timeout = 3; - int upper, lower; - int temp; - - if (!single) { - /* Make sure the value has not changed in between reading the - * lower and the upper part */ - upper = pdata->read(dev, reg + 1); - do { - temp = upper; - if (upper < 0) - return upper; - - lower = pdata->read(dev, reg); - if (lower < 0) - return lower; - - upper = pdata->read(dev, reg + 1); - } while (temp != upper && --timeout); - - if (timeout == 0) - return -EIO; - - return (upper << 8) | lower; - } - - return pdata->read(dev, reg); -} - -static int bq27xxx_battery_platform_probe(struct platform_device *pdev) -{ - struct bq27xxx_device_info *di; - struct bq27xxx_platform_data *pdata = pdev->dev.platform_data; - - if (!pdata) { - dev_err(&pdev->dev, "no platform_data supplied\n"); - return -EINVAL; - } - - if (!pdata->read) { - dev_err(&pdev->dev, "no hdq read callback supplied\n"); - return -EINVAL; - } - - if (!pdata->chip) { - dev_err(&pdev->dev, "no device supplied\n"); - return -EINVAL; - } - - di = devm_kzalloc(&pdev->dev, sizeof(*di), GFP_KERNEL); - if (!di) - return -ENOMEM; - - platform_set_drvdata(pdev, di); - - di->dev = &pdev->dev; - di->chip = pdata->chip; - di->name = pdata->name ?: dev_name(&pdev->dev); - di->bus.read = bq27xxx_battery_platform_read; - - return bq27xxx_battery_setup(di); -} - -static int bq27xxx_battery_platform_remove(struct platform_device *pdev) -{ - struct bq27xxx_device_info *di = platform_get_drvdata(pdev); - - bq27xxx_battery_teardown(di); - - return 0; -} - -static const struct platform_device_id bq27xxx_battery_platform_id_table[] = { - { "bq27000-battery", }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(platform, bq27xxx_battery_platform_id_table); - -#ifdef CONFIG_OF -static const struct of_device_id bq27xxx_battery_platform_of_match_table[] = { - { .compatible = "ti,bq27000" }, - {}, -}; -MODULE_DEVICE_TABLE(of, bq27xxx_battery_platform_of_match_table); -#endif - -static struct platform_driver bq27xxx_battery_platform_driver = { - .probe = bq27xxx_battery_platform_probe, - .remove = bq27xxx_battery_platform_remove, - .driver = { - .name = "bq27000-battery", - .of_match_table = of_match_ptr(bq27xxx_battery_platform_of_match_table), - }, - .id_table = bq27xxx_battery_platform_id_table, -}; -module_platform_driver(bq27xxx_battery_platform_driver); - -MODULE_ALIAS("platform:bq27000-battery"); - MODULE_AUTHOR("Rodolfo Giometti "); MODULE_DESCRIPTION("BQ27xxx battery monitor driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/power/supply/bq27xxx_battery_hdq.c b/drivers/power/supply/bq27xxx_battery_hdq.c index f4df67eb9d2c..9aff896c9802 100644 --- a/drivers/power/supply/bq27xxx_battery_hdq.c +++ 
b/drivers/power/supply/bq27xxx_battery_hdq.c @@ -1,10 +1,16 @@ /* - * Copyright (C) 2007 Texas Instruments, Inc. + * BQ27xxx battery monitor HDQ/1-wire driver * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. + * Copyright (C) 2007-2017 Texas Instruments Incorporated - http://www.ti.com/ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. */ #include @@ -19,17 +25,16 @@ #define W1_FAMILY_BQ27000 0x01 -#define HDQ_CMD_READ (0) -#define HDQ_CMD_WRITE (1<<7) +#define HDQ_CMD_READ (0 << 7) +#define HDQ_CMD_WRITE (1 << 7) static int F_ID; module_param(F_ID, int, S_IRUSR); -MODULE_PARM_DESC(F_ID, "1-wire slave FID for BQ device"); +MODULE_PARM_DESC(F_ID, "1-wire slave FID for BQ27xxx device"); -static int w1_bq27000_read(struct device *dev, unsigned int reg) +static int w1_bq27000_read(struct w1_slave *sl, unsigned int reg) { u8 val; - struct w1_slave *sl = container_of(dev->parent, struct w1_slave, dev); mutex_lock(&sl->master->bus_mutex); w1_write_8(sl->master, HDQ_CMD_READ | reg); @@ -39,77 +44,92 @@ static int w1_bq27000_read(struct device *dev, unsigned int reg) return val; } -static struct bq27xxx_platform_data bq27000_battery_info = { - .read = w1_bq27000_read, - .name = "bq27000-battery", - .chip = BQ27000, -}; - -static int w1_bq27000_add_slave(struct w1_slave *sl) +static int bq27xxx_battery_hdq_read(struct bq27xxx_device_info *di, u8 reg, + bool single) { - int ret; - struct platform_device *pdev; - - pdev = platform_device_alloc("bq27000-battery", -1); - if (!pdev) { - ret = -ENOMEM; - return ret; + struct w1_slave *sl = dev_to_w1_slave(di->dev); + unsigned int timeout = 3; + int upper, lower; + int temp; + + if (!single) { + /* + * Make sure the value has not changed in between reading the + * lower and the upper part + */ + upper = w1_bq27000_read(sl, reg + 1); + do { + temp = upper; + if (upper < 0) + return upper; + + lower = w1_bq27000_read(sl, reg); + if (lower < 0) + return lower; + + upper = w1_bq27000_read(sl, reg + 1); + } while (temp != upper && --timeout); + + if (timeout == 0) + return -EIO; + + return (upper << 8) | lower; } - ret = platform_device_add_data(pdev, - &bq27000_battery_info, - sizeof(bq27000_battery_info)); - if (ret) - goto pdev_add_failed; - pdev->dev.parent = &sl->dev; - ret = platform_device_add(pdev); - if (ret) - goto pdev_add_failed; + return w1_bq27000_read(sl, reg); +} - dev_set_drvdata(&sl->dev, pdev); +static int bq27xxx_battery_hdq_add_slave(struct w1_slave *sl) +{ + struct bq27xxx_device_info *di; + + di = devm_kzalloc(&sl->dev, sizeof(*di), GFP_KERNEL); + if (!di) + return -ENOMEM; - goto success; + dev_set_drvdata(&sl->dev, di); -pdev_add_failed: - platform_device_put(pdev); -success: - return ret; + di->dev = &sl->dev; + di->chip = BQ27000; + di->name = "bq27000-battery"; + di->bus.read = bq27xxx_battery_hdq_read; + + return bq27xxx_battery_setup(di); } -static void w1_bq27000_remove_slave(struct w1_slave *sl) +static void bq27xxx_battery_hdq_remove_slave(struct w1_slave *sl) { - struct platform_device *pdev 
= dev_get_drvdata(&sl->dev); + struct bq27xxx_device_info *di = dev_get_drvdata(&sl->dev); - platform_device_unregister(pdev); + bq27xxx_battery_teardown(di); } -static struct w1_family_ops w1_bq27000_fops = { - .add_slave = w1_bq27000_add_slave, - .remove_slave = w1_bq27000_remove_slave, +static struct w1_family_ops bq27xxx_battery_hdq_fops = { + .add_slave = bq27xxx_battery_hdq_add_slave, + .remove_slave = bq27xxx_battery_hdq_remove_slave, }; -static struct w1_family w1_bq27000_family = { +static struct w1_family bq27xxx_battery_hdq_family = { .fid = W1_FAMILY_BQ27000, - .fops = &w1_bq27000_fops, + .fops = &bq27xxx_battery_hdq_fops, }; -static int __init w1_bq27000_init(void) +static int __init bq27xxx_battery_hdq_init(void) { if (F_ID) - w1_bq27000_family.fid = F_ID; + bq27xxx_battery_hdq_family.fid = F_ID; - return w1_register_family(&w1_bq27000_family); + return w1_register_family(&bq27xxx_battery_hdq_family); } +module_init(bq27xxx_battery_hdq_init); -static void __exit w1_bq27000_exit(void) +static void __exit bq27xxx_battery_hdq_exit(void) { - w1_unregister_family(&w1_bq27000_family); + w1_unregister_family(&bq27xxx_battery_hdq_family); } - -module_init(w1_bq27000_init); -module_exit(w1_bq27000_exit); +module_exit(bq27xxx_battery_hdq_exit); MODULE_AUTHOR("Texas Instruments Ltd"); -MODULE_DESCRIPTION("HDQ/1-wire slave driver bq27000 battery monitor chip"); +MODULE_DESCRIPTION("BQ27xxx battery monitor HDQ/1-wire driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_BQ27000)); diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 11e11685dd1d..89890437f9ab 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -20,23 +20,6 @@ enum bq27xxx_chip { BQ27421, /* bq27421, bq27425, bq27441, bq27621 */ }; -/** - * struct bq27xxx_plaform_data - Platform data for bq27xxx devices - * @name: Name of the battery. - * @chip: Chip class number of this device. - * @read: HDQ read callback. - * This function should provide access to the HDQ bus the battery is - * connected to. - * The first parameter is a pointer to the battery device, the second the - * register to be read. The return value should either be the content of - * the passed register or an error value. - */ -struct bq27xxx_platform_data { - const char *name; - enum bq27xxx_chip chip; - int (*read)(struct device *dev, unsigned int); -}; - struct bq27xxx_device_info; struct bq27xxx_access_methods { int (*read)(struct bq27xxx_device_info *di, u8 reg, bool single); -- cgit v1.2.3 From 1ee6f00d1164955b7bdadd36fc0f2736754784d9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Jul 2017 09:44:51 -0500 Subject: x86/asm: Make objtool unreachable macros independent from GCC version The ASM_UNREACHABLE macro isn't GCC version-specific, so move it outside the GCC 4.5+ check. Otherwise the 0-day robot will report objtool warnings for uses of ASM_UNREACHABLE with GCC 4.4. Also move the annotate_unreachable() macro so the related macros can stay together. 
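For reference, an illustrative (hypothetical, not from this patch) use of the asm flavor, which lets an inline-asm sequence that cannot fall through carry the same annotation that annotate_unreachable() provides for C code:

    asm volatile("ud2\n\t"
                 ASM_UNREACHABLE);

Both macros emit a .long entry into the .discard.unreachable section, which objtool consumes to suppress control-flow warnings at that point.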
Reported-by: kbuild test robot Signed-off-by: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: aa5d1b81500e ("x86/asm: Add ASM_UNREACHABLE") Link: http://lkml.kernel.org/r/fb18337dbf230fd36450d9faf19a2b2533dbcba1.1500993873.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 179375b2d862..17cbe3cffc0e 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -128,6 +128,22 @@ #define __always_unused __attribute__((unused)) #define __mode(x) __attribute__((mode(x))) +#ifdef CONFIG_STACK_VALIDATION +#define annotate_unreachable() ({ \ + asm("%c0:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__LINE__)); \ +}) +#define ASM_UNREACHABLE \ + "999:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" +#else +#define annotate_unreachable() +#endif + /* gcc version specific checks */ #if GCC_VERSION < 30200 @@ -201,22 +217,6 @@ #endif #endif -#ifdef CONFIG_STACK_VALIDATION -#define annotate_unreachable() ({ \ - asm("%c0:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long %c0b - .\n\t" \ - ".popsection\n\t" : : "i" (__LINE__)); \ -}) -#define ASM_UNREACHABLE \ - "999:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" -#else -#define annotate_unreachable() -#endif - /* * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer -- cgit v1.2.3 From 931ab4a5ce2d93cb434dc7fc067a702117453b13 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Jul 2017 09:59:08 -0700 Subject: atomics: Revert addition of comment header to spin_unlock_wait() There is still considerable confusion as to the semantics of spin_unlock_wait(), but there seems to be universal agreement that it is not that of a lock/unlock pair. This commit therefore removes the comment added by 6016ffc3874d ("atomics: Add header comment so spin_unlock_wait()") in order to prevent at least that flavor of confusion. Signed-off-by: Paul E. McKenney --- include/linux/spinlock.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d9510e8522d4..59248dcc6ef3 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -369,26 +369,6 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock) raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) -/** - * spin_unlock_wait - Interpose between successive critical sections - * @lock: the spinlock whose critical sections are to be interposed. - * - * Semantically this is equivalent to a spin_lock() immediately - * followed by a spin_unlock(). However, most architectures have - * more efficient implementations in which the spin_unlock_wait() - * cannot block concurrent lock acquisition, and in some cases - * where spin_unlock_wait() does not write to the lock variable. - * Nevertheless, spin_unlock_wait() can have high overhead, so if - * you feel the need to use it, please check to see if there is - * a better way to get your job done. - * - * The ordering guarantees provided by spin_unlock_wait() are: - * - * 1. 
All accesses preceding the spin_unlock_wait() happen before - * any accesses in later critical sections for this same lock. - * 2. All accesses following the spin_unlock_wait() happen after - * any accesses in earlier critical sections for this same lock. - */ static __always_inline void spin_unlock_wait(spinlock_t *lock) { raw_spin_unlock_wait(&lock->rlock); -- cgit v1.2.3 From a58163d8ca2c8d288ee9f95989712f98473a5ac2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jun 2017 12:11:34 -0700 Subject: rcu: Migrate callbacks earlier in the CPU-offline timeline RCU callbacks must be migrated away from an outgoing CPU, and this is done near the end of the CPU-hotplug operation, after the outgoing CPU is long gone. Unfortunately, this means that other CPU-hotplug callbacks can execute while the outgoing CPU's callbacks are still immobilized on the long-gone CPU's callback lists. If any of these CPU-hotplug callbacks must wait, either directly or indirectly, for the invocation of any of the immobilized RCU callbacks, the system will hang. This commit avoids such hangs by migrating the callbacks away from the outgoing CPU immediately upon its departure, shortly after the return from __cpu_die() in takedown_cpu(). Thus, RCU is able to advance these callbacks and invoke them, which allows all the after-the-fact CPU-hotplug callbacks to wait on these RCU callbacks without risk of a hang. While in the neighborhood, this commit also moves rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() under a pre-existing #ifdef to avoid including dead code on the one hand and to avoid define-without-use warnings on the other hand. Reported-by: Jeffrey Hugo Link: http://lkml.kernel.org/r/db9c91f6-1b17-6136-84f0-03c3c2581ab4@codeaurora.org Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: Anna-Maria Gleixner Cc: Boris Ostrovsky Cc: Richard Weinberger --- include/linux/rcupdate.h | 1 + kernel/cpu.c | 1 + kernel/rcu/tree.c | 209 +++++++++++++++++++++++++---------------------- 3 files changed, 115 insertions(+), 96 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..cf307ebf345d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -110,6 +110,7 @@ void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); void rcu_cpu_starting(unsigned int cpu); +void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..bfbd649ccdc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu) __cpu_die(cpu); tick_cleanup_dead_cpu(cpu); + rcutree_migrate_callbacks(cpu); return 0; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..9bb5dff50815 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2562,85 +2562,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs do not have orphanable callbacks. 
*/ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - - /* Do the accounting first. */ - rdp->n_cbs_adopted += rsp->orphan_done.len; - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. */ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - /* * Trace the fact that this CPU is going offline. */ @@ -2704,14 +2625,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2639,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. 
*/ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - - WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || - !rcu_segcblist_empty(&rdp->cblist), - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", - cpu, rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -3937,6 +3844,116 @@ void rcu_report_dead(unsigned int cpu) for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } + +/* + * Send the specified CPU's RCU callbacks to the orphanage. The + * specified CPU must be offline, and the caller must hold the + * ->orphan_lock. + */ +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, + struct rcu_node *rnp, struct rcu_data *rdp) +{ + lockdep_assert_held(&rsp->orphan_lock); + + /* No-CBs CPUs do not have orphanable callbacks. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) + return; + + /* + * Orphan the callbacks. First adjust the counts. This is safe + * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. + */ + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); + + /* + * Next, move those callbacks still needing a grace period to + * the orphanage, where some other CPU will pick them up. + * Some of the callbacks might have gone partway through a grace + * period, but that is too bad. They get to start over because we + * cannot assume that grace periods are synchronized across CPUs. + */ + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + + /* + * Then move the ready-to-invoke callbacks to the orphanage, + * where some other CPU will pick them up. These will not be + * required to pass though another grace period: They are done. + */ + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); + + /* Finally, disallow further callbacks on this CPU. */ + rcu_segcblist_disable(&rdp->cblist); +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage. The caller must hold the ->orphan_lock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) +{ + struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + + lockdep_assert_held(&rsp->orphan_lock); + + /* No-CBs CPUs are handled specially. */ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) + return; + + /* Do the accounting first. */ + rdp->n_cbs_adopted += rsp->orphan_done.len; + if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) + rcu_idle_count_callbacks_posted(); + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); + + /* + * We do not need a memory barrier here because the only way we + * can get here if there is an rcu_barrier() in flight is if + * we are the task doing the rcu_barrier(). + */ + + /* First adopt the ready-to-invoke callbacks, then the done ones. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(rsp->orphan_done.head); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(rsp->orphan_pend.head); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); +} + +/* Orphan the dead CPU's callbacks, and then adopt them. 
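+ * Called from takedown_cpu() via rcutree_migrate_callbacks() shortly + * after __cpu_die(), so the callbacks can be advanced and invoked before + * later CPU-hotplug callbacks run.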
*/ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); + rcu_adopt_orphan_cbs(rsp, flags); + raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation. We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_migrate_callbacks(cpu, rsp); +} #endif /* -- cgit v1.2.3 From ed4676e254630c1f00a4fc8f3821890fff7e3643 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Jul 2017 15:42:46 +0530 Subject: cpufreq: Replace "max_transition_latency" with "dynamic_switching" There is no limitation in the ondemand or conservative governors that disallows a transition_latency greater than 10 ms. The max_transition_latency field is rather used to disallow automatic dynamic frequency switching for platforms which didn't want these governors to run. Replace max_transition_latency with a boolean (dynamic_switching) and check for transition_latency == CPUFREQ_ETERNAL along with that. This makes it pretty straightforward to read/understand now. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 8 ++++---- drivers/cpufreq/cpufreq_governor.h | 2 +- include/linux/cpufreq.h | 9 ++------- 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c426d21822f7..88139e5e87da 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2003,13 +2003,13 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) if (!policy->governor) return -EINVAL; - if (policy->governor->max_transition_latency && - policy->cpuinfo.transition_latency > - policy->governor->max_transition_latency) { + /* Platform doesn't want dynamic frequency switching ? */ + if (policy->governor->dynamic_switching && + policy->cpuinfo.transition_latency == CPUFREQ_ETERNAL) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); if (gov) { - pr_warn("%s governor failed, too long transition latency of HW, fallback to %s governor\n", + pr_warn("Transition latency set to CPUFREQ_ETERNAL, can't use %s governor. 
Fallback to %s governor\n", policy->governor->name, gov->name); policy->governor = gov; } else { diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 95f207eb820e..8463f5def0f5 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -159,7 +159,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy); #define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_) \ { \ .name = _name_, \ - .max_transition_latency = TRANSITION_LATENCY_LIMIT, \ + .dynamic_switching = true, \ .owner = THIS_MODULE, \ .init = cpufreq_dbs_governor_init, \ .exit = cpufreq_dbs_governor_exit, \ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index aaadfc543f63..e141dbbb9d1c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -487,12 +487,8 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, * polling frequency is 1000 times the transition latency of the processor. The * ondemand governor will work on any processor with transition latency <= 10ms, * using appropriate sampling rate. - * - * For CPUs with transition latency > 10ms (mostly drivers with CPUFREQ_ETERNAL) - * the ondemand governor will not work. All times here are in us (microseconds). */ #define LATENCY_MULTIPLIER (1000) -#define TRANSITION_LATENCY_LIMIT (10 * 1000 * 1000) struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; @@ -505,9 +501,8 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - unsigned int max_transition_latency; /* HW must be able to switch to - next freq faster than this value in nano secs or we - will fallback to performance governor */ + /* For governors which change frequency dynamically by themselves */ + bool dynamic_switching; struct list_head governor_list; struct module *owner; }; -- cgit v1.2.3 From fe829ed8ef1f3c7ac22843bd594ef2f6c4044288 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 19 Jul 2017 15:42:48 +0530 Subject: cpufreq: Add CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING cpufreq driver flag The policy->transition_latency field is used for multiple purposes today and it's not straightforward at all. This is how it is used: A. Set the correct transition_latency value. B. Set it to CPUFREQ_ETERNAL because: 1. We don't want automatic dynamic switching (with ondemand/conservative) to happen at all. 2. We don't know the transition latency. This patch handles the B.1. case in a more readable way. A new flag for the cpufreq drivers is added to disallow use of cpufreq governors which have the dynamic_switching flag set. All the current cpufreq drivers that set transition_latency unconditionally to CPUFREQ_ETERNAL are updated to use it. They don't need to set transition_latency anymore. There shouldn't be any functional change after this patch. Signed-off-by: Viresh Kumar Reviewed-by: Dominik Brodowski Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq-nforce2.c | 2 +- drivers/cpufreq/cpufreq.c | 5 +++-- drivers/cpufreq/elanfreq.c | 4 +--- drivers/cpufreq/gx-suspmod.c | 2 +- drivers/cpufreq/pmac32-cpufreq.c | 7 +++++-- drivers/cpufreq/sa1100-cpufreq.c | 5 +++-- drivers/cpufreq/sa1110-cpufreq.c | 5 +++-- drivers/cpufreq/sh-cpufreq.c | 3 +-- drivers/cpufreq/speedstep-smi.c | 2 +- drivers/cpufreq/unicore2-cpufreq.c | 3 +-- include/linux/cpufreq.h | 6 ++++++ 11 files changed, 26 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq-nforce2.c b/drivers/cpufreq/cpufreq-nforce2.c index 5503d491b016..dbf82f36d270 100644 --- a/drivers/cpufreq/cpufreq-nforce2.c +++ b/drivers/cpufreq/cpufreq-nforce2.c @@ -357,7 +357,6 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy) /* cpuinfo and default policy values */ policy->min = policy->cpuinfo.min_freq = min_fsb * fid * 100; policy->max = policy->cpuinfo.max_freq = max_fsb * fid * 100; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; return 0; } @@ -369,6 +368,7 @@ static int nforce2_cpu_exit(struct cpufreq_policy *policy) static struct cpufreq_driver nforce2_driver = { .name = "nforce2", + .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = nforce2_verify, .target = nforce2_target, .get = nforce2_get, diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 88139e5e87da..6ec589c048b2 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2005,11 +2005,12 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) /* Platform doesn't want dynamic frequency switching ? */ if (policy->governor->dynamic_switching && - policy->cpuinfo.transition_latency == CPUFREQ_ETERNAL) { + (cpufreq_driver->flags & CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING || + policy->cpuinfo.transition_latency == CPUFREQ_ETERNAL)) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); if (gov) { - pr_warn("Transition latency set to CPUFREQ_ETERNAL, can't use %s governor. Fallback to %s governor\n", + pr_warn("Can't use %s governor as dynamic switching is disallowed. 
Fallback to %s governor\n", policy->governor->name, gov->name); policy->governor = gov; } else { diff --git a/drivers/cpufreq/elanfreq.c b/drivers/cpufreq/elanfreq.c index bfce11cba1df..45e2ca62515e 100644 --- a/drivers/cpufreq/elanfreq.c +++ b/drivers/cpufreq/elanfreq.c @@ -165,9 +165,6 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) if (pos->frequency > max_freq) pos->frequency = CPUFREQ_ENTRY_INVALID; - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - return cpufreq_table_validate_and_show(policy, elanfreq_table); } @@ -196,6 +193,7 @@ __setup("elanfreq=", elanfreq_setup); static struct cpufreq_driver elanfreq_driver = { .get = elanfreq_get_cpu_frequency, + .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = cpufreq_generic_frequency_table_verify, .target_index = elanfreq_target, .init = elanfreq_cpu_init, diff --git a/drivers/cpufreq/gx-suspmod.c b/drivers/cpufreq/gx-suspmod.c index 3488c9c175eb..8f52a06664e3 100644 --- a/drivers/cpufreq/gx-suspmod.c +++ b/drivers/cpufreq/gx-suspmod.c @@ -428,7 +428,6 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) policy->max = maxfreq; policy->cpuinfo.min_freq = maxfreq / max_duration; policy->cpuinfo.max_freq = maxfreq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; return 0; } @@ -438,6 +437,7 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) * MediaGX/Geode GX initialize cpufreq driver */ static struct cpufreq_driver gx_suspmod_driver = { + .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .get = gx_get_cpuspeed, .verify = cpufreq_gx_verify, .target = cpufreq_gx_target, diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index ff44016ea031..61ae06ca008e 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -442,7 +442,8 @@ static struct cpufreq_driver pmac_cpufreq_driver = { .init = pmac_cpufreq_cpu_init, .suspend = pmac_cpufreq_suspend, .resume = pmac_cpufreq_resume, - .flags = CPUFREQ_PM_NO_WARN, + .flags = CPUFREQ_PM_NO_WARN | + CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .attr = cpufreq_generic_attr, .name = "powermac", }; @@ -626,14 +627,16 @@ static int __init pmac_cpufreq_setup(void) if (!value) goto out; cur_freq = (*value) / 1000; - transition_latency = CPUFREQ_ETERNAL; /* Check for 7447A based MacRISC3 */ if (of_machine_is_compatible("MacRISC3") && of_get_property(cpunode, "dynamic-power-step", NULL) && PVR_VER(mfspr(SPRN_PVR)) == 0x8003) { pmac_cpufreq_init_7447A(cpunode); + + /* Allow dynamic switching */ transition_latency = 8000000; + pmac_cpufreq_driver.flags &= ~CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING; /* Check for other MacRISC3 machines */ } else if (of_machine_is_compatible("PowerBook3,4") || of_machine_is_compatible("PowerBook3,5") || diff --git a/drivers/cpufreq/sa1100-cpufreq.c b/drivers/cpufreq/sa1100-cpufreq.c index 728eab77e8e0..e2d8a77c36d5 100644 --- a/drivers/cpufreq/sa1100-cpufreq.c +++ b/drivers/cpufreq/sa1100-cpufreq.c @@ -197,11 +197,12 @@ static int sa1100_target(struct cpufreq_policy *policy, unsigned int ppcr) static int __init sa1100_cpu_init(struct cpufreq_policy *policy) { - return cpufreq_generic_init(policy, sa11x0_freq_table, CPUFREQ_ETERNAL); + return cpufreq_generic_init(policy, sa11x0_freq_table, 0); } static struct cpufreq_driver sa1100_driver __refdata = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = 
cpufreq_generic_frequency_table_verify, .target_index = sa1100_target, .get = sa11x0_getspeed, diff --git a/drivers/cpufreq/sa1110-cpufreq.c b/drivers/cpufreq/sa1110-cpufreq.c index 2bac9b6cfeea..66e5fb088ecc 100644 --- a/drivers/cpufreq/sa1110-cpufreq.c +++ b/drivers/cpufreq/sa1110-cpufreq.c @@ -306,13 +306,14 @@ static int sa1110_target(struct cpufreq_policy *policy, unsigned int ppcr) static int __init sa1110_cpu_init(struct cpufreq_policy *policy) { - return cpufreq_generic_init(policy, sa11x0_freq_table, CPUFREQ_ETERNAL); + return cpufreq_generic_init(policy, sa11x0_freq_table, 0); } /* sa1110_driver needs __refdata because it must remain after init registers * it with cpufreq_register_driver() */ static struct cpufreq_driver sa1110_driver __refdata = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = cpufreq_generic_frequency_table_verify, .target_index = sa1110_target, .get = sa11x0_getspeed, diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c index 719c3d9f07fb..28893d435cf5 100644 --- a/drivers/cpufreq/sh-cpufreq.c +++ b/drivers/cpufreq/sh-cpufreq.c @@ -137,8 +137,6 @@ static int sh_cpufreq_cpu_init(struct cpufreq_policy *policy) (clk_round_rate(cpuclk, ~0UL) + 500) / 1000; } - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - dev_info(dev, "CPU Frequencies - Minimum %u.%03u MHz, " "Maximum %u.%03u MHz.\n", policy->min / 1000, policy->min % 1000, @@ -159,6 +157,7 @@ static int sh_cpufreq_cpu_exit(struct cpufreq_policy *policy) static struct cpufreq_driver sh_cpufreq_driver = { .name = "sh", + .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .get = sh_cpufreq_get, .target = sh_cpufreq_target, .verify = sh_cpufreq_verify, diff --git a/drivers/cpufreq/speedstep-smi.c b/drivers/cpufreq/speedstep-smi.c index 37b30071c220..d23f24ccff38 100644 --- a/drivers/cpufreq/speedstep-smi.c +++ b/drivers/cpufreq/speedstep-smi.c @@ -266,7 +266,6 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) pr_debug("workaround worked.\n"); } - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; return cpufreq_table_validate_and_show(policy, speedstep_freqs); } @@ -290,6 +289,7 @@ static int speedstep_resume(struct cpufreq_policy *policy) static struct cpufreq_driver speedstep_driver = { .name = "speedstep-smi", + .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = cpufreq_generic_frequency_table_verify, .target_index = speedstep_target, .init = speedstep_cpu_init, diff --git a/drivers/cpufreq/unicore2-cpufreq.c b/drivers/cpufreq/unicore2-cpufreq.c index 6f9dfa80563a..db62d9844751 100644 --- a/drivers/cpufreq/unicore2-cpufreq.c +++ b/drivers/cpufreq/unicore2-cpufreq.c @@ -58,13 +58,12 @@ static int __init ucv2_cpu_init(struct cpufreq_policy *policy) policy->min = policy->cpuinfo.min_freq = 250000; policy->max = policy->cpuinfo.max_freq = 1000000; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; policy->clk = clk_get(NULL, "MAIN_CLK"); return PTR_ERR_OR_ZERO(policy->clk); } static struct cpufreq_driver ucv2_driver = { - .flags = CPUFREQ_STICKY, + .flags = CPUFREQ_STICKY | CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = ucv2_verify_speed, .target = ucv2_target, .get = cpufreq_generic_get, diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index e141dbbb9d1c..5f40522ec98c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -370,6 +370,12 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * 
Set by drivers to disallow use of governors with "dynamic_switching" flag + * set. + */ +#define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); -- cgit v1.2.3 From ebae3e830a991116526646f09dd67f68f332b330 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 5 Jul 2017 20:27:53 +0300 Subject: iommu: Correct iommu_map / iommu_unmap prototypes Commit 7d3002cc8c16 ("iommu/core: split mapping to page sizes as supported by the hardware") replaced the 'int gfp_order' argument of iommu_map / iommu_unmap with a 'size_t size', but missed the function prototypes for the disabled CONFIG_IOMMU_API case; let's correct them for consistency. Signed-off-by: Dmitry Osipenko Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2cb54adc4a33..f1ce8e517d8d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -425,13 +425,13 @@ static inline struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) } static inline int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, int gfp_order, int prot) + phys_addr_t paddr, size_t size, int prot) { return -ENODEV; } static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova, - int gfp_order) + size_t size) { return -ENODEV; } -- cgit v1.2.3 From f06e8c584fa0d05312c11ea66194f3d2efb93c21 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 22 Jun 2017 16:14:17 +0200 Subject: kasan: Allow kasan_check_read/write() to accept pointers to volatiles Currently kasan_check_read/write() accept 'const void*'; make them accept 'const volatile void*' instead. This is required for instrumentation of atomic operations, and there is no reason not to allow that.
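As an aside, and not part of this patch, here is a minimal sketch of the kind of instrumented accessor the relaxed prototype enables; the wrapper name and usage are hypothetical:

#include <linux/compiler.h>	/* READ_ONCE() */
#include <linux/kasan-checks.h>	/* kasan_check_read() */

/*
 * Hypothetical instrumented read: 'p' is volatile-qualified, as pointers
 * typically are in atomic/READ_ONCE() instrumentation. With the old
 * 'const void *' prototype this call would warn about discarding the
 * volatile qualifier; with 'const volatile void *' it compiles cleanly.
 */
static inline int instrumented_read(const volatile int *p)
{
	kasan_check_read(p, sizeof(*p));
	return READ_ONCE(*p);
}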
Signed-off-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Acked-by: Mark Rutland Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kasan-dev@googlegroups.com Cc: linux-mm@kvack.org Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/33e5ec275c1ee89299245b2ebbccd63709c6021f.1498140838.git.dvyukov@google.com Signed-off-by: Ingo Molnar --- include/linux/kasan-checks.h | 10 ++++++---- mm/kasan/kasan.c | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index b7f8aced7870..41960fecf783 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,11 +2,13 @@ #define _LINUX_KASAN_CHECKS_H #ifdef CONFIG_KASAN -void kasan_check_read(const void *p, unsigned int size); -void kasan_check_write(const void *p, unsigned int size); +void kasan_check_read(const volatile void *p, unsigned int size); +void kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void kasan_check_read(const void *p, unsigned int size) { } -static inline void kasan_check_write(const void *p, unsigned int size) { } +static inline void kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void kasan_check_write(const volatile void *p, unsigned int size) +{ } #endif #endif diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index ca11bc4ce205..6f319fb81718 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } -void kasan_check_read(const void *p, unsigned int size) +void kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(kasan_check_read); -void kasan_check_write(const void *p, unsigned int size) +void kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } -- cgit v1.2.3 From ee9f8fce99640811b2b8e79d0d1dbe8bab69ba67 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 24 Jul 2017 18:36:57 -0500 Subject: x86/unwind: Add the ORC unwinder Add the new ORC unwinder which is enabled by CONFIG_ORC_UNWINDER=y. It plugs into the existing x86 unwinder framework. It relies on objtool to generate the needed .orc_unwind and .orc_unwind_ip sections. For more details on why ORC is used instead of DWARF, see Documentation/x86/orc-unwinder.txt - but the short version is that it's a simplified, fundamentally more robust debuginfo data structure, which also allows up to two orders of magnitude faster lookups than the DWARF unwinder - which matters to profiling workloads like perf. Thanks to Andy Lutomirski for the performance improvement ideas: splitting the ORC unwind table into two parallel arrays and creating a fast lookup table to search a subset of the unwind table. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/0a6cbfb40f8da99b7a45a1a8302dc6aef16ec812.1500938583.git.jpoimboe@redhat.com [ Extended the changelog. 
] Signed-off-by: Ingo Molnar --- Documentation/x86/orc-unwinder.txt | 179 ++++++++++++ arch/um/include/asm/unwind.h | 8 + arch/x86/Kconfig | 1 + arch/x86/Kconfig.debug | 25 ++ arch/x86/include/asm/module.h | 9 + arch/x86/include/asm/orc_lookup.h | 46 +++ arch/x86/include/asm/orc_types.h | 2 +- arch/x86/include/asm/unwind.h | 76 +++-- arch/x86/kernel/Makefile | 8 +- arch/x86/kernel/module.c | 11 +- arch/x86/kernel/setup.c | 3 + arch/x86/kernel/unwind_frame.c | 39 +-- arch/x86/kernel/unwind_guess.c | 5 + arch/x86/kernel/unwind_orc.c | 582 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 3 + include/asm-generic/vmlinux.lds.h | 27 +- lib/Kconfig.debug | 3 + scripts/Makefile.build | 14 +- 18 files changed, 977 insertions(+), 64 deletions(-) create mode 100644 Documentation/x86/orc-unwinder.txt create mode 100644 arch/um/include/asm/unwind.h create mode 100644 arch/x86/include/asm/orc_lookup.h create mode 100644 arch/x86/kernel/unwind_orc.c (limited to 'include') diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt new file mode 100644 index 000000000000..af0c9a4c65a6 --- /dev/null +++ b/Documentation/x86/orc-unwinder.txt @@ -0,0 +1,179 @@ +ORC unwinder +============ + +Overview +-------- + +The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is +similar in concept to a DWARF unwinder. The difference is that the +format of the ORC data is much simpler than DWARF, which in turn allows +the ORC unwinder to be much simpler and faster. + +The ORC data consists of unwind tables which are generated by objtool. +They contain out-of-band data which is used by the in-kernel ORC +unwinder. Objtool generates the ORC data by first doing compile-time +stack metadata validation (CONFIG_STACK_VALIDATION). After analyzing +all the code paths of a .o file, it determines information about the +stack state at each instruction address in the file and outputs that +information to the .orc_unwind and .orc_unwind_ip sections. + +The per-object ORC sections are combined at link time and are sorted and +post-processed at boot time. The unwinder uses the resulting data to +correlate instruction addresses with their stack states at run time. + + +ORC vs frame pointers +--------------------- + +With frame pointers enabled, GCC adds instrumentation code to every +function in the kernel. The kernel's .text size increases by about +3.2%, resulting in a broad kernel-wide slowdown. Measurements by Mel +Gorman [1] have shown a slowdown of 5-10% for some workloads. + +In contrast, the ORC unwinder has no effect on text size or runtime +performance, because the debuginfo is out of band. So if you disable +frame pointers and enable the ORC unwinder, you get a nice performance +improvement across the board, and still have reliable stack traces. + +Ingo Molnar says: + + "Note that it's not just a performance improvement, but also an + instruction cache locality improvement: 3.2% .text savings almost + directly transform into a similarly sized reduction in cache + footprint. That can transform to even higher speedups for workloads + whose cache locality is borderline." + +Another benefit of ORC compared to frame pointers is that it can +reliably unwind across interrupts and exceptions. Frame pointer based +unwinds can sometimes skip the caller of the interrupted function, if it +was a leaf function or if the interrupt hit before the frame pointer was +saved. 
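To make the table-driven scheme from the overview concrete, a single unwind step for an ordinary call frame looks roughly like the sketch below (an editorial illustration, not part of the patch, assuming an entry whose previous stack pointer is SP-based; it mirrors the unwind_next_frame() logic added later in this patch):

	/* Sketch: assumes orc->sp_reg == ORC_REG_SP and orc->type == ORC_TYPE_CALL. */
	struct orc_entry *orc = orc_find(state->ip);		/* look up by text address */
	unsigned long prev_sp = state->sp + orc->sp_offset;	/* the caller's stack pointer */

	state->ip = *(unsigned long *)(prev_sp - sizeof(long));	/* saved return address */
	state->sp = prev_sp;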
+ +The main disadvantage of the ORC unwinder compared to frame pointers is +that it needs more memory to store the ORC unwind tables: roughly 2-4MB +depending on the kernel config. + + +ORC vs DWARF +------------ + +ORC debuginfo's advantage over DWARF itself is that it's much simpler. +It gets rid of the complex DWARF CFI state machine and also gets rid of +the tracking of unnecessary registers. This allows the unwinder to be +much simpler, meaning fewer bugs, which is especially important for +mission-critical oops code. + +The simpler debuginfo format also enables the unwinder to be much faster +than DWARF, which is important for perf and lockdep. In a basic +performance test by Jiri Slaby [2], the ORC unwinder was about 20x +faster than an out-of-tree DWARF unwinder. (Note: That measurement was +taken before some performance tweaks were added, which doubled +performance, so the speedup over DWARF may be closer to 40x.) + +The ORC data format does have a few downsides compared to DWARF. ORC +unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel) +than DWARF-based eh_frame tables. + +Another potential downside is that, as GCC evolves, it's conceivable +that the ORC data may end up being *too* simple to describe the state of +the stack for certain optimizations. But IMO this is unlikely because +GCC saves the frame pointer for any unusual stack adjustments it does, +so I suspect we'll really only ever need to keep track of the stack +pointer and the frame pointer between call frames. But even if we do +end up having to track all the registers DWARF tracks, at least we will +still be able to control the format, e.g. no complex state machines. + + +ORC unwind table generation +--------------------------- + +The ORC data is generated by objtool. With the existing compile-time +stack metadata validation feature, objtool already follows all code +paths, and so it already has all the information it needs to be able to +generate ORC data from scratch. So it's an easy step to go from stack +validation to ORC data generation. + +It should be possible to instead generate the ORC data with a simple +tool which converts DWARF to ORC data. However, such a solution would +be incomplete due to the kernel's extensive use of asm, inline asm, and +special sections like exception tables. + +That could be rectified by manually annotating those special code paths +using GNU assembler .cfi annotations in .S files, and homegrown +annotations for inline asm in .c files. But asm annotations were tried +in the past and were found to be unmaintainable. They were often +incorrect/incomplete and made the code harder to read and keep updated. +And based on looking at glibc code, annotating inline asm in .c files +might be even worse. + +Objtool still needs a few annotations, but only in code which does +unusual things to the stack like entry code. And even then, far fewer +annotations are needed than what DWARF would need, so they're much more +maintainable than DWARF CFI annotations. + +So the advantages of using objtool to generate ORC data are that it +gives more accurate debuginfo, with very few annotations. It also +insulates the kernel from toolchain bugs which can be very painful to +deal with in the kernel since we often have to work around issues in +older versions of the toolchain for years. + +The downside is that the unwinder now becomes dependent on objtool's +ability to reverse engineer GCC code flow. 
If GCC optimizations become +too complicated for objtool to follow, the ORC data generation might +stop working or become incomplete. (It's worth noting that livepatch +already has such a dependency on objtool's ability to follow GCC code +flow.) + +If newer versions of GCC come up with some optimizations which break +objtool, we may need to revisit the current implementation. Some +possible solutions would be asking GCC to make the optimizations more +palatable, or having objtool use DWARF as an additional input, or +creating a GCC plugin to assist objtool with its analysis. But for now, +objtool follows GCC code quite well. + + +Unwinder implementation details +------------------------------- + +Objtool generates the ORC data by integrating with the compile-time +stack metadata validation feature, which is described in detail in +tools/objtool/Documentation/stack-validation.txt. After analyzing all +the code paths of a .o file, it creates an array of orc_entry structs, +and a parallel array of instruction addresses associated with those +structs, and writes them to the .orc_unwind and .orc_unwind_ip sections +respectively. + +The ORC data is split into the two arrays for performance reasons, to +make the searchable part of the data (.orc_unwind_ip) more compact. The +arrays are sorted in parallel at boot time. + +Performance is further improved by the use of a fast lookup table which +is created at runtime. The fast lookup table associates a given address +with a range of indices for the .orc_unwind table, so that only a small +subset of the table needs to be searched. + + +Etymology +--------- + +Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural +enemies. Similarly, the ORC unwinder was created in opposition to the +complexity and slowness of DWARF. + +"Although Orcs rarely consider multiple solutions to a problem, they do +excel at getting things done because they are creatures of action, not +thought." [3] Similarly, unlike the esoteric DWARF unwinder, the +veracious ORC unwinder wastes no time or siloconic effort decoding +variable-length zero-extended unsigned-integer byte-coded +state-machine-based debug information entries. + +Similar to how Orcs frequently unravel the well-intentioned plans of +their adversaries, the ORC unwinder frequently unravels stacks with +brutal, unyielding efficiency. + +ORC stands for Oops Rewind Capability. 
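As a worked example of the fast lookup described in the implementation details above (an editorial sketch, not part of the patch, using the 256-byte block size chosen in orc_lookup.h below), an instruction 0x1234 bytes into .text is resolved roughly as follows:

	unsigned int idx   = 0x1234 / LOOKUP_BLOCK_SIZE;	/* 4660 / 256 = 18 */
	unsigned int start = orc_lookup[idx];			/* first candidate entry */
	unsigned int stop  = orc_lookup[idx + 1] + 1;		/* one past the last candidate */

	/* Binary-search only .orc_unwind_ip[start..stop) instead of the whole table. */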
+ + +[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de +[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz +[3] http://dustin.wikidot.com/half-orcs-and-orcs diff --git a/arch/um/include/asm/unwind.h b/arch/um/include/asm/unwind.h new file mode 100644 index 000000000000..7ffa5437b761 --- /dev/null +++ b/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..7ccf26a427cb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -157,6 +157,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 353ed09f5fba..dc10ec6ead6e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -355,4 +355,29 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It can also enable a 5-10% performance + improvement across the entire kernel if CONFIG_FRAME_POINTER is + disabled. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config FRAME_POINTER_UNWINDER + def_bool y + depends on !ORC_UNWINDER && FRAME_POINTER + +config GUESS_UNWINDER + def_bool y + depends on !ORC_UNWINDER && !FRAME_POINTER + endmenu diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..9eb7c718aaf8 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include +#include + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 7dc777a6cb40..9c9dc579bd7d 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -88,7 +88,7 @@ struct orc_entry { unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; -}; +} __packed; /* * This struct is used by asm and inline asm code to manually annotate the diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..25b8d31a007d 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,11 +12,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +27,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? 
&state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +58,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a01892bdd61a..287eac7d207f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -126,11 +126,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..62e7d70aadd5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include #include #include +#include #if 0 #define DEBUGP(fmt, ...) 
\ @@ -213,7 +214,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d0498800..ecab32282f0f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -115,6 +115,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1310,6 +1311,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..7574ef5f16ec 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -66,15 +68,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? 
state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..570b70d3f604 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,582 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define orc_warn(fmt, ...) \ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. 
+ */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%lx\n", + idx, lookup_num_blocks, ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%lx\n", + idx, lookup_num_blocks, start, stop, ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. 
+ */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? 
SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)®s->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ? state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %p\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %p\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %p\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + 
ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d\n", orc->type); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %p\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? ip=%p\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..f05f00acac89 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -148,6 +149,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . 
= ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index da0be9a8d1de..fffc9bdae025 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -680,6 +680,31 @@ #define BUG_TABLE #endif +#ifdef CONFIG_ORC_UNWINDER +#define ORC_UNWIND_TABLE \ + . = ALIGN(4); \ + .orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_orc_unwind_ip) = .; \ + KEEP(*(.orc_unwind_ip)) \ + VMLINUX_SYMBOL(__stop_orc_unwind_ip) = .; \ + } \ + . = ALIGN(6); \ + .orc_unwind : AT(ADDR(.orc_unwind) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_orc_unwind) = .; \ + KEEP(*(.orc_unwind)) \ + VMLINUX_SYMBOL(__stop_orc_unwind) = .; \ + } \ + . = ALIGN(4); \ + .orc_lookup : AT(ADDR(.orc_lookup) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(orc_lookup) = .; \ + . += (((SIZEOF(.text) + LOOKUP_BLOCK_SIZE - 1) / \ + LOOKUP_BLOCK_SIZE) + 1) * 4; \ + VMLINUX_SYMBOL(orc_lookup_end) = .; \ + } +#else +#define ORC_UNWIND_TABLE +#endif + #ifdef CONFIG_PM_TRACE #define TRACEDATA \ . = ALIGN(4); \ @@ -866,7 +891,7 @@ DATA_DATA \ CONSTRUCTORS \ } \ - BUG_TABLE + BUG_TABLE \ #define INIT_TEXT_SECTION(inittext_align) \ . = ALIGN(inittext_align); \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..0f0d019ffb99 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -374,6 +374,9 @@ config STACK_VALIDATION pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure that runtime stack traces are more reliable. + This is also a prerequisite for generation of ORC unwind data, which + is needed for CONFIG_ORC_UNWINDER. + For more information, see tools/objtool/Documentation/stack-validation.txt. diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 854608d42e85..80ed14809a05 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -258,7 +258,8 @@ ifneq ($(SKIP_STACK_VALIDATION),1) __objtool_obj := $(objtree)/tools/objtool/objtool -objtool_args = check +objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check) + ifndef CONFIG_FRAME_POINTER objtool_args += --no-fp endif @@ -279,6 +280,11 @@ objtool_obj = $(if $(patsubst y%,, \ endif # SKIP_STACK_VALIDATION endif # CONFIG_STACK_VALIDATION +# Rebuild all objects when objtool changes, or is enabled/disabled. 
+objtool_dep = $(objtool_obj) \ + $(wildcard include/config/orc/unwinder.h \ + include/config/stack/validation.h) + define rule_cc_o_c $(call echo-cmd,checksrc) $(cmd_checksrc) \ $(call cmd_and_fixdep,cc_o_c) \ @@ -301,13 +307,13 @@ cmd_undef_syms = echo endif # Built-in and composite module parts -$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE +$(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) # Single-part modules are special since we need to mark them in $(MODVERDIR) -$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_obj) FORCE +$(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) $(objtool_dep) FORCE $(call cmd,force_checksrc) $(call if_changed_rule,cc_o_c) @{ echo $(@:.o=.ko); echo $@; \ @@ -402,7 +408,7 @@ cmd_modversions_S = \ endif endif -$(obj)/%.o: $(src)/%.S $(objtool_obj) FORCE +$(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE $(call if_changed_rule,as_o_S) targets += $(real-objs-y) $(real-objs-m) $(lib-y) -- cgit v1.2.3 From a7e1149b64091ac87dfb18b177abb63ba4cbbd3d Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 26 Jul 2017 05:17:08 +0000 Subject: ASoC: remove cache_bypass from snd_soc_codec The code operating on snd_soc_codec's .cache_bypass field has already been removed. Let's remove the remaining code. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 1 - sound/soc/codecs/max98927.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 9c94b97c17f8..10b71beb9f13 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -894,7 +894,6 @@ struct snd_soc_codec { struct list_head list; /* runtime */ - unsigned int cache_bypass:1; /* Suppress access to the cache */ unsigned int cache_init:1; /* codec cache has been initialized */ /* codec IO */ diff --git a/sound/soc/codecs/max98927.c b/sound/soc/codecs/max98927.c index b5ee29499e16..b0380b560bf5 100644 --- a/sound/soc/codecs/max98927.c +++ b/sound/soc/codecs/max98927.c @@ -577,7 +577,6 @@ static int max98927_probe(struct snd_soc_codec *codec) max98927->codec = codec; codec->control_data = max98927->regmap; - codec->cache_bypass = 1; /* Software Reset */ regmap_write(max98927->regmap, -- cgit v1.2.3 From 85f7ff9702bcc5e899bd0bf6b6e383ecb2ac436a Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 2 Jun 2017 05:30:02 -0400 Subject: media: v4l2-flash: Use led_classdev instead of led_classdev_flash for indicator The V4L2 flash class initialisation expects a struct led_classdev_flash that describes an indicator, but only ever uses the struct led_classdev embedded in it. Use a plain struct led_classdev for the iled_cdev field instead. 
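For illustration only (not part of this patch; all variable names below are hypothetical driver-local state), a caller now passes the indicator's plain class device directly:

	struct v4l2_flash *vf;

	/* 'my_ind_led' is an assumed struct led_classdev in the driver's data. */
	vf = v4l2_flash_init(dev, fwnode, fled_cdev,
			     &my_ind_led,	/* struct led_classdev *, not led_classdev_flash * */
			     &ops, &cfg);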
Signed-off-by: Sakari Ailus Reviewed-by: Jacek Anaszewski Reviewed-by: Sebastian Reichel Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-flash-led-class.c | 19 +++++++------------ drivers/staging/greybus/light.c | 4 ++-- include/media/v4l2-flash-led-class.h | 6 +++--- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-flash-led-class.c b/drivers/media/v4l2-core/v4l2-flash-led-class.c index 7b8288108e8a..6d69119ff097 100644 --- a/drivers/media/v4l2-core/v4l2-flash-led-class.c +++ b/drivers/media/v4l2-core/v4l2-flash-led-class.c @@ -110,7 +110,7 @@ static void v4l2_flash_set_led_brightness(struct v4l2_flash *v4l2_flash, led_set_brightness_sync(&v4l2_flash->fled_cdev->led_cdev, brightness); } else { - led_set_brightness_sync(&v4l2_flash->iled_cdev->led_cdev, + led_set_brightness_sync(v4l2_flash->iled_cdev, brightness); } } @@ -133,7 +133,7 @@ static int v4l2_flash_update_led_brightness(struct v4l2_flash *v4l2_flash, return 0; led_cdev = &v4l2_flash->fled_cdev->led_cdev; } else { - led_cdev = &v4l2_flash->iled_cdev->led_cdev; + led_cdev = v4l2_flash->iled_cdev; } ret = led_update_brightness(led_cdev); @@ -529,8 +529,7 @@ static int v4l2_flash_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) struct v4l2_flash *v4l2_flash = v4l2_subdev_to_v4l2_flash(sd); struct led_classdev_flash *fled_cdev = v4l2_flash->fled_cdev; struct led_classdev *led_cdev = &fled_cdev->led_cdev; - struct led_classdev_flash *iled_cdev = v4l2_flash->iled_cdev; - struct led_classdev *led_cdev_ind = NULL; + struct led_classdev *led_cdev_ind = v4l2_flash->iled_cdev; int ret = 0; if (!v4l2_fh_is_singular(&fh->vfh)) @@ -543,9 +542,7 @@ static int v4l2_flash_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) mutex_unlock(&led_cdev->led_access); - if (iled_cdev) { - led_cdev_ind = &iled_cdev->led_cdev; - + if (led_cdev_ind) { mutex_lock(&led_cdev_ind->led_access); led_sysfs_disable(led_cdev_ind); @@ -578,7 +575,7 @@ static int v4l2_flash_close(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) struct v4l2_flash *v4l2_flash = v4l2_subdev_to_v4l2_flash(sd); struct led_classdev_flash *fled_cdev = v4l2_flash->fled_cdev; struct led_classdev *led_cdev = &fled_cdev->led_cdev; - struct led_classdev_flash *iled_cdev = v4l2_flash->iled_cdev; + struct led_classdev *led_cdev_ind = v4l2_flash->iled_cdev; int ret = 0; if (!v4l2_fh_is_singular(&fh->vfh)) @@ -593,9 +590,7 @@ static int v4l2_flash_close(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) mutex_unlock(&led_cdev->led_access); - if (iled_cdev) { - struct led_classdev *led_cdev_ind = &iled_cdev->led_cdev; - + if (led_cdev_ind) { mutex_lock(&led_cdev_ind->led_access); led_sysfs_enable(led_cdev_ind); mutex_unlock(&led_cdev_ind->led_access); @@ -614,7 +609,7 @@ static const struct v4l2_subdev_ops v4l2_flash_subdev_ops; struct v4l2_flash *v4l2_flash_init( struct device *dev, struct fwnode_handle *fwn, struct led_classdev_flash *fled_cdev, - struct led_classdev_flash *iled_cdev, + struct led_classdev *iled_cdev, const struct v4l2_flash_ops *ops, struct v4l2_flash_config *config) { diff --git a/drivers/staging/greybus/light.c b/drivers/staging/greybus/light.c index 861a249e6ef1..129ceed39829 100644 --- a/drivers/staging/greybus/light.c +++ b/drivers/staging/greybus/light.c @@ -536,7 +536,7 @@ static int gb_lights_light_v4l2_register(struct gb_light *light) struct device *dev = &connection->bundle->dev; struct v4l2_flash_config *sd_cfg; struct led_classdev_flash *fled; - struct led_classdev_flash 
*iled = NULL; + struct led_classdev *iled = NULL; struct gb_channel *channel_torch, *channel_ind, *channel_flash; int ret = 0; @@ -553,7 +553,7 @@ static int gb_lights_light_v4l2_register(struct gb_light *light) if (channel_ind) { __gb_lights_channel_v4l2_config(&channel_ind->intensity_uA, &sd_cfg->indicator_intensity); - iled = &channel_ind->fled; + iled = &channel_ind->fled.led_cdev; } channel_flash = get_channel_from_mode(light, GB_CHANNEL_MODE_FLASH); diff --git a/include/media/v4l2-flash-led-class.h b/include/media/v4l2-flash-led-class.h index f9dcd54c1745..54e31a805a88 100644 --- a/include/media/v4l2-flash-led-class.h +++ b/include/media/v4l2-flash-led-class.h @@ -85,7 +85,7 @@ struct v4l2_flash_config { */ struct v4l2_flash { struct led_classdev_flash *fled_cdev; - struct led_classdev_flash *iled_cdev; + struct led_classdev *iled_cdev; const struct v4l2_flash_ops *ops; struct v4l2_subdev sd; @@ -124,7 +124,7 @@ static inline struct v4l2_flash *v4l2_ctrl_to_v4l2_flash(struct v4l2_ctrl *c) struct v4l2_flash *v4l2_flash_init( struct device *dev, struct fwnode_handle *fwn, struct led_classdev_flash *fled_cdev, - struct led_classdev_flash *iled_cdev, + struct led_classdev *iled_cdev, const struct v4l2_flash_ops *ops, struct v4l2_flash_config *config); @@ -140,7 +140,7 @@ void v4l2_flash_release(struct v4l2_flash *v4l2_flash); static inline struct v4l2_flash *v4l2_flash_init( struct device *dev, struct fwnode_handle *fwn, struct led_classdev_flash *fled_cdev, - struct led_classdev_flash *iled_cdev, + struct led_classdev *iled_cdev, const struct v4l2_flash_ops *ops, struct v4l2_flash_config *config) { -- cgit v1.2.3 From d2f3c3849461baefdbb39123abde1054d46bf22e Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:09 -0400 Subject: percpu: increase minimum percpu allocation size and align first regions This patch increases the minimum allocation size of percpu memory to 4 bytes. This change will help minimize the metadata overhead associated with the bitmap allocator. The assumption is that most allocations will be of objects or structs greater than 2 bytes with integers or longs being used rather than shorts. The first chunk regions are now aligned with the minimum allocation size. The reserved region is expected to be set as a multiple of the minimum allocation size. The static region is aligned up and the delta is removed from the dynamic size. This works because the dynamic size is increased to be page aligned. If the static size is not minimum allocation size aligned, then there must be a gap that is added to the dynamic size. The dynamic size will never be smaller than the set value. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 4 ++++ mm/percpu.c | 27 ++++++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 491b3f5a5f8a..90e0cb0f7802 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -21,6 +21,10 @@ /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) +/* minimum allocation size and shift in bytes */ +#define PCPU_MIN_ALLOC_SHIFT 2 +#define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) + /* * Percpu allocator can serve percpu allocations before slab is * initialized which allows slab to depend on the percpu allocator.
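Before the mm/percpu.c hunk that follows, a worked example of the alignment arithmetic described above may help (the byte counts are illustrative; the expressions mirror the patch):

	/* Suppose an architecture lays out 4093 bytes of static percpu data.
	 * With PCPU_MIN_ALLOC_SIZE == 4:
	 *   static_size = ALIGN(4093, 4) == 4096   (a 3-byte gap ends the static region)
	 *   dyn_size    = ai->dyn_size - (4096 - 4093)
	 * The dynamic region gives up those 3 bytes, which is safe because
	 * ai->dyn_size was previously rounded up for page alignment, so it
	 * never drops below the configured minimum. */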
diff --git a/mm/percpu.c b/mm/percpu.c index 657ab0821cf0..dc755721c333 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -956,10 +956,10 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, * We want the lowest bit of offset available for in-use/free * indicator, so force >= 16bit alignment and make size even. */ - if (unlikely(align < 2)) - align = 2; + if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) + align = PCPU_MIN_ALLOC_SIZE; - size = ALIGN(size, 2); + size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { @@ -1653,6 +1653,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + size_t static_size, dyn_size; struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; @@ -1686,6 +1687,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!ai->dyn_size); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ @@ -1763,6 +1765,17 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); + /* + * The end of the static region needs to be aligned with the + * minimum allocation size as this offsets the reserved and + * dynamic region. The first chunk ends page aligned by + * expanding the dynamic region, therefore the dynamic region + * can be shrunk to compensate while still staying above the + * configured sizes. + */ + static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); + dyn_size = ai->dyn_size - (static_size - ai->static_size); + /* * Initialize first chunk. * If the reserved_size is non-zero, this initializes the reserved @@ -1771,8 +1784,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * pcpu_first_chunk, will always point to the chunk that serves * the dynamic region. */ - tmp_addr = (unsigned long)base_addr + ai->static_size; - map_size = ai->reserved_size ?: ai->dyn_size; + tmp_addr = (unsigned long)base_addr + static_size; + map_size = ai->reserved_size ?: dyn_size; chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, smap, ARRAY_SIZE(smap)); @@ -1780,9 +1793,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (ai->reserved_size) { pcpu_reserved_chunk = chunk; - tmp_addr = (unsigned long)base_addr + ai->static_size + + tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; - map_size = ai->dyn_size; + map_size = dyn_size; chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, dmap, ARRAY_SIZE(dmap)); } -- cgit v1.2.3 From 3acdfd280fe7d807237f2cb7a09d6f8f7f1b484f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2017 06:22:15 -0400 Subject: errseq: rename __errseq_set to errseq_set Nothing calls this wrapper anymore, so just remove it and rename the old function to get rid of the double underscore prefix. 
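The header below shows the whole errseq API, so a short usage sketch may be useful (a hedged example: the mapping_err variable and the error value are invented). A writer records failures with errseq_set(), while each reader holds a private cursor from errseq_sample() and later asks whether anything new happened:

	errseq_t mapping_err;	/* shared state, e.g. per address_space */

	errseq_t cursor = errseq_sample(&mapping_err);	/* reader opens a file */

	errseq_set(&mapping_err, -EIO);		/* writer hits an I/O error */

	if (errseq_check(&mapping_err, cursor))	/* reader, e.g. at fsync time */
		/* report it, or consume it with errseq_check_and_advance() */;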
Signed-off-by: Jeff Layton --- include/linux/errseq.h | 9 +-------- lib/errseq.c | 17 +++++++---------- mm/filemap.c | 2 +- 3 files changed, 9 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/errseq.h b/include/linux/errseq.h index 9e0d444ac88d..784f0860527b 100644 --- a/include/linux/errseq.h +++ b/include/linux/errseq.h @@ -5,14 +5,7 @@ typedef u32 errseq_t; -errseq_t __errseq_set(errseq_t *eseq, int err); -static inline void errseq_set(errseq_t *eseq, int err) -{ - /* Optimize for the common case of no error */ - if (unlikely(err)) - __errseq_set(eseq, err); -} - +errseq_t errseq_set(errseq_t *eseq, int err); errseq_t errseq_sample(errseq_t *eseq); int errseq_check(errseq_t *eseq, errseq_t since); int errseq_check_and_advance(errseq_t *eseq, errseq_t *since); diff --git a/lib/errseq.c b/lib/errseq.c index 841fa24e6e00..7b900c2a277a 100644 --- a/lib/errseq.c +++ b/lib/errseq.c @@ -41,23 +41,20 @@ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** - * __errseq_set - set a errseq_t for later reporting + * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set - * @err: error to set + * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in *eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * - * Most callers will want to use the errseq_set inline wrapper to efficiently - * handle the common case where err is 0. - * - * We do return an errseq_t here, primarily for debugging purposes. The return - * value should not be used as a previously sampled value in later calls as it - * will not have the SEEN flag set. + * We do return the latest value here, primarily for debugging purposes. The + * return value should not be used as a previously sampled value in later calls + * as it will not have the SEEN flag set. */ -errseq_t __errseq_set(errseq_t *eseq, int err) +errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; @@ -107,7 +104,7 @@ errseq_t __errseq_set(errseq_t *eseq, int err) } return cur; } -EXPORT_SYMBOL(__errseq_set); +EXPORT_SYMBOL(errseq_set); /** * errseq_sample - grab current errseq_t value diff --git a/mm/filemap.c b/mm/filemap.c index a49702445ce0..e1cca770688f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -589,7 +589,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { - errseq_t eseq = __errseq_set(&mapping->wb_err, err); + errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } -- cgit v1.2.3 From da48c948c263c9d87dfc64566b3373a858cc8aa2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 15:23:27 -0400 Subject: media: fix warning on v4l2_subdev_call() result interpreted as bool v4l2_subdev_call is a macro returning whatever the callback return type is, usually 'int'. 
With gcc-7 and ccache, this can lead to many warnings like: media/platform/pxa_camera.c: In function 'pxa_mbus_build_fmts_xlate': media/platform/pxa_camera.c:766:27: error: ?: using integer constants in boolean context [-Werror=int-in-bool-context] while (!v4l2_subdev_call(subdev, pad, enum_mbus_code, NULL, &code)) { media/atomisp/pci/atomisp2/atomisp_cmd.c: In function 'atomisp_s_ae_window': media/atomisp/pci/atomisp2/atomisp_cmd.c:6414:52: error: ?: using integer constants in boolean context [-Werror=int-in-bool-context] if (v4l2_subdev_call(isp->inputs[asd->input_curr].camera, The problem here is that after preprocessing, the compiler sees a variation of if (a ? 0 : 2) that it thinks is suspicious. This replaces the ?: operator with a different expression that does the same thing in a more easily readable way that cannot trigger the warning. Link: https://lkml.org/lkml/2017/7/14/156 Signed-off-by: Arnd Bergmann Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-subdev.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 0f92ebd2d710..e83872078376 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -982,8 +982,16 @@ void v4l2_subdev_init(struct v4l2_subdev *sd, * Example: err = v4l2_subdev_call(sd, video, s_std, norm); */ #define v4l2_subdev_call(sd, o, f, args...) \ - (!(sd) ? -ENODEV : (((sd)->ops->o && (sd)->ops->o->f) ? \ - (sd)->ops->o->f((sd), ##args) : -ENOIOCTLCMD)) + ({ \ + int __result; \ + if (!(sd)) \ + __result = -ENODEV; \ + else if (!((sd)->ops->o && (sd)->ops->o->f)) \ + __result = -ENOIOCTLCMD; \ + else \ + __result = (sd)->ops->o->f((sd), ##args); \ + __result; \ + }) #define v4l2_subdev_has_op(sd, o, f) \ ((sd)->ops->o && (sd)->ops->o->f) -- cgit v1.2.3 From 40064aeca35c5c14789e2adcf3a1d7e5d4bd65f2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Wed, 12 Jul 2017 11:27:32 -0700 Subject: percpu: replace area map allocator with bitmap The percpu memory allocator is experiencing scalability issues when allocating and freeing large numbers of counters as in BPF. Additionally, there is a corner case where iteration is triggered over all chunks if the contig_hint is the right size, but wrong alignment. This patch replaces the area map allocator with a basic bitmap allocator implementation. Each subsequent patch will introduce new features and replace full scanning functions with faster non-scanning options when possible. Implementation: This patchset removes the area map allocator in favor of a bitmap allocator backed by metadata blocks. The primary goal is to provide consistency in performance and memory footprint with a focus on small allocations (< 64 bytes). The bitmap removes the heavy memmove from the freeing critical path and provides a consistent memory footprint. The metadata blocks provide a bound on the amount of scanning required by maintaining a set of hints. In an effort to make freeing fast, the metadata is updated on the free path if the new free area makes a page free, a block free, or spans across blocks. This causes the chunk's contig hint to potentially be smaller than what it could allocate by up to the smaller of a page or a block. If the chunk's contig hint is contained within a block, a check occurs and the hint is kept accurate. Metadata is always kept accurate on allocation, so there will not be a situation where a chunk has a later contig hint than available.
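As a reading aid, here is a minimal userspace sketch of the alloc_map/bound_map scheme the implementation notes describe (a simplification, not the patch's code; the toy_* names and fixed chunk size are invented for this example):

	#include <stdbool.h>

	#define NR_BITS 64                   /* toy chunk: 64 allocation units */

	static bool alloc_map[NR_BITS];      /* set while a unit is in use */
	static bool bound_map[NR_BITS + 1];  /* set at each allocation boundary */

	/* Mark [off, off + bits) allocated and record its boundaries. */
	static void toy_alloc(int off, int bits)
	{
		for (int i = 0; i < bits; i++)
			alloc_map[off + i] = true;
		bound_map[off] = true;
		for (int i = 1; i < bits; i++)
			bound_map[off + i] = false;
		bound_map[off + bits] = true;
	}

	/* Free the allocation at @off; its size is recovered by scanning
	 * bound_map for the next boundary, so no size argument is needed. */
	static int toy_free(int off)
	{
		int end = off + 1;

		while (end < NR_BITS && !bound_map[end])
			end++;
		for (int i = off; i < end; i++)
			alloc_map[i] = false;
		return end - off;
	}

Note that nothing on the free path moves memory around, which is the consistency win the evaluation below measures.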
Evaluation:

I have primarily done testing against a simple workload of allocation of 1 million objects (2^20) of varying size. Deallocation was done in order, alternating, and in reverse. These numbers were collected after rebasing on top of a80099a152. I present the worst-case numbers here:

  Area Map Allocator:

	Object Size | Alloc Time (ms) | Free Time (ms)
	----------------------------------------------
	      4B    |       310       |      4770
	     16B    |       557       |      1325
	     64B    |       436       |       273
	    256B    |       776       |       131
	   1024B    |      3280       |       122

  Bitmap Allocator:

	Object Size | Alloc Time (ms) | Free Time (ms)
	----------------------------------------------
	      4B    |       490       |        70
	     16B    |       515       |        75
	     64B    |       610       |        80
	    256B    |       950       |       100
	   1024B    |      3520       |       200

This data demonstrates the inability of the area map allocator to handle less-than-ideal situations. In the best case of reverse deallocation, the area map allocator was able to perform within range of the bitmap allocator. In the worst case situation, freeing took nearly 5 seconds for 1 million 4-byte objects. The bitmap allocator dramatically improves the consistency of the free path. The small allocations performed nearly identically regardless of the freeing pattern.

While it does add to the allocation latency, the allocation scenario here is optimal for the area map allocator. The area map allocator runs into trouble when it is allocating in chunks where the latter half is full. It is difficult to replicate this, so I present a variant where the pages are second half filled. Freeing was done sequentially. Below are the numbers for this scenario:

  Area Map Allocator:

	Object Size | Alloc Time (ms) | Free Time (ms)
	----------------------------------------------
	      4B    |      4118       |      4892
	     16B    |      1651       |      1163
	     64B    |       598       |       285
	    256B    |       771       |       158
	   1024B    |      3034       |       160

  Bitmap Allocator:

	Object Size | Alloc Time (ms) | Free Time (ms)
	----------------------------------------------
	      4B    |       481       |        67
	     16B    |       506       |        69
	     64B    |       636       |        75
	    256B    |       892       |        90
	   1024B    |      3262       |       147

The data shows a parabolic curve of performance for the area map allocator. This is due to the memmove operation being the dominant cost at the lower object sizes, as more objects are packed in a chunk, while at higher object sizes the traversal of the chunk slots is the dominating cost. The bitmap allocator suffers the latter problem as well at the higher object sizes. The above data shows the inability of the area map allocator to scale on the allocation path, and that the bitmap allocator demonstrates consistent performance in general.

The second problem of additional scanning can result in the area map allocator completing in 52 minutes when trying to allocate 1 million 4-byte objects with 8-byte alignment. The same workload takes approximately 16 seconds to complete for the bitmap allocator.

V2: Fixed a bug in pcpu_alloc_first_chunk: end_offset was setting the bitmap using bytes instead of bits. Added a comment to pcpu_cnt_pop_pages to explain bitmap_weight.
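On the V2 note about bitmap_weight: the trick (visible in the pcpu_cnt_pop_pages() hunk further down) is a difference of two prefix popcounts. A hedged restatement:

	/* Populated pages in [page_start, page_end), counted as
	 * popcount(populated[0..page_end)) - popcount(populated[0..page_start)). */
	nr = bitmap_weight(chunk->populated, page_end) -
	     bitmap_weight(chunk->populated, page_start);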
Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 1 - init/main.c | 1 - mm/percpu-internal.h | 34 ++- mm/percpu-km.c | 2 +- mm/percpu-stats.c | 99 ++++--- mm/percpu.c | 729 ++++++++++++++++++------------------------------- 6 files changed, 362 insertions(+), 504 deletions(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 90e0cb0f7802..b7e6c98722d1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -120,7 +120,6 @@ extern bool is_kernel_percpu_address(unsigned long addr); #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) extern void __init setup_per_cpu_areas(void); #endif -extern void __init percpu_init_late(void); extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu(size_t size, size_t align); diff --git a/init/main.c b/init/main.c index 052481fbe363..c9a9ffff6ec6 100644 --- a/init/main.c +++ b/init/main.c @@ -500,7 +500,6 @@ static void __init mm_init(void) page_ext_init_flatmem(); mem_init(); kmem_cache_init(); - percpu_init_late(); pgtable_init(); vmalloc_init(); ioremap_huge_init(); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index c4c8fc49780b..2e9d9bcb6fa2 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -11,14 +11,12 @@ struct pcpu_chunk { #endif struct list_head list; /* linked to pcpu_slot lists */ - int free_size; /* free bytes in the chunk */ - int contig_hint; /* max contiguous size hint */ + int free_bytes; /* free bytes in the chunk */ + int contig_bits; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used before the sentry */ - int map_alloc; /* # of map entries allocated */ - int *map; /* allocation map */ - struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ + unsigned long *alloc_map; /* allocation map */ + unsigned long *bound_map; /* boundary map */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -45,6 +43,30 @@ extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; +/** + * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap + * @pages: number of physical pages + * + * This conversion is from physical pages to the number of bits + * required in the bitmap. + */ +static inline int pcpu_nr_pages_to_map_bits(int pages) +{ + return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; +} + +/** + * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bits in the bitmap. 
+ */ +static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) +{ + return pcpu_nr_pages_to_map_bits(chunk->nr_pages); +} + #ifdef CONFIG_PERCPU_STATS #include diff --git a/mm/percpu-km.c b/mm/percpu-km.c index eb58aa4c0997..d2a76642c4ae 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, 0, nr_pages); + pcpu_chunk_populated(chunk, 0, nr_pages, false); spin_unlock_irq(&pcpu_lock); pcpu_stats_chunk_alloc(); diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index e146b585fd18..ad03d73aa5fe 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -29,65 +29,85 @@ static int cmpint(const void *a, const void *b) } /* - * Iterates over all chunks to find the max # of map entries used. + * Iterates over all chunks to find the max nr_alloc entries. */ -static int find_max_map_used(void) +static int find_max_nr_alloc(void) { struct pcpu_chunk *chunk; - int slot, max_map_used; + int slot, max_nr_alloc; - max_map_used = 0; + max_nr_alloc = 0; for (slot = 0; slot < pcpu_nr_slots; slot++) list_for_each_entry(chunk, &pcpu_slot[slot], list) - max_map_used = max(max_map_used, chunk->map_used); + max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc); - return max_map_used; + return max_nr_alloc; } /* * Prints out chunk state. Fragmentation is considered between * the beginning of the chunk to the last allocation. + * + * All statistics are in bytes unless stated otherwise. */ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, int *buffer) { - int i, s_index, e_index, last_alloc, alloc_sign, as_len; + int i, last_alloc, as_len, start, end; int *alloc_sizes, *p; /* statistics */ int sum_frag = 0, max_frag = 0; int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; alloc_sizes = buffer; - s_index = (chunk->start_offset) ? 1 : 0; - e_index = chunk->map_used - ((chunk->end_offset) ? 1 : 0); - - /* find last allocation */ - last_alloc = -1; - for (i = e_index - 1; i >= s_index; i--) { - if (chunk->map[i] & 1) { - last_alloc = i; - break; - } - } - /* if the chunk is not empty - ignoring reserve */ - if (last_alloc >= s_index) { - as_len = last_alloc + 1 - s_index; - - /* - * Iterate through chunk map computing size info. - * The first bit is overloaded to be a used flag. - * negative = free space, positive = allocated - */ - for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) { - alloc_sign = (*p & 1) ? 1 : -1; - alloc_sizes[i] = alloc_sign * - ((p[1] & ~1) - (p[0] & ~1)); + /* + * find_last_bit returns the start value if nothing found. + * Therefore, we must determine if it is a failure of find_last_bit + * and set the appropriate value. + */ + last_alloc = find_last_bit(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - + chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1); + last_alloc = test_bit(last_alloc, chunk->alloc_map) ? + last_alloc + 1 : 0; + + as_len = 0; + start = chunk->start_offset; + + /* + * If a bit is set in the allocation map, the bound_map identifies + * where the allocation ends. If the allocation is not set, the + * bound_map does not identify free areas as it is only kept accurate + * on allocation, not free. + * + * Positive values are allocations and negative values are free + * fragments. 
+ */ + while (start < last_alloc) { + if (test_bit(start, chunk->alloc_map)) { + end = find_next_bit(chunk->bound_map, last_alloc, + start + 1); + alloc_sizes[as_len] = 1; + } else { + end = find_next_bit(chunk->alloc_map, last_alloc, + start + 1); + alloc_sizes[as_len] = -1; } - sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL); + alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE; + + start = end; + } + + /* + * The negative values are free fragments and thus sorting gives the + * free fragments at the beginning in largest first order. + */ + if (as_len > 0) { + sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL); - /* Iterate through the unallocated fragements. */ + /* iterate through the unallocated fragments */ for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) { sum_frag -= *p; max_frag = max(max_frag, -1 * (*p)); @@ -101,8 +121,8 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); P("empty_pop_pages", chunk->nr_empty_pop_pages); - P("free_size", chunk->free_size); - P("contig_hint", chunk->contig_hint); + P("free_bytes", chunk->free_bytes); + P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); P("max_frag", max_frag); P("cur_min_alloc", cur_min_alloc); @@ -114,22 +134,23 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, static int percpu_stats_show(struct seq_file *m, void *v) { struct pcpu_chunk *chunk; - int slot, max_map_used; + int slot, max_nr_alloc; int *buffer; alloc_buffer: spin_lock_irq(&pcpu_lock); - max_map_used = find_max_map_used(); + max_nr_alloc = find_max_nr_alloc(); spin_unlock_irq(&pcpu_lock); - buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0])); + /* there can be at most this many free and allocated fragments */ + buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int)); if (!buffer) return -ENOMEM; spin_lock_irq(&pcpu_lock); /* if the buffer allocated earlier is too small */ - if (max_map_used < find_max_map_used()) { + if (max_nr_alloc < find_max_nr_alloc()) { spin_unlock_irq(&pcpu_lock); vfree(buffer); goto alloc_buffer; diff --git a/mm/percpu.c b/mm/percpu.c index 84cc2559d4aa..986d900e6680 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -86,10 +86,9 @@ #include "percpu-internal.h" -#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ -#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ -#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 -#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ +#define PCPU_SLOT_BASE_SHIFT 5 + #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 @@ -218,10 +217,10 @@ static int pcpu_size_to_slot(int size) static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { - if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) return 0; - return pcpu_size_to_slot(chunk->free_size); + return pcpu_size_to_slot(chunk->free_bytes); } /* set the pointer to a chunk in a page struct */ @@ -316,38 +315,6 @@ static void pcpu_mem_free(void *ptr) kvfree(ptr); } -/** - * pcpu_count_occupied_pages - count the number of pages an area occupies - * @chunk: chunk of interest - * @i: index of the area in question - * - * Count the number of pages chunk's @i'th area occupies. 
When the area's - * start and/or end address isn't aligned to page boundary, the straddled - * page is included in the count iff the rest of the page is free. - */ -static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) -{ - int off = chunk->map[i] & ~1; - int end = chunk->map[i + 1] & ~1; - - if (!PAGE_ALIGNED(off) && i > 0) { - int prev = chunk->map[i - 1]; - - if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) - off = round_down(off, PAGE_SIZE); - } - - if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { - int next = chunk->map[i + 1]; - int nend = chunk->map[i + 2] & ~1; - - if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) - end = round_up(end, PAGE_SIZE); - } - - return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); -} - /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest @@ -374,358 +341,270 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } /** - * pcpu_need_to_extend - determine whether chunk area map needs to be extended + * pcpu_cnt_pop_pages- counts populated backing pages in range * @chunk: chunk of interest - * @is_atomic: the allocation context + * @bit_off: start offset + * @bits: size of area to check * - * Determine whether area map of @chunk needs to be extended. If - * @is_atomic, only the amount necessary for a new allocation is - * considered; however, async extension is scheduled if the left amount is - * low. If !@is_atomic, it aims for more empty space. Combined, this - * ensures that the map is likely to have enough available space to - * accomodate atomic allocations which can't extend maps directly. - * - * CONTEXT: - * pcpu_lock. + * Calculates the number of populated pages in the region + * [page_start, page_end). This keeps track of how many empty populated + * pages are available and decide if async work should be scheduled. * * RETURNS: - * New target map allocation length if extension is necessary, 0 - * otherwise. + * The nr of populated pages. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) +static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, + int bits) { - int margin, new_alloc; - - lockdep_assert_held(&pcpu_lock); + int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); + int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - if (is_atomic) { - margin = 3; - - if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { - if (list_empty(&chunk->map_extend_list)) { - list_add_tail(&chunk->map_extend_list, - &pcpu_map_extend_chunks); - pcpu_schedule_balance_work(); - } - } - } else { - margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; - } - - if (chunk->map_alloc >= chunk->map_used + margin) + if (page_start >= page_end) return 0; - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + margin) - new_alloc *= 2; - - return new_alloc; + /* + * bitmap_weight counts the number of bits set in a bitmap up to + * the specified number of bits. This is counting the populated + * pages up to page_end and then subtracting the populated pages + * up to page_start to count the populated pages in + * [page_start, page_end). 
+ */ + return bitmap_weight(chunk->populated, page_end) - + bitmap_weight(chunk->populated, page_start); } /** - * pcpu_extend_area_map - extend area map of a chunk + * pcpu_chunk_update - updates the chunk metadata given a free area * @chunk: chunk of interest - * @new_alloc: new target allocation length of the area map + * @bit_off: chunk offset + * @bits: size of free area * - * Extend area map of @chunk to have @new_alloc entries. + * This updates the chunk's contig hint given a free area. + */ +static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) +{ + if (bits > chunk->contig_bits) + chunk->contig_bits = bits; +} + +/** + * pcpu_chunk_refresh_hint - updates metadata about a chunk + * @chunk: chunk of interest * - * CONTEXT: - * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. + * Iterates over the chunk to find the largest free area. * - * RETURNS: - * 0 on success, -errno on failure. + * Updates: + * chunk->contig_bits + * nr_empty_pop_pages */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) { - int *old = NULL, *new = NULL; - size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); - unsigned long flags; + int bits, nr_empty_pop_pages; + int rs, re; /* region start, region end */ - lockdep_assert_held(&pcpu_alloc_mutex); + /* clear metadata */ + chunk->contig_bits = 0; - new = pcpu_mem_zalloc(new_size); - if (!new) - return -ENOMEM; + bits = nr_empty_pop_pages = 0; + pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, 0, + pcpu_chunk_map_bits(chunk)) { + bits = re - rs; - /* acquire pcpu_lock and switch to new area map */ - spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_update(chunk, rs, bits); - if (new_alloc <= chunk->map_alloc) - goto out_unlock; + nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, rs, bits); + } - old_size = chunk->map_alloc * sizeof(chunk->map[0]); - old = chunk->map; + /* + * Keep track of nr_empty_pop_pages. + * + * The chunk maintains the previous number of free pages it held, + * so the delta is used to update the global counter. The reserved + * chunk is not part of the free page count as they are populated + * at init and are special to serving reserved allocations. + */ + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += + (nr_empty_pop_pages - chunk->nr_empty_pop_pages); - memcpy(new, old, old_size); + chunk->nr_empty_pop_pages = nr_empty_pop_pages; +} - chunk->map_alloc = new_alloc; - chunk->map = new; - new = NULL; +/** + * pcpu_is_populated - determines if the region is populated + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of area + * @next_off: return value for the next offset to start searching + * + * For atomic allocations, check if the backing pages are populated. + * + * RETURNS: + * Bool if the backing pages are populated. + * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. + */ +static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, + int *next_off) +{ + int page_start, page_end, rs, re; -out_unlock: - spin_unlock_irqrestore(&pcpu_lock, flags); + page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - /* - * pcpu_mem_free() might end up calling vfree() which uses - * IRQ-unsafe lock and thus can't be called under pcpu_lock. 
- */ - pcpu_mem_free(old); - pcpu_mem_free(new); + rs = page_start; + pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + if (rs >= page_end) + return true; - return 0; + *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + return false; } /** - * pcpu_fit_in_area - try to fit the requested allocation in a candidate area - * @chunk: chunk the candidate area belongs to - * @off: the offset to the start of the candidate area - * @this_size: the size of the candidate area - * @size: the size of the target allocation - * @align: the alignment of the target allocation - * @pop_only: only allocate from already populated region - * - * We're trying to allocate @size bytes aligned at @align. @chunk's area - * at @off sized @this_size is a candidate. This function determines - * whether the target allocation fits in the candidate area and returns the - * number of bytes to pad after @off. If the target area doesn't fit, -1 - * is returned. - * - * If @pop_only is %true, this function only considers the already - * populated part of the candidate area. + * pcpu_find_block_fit - finds the block index to start searching + * @chunk: chunk of interest + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE bytes) + * @pop_only: use populated regions only + * + * RETURNS: + * The offset in the bitmap to begin searching. + * -1 if no offset is found. */ -static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, - int size, int align, bool pop_only) +static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, bool pop_only) { - int cand_off = off; + int bit_off, bits; + int re; /* region end */ - while (true) { - int head = ALIGN(cand_off, align) - off; - int page_start, page_end, rs, re; + pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, 0, + pcpu_chunk_map_bits(chunk)) { + bits = re - bit_off; - if (this_size < head + size) - return -1; + /* check alignment */ + bits -= ALIGN(bit_off, align) - bit_off; + bit_off = ALIGN(bit_off, align); + if (bits < alloc_bits) + continue; - if (!pop_only) - return head; + bits = alloc_bits; + if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, + &bit_off)) + break; - /* - * If the first unpopulated page is beyond the end of the - * allocation, the whole allocation is populated; - * otherwise, retry from the end of the unpopulated area. - */ - page_start = PFN_DOWN(head + off); - page_end = PFN_UP(head + off + size); - - rs = page_start; - pcpu_next_unpop(chunk->populated, &rs, &re, - PFN_UP(off + this_size)); - if (rs >= page_end) - return head; - cand_off = re * PAGE_SIZE; + bits = 0; } + + if (bit_off == pcpu_chunk_map_bits(chunk)) + return -1; + + return bit_off; } /** - * pcpu_alloc_area - allocate area from a pcpu_chunk + * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest - * @size: wanted size in bytes - * @align: wanted align - * @pop_only: allocate only from the populated area - * @occ_pages_p: out param for the number of pages the area occupies - * - * Try to allocate @size bytes area aligned at @align from @chunk. - * Note that this function only allocates the offset. It doesn't - * populate or map the area. - * - * @chunk->map must have at least two free slots. + * @alloc_bits: size of request in allocation units + * @align: alignment of area (max PAGE_SIZE) + * @start: bit_off to start searching * - * CONTEXT: - * pcpu_lock. 
+ * This function takes in a @start offset to begin searching to fit an + * allocation of @alloc_bits with alignment @align. If it confirms a + * valid free area, it then updates the allocation and boundary maps + * accordingly. * * RETURNS: - * Allocated offset in @chunk on success, -1 if no matching area is - * found. + * Allocated addr offset in @chunk on success. + * -1 if no matching area is found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, - bool pop_only, int *occ_pages_p) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, + size_t align, int start) { - int oslot = pcpu_chunk_slot(chunk); - int max_contig = 0; - int i, off; - bool seen_free = false; - int *p; - - for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { - int head, tail; - int this_size; - - off = *p; - if (off & 1) - continue; - - this_size = (p[1] & ~1) - off; + size_t align_mask = (align) ? (align - 1) : 0; + int bit_off, end, oslot; - head = pcpu_fit_in_area(chunk, off, this_size, size, align, - pop_only); - if (head < 0) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - max_contig = max(this_size, max_contig); - continue; - } - - /* - * If head is small or the previous block is free, - * merge'em. Note that 'small' is defined as smaller - * than sizeof(int), which is very small but isn't too - * uncommon for percpu allocations. - */ - if (head && (head < sizeof(int) || !(p[-1] & 1))) { - *p = off += head; - if (p[-1] & 1) - chunk->free_size -= head; - else - max_contig = max(*p - p[-1], max_contig); - this_size -= head; - head = 0; - } + lockdep_assert_held(&pcpu_lock); - /* if tail is small, just keep it around */ - tail = this_size - head - size; - if (tail < sizeof(int)) { - tail = 0; - size = this_size - head; - } + oslot = pcpu_chunk_slot(chunk); - /* split if warranted */ - if (head || tail) { - int nr_extra = !!head + !!tail; - - /* insert new subblocks */ - memmove(p + nr_extra + 1, p + 1, - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - if (!seen_free) { - chunk->first_free = i; - seen_free = true; - } - *++p = off += head; - ++i; - max_contig = max(head, max_contig); - } - if (tail) { - p[1] = off + size; - max_contig = max(tail, max_contig); - } - } + /* + * Search to find a fit. 
+ */ + end = start + alloc_bits; + bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, + alloc_bits, align_mask); + if (bit_off >= end) + return -1; - if (!seen_free) - chunk->first_free = i + 1; + /* update alloc map */ + bitmap_set(chunk->alloc_map, bit_off, alloc_bits); - /* update hint and mark allocated */ - if (i + 1 == chunk->map_used) - chunk->contig_hint = max_contig; /* fully scanned */ - else - chunk->contig_hint = max(chunk->contig_hint, - max_contig); + /* update boundary map */ + set_bit(bit_off, chunk->bound_map); + bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); + set_bit(bit_off + alloc_bits, chunk->bound_map); - chunk->free_size -= size; - *p |= 1; + chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); - pcpu_chunk_relocate(chunk, oslot); - return off; - } + pcpu_chunk_refresh_hint(chunk); - chunk->contig_hint = max_contig; /* fully scanned */ pcpu_chunk_relocate(chunk, oslot); - /* tell the upper layer that this chunk has no matching area */ - return -1; + return bit_off * PCPU_MIN_ALLOC_SIZE; } /** - * pcpu_free_area - free area to a pcpu_chunk + * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest - * @freeme: offset of area to free - * @occ_pages_p: out param for the number of pages the area occupies - * - * Free area starting from @freeme to @chunk. Note that this function - * only modifies the allocation map. It doesn't depopulate or unmap - * the area. + * @off: addr offset into chunk * - * CONTEXT: - * pcpu_lock. + * This function determines the size of an allocation to free using + * the boundary bitmap and clears the allocation map. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, - int *occ_pages_p) +static void pcpu_free_area(struct pcpu_chunk *chunk, int off) { - int oslot = pcpu_chunk_slot(chunk); - int off = 0; - unsigned i, j; - int to_free = 0; - int *p; + int bit_off, bits, end, oslot; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); - freeme |= 1; /* we are searching for pair */ - - i = 0; - j = chunk->map_used; - while (i != j) { - unsigned k = (i + j) / 2; - off = chunk->map[k]; - if (off < freeme) - i = k + 1; - else if (off > freeme) - j = k; - else - i = j = k; - } - BUG_ON(off != freeme); + oslot = pcpu_chunk_slot(chunk); - if (i < chunk->first_free) - chunk->first_free = i; + bit_off = off / PCPU_MIN_ALLOC_SIZE; - p = chunk->map + i; - *p = off &= ~1; - chunk->free_size += (p[1] & ~1) - off; + /* find end index */ + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), + bit_off + 1); + bits = end - bit_off; + bitmap_clear(chunk->alloc_map, bit_off, bits); - *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* update metadata */ + chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; - /* merge with next? */ - if (!(p[1] & 1)) - to_free++; - /* merge with previous? */ - if (i > 0 && !(p[-1] & 1)) { - to_free++; - i--; - p--; - } - if (to_free) { - chunk->map_used -= to_free; - memmove(p + 1, p + 1 + to_free, - (chunk->map_used - i) * sizeof(chunk->map[0])); - } + pcpu_chunk_refresh_hint(chunk); - chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } +/** + * pcpu_alloc_first_chunk - creates chunks that serve the first chunk + * @tmp_addr: the start of the region served + * @map_size: size of the region served + * + * This is responsible for creating the chunks that serve the first chunk. 
The + * base_addr is page aligned down of @tmp_addr while the region end is page + * aligned up. Offsets are kept track of to determine the region served. All + * this is done to appease the bitmap allocator in avoiding partial blocks. + * + * RETURNS: + * Chunk serving the region at @tmp_addr of @map_size. + */ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, - int map_size, - int *map, - int init_map_size) + int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr; - int start_offset, region_size; + int start_offset, offset_bits, region_size, region_bits; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; @@ -740,83 +619,99 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, 0); INIT_LIST_HEAD(&chunk->list); - INIT_LIST_HEAD(&chunk->map_extend_list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; + region_bits = pcpu_chunk_map_bits(chunk); - chunk->map = map; - chunk->map_alloc = init_map_size; + chunk->alloc_map = memblock_virt_alloc( + BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0]), 0); + chunk->bound_map = memblock_virt_alloc( + BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0]), 0); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; - chunk->nr_empty_pop_pages = chunk->nr_pages; + chunk->nr_empty_pop_pages = + pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, + map_size / PCPU_MIN_ALLOC_SIZE); - chunk->contig_hint = chunk->free_size = map_size; + chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; + chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ - chunk->nr_empty_pop_pages--; - - chunk->map[0] = 1; - chunk->map[1] = chunk->start_offset; - chunk->map_used = 1; + offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, 0, offset_bits); + set_bit(0, chunk->bound_map); + set_bit(offset_bits, chunk->bound_map); } - /* set chunk's free region */ - chunk->map[++chunk->map_used] = - (chunk->start_offset + chunk->free_size) | 1; - if (chunk->end_offset) { /* hide the end of the bitmap */ - chunk->nr_empty_pop_pages--; - - chunk->map[++chunk->map_used] = region_size | 1; + offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; + bitmap_set(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - offset_bits, + offset_bits); + set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, + chunk->bound_map); + set_bit(region_bits, chunk->bound_map); } + pcpu_chunk_refresh_hint(chunk); + return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; + int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * - sizeof(chunk->map[0])); - if (!chunk->map) { - pcpu_mem_free(chunk); - return NULL; - } + INIT_LIST_HEAD(&chunk->list); + chunk->nr_pages = pcpu_unit_pages; + region_bits = pcpu_chunk_map_bits(chunk); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[0] = 0; - chunk->map[1] = pcpu_unit_size | 1; - chunk->map_used = 1; + chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0])); + if (!chunk->alloc_map) + goto alloc_map_fail; - INIT_LIST_HEAD(&chunk->list); - INIT_LIST_HEAD(&chunk->map_extend_list); - 
chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; + chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0])); + if (!chunk->bound_map) + goto bound_map_fail; - chunk->nr_pages = pcpu_unit_pages; + /* init metadata */ + chunk->contig_bits = region_bits; + chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; + +bound_map_fail: + pcpu_mem_free(chunk->alloc_map); +alloc_map_fail: + pcpu_mem_free(chunk); + + return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk->bound_map); + pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } @@ -825,13 +720,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page + * @for_alloc: if this is to populate for allocation * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. + * + * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it + * is to serve an allocation in that area. */ -static void pcpu_chunk_populated(struct pcpu_chunk *chunk, - int page_start, int page_end) +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, + int page_end, bool for_alloc) { int nr = page_end - page_start; @@ -839,8 +738,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; - chunk->nr_empty_pop_pages += nr; - pcpu_nr_empty_pop_pages += nr; + + if (!for_alloc) { + chunk->nr_empty_pop_pages += nr; + pcpu_nr_empty_pop_pages += nr; + } } /** @@ -945,19 +847,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, struct pcpu_chunk *chunk; const char *err; bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; - int occ_pages = 0; - int slot, off, new_alloc, cpu, ret; + int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; + size_t bits, bit_align; /* - * We want the lowest bit of offset available for in-use/free - * indicator, so force >= 16bit alignment and make size even. + * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, + * therefore alignment must be a minimum of that many bytes. + * An allocation may have internal fragmentation from rounding up + * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. 
*/ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); + bits = size >> PCPU_MIN_ALLOC_SHIFT; + bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { @@ -975,23 +881,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint) { + off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); + if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { - spin_unlock_irqrestore(&pcpu_lock, flags); - if (is_atomic || - pcpu_extend_area_map(chunk, new_alloc) < 0) { - err = "failed to extend area map of reserved chunk"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; @@ -1003,31 +899,15 @@ restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (size > chunk->contig_hint) + off = pcpu_find_block_fit(chunk, bits, bit_align, + is_atomic); + if (off < 0) continue; - new_alloc = pcpu_need_to_extend(chunk, is_atomic); - if (new_alloc) { - if (is_atomic) - continue; - spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, - new_alloc) < 0) { - err = "failed to extend area map"; - goto fail; - } - spin_lock_irqsave(&pcpu_lock, flags); - /* - * pcpu_lock has been dropped, need to - * restart cpu_slot list walking. - */ - goto restart; - } - - off = pcpu_alloc_area(chunk, size, align, is_atomic, - &occ_pages); + off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; + } } @@ -1077,23 +957,17 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - pcpu_free_area(chunk, off, &occ_pages); + pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } - pcpu_chunk_populated(chunk, rs, re); + pcpu_chunk_populated(chunk, rs, re, true); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } - if (chunk != pcpu_reserved_chunk) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_nr_empty_pop_pages -= occ_pages; - spin_unlock_irqrestore(&pcpu_lock, flags); - } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); @@ -1211,7 +1085,6 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1230,25 +1103,6 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } - /* service chunks which requested async area map extension */ - do { - int new_alloc = 0; - - spin_lock_irq(&pcpu_lock); - - chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, - struct pcpu_chunk, map_extend_list); - if (chunk) { - list_del_init(&chunk->map_extend_list); - new_alloc = pcpu_need_to_extend(chunk, false); - } - - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); - } while (chunk); - /* * Ensure there are certain number of free populated pages for * atomic allocs. 
Fill up from the most packed so that atomic @@ -1296,7 +1150,7 @@ retry_pop: if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, rs, rs + nr); + pcpu_chunk_populated(chunk, rs, rs + nr, false); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; @@ -1335,7 +1189,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off, occ_pages; + int off; if (!ptr) return; @@ -1349,13 +1203,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off, &occ_pages); - - if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += occ_pages; + pcpu_free_area(chunk, off); /* if there are more than one fully free chunks, wake up grim reaper */ - if (chunk->free_size == pcpu_unit_size) { + if (chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) @@ -1651,8 +1502,6 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { - static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; - static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; struct pcpu_chunk *chunk; @@ -1787,8 +1636,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ tmp_addr = (unsigned long)base_addr + static_size; map_size = ai->reserved_size ?: dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, smap, - ARRAY_SIZE(smap)); + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); /* init dynamic chunk if necessary */ if (ai->reserved_size) { @@ -1797,8 +1645,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; map_size = dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size, dmap, - ARRAY_SIZE(dmap)); + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); } /* link the first chunk in */ @@ -2374,36 +2221,6 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ -/* - * First and reserved chunks are initialized with temporary allocation - * map in initdata so that they can be used before slab is online. - * This function is called after slab is brought up and replaces those - * with properly allocated maps. - */ -void __init percpu_init_late(void) -{ - struct pcpu_chunk *target_chunks[] = - { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; - struct pcpu_chunk *chunk; - unsigned long flags; - int i; - - for (i = 0; (chunk = target_chunks[i]); i++) { - int *map; - const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); - - BUILD_BUG_ON(size > PAGE_SIZE); - - map = pcpu_mem_zalloc(size); - BUG_ON(!map); - - spin_lock_irqsave(&pcpu_lock, flags); - memcpy(map, chunk->map, size); - chunk->map = map; - spin_unlock_irqrestore(&pcpu_lock, flags); - } -} - /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up -- cgit v1.2.3 From ca460b3c96274d79f84b31a3fea23a6eed479917 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:12 -0400 Subject: percpu: introduce bitmap metadata blocks This patch introduces the bitmap metadata blocks and adds the skeleton of the code that will be used to maintain these blocks. Each chunk's bitmap is made up of full metadata blocks. 
These blocks maintain basic metadata to help prevent scanning unnecessarily to update hints. Full scanning methods are used for the skeleton and will be replaced in the coming patches. A number of helper functions are added as well to do conversion of pages to blocks and manage offsets. Comments will be updated as the final version of each function is added. There exists a relationship between PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE, the region size, and unit_size. Every chunk's region (including offsets) is page aligned at the beginning to preserve alignment. The end is aligned to LCM(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE) to ensure that the end can fit with the populated page map, which is tracked per page, and that every metadata block is fully accounted for. The unit_size is already page aligned, but must also be aligned with PCPU_BITMAP_BLOCK_SIZE to ensure full metadata blocks. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 12 +++ mm/percpu-internal.h | 29 +++++++ mm/percpu.c | 228 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 257 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index b7e6c98722d1..31795e619273 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -25,6 +25,18 @@ #define PCPU_MIN_ALLOC_SHIFT 2 #define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) +/* + * This determines the size of each metadata block. There are several subtle + * constraints around this constant. The reserved region must be a multiple of + * PCPU_BITMAP_BLOCK_SIZE. Additionally, PCPU_BITMAP_BLOCK_SIZE must be a + * multiple of PAGE_SIZE or PAGE_SIZE must be a multiple of + * PCPU_BITMAP_BLOCK_SIZE to align with the populated page map. The unit_size + * also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to ensure full blocks. + */ +#define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE +#define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \ + PCPU_MIN_ALLOC_SHIFT) + /* * Percpu allocator can serve percpu allocations before slab is * initialized which allows slab to depend on the percpu allocator. diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 2e9d9bcb6fa2..252ae9e960e0 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -4,6 +4,22 @@ #include #include +/* + * pcpu_block_md is the metadata block struct. + * Each chunk's bitmap is split into a number of full blocks. + * All units are in terms of bits.
+ */ +struct pcpu_block_md { + int contig_hint; /* contig hint for block */ + int contig_hint_start; /* block relative starting + position of the contig hint */ + int left_free; /* size of free space along + the left side of the block */ + int right_free; /* size of free space along + the right side of the block */ + int first_free; /* block position of first free */ +}; + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -17,6 +33,7 @@ struct pcpu_chunk { unsigned long *alloc_map; /* allocation map */ unsigned long *bound_map; /* boundary map */ + struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -43,6 +60,18 @@ extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; +/** + * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bitmap blocks used. + */ +static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) +{ + return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; +} + /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages diff --git a/mm/percpu.c b/mm/percpu.c index 986d900e6680..708c6de237b9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -279,6 +280,26 @@ static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) (rs) < (re); \ (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) +/* + * The following are helper functions to help access bitmaps and convert + * between bitmap offsets to address offsets. + */ +static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) +{ + return chunk->alloc_map + + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); +} + +static unsigned long pcpu_off_to_block_index(int off) +{ + return off / PCPU_BITMAP_BLOCK_BITS; +} + +static unsigned long pcpu_off_to_block_off(int off) +{ + return off & (PCPU_BITMAP_BLOCK_BITS - 1); +} + /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate @@ -430,6 +451,154 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) chunk->nr_empty_pop_pages = nr_empty_pop_pages; } +/** + * pcpu_block_update - updates a block given a free area + * @block: block of interest + * @start: start offset in block + * @end: end offset in block + * + * Updates a block given a known free area. The region [start, end) is + * expected to be the entirety of the free area within a block. + */ +static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) +{ + int contig = end - start; + + block->first_free = min(block->first_free, start); + if (start == 0) + block->left_free = contig; + + if (end == PCPU_BITMAP_BLOCK_BITS) + block->right_free = contig; + + if (contig > block->contig_hint) { + block->contig_hint_start = start; + block->contig_hint = contig; + } +} + +/** + * pcpu_block_refresh_hint + * @chunk: chunk of interest + * @index: index of the metadata block + * + * Scans over the block beginning at first_free and updates the block + * metadata accordingly. 
+ */ +static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) +{ + struct pcpu_block_md *block = chunk->md_blocks + index; + unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); + int rs, re; /* region start, region end */ + + /* clear hints */ + block->contig_hint = 0; + block->left_free = block->right_free = 0; + + /* iterate over free areas and update the contig hints */ + pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, + PCPU_BITMAP_BLOCK_BITS) { + pcpu_block_update(block, rs, re); + } +} + +/** + * pcpu_block_update_hint_alloc - update hint on allocation path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + */ +static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ + + /* + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. + */ + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; + + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + + /* + * Update s_block. + */ + pcpu_block_refresh_hint(chunk, s_index); + + /* + * Update e_block. + */ + if (s_index != e_index) { + pcpu_block_refresh_hint(chunk, e_index); + + /* update in-between md_blocks */ + for (block = s_block + 1; block < e_block; block++) { + block->contig_hint = 0; + block->left_free = 0; + block->right_free = 0; + } + } + + pcpu_chunk_refresh_hint(chunk); +} + +/** + * pcpu_block_update_hint_free - updates the block hints on the free path + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of request + */ +static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + struct pcpu_block_md *s_block, *e_block, *block; + int s_index, e_index; /* block indexes of the freed allocation */ + int s_off, e_off; /* block offsets of the freed allocation */ + + /* + * Calculate per block offsets. + * The calculation uses an inclusive range, but the resulting offsets + * are [start, end). e_index always points to the last block in the + * range. 
+ */ + s_index = pcpu_off_to_block_index(bit_off); + e_index = pcpu_off_to_block_index(bit_off + bits - 1); + s_off = pcpu_off_to_block_off(bit_off); + e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; + + s_block = chunk->md_blocks + s_index; + e_block = chunk->md_blocks + e_index; + + /* update s_block */ + pcpu_block_refresh_hint(chunk, s_index); + + /* freeing in the same block */ + if (s_index != e_index) { + /* update e_block */ + pcpu_block_refresh_hint(chunk, e_index); + + /* reset md_blocks in the middle */ + for (block = s_block + 1; block < e_block; block++) { + block->first_free = 0; + block->contig_hint_start = 0; + block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + block->left_free = PCPU_BITMAP_BLOCK_BITS; + block->right_free = PCPU_BITMAP_BLOCK_BITS; + } + } + + pcpu_chunk_refresh_hint(chunk); +} + /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest @@ -546,7 +715,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; - pcpu_chunk_refresh_hint(chunk); + pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); @@ -581,11 +750,24 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) /* update metadata */ chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; - pcpu_chunk_refresh_hint(chunk); + pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); } +static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) +{ + struct pcpu_block_md *md_block; + + for (md_block = chunk->md_blocks; + md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); + md_block++) { + md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; + md_block->left_free = PCPU_BITMAP_BLOCK_BITS; + md_block->right_free = PCPU_BITMAP_BLOCK_BITS; + } +} + /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served @@ -603,7 +785,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; - unsigned long aligned_addr; + unsigned long aligned_addr, lcm_align; int start_offset, offset_bits, region_size, region_bits; /* region calculations */ @@ -611,7 +793,13 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, start_offset = tmp_addr - aligned_addr; - region_size = PFN_ALIGN(start_offset + map_size); + /* + * Align the end of the region with the LCM of PAGE_SIZE and + * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of + * the other. 
+ */ + lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); + region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) + @@ -627,12 +815,13 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_virt_alloc( - BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0]), 0); - chunk->bound_map = memblock_virt_alloc( - BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0]), 0); + chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) * + sizeof(chunk->alloc_map[0]), 0); + chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) * + sizeof(chunk->bound_map[0]), 0); + chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0]), 0); + pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; @@ -651,6 +840,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); + + pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { @@ -662,9 +853,10 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); - } - pcpu_chunk_refresh_hint(chunk); + pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) + - offset_bits, offset_bits); + } return chunk; } @@ -692,12 +884,21 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) if (!chunk->bound_map) goto bound_map_fail; + chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * + sizeof(chunk->md_blocks[0])); + if (!chunk->md_blocks) + goto md_blocks_fail; + + pcpu_init_md_blocks(chunk); + /* init metadata */ chunk->contig_bits = region_bits; chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; +md_blocks_fail: + pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); alloc_map_fail: @@ -1535,9 +1736,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!ai->dyn_size); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); + PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || + IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ -- cgit v1.2.3 From b185cd0dc61c14875155e7bcc3f2c139b6feefd2 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Mon, 24 Jul 2017 19:02:17 -0400 Subject: percpu: update free path to take advantage of contig hints The bitmap allocator must keep metadata consistent. The easiest way is to scan after every allocation for each affected block and the entire chunk. This is rather expensive. The free path can take advantage of current contig hints to prevent scanning within the start and end block. 
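A deliberately simplified sketch of that shortcut follows; struct block_md and merge_freed_run() are hypothetical stand-ins for the pcpu_block_md handling in the diff below, shown only to make the adjacency idea concrete.

/*
 * If the freed run [start, end) touches the block's recorded largest
 * free run, the hint can be widened arithmetically and no rescan of
 * the block's allocation map is needed.
 */
struct block_md {
	int contig_hint;	/* length of largest known free run, in bits */
	int contig_hint_start;	/* starting bit of that run */
};

static int merge_freed_run(struct block_md *b, int start, int end)
{
	if (end == b->contig_hint_start) {
		/* freed run ends where the hint begins: extend leftwards */
		b->contig_hint_start = start;
		b->contig_hint += end - start;
		return 1;
	}
	if (start == b->contig_hint_start + b->contig_hint) {
		/* freed run begins where the hint ends: extend rightwards */
		b->contig_hint += end - start;
		return 1;
	}
	return 0;	/* not adjacent; a scan would be needed instead */
}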
If a scan is needed, it can be done by scanning backwards from the start and forwards from the end to identify the entire free area this can be combined with. The blocks can then be updated by some basic checks rather than complete block scans. A chunk scan happens when the freed area makes a page free, a block free, or spans across blocks. This is necessary as the contig hint at this point could span across blocks. The check uses the minimum of page size and the block size to allow for variable sized blocks. There is a tradeoff here with not updating after every free. It is possible a contig hint in one block can be merged with the contig hint in the next block. This means the contig hint can be off by up to a page. However, if the chunk's contig hint is contained in one block, the contig hint will be accurate. Signed-off-by: Dennis Zhou Reviewed-by: Josef Bacik Signed-off-by: Tejun Heo --- include/linux/percpu.h | 3 +++ mm/percpu.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 31795e619273..6a5fb939d3e5 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -25,6 +25,9 @@ #define PCPU_MIN_ALLOC_SHIFT 2 #define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT) +/* number of bits per page, used to trigger a scan if blocks are > PAGE_SIZE */ +#define PCPU_BITS_PER_PAGE (PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT) + /* * This determines the size of each metadata block. There are several subtle * constraints around this constant. The reserved region must be a multiple of diff --git a/mm/percpu.c b/mm/percpu.c index f38f47a65642..57b3168eae08 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -300,6 +300,11 @@ static unsigned long pcpu_off_to_block_off(int off) return off & (PCPU_BITMAP_BLOCK_BITS - 1); } +static unsigned long pcpu_block_off_to_off(int index, int off) +{ + return index * PCPU_BITMAP_BLOCK_BITS + off; +} + /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate @@ -623,6 +628,17 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request + * + * Updates metadata for the allocation path. This avoids a blind block + * refresh by making use of the block contig hints. If this fails, it scans + * forward and backward to determine the extent of the free area. This is + * capped at the boundary of blocks. + * + * A chunk update is triggered if a page becomes free, a block becomes free, + * or the free spans across blocks. This tradeoff is to minimize iterating + * over the block metadata to update chunk->contig_bits. chunk->contig_bits + * may be off by up to a page, but it will never be more than the available + * space. If the contig hint is contained in one block, it will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) @@ -630,6 +646,7 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ + int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. 
@@ -645,13 +662,46 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; + /* + * Check if the freed area aligns with the block->contig_hint. + * If it does, then the scan to find the beginning/end of the + * larger free area can be avoided. + * + * start and end refer to beginning and end of the free area + * within each their respective blocks. This is not necessarily + * the entire free area as it may span blocks past the beginning + * or end of the block. + */ + start = s_off; + if (s_off == s_block->contig_hint + s_block->contig_hint_start) { + start = s_block->contig_hint_start; + } else { + /* + * Scan backwards to find the extent of the free area. + * find_last_bit returns the starting bit, so if the start bit + * is returned, that means there was no last bit and the + * remainder of the chunk is free. + */ + int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), + start); + start = (start == l_bit) ? 0 : l_bit + 1; + } + + end = e_off; + if (e_off == e_block->contig_hint_start) + end = e_block->contig_hint_start + e_block->contig_hint; + else + end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), + PCPU_BITMAP_BLOCK_BITS, end); + /* update s_block */ - pcpu_block_refresh_hint(chunk, s_index); + e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; + pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ - pcpu_block_refresh_hint(chunk, e_index); + pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ for (block = s_block + 1; block < e_block; block++) { @@ -663,7 +713,19 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, } } - pcpu_chunk_refresh_hint(chunk); + /* + * Refresh chunk metadata when the free makes a page free, a block + * free, or spans across blocks. The contig hint may be off by up to + * a page, but if the hint is contained in a block, it will be accurate + * with the else condition below. + */ + if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > + ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || + s_index != e_index) + pcpu_chunk_refresh_hint(chunk); + else + pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), + s_block->contig_hint); } /** -- cgit v1.2.3 From 645874e5807ae5ffa09fba2ccd23f01e4eb16d58 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 26 Jul 2017 06:07:11 -0700 Subject: qed: Add support for Energy efficient ethernet. The patch adds required driver support for reading/configuring the Energy Efficient Ethernet (EEE) parameters. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 72 ++++++++++++++++++++++++++++-- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 49 ++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_main.c | 19 ++++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 66 +++++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_mcp.h | 37 +++++++++++++-- include/linux/qed/qed_if.h | 20 +++++++++ 6 files changed, 256 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 6c87bed13bd2..f545607100e4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1684,6 +1684,8 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) "Load request was sent. Load code: 0x%x\n", load_code); + qed_mcp_set_capabilities(p_hwfn, p_hwfn->p_main_ptt); + qed_reset_mb_shadow(p_hwfn, p_hwfn->p_main_ptt); p_hwfn->first_on_engine = (load_code == @@ -2472,6 +2474,7 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { u32 port_cfg_addr, link_temp, nvm_cfg_addr, device_capabilities; u32 nvm_cfg1_offset, mf_mode, addr, generic_cont0, core_cfg; + struct qed_mcp_link_capabilities *p_caps; struct qed_mcp_link_params *link; /* Read global nvm_cfg address */ @@ -2534,6 +2537,7 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) /* Read default link configuration */ link = &p_hwfn->mcp_info->link_input; + p_caps = &p_hwfn->mcp_info->link_capabilities; port_cfg_addr = MCP_REG_SCRATCH + nvm_cfg1_offset + offsetof(struct nvm_cfg1, port[MFW_PORT(p_hwfn)]); link_temp = qed_rd(p_hwfn, p_ptt, @@ -2588,10 +2592,45 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) NVM_CFG1_PORT_DRV_FLOW_CONTROL_TX); link->loopback_mode = 0; - DP_VERBOSE(p_hwfn, NETIF_MSG_LINK, - "Read default link: Speed 0x%08x, Adv. Speed 0x%08x, AN: 0x%02x, PAUSE AN: 0x%02x\n", - link->speed.forced_speed, link->speed.advertised_speeds, - link->speed.autoneg, link->pause.autoneg); + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) { + link_temp = qed_rd(p_hwfn, p_ptt, port_cfg_addr + + offsetof(struct nvm_cfg1_port, ext_phy)); + link_temp &= NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_MASK; + link_temp >>= NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_OFFSET; + p_caps->default_eee = QED_MCP_EEE_ENABLED; + link->eee.enable = true; + switch (link_temp) { + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_DISABLED: + p_caps->default_eee = QED_MCP_EEE_DISABLED; + link->eee.enable = false; + break; + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_BALANCED: + p_caps->eee_lpi_timer = EEE_TX_TIMER_USEC_BALANCED_TIME; + break; + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_AGGRESSIVE: + p_caps->eee_lpi_timer = + EEE_TX_TIMER_USEC_AGGRESSIVE_TIME; + break; + case NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_LOW_LATENCY: + p_caps->eee_lpi_timer = EEE_TX_TIMER_USEC_LATENCY_TIME; + break; + } + + link->eee.tx_lpi_timer = p_caps->eee_lpi_timer; + link->eee.tx_lpi_enable = link->eee.enable; + link->eee.adv_caps = QED_EEE_1G_ADV | QED_EEE_10G_ADV; + } else { + p_caps->default_eee = QED_MCP_EEE_UNSUPPORTED; + } + + DP_VERBOSE(p_hwfn, + NETIF_MSG_LINK, + "Read default link: Speed 0x%08x, Adv. 
Speed 0x%08x, AN: 0x%02x, PAUSE AN: 0x%02x EEE: %02x [%08x usec]\n", + link->speed.forced_speed, + link->speed.advertised_speeds, + link->speed.autoneg, + link->pause.autoneg, + p_caps->default_eee, p_caps->eee_lpi_timer); /* Read Multi-function information from shmem */ addr = MCP_REG_SCRATCH + nvm_cfg1_offset + @@ -2751,6 +2790,27 @@ static void qed_hw_info_port_num(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) qed_hw_info_port_num_ah(p_hwfn, p_ptt); } +static void qed_get_eee_caps(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + struct qed_mcp_link_capabilities *p_caps; + u32 eee_status; + + p_caps = &p_hwfn->mcp_info->link_capabilities; + if (p_caps->default_eee == QED_MCP_EEE_UNSUPPORTED) + return; + + p_caps->eee_speed_caps = 0; + eee_status = qed_rd(p_hwfn, p_ptt, p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, eee_status)); + eee_status = (eee_status & EEE_SUPPORTED_SPEED_MASK) >> + EEE_SUPPORTED_SPEED_OFFSET; + + if (eee_status & EEE_1G_SUPPORTED) + p_caps->eee_speed_caps |= QED_EEE_1G_ADV; + if (eee_status & EEE_10G_ADV) + p_caps->eee_speed_caps |= QED_EEE_10G_ADV; +} + static int qed_get_hw_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, @@ -2767,6 +2827,8 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn, qed_hw_info_port_num(p_hwfn, p_ptt); + qed_mcp_get_capabilities(p_hwfn, p_ptt); + qed_hw_get_nvm_info(p_hwfn, p_ptt); rc = qed_int_igu_read_cam(p_hwfn, p_ptt); @@ -2785,6 +2847,8 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn, p_hwfn->mcp_info->func_info.ovlan; qed_mcp_cmd_port_init(p_hwfn, p_ptt); + + qed_get_eee_caps(p_hwfn, p_ptt); } if (qed_mcp_is_init(p_hwfn)) { diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 31fb0bffa098..3427fe7049b5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -10825,6 +10825,17 @@ struct eth_phy_cfg { #define ETH_LOOPBACK_EXT (3) #define ETH_LOOPBACK_MAC (4) + u32 eee_cfg; +#define EEE_CFG_EEE_ENABLED BIT(0) +#define EEE_CFG_TX_LPI BIT(1) +#define EEE_CFG_ADV_SPEED_1G BIT(2) +#define EEE_CFG_ADV_SPEED_10G BIT(3) +#define EEE_TX_TIMER_USEC_MASK (0xfffffff0) +#define EEE_TX_TIMER_USEC_OFFSET 4 +#define EEE_TX_TIMER_USEC_BALANCED_TIME (0xa00) +#define EEE_TX_TIMER_USEC_AGGRESSIVE_TIME (0x100) +#define EEE_TX_TIMER_USEC_LATENCY_TIME (0x6000) + u32 feature_config_flags; #define ETH_EEE_MODE_ADV_LPI (1 << 0) }; @@ -11242,6 +11253,25 @@ struct public_port { u32 wol_pkt_len; u32 wol_pkt_details; struct dcb_dscp_map dcb_dscp_map; + + u32 eee_status; +#define EEE_ACTIVE_BIT BIT(0) +#define EEE_LD_ADV_STATUS_MASK 0x000000f0 +#define EEE_LD_ADV_STATUS_OFFSET 4 +#define EEE_1G_ADV BIT(1) +#define EEE_10G_ADV BIT(2) +#define EEE_LP_ADV_STATUS_MASK 0x00000f00 +#define EEE_LP_ADV_STATUS_OFFSET 8 +#define EEE_SUPPORTED_SPEED_MASK 0x0000f000 +#define EEE_SUPPORTED_SPEED_OFFSET 12 +#define EEE_1G_SUPPORTED BIT(1) +#define EEE_10G_SUPPORTED BIT(2) + + u32 eee_remote; +#define EEE_REMOTE_TW_TX_MASK 0x0000ffff +#define EEE_REMOTE_TW_TX_OFFSET 0 +#define EEE_REMOTE_TW_RX_MASK 0xffff0000 +#define EEE_REMOTE_TW_RX_OFFSET 16 }; struct public_func { @@ -11570,6 +11600,9 @@ struct public_drv_mb { #define DRV_MSG_CODE_GET_PF_RDMA_PROTOCOL 0x002b0000 #define DRV_MSG_CODE_OS_WOL 0x002e0000 +#define DRV_MSG_CODE_FEATURE_SUPPORT 0x00300000 +#define DRV_MSG_CODE_GET_MFW_FEATURE_SUPPORT 0x00310000 + #define DRV_MSG_SEQ_NUMBER_MASK 0x0000ffff u32 drv_mb_param; @@ -11653,6 +11686,10 @@ struct public_drv_mb { #define DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_SHIFT 8 
#define DRV_MB_PARAM_BIST_TEST_IMAGE_INDEX_MASK 0x0000FF00 +#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_MASK 0x0000FFFF +#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_OFFSET 0 +#define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE 0x00000002 + u32 fw_mb_header; #define FW_MSG_CODE_MASK 0xffff0000 #define FW_MSG_CODE_UNSUPPORTED 0x00000000 @@ -11696,6 +11733,9 @@ struct public_drv_mb { #define FW_MB_PARAM_GET_PF_RDMA_IWARP 0x2 #define FW_MB_PARAM_GET_PF_RDMA_BOTH 0x3 +/* get MFW feature support response */ +#define FW_MB_PARAM_FEATURE_SUPPORT_EEE 0x00000002 + #define FW_MB_PARAM_LOAD_DONE_DID_EFUSE_ERROR (1 << 0) u32 drv_pulse_mb; @@ -11891,7 +11931,16 @@ struct nvm_cfg1_port { #define NVM_CFG1_PORT_DRV_FLOW_CONTROL_TX 0x4 u32 phy_cfg; u32 mgmt_traffic; + u32 ext_phy; + /* EEE power saving mode */ +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_MASK 0x00FF0000 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_OFFSET 16 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_DISABLED 0x0 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_BALANCED 0x1 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_AGGRESSIVE 0x2 +#define NVM_CFG1_PORT_EEE_POWER_SAVING_MODE_LOW_LATENCY 0x3 + u32 mba_cfg1; u32 mba_cfg2; u32 vf_cfg; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 1bddf9372fc9..0a06683abfa0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1297,6 +1297,10 @@ static int qed_set_link(struct qed_dev *cdev, struct qed_link_params *params) } } + if (params->override_flags & QED_LINK_OVERRIDE_EEE_CONFIG) + memcpy(&link_params->eee, ¶ms->eee, + sizeof(link_params->eee)); + rc = qed_mcp_set_link(hwfn, ptt, params->link_up); qed_ptt_release(hwfn, ptt); @@ -1483,6 +1487,21 @@ static void qed_fill_link(struct qed_hwfn *hwfn, if (link.partner_adv_pause == QED_LINK_PARTNER_ASYMMETRIC_PAUSE || link.partner_adv_pause == QED_LINK_PARTNER_BOTH_PAUSE) if_link->lp_caps |= QED_LM_Asym_Pause_BIT; + + if (link_caps.default_eee == QED_MCP_EEE_UNSUPPORTED) { + if_link->eee_supported = false; + } else { + if_link->eee_supported = true; + if_link->eee_active = link.eee_active; + if_link->sup_caps = link_caps.eee_speed_caps; + /* MFW clears adv_caps on eee disable; use configured value */ + if_link->eee.adv_caps = link.eee_adv_caps ? 
link.eee_adv_caps : + params.eee.adv_caps; + if_link->eee.lp_adv_caps = link.eee_lp_adv_caps; + if_link->eee.enable = params.eee.enable; + if_link->eee.tx_lpi_enable = params.eee.tx_lpi_enable; + if_link->eee.tx_lpi_timer = params.eee.tx_lpi_timer; + } } static void qed_get_current_link(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 9da91045d167..c1ecce6b9141 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -1097,6 +1097,31 @@ static void qed_mcp_handle_transceiver_change(struct qed_hwfn *p_hwfn, DP_NOTICE(p_hwfn, "Transceiver is unplugged.\n"); } +static void qed_mcp_read_eee_config(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_mcp_link_state *p_link) +{ + u32 eee_status, val; + + p_link->eee_adv_caps = 0; + p_link->eee_lp_adv_caps = 0; + eee_status = qed_rd(p_hwfn, + p_ptt, + p_hwfn->mcp_info->port_addr + + offsetof(struct public_port, eee_status)); + p_link->eee_active = !!(eee_status & EEE_ACTIVE_BIT); + val = (eee_status & EEE_LD_ADV_STATUS_MASK) >> EEE_LD_ADV_STATUS_OFFSET; + if (val & EEE_1G_ADV) + p_link->eee_adv_caps |= QED_EEE_1G_ADV; + if (val & EEE_10G_ADV) + p_link->eee_adv_caps |= QED_EEE_10G_ADV; + val = (eee_status & EEE_LP_ADV_STATUS_MASK) >> EEE_LP_ADV_STATUS_OFFSET; + if (val & EEE_1G_ADV) + p_link->eee_lp_adv_caps |= QED_EEE_1G_ADV; + if (val & EEE_10G_ADV) + p_link->eee_lp_adv_caps |= QED_EEE_10G_ADV; +} + static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, bool b_reset) { @@ -1228,6 +1253,9 @@ static void qed_mcp_handle_link_change(struct qed_hwfn *p_hwfn, p_link->sfp_tx_fault = !!(status & LINK_STATUS_SFP_TX_FAULT); + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) + qed_mcp_read_eee_config(p_hwfn, p_ptt, p_link); + qed_link_update(p_hwfn); out: spin_unlock_bh(&p_hwfn->mcp_info->link_lock); @@ -1251,6 +1279,19 @@ int qed_mcp_set_link(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, bool b_up) phy_cfg.pause |= (params->pause.forced_tx) ? 
ETH_PAUSE_TX : 0; phy_cfg.adv_speed = params->speed.advertised_speeds; phy_cfg.loopback_mode = params->loopback_mode; + if (p_hwfn->mcp_info->capabilities & FW_MB_PARAM_FEATURE_SUPPORT_EEE) { + if (params->eee.enable) + phy_cfg.eee_cfg |= EEE_CFG_EEE_ENABLED; + if (params->eee.tx_lpi_enable) + phy_cfg.eee_cfg |= EEE_CFG_TX_LPI; + if (params->eee.adv_caps & QED_EEE_1G_ADV) + phy_cfg.eee_cfg |= EEE_CFG_ADV_SPEED_1G; + if (params->eee.adv_caps & QED_EEE_10G_ADV) + phy_cfg.eee_cfg |= EEE_CFG_ADV_SPEED_10G; + phy_cfg.eee_cfg |= (params->eee.tx_lpi_timer << + EEE_TX_TIMER_USEC_OFFSET) & + EEE_TX_TIMER_USEC_MASK; + } p_hwfn->b_drv_link_init = b_up; @@ -2822,3 +2863,28 @@ void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock, p_unlock->resource = resource; } } + +int qed_mcp_get_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp; + int rc; + + rc = qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_GET_MFW_FEATURE_SUPPORT, + 0, &mcp_resp, &p_hwfn->mcp_info->capabilities); + if (!rc) + DP_VERBOSE(p_hwfn, (QED_MSG_SP | NETIF_MSG_PROBE), + "MFW supported features: %08x\n", + p_hwfn->mcp_info->capabilities); + + return rc; +} + +int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + u32 mcp_resp, mcp_param, features; + + features = DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE; + + return qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_FEATURE_SUPPORT, + features, &mcp_resp, &mcp_param); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index af03b3651411..c7ec2395d1ce 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -53,15 +53,25 @@ struct qed_mcp_link_pause_params { bool forced_tx; }; +enum qed_mcp_eee_mode { + QED_MCP_EEE_DISABLED, + QED_MCP_EEE_ENABLED, + QED_MCP_EEE_UNSUPPORTED +}; + struct qed_mcp_link_params { - struct qed_mcp_link_speed_params speed; - struct qed_mcp_link_pause_params pause; - u32 loopback_mode; + struct qed_mcp_link_speed_params speed; + struct qed_mcp_link_pause_params pause; + u32 loopback_mode; + struct qed_link_eee_params eee; }; struct qed_mcp_link_capabilities { u32 speed_capabilities; bool default_speed_autoneg; + enum qed_mcp_eee_mode default_eee; + u32 eee_lpi_timer; + u8 eee_speed_caps; }; struct qed_mcp_link_state { @@ -102,6 +112,9 @@ struct qed_mcp_link_state { u8 partner_adv_pause; bool sfp_tx_fault; + bool eee_active; + u8 eee_adv_caps; + u8 eee_lp_adv_caps; }; struct qed_mcp_function_info { @@ -546,6 +559,9 @@ struct qed_mcp_info { u8 *mfw_mb_shadow; u16 mfw_mb_length; u32 mcp_hist; + + /* Capabilties negotiated with the MFW */ + u32 capabilities; }; struct qed_mcp_mb_params { @@ -925,5 +941,20 @@ void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock, struct qed_resc_unlock_params *p_unlock, enum qed_resc_lock resource, bool b_is_permanent); +/** + * @brief Learn of supported MFW features; To be done during early init + * + * @param p_hwfn + * @param p_ptt + */ +int qed_mcp_get_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); +/** + * @brief Inform MFW of set of features supported by driver. Should be done + * inside the content of the LOAD_REQ. 
+ * + * @param p_hwfn + * @param p_ptt + */ +int qed_mcp_set_capabilities(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); #endif diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index ef39c7f40ae6..9f3276271b02 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -161,6 +161,18 @@ enum qed_nvm_images { QED_NVM_IMAGE_FCOE_CFG, }; +struct qed_link_eee_params { + u32 tx_lpi_timer; +#define QED_EEE_1G_ADV BIT(0) +#define QED_EEE_10G_ADV BIT(1) + + /* Capabilities are represented using QED_EEE_*_ADV values */ + u8 adv_caps; + u8 lp_adv_caps; + bool enable; + bool tx_lpi_enable; +}; + enum qed_led_mode { QED_LED_MODE_OFF, QED_LED_MODE_ON, @@ -408,6 +420,7 @@ struct qed_link_params { #define QED_LINK_OVERRIDE_SPEED_FORCED_SPEED BIT(2) #define QED_LINK_OVERRIDE_PAUSE_CONFIG BIT(3) #define QED_LINK_OVERRIDE_LOOPBACK_MODE BIT(4) +#define QED_LINK_OVERRIDE_EEE_CONFIG BIT(5) u32 override_flags; bool autoneg; u32 adv_speeds; @@ -422,6 +435,7 @@ struct qed_link_params { #define QED_LINK_LOOPBACK_EXT BIT(3) #define QED_LINK_LOOPBACK_MAC BIT(4) u32 loopback_mode; + struct qed_link_eee_params eee; }; struct qed_link_output { @@ -437,6 +451,12 @@ struct qed_link_output { u8 port; /* In PORT defs */ bool autoneg; u32 pause_config; + + /* EEE - capability & param */ + bool eee_supported; + bool eee_active; + u8 sup_caps; + struct qed_link_eee_params eee; }; struct qed_probe_params { -- cgit v1.2.3 From 477f2d1460a636abd08f03eafabe0c51366fa5de Mon Sep 17 00:00:00 2001 From: Rahul Verma Date: Wed, 26 Jul 2017 06:07:13 -0700 Subject: qed: Add support for VF coalesce configuration. This patch adds ethtool support to set the Rx/Tx coalesce values for the VF-associated Rx/Tx queues. Signed-off-by: Rahul Verma Signed-off-by: Yuval Mintz Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 73 +++++++++++++++------- drivers/net/ethernet/qlogic/qed/qed_dev_api.h | 41 ++++-------- drivers/net/ethernet/qlogic/qed/qed_l2.h | 7 +++ drivers/net/ethernet/qlogic/qed/qed_main.c | 24 +------ drivers/net/ethernet/qlogic/qed/qed_sriov.c | 83 +++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_vf.c | 44 +++++++++++++ drivers/net/ethernet/qlogic/qed/qed_vf.h | 24 ++++++- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 38 ++++++++--- include/linux/qed/qed_if.h | 4 +- 9 files changed, 251 insertions(+), 87 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index f545607100e4..58a689fb04db 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -3694,7 +3694,7 @@ static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } p_coal_timeset = p_eth_qzone; - memset(p_coal_timeset, 0, eth_qzone_size); + memset(p_eth_qzone, 0, eth_qzone_size); SET_FIELD(p_coal_timeset->value, COALESCING_TIMESET_TIMESET, timeset); SET_FIELD(p_coal_timeset->value, COALESCING_TIMESET_VALID, 1); qed_memcpy_to(p_hwfn, p_ptt, hw_addr, p_eth_qzone, eth_qzone_size); @@ -3702,12 +3702,46 @@ static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, return 0; } -int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u16 qid, u16 sb_id) +int qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle) +{ + struct qed_queue_cid *p_cid = p_handle; + struct qed_hwfn *p_hwfn; + struct qed_ptt *p_ptt; + int rc = 0; + + p_hwfn = p_cid->p_owner; + + if (IS_VF(p_hwfn->cdev)) + return qed_vf_pf_set_coalesce(p_hwfn, rx_coal, tx_coal, p_cid); + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + if (rx_coal) { + rc = qed_set_rxq_coalesce(p_hwfn, p_ptt, rx_coal, p_cid); + if (rc) + goto out; + p_hwfn->cdev->rx_coalesce_usecs = rx_coal; + } + + if (tx_coal) { + rc = qed_set_txq_coalesce(p_hwfn, p_ptt, tx_coal, p_cid); + if (rc) + goto out; + p_hwfn->cdev->tx_coalesce_usecs = tx_coal; + } +out: + qed_ptt_release(p_hwfn, p_ptt); + return rc; +} + +int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid) { struct ustorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; - u16 fw_qid = 0; u32 address; int rc; @@ -3724,32 +3758,29 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); - if (rc) - return rc; - - rc = qed_int_set_timer_res(p_hwfn, p_ptt, timer_res, sb_id, false); + rc = qed_int_set_timer_res(p_hwfn, p_ptt, timer_res, + p_cid->sb_igu_id, false); if (rc) goto out; - address = BAR0_MAP_REG_USDM_RAM + USTORM_ETH_QUEUE_ZONE_OFFSET(fw_qid); + address = BAR0_MAP_REG_USDM_RAM + + USTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); rc = qed_set_coalesce(p_hwfn, p_ptt, address, ð_qzone, sizeof(struct ustorm_eth_queue_zone), timeset); if (rc) goto out; - p_hwfn->cdev->rx_coalesce_usecs = coalesce; out: return rc; } -int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u16 qid, u16 sb_id) +int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid) { struct xstorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; - u16 fw_qid = 0; u32 address; int rc; @@ -3766,22 +3797,16 @@ int 
qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); - if (rc) - return rc; - - rc = qed_int_set_timer_res(p_hwfn, p_ptt, timer_res, sb_id, true); + rc = qed_int_set_timer_res(p_hwfn, p_ptt, timer_res, + p_cid->sb_igu_id, true); if (rc) goto out; - address = BAR0_MAP_REG_XSDM_RAM + XSTORM_ETH_QUEUE_ZONE_OFFSET(fw_qid); + address = BAR0_MAP_REG_XSDM_RAM + + XSTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); rc = qed_set_coalesce(p_hwfn, p_ptt, address, ð_qzone, sizeof(struct xstorm_eth_queue_zone), timeset); - if (rc) - goto out; - - p_hwfn->cdev->tx_coalesce_usecs = coalesce; out: return rc; } diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h index 1f1df1bf127c..e6b3c83c5db8 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -443,38 +443,23 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u16 id, bool is_vf); /** - * @brief qed_set_rxq_coalesce - Configure coalesce parameters for an Rx queue - * The fact that we can configure coalescing to up to 511, but on varying - * accuracy [the bigger the value the less accurate] up to a mistake of 3usec - * for the highest values. - * - * @param p_hwfn - * @param p_ptt - * @param coalesce - Coalesce value in micro seconds. - * @param qid - Queue index. - * @param qid - SB Id - * - * @return int - */ -int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u16 qid, u16 sb_id); - -/** - * @brief qed_set_txq_coalesce - Configure coalesce parameters for a Tx queue - * While the API allows setting coalescing per-qid, all tx queues sharing a - * SB should be in same range [i.e., either 0-0x7f, 0x80-0xff or 0x100-0x1ff] + * @brief qed_set_queue_coalesce - Configure coalesce parameters for Rx or + * Tx queue. We can configure coalescing to up to 511, but on + * varying accuracy [the bigger the value the less accurate] up to a mistake + * of 3usec for the highest values. + * While the API allows setting coalescing per-qid, all queues sharing a SB + * should be in same range [i.e., either 0-0x7f, 0x80-0xff or 0x100-0x1ff] * otherwise configuration would break. * - * @param p_hwfn - * @param p_ptt - * @param coalesce - Coalesce value in micro seconds. - * @param qid - Queue index. - * @param qid - SB Id + * @param rx_coal - Rx Coalesce value in micro seconds. + * @param tx_coal - TX Coalesce value in micro seconds. 
+ * @param p_handle * * @return int - */ -int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u16 qid, u16 sb_id); + **/ +int +qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle); + const char *qed_hw_get_resc_name(enum qed_resources res_id); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h index f8f09aadced7..60ea72ce3e2c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h @@ -400,4 +400,11 @@ qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn, u8 qed_mcast_bin_from_mac(u8 *mac); +int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid); + +int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u16 coalesce, struct qed_queue_cid *p_cid); #endif /* _QED_L2_H */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 0a06683abfa0..448810a235b8 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1575,29 +1575,9 @@ static void qed_get_coalesce(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal) } static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u16 qid, u16 sb_id) + void *handle) { - struct qed_hwfn *hwfn; - struct qed_ptt *ptt; - int hwfn_index; - int status = 0; - - hwfn_index = qid % cdev->num_hwfns; - hwfn = &cdev->hwfns[hwfn_index]; - ptt = qed_ptt_acquire(hwfn); - if (!ptt) - return -EAGAIN; - - status = qed_set_rxq_coalesce(hwfn, ptt, rx_coal, - qid / cdev->num_hwfns, sb_id); - if (status) - goto out; - status = qed_set_txq_coalesce(hwfn, ptt, tx_coal, - qid / cdev->num_hwfns, sb_id); -out: - qed_ptt_release(hwfn, ptt); - - return status; + return qed_set_queue_coalesce(rx_coal, tx_coal, handle); } static int qed_set_led(struct qed_dev *cdev, enum qed_led_mode mode) diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 2cfd3bd9a031..5feef783623b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3400,6 +3400,86 @@ static void qed_iov_vf_mbx_release(struct qed_hwfn *p_hwfn, length, status); } +static void qed_iov_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *vf) +{ + struct qed_iov_vf_mbx *mbx = &vf->vf_mbx; + struct vfpf_update_coalesce *req; + u8 status = PFVF_STATUS_FAILURE; + struct qed_queue_cid *p_cid; + u16 rx_coal, tx_coal; + int rc = 0, i; + u16 qid; + + req = &mbx->req_virt->update_coalesce; + + rx_coal = req->rx_coal; + tx_coal = req->tx_coal; + qid = req->qid; + + if (!qed_iov_validate_rxq(p_hwfn, vf, qid, + QED_IOV_VALIDATE_Q_ENABLE) && rx_coal) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Rx queue_id = %d\n", + vf->abs_vf_id, qid); + goto out; + } + + if (!qed_iov_validate_txq(p_hwfn, vf, qid, + QED_IOV_VALIDATE_Q_ENABLE) && tx_coal) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Tx queue_id = %d\n", + vf->abs_vf_id, qid); + goto out; + } + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Setting coalesce for VF rx_coal = %d, tx_coal = %d at queue = %d\n", + vf->abs_vf_id, rx_coal, tx_coal, qid); + + if (rx_coal) { + p_cid = qed_iov_get_vf_rx_queue_cid(&vf->vf_queues[qid]); + + rc = qed_set_rxq_coalesce(p_hwfn, p_ptt, rx_coal, p_cid); + if (rc) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Unable to set rx queue = %d coalesce\n", + 
vf->abs_vf_id, vf->vf_queues[qid].fw_rx_qid); + goto out; + } + } + + if (tx_coal) { + struct qed_vf_queue *p_queue = &vf->vf_queues[qid]; + + for (i = 0; i < MAX_QUEUES_PER_QZONE; i++) { + if (!p_queue->cids[i].p_cid) + continue; + + if (!p_queue->cids[i].b_is_tx) + continue; + + rc = qed_set_txq_coalesce(p_hwfn, p_ptt, tx_coal, + p_queue->cids[i].p_cid); + + if (rc) { + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "VF[%d]: Unable to set tx queue coalesce\n", + vf->abs_vf_id); + goto out; + } + } + } + + status = PFVF_STATUS_SUCCESS; +out: + qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_COALESCE_UPDATE, + sizeof(struct pfvf_def_resp_tlv), status); +} static int qed_iov_vf_flr_poll_dorq(struct qed_hwfn *p_hwfn, struct qed_vf_info *p_vf, struct qed_ptt *p_ptt) @@ -3725,6 +3805,9 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, case CHANNEL_TLV_UPDATE_TUNN_PARAM: qed_iov_vf_mbx_update_tunn_param(p_hwfn, p_ptt, p_vf); break; + case CHANNEL_TLV_COALESCE_UPDATE: + qed_iov_vf_pf_set_coalesce(p_hwfn, p_ptt, p_vf); + break; } } else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) { DP_VERBOSE(p_hwfn, QED_MSG_IOV, diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 1926d1ed439f..0a7bbc0f19b0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -1343,6 +1343,50 @@ exit: return rc; } +int +qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, + u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct vfpf_update_coalesce *req; + struct pfvf_def_resp_tlv *resp; + int rc; + + /* clear mailbox and prep header tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_COALESCE_UPDATE, sizeof(*req)); + + req->rx_coal = rx_coal; + req->tx_coal = tx_coal; + req->qid = p_cid->rel.queue_id; + + DP_VERBOSE(p_hwfn, + QED_MSG_IOV, + "Setting coalesce rx_coal = %d, tx_coal = %d at queue = %d\n", + rx_coal, tx_coal, req->qid); + + /* add list termination tlv */ + qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + resp = &p_iov->pf2vf_reply->default_resp; + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) + goto exit; + + if (rx_coal) + p_hwfn->cdev->rx_coalesce_usecs = rx_coal; + + if (tx_coal) + p_hwfn->cdev->tx_coalesce_usecs = tx_coal; + +exit: + qed_vf_pf_req_end(p_hwfn, rc); + return rc; +} + u16 qed_vf_get_igu_sb_id(struct qed_hwfn *p_hwfn, u16 sb_id) { struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h index 34d9b882a780..2d9fdd62f56d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.h +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -497,6 +497,13 @@ struct tlv_buffer_size { u8 tlv_buffer[TLV_BUFFER_SIZE]; }; +struct vfpf_update_coalesce { + struct vfpf_first_tlv first_tlv; + u16 rx_coal; + u16 tx_coal; + u16 qid; + u8 padding[2]; +}; union vfpf_tlvs { struct vfpf_first_tlv first_tlv; struct vfpf_acquire_tlv acquire; @@ -509,6 +516,7 @@ union vfpf_tlvs { struct vfpf_vport_update_tlv vport_update; struct vfpf_ucast_filter_tlv ucast_filter; struct vfpf_update_tunn_param_tlv tunn_param_update; + struct vfpf_update_coalesce update_coalesce; struct channel_list_end_tlv list_end; struct tlv_buffer_size tlv_buf_size; }; @@ -624,7 +632,7 @@ enum { CHANNEL_TLV_VPORT_UPDATE_ACCEPT_ANY_VLAN, CHANNEL_TLV_VPORT_UPDATE_SGE_TPA, 
CHANNEL_TLV_UPDATE_TUNN_PARAM, - CHANNEL_TLV_RESERVED, + CHANNEL_TLV_COALESCE_UPDATE, CHANNEL_TLV_QID, CHANNEL_TLV_MAX, @@ -677,6 +685,20 @@ struct qed_vf_iov { bool b_doorbell_bar; }; +/** + * @brief VF - Set Rx/Tx coalesce per VF's relative queue. + * Coalesce value '0' will omit the configuration. + * + * @param p_hwfn + * @param rx_coal - coalesce value in micro second for rx queue + * @param tx_coal - coalesce value in micro second for tx queue + * @param p_cid - queue cid + * + **/ +int qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, + u16 rx_coal, + u16 tx_coal, struct qed_queue_cid *p_cid); + #ifdef CONFIG_QED_SRIOV /** * @brief Read the VF bulletin and act on it if needed diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 55fa2ef19d8a..76e0b132e8cc 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -718,8 +718,9 @@ static int qede_set_coalesce(struct net_device *dev, struct ethtool_coalesce *coal) { struct qede_dev *edev = netdev_priv(dev); + struct qede_fastpath *fp; int i, rc = 0; - u16 rxc, txc, sb_id; + u16 rxc, txc; if (!netif_running(dev)) { DP_INFO(edev, "Interface is down\n"); @@ -730,21 +731,36 @@ static int qede_set_coalesce(struct net_device *dev, coal->tx_coalesce_usecs > QED_COALESCE_MAX) { DP_INFO(edev, "Can't support requested %s coalesce value [max supported value %d]\n", - coal->rx_coalesce_usecs > QED_COALESCE_MAX ? "rx" - : "tx", - QED_COALESCE_MAX); + coal->rx_coalesce_usecs > QED_COALESCE_MAX ? "rx" : + "tx", QED_COALESCE_MAX); return -EINVAL; } rxc = (u16)coal->rx_coalesce_usecs; txc = (u16)coal->tx_coalesce_usecs; for_each_queue(i) { - sb_id = edev->fp_array[i].sb_info->igu_sb_id; - rc = edev->ops->common->set_coalesce(edev->cdev, rxc, txc, - (u16)i, sb_id); - if (rc) { - DP_INFO(edev, "Set coalesce error, rc = %d\n", rc); - return rc; + fp = &edev->fp_array[i]; + + if (edev->fp_array[i].type & QEDE_FASTPATH_RX) { + rc = edev->ops->common->set_coalesce(edev->cdev, + rxc, 0, + fp->rxq->handle); + if (rc) { + DP_INFO(edev, + "Set RX coalesce error, rc = %d\n", rc); + return rc; + } + } + + if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { + rc = edev->ops->common->set_coalesce(edev->cdev, + 0, txc, + fp->txq->handle); + if (rc) { + DP_INFO(edev, + "Set TX coalesce error, rc = %d\n", rc); + return rc; + } } } @@ -1758,6 +1774,8 @@ static const struct ethtool_ops qede_vf_ethtool_ops = { .get_msglevel = qede_get_msglevel, .set_msglevel = qede_set_msglevel, .get_link = qede_get_link, + .get_coalesce = qede_get_coalesce, + .set_coalesce = qede_set_coalesce, .get_ringparam = qede_get_ringparam, .set_ringparam = qede_set_ringparam, .get_strings = qede_get_strings, diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 9f3276271b02..4d59ca16134c 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -694,8 +694,8 @@ struct qed_common_ops { * * @return 0 on success, error otherwise. 
*/ - int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u16 qid, u16 sb_id); + int (*set_coalesce)(struct qed_dev *cdev, + u16 rx_coal, u16 tx_coal, void *handle); /** * @brief set_led - Configure LED mode -- cgit v1.2.3 From bf5a94bfe26a9fcd4af91ae6bccd4f3d600d2262 Mon Sep 17 00:00:00 2001 From: Rahul Verma Date: Wed, 26 Jul 2017 06:07:14 -0700 Subject: qed: Read per queue coalesce from hardware Retrieve the actual coalesce value from hardware for every Rx/Tx queue, instead of Rx/Tx coalesce value cached during set coalesce. Signed-off-by: Rahul Verma Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev_api.h | 26 ++++-- drivers/net/ethernet/qlogic/qed/qed_l2.c | 115 ++++++++++++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_l2.h | 11 ++- drivers/net/ethernet/qlogic/qed/qed_main.c | 7 -- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 74 +++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_sriov.h | 3 + drivers/net/ethernet/qlogic/qed/qed_vf.c | 31 +++++++ drivers/net/ethernet/qlogic/qed/qed_vf.h | 29 +++++- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 47 ++++++++-- include/linux/qed/qed_eth_if.h | 1 + include/linux/qed/qed_if.h | 11 +-- 11 files changed, 324 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h index e6b3c83c5db8..defdda1ffaa2 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -443,13 +443,25 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u16 id, bool is_vf); /** - * @brief qed_set_queue_coalesce - Configure coalesce parameters for Rx or - * Tx queue. We can configure coalescing to up to 511, but on - * varying accuracy [the bigger the value the less accurate] up to a mistake - * of 3usec for the highest values. - * While the API allows setting coalescing per-qid, all queues sharing a SB - * should be in same range [i.e., either 0-0x7f, 0x80-0xff or 0x100-0x1ff] - * otherwise configuration would break. + * @brief qed_get_queue_coalesce - Retrieve coalesce value for a given queue. + * + * @param p_hwfn + * @param p_coal - store coalesce value read from the hardware. + * @param p_handle + * + * @return int + **/ +int qed_get_queue_coalesce(struct qed_hwfn *p_hwfn, u16 *coal, void *handle); + +/** + * @brief qed_set_queue_coalesce - Configure coalesce parameters for Rx and + * Tx queue. The fact that we can configure coalescing to up to 511, but on + * varying accuracy [the bigger the value the less accurate] up to a mistake + * of 3usec for the highest values. + * While the API allows setting coalescing per-qid, all queues sharing a SB + * should be in same range [i.e., either 0-0x7f, 0x80-0xff or 0x100-0x1ff] + * otherwise configuration would break. + * * * @param rx_coal - Rx Coalesce value in micro seconds. * @param tx_coal - TX Coalesce value in micro seconds. 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 0ba5ec8a9814..9a1645852015 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -2047,6 +2047,106 @@ qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, return qed_spq_post(p_hwfn, p_ent, NULL); } +int qed_get_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_rx_coal) +{ + u32 coalesce, address, is_valid; + struct cau_sb_entry sb_entry; + u8 timer_res; + int rc; + + rc = qed_dmae_grc2host(p_hwfn, p_ptt, CAU_REG_SB_VAR_MEMORY + + p_cid->sb_igu_id * sizeof(u64), + (u64)(uintptr_t)&sb_entry, 2, 0); + if (rc) { + DP_ERR(p_hwfn, "dmae_grc2host failed %d\n", rc); + return rc; + } + + timer_res = GET_FIELD(sb_entry.params, CAU_SB_ENTRY_TIMER_RES0); + + address = BAR0_MAP_REG_USDM_RAM + + USTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); + coalesce = qed_rd(p_hwfn, p_ptt, address); + + is_valid = GET_FIELD(coalesce, COALESCING_TIMESET_VALID); + if (!is_valid) + return -EINVAL; + + coalesce = GET_FIELD(coalesce, COALESCING_TIMESET_TIMESET); + *p_rx_coal = (u16)(coalesce << timer_res); + + return 0; +} + +int qed_get_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_tx_coal) +{ + u32 coalesce, address, is_valid; + struct cau_sb_entry sb_entry; + u8 timer_res; + int rc; + + rc = qed_dmae_grc2host(p_hwfn, p_ptt, CAU_REG_SB_VAR_MEMORY + + p_cid->sb_igu_id * sizeof(u64), + (u64)(uintptr_t)&sb_entry, 2, 0); + if (rc) { + DP_ERR(p_hwfn, "dmae_grc2host failed %d\n", rc); + return rc; + } + + timer_res = GET_FIELD(sb_entry.params, CAU_SB_ENTRY_TIMER_RES1); + + address = BAR0_MAP_REG_XSDM_RAM + + XSTORM_ETH_QUEUE_ZONE_OFFSET(p_cid->abs.queue_id); + coalesce = qed_rd(p_hwfn, p_ptt, address); + + is_valid = GET_FIELD(coalesce, COALESCING_TIMESET_VALID); + if (!is_valid) + return -EINVAL; + + coalesce = GET_FIELD(coalesce, COALESCING_TIMESET_TIMESET); + *p_tx_coal = (u16)(coalesce << timer_res); + + return 0; +} + +int qed_get_queue_coalesce(struct qed_hwfn *p_hwfn, u16 *p_coal, void *handle) +{ + struct qed_queue_cid *p_cid = handle; + struct qed_ptt *p_ptt; + int rc = 0; + + if (IS_VF(p_hwfn->cdev)) { + rc = qed_vf_pf_get_coalesce(p_hwfn, p_coal, p_cid); + if (rc) + DP_NOTICE(p_hwfn, "Unable to read queue coalescing\n"); + + return rc; + } + + p_ptt = qed_ptt_acquire(p_hwfn); + if (!p_ptt) + return -EAGAIN; + + if (p_cid->b_is_rx) { + rc = qed_get_rxq_coalesce(p_hwfn, p_ptt, p_cid, p_coal); + if (rc) + goto out; + } else { + rc = qed_get_txq_coalesce(p_hwfn, p_ptt, p_cid, p_coal); + if (rc) + goto out; + } + +out: + qed_ptt_release(p_hwfn, p_ptt); + + return rc; +} + static int qed_fill_eth_dev_info(struct qed_dev *cdev, struct qed_dev_eth_info *info) { @@ -2696,6 +2796,20 @@ static int qed_ntuple_arfs_filter_config(struct qed_dev *cdev, void *cookie, return rc; } +static int qed_get_coalesce(struct qed_dev *cdev, u16 *coal, void *handle) +{ + struct qed_queue_cid *p_cid = handle; + struct qed_hwfn *p_hwfn; + int rc; + + p_hwfn = p_cid->p_owner; + rc = qed_get_queue_coalesce(p_hwfn, coal, handle); + if (rc) + DP_NOTICE(p_hwfn, "Unable to read queue calescing\n"); + + return rc; +} + static int qed_fp_cqe_completion(struct qed_dev *dev, u8 rss_id, struct eth_slow_path_rx_cqe *cqe) { @@ -2739,6 +2853,7 @@ static const struct qed_eth_ops qed_eth_ops_pass = { .tunn_config = &qed_tunn_configure, .ntuple_filter_config = 
&qed_ntuple_arfs_filter_config, .configure_arfs_searcher = &qed_configure_arfs_searcher, + .get_coalesce = &qed_get_coalesce, }; const struct qed_eth_ops *qed_get_eth_ops(void) diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h index 60ea72ce3e2c..cc1f248551c9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h @@ -407,4 +407,13 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u16 coalesce, struct qed_queue_cid *p_cid); -#endif /* _QED_L2_H */ + +int qed_get_rxq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_hw_coal); + +int qed_get_txq_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_queue_cid *p_cid, u16 *p_hw_coal); + +#endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 448810a235b8..27832885a87f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1568,12 +1568,6 @@ static int qed_nvm_get_image(struct qed_dev *cdev, enum qed_nvm_images type, return rc; } -static void qed_get_coalesce(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal) -{ - *rx_coal = cdev->rx_coalesce_usecs; - *tx_coal = cdev->tx_coalesce_usecs; -} - static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, void *handle) { @@ -1726,7 +1720,6 @@ const struct qed_common_ops qed_common_ops_pass = { .chain_alloc = &qed_chain_alloc, .chain_free = &qed_chain_free, .nvm_get_image = &qed_nvm_get_image, - .get_coalesce = &qed_get_coalesce, .set_coalesce = &qed_set_coalesce, .set_led = &qed_set_led, .update_drv_state = &qed_update_drv_state, diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index 5feef783623b..3f40b1de7957 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -3400,6 +3400,75 @@ static void qed_iov_vf_mbx_release(struct qed_hwfn *p_hwfn, length, status); } +static void qed_iov_vf_pf_get_coalesce(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + struct qed_vf_info *p_vf) +{ + struct qed_iov_vf_mbx *mbx = &p_vf->vf_mbx; + struct pfvf_read_coal_resp_tlv *p_resp; + struct vfpf_read_coal_req_tlv *req; + u8 status = PFVF_STATUS_FAILURE; + struct qed_vf_queue *p_queue; + struct qed_queue_cid *p_cid; + u16 coal = 0, qid, i; + bool b_is_rx; + int rc = 0; + + mbx->offset = (u8 *)mbx->reply_virt; + req = &mbx->req_virt->read_coal_req; + + qid = req->qid; + b_is_rx = req->is_rx ? 
true : false; + + if (b_is_rx) { + if (!qed_iov_validate_rxq(p_hwfn, p_vf, qid, + QED_IOV_VALIDATE_Q_ENABLE)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Rx queue_id = %d\n", + p_vf->abs_vf_id, qid); + goto send_resp; + } + + p_cid = qed_iov_get_vf_rx_queue_cid(&p_vf->vf_queues[qid]); + rc = qed_get_rxq_coalesce(p_hwfn, p_ptt, p_cid, &coal); + if (rc) + goto send_resp; + } else { + if (!qed_iov_validate_txq(p_hwfn, p_vf, qid, + QED_IOV_VALIDATE_Q_ENABLE)) { + DP_VERBOSE(p_hwfn, QED_MSG_IOV, + "VF[%d]: Invalid Tx queue_id = %d\n", + p_vf->abs_vf_id, qid); + goto send_resp; + } + for (i = 0; i < MAX_QUEUES_PER_QZONE; i++) { + p_queue = &p_vf->vf_queues[qid]; + if ((!p_queue->cids[i].p_cid) || + (!p_queue->cids[i].b_is_tx)) + continue; + + p_cid = p_queue->cids[i].p_cid; + + rc = qed_get_txq_coalesce(p_hwfn, p_ptt, p_cid, &coal); + if (rc) + goto send_resp; + break; + } + } + + status = PFVF_STATUS_SUCCESS; + +send_resp: + p_resp = qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_COALESCE_READ, + sizeof(*p_resp)); + p_resp->coal = coal; + + qed_add_tlv(p_hwfn, &mbx->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + + qed_iov_send_response(p_hwfn, p_ptt, p_vf, sizeof(*p_resp), status); +} + static void qed_iov_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_vf_info *vf) @@ -3450,6 +3519,7 @@ static void qed_iov_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, vf->abs_vf_id, vf->vf_queues[qid].fw_rx_qid); goto out; } + vf->rx_coal = rx_coal; } if (tx_coal) { @@ -3473,6 +3543,7 @@ static void qed_iov_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, goto out; } } + vf->tx_coal = tx_coal; } status = PFVF_STATUS_SUCCESS; @@ -3808,6 +3879,9 @@ static void qed_iov_process_mbx_req(struct qed_hwfn *p_hwfn, case CHANNEL_TLV_COALESCE_UPDATE: qed_iov_vf_pf_set_coalesce(p_hwfn, p_ptt, p_vf); break; + case CHANNEL_TLV_COALESCE_READ: + qed_iov_vf_pf_get_coalesce(p_hwfn, p_ptt, p_vf); + break; } } else if (qed_iov_tlv_supported(mbx->first_tlv.tl.type)) { DP_VERBOSE(p_hwfn, QED_MSG_IOV, diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h index c2e44bce398c..3955929ba892 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h @@ -217,6 +217,9 @@ struct qed_vf_info { u8 num_rxqs; u8 num_txqs; + u16 rx_coal; + u16 tx_coal; + u8 num_sbs; u8 num_mac_filters; diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 0a7bbc0f19b0..91b5e9f02a62 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -1343,6 +1343,37 @@ exit: return rc; } +int qed_vf_pf_get_coalesce(struct qed_hwfn *p_hwfn, + u16 *p_coal, struct qed_queue_cid *p_cid) +{ + struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_read_coal_resp_tlv *resp; + struct vfpf_read_coal_req_tlv *req; + int rc; + + /* clear mailbox and prep header tlv */ + req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_COALESCE_READ, sizeof(*req)); + req->qid = p_cid->rel.queue_id; + req->is_rx = p_cid->b_is_rx ? 
1 : 0; + + qed_add_tlv(p_hwfn, &p_iov->offset, CHANNEL_TLV_LIST_END, + sizeof(struct channel_list_end_tlv)); + resp = &p_iov->pf2vf_reply->read_coal_resp; + + rc = qed_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); + if (rc) + goto exit; + + if (resp->hdr.status != PFVF_STATUS_SUCCESS) + goto exit; + + *p_coal = resp->coal; +exit: + qed_vf_pf_req_end(p_hwfn, rc); + + return rc; +} + int qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid) diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h index 2d9fdd62f56d..97d44dfb38ca 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.h +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -504,6 +504,20 @@ struct vfpf_update_coalesce { u16 qid; u8 padding[2]; }; + +struct vfpf_read_coal_req_tlv { + struct vfpf_first_tlv first_tlv; + u16 qid; + u8 is_rx; + u8 padding[5]; +}; + +struct pfvf_read_coal_resp_tlv { + struct pfvf_tlv hdr; + u16 coal; + u8 padding[6]; +}; + union vfpf_tlvs { struct vfpf_first_tlv first_tlv; struct vfpf_acquire_tlv acquire; @@ -517,7 +531,7 @@ union vfpf_tlvs { struct vfpf_ucast_filter_tlv ucast_filter; struct vfpf_update_tunn_param_tlv tunn_param_update; struct vfpf_update_coalesce update_coalesce; - struct channel_list_end_tlv list_end; + struct vfpf_read_coal_req_tlv read_coal_req; struct tlv_buffer_size tlv_buf_size; }; @@ -527,6 +541,7 @@ union pfvf_tlvs { struct tlv_buffer_size tlv_buf_size; struct pfvf_start_queue_resp_tlv queue_start; struct pfvf_update_tunn_param_tlv tunn_param_resp; + struct pfvf_read_coal_resp_tlv read_coal_resp; }; enum qed_bulletin_bit { @@ -634,6 +649,7 @@ enum { CHANNEL_TLV_UPDATE_TUNN_PARAM, CHANNEL_TLV_COALESCE_UPDATE, CHANNEL_TLV_QID, + CHANNEL_TLV_COALESCE_READ, CHANNEL_TLV_MAX, /* Required for iterating over vport-update tlvs. @@ -699,6 +715,17 @@ int qed_vf_pf_set_coalesce(struct qed_hwfn *p_hwfn, u16 rx_coal, u16 tx_coal, struct qed_queue_cid *p_cid); +/** + * @brief VF - Get coalesce per VF's relative queue. + * + * @param p_hwfn + * @param p_coal - coalesce value in microseconds for VF queues.
+ * @param p_cid - queue cid + * + **/ +int qed_vf_pf_get_coalesce(struct qed_hwfn *p_hwfn, + u16 *p_coal, struct qed_queue_cid *p_cid); + #ifdef CONFIG_QED_SRIOV /** * @brief Read the VF bulletin and act on it if needed diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 76e0b132e8cc..dae741270022 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -702,16 +702,53 @@ static u32 qede_get_link(struct net_device *dev) static int qede_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal) { + void *rx_handle = NULL, *tx_handle = NULL; struct qede_dev *edev = netdev_priv(dev); - u16 rxc, txc; + u16 rx_coal, tx_coal, i; + int rc = 0; + struct qede_fastpath *fp; + + rx_coal = QED_DEFAULT_RX_USECS; + tx_coal = QED_DEFAULT_TX_USECS; memset(coal, 0, sizeof(struct ethtool_coalesce)); - edev->ops->common->get_coalesce(edev->cdev, &rxc, &txc); - coal->rx_coalesce_usecs = rxc; - coal->tx_coalesce_usecs = txc; + __qede_lock(edev); + if (edev->state == QEDE_STATE_OPEN) { + for_each_queue(i) { + fp = &edev->fp_array[i]; + + if (fp->type & QEDE_FASTPATH_RX) { + rx_handle = fp->rxq->handle; + break; + } + } - return 0; + rc = edev->ops->get_coalesce(edev->cdev, &rx_coal, rx_handle); + if (rc) { + DP_INFO(edev, "Read Rx coalesce error\n"); + goto out; + } + + for_each_queue(i) { + fp = &edev->fp_array[i]; + if (fp->type & QEDE_FASTPATH_TX) { + tx_handle = fp->txq->handle; + break; + } + } + + rc = edev->ops->get_coalesce(edev->cdev, &tx_coal, tx_handle); + if (rc) + DP_INFO(edev, "Read Tx coalesce error\n"); + } + +out: + __qede_unlock(edev); + + coal->rx_coalesce_usecs = rx_coal; + coal->tx_coalesce_usecs = tx_coal; + + return rc; } static int qede_set_coalesce(struct net_device *dev, diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 0eef0a2b1901..d60de4a39810 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -323,6 +323,7 @@ struct qed_eth_ops { int (*configure_arfs_searcher)(struct qed_dev *cdev, bool en_searcher); + int (*get_coalesce)(struct qed_dev *cdev, u16 *coal, void *handle); }; const struct qed_eth_ops *qed_get_eth_ops(void); diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 4d59ca16134c..2b4720bb8b40 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -186,6 +186,7 @@ enum qed_led_mode { #define QED_COALESCE_MAX 0xFF #define QED_DEFAULT_RX_USECS 12 +#define QED_DEFAULT_TX_USECS 48 /* forward */ struct qed_dev; @@ -673,16 +674,6 @@ struct qed_common_ops { int (*nvm_get_image)(struct qed_dev *cdev, enum qed_nvm_images type, u8 *buf, u16 len); -/** - * @brief get_coalesce - Get coalesce parameters in usec - * - * @param cdev - * @param rx_coal - Rx coalesce value in usec - * @param tx_coal - Tx coalesce value in usec - * - */ - void (*get_coalesce)(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal); - /** * @brief set_coalesce - Configure Rx coalesce value in usec * -- cgit v1.2.3 From 41822878b2ff40a1f57be81f1a1f0f040847b912 Mon Sep 17 00:00:00 2001 From: Rahul Verma Date: Wed, 26 Jul 2017 06:07:15 -0700 Subject: qed: enhanced per queue max coalesce value. Maximum coalesce per Rx/Tx queue is extended from 255 to 511. Signed-off-by: Rahul Verma Signed-off-by: Yuval Mintz Signed-off-by: David S.
Miller --- include/linux/qed/qed_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 2b4720bb8b40..cc646ca97974 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -184,7 +184,7 @@ enum qed_led_mode { #define DIRECT_REG_RD(reg_addr) readl((void __iomem *)(reg_addr)) -#define QED_COALESCE_MAX 0xFF +#define QED_COALESCE_MAX 0x1FF #define QED_DEFAULT_RX_USECS 12 #define QED_DEFAULT_TX_USECS 48 -- cgit v1.2.3 From ad84dad2160d5f36bb471b391462d651c887d693 Mon Sep 17 00:00:00 2001 From: "Amrani, Ram" Date: Mon, 26 Jun 2017 19:05:05 +0300 Subject: RDMA/qedr: notify user application if DPM is supported Direct Packet Mode support may be disabled, e.g., due to limited resources. Notifying the user application prevents wasting cycles on attempting to send these kinds of packets. Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/qedr.h | 2 ++ drivers/infiniband/hw/qedr/verbs.c | 1 + include/uapi/rdma/qedr-abi.h | 1 + 4 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 0ae30f5c8cbc..199b6edbef92 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -777,6 +777,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, if (rc) goto init_err; + dev->user_dpm_enabled = dev_info.user_dpm_enabled; dev->num_hwfns = dev_info.common.num_hwfns; dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 392e76e26840..b2bb42e2805d 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -162,6 +162,8 @@ struct qedr_dev { struct qedr_qp *gsi_qp; unsigned long enet_state; + + u8 user_dpm_enabled; }; #define QEDR_MAX_SQ_PBL (0x8000) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 2ae71b8f1ba8..4322ee00498e 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -376,6 +376,7 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, memset(&uresp, 0, sizeof(uresp)); + uresp.dpm_enabled = dev->user_dpm_enabled; uresp.db_pa = ctx->dpi_phys_addr; uresp.db_size = ctx->dpi_size; uresp.max_send_wr = dev->attr.max_sqe; diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 75c270d839c8..2684004ec4fd 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -49,6 +49,7 @@ struct qedr_alloc_ucontext_resp { __u32 sges_per_recv_wr; __u32 sges_per_srq_wr; __u32 max_cqes; + __u8 dpm_enabled; }; struct qedr_alloc_pd_ureq { -- cgit v1.2.3 From 67cbe3532c2cd84303a2073cedad6b8bcad13be3 Mon Sep 17 00:00:00 2001 From: "Amrani, Ram" Date: Mon, 26 Jun 2017 19:05:06 +0300 Subject: RDMA/qedr: notify user application of supported WIDs The number of supported WIDs, if they are supported at all, can be limited due to resources. Notifying the user-space application of the number of available WIDs allows it to utilize them correctly.
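To illustrate how the two new ABI fields might be consumed together, here is a hedged user-space sketch (illustrative only; 'uresp' and setup_dpm_wids() are assumed names, dpm_enabled comes from the patch above, and wids_enabled/wid_count come from the diff below):

    /* Hypothetical provider-side check after allocating a ucontext;
     * 'uresp' is assumed to hold the struct qedr_alloc_ucontext_resp
     * copied back from the kernel.
     */
    if (uresp.dpm_enabled && uresp.wids_enabled && uresp.wid_count > 0)
            setup_dpm_wids(uresp.wid_count); /* assumed helper: enable the
                                              * Direct Packet Mode fast path */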
Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 2 ++ include/uapi/rdma/qedr-abi.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 4322ee00498e..9ee2dce3e5bb 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -377,6 +377,8 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, memset(&uresp, 0, sizeof(uresp)); uresp.dpm_enabled = dev->user_dpm_enabled; + uresp.wids_enabled = 1; + uresp.wid_count = oparams.wid_count; uresp.db_pa = ctx->dpi_phys_addr; uresp.db_size = ctx->dpi_size; uresp.max_send_wr = dev->attr.max_sqe; diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 2684004ec4fd..54b64357ab24 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -50,6 +50,8 @@ struct qedr_alloc_ucontext_resp { __u32 sges_per_srq_wr; __u32 max_cqes; __u8 dpm_enabled; + __u8 wids_enabled; + __u16 wid_count; }; struct qedr_alloc_pd_ureq { -- cgit v1.2.3 From fc2237a724a9e448599076d7d23497f51e2f7441 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Mon, 24 Jul 2017 09:46:18 -0700 Subject: HID: introduce hid_is_using_ll_driver Although HID itself is transport-agnostic, occasionally a driver may want to interact with the low-level transport that a device is connected through. To do this, we need to know what kind of bus is in use. The first guess may be to look at the 'bus' field of the 'struct hid_device', but this field may be emulated in some cases (e.g. uhid). Ideally, we can check which ll_driver a device is using instead. This patch introduces a 'hid_is_using_ll_driver' function and makes the 'struct hid_ll_driver' of the four most common transports accessible through hid.h.
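For context, the kind of transport check this enables looks roughly like the following sketch (the surrounding driver code is assumed; hid_is_using_ll_driver() and the exported usb_hid_driver come from the diffs below):

    /* Sketch: branch on the actual transport instead of hdev->bus,
     * which may be emulated (e.g. by uhid).
     */
    if (hid_is_using_ll_driver(hdev, &usb_hid_driver)) {
            struct usb_interface *intf = to_usb_interface(hdev->dev.parent);

            /* ... USB-specific handling using 'intf' ... */
    }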
Signed-off-by: Jason Gerecke Acked-By: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 3 ++- drivers/hid/uhid.c | 3 ++- drivers/hid/usbhid/hid-core.c | 3 ++- include/linux/hid.h | 11 +++++++++++ net/bluetooth/hidp/core.c | 3 ++- 5 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index 046f692fd0a2..77396145d2d0 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -780,7 +780,7 @@ static int i2c_hid_power(struct hid_device *hid, int lvl) return 0; } -static struct hid_ll_driver i2c_hid_ll_driver = { +struct hid_ll_driver i2c_hid_ll_driver = { .parse = i2c_hid_parse, .start = i2c_hid_start, .stop = i2c_hid_stop, @@ -790,6 +790,7 @@ static struct hid_ll_driver i2c_hid_ll_driver = { .output_report = i2c_hid_output_report, .raw_request = i2c_hid_raw_request, }; +EXPORT_SYMBOL_GPL(i2c_hid_ll_driver); static int i2c_hid_init_irq(struct i2c_client *client) { diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 7f8ff39ed44b..6f819f144cb4 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -369,7 +369,7 @@ static int uhid_hid_output_report(struct hid_device *hid, __u8 *buf, return uhid_hid_output_raw(hid, buf, count, HID_OUTPUT_REPORT); } -static struct hid_ll_driver uhid_hid_driver = { +struct hid_ll_driver uhid_hid_driver = { .start = uhid_hid_start, .stop = uhid_hid_stop, .open = uhid_hid_open, @@ -378,6 +378,7 @@ static struct hid_ll_driver uhid_hid_driver = { .raw_request = uhid_hid_raw_request, .output_report = uhid_hid_output_report, }; +EXPORT_SYMBOL_GPL(uhid_hid_driver); #ifdef CONFIG_COMPAT diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 76013eb5cb7f..e1047ad0d59b 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1261,7 +1261,7 @@ static int usbhid_idle(struct hid_device *hid, int report, int idle, return hid_set_idle(dev, ifnum, report, idle); } -static struct hid_ll_driver usb_hid_driver = { +struct hid_ll_driver usb_hid_driver = { .parse = usbhid_parse, .start = usbhid_start, .stop = usbhid_stop, @@ -1274,6 +1274,7 @@ static struct hid_ll_driver usb_hid_driver = { .output_report = usbhid_output_report, .idle = usbhid_idle, }; +EXPORT_SYMBOL_GPL(usb_hid_driver); static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *id) { diff --git a/include/linux/hid.h b/include/linux/hid.h index 5006f9b5d837..3853408daf7f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -777,6 +777,17 @@ struct hid_ll_driver { int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); }; +extern struct hid_ll_driver i2c_hid_ll_driver; +extern struct hid_ll_driver hidp_hid_driver; +extern struct hid_ll_driver uhid_hid_driver; +extern struct hid_ll_driver usb_hid_driver; + +static inline bool hid_is_using_ll_driver(struct hid_device *hdev, + struct hid_ll_driver *driver) +{ + return hdev->ll_driver == driver; +} + #define PM_HINT_FULLON 1<<5 #define PM_HINT_NORMAL 1<<1 diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 002743ea509c..8112893037bd 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -734,7 +734,7 @@ static void hidp_stop(struct hid_device *hid) hid->claimed = 0; } -static struct hid_ll_driver hidp_hid_driver = { +struct hid_ll_driver hidp_hid_driver = { .parse = hidp_parse, .start = hidp_start, .stop = hidp_stop, @@ -743,6 +743,7 @@ static struct hid_ll_driver hidp_hid_driver 
= { .raw_request = hidp_raw_request, .output_report = hidp_output_report, }; +EXPORT_SYMBOL_GPL(hidp_hid_driver); /* This function sets up the hid device. It does not add it to the HID system. That is done in hidp_add_connection(). */ -- cgit v1.2.3 From ee011c5b4c43b215a985c7d4368f77a572c5c216 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 20 Jul 2017 14:34:52 +0200 Subject: soc: renesas: Add r8a77995 SYSC PM Domain Binding Definitions Add power domain indices for R-Car D3. Signed-off-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/dt-bindings/power/r8a77995-sysc.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/dt-bindings/power/r8a77995-sysc.h (limited to 'include') diff --git a/include/dt-bindings/power/r8a77995-sysc.h b/include/dt-bindings/power/r8a77995-sysc.h new file mode 100644 index 000000000000..09d0ed575b73 --- /dev/null +++ b/include/dt-bindings/power/r8a77995-sysc.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2017 Glider bvba + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + */ +#ifndef __DT_BINDINGS_POWER_R8A77995_SYSC_H__ +#define __DT_BINDINGS_POWER_R8A77995_SYSC_H__ + +/* + * These power domain indices match the numbers of the interrupt bits + * representing the power areas in the various Interrupt Registers + * (e.g. SYSCISR, Interrupt Status Register) + */ + +#define R8A77995_PD_CA53_CPU0 5 +#define R8A77995_PD_CA53_SCU 21 + +/* Always-on power area */ +#define R8A77995_PD_ALWAYS_ON 32 + +#endif /* __DT_BINDINGS_POWER_R8A77995_SYSC_H__ */ -- cgit v1.2.3 From 649ea4d5a624f061a111b1f1cb0e47cfdc3ac21b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 27 Jul 2017 15:56:53 -0500 Subject: objtool: Assume unannotated UD2 instructions are dead ends Arnd reported some false positive warnings with GCC 7: drivers/hid/wacom_wac.o: warning: objtool: wacom_bpt3_touch()+0x2a5: stack state mismatch: cfa1=7+8 cfa2=6+16 drivers/iio/adc/vf610_adc.o: warning: objtool: vf610_adc_calculate_rates() falls through to next function vf610_adc_sample_set() drivers/pwm/pwm-hibvt.o: warning: objtool: hibvt_pwm_get_state() falls through to next function hibvt_pwm_remove() drivers/pwm/pwm-mediatek.o: warning: objtool: mtk_pwm_config() falls through to next function mtk_pwm_enable() drivers/spi/spi-bcm2835.o: warning: objtool: .text: unexpected end of section drivers/spi/spi-bcm2835aux.o: warning: objtool: .text: unexpected end of section drivers/watchdog/digicolor_wdt.o: warning: objtool: dc_wdt_get_timeleft() falls through to next function dc_wdt_restart() When GCC 7 detects a potential divide-by-zero condition, it sometimes inserts a UD2 instruction for the case where the divisor is zero, instead of letting the hardware trap on the divide instruction. Objtool doesn't consider UD2 to be fatal unless it's annotated with unreachable(). So it considers the GCC-generated UD2 to be non-fatal, and it tries to follow the control flow past the UD2 and gets confused. Previously, objtool *did* assume UD2 was always a dead end. That changed with the following commit: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends") The motivation behind that change was that Peter was planning on using UD2 for __WARN(), which is *not* a dead end. 
However, it turns out that some emulators rely on UD2 being fatal, so he ended up using 'ud0' instead: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0") For GCC 4.5+, it should be safe to go back to the previous assumption that UD2 is fatal, even when it's not annotated with unreachable(). But for pre-4.5 versions of GCC, the unreachable() macro isn't supported, so such cases of UD2 need to be explicitly annotated as reachable. Reported-by: Arnd Bergmann Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: d1091c7fa3d5 ("objtool: Improve detection of BUG() and other dead ends") Link: http://lkml.kernel.org/r/e57fa9dfede25f79487da8126ee9cdf7b856db65.1501188854.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc.h | 16 ------------ include/linux/compiler.h | 25 +++++++++++++++++- tools/objtool/arch.h | 5 ++-- tools/objtool/arch/x86/decode.c | 17 ++++++++---- tools/objtool/check.c | 57 +++++++++++++++++++++++++++++++++++++++-- 5 files changed, 94 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 17cbe3cffc0e..dd4b4c59c2f0 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -128,22 +128,6 @@ #define __always_unused __attribute__((unused)) #define __mode(x) __attribute__((mode(x))) -#ifdef CONFIG_STACK_VALIDATION -#define annotate_unreachable() ({ \ - asm("%c0:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long %c0b - .\n\t" \ - ".popsection\n\t" : : "i" (__LINE__)); \ -}) -#define ASM_UNREACHABLE \ - "999:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" -#else -#define annotate_unreachable() -#endif - /* gcc version specific checks */ #if GCC_VERSION < 30200 diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 641f5912d75f..70f070ddd7c7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -185,11 +185,34 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* Unreachable code */ +#ifdef CONFIG_STACK_VALIDATION +#define annotate_reachable() ({ \ + asm("%c0:\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__LINE__)); \ +}) +#define annotate_unreachable() ({ \ + asm("%c0:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long %c0b - .\n\t" \ + ".popsection\n\t" : : "i" (__LINE__)); \ +}) +#define ASM_UNREACHABLE \ + "999:\n\t" \ + ".pushsection .discard.unreachable\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" +#else +#define annotate_reachable() +#define annotate_unreachable() +#endif + #ifndef ASM_UNREACHABLE # define ASM_UNREACHABLE #endif #ifndef unreachable -# define unreachable() do { } while (1) +# define unreachable() do { annotate_reachable(); do { } while (1); } while (0) #endif /* diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index 21aeca874edb..b0d7dc3d71b5 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -31,8 +31,9 @@ #define INSN_RETURN 6 #define INSN_CONTEXT_SWITCH 7 #define INSN_STACK 8 -#define INSN_NOP 9 -#define INSN_OTHER 10 +#define INSN_BUG 9 +#define INSN_NOP 10 +#define INSN_OTHER 11 #define INSN_LAST INSN_OTHER enum op_dest_type { diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index a36c2eba64e7..e4b400b67e41 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -382,20 +382,27 @@ int arch_decode_instruction(struct 
elf *elf, struct section *sec, case 0x0f: - if (op2 >= 0x80 && op2 <= 0x8f) + if (op2 >= 0x80 && op2 <= 0x8f) { + *type = INSN_JUMP_CONDITIONAL; - else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 || - op2 == 0x35) + + } else if (op2 == 0x05 || op2 == 0x07 || op2 == 0x34 || + op2 == 0x35) { /* sysenter, sysret */ *type = INSN_CONTEXT_SWITCH; - else if (op2 == 0x0d || op2 == 0x1f) + } else if (op2 == 0x0b || op2 == 0xb9) { + + /* ud2 */ + *type = INSN_BUG; + + } else if (op2 == 0x0d || op2 == 0x1f) { /* nopl/nopw */ *type = INSN_NOP; - else if (op2 == 0xa0 || op2 == 0xa8) { + } else if (op2 == 0xa0 || op2 == 0xa8) { /* push fs/gs */ *type = INSN_STACK; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 3436a942b606..d07bf4a62b45 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -296,7 +296,7 @@ static int decode_instructions(struct objtool_file *file) } /* - * Find all uses of the unreachable() macro, which are code path dead ends. + * Mark "ud2" instructions and manually annotated dead ends. */ static int add_dead_ends(struct objtool_file *file) { @@ -305,9 +305,20 @@ static int add_dead_ends(struct objtool_file *file) struct instruction *insn; bool found; + /* + * By default, "ud2" is a dead end unless otherwise annotated, because + * GCC 7 inserts it for certain divide-by-zero cases. + */ + for_each_insn(file, insn) + if (insn->type == INSN_BUG) + insn->dead_end = true; + + /* + * Check for manually annotated dead ends. + */ sec = find_section_by_name(file->elf, ".rela.discard.unreachable"); if (!sec) - return 0; + goto reachable; list_for_each_entry(rela, &sec->rela_list, list) { if (rela->sym->type != STT_SECTION) { @@ -340,6 +351,48 @@ static int add_dead_ends(struct objtool_file *file) insn->dead_end = true; } +reachable: + /* + * These manually annotated reachable checks are needed for GCC 4.4, + * where the Linux unreachable() macro isn't supported. In that case + * GCC doesn't know the "ud2" is fatal, so it generates code as if it's + * not a dead end. 
+ */ + sec = find_section_by_name(file->elf, ".rela.discard.reachable"); + if (!sec) + return 0; + + list_for_each_entry(rela, &sec->rela_list, list) { + if (rela->sym->type != STT_SECTION) { + WARN("unexpected relocation symbol type in %s", sec->name); + return -1; + } + insn = find_insn(file, rela->sym->sec, rela->addend); + if (insn) + insn = list_prev_entry(insn, list); + else if (rela->addend == rela->sym->sec->len) { + found = false; + list_for_each_entry_reverse(insn, &file->insn_list, list) { + if (insn->sec == rela->sym->sec) { + found = true; + break; + } + } + + if (!found) { + WARN("can't find reachable insn at %s+0x%x", + rela->sym->sec->name, rela->addend); + return -1; + } + } else { + WARN("can't find reachable insn at %s+0x%x", + rela->sym->sec->name, rela->addend); + return -1; + } + + insn->dead_end = false; + } + return 0; } -- cgit v1.2.3 From 333706b8ed81b64b6c4241e493791a81bc8e6d43 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Mon, 17 Jul 2017 15:16:21 -0500 Subject: crypto: Add akcipher_set_reqsize() function Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- include/crypto/internal/akcipher.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/crypto/internal/akcipher.h b/include/crypto/internal/akcipher.h index 479a0078f0f7..805686ba2be4 100644 --- a/include/crypto/internal/akcipher.h +++ b/include/crypto/internal/akcipher.h @@ -38,6 +38,12 @@ static inline void *akcipher_request_ctx(struct akcipher_request *req) return req->__ctx; } +static inline void akcipher_set_reqsize(struct crypto_akcipher *akcipher, + unsigned int reqsize) +{ + crypto_akcipher_alg(akcipher)->reqsize = reqsize; +} + static inline void *akcipher_tfm_ctx(struct crypto_akcipher *tfm) { return tfm->base.__crt_ctx; -- cgit v1.2.3 From 80aafd50b6a4fa6b6bba4b451b553d5d221f59ff Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 24 Jul 2017 06:22:16 -0400 Subject: Documentation: add some docs for errseq_t ...and fix up a few comments in the code. Signed-off-by: Jeff Layton --- Documentation/errseq.rst | 149 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/errseq.h | 5 +- include/linux/fs.h | 2 - 3 files changed, 152 insertions(+), 4 deletions(-) create mode 100644 Documentation/errseq.rst (limited to 'include') diff --git a/Documentation/errseq.rst b/Documentation/errseq.rst new file mode 100644 index 000000000000..4c29bd5afbc5 --- /dev/null +++ b/Documentation/errseq.rst @@ -0,0 +1,149 @@ +The errseq_t datatype +===================== +An errseq_t is a way of recording errors in one place, and allowing any +number of "subscribers" to tell whether it has changed since a previous +point where it was sampled. + +The initial use case for this is tracking errors for file +synchronization syscalls (fsync, fdatasync, msync and sync_file_range), +but it may be usable in other situations. + +It's implemented as an unsigned 32-bit value. The low order bits are +designated to hold an error code (between 1 and MAX_ERRNO). The upper bits +are used as a counter. This is done with atomics instead of locking so that +these functions can be called from any context. + +Note that there is a risk of collisions if new errors are being recorded +frequently, since we have so few bits to use as a counter. + +To mitigate this, the bit between the error value and counter is used as +a flag to tell whether the value has been sampled since a new value was +recorded. 
That allows us to avoid bumping the counter if no one has +sampled it since the last time an error was recorded. + +Thus we end up with a value that looks something like this:: + + bit: 31..13 12 11..0 + +-----------------+----+----------------+ + | counter | SF | errno | + +-----------------+----+----------------+ + +The general idea is for "watchers" to sample an errseq_t value and keep +it as a running cursor. That value can later be used to tell whether +any new errors have occurred since that sampling was done, and atomically +record the state at the time that it was checked. This allows us to +record errors in one place, and then have a number of "watchers" that +can tell whether the value has changed since they last checked it. + +A new errseq_t should always be zeroed out. An errseq_t value of all zeroes +is the special (but common) case where there has never been an error. An all +zero value thus serves as the "epoch" if one wishes to know whether there +has ever been an error set since it was first initialized. + +API usage +========= +Let me tell you a story about a worker drone. Now, he's a good worker +overall, but the company is a little...management heavy. He has to +report to 77 supervisors today, and tomorrow the "big boss" is coming in +from out of town and he's sure to test the poor fellow too. + +They're all handing him work to do -- so much he can't keep track of who +handed him what, but that's not really a big problem. The supervisors +just want to know when he's finished all of the work they've handed him so +far and whether he made any mistakes since they last asked. + +He might have made a mistake on work they didn't actually hand him, +but he can't keep track of things at that level of detail; all he can +remember is the most recent mistake that he made. + +Here's our worker_drone representation:: + + struct worker_drone { + errseq_t wd_err; /* for recording errors */ + }; + +Every day, the worker_drone starts out with a blank slate:: + + struct worker_drone wd; + + wd.wd_err = (errseq_t)0; + +The supervisors come in and get an initial read for the day. They +don't care about anything that happened before their watch begins:: + + struct supervisor { + errseq_t s_wd_err; /* private "cursor" for wd_err */ + spinlock_t s_wd_err_lock; /* protects s_wd_err */ + }; + + struct supervisor su; + + su.s_wd_err = errseq_sample(&wd.wd_err); + spin_lock_init(&su.s_wd_err_lock); + +Now they start handing him tasks to do. Every few minutes they ask him to +finish up all of the work they've handed him so far. Then they ask him +whether he made any mistakes on any of it:: + + spin_lock(&su.s_wd_err_lock); + err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err); + spin_unlock(&su.s_wd_err_lock); + +Up to this point, that just keeps returning 0. + +Now, the owners of this company are quite miserly and have given him +substandard equipment with which to do his job. Occasionally it +glitches and he makes a mistake. He sighs a heavy sigh, and marks it +down:: + + errseq_set(&wd.wd_err, -EIO); + +...and then gets back to work. The supervisors eventually poll again +and they each get the error when they next check. Subsequent calls will +return 0, until another error is recorded, at which point it's reported +to each of them once. + +Note that the supervisors can't tell how many mistakes he made, only +whether one was made since they last checked, and the latest value +recorded.
+ +Occasionally the big boss comes in for a spot check and asks the worker +to do a one-off job for him. He's not really watching the worker +full-time like the supervisors, but he does need to know whether a +mistake occurred while his job was processing. + +He can just sample the current errseq_t in the worker, and then use that +to tell whether an error has occurred later:: + + errseq_t since = errseq_sample(&wd.wd_err); + /* submit some work and wait for it to complete */ + err = errseq_check(&wd.wd_err, since); + +Since he's just going to discard "since" after that point, he doesn't +need to advance it here. He also doesn't need any locking since it's +not usable by anyone else. + +Serializing errseq_t cursor updates +=================================== +Note that the errseq_t API does not protect the errseq_t cursor during a +check_and_advance operation. Only the canonical error code is handled +atomically. In a situation where more than one task might be using the +same errseq_t cursor at the same time, it's important to serialize +updates to that cursor. + +If that's not done, then it's possible for the cursor to go backward, +in which case the same error could be reported more than once. + +Because of this, it's often advantageous to first do an errseq_check to +see if anything has changed, and only later do an +errseq_check_and_advance after taking the lock. e.g.:: + + if (errseq_check(&wd.wd_err, READ_ONCE(su.s_wd_err))) { + /* su.s_wd_err is protected by s_wd_err_lock */ + spin_lock(&su.s_wd_err_lock); + err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err); + spin_unlock(&su.s_wd_err_lock); + } + +That avoids the spinlock in the common case where nothing has changed +since the last time it was checked. diff --git a/include/linux/errseq.h b/include/linux/errseq.h index 784f0860527b..f746bd8fe4d0 100644 --- a/include/linux/errseq.h +++ b/include/linux/errseq.h @@ -1,8 +1,9 @@ +/* + * See Documentation/errseq.rst and lib/errseq.c + */ #ifndef _LINUX_ERRSEQ_H #define _LINUX_ERRSEQ_H -/* See lib/errseq.c for more info */ - typedef u32 errseq_t; errseq_t errseq_set(errseq_t *eseq, int err); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b5d6816542b..21e7df1ad613 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2571,8 +2571,6 @@ extern int __must_check file_write_and_wait_range(struct file *file, * When a writeback error occurs, most filesystems will want to call * filemap_set_wb_err to record the error in the mapping so that it will be * automatically reported whenever fsync is called on the file. - * - * FIXME: mention FS_* flag here? */ static inline void filemap_set_wb_err(struct address_space *mapping, int err) { -- cgit v1.2.3 From 7d35079f8277b653d6a3075eea9edd4dbf7c2b29 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:46 -0700 Subject: kernfs: use idr instead of ida to manage inode number kernfs uses ida to manage inode numbers. The problem is that we can't get a kernfs_node back from an inode number with ida. Switch to idr instead; the next patch will add an API to get a kernfs_node from an inode number.
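A one-line sketch of what idr buys here (illustrative; the lookup helper a later patch adds wraps essentially this, plus RCU protection and staleness checks):

    /* With an idr the inode number keys directly back to the node;
     * ida can only say whether the number is allocated.
     */
    struct kernfs_node *kn = idr_find(&root->ino_idr, ino);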
Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 17 ++++++++++++----- include/linux/kernfs.h | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index db5900aaa55a..8ad7a17895fe 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -21,6 +21,7 @@ DEFINE_MUTEX(kernfs_mutex); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -533,7 +534,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs); } kfree(kn->iattr); - ida_simple_remove(&root->ino_ida, kn->ino); + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, kn->ino); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -542,7 +545,7 @@ void kernfs_put(struct kernfs_node *kn) goto repeat; } else { /* just released the root kn, free @root too */ - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); } } @@ -630,7 +633,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&kernfs_idr_lock); + ret = idr_alloc(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + spin_unlock(&kernfs_idr_lock); + idr_preload_end(); if (ret < 0) goto err_out2; kn->ino = ret; @@ -875,13 +882,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, if (!root) return ERR_PTR(-ENOMEM); - ida_init(&root->ino_ida); + idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index a9b11b8d06f2..5f5d602eb433 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -163,7 +163,7 @@ struct kernfs_root { unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ - struct ida ino_ida; + struct idr ino_idr; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ -- cgit v1.2.3 From 4a3ef68acacf31570066e69593de5cc49cc91638 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:47 -0700 Subject: kernfs: implement i_generation Set i_generation for kernfs inodes. This is required to implement exportfs operations. The generation is 32-bit, so it's possible for it to wrap around, leaving us with stale files. To reduce the possibility, we don't reuse inode numbers immediately; when the inode number allocation wraps, we increase the generation number. Together the generation and inode number form a 64-bit identifier that is unlikely to be duplicated. This does make the idr tree more sparse and wastes some memory. Since idr manages 32-bit keys, it uses a 6-level radix tree, each level covering 6 bits of the key. In a 100k-inode kernfs, the worst case will have around 300k radix tree nodes. Each node is 576 bytes, so the tree will use about ~150M of memory. That doesn't sound too bad; if it really becomes a problem, we should find a better data structure.
Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++++++- fs/kernfs/inode.c | 1 + include/linux/kernfs.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8ad7a17895fe..33f711f6b86e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -623,6 +623,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; + u32 gen; + int cursor; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -635,12 +637,17 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); - ret = idr_alloc(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + cursor = idr_get_cursor(&root->ino_idr); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < cursor) + root->next_generation++; + gen = root->next_generation; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->ino = ret; + kn->generation = gen; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -884,6 +891,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); + root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fb4b4a79a0d6..79cdae4758fb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,6 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; + inode->i_generation = kn->generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5f5d602eb433..8c00d28f468a 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -135,6 +135,7 @@ struct kernfs_node { umode_t mode; unsigned int ino; struct kernfs_iattrs *iattr; + u32 generation; }; /* @@ -164,6 +165,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + u32 next_generation; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ -- cgit v1.2.3 From c53cd490b1a491ebf1d8e30da97e7231459a4208 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:50 -0700 Subject: kernfs: introduce kernfs_node_id inode number and generation can identify a kernfs node. We are going to export the identification by exportfs operations, so put ino and generation into a separate structure. It's convenient when later patches use the identification. 
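As an aside, a small sketch of how the combined identification (the union kernfs_node_id added in the diff below) is meant to be handled; the little-endian layout note is an assumption about the common case:

    union kernfs_node_id id;

    id.ino = kn_ino;        /* 32-bit inode number */
    id.generation = kn_gen; /* bumped when the ino space wraps */
    /* on little-endian machines, id.id == ((u64)kn_gen << 32) | kn_ino */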
Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++----- fs/kernfs/file.c | 4 ++-- fs/kernfs/inode.c | 4 ++-- include/linux/cgroup.h | 2 +- include/linux/kernfs.h | 12 ++++++++++-- include/trace/events/writeback.h | 2 +- 6 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b61a7efceb7a..89d1dc19340b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -539,7 +539,7 @@ void kernfs_put(struct kernfs_node *kn) } kfree(kn->iattr); spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->ino); + idr_remove(&root->ino_idr, kn->id.ino); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -645,8 +645,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; - kn->generation = gen; + kn->id.ino = ret; + kn->id.generation = gen; /* * set ino first. This barrier is paired with atomic_inc_not_zero in @@ -721,7 +721,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, * hence we can use 'ino' to filter stale node. */ - if (kn->ino != ino) + if (kn->id.ino != ino) goto out; rcu_read_unlock(); @@ -1654,7 +1654,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = pos->id.ino; ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7f90d4de86b6..744192539010 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -895,7 +895,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. 
*/ - inode = ilookup(info->sb, kn->ino); + inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; @@ -903,7 +903,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->ino); + p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 4c8b51085a86..a34303981deb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,7 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->generation; + inode->i_generation = kn->id.generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -266,7 +266,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->ino); + inode = iget_locked(sb, kn->id.ino); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..30c68773fd1e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -543,7 +543,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->ino; + return cgrp->kn->id.ino; } /* cft/css accessors for cftype->write() operation */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 8c00d28f468a..06a0c5913e1d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -95,6 +95,15 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; +/* represents a kernfs node */ +union kernfs_node_id { + struct { + u32 ino; + u32 generation; + }; + u64 id; +}; + /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by a single kernfs_node. Most fields are @@ -131,11 +140,10 @@ struct kernfs_node { void *priv; + union kernfs_node_id id; unsigned short flags; umode_t mode; - unsigned int ino; struct kernfs_iattrs *iattr; - u32 generation; }; /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7bd8783a590f..9b57f014d79d 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -136,7 +136,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->ino; + return wb->memcg_css->cgroup->kn->id.ino; } static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) -- cgit v1.2.3 From aa8188253474b4053bc2900d9fcb545ce68bdf5c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:51 -0700 Subject: kernfs: add exportfs operations Now we have the facilities to implement exportfs operations. The idea is that cgroup can export fhandle info to userspace; userspace then uses the fhandle to find the cgroup name. Another example: userspace can get an fhandle for a cgroup, and BPF can use the fhandle to filter info for that cgroup.
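For a sense of the user-space flow this enables, here is a hedged sketch (the cgroup path is an assumption, error handling is minimal, and open_by_handle_at() requires CAP_DAC_READ_SEARCH):

    /* Illustrative only; needs <fcntl.h>, <stdlib.h>, <err.h> with
     * _GNU_SOURCE defined.
     */
    struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
    int mnt_id, mnt_fd, fd;

    fh->handle_bytes = MAX_HANDLE_SZ;
    if (name_to_handle_at(AT_FDCWD, "/sys/fs/cgroup/mygrp", /* assumed path */
                          fh, &mnt_id, 0))
            err(1, "name_to_handle_at");

    mnt_fd = open("/sys/fs/cgroup", O_RDONLY | O_DIRECTORY);
    fd = open_by_handle_at(mnt_fd, fh, O_RDONLY); /* privileged */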
Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 12 +++++++++++ kernel/cgroup/cgroup.c | 3 ++- 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index acd542625fd8..fa323589704f 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "kernfs-internal.h" @@ -64,6 +65,59 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +static struct inode *kernfs_fh_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct kernfs_node *kn; + + if (ino == 0) + return ERR_PTR(-ESTALE); + + kn = kernfs_find_and_get_node_by_ino(info->root, ino); + if (!kn) + return ERR_PTR(-ESTALE); + inode = kernfs_get_inode(sb, kn); + kernfs_put(kn); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_get_parent_dentry(struct dentry *child) +{ + struct kernfs_node *kn = kernfs_dentry_node(child); + + return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); +} + +static const struct export_operations kernfs_export_ops = { + .fh_to_dentry = kernfs_fh_to_dentry, + .fh_to_parent = kernfs_fh_to_parent, + .get_parent = kernfs_get_parent_dentry, +}; + /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question @@ -159,6 +213,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; + if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) + sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 06a0c5913e1d..d149361e5875 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -69,6 +69,12 @@ enum kernfs_root_flag { * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, + + /* + * The filesystem supports exportfs operations, so userspace can use + * fhandle to access nodes of the fs. + */ + KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, }; /* type-specific structures for kernfs_node union members */ @@ -98,6 +104,12 @@ struct kernfs_elem_attr { /* represents a kernfs node */ union kernfs_node_id { struct { + /* + * blktrace will export this struct as a simplified 'struct + * fid' (which is a big data structure), so userspace can use + * it to find the kernfs node. The layout must match the first two + * fields of 'struct fid' exactly.
+ */ u32 ino; u32 generation; }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..6cefa277f39c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1737,7 +1737,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); -- cgit v1.2.3 From 121508df44d074245a72eda6b067478218480a40 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:52 -0700 Subject: cgroup: export fhandle info for a cgroup Add an API to export cgroup fhandle info. We don't export a full 'struct file_handle' because it would contain information that isn't required. Specifically, a cgroup is always a directory, so we don't need a 'FILEID_INO32_GEN_PARENT' type fhandle; we only need to export the inode number and generation number, just like generic_fh_to_dentry does. And we can avoid the overhead of getting an inode too, since kernfs_node_id (ino and generation) has all the info required. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 30c68773fd1e..52ef9a68ff14 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -609,6 +609,10 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return &cgrp->kn->id; +} #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -631,6 +635,10 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return NULL; +} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) -- cgit v1.2.3 From ca1136c99b66b1566781ff12ecddc635d570f932 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:53 -0700 Subject: blktrace: export cgroup info in trace Currently blktrace isn't cgroup aware. blktrace prints out the task name of the current context, but the task of the current context isn't always in the cgroup where the BIO comes from, so we can't use the task name to find the IO cgroup. For example, writeback BIOs always come from the flusher thread, but the BIOs are for different blk cgroups. Requests can be requeued and dispatched from completely different tasks. MD/DM are other examples. This patch tries to close that gap. We print out cgroup fhandle info in blktrace. Userspace can use the open_by_handle_at() syscall to find the cgroup by fhandle, or it can use the name_to_handle_at() syscall to find the fhandle for a cgroup and use a BPF program to filter out blktrace for a specific cgroup. We add a new 'blk_cgroup' trace option for the blk tracer. It's off by default, so applications that don't know about the new option aren't affected. When it's on, we output fhandle info right after blk_io_trace with an extra bit set in the event action; from the application's point of view, blktrace with the option will output new actions. I didn't change the blk trace event yet, since I'm not sure whether changing the trace event output is an ABI issue.
If not, I'll do it later. Acked-by: Steven Rostedt (VMware) Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 3 + kernel/trace/blktrace.c | 231 ++++++++++++++++++++++++++------------ 2 files changed, 161 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index c590ca6bfbd9..9cdaedeadb84 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -52,6 +52,7 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; /* @@ -61,6 +62,7 @@ enum blktrace_notify { __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ __BLK_TN_TIMESTAMP, /* include system clock */ __BLK_TN_MESSAGE, /* Character string message */ + __BLK_TN_CGROUP = __BLK_TA_CGROUP, /* from a cgroup */ }; @@ -107,6 +109,7 @@ struct blk_io_trace { __u32 cpu; /* on what cpu did it happen */ __u16 error; /* completion error */ __u16 pdu_len; /* length of data after this trace */ + /* cgroup id will be stored here if exists */ }; /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..f393d7a43695 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "../../block/blk.h" @@ -46,10 +47,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, +#endif { } }; @@ -68,7 +73,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +82,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +99,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? 
__BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +131,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +148,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } @@ -167,7 +176,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +213,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +224,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +239,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +253,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +270,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. 
*/ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +292,12 @@ record_it: t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -684,6 +698,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +738,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +759,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +803,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +811,22 @@ static void blk_add_trace_bio(struct request_queue *q, 
struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +834,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +843,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, @@ -804,13 +859,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +876,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +892,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +909,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +925,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -901,7 +958,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +991,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 
rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1015,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1089,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1124,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent) + 1; + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) +{ + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) +{ + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1166,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1183,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1203,33 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static void 
blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1260,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1268,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1280,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1296,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1316,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1380,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct 
trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1326,23 +1409,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? "message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); -- cgit v1.2.3 From 007cc56b7eeca8848021bc43aca2b8607fbe5589 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:54 -0700 Subject: block: always attach cgroup info into bio blkcg_bio_issue_check() already gets the blkcg for a BIO, and bio_associate_blkcg() uses a percpu refcounter, so it's a very cheap operation. There is no reason not to attach the cgroup info to the bio in blkcg_bio_issue_check(). This also makes blktrace output the correct cgroup info. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +------ include/linux/blk-cgroup.h | 3 +++ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a7285bf2831c..a6ebd2bdb4df 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2104,14 +2104,9 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - int ret; - - ret = bio_associate_current(bio); - if (ret == 0 || ret == -EBUSY) + if (bio->bi_css) bio->bi_cg_private = tg; blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); -#else - bio_associate_current(bio); #endif } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7104bea8dab1..9d92153dd856 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -691,6 +691,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, rcu_read_lock(); blkcg = bio_blkcg(bio); + /* associate blkcg if bio hasn't attached one */ + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); -- cgit v1.2.3 From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: blktrace: add an option to allow displaying cgroup path By default we output the cgroup id in blktrace. This adds an option to display the cgroup path instead. Since getting the cgroup path is a relatively heavy operation, we don't enable it by default.
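As a usage note (not from the patch itself): assuming the blk tracer is the current tracer and tracefs is mounted at the usual debugfs location, the new tracer options can be toggled by writing their names to the trace_options file. A minimal, hedged sketch:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* The path is an assumption; tracefs may be mounted elsewhere. */
static int set_trace_option(const char *opt)
{
	int fd = open("/sys/kernel/debug/tracing/trace_options", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, opt, strlen(opt));
	close(fd);
	return n < 0 ? -1 : 0;
}

/* e.g. set_trace_option("blk_cgroup"); then set_trace_option("blk_cgname"); */

For example,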
with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 19 +++++++++++++++++++ include/linux/cgroup.h | 6 ++++++ include/linux/kernfs.h | 2 ++ kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/trace/blktrace.c | 14 +++++++++++++- 5 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index fa323589704f..7c452f4d83e9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -65,6 +65,25 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +/* + * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode + * number and generation + */ +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get_node_by_ino(root, id->ino); + if (!kn) + return NULL; + if (kn->id.generation != id->generation) { + kernfs_put(kn); + return NULL; + } + return kn; +} + static struct inode *kernfs_fh_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 52ef9a68ff14..6144fe923b73 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -613,6 +613,9 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) { return &cgrp->kn->id; } + +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -645,6 +648,9 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, { return true; } + +static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ /* diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d149361e5875..ab25c8b6d9e3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -358,6 +358,8 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6cefa277f39c..2aba1c519138 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4701,6 +4701,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f393d7a43695..e90974ed4532 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -48,12 +48,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) 
}, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; @@ -1213,7 +1215,17 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, if (has_cg) { const union kernfs_node_id *id = cgid_start(iter->ent); - trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); } else -- cgit v1.2.3 From 35fe6d763229e8fc0eb5f9b93a401673cfcb5e1e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:56 -0700 Subject: block: use standard blktrace API to output cgroup info for debug notes Currently cfq/bfq/blk-throttle output cgroup info in the trace in their own way. Now that we have a standard blktrace API for this, convert them to use it. Note, this changes the behavior a little: cgroup info isn't output by default; we only do this with the 'blk_cgroup' option enabled. cgroup info isn't output as a string by default either; we only do this with the 'blk_cgname' option enabled. Also, cgroup info is output in a different position of the note string. I think these behavior changes aren't a big issue (we actually make the trace data shorter, which is good), since the blktrace note is solely for debugging. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 13 ++++++++----- block/blk-throttle.c | 6 ++---- block/cfq-iosched.c | 15 ++++++--------- include/linux/blktrace_api.h | 13 +++++++++---- kernel/trace/blktrace.c | 12 ++++++++++-- 5 files changed, 35 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 63e771ab56d8..1f74d71b45cd 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -917,13 +917,16 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - bfqq_group(bfqq)->blkg_path, ##args); \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ } while (0) -#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)
do { \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ +} while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a6ebd2bdb4df..6a4c4c493dd5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -373,10 +373,8 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ - char __pbuf[128]; \ - \ - blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ + blk_add_cgroup_trace_msg(__td->queue, \ + tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3d5c28945719..0fb78fb3c03c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -656,20 +656,17 @@ static inline void cfqg_put(struct cfq_group *cfqg) } #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg((cfqq)->cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c %s " fmt, (cfqq)->pid, \ + blk_add_cgroup_trace_msg((cfqd)->queue, \ + cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ + "cfq%d%c%c " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - __pbuf, ##args); \ + ##args); \ } while (0) #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(cfqg_to_blkg(cfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((cfqd)->queue, "%s " fmt, __pbuf, ##args); \ + blk_add_cgroup_trace_msg((cfqd)->queue, \ + cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d2e908586e3d..67b4d4dfc19c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -28,10 +28,12 @@ struct blk_trace { atomic_t dropped; }; +struct blkcg; + extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern __printf(2, 3) -void __trace_note_message(struct blk_trace *, const char *fmt, ...); +extern __printf(3, 4) +void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream @@ -46,12 +48,14 @@ void __trace_note_message(struct blk_trace *, const char *fmt, ...); * NOTE: Can not use 'static inline' due to presence of var args... * **/ -#define blk_add_trace_msg(q, fmt, ...) \ +#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ do { \ struct blk_trace *bt = (q)->blk_trace; \ if (unlikely(bt)) \ - __trace_note_message(bt, fmt, ##__VA_ARGS__); \ + __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ } while (0) +#define blk_add_trace_msg(q, fmt, ...) \ + blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) #define BLK_TN_MAX_MSG 128 static inline bool blk_trace_note_message_enabled(struct request_queue *q) @@ -82,6 +86,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +# define blk_add_cgroup_trace_msg(q, cg, fmt, ...) 
do { } while (0) # define blk_trace_remove_sysfs(dev) do { } while (0) # define blk_trace_note_message_enabled(q) (false) static inline int blk_trace_init_sysfs(struct device *dev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e90974ed4532..7724de18d2fe 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -154,7 +154,8 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -178,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -375,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; -- cgit v1.2.3 From 0bf8bf50eddc7511b52461bae798cbfaa0157a34 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 24 Jul 2017 18:27:25 -0700 Subject: module: Remove const attribute from alias for MODULE_DEVICE_TABLE MODULE_DEVICE_TABLE(type, name) creates an alias of type 'extern const typeof(name)'. If 'name' is already constant the 'const' attribute is specified twice, which is not allowed in C89 (see discussion at https://lkml.org/lkml/2017/5/23/1440). Since the kernel is built with -std=gnu89, clang generates warnings like this: drivers/thermal/x86_pkg_temp_thermal.c:509:1: warning: duplicate 'const' declaration specifier [-Wduplicate-decl-specifier] MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids); ^ ./include/linux/module.h:212:8: note: expanded from macro 'MODULE_DEVICE_TABLE' extern const typeof(name) __mod_##type##__##name##_device_table Remove the const attribute from the alias to avoid the duplicate specifier. After all, it is only an alias and the attribute shouldn't have any effect. Signed-off-by: Matthias Kaehlcke Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index e7bdd549e527..fe5aa3736707 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -209,7 +209,7 @@ extern void cleanup_module(void); #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ -extern const typeof(name) __mod_##type##__##name##_device_table \ +extern typeof(name) __mod_##type##__##name##_device_table \ __attribute__ ((unused, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) -- cgit v1.2.3 From 1a5f3da20bd966220931239fbd31e6ac6ff42251 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Ravipati Date: Thu, 27 Jul 2017 16:47:26 -0700 Subject: net: ethtool: add support for forward error correction modes Forward Error Correction (FEC) modes, i.e. Base-R and Reed-Solomon modes, are introduced in the 25G/40G/100G standards to provide good BER at high speeds.
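Before the details, a hedged sketch of a hypothetical driver wiring up the new get_fecparam/set_fecparam callbacks this patch adds; the foo_* names and the supported-mode choices are assumptions, not part of the series:

/* minimal driver-side sketch, assuming a device that supports RS and BaseR */
static int foo_get_fecparam(struct net_device *dev,
			    struct ethtool_fecparam *fec)
{
	fec->active_fec = ETHTOOL_FEC_RS;		/* mode currently on the link */
	fec->fec = ETHTOOL_FEC_RS | ETHTOOL_FEC_BASER;	/* configurable modes */
	return 0;
}

static int foo_set_fecparam(struct net_device *dev,
			    struct ethtool_fecparam *fec)
{
	if (fec->fec & ~(ETHTOOL_FEC_AUTO | ETHTOOL_FEC_OFF |
			 ETHTOOL_FEC_RS | ETHTOOL_FEC_BASER))
		return -EOPNOTSUPP;
	/* reprogram the PHY/MAC FEC encoder here */
	return 0;
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_fecparam = foo_get_fecparam,
	.set_fecparam = foo_set_fecparam,
};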
Various networking devices which support 25G/40G/100G provide the ability to manage supported FEC modes, and the lack of FEC encoding control and reporting today is a source of interoperability issues for many vendors. FEC capability as well as a specific FEC mode, i.e. Base-R or RS mode, can be requested or advertised through bits D44:47 of the base link codeword. This patch set provides an option under ethtool to manage and report FEC encoding settings for networking devices as per the IEEE 802.3bj, 802.3bm and 802.3by specs. The set-fec/show-fec options are designed to control and report the FEC encoding on the link. SET FEC option: root@tor: ethtool --set-fec swp1 encoding [off | RS | BaseR | auto] Encoding: Types of encoding Off : Turning off any encoding RS : enforcing RS-FEC encoding on supported speeds BaseR : enforcing Base R encoding on supported speeds Auto : IEEE defaults for the speed/medium combination Here are a few examples of what we would expect if encoding=auto: - if autoneg is on, we are expecting FEC to be negotiated as on or off as long as the protocol supports it - if the hardware is capable of detecting the FEC encoding on its receiver it will reconfigure its encoder to match - in the absence of the above, the configuration would be set to IEEE defaults. From our understanding, this is essentially what most hardware/driver combinations are doing today in the absence of a way for users to control the behavior. SHOW FEC option: root@tor: ethtool --show-fec swp1 FEC parameters for swp1: Active FEC encodings: RS Configured FEC encodings: RS | BaseR ETHTOOL DEVNAME output modification: ethtool devname output: root@tor:~# ethtool swp1 Settings for swp1: root@hpe-7712-03:~# ethtool swp18 Settings for swp18: Supported ports: [ FIBRE ] Supported link modes: 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 100000baseSR4/Full 100000baseCR4/Full 100000baseLR4_ER4/Full Supported pause frame use: No Supports auto-negotiation: Yes Supported FEC modes: [RS | BaseR | None | Not reported] Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: [RS | BaseR | None | Not reported] <<<< One or more FEC modes Speed: 100000Mb/s Duplex: Full Port: FIBRE PHYAD: 106 Transceiver: internal Auto-negotiation: off Link detected: yes This patch includes the following changes: a) New ETHTOOL_GFECPARAM/SFECPARAM API, handled by the new get_fecparam/set_fecparam callbacks, provides support for configuration of forward error correction modes. b) Link mode bits for FEC modes, i.e. None (no FEC mode), RS and BaseR/FC, are defined so that users can configure these FEC modes for the supported and advertising fields as part of link autonegotiation. Signed-off-by: Vidya Sagar Ravipati Signed-off-by: Dustin Byford Signed-off-by: Roopa Prabhu Signed-off-by: David S.
Miller --- include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool.h | 48 +++++++++++++++++++++++++++++++++++++++++++- net/core/ethtool.c | 34 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 83cc9863444b..afdbb701fdb4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -374,5 +374,9 @@ struct ethtool_ops { struct ethtool_link_ksettings *); int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); + int (*get_fecparam)(struct net_device *, + struct ethtool_fecparam *); + int (*set_fecparam)(struct net_device *, + struct ethtool_fecparam *); }; #endif /* _LINUX_ETHTOOL_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7d4a594d5d58..9c041dae8e2c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1238,6 +1238,47 @@ struct ethtool_per_queue_op { char data[]; }; +/** + * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on porte + * @fec: Bitmask of supported/configured FEC modes + * @rsvd: Reserved for future extensions. i.e FEC bypass feature. + * + * Drivers should reject a non-zero setting of @autoneg when + * autoneogotiation is disabled (or not supported) for the link. + * + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported + * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver + * @ETHTOOL_FEC_OFF: No FEC Mode + * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode + * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) + /* CMDs currently supported */ #define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. * Please use ETHTOOL_GLINKSETTINGS @@ -1330,6 +1371,8 @@ struct ethtool_per_queue_op { #define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ #define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ #define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET @@ -1387,6 +1430,9 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit * 31. 
Please do NOT define any SUPPORTED_* or ADVERTISED_* @@ -1395,7 +1441,7 @@ enum ethtool_link_mode_bit_indices { */ __ETHTOOL_LINK_MODE_LAST - = ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + = ETHTOOL_LINK_MODE_FEC_BASER_BIT, }; #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b987bc475fc8..6a582ae4c5d9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2512,6 +2512,33 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM }; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_fecparam(dev, &fecparam); + + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam; + + if (!dev->ethtool_ops->set_fecparam) + return -EOPNOTSUPP; + + if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) + return -EFAULT; + + return dev->ethtool_ops->set_fecparam(dev, &fecparam); +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -2570,6 +2597,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: + case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2779,6 +2807,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; + case ETHTOOL_GFECPARAM: + rc = ethtool_get_fecparam(dev, useraddr); + break; + case ETHTOOL_SFECPARAM: + rc = ethtool_set_fecparam(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From c7ac15ce89242e3c25fcc49fa4d00028285fab05 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 25 Jul 2017 20:39:58 +0300 Subject: serial: core: move UPF_NO_TXEN_TEST to quirks and rename First 16 bits in the flags field are user-visible except UPF_NO_TXEN_TEST. To keep it clean we introduce internal quirks and move UPF_NO_TXEN_TEST to them. Rename the constant to UPQ_NO_TXEN_TEST to distinguish with port flags. Users are converted accordingly. The quirks field might be extended later to hold the additional ones. Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 16 ++++++++-------- drivers/tty/serial/8250/8250_pci.c | 2 +- drivers/tty/serial/8250/8250_port.c | 2 +- include/linux/serial_core.h | 9 +++++---- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index b5def356af63..3db9d6e19660 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -497,6 +497,11 @@ static void univ8250_rsa_support(struct uart_ops *ops) #define univ8250_rsa_support(x) do { } while (0) #endif /* CONFIG_SERIAL_8250_RSA */ +static inline void serial8250_apply_quirks(struct uart_8250_port *up) +{ + up->port.quirks |= skip_txen_test ? 
UPQ_NO_TXEN_TEST : 0; +} + static void __init serial8250_isa_init_ports(void) { struct uart_8250_port *up; @@ -577,9 +582,7 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) up->port.dev = dev; - if (skip_txen_test) - up->port.flags |= UPF_NO_TXEN_TEST; - + serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } } @@ -1006,9 +1009,6 @@ int serial8250_register_8250_port(struct uart_8250_port *up) if (up->port.dev) uart->port.dev = up->port.dev; - if (skip_txen_test) - uart->port.flags |= UPF_NO_TXEN_TEST; - if (up->port.flags & UPF_FIXED_TYPE) uart->port.type = up->port.type; @@ -1047,6 +1047,7 @@ int serial8250_register_8250_port(struct uart_8250_port *up) serial8250_isa_config(0, &uart->port, &uart->capabilities); + serial8250_apply_quirks(uart); ret = uart_add_one_port(&serial8250_reg, &uart->port); if (ret == 0) ret = uart->port.line; @@ -1081,11 +1082,10 @@ void serial8250_unregister_port(int line) uart_remove_one_port(&serial8250_reg, &uart->port); if (serial8250_isa_devs) { uart->port.flags &= ~UPF_BOOT_AUTOCONF; - if (skip_txen_test) - uart->port.flags |= UPF_NO_TXEN_TEST; uart->port.type = PORT_UNKNOWN; uart->port.dev = &serial8250_isa_devs->dev; uart->capabilities = 0; + serial8250_apply_quirks(uart); uart_add_one_port(&serial8250_reg, &uart->port); } else { uart->port.dev = NULL; diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c index 553823c12c2f..e82d7d521010 100644 --- a/drivers/tty/serial/8250/8250_pci.c +++ b/drivers/tty/serial/8250/8250_pci.c @@ -1548,7 +1548,7 @@ static int skip_tx_en_setup(struct serial_private *priv, const struct pciserial_board *board, struct uart_8250_port *port, int idx) { - port->port.flags |= UPF_NO_TXEN_TEST; + port->port.quirks |= UPQ_NO_TXEN_TEST; dev_dbg(&priv->dev->dev, "serial8250: skipping TxEn test for device [%04x:%04x] subsystem [%04x:%04x]\n", priv->dev->vendor, priv->dev->device, diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index a5fe0e66c607..25aae660cb9e 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2304,7 +2304,7 @@ int serial8250_do_startup(struct uart_port *port) * test if we receive TX irq. This way, we'll never enable * UART_BUG_TXEN. 
*/ - if (up->port.flags & UPF_NO_TXEN_TEST) + if (up->port.quirks & UPQ_NO_TXEN_TEST) goto dont_test_tx_en; /* diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 1775500294bb..8dc24c855a70 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -20,7 +20,7 @@ #ifndef LINUX_SERIAL_CORE_H #define LINUX_SERIAL_CORE_H - +#include #include #include #include @@ -144,7 +144,7 @@ struct uart_port { unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ - unsigned char unused1; + unsigned char quirks; /* internal quirks */ #define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ #define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ @@ -155,6 +155,9 @@ struct uart_port { #define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ #define UPIO_MEM16 (SERIAL_IO_MEM16) /* 16b little endian */ + /* quirks must be updated while holding port mutex */ +#define UPQ_NO_TXEN_TEST BIT(0) + unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ struct uart_state *state; /* pointer to parent state */ @@ -175,7 +178,6 @@ struct uart_port { * [for bit definitions in the UPF_CHANGE_MASK] * * Bits [0..UPF_LAST_USER] are userspace defined/visible/changeable - * except bit 15 (UPF_NO_TXEN_TEST) which is masked off. * The remaining bits are serial-core specific and not modifiable by * userspace. */ @@ -192,7 +194,6 @@ struct uart_port { #define UPF_SPD_SHI ((__force upf_t) ASYNC_SPD_SHI /* 12 */ ) #define UPF_LOW_LATENCY ((__force upf_t) ASYNC_LOW_LATENCY /* 13 */ ) #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) -#define UPF_NO_TXEN_TEST ((__force upf_t) (1 << 15)) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) #define UPF_NO_THRE_TEST ((__force upf_t) (1 << 19)) -- cgit v1.2.3 From 979990c6284814617d8f2179d197f72ff62b5d85 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Jun 2017 23:10:41 +0200 Subject: tty: improve tty_insert_flip_char() fast path kernelci.org reports a crazy stack usage for the VT code when CONFIG_KASAN is enabled: drivers/tty/vt/keyboard.c: In function 'kbd_keycode': drivers/tty/vt/keyboard.c:1452:1: error: the frame size of 2240 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] The problem is that tty_insert_flip_char() gets inlined many times into kbd_keycode(), and also into other functions, and each copy requires 128 bytes for stack redzone to check for a possible out-of-bounds access on the 'ch' and 'flags' arguments that are passed into tty_insert_flip_string_flags as a variable-length string. This introduces a new __tty_insert_flip_char() function for the slow path, which receives the two arguments by value. This completely avoids the problem and the stack usage goes back down to around 100 bytes. Without KASAN, this is also slightly better, as we don't have to spill the arguments to the stack but can simply pass 'ch' and 'flag' in registers, saving a few bytes in .text for each call site. This should be backported to linux-4.0 or later, which first introduced the stack sanitizer in the kernel. 
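To make the mechanism concrete, a simplified before/after sketch (illustrative only, not the patch itself):

/* Before: taking the address of 'ch' and 'flag' forces them into stack
 * slots, and KASAN pads every address-taken slot with a redzone. */
static inline int insert_old(struct tty_port *port, unsigned char ch, char flag)
{
	return tty_insert_flip_string_flags(port, &ch, &flag, 1);
}

/* After: the out-of-line slow path takes both by value, so the inlined
 * fast path never takes an address and the arguments can stay in registers. */
static inline int insert_new(struct tty_port *port, unsigned char ch, char flag)
{
	return __tty_insert_flip_char(port, ch, flag);
}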
Cc: stable@vger.kernel.org Fixes: c420f167db8c ("kasan: enable stack instrumentation") Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 24 ++++++++++++++++++++++++ include/linux/tty_flip.h | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 4e7a4e9dcf4d..2da05fa37aec 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -361,6 +361,30 @@ int tty_insert_flip_string_flags(struct tty_port *port, } EXPORT_SYMBOL(tty_insert_flip_string_flags); +/** + * __tty_insert_flip_char - Add one character to the tty buffer + * @port: tty port + * @ch: character + * @flag: flag byte + * + * Queue a single byte to the tty buffering, with an optional flag. + * This is the slow path of tty_insert_flip_char. + */ +int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) +{ + struct tty_buffer *tb = port->buf.tail; + int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; + + if (!tty_buffer_request_room(port, 1)) + return 0; + + *flag_buf_ptr(tb, tb->used) = flag; + *char_buf_ptr(tb, tb->used++) = ch; + + return 1; +} +EXPORT_SYMBOL(__tty_insert_flip_char); + /** * tty_schedule_flip - push characters to ldisc * @port: tty port to push from diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index c28dd523f96e..d43837f2ce3a 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -12,6 +12,7 @@ extern int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars, size_t size); extern void tty_flip_buffer_push(struct tty_port *port); void tty_schedule_flip(struct tty_port *port); +int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag); static inline int tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) @@ -26,7 +27,7 @@ static inline int tty_insert_flip_char(struct tty_port *port, *char_buf_ptr(tb, tb->used++) = ch; return 1; } - return tty_insert_flip_string_flags(port, &ch, &flag, 1); + return __tty_insert_flip_char(port, ch, flag); } static inline int tty_insert_flip_string(struct tty_port *port, -- cgit v1.2.3 From 87840fb5a9dc934948ef7e067a993423a7a39df9 Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Wed, 19 Jul 2017 14:42:25 +0300 Subject: staging: fsl-mc: add missing fsl_mc comment in struct msi_desc The mc-bus specific field, fsl_mc in struct msi_desc is missing its comment so add it. Signed-off-by: Laurentiu Tudor Acked-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- include/linux/msi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/msi.h b/include/linux/msi.h index df6d59201d31..80e3b562bef6 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -66,6 +66,7 @@ struct fsl_mc_msi_desc { * @mask_pos: [PCI MSI] Mask register position * @mask_base: [PCI MSI-X] Mask register base address * @platform: [platform] Platform device specific msi descriptor data + * @fsl_mc: [fsl-mc] FSL MC device specific msi descriptor data */ struct msi_desc { /* Shared device/bus type independent data */ -- cgit v1.2.3 From 64c83d837329531252a1a0f0dfdd4fd607e1d8e9 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:49 -0400 Subject: net netlink: Add new type NLA_BITFIELD32 Generic bitflags attribute content sent to the kernel by user. With this netlink attr type the user can either set or unset a flag in the kernel. 
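A hypothetical producer-side sketch (ATTR_GOO and the helper are illustrative only; the value/selector semantics are spelled out just below):

#include <net/netlink.h>

enum { ATTR_UNSPEC, ATTR_GOO, __ATTR_MAX };	/* illustrative attribute */

static int put_goo_flag(struct sk_buff *skb, bool on)
{
	struct nla_bitfield32 req = {
		.selector = 1 << 0,		/* only bit 0 is being considered */
		.value = on ? 1 << 0 : 0,	/* desired state of that bit */
	};

	return nla_put(skb, ATTR_GOO, sizeof(req), &req);
}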
The value is a bitmap that defines the bit values being set. The selector is a bitmask that defines which of those bits are to be considered. A check is made to ensure that a kernel subsystem only accepts bitflags it already knows about, i.e. if the user tries to set a bit flag that is not understood then it will be rejected. In the most basic form, the user specifies the attribute policy as: [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, where myvalidflags is the bit mask of the flags the kernel understands. If the user _does not_ provide myvalidflags then the attribute will also be rejected. Examples: value = 0x0, and selector = 0x1 implies we are selecting bit 1 and we want to set its value to 0. value = 0x2, and selector = 0x2 implies we are selecting bit 2 and we want to set its value to 1. Suggested-by: Jiri Pirko Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/netlink.h | 16 ++++++++++++++++ include/uapi/linux/netlink.h | 17 +++++++++++++++++ lib/nlattr.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index ef8e6c3a80a6..82dd298b40c7 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -178,6 +178,7 @@ enum { NLA_S16, NLA_S32, NLA_S64, + NLA_BITFIELD32, __NLA_TYPE_MAX, }; @@ -206,6 +207,7 @@ enum { * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" + * NLA_BITFIELD32 A 32-bit bitmap/bitselector attribute * All other Minimum length of attribute payload * * Example: @@ -213,11 +215,13 @@ enum { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = { .len = sizeof(struct mystruct) }, + * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, * }; */ struct nla_policy { u16 type; u16 len; + void *validation_data; }; /** @@ -1202,6 +1206,18 @@ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) return tmp; } +/** + * nla_get_bitfield32 - return payload of 32 bitfield attribute + * @nla: nla_bitfield32 attribute + */ +static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) +{ + struct nla_bitfield32 tmp; + + nla_memcpy(&tmp, nla, sizeof(tmp)); + return tmp; +} + /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f86127a46cfc..f4fc9c9e123d 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -226,5 +226,22 @@ struct nlattr { #define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) #define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) +/* Generic 32 bitflags attribute content sent to the kernel. + * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1.
+ * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; #endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/lib/nlattr.c b/lib/nlattr.c index fb52435be42d..ee79b7a3c6b0 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -27,6 +27,30 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_S64] = sizeof(s64), }; +static int validate_nla_bitfield32(const struct nlattr *nla, + u32 *valid_flags_allowed) +{ + const struct nla_bitfield32 *bf = nla_data(nla); + u32 *valid_flags_mask = valid_flags_allowed; + + if (!valid_flags_allowed) + return -EINVAL; + + /*disallow invalid bit selector */ + if (bf->selector & ~*valid_flags_mask) + return -EINVAL; + + /*disallow invalid bit values */ + if (bf->value & ~*valid_flags_mask) + return -EINVAL; + + /*disallow valid bit values that are not selected*/ + if (bf->value & ~bf->selector) + return -EINVAL; + + return 0; +} + static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy) { @@ -46,6 +70,12 @@ static int validate_nla(const struct nlattr *nla, int maxtype, return -ERANGE; break; + case NLA_BITFIELD32: + if (attrlen != sizeof(struct nla_bitfield32)) + return -ERANGE; + + return validate_nla_bitfield32(nla, pt->validation_data); + case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); -- cgit v1.2.3 From 90825b23a887f06f6c05bdde77b200c5fe9b6217 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:51 -0400 Subject: net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch When you dump hundreds of thousands of actions, getting only 32 per dump batch even when the socket buffer and memory allocations allow more is inefficient. With this change, the user will get as many as can possibly fit within the constraints available to the kernel. The top level action TLV space is extended. An attribute TCA_ROOT_FLAGS is used to carry flags; the flag TCA_FLAG_LARGE_DUMP_ON is set by the user to indicate it is capable of processing these large dumps. Older user space which doesn't set this flag doesn't get the larger (than 32) batches. The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many actions are put in a single batch. As such, the user space app knows how long to iterate (independent of the type of action being dumped) instead of assuming a hardcoded maximum of 32, thus maintaining backward compat. Some results dumping 1.5M actions below: first an unpatched tc which doesn't understand these features... prompt$ time -p tc actions ls action gact | grep index | wc -l 1500000 real 1388.43 user 2.07 sys 1386.79 Now let's see a patched tc which sets the correct flags when requesting a dump: prompt$ time -p updatedtc actions ls action gact | grep index | wc -l 1500000 real 178.13 user 2.02 sys 176.96 That is about an 8x performance improvement for a tc app which sets its receive buffer to about 32K. Signed-off-by: Jamal Hadi Salim Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/rtnetlink.h | 22 +++++++++++++++++-- net/sched/act_api.c | 50 +++++++++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d148505010a7..bfa80a6164d9 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -683,10 +683,28 @@ struct tcamsg { unsigned char tca__pad1; unsigned short tca__pad2; }; + +enum { + TCA_ROOT_UNSPEC, + TCA_ROOT_TAB, +#define TCA_ACT_TAB TCA_ROOT_TAB +#define TCAA_MAX TCA_ROOT_TAB + TCA_ROOT_FLAGS, + TCA_ROOT_COUNT, + __TCA_ROOT_MAX, +#define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) +}; + #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ -#define TCAA_MAX 1 +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS + * + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO + * actions in a dump. All dump responses will contain the number of actions + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT + * + */ +#define TCA_FLAG_LARGE_DUMP_ON (1 << 0) /* New extended info filters for IFLA_EXT_MASK */ #define RTEXT_FILTER_VF (1 << 0) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 848370e2fcca..d53653a73c4f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -110,6 +110,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + u32 act_flags = cb->args[2]; struct nlattr *nest; spin_lock_bh(&hinfo->lock); @@ -138,14 +139,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, } nla_nest_end(skb, nest); n_i++; - if (n_i >= TCA_ACT_MAX_PRIO) + if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && + n_i >= TCA_ACT_MAX_PRIO) goto done; } } done: spin_unlock_bh(&hinfo->lock); - if (n_i) + if (n_i) { cb->args[0] += n_i; + if (act_flags & TCA_FLAG_LARGE_DUMP_ON) + cb->args[1] = n_i; + } return n_i; nla_put_failure: @@ -1068,11 +1073,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return tcf_add_notify(net, n, &actions, portid); } +static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; +static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { + [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tcaa_root_flags_allowed }, +}; + static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct nlattr *tca[TCAA_MAX + 1]; + struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; @@ -1080,7 +1091,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL, + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; @@ -1121,16 +1132,12 @@ replay: return ret; } -static struct nlattr *find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct nlattr *nla[TCAA_MAX + 1]; struct nlattr *kind; - if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, - NULL, NULL) < 0) - return NULL; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; @@ -1157,8 +1164,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); - struct nlattr *kind = find_dump_kind(cb->nlh); + struct nlattr *tb[TCA_ROOT_MAX + 1]; + struct nlattr *count_attr = NULL; + struct nlattr *kind = NULL; + struct nla_bitfield32 bf; + u32 act_count = 0; + + ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, + tcaa_policy, NULL); + if (ret < 0) + return ret; + kind = find_dump_kind(tb); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; @@ -1168,14 +1185,24 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; + cb->args[2] = 0; + if (tb[TCA_ROOT_FLAGS]) { + bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); + cb->args[2] = bf.value; + } + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; + count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); + if (!count_attr) + goto out_module_put; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) @@ -1188,6 +1215,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; + act_count = cb->args[1]; + memcpy(nla_data(count_attr), &act_count, sizeof(u32)); + cb->args[1] = 0; } else nlmsg_trim(skb, b); -- cgit v1.2.3 From e62e484df04964ac947c679ef4f00c54ae5395aa Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:52 -0400 Subject: net sched actions: add time filter for action dumping This patch adds support for filtering based on time since last used. When we are dumping a large number of actions it is useful to have the option of filtering based on when the action was last used to reduce the amount of data crossing to user space. With this patch the user space app sets the TCA_ROOT_TIME_DELTA attribute with the value in milliseconds meaning "time of interest since now". The kernel converts this to jiffies and does the filtering comparison, matching entries that have seen activity since then, and returns them to user space. Old kernels and old tc continue to work in legacy mode since they don't specify this attribute. Some examples follow (we have 400 actions bound to 400 filters), taken starting at installation time. 
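Before the transcripts, a rough sketch of the request side (build_action_dump() is a hypothetical helper, not part of tc; TCA_ROOT_FLAGS comes from the previous patch and TCA_ROOT_TIME_DELTA from this one):

#include <libmnl/libmnl.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <stdint.h>
#include <sys/socket.h>

static struct nlmsghdr *build_action_dump(char *buf, uint32_t msecs_since)
{
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nla_bitfield32 bf = {
		.value    = TCA_FLAG_LARGE_DUMP_ON,
		.selector = TCA_FLAG_LARGE_DUMP_ON,
	};
	struct tcamsg *t;

	nlh->nlmsg_type = RTM_GETACTION;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;

	t = mnl_nlmsg_put_extra_header(nlh, sizeof(*t));
	t->tca_family = AF_UNSPEC;

	/* Opt in to larger-than-32 batches. */
	mnl_attr_put(nlh, TCA_ROOT_FLAGS, sizeof(bf), &bf);
	/* Only dump actions used within the last msecs_since milliseconds. */
	if (msecs_since)
		mnl_attr_put_u32(nlh, TCA_ROOT_TIME_DELTA, msecs_since);

	/* A TCA_ACT_TAB nest naming the action kind would follow here. */
	return nlh;
}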
Using an updated tc and setting the time of interest to 120 seconds earlier, we see 400 actions: prompt$ hackedtc actions ls action gact since 120000| grep index | wc -l 400 Go get some coffee and wait for > 120 seconds, then try again: prompt$ hackedtc actions ls action gact since 120000 | grep index | wc -l 0 Let's see a filter bound to one of these actions: .... filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 2 success 1) match 7f000002/ffffffff at 12 (success 1 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1145 sec used 802 sec Action statistics: Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 .... that coffee took long, no? It was good. Now let's ping -c 1 127.0.0.2, then run the actions again: prompt$ hackedtc actions ls action gact since 120 | grep index | wc -l 1 More details please: prompt$ hackedtc -s actions ls action gact since 120000 action order 0: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1270 sec used 30 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 And the filter? filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 4 success 2) match 7f000002/ffffffff at 12 (success 2 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1324 sec used 84 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Signed-off-by: Jamal Hadi Salim Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/sched/act_api.c | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bfa80a6164d9..dab7dad9e01a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -691,6 +691,7 @@ enum { #define TCAA_MAX TCA_ROOT_TAB TCA_ROOT_FLAGS, TCA_ROOT_COUNT, + TCA_ROOT_TIME_DELTA, /* in msecs */ __TCA_ROOT_MAX, #define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d53653a73c4f..f19b118df414 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -111,6 +111,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, { int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; + unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; spin_lock_bh(&hinfo->lock); @@ -128,6 +129,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, if (index < s_i) continue; + if (jiffy_since && + time_after(jiffy_since, + (unsigned long)p->tcfa_tm.lastuse)) + continue; + nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; @@ -145,9 +151,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, } } done: + if (index >= 0) + cb->args[0] = index + 1; + spin_unlock_bh(&hinfo->lock); if (n_i) { - cb->args[0] += n_i; if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } @@ -1077,6 +1085,7 @@ static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_TIME_DELTA] = 
{ .type = NLA_U32 }, }; static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, @@ -1166,8 +1175,10 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *tb[TCA_ROOT_MAX + 1]; struct nlattr *count_attr = NULL; + unsigned long jiffy_since = 0; struct nlattr *kind = NULL; struct nla_bitfield32 bf; + u32 msecs_since = 0; u32 act_count = 0; ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, @@ -1191,15 +1202,23 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) cb->args[2] = bf.value; } + if (tb[TCA_ROOT_TIME_DELTA]) { + msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); + } + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; + if (msecs_since) + jiffy_since = jiffies - msecs_to_jiffies(msecs_since); + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; + cb->args[3] = jiffy_since; count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); if (!count_attr) goto out_module_put; -- cgit v1.2.3 From 0f9b973b65910eeb0f73bb1a2178bc0826a1b791 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Fri, 28 Jul 2017 23:13:11 +0200 Subject: dt-bindings: clock: meson8b: describe the embedded reset controller The Amlogic Meson8/Meson8b/Meson8m2 clock controller provides some reset lines. These are used for example to boot the secondary CPU cores. This patch describes the reset controller which is embedded into the clock controller on these SoCs. A header file is provided which provides preprocessor macros for each reset line (to make the .dts files easier to read). Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong --- .../bindings/clock/amlogic,meson8b-clkc.txt | 9 +++++++- .../dt-bindings/reset/amlogic,meson8b-clkc-reset.h | 27 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 include/dt-bindings/reset/amlogic,meson8b-clkc-reset.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/amlogic,meson8b-clkc.txt b/Documentation/devicetree/bindings/clock/amlogic,meson8b-clkc.txt index 606da38c0959..c858fd64f680 100644 --- a/Documentation/devicetree/bindings/clock/amlogic,meson8b-clkc.txt +++ b/Documentation/devicetree/bindings/clock/amlogic,meson8b-clkc.txt @@ -16,18 +16,25 @@ Required Properties: mapped region. - #clock-cells: should be 1. +- #reset-cells: should be 1. Each clock is assigned an identifier and client nodes can use this identifier to specify the clock which they consume. All available clocks are defined as preprocessor macros in the dt-bindings/clock/meson8b-clkc.h header and can be used in device tree sources. +Similarly a preprocessor macro for each reset line is defined in +dt-bindings/reset/amlogic,meson8b-clkc-reset.h (which can be used from the +device tree sources). + + Example: Clock controller node: clkc: clock-controller@c1104000 { - #clock-cells = <1>; compatible = "amlogic,meson8b-clkc"; reg = <0xc1108000 0x4>, <0xc1104000 0x460>; + #clock-cells = <1>; + #reset-cells = <1>; }; diff --git a/include/dt-bindings/reset/amlogic,meson8b-clkc-reset.h b/include/dt-bindings/reset/amlogic,meson8b-clkc-reset.h new file mode 100644 index 000000000000..1f1b56e57346 --- /dev/null +++ b/include/dt-bindings/reset/amlogic,meson8b-clkc-reset.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2017 Martin Blumenstingl . 
+ * + * SPDX-License-Identifier: (GPL-2.0+ OR MIT) + */ + +#ifndef _DT_BINDINGS_AMLOGIC_MESON8B_CLKC_RESET_H +#define _DT_BINDINGS_AMLOGIC_MESON8B_CLKC_RESET_H + +#define CLKC_RESET_L2_CACHE_SOFT_RESET 0 +#define CLKC_RESET_AXI_64_TO_128_BRIDGE_A5_SOFT_RESET 1 +#define CLKC_RESET_SCU_SOFT_RESET 2 +#define CLKC_RESET_CPU0_SOFT_RESET 3 +#define CLKC_RESET_CPU1_SOFT_RESET 4 +#define CLKC_RESET_CPU2_SOFT_RESET 5 +#define CLKC_RESET_CPU3_SOFT_RESET 6 +#define CLKC_RESET_A5_GLOBAL_RESET 7 +#define CLKC_RESET_A5_AXI_SOFT_RESET 8 +#define CLKC_RESET_A5_ABP_SOFT_RESET 9 +#define CLKC_RESET_AXI_64_TO_128_BRIDGE_MMC_SOFT_RESET 10 +#define CLKC_RESET_VID_CLK_CNTL_SOFT_RESET 11 +#define CLKC_RESET_VID_DIVIDER_CNTL_SOFT_RESET_POST 12 +#define CLKC_RESET_VID_DIVIDER_CNTL_SOFT_RESET_PRE 13 +#define CLKC_RESET_VID_DIVIDER_CNTL_RESET_N_POST 14 +#define CLKC_RESET_VID_DIVIDER_CNTL_RESET_N_PRE 15 + +#endif /* _DT_BINDINGS_AMLOGIC_MESON8B_CLKC_RESET_H */ -- cgit v1.2.3 From f1e3f409d61372b742697d0b5e5cd17d09de37db Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 29 Jul 2017 11:40:55 -0300 Subject: ASoC: soc-pcm: Remove unused 'debugfs_dpcm_state' entry The 'debugfs_dpcm_state' member of struct snd_soc_pcm_runtime is never used at all, so it is safe to remove it. Signed-off-by: Fabio Estevam Signed-off-by: Mark Brown --- include/sound/soc.h | 1 - sound/soc/soc-pcm.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 9c94b97c17f8..230b8eedd9f2 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1240,7 +1240,6 @@ struct snd_soc_pcm_runtime { struct delayed_work delayed_work; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dpcm_root; - struct dentry *debugfs_dpcm_state; #endif unsigned int num; /* 0-based and monotonic increasing */ diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index dcc5ece08668..2ddf6aae230d 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -3010,8 +3010,7 @@ void soc_dpcm_debugfs_add(struct snd_soc_pcm_runtime *rtd) return; } - rtd->debugfs_dpcm_state = debugfs_create_file("state", 0444, - rtd->debugfs_dpcm_root, - rtd, &dpcm_state_fops); + debugfs_create_file("state", 0444, rtd->debugfs_dpcm_root, + rtd, &dpcm_state_fops); } #endif -- cgit v1.2.3 From ac7b848390036dadd4351899d2a23748075916bd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Jul 2017 00:02:31 +0200 Subject: netfilter: expect: add and use nf_ct_expect_iterate helpers We have several spots that open-code an expect walk; add a helper that is similar to nf_ct_iterate_destroy/nf_ct_iterate_cleanup. 
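For illustration, a caller-side sketch (the class match is hypothetical; the contract is simply that the iterator returns true for every expectation that should be removed):

/* Remove every expectation of a given class, across all namespaces. */
static bool expect_iter_class(struct nf_conntrack_expect *exp, void *data)
{
	return exp->class == *(unsigned int *)data;
}

static void flush_expect_class(unsigned int class)
{
	nf_ct_expect_iterate_destroy(expect_iter_class, &class);
}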
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 5 +++ net/netfilter/nf_conntrack_expect.c | 54 +++++++++++++++++++++++++ net/netfilter/nf_conntrack_helper.c | 34 +++++++--------- net/netfilter/nf_conntrack_netlink.c | 63 ++++++++++------------------- 4 files changed, 95 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 2ba54feaccd8..818def011110 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -107,6 +107,11 @@ void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); +void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data); +void nf_ct_expect_iterate_net(struct net *net, + bool (*iter)(struct nf_conntrack_expect *e, void *data), + void *data, u32 portid, int report); + /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 2c63808bea96..dad2c0c22ad5 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -474,6 +474,60 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); +void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), + void *data) +{ + struct nf_conntrack_expect *exp; + const struct hlist_node *next; + unsigned int i; + + spin_lock_bh(&nf_conntrack_expect_lock); + + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &nf_ct_expect_hash[i], + hnode) { + if (iter(exp, data) && del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } + } + + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); + +void nf_ct_expect_iterate_net(struct net *net, + bool (*iter)(struct nf_conntrack_expect *e, void *data), + void *data, + u32 portid, int report) +{ + struct nf_conntrack_expect *exp; + const struct hlist_node *next; + unsigned int i; + + spin_lock_bh(&nf_conntrack_expect_lock); + + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, next, + &nf_ct_expect_hash[i], + hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + + if (iter(exp, data) && del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, portid, report); + nf_ct_expect_put(exp); + } + } + } + + spin_unlock_bh(&nf_conntrack_expect_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); + #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 9129bb3b5153..551a1eddf0fa 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -437,12 +437,22 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); -void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) { - struct nf_conntrack_expect *exp; - const struct hlist_node *next; - unsigned int i; + struct nf_conn_help *help = nfct_help(exp->master); + const struct nf_conntrack_helper *me = 
data; + const struct nf_conntrack_helper *this; + + if (exp->helper == me) + return true; + this = rcu_dereference_protected(help->helper, + lockdep_is_held(&nf_conntrack_expect_lock)); + return this == me; +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ mutex_lock(&nf_ct_helper_mutex); hlist_del_rcu(&me->hnode); nf_ct_helper_count--; @@ -453,21 +463,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) */ synchronize_rcu(); - /* Get rid of expectations */ - spin_lock_bh(&nf_conntrack_expect_lock); - for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, next, - &nf_ct_expect_hash[i], hnode) { - struct nf_conn_help *help = nfct_help(exp->master); - if ((rcu_dereference_protected( - help->helper, - lockdep_is_held(&nf_conntrack_expect_lock) - ) == me || exp->helper == me)) - nf_ct_remove_expect(exp); - } - } - spin_unlock_bh(&nf_conntrack_expect_lock); - + nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4dba71de4de7..4922c8aefb2a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2898,6 +2898,21 @@ out: return err == -EAGAIN ? -ENOBUFS : err; } +static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) +{ + const struct nf_conn_help *m_help; + const char *name = data; + + m_help = nfct_help(exp->master); + + return strcmp(m_help->helper->name, name) == 0; +} + +static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) +{ + return true; +} + static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[], @@ -2906,10 +2921,8 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; - unsigned int i; int err; if (cda[CTA_EXPECT_TUPLE]) { @@ -2949,49 +2962,15 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, nf_ct_expect_put(exp); } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); - struct nf_conn_help *m_help; - /* delete all expectations for this helper */ - spin_lock_bh(&nf_conntrack_expect_lock); - for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, next, - &nf_ct_expect_hash[i], - hnode) { - - if (!net_eq(nf_ct_exp_net(exp), net)) - continue; - - m_help = nfct_help(exp->master); - if (!strcmp(m_help->helper->name, name) && - del_timer(&exp->timeout)) { - nf_ct_unlink_expect_report(exp, - NETLINK_CB(skb).portid, - nlmsg_report(nlh)); - nf_ct_expect_put(exp); - } - } - } - spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_iterate_net(net, expect_iter_name, name, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); } else { /* This basically means we have to flush everything*/ - spin_lock_bh(&nf_conntrack_expect_lock); - for (i = 0; i < nf_ct_expect_hsize; i++) { - hlist_for_each_entry_safe(exp, next, - &nf_ct_expect_hash[i], - hnode) { - - if (!net_eq(nf_ct_exp_net(exp), net)) - continue; - - if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect_report(exp, - NETLINK_CB(skb).portid, - nlmsg_report(nlh)); - nf_ct_expect_put(exp); - } - } - } - 
spin_unlock_bh(&nf_conntrack_expect_lock); + nf_ct_expect_iterate_net(net, expect_iter_all, NULL, + NETLINK_CB(skb).portid, + nlmsg_report(nlh)); } return 0; -- cgit v1.2.3 From 84657984c26fd0b64743a397f3a1a587fa4b575a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Jul 2017 00:02:32 +0200 Subject: netfilter: add and use nf_ct_unconfirmed_destroy This also removes __nf_ct_unconfirmed_destroy() call from nf_ct_iterate_cleanup_net, so that function can be used only when missing conntracks from unconfirmed list isn't a problem. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +++ net/netfilter/nf_conntrack_core.c | 15 +++++++++++---- net/netfilter/nfnetlink_cttimeout.c | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 48407569585d..6e6f678aaac7 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -224,6 +224,9 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); +/* Set all unconfirmed conntrack as dying */ +void nf_ct_unconfirmed_destroy(struct net *); + /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c6f1cf0bff56..80ab4e937765 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1686,6 +1686,17 @@ __nf_ct_unconfirmed_destroy(struct net *net) } } +void nf_ct_unconfirmed_destroy(struct net *net) +{ + might_sleep(); + + if (atomic_read(&net->ct.count) > 0) { + __nf_ct_unconfirmed_destroy(net); + synchronize_net(); + } +} +EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); + void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) @@ -1697,14 +1708,10 @@ void nf_ct_iterate_cleanup_net(struct net *net, if (atomic_read(&net->ct.count) == 0) return; - __nf_ct_unconfirmed_destroy(net); - d.iter = iter; d.data = data; d.net = net; - synchronize_net(); - nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 7ce9e86d374c..f4fb6d4dd0b9 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -570,6 +570,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) { struct ctnl_timeout *cur, *tmp; + nf_ct_unconfirmed_destroy(net); ctnl_untimeout(net, NULL); list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { -- cgit v1.2.3 From 2cf0c8b3e6942ecafe6ebb1a6d0328a81641bf39 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:40 +0200 Subject: netlink: Introduce nla_strdup() This is similar to strdup() for netlink string attributes. 
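A minimal usage sketch (the attribute and helper names are hypothetical; the caller owns the result and must kfree() it):

static int my_copy_name(const struct nlattr *attr, char **namep)
{
	char *name = nla_strdup(attr, GFP_KERNEL);

	if (!name)
		return -ENOMEM;
	*namep = name;	/* released with kfree() by the caller */
	return 0;
}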
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netlink.h | 1 + lib/nlattr.c | 24 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index ef8e6c3a80a6..c8c2eb5ae55e 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -247,6 +247,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); +char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); diff --git a/lib/nlattr.c b/lib/nlattr.c index fb52435be42d..f13013f7e21a 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -271,6 +271,30 @@ size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) } EXPORT_SYMBOL(nla_strlcpy); +/** + * nla_strdup - Copy string attribute payload into a newly allocated buffer + * @nla: attribute to copy the string from + * @flags: the type of memory to allocate (see kmalloc). + * + * Returns a pointer to the allocated buffer or NULL on error. + */ +char *nla_strdup(const struct nlattr *nla, gfp_t flags) +{ + size_t srclen = nla_len(nla); + char *src = nla_data(nla), *dst; + + if (srclen > 0 && src[srclen - 1] == '\0') + srclen--; + + dst = kmalloc(srclen + 1, flags); + if (dst != NULL) { + memcpy(dst, src, srclen); + dst[srclen] = '\0'; + } + return dst; +} +EXPORT_SYMBOL(nla_strdup); + /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy -- cgit v1.2.3 From e46abbcc05aa8a16b0e7f5c94e86d11af9aa2770 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:41 +0200 Subject: netfilter: nf_tables: Allow table names of up to 255 chars Allocate all table names dynamically to allow for arbitrary lengths but introduce NFT_NAME_MAXLEN as an upper sanity boundary. Its value was chosen to allow using a domain name as per RFC 1035. 
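The same three-step conversion recurs in this and the following patches (chain, set and object names): duplicate the attribute on create, free it on destroy, and size netlink messages with strlen() instead of the old fixed maximum. In sketch form, for a hypothetical object:

struct my_named_obj {
	char *name;	/* was: char name[SOME_MAXNAMELEN]; */
};

static int my_named_obj_init(struct my_named_obj *obj, const struct nlattr *attr)
{
	obj->name = nla_strdup(attr, GFP_KERNEL);
	return obj->name ? 0 : -ENOMEM;
}

static void my_named_obj_destroy(struct my_named_obj *obj)
{
	kfree(obj->name);	/* now dynamically allocated */
}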
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 3 +- net/netfilter/nf_tables_api.c | 49 +++++++++++++++++++++++--------- net/netfilter/nf_tables_trace.c | 2 +- 4 files changed, 40 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bd5be0d691d5..05ecf78ec078 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -957,7 +957,7 @@ struct nft_table { u32 use; u16 flags:14, genmask:2; - char name[NFT_TABLE_MAXNAMELEN]; + char *name; }; enum nft_af_flags { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6f0a950e21c3..0b94e572ef16 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,7 +1,8 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H -#define NFT_TABLE_MAXNAMELEN 32 +#define NFT_NAME_MAXLEN 256 +#define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_SET_MAXNAMELEN 32 #define NFT_OBJ_MAXNAMELEN 32 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b77ad0813564..c2e392d5e512 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -726,7 +726,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table == NULL) goto err2; - nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); + table->name = nla_strdup(name, GFP_KERNEL); + if (table->name == NULL) + goto err3; + INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); @@ -735,10 +738,12 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err3; + goto err4; list_add_tail_rcu(&table->list, &afi->tables); return 0; +err4: + kfree(table->name); err3: kfree(table); err2: @@ -865,6 +870,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) { BUG_ON(ctx->table->use > 0); + kfree(ctx->table->name); kfree(ctx->table); module_put(ctx->afi->owner); } @@ -1972,7 +1978,7 @@ err: } struct nft_rule_dump_ctx { - char table[NFT_TABLE_MAXNAMELEN]; + char *table; char chain[NFT_CHAIN_MAXNAMELEN]; }; @@ -1997,7 +2003,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx && ctx->table[0] && + if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) continue; @@ -2037,7 +2043,12 @@ done: static int nf_tables_dump_rules_done(struct netlink_callback *cb) { - kfree(cb->data); + struct nft_rule_dump_ctx *ctx = cb->data; + + if (ctx) { + kfree(ctx->table); + kfree(ctx); + } return 0; } @@ -2069,9 +2080,14 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (!ctx) return -ENOMEM; - if (nla[NFTA_RULE_TABLE]) - nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE], - sizeof(ctx->table)); + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], + GFP_KERNEL); + if (!ctx->table) { + kfree(ctx); + return -ENOMEM; + } + } if (nla[NFTA_RULE_CHAIN]) nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], sizeof(ctx->chain)); @@ -4410,7 +4426,7 @@ nla_put_failure: } struct nft_obj_filter { - char table[NFT_OBJ_MAXNAMELEN]; + char *table; u32 type; }; @@ -4475,7 +4491,10 @@ done: static int nf_tables_dump_obj_done(struct netlink_callback *cb) { 
- kfree(cb->data); + struct nft_obj_filter *filter = cb->data; + + kfree(filter->table); + kfree(filter); return 0; } @@ -4489,9 +4508,13 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) if (!filter) return ERR_PTR(-ENOMEM); - if (nla[NFTA_OBJ_TABLE]) - nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE], - NFT_TABLE_MAXNAMELEN); + if (nla[NFTA_OBJ_TABLE]) { + filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL); + if (!filter->table) { + kfree(filter); + return ERR_PTR(-ENOMEM); + } + } if (nla[NFTA_OBJ_TYPE]) filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 0c3a0049e4aa..62787d985e9d 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -175,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info) return; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + - nla_total_size(NFT_TABLE_MAXNAMELEN) + + nla_total_size(strlen(info->chain->table->name)) + nla_total_size(NFT_CHAIN_MAXNAMELEN) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ -- cgit v1.2.3 From b7263e071aba736cea9e71cdf2e76dfa7aebd039 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:42 +0200 Subject: netfilter: nf_tables: Allow chain name of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 34 ++++++++++++++++++++++++-------- net/netfilter/nf_tables_trace.c | 27 +++++++++++++++++++++++-- 4 files changed, 54 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 05ecf78ec078..be1610162ee0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -859,7 +859,7 @@ struct nft_chain { u16 level; u8 flags:6, genmask:2; - char name[NFT_CHAIN_MAXNAMELEN]; + char *name; }; enum nft_chain_type { @@ -1272,7 +1272,7 @@ struct nft_trans_set { struct nft_trans_chain { bool update; - char name[NFT_CHAIN_MAXNAMELEN]; + char *name; struct nft_stats __percpu *stats; u8 policy; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0b94e572ef16..d9c03a8608ee 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -3,7 +3,7 @@ #define NFT_NAME_MAXLEN 256 #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_SET_MAXNAMELEN 32 #define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c2e392d5e512..747499039709 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1250,8 +1250,10 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) static_branch_dec(&nft_counters_enabled); if (basechain->ops[0].dev != NULL) dev_put(basechain->ops[0].dev); + kfree(chain->name); kfree(basechain); } else { + kfree(chain->name); kfree(chain); } } @@ -1476,8 +1478,13 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, nft_trans_chain_policy(trans) = -1; if (nla[NFTA_CHAIN_HANDLE] && name) { - nla_strlcpy(nft_trans_chain_name(trans), name, - NFT_CHAIN_MAXNAMELEN); + 
nft_trans_chain_name(trans) = + nla_strdup(name, GFP_KERNEL); + if (!nft_trans_chain_name(trans)) { + kfree(trans); + free_percpu(stats); + return -ENOMEM; + } } list_add_tail(&trans->list, &net->nft.commit_list); return 0; @@ -1544,7 +1551,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + chain->name = nla_strdup(name, GFP_KERNEL); + if (!chain->name) { + err = -ENOMEM; + goto err1; + } err = nf_tables_register_hooks(net, table, chain, afi->nops); if (err < 0) @@ -1979,7 +1990,7 @@ err: struct nft_rule_dump_ctx { char *table; - char chain[NFT_CHAIN_MAXNAMELEN]; + char *chain; }; static int nf_tables_dump_rules(struct sk_buff *skb, @@ -2047,6 +2058,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) if (ctx) { kfree(ctx->table); + kfree(ctx->chain); kfree(ctx); } return 0; @@ -2088,9 +2100,15 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return -ENOMEM; } } - if (nla[NFTA_RULE_CHAIN]) - nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], - sizeof(ctx->chain)); + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], + GFP_KERNEL); + if (!ctx->chain) { + kfree(ctx->table); + kfree(ctx); + return -ENOMEM; + } + } c.data = ctx; } @@ -4863,7 +4881,7 @@ static void nft_chain_commit_update(struct nft_trans *trans) { struct nft_base_chain *basechain; - if (nft_trans_chain_name(trans)[0]) + if (nft_trans_chain_name(trans)) strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); if (!nft_is_base_chain(trans->ctx.chain)) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 62787d985e9d..e1dc527a493b 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -162,6 +162,27 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, NFTA_TRACE_PAD); } +static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) +{ + switch (info->type) { + case NFT_TRACETYPE_RETURN: + case NFT_TRACETYPE_RULE: + break; + default: + return false; + } + + switch (info->verdict->code) { + case NFT_JUMP: + case NFT_GOTO: + break; + default: + return false; + } + + return true; +} + void nft_trace_notify(struct nft_traceinfo *info) { const struct nft_pktinfo *pkt = info->pkt; @@ -176,12 +197,11 @@ void nft_trace_notify(struct nft_traceinfo *info) size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(strlen(info->chain->table->name)) + - nla_total_size(NFT_CHAIN_MAXNAMELEN) + + nla_total_size(strlen(info->chain->name)) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */ nla_total_size(sizeof(u32)) + /* id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + @@ -194,6 +214,9 @@ void nft_trace_notify(struct nft_traceinfo *info) nla_total_size(sizeof(u32)) + /* nfproto */ nla_total_size(sizeof(u32)); /* policy */ + if (nft_trace_have_verdict_chain(info)) + size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */ + skb = nlmsg_new(size, GFP_ATOMIC); if (!skb) return; -- cgit v1.2.3 From 387454901bd62022ac1b04e15bd8d4fcc60bbed4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:43 +0200 Subject: netfilter: nf_tables: Allow set names 
of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 18 ++++++++++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index be1610162ee0..66ba62fa7d90 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -396,7 +396,7 @@ void nft_unregister_set(struct nft_set_type *type); struct nft_set { struct list_head list; struct list_head bindings; - char name[NFT_SET_MAXNAMELEN]; + char *name; u32 ktype; u32 dtype; u32 objtype; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index d9c03a8608ee..b5e73e80b7b6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -4,7 +4,7 @@ #define NFT_NAME_MAXLEN 256 #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_SET_MAXNAMELEN 32 +#define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 747499039709..e6a07f27b1a3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2650,7 +2650,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); + p = strchr(name, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2681,7 +2681,10 @@ cont: free_page((unsigned long)inuse); } - snprintf(set->name, sizeof(set->name), name, min + n); + set->name = kasprintf(GFP_KERNEL, name, min + n); + if (!set->name) + return -ENOMEM; + list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; @@ -2958,7 +2961,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[NFT_SET_MAXNAMELEN]; + char *name; unsigned int size; bool create; u64 timeout; @@ -3104,8 +3107,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, goto err1; } - nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); + name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); + if (!name) { + err = -ENOMEM; + goto err2; + } + err = nf_tables_set_alloc_name(&ctx, set, name); + kfree(name); if (err < 0) goto err2; @@ -3155,6 +3164,7 @@ static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); module_put(set->ops->type->owner); + kfree(set->name); kvfree(set); } -- cgit v1.2.3 From 615095752100748e221028fc96163c2b78185ae4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:44 +0200 Subject: netfilter: nf_tables: Allow object names of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 11 +++++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 66ba62fa7d90..f9795fe394f3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1016,7 +1016,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, */ struct nft_object { struct list_head list; - char name[NFT_OBJ_MAXNAMELEN]; + char *name; struct nft_table *table; u32 genmask:2, use:30; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b5e73e80b7b6..be25cf69295b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -5,7 +5,7 @@ #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_OBJ_MAXNAMELEN 32 +#define NFT_OBJ_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_USERDATA_MAXLEN 256 /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e6a07f27b1a3..149785ff1c7b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4402,15 +4402,21 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err1; } obj->table = table; - nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN); + obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); + if (!obj->name) { + err = -ENOMEM; + goto err2; + } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; +err3: + kfree(obj->name); err2: if (obj->type->destroy) obj->type->destroy(obj); @@ -4626,6 +4632,7 @@ static void nft_obj_destroy(struct nft_object *obj) obj->type->destroy(obj); module_put(obj->type->owner); + kfree(obj->name); kfree(obj); } -- cgit v1.2.3 From 4d3a57f23dec59f0a2362e63540b2d01b37afe0a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Jul 2017 11:22:04 +0200 Subject: netfilter: conntrack: do not enable connection tracking unless needed Discussion during NFWS 2017 in Faro has shown that the current conntrack behaviour is unreasonable. Even if the conntrack module is loaded on behalf of a single net namespace, it's turned on for all namespaces, which is expensive. Commit 481fa373476 ("netfilter: conntrack: add nf_conntrack_default_on sysctl") attempted to provide an alternative to the 'default on' behaviour by adding a sysctl to change it. However, as Eric points out, the sysctl only becomes available once the module is loaded, and then it's too late. So we either have to move the sysctl to the core, or, alternatively, change conntrack to become active only once the rule set requires this. This does the latter: conntrack is only enabled when a rule needs it. 
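As a sketch of the on-demand model from a rule's point of view (the target is hypothetical; nf_ct_netns_get()/nf_ct_netns_put() are the existing per-netns reference helpers this change relies on):

static int my_tg_check(const struct xt_tgchk_param *par)
{
	/* First rule in this netns pulls conntrack in. */
	return nf_ct_netns_get(par->net, par->family);
}

static void my_tg_destroy(const struct xt_tgdtor_param *par)
{
	/* Last rule gone: drop the conntrack reference again. */
	nf_ct_netns_put(par->net, par->family);
}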
Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_conntrack-sysctl.txt | 11 --------- include/net/netfilter/nf_conntrack_l3proto.h | 15 ------------ net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 16 ++----------- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 17 ++------------ net/netfilter/nf_conntrack_proto.c | 29 ------------------------ net/netfilter/nf_conntrack_standalone.c | 10 -------- 6 files changed, 4 insertions(+), 94 deletions(-) (limited to 'include') diff --git a/Documentation/networking/nf_conntrack-sysctl.txt b/Documentation/networking/nf_conntrack-sysctl.txt index 497d668288f9..433b6724797a 100644 --- a/Documentation/networking/nf_conntrack-sysctl.txt +++ b/Documentation/networking/nf_conntrack-sysctl.txt @@ -96,17 +96,6 @@ nf_conntrack_max - INTEGER Size of connection tracking table. Default value is nf_conntrack_buckets value * 4. -nf_conntrack_default_on - BOOLEAN - 0 - don't register conntrack in new net namespaces - 1 - register conntrack in new net namespaces (default) - - This controls wheter newly created network namespaces have connection - tracking enabled by default. It will be enabled automatically - regardless of this setting if the new net namespace requires - connection tracking, e.g. when NAT rules are created. - This setting is only visible in initial user namespace, it has no - effect on existing namespaces. - nf_conntrack_tcp_be_liberal - BOOLEAN 0 - disabled (default) not 0 - enabled diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 6d14b36e3a49..1b8de164d744 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -73,21 +73,6 @@ struct nf_conntrack_l3proto { extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; -#ifdef CONFIG_SYSCTL -/* Protocol pernet registration. */ -int nf_ct_l3proto_pernet_register(struct net *net, - struct nf_conntrack_l3proto *proto); -#else -static inline int nf_ct_l3proto_pernet_register(struct net *n, - struct nf_conntrack_l3proto *p) -{ - return 0; -} -#endif - -void nf_ct_l3proto_pernet_unregister(struct net *net, - struct nf_conntrack_l3proto *proto); - /* Protocol global registration. 
*/ int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto); void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 63e4ea0e01f8..de5f0e6ddd1b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -398,24 +398,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto4[] = { static int ipv4_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - if (ret < 0) - return ret; - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4); - if (ret < 0) { - pr_err("nf_conntrack_ipv4: pernet registration failed\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, - ARRAY_SIZE(builtin_l4proto4)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto4, + ARRAY_SIZE(builtin_l4proto4)); } static void ipv4_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4, ARRAY_SIZE(builtin_l4proto4)); } diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index f2d2f4a9294b..ddef5ee9e0a8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -398,25 +398,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = { static int ipv6_net_init(struct net *net) { - int ret = 0; - - ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - if (ret < 0) - return ret; - - ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: pernet registration failed.\n"); - nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, - ARRAY_SIZE(builtin_l4proto6)); - } - return ret; + return nf_ct_l4proto_pernet_register(net, builtin_l4proto6, + ARRAY_SIZE(builtin_l4proto6)); } static void ipv6_net_exit(struct net *net) { - nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6); nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6, ARRAY_SIZE(builtin_l4proto6)); } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 1dcad229c3cc..7c89dade6fd3 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -238,20 +238,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -#ifdef CONFIG_SYSCTL -extern unsigned int nf_conntrack_default_on; - -int nf_ct_l3proto_pernet_register(struct net *net, - struct nf_conntrack_l3proto *proto) -{ - if (nf_conntrack_default_on == 0) - return 0; - - return proto->net_ns_get ? proto->net_ns_get(net) : 0; -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); -#endif - void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); @@ -270,21 +256,6 @@ void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) } EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); -void nf_ct_l3proto_pernet_unregister(struct net *net, - struct nf_conntrack_l3proto *proto) -{ - /* - * nf_conntrack_default_on *might* have registered hooks. - * ->net_ns_put must cope with more puts() than get(), i.e. - * if nf_conntrack_default_on was 0 at time of - * nf_ct_l3proto_pernet_register invocation this net_ns_put() - * should be a noop. 
- */ - if (proto->net_ns_put) - proto->net_ns_put(net); -} -EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_unregister); - static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, struct nf_conntrack_l4proto *l4proto) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index ccb5cb9043e0..5b6c675d55b1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -452,9 +452,6 @@ static int log_invalid_proto_max __read_mostly = 255; /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; -extern unsigned int nf_conntrack_default_on; -unsigned int nf_conntrack_default_on __read_mostly = 1; - static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -520,13 +517,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "nf_conntrack_default_on", - .data = &nf_conntrack_default_on, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; -- cgit v1.2.3 From bf90aadd630c2c9f7f965ba1e90d41b5b46db7c9 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 24 Jul 2017 07:46:12 -0700 Subject: IB/hfi1: Send MAD traps until repressed A trap should be sent to the FM until the FM sends a repress message. This is in line with the IBTA 13.4.9. Add the ability to resend traps until a repress message is received. Reviewed-by: Dennis Dalessandro Reviewed-by: Michael N. Henry Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 373 ++++++++++++++++++++++++++++--------- drivers/infiniband/hw/hfi1/mad.h | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 5 + include/rdma/rdma_vt.h | 17 ++ 4 files changed, 310 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index a081a98d728a..0a3e2dfdf56e 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -59,6 +59,16 @@ #define OPA_LINK_WIDTH_RESET_OLD 0x0fff #define OPA_LINK_WIDTH_RESET 0xffff +struct trap_node { + struct list_head list; + struct opa_mad_notice_attr data; + __be64 tid; + int len; + u32 retry; + u8 in_use; + u8 repress; +}; + static int smp_length_check(u32 data_size, u32 request_len) { if (unlikely(request_len < data_size)) @@ -97,28 +107,156 @@ void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port) ib_dispatch_event(&event); } -static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) +/* + * If the port is down, clean up all pending traps. We need to be careful + * with the given trap, because it may be queued. + */ +static void cleanup_traps(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct trap_node *node, *q; + unsigned long flags; + struct list_head trap_list; + int i; + + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + list_replace_init(&ibp->rvp.trap_lists[i].list, &trap_list); + ibp->rvp.trap_lists[i].list_len = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + /* + * Remove all items from the list, freeing all the non-given + * traps. + */ + list_for_each_entry_safe(node, q, &trap_list, list) { + list_del(&node->list); + if (node != trap) + kfree(node); + } + } + + /* + * If this wasn't on one of the lists it would not be freed. If it + * was on the list, it is now safe to free. 
+ */ + kfree(trap); +} + +static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, + struct trap_node *trap) +{ + struct trap_node *node; + struct trap_list *trap_list; + unsigned long flags; + unsigned long timeout; + int found = 0; + + /* + * Since the retry (handle timeout) does not remove a trap request + * from the list, all we have to do is compare the node. + */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + trap_list = &ibp->rvp.trap_lists[trap->data.generic_type & 0x0F]; + + list_for_each_entry(node, &trap_list->list, list) { + if (node == trap) { + node->retry++; + found = 1; + break; + } + } + + /* If it is not on the list, add it, limited to RVT_MAX_TRAP_LEN. */ + if (!found) { + if (trap_list->list_len < RVT_MAX_TRAP_LEN) { + trap_list->list_len++; + list_add_tail(&trap->list, &trap_list->list); + } else { + pr_warn_ratelimited("hfi1: Maximum trap limit reached for 0x%0x traps\n", + trap->data.generic_type); + kfree(trap); + } + } + + /* + * Next check to see if there is a timer pending. If not, set it up + * and get the first trap from the list. + */ + node = NULL; + if (!timer_pending(&ibp->rvp.trap_timer)) { + /* + * o14-2 + * If the time out is set we have to wait until it expires + * before the trap can be sent. + * This should be > RVT_TRAP_TIMEOUT + */ + timeout = (RVT_TRAP_TIMEOUT * + (1UL << ibp->rvp.subnet_timeout)) / 1000; + mod_timer(&ibp->rvp.trap_timer, + jiffies + usecs_to_jiffies(timeout)); + node = list_first_entry(&trap_list->list, struct trap_node, + list); + node->in_use = 1; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return node; +} + +static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, + struct opa_smp *smp) +{ + struct trap_list *trap_list; + struct trap_node *trap; + unsigned long flags; + int i; + + if (smp->attr_id != IB_SMP_ATTR_NOTICE) + return; + + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + trap_list = &ibp->rvp.trap_lists[i]; + trap = list_first_entry_or_null(&trap_list->list, + struct trap_node, list); + if (trap && trap->tid == smp->tid) { + if (trap->in_use) { + trap->repress = 1; + } else { + trap_list->list_len--; + list_del(&trap->list); + kfree(trap); + } + break; + } + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); +} + +static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) { struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent; struct opa_smp *smp; - int ret; unsigned long flags; - unsigned long timeout; int pkey_idx; u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; agent = ibp->rvp.send_agent; - if (!agent) + if (!agent) { + cleanup_traps(ibp, trap); return; + } /* o14-3.2.1 */ - if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) + if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) { + cleanup_traps(ibp, trap); return; + } - /* o14-2 */ - if (ibp->rvp.trap_timeout && time_before(jiffies, - ibp->rvp.trap_timeout) + /* Add the trap to the list if necessary and see if we can send it */ + trap = check_and_add_trap(ibp, trap); + if (!trap) return; pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); @@ -139,11 +277,21 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; smp->class_version = OPA_SM_CLASS_VERSION; smp->method = IB_MGMT_METHOD_TRAP; - ibp->rvp.tid++; - smp->tid = cpu_to_be64(ibp->rvp.tid); + + /* Only update the transaction ID for new traps (o13-5).
*/ + if (trap->tid == 0) { + ibp->rvp.tid++; + /* make sure that tid != 0 */ + if (ibp->rvp.tid == 0) + ibp->rvp.tid++; + trap->tid = cpu_to_be64(ibp->rvp.tid); + } + smp->tid = trap->tid; + smp->attr_id = IB_SMP_ATTR_NOTICE; /* o14-1: smp->mkey = 0; */ - memcpy(smp->route.lid.data, data, len); + + memcpy(smp->route.lid.data, &trap->data, trap->len); spin_lock_irqsave(&ibp->rvp.lock, flags); if (!ibp->rvp.sm_ah) { @@ -152,31 +300,72 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); if (IS_ERR(ah)) { - ret = PTR_ERR(ah); - } else { - send_buf->ah = ah; - ibp->rvp.sm_ah = ibah_to_rvtah(ah); - ret = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; } + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); } else { - ret = -EINVAL; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; } } else { send_buf->ah = &ibp->rvp.sm_ah->ibah; - ret = 0; } + + /* + * If the trap was repressed while things were getting set up, don't + * bother sending it. This could happen for a retry. + */ + if (trap->repress) { + list_del(&trap->list); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + kfree(trap); + ib_free_send_mad(send_buf); + return; + } + + trap->in_use = 0; spin_unlock_irqrestore(&ibp->rvp.lock, flags); - if (!ret) - ret = ib_post_send_mad(send_buf, NULL); - if (!ret) { - /* 4.096 usec. */ - timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; - ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); - } else { + if (ib_post_send_mad(send_buf, NULL)) ib_free_send_mad(send_buf); - ibp->rvp.trap_timeout = 0; +} + +void hfi1_handle_trap_timer(unsigned long data) +{ + struct hfi1_ibport *ibp = (struct hfi1_ibport *)data; + struct trap_node *trap = NULL; + unsigned long flags; + int i; + + /* Find the trap with the highest priority */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; !trap && i < RVT_MAX_TRAP_LISTS; i++) { + trap = list_first_entry_or_null(&ibp->rvp.trap_lists[i].list, + struct trap_node, list); } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (trap) + send_trap(ibp, trap); +} + +static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid) +{ + struct trap_node *trap; + + trap = kzalloc(sizeof(*trap), GFP_ATOMIC); + if (!trap) + return NULL; + + INIT_LIST_HEAD(&trap->list); + trap->data.generic_type = type; + trap->data.prod_type_lsb = IB_NOTICE_PROD_CA; + trap->data.trap_num = trap_num; + trap->data.issuer_lid = cpu_to_be32(lid); + + return trap; } /* @@ -185,28 +374,29 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, u32 qp1, u32 qp2, u16 lid1, u16 lid2) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; u32 _lid1 = lid1; u32 _lid2 = lid2; - memset(&data, 0, sizeof(data)); ibp->rvp.n_pkt_drops++; ibp->rvp.pkey_violations++; + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_P_KEY, + lid); + if (!trap) + return; + /* Send violation trap */ - data.generic_type = IB_NOTICE_TYPE_SECURITY; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_BAD_P_KEY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_257_258.lid1 = cpu_to_be32(_lid1); - data.ntc_257_258.lid2 = cpu_to_be32(_lid2); - data.ntc_257_258.key = cpu_to_be32(key); - data.ntc_257_258.sl = sl << 3; - data.ntc_257_258.qp1 = cpu_to_be32(qp1); - data.ntc_257_258.qp2 = cpu_to_be32(qp2); - - send_trap(ibp, &data, sizeof(data)); + 
trap->data.ntc_257_258.lid1 = cpu_to_be32(_lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(_lid2); + trap->data.ntc_257_258.key = cpu_to_be32(key); + trap->data.ntc_257_258.sl = sl << 3; + trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); + trap->data.ntc_257_258.qp2 = cpu_to_be32(qp2); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -215,34 +405,36 @@ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_M_KEY, + lid); + if (!trap) + return; + /* Send violation trap */ - data.generic_type = IB_NOTICE_TYPE_SECURITY; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_BAD_M_KEY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_256.lid = data.issuer_lid; - data.ntc_256.method = mad->method; - data.ntc_256.attr_id = mad->attr_id; - data.ntc_256.attr_mod = mad->attr_mod; - data.ntc_256.mkey = mkey; + trap->data.ntc_256.lid = trap->data.issuer_lid; + trap->data.ntc_256.method = mad->method; + trap->data.ntc_256.attr_id = mad->attr_id; + trap->data.ntc_256.attr_mod = mad->attr_mod; + trap->data.ntc_256.mkey = mkey; if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - data.ntc_256.dr_slid = dr_slid; - data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; - if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) { - data.ntc_256.dr_trunc_hop |= + trap->data.ntc_256.dr_slid = dr_slid; + trap->data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path)) { + trap->data.ntc_256.dr_trunc_hop |= IB_NOTICE_TRAP_DR_TRUNC; - hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path); + hop_cnt = ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path); } - data.ntc_256.dr_trunc_hop |= hop_cnt; - memcpy(data.ntc_256.dr_rtn_path, return_path, + trap->data.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(trap->data.ntc_256.dr_rtn_path, return_path, hop_cnt); } - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + + send_trap(ibp, trap); } /* @@ -250,23 +442,24 @@ static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, */ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) { - struct opa_mad_notice_attr data; + struct trap_node *trap; struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); struct hfi1_devdata *dd = dd_from_dev(verbs_dev); struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_144.lid = data.issuer_lid; - data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); - data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + trap->data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -274,19 +467,19 @@ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) */ 
void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, OPA_TRAP_CHANGE_SYSGUID, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_SYSGUID; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; - data.ntc_145.lid = data.issuer_lid; + trap->data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + trap->data.ntc_145.lid = trap->data.issuer_lid; - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -294,20 +487,21 @@ void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) */ void hfi1_node_desc_chg(struct hfi1_ibport *ibp) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_144.lid = data.issuer_lid; - data.ntc_144.change_flags = + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.change_flags = cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG); - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, @@ -4144,6 +4338,11 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, */ ret = IB_MAD_RESULT_SUCCESS; break; + case IB_MGMT_METHOD_TRAP_REPRESS: + subn_handle_opa_trap_repress(ibp, smp); + /* Always successful */ + ret = IB_MAD_RESULT_SUCCESS; + break; default: smp->status |= IB_SMP_UNSUP_METHOD; ret = reply((struct ib_mad_hdr *)smp); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index a4e2506bd5ca..4c1245072093 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -428,5 +428,6 @@ struct sc2vlnt { COUNTER_MASK(1, 4)) void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port); +void hfi1_handle_trap_timer(unsigned long data); #endif /* _HFI1_MAD_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 3ef6384eae40..dc51bf247006 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1535,6 +1535,11 @@ static void init_ibport(struct hfi1_pportdata *ppd) ibp->sc_to_sl[i] = i; } + for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++) + INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list); + setup_timer(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, + (unsigned long)ibp); + spin_lock_init(&ibp->rvp.lock); /* Set the prefix to the default value (see ch. 
4.1.1) */ ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 22fb15ff5e8b..fdfac0fd2f82 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -57,11 +57,21 @@ #include #include #include +#include #include #include #define RVT_MAX_PKEY_VALUES 16 +#define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */ +#define RVT_MAX_TRAP_LISTS ((IB_NOTICE_TYPE_INFO & 0x0F) + 1) +#define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */ + +struct trap_list { + u32 list_len; + struct list_head list; +}; + struct rvt_ibport { struct rvt_qp __rcu *qp[2]; struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ @@ -128,6 +138,13 @@ struct rvt_ibport { u16 *pkey_table; struct rvt_ah *sm_ah; + + /* + * Keep a list of traps that have not been repressed. They will be + * resent based on trap_timer. + */ + struct trap_list trap_lists[RVT_MAX_TRAP_LISTS]; + struct timer_list trap_timer; }; #define RVT_CQN_MAX 16 /* maximum length of cq name */ -- cgit v1.2.3 From c5dc3c69f17a7e77359f10c342d1816390bc8846 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jun 2017 20:04:58 +0200 Subject: PCI/portdrv: Move error handler methods to struct pcie_port_service_driver Move the error handler methods to struct pcie_port_service_driver and avoid the detour through the mostly unused pci_error_handlers structure. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aer/aerdrv.c | 25 +--------- drivers/pci/pcie/portdrv_pci.c | 105 +++-------------------------------------- include/linux/pcieport_if.h | 4 +- 3 files changed, 9 insertions(+), 125 deletions(-) (limited to 'include') diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index dea186a9d6b6..6ff5f5b4f5e6 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -32,16 +32,9 @@ static int aer_probe(struct pcie_device *dev); static void aer_remove(struct pcie_device *dev); -static pci_ers_result_t aer_error_detected(struct pci_dev *dev, - enum pci_channel_state error); static void aer_error_resume(struct pci_dev *dev); static pci_ers_result_t aer_root_reset(struct pci_dev *dev); -static const struct pci_error_handlers aer_error_handlers = { - .error_detected = aer_error_detected, - .resume = aer_error_resume, -}; - static struct pcie_port_service_driver aerdriver = { .name = "aer", .port_type = PCI_EXP_TYPE_ROOT_PORT, @@ -49,9 +42,7 @@ static struct pcie_port_service_driver aerdriver = { .probe = aer_probe, .remove = aer_remove, - - .err_handler = &aer_error_handlers, - + .error_resume = aer_error_resume, .reset_link = aer_root_reset, }; @@ -349,20 +340,6 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) return PCI_ERS_RESULT_RECOVERED; } -/** - * aer_error_detected - update severity status - * @dev: pointer to Root Port's pci_dev data structure - * @error: error severity being notified by port bus - * - * Invoked by Port Bus driver during error recovery. - */ -static pci_ers_result_t aer_error_detected(struct pci_dev *dev, - enum pci_channel_state error) -{ - /* Root Port has no impact. Always recovers. 
*/ - return PCI_ERS_RESULT_CAN_RECOVER; -} - /** * aer_error_resume - clean up corresponding error status bits * @dev: pointer to Root Port's pci_dev data structure diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c index 8aa3f14bc87d..be635f017756 100644 --- a/drivers/pci/pcie/portdrv_pci.c +++ b/drivers/pci/pcie/portdrv_pci.c @@ -21,7 +21,6 @@ #include "../pci.h" #include "portdrv.h" -#include "aer/aerdrv.h" /* If this switch is set, PCIe port native services should not be enabled. */ bool pcie_ports_disabled; @@ -177,108 +176,20 @@ static void pcie_portdrv_remove(struct pci_dev *dev) pcie_port_device_remove(dev); } -static int error_detected_iter(struct device *device, void *data) -{ - struct pcie_device *pcie_device; - struct pcie_port_service_driver *driver; - struct aer_broadcast_data *result_data; - pci_ers_result_t status; - - result_data = (struct aer_broadcast_data *) data; - - if (device->bus == &pcie_port_bus_type && device->driver) { - driver = to_service_driver(device->driver); - if (!driver || - !driver->err_handler || - !driver->err_handler->error_detected) - return 0; - - pcie_device = to_pcie_device(device); - - /* Forward error detected message to service drivers */ - status = driver->err_handler->error_detected( - pcie_device->port, - result_data->state); - result_data->result = - merge_result(result_data->result, status); - } - - return 0; -} - static pci_ers_result_t pcie_portdrv_error_detected(struct pci_dev *dev, enum pci_channel_state error) { - struct aer_broadcast_data data = {error, PCI_ERS_RESULT_CAN_RECOVER}; - - /* get true return value from &data */ - device_for_each_child(&dev->dev, &data, error_detected_iter); - return data.result; -} - -static int mmio_enabled_iter(struct device *device, void *data) -{ - struct pcie_device *pcie_device; - struct pcie_port_service_driver *driver; - pci_ers_result_t status, *result; - - result = (pci_ers_result_t *) data; - - if (device->bus == &pcie_port_bus_type && device->driver) { - driver = to_service_driver(device->driver); - if (driver && - driver->err_handler && - driver->err_handler->mmio_enabled) { - pcie_device = to_pcie_device(device); - - /* Forward error message to service drivers */ - status = driver->err_handler->mmio_enabled( - pcie_device->port); - *result = merge_result(*result, status); - } - } - - return 0; + /* Root Port has no impact. Always recovers. 
*/ + return PCI_ERS_RESULT_CAN_RECOVER; } static pci_ers_result_t pcie_portdrv_mmio_enabled(struct pci_dev *dev) { - pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED; - - /* get true return value from &status */ - device_for_each_child(&dev->dev, &status, mmio_enabled_iter); - return status; -} - -static int slot_reset_iter(struct device *device, void *data) -{ - struct pcie_device *pcie_device; - struct pcie_port_service_driver *driver; - pci_ers_result_t status, *result; - - result = (pci_ers_result_t *) data; - - if (device->bus == &pcie_port_bus_type && device->driver) { - driver = to_service_driver(device->driver); - if (driver && - driver->err_handler && - driver->err_handler->slot_reset) { - pcie_device = to_pcie_device(device); - - /* Forward error message to service drivers */ - status = driver->err_handler->slot_reset( - pcie_device->port); - *result = merge_result(*result, status); - } - } - - return 0; + return PCI_ERS_RESULT_RECOVERED; } static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) { - pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED; - /* If fatal, restore cfg space for possible link reset at upstream */ if (dev->error_state == pci_channel_io_frozen) { dev->state_saved = true; @@ -287,9 +198,7 @@ static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev) pci_enable_pcie_error_reporting(dev); } - /* get true return value from &status */ - device_for_each_child(&dev->dev, &status, slot_reset_iter); - return status; + return PCI_ERS_RESULT_RECOVERED; } static int resume_iter(struct device *device, void *data) @@ -299,13 +208,11 @@ static int resume_iter(struct device *device, void *data) if (device->bus == &pcie_port_bus_type && device->driver) { driver = to_service_driver(device->driver); - if (driver && - driver->err_handler && - driver->err_handler->resume) { + if (driver && driver->error_resume) { pcie_device = to_pcie_device(device); /* Forward error message to service drivers */ - driver->err_handler->resume(pcie_device->port); + driver->error_resume(pcie_device->port); } } diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index afcd130ab3a9..18edc651c070 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -50,8 +50,8 @@ struct pcie_port_service_driver { int (*suspend) (struct pcie_device *dev); int (*resume) (struct pcie_device *dev); - /* Service Error Recovery Handler */ - const struct pci_error_handlers *err_handler; + /* Device driver may resume normal operations */ + void (*error_resume)(struct pci_dev *dev); /* Link Reset Capability - AER service driver specific */ pci_ers_result_t (*reset_link) (struct pci_dev *dev); -- cgit v1.2.3 From 62ce94a7a5a54aac80975f5e6731707225d4077e Mon Sep 17 00:00:00 2001 From: Sinan Kaya Date: Wed, 12 Jul 2017 00:04:14 -0400 Subject: PCI: Mark Broadcom HT2100 Root Port Extended Tags as broken Per PCIe r3.1, sec 2.2.6.2 and 7.8.4, a Requester may not use 8-bit Tags unless its Extended Tag Field Enable is set, but all Receivers/Completers must handle 8-bit Tags correctly regardless of their Extended Tag Field Enable. Some devices do not handle 8-bit Tags as Completers, so add a quirk for them. If we find such a device, we disable Extended Tags for the entire hierarchy to make peer-to-peer DMA possible. The Broadcom HT2100 seems to have issues with handling 8-bit tags. Mark it as broken. The pci_walk_bus() in the quirk handles devices we've enumerated in the past, and pci_configure_device() handles devices we enumerate in the future. 
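For context, here is a minimal, hypothetical sketch (not part of this patch; the helper name ext_tag_state() is invented for illustration) of how the two bits this change manipulates, Extended Tag Field Supported in DEVCAP and Extended Tag Field Enable in DEVCTL, can be queried with the standard config-space accessors:

#include <linux/pci.h>
#include <linux/errno.h>

/*
 * Illustrative helper (assumed, not from this patch): returns 1 if @dev
 * supports Extended Tags and currently has them enabled, 0 if supported
 * but disabled, and a negative errno otherwise.
 */
static int ext_tag_state(struct pci_dev *dev)
{
	u32 cap;
	u16 ctl;

	if (!pci_is_pcie(dev))
		return -ENODEV;

	if (pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &cap))
		return -EIO;

	/* Extended Tag Field Supported (Device Capabilities, bit 5) */
	if (!(cap & PCI_EXP_DEVCAP_EXT_TAG))
		return -EOPNOTSUPP;

	if (pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &ctl))
		return -EIO;

	/* Extended Tag Field Enable (Device Control, bit 8) */
	return !!(ctl & PCI_EXP_DEVCTL_EXT_TAG);
}

The quirk below clears exactly this DEVCTL enable bit on every device in the affected hierarchy.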
Fixes: 60db3a4d8cc9 ("PCI: Enable PCIe Extended Tags if supported") Link: https://bugzilla.redhat.com/show_bug.cgi?id=1467674 Reported-and-tested-by: Wim ten Have Signed-off-by: Sinan Kaya [bhelgaas: changelog, tweak messages, rename bit and quirk] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 1 + drivers/pci/probe.c | 43 ++++++++++++++++++++++++++++++++++++------- drivers/pci/quirks.c | 16 ++++++++++++++++ include/linux/pci.h | 1 + 4 files changed, 54 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 22e061738c6f..a6560c9baa52 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -235,6 +235,7 @@ enum pci_bar_type { pci_bar_mem64, /* A 64-bit memory BAR */ }; +int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); int pci_setup_device(struct pci_dev *dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c31310db0404..c81c9835f4c7 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1745,21 +1745,50 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) */ } -static void pci_configure_extended_tags(struct pci_dev *dev) +int pci_configure_extended_tags(struct pci_dev *dev, void *ign) { - u32 dev_cap; + struct pci_host_bridge *host; + u32 cap; + u16 ctl; int ret; if (!pci_is_pcie(dev)) - return; + return 0; - ret = pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &dev_cap); + ret = pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &cap); if (ret) - return; + return 0; + + if (!(cap & PCI_EXP_DEVCAP_EXT_TAG)) + return 0; - if (dev_cap & PCI_EXP_DEVCAP_EXT_TAG) + ret = pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &ctl); + if (ret) + return 0; + + host = pci_find_host_bridge(dev->bus); + if (!host) + return 0; + + /* + * If some device in the hierarchy doesn't handle Extended Tags + * correctly, make sure they're disabled. 
+ */ + if (host->no_ext_tags) { + if (ctl & PCI_EXP_DEVCTL_EXT_TAG) { + dev_info(&dev->dev, "disabling Extended Tags\n"); + pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_EXT_TAG); + } + return 0; + } + + if (!(ctl & PCI_EXP_DEVCTL_EXT_TAG)) { + dev_info(&dev->dev, "enabling Extended Tags\n"); pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_EXT_TAG); + } + return 0; } static void pci_configure_device(struct pci_dev *dev) @@ -1768,7 +1797,7 @@ int ret; pci_configure_mps(dev); - pci_configure_extended_tags(dev); + pci_configure_extended_tags(dev, NULL); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b4cf6b..f135765555c9 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4681,3 +4681,19 @@ static void quirk_intel_no_flr(struct pci_dev *dev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_intel_no_flr); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_intel_no_flr); + +static void quirk_no_ext_tags(struct pci_dev *pdev) +{ + struct pci_host_bridge *bridge = pci_find_host_bridge(pdev->bus); + + if (!bridge) + return; + + bridge->no_ext_tags = 1; + dev_info(&pdev->dev, "disabling Extended Tags (this device can't handle them)\n"); + + pci_walk_bus(bridge->bus, pci_configure_extended_tags, NULL); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0140, quirk_no_ext_tags); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0142, quirk_no_ext_tags); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, 0x0144, quirk_no_ext_tags); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..3b968d435895 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -451,6 +451,7 @@ struct pci_host_bridge { void *release_data; struct msi_controller *msi; unsigned int ignore_reset_delay:1; /* for entire hierarchy */ + unsigned int no_ext_tags:1; /* no Extended Tags */ /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, const struct resource *res, -- cgit v1.2.3 From e7942d0633c47c791ece6afa038be9cf977226de Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:18 +0200 Subject: tcp: remove prequeue support prequeue is a tcp receive optimization that moves part of rx processing from bh to process context. This only works if the socket being processed belongs to a process that is blocked in recv on that socket. In practice, this rarely happens anymore because nowadays servers tend to use an event-driven (epoll) model. Even normal client applications (web browsers) commonly use many tcp connections in parallel. This has measurable impact only in netperf (which uses plain recv and thus allows prequeue use) from host to a locally running vm (~4%); however, there were no changes when using netperf between two physical hosts with ixgbe interfaces. Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/linux/tcp.h | 9 ---- include/net/tcp.h | 11 ----- net/ipv4/tcp.c | 105 ----------------------------------------------- net/ipv4/tcp_input.c | 62 ---------------------------- net/ipv4/tcp_ipv4.c | 61 +-------------------------- net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_timer.c | 12 ------ net/ipv6/tcp_ipv6.c | 3 +- 8 files changed, 2 insertions(+), 262 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 542ca1ae02c4..32fb37cfb0d1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -192,15 +192,6 @@ struct tcp_sock { struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - /* Data for direct copy to user */ - struct { - struct sk_buff_head prequeue; - struct task_struct *task; - struct msghdr *msg; - int memory; - int len; - } ucopy; - u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 12d68335acd4..93f115cfc8f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1244,17 +1244,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __tcp_checksum_complete(skb); } -/* Prequeue for VJ style copy to user, combined with checksumming. */ - -static inline void tcp_prequeue_init(struct tcp_sock *tp) -{ - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - tp->ucopy.memory = 0; - skb_queue_head_init(&tp->ucopy.prequeue); -} - -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..62018ea6f45f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -400,7 +400,6 @@ void tcp_init_sock(struct sock *sk) tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); - tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; @@ -1525,20 +1524,6 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied) tcp_send_ack(sk); } -static void tcp_prequeue_process(struct sock *sk) -{ - struct sk_buff *skb; - struct tcp_sock *tp = tcp_sk(sk); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - /* Clear memory counter. */ - tp->ucopy.memory = 0; -} - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1671,7 +1656,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int err; int target; /* Read at least this many bytes */ long timeo; - struct task_struct *user_recv = NULL; struct sk_buff *skb, *last; u32 urg_hole = 0; @@ -1806,51 +1790,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_cleanup_rbuf(sk, copied); - if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { - /* Install new reader */ - if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { - user_recv = current; - tp->ucopy.task = user_recv; - tp->ucopy.msg = msg; - } - - tp->ucopy.len = len; - - WARN_ON(tp->copied_seq != tp->rcv_nxt && - !(flags & (MSG_PEEK | MSG_TRUNC))); - - /* Ugly... If prequeue is not empty, we have to - * process it before releasing socket, otherwise - * order will be broken at second iteration. - * More elegant solution is required!!! - * - * Look: we have the following (pseudo)queues: - * - * 1. packets in flight - * 2. backlog - * 3. prequeue - * 4. 
receive_queue - * - * Each queue can be processed only if the next ones - * are empty. At this point we have empty receive_queue. - * But prequeue _can_ be not empty after 2nd iteration, - * when we jumped to start of loop because backlog - * processing added something to receive_queue. - * We cannot release_sock(), because backlog contains - * packets arrived _after_ prequeued ones. - * - * Shortly, algorithm is clear --- to process all - * the queues in order. We could make it more directly, - * requeueing packets from backlog to prequeue, if - * is not empty. It is more elegant, but eats cycles, - * unfortunately. - */ - if (!skb_queue_empty(&tp->ucopy.prequeue)) - goto do_prequeue; - - /* __ Set realtime policy in scheduler __ */ - } - if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1859,31 +1798,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, sk_wait_data(sk, &timeo, last); } - if (user_recv) { - int chunk; - - /* __ Restore normal policy in scheduler __ */ - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); - len -= chunk; - copied += chunk; - } - - if (tp->rcv_nxt == tp->copied_seq && - !skb_queue_empty(&tp->ucopy.prequeue)) { -do_prequeue: - tcp_prequeue_process(sk); - - chunk = len - tp->ucopy.len; - if (chunk != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", @@ -1955,25 +1869,6 @@ skip_copy: break; } while (len > 0); - if (user_recv) { - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - int chunk; - - tp->ucopy.len = copied > 0 ? len : 0; - - tcp_prequeue_process(sk); - - if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); - len -= chunk; - copied += chunk; - } - } - - tp->ucopy.task = NULL; - tp->ucopy.len = 0; - } - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index adc3f3e9468c..770ce6cb3eca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4611,22 +4611,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto out_of_window; /* Ok. In sequence. In window. 
*/ - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && - sock_owned_by_user(sk) && !tp->urg_data) { - int chunk = min_t(unsigned int, skb->len, - tp->ucopy.len); - - __set_current_state(TASK_RUNNING); - - if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - eaten = (chunk == skb->len); - tcp_rcv_space_adjust(sk); - } - } - if (eaten <= 0) { queue_and_out: if (eaten < 0) { @@ -5186,26 +5170,6 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t } } -static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int err; - - if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); - else - err = skb_copy_and_csum_datagram_msg(skb, hlen, tp->ucopy.msg); - - if (!err) { - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - } - - return err; -} - /* Accept RST for rcv_nxt - 1 after a FIN. * When tcp connections are abruptly terminated from Mac OSX (via ^C), a * FIN is sent followed by a RST packet. The RST is sent with the same @@ -5446,32 +5410,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, int eaten = 0; bool fragstolen = false; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); - - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + - TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - __skb_pull(skb, tcp_header_len); - tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; - } - } if (!eaten) { if (tcp_checksum_complete(skb)) goto csum_error; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3a19ea28339f..a68eb4577d36 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1541,61 +1541,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) } } -/* Packet is added to VJ-style prequeue for processing in process - * context, if a reader task is waiting. Apparently, this exciting - * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) - * failed somewhere. Latency? Burstiness? Well, at least now we will - * see, why it failed. 8)8) --ANK - * - */ -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (sysctl_tcp_low_latency || !tp->ucopy.task) - return false; - - if (skb->len <= tcp_hdrlen(skb) && - skb_queue_len(&tp->ucopy.prequeue) == 0) - return false; - - /* Before escaping RCU protected region, we need to take care of skb - * dst. Prequeue is only enabled for established sockets. - * For such sockets, we might need the skb dst only to set sk->sk_rx_dst - * Instead of doing full sk_rx_dst validity here, let's perform - * an optimistic check. 
- */ - if (likely(sk->sk_rx_dst)) - skb_dst_drop(skb); - else - skb_dst_force_safe(skb); - - __skb_queue_tail(&tp->ucopy.prequeue, skb); - tp->ucopy.memory += skb->truesize; - if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || - tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - struct sk_buff *skb1; - - BUG_ON(sock_owned_by_user(sk)); - __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, - skb_queue_len(&tp->ucopy.prequeue)); - - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb1); - - tp->ucopy.memory = 0; - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); - } - return true; -} -EXPORT_SYMBOL(tcp_prequeue); - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; @@ -1770,8 +1715,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); + ret = tcp_v4_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1936,9 +1880,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif - /* Clean prequeue, it must be empty really */ - __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0ff83c1637d8..188a6f31356d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,7 +445,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; - tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeeef962a..f753f9d2fee3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -239,7 +239,6 @@ static int tcp_write_timeout(struct sock *sk) /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); sk_mem_reclaim_partial(sk); @@ -254,17 +253,6 @@ void tcp_delack_timer_handler(struct sock *sk) } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - if (!skb_queue_empty(&tp->ucopy.prequeue)) { - struct sk_buff *skb; - - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); - - while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb); - - tp->ucopy.memory = 0; - } - if (inet_csk_ack_scheduled(sk)) { if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2968a33cca7d..39ee8e7fc4bd 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1505,8 +1505,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); + ret = tcp_v6_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } -- cgit v1.2.3 From b6690b14386698ce2c19309abad3f17656bdfaea Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:20 +0200 Subject: tcp: remove low_latency sysctl Was only checked by the removed prequeue code. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.txt | 7 +------ include/net/tcp.h | 1 - net/ipv4/sysctl_net_ipv4.c | 3 +++ net/ipv4/tcp_ipv4.c | 2 -- 4 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f485d553e65c..84c9b8cee780 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -353,12 +353,7 @@ tcp_l3mdev_accept - BOOLEAN compiled with CONFIG_NET_L3_MASTER_DEV. tcp_low_latency - BOOLEAN - If set, the TCP stack makes decisions that prefer lower - latency as opposed to higher throughput. By default, this - option is not set meaning that higher throughput is preferred. - An example of an application where this default should be - changed would be a Beowulf compute cluster. - Default: 0 + This is a legacy option, it has no effect anymore. tcp_max_orphans - INTEGER Maximal number of TCP sockets not attached to any user file handle, diff --git a/include/net/tcp.h b/include/net/tcp.h index 93f115cfc8f8..8507c81fb0e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -256,7 +256,6 @@ extern int sysctl_tcp_rmem[3]; extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; -extern int sysctl_tcp_low_latency; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9bf809726066..0d3c038d7b04 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -45,6 +45,9 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +/* obsolete */ +static int sysctl_tcp_low_latency __read_mostly; + /* Update system visible IP port range */ static void set_local_port_range(struct net *net, int range[2]) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a68eb4577d36..9b51663cd5a4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,8 +85,6 @@ #include #include -int sysctl_tcp_low_latency __read_mostly; - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); -- cgit v1.2.3 From 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:21 +0200 Subject: tcp: remove header prediction Like prequeue, I am not sure this is overly useful nowadays. If we receive a train of packets, GRO will aggregate them if the headers are the same (HP predates GRO by several years) so we don't get a per-packet benefit, only a per-aggregated-packet one. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 6 -- include/net/tcp.h | 23 ------ net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 192 +++-------------------------------------------- net/ipv4/tcp_minisocks.c | 2 - net/ipv4/tcp_output.c | 2 - 6 files changed, 10 insertions(+), 219 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 32fb37cfb0d1..d7389ea36e10 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -147,12 +147,6 @@ struct tcp_sock { u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ -/* - * Header prediction flags - * 0x5?10 << 16 + snd_wnd in net byte order - */ - __be32 pred_flags; - /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8507c81fb0e9..8f11b82b5b5a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -631,29 +631,6 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } -static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) -{ - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); -} - -static inline void tcp_fast_path_on(struct tcp_sock *tp) -{ - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); -} - -static inline void tcp_fast_path_check(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); -} - /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 62018ea6f45f..e022874d509f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1848,10 +1848,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) tp->urg_data = 0; - tcp_fast_path_check(sk); - } if (used + offset < skb->len) continue; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87efde9f5a90..bfde9d7d210e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,7 +103,6 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ -#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ @@ -3367,12 +3366,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; - /* Note, it is the only place, where - * fast path is recovered for sending TCP. 
- */ - tp->pred_flags = 0; - tcp_fast_path_check(sk); - if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk); @@ -3597,19 +3590,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { - /* Window is constant, pure forward advance. - * No more checks are required. - * Note, we use the fact that SND.UNA>=SND.WL2. - */ - tcp_update_wl(tp, ack_seq); - tcp_snd_una_update(tp, ack); - flag |= FLAG_WIN_UPDATE; - - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); - } else { + { u32 ack_ev_flags = CA_ACK_SLOWPATH; if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -4398,8 +4379,6 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } - /* Disable header prediction. */ - tp->pred_flags = 0; inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4638,8 +4617,6 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk); - if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@ -4965,7 +4942,6 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ - tp->pred_flags = 0; return -1; } @@ -5137,9 +5113,6 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; - - /* Disable header prediction. */ - tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ @@ -5298,26 +5271,6 @@ discard: /* * TCP receive function for the ESTABLISHED state. - * - * It is split into a fast path and a slow path. The fast path is - * disabled when: - * - A zero window was announced from us - zero window probing - * is only handled properly in the slow path. - * - Out of order segments arrived. - * - Urgent data is expected. - * - There is no buffer space left - * - Unexpected TCP flags/window values/header lengths are received - * (detected by checking the TCP header against pred_flags) - * - Data is sent in both directions. Fast path only supports pure senders - * or pure receivers (this means either the sequence number or the ack - * value must stay constant) - * - Unexpected TCP option. - * - * When these conditions are not satisfied it drops into a standard - * receive procedure patterned after RFC793 to handle all cases. - * The first three cases are guaranteed by proper pred_flags setting, - * the rest is checked inline. Fast processing is turned on in - * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) @@ -5328,144 +5281,19 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); - /* - * Header prediction. - * The code loosely follows the one in the famous - * "30 instruction TCP receive" Van Jacobson mail. - * - * Van's trick is to deposit buffers into socket queue - * on a device interrupt, to call tcp_recv function - * on the receive process context and checksum and copy - * the buffer to user space. smart... - * - * Our current scheme is not silly either but we take the - * extra cost of the net_bh soft interrupt processing... - * We do checksum and copy also but from device to kernel. 
- */ tp->rx_opt.saw_tstamp = 0; - /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_prediction is to be made - * 'S' will always be tp->tcp_header_len >> 2 - * '?' will be 0 for the fast path, otherwise pred_flags is 0 to - * turn it off (when there are holes in the receive - * space for instance) - * PSH flag is ignored. - */ - - if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && - TCP_SKB_CB(skb)->seq == tp->rcv_nxt && - !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { - int tcp_header_len = tp->tcp_header_len; - - /* Timestamp header prediction: tcp_header_len - * is automatically equal to th->doff*4 due to pred_flags - * match. - */ - - /* Check timestamp */ - if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { - /* No? Slow path! */ - if (!tcp_parse_aligned_timestamp(tp, th)) - goto slow_path; - - /* If PAWS failed, check it more carefully in slow path */ - if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) - goto slow_path; - - /* DO NOT update ts_recent here, if checksum fails - * and timestamp was corrupted part, it will result - * in a hung connection since we will drop all - * future packets due to the PAWS test. - */ - } - - if (len <= tcp_header_len) { - /* Bulk data transfer: sender */ - if (len == tcp_header_len) { - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - /* We know that such packets are checksummed - * on entry. - */ - tcp_ack(sk, skb, 0); - __kfree_skb(skb); - tcp_data_snd_check(sk); - return; - } else { /* Header too small */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); - goto discard; - } - } else { - int eaten = 0; - bool fragstolen = false; - - if (tcp_checksum_complete(skb)) - goto csum_error; - - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - - /* Predicted packet is in window by definition. - * seq == rcv_nxt and rcv_wup <= rcv_nxt. - * Hence, check seq<=rcv_wup reduces to: - */ - if (tcp_header_len == - (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && - tp->rcv_nxt == tp->rcv_wup) - tcp_store_ts_recent(tp); - - tcp_rcv_rtt_measure_ts(sk, skb); - - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); - - /* Bulk data transfer: receiver */ - eaten = tcp_queue_rcv(sk, skb, tcp_header_len, - &fragstolen); - - tcp_event_data_recv(sk, skb); - - if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { - /* Well, only one small jumplet in fast path... */ - tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk); - if (!inet_csk_ack_scheduled(sk)) - goto no_ack; - } - - __tcp_ack_snd_check(sk, 0); -no_ack: - if (eaten) - kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); - return; - } - } - -slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) goto discard; - /* - * Standard slow path. 
- */ - if (!tcp_validate_incoming(sk, skb, th, 1)) return; -step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) + if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5519,11 +5347,10 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); - if (!tp->rx_opt.snd_wscale) - __tcp_fast_path_on(tp, tp->snd_wnd); - else - tp->pred_flags = 0; - + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5652,7 +5479,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, FLAG_SLOWPATH); + tcp_ack(sk, skb, 0); /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -5888,8 +5715,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT | + + acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK) > 0; if (!acceptable) { @@ -5957,7 +5784,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); - tcp_fast_path_on(tp); break; case TCP_FIN_WAIT1: { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..1537b87c657f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -436,8 +436,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_sock *newtp = tcp_sk(newsk); /* Now setup tcp_sock */ - newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->segs_in = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 886d874775df..8380464aead1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -295,9 +295,7 @@ static u16 tcp_select_window(struct sock *sk) /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; - /* If we advertise zero window, disable fast path. */ if (new_win == 0) { - tp->pred_flags = 0; if (old_win) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV); -- cgit v1.2.3 From 573aeb0492be3d0e5be9796a0c91abde794c1e36 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:22 +0200 Subject: tcp: remove CA_ACK_SLOWPATH re-indent tcp_ack, and remove CA_ACK_SLOWPATH; it is always set now. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/net/tcp.h | 5 ++--- net/ipv4/tcp_input.c | 35 ++++++++++++++++------------------- net/ipv4/tcp_westwood.c | 31 ++++--------------------------- 3 files changed, 22 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8f11b82b5b5a..3ecb62811004 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -880,9 +880,8 @@ enum tcp_ca_event { /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ - CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ - CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ + CA_ACK_WIN_UPDATE = (1 << 0), /* ACK updated window */ + CA_ACK_ECE = (1 << 1), /* ECE bit is set on ack */ }; /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bfde9d7d210e..af0a98d54b62 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3547,6 +3547,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ack_ev_flags = 0; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3590,30 +3591,26 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - - if (ack_seq != TCP_SKB_CB(skb)->end_seq) - flag |= FLAG_DATA; - else - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); - if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_state); + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { - flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + flag |= FLAG_ECE; + ack_ev_flags = CA_ACK_ECE; + } - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, ack_ev_flags); - } + tcp_in_ack_event(sk, ack_ev_flags); /* We passed data and got it acked, remove any soft error * log. Something worked... diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index bec9cafbe3f9..e5de84310949 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -153,24 +153,6 @@ static inline void update_rtt_min(struct westwood *w) w->rtt_min = min(w->rtt, w->rtt_min); } -/* - * @westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successful. In such case in fact update is - * straight forward and doesn't need any particular care. 
- */ -static inline void westwood_fast_bw(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct westwood *w = inet_csk_ca(sk); - - westwood_update_window(sk); - - w->bk += tp->snd_una - w->snd_una; - w->snd_una = tp->snd_una; - update_rtt_min(w); -} - /* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of * @@ -223,17 +205,12 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { - if (ack_flags & CA_ACK_SLOWPATH) { - struct westwood *w = inet_csk_ca(sk); - - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); + struct westwood *w = inet_csk_ca(sk); - update_rtt_min(w); - return; - } + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); - westwood_fast_bw(sk); + update_rtt_min(w); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) -- cgit v1.2.3 From 3282e65558b3651e230ee985c174c35cb2fedaf1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:23 +0200 Subject: tcp: remove unused mib counters These counters were used by TCP prequeue and header prediction; TCPFORWARDRETRANS use was removed in January. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 9 --------- net/ipv4/proc.c | 9 --------- 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index d85693295798..b3f346fb9fe3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -184,14 +184,7 @@ enum LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */ LINUX_MIB_LISTENDROPS, /* ListenDrops */ - LINUX_MIB_TCPPREQUEUED, /* TCPPrequeued */ - LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, /* TCPDirectCopyFromBacklog */ - LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, /* TCPDirectCopyFromPrequeue */ - LINUX_MIB_TCPPREQUEUEDROPPED, /* TCPPrequeueDropped */ - LINUX_MIB_TCPHPHITS, /* TCPHPHits */ - LINUX_MIB_TCPHPHITSTOUSER, /* TCPHPHitsToUser */ LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */ - LINUX_MIB_TCPHPACKS, /* TCPHPAcks */ LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ @@ -208,14 +201,12 @@ enum LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */ LINUX_MIB_TCPLOSSFAILURES, /* TCPLossFailures */ LINUX_MIB_TCPFASTRETRANS, /* TCPFastRetrans */ - LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */ LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ - LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ LINUX_MIB_TCPRCVCOLLAPSED, /* TCPRcvCollapsed */ LINUX_MIB_TCPDSACKOLDSENT, /* TCPDSACKOldSent */ LINUX_MIB_TCPDSACKOFOSENT, /* TCPDSACKOfoSent */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 43eb6567b3a0..b6d3fe03feb3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), - SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), - SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), -
SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), - SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), - SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), - SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), - SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), @@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), - SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), - SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), -- cgit v1.2.3 From f248aff86d1fd6e60b656c2af278f1723c3b84c2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 31 Jul 2017 12:04:25 -0700 Subject: net: phy: mdio-bcm-unimac: Allow specifying platform data In preparation for having the bcmgenet driver migrate over to the mdio-bcm-unimac driver, add a platform data structure which allows passing integration-specific details such as the bus name, a wait function to complete MDIO operations, and a PHY mask. We also define what the platform device name contract is by defining UNIMAC_MDIO_DRV_NAME and moving it to the platform_data header. Signed-off-by: Florian Fainelli Signed-off-by: David S.
Miller --- MAINTAINERS | 1 + drivers/net/phy/mdio-bcm-unimac.c | 28 +++++++++++++++++++++------ include/linux/platform_data/mdio-bcm-unimac.h | 13 +++++++++++++ 3 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 include/linux/platform_data/mdio-bcm-unimac.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 297e610c9163..7d72fdbed6e6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5140,6 +5140,7 @@ L: netdev@vger.kernel.org S: Maintained F: include/linux/phy.h F: include/linux/phy_fixed.h +F: include/linux/platform_data/mdio-bcm-unimac.h F: drivers/net/phy/ F: Documentation/networking/phy.txt F: drivers/of/of_mdio.c diff --git a/drivers/net/phy/mdio-bcm-unimac.c b/drivers/net/phy/mdio-bcm-unimac.c index 4e52692f9eea..89425ca48412 100644 --- a/drivers/net/phy/mdio-bcm-unimac.c +++ b/drivers/net/phy/mdio-bcm-unimac.c @@ -21,6 +21,8 @@ #include #include +#include + #define MDIO_CMD 0x00 #define MDIO_START_BUSY (1 << 29) #define MDIO_READ_FAIL (1 << 28) @@ -41,6 +43,8 @@ struct unimac_mdio_priv { struct mii_bus *mii_bus; void __iomem *base; + int (*wait_func) (void *wait_func_data); + void *wait_func_data; }; static inline void unimac_mdio_start(struct unimac_mdio_priv *priv) @@ -57,8 +61,9 @@ static inline unsigned int unimac_mdio_busy(struct unimac_mdio_priv *priv) return __raw_readl(priv->base + MDIO_CMD) & MDIO_START_BUSY; } -static int unimac_mdio_poll(struct unimac_mdio_priv *priv) +static int unimac_mdio_poll(void *wait_func_data) { + struct unimac_mdio_priv *priv = wait_func_data; unsigned int timeout = 1000; do { @@ -77,6 +82,7 @@ static int unimac_mdio_poll(struct unimac_mdio_priv *priv) static int unimac_mdio_read(struct mii_bus *bus, int phy_id, int reg) { struct unimac_mdio_priv *priv = bus->priv; + int ret; u32 cmd; /* Prepare the read operation */ @@ -86,7 +92,7 @@ static int unimac_mdio_read(struct mii_bus *bus, int phy_id, int reg) /* Start MDIO transaction */ unimac_mdio_start(priv); - ret = unimac_mdio_poll(priv); + ret = priv->wait_func(priv->wait_func_data); if (ret) return ret; @@ -116,7 +122,7 @@ static int unimac_mdio_write(struct mii_bus *bus, int phy_id, unimac_mdio_start(priv); - return unimac_mdio_poll(priv); + return priv->wait_func(priv->wait_func_data); } /* Workaround for integrated BCM7xxx Gigabit PHYs which have a problem with @@ -165,6 +171,7 @@ static int unimac_mdio_reset(struct mii_bus *bus) static int unimac_mdio_probe(struct platform_device *pdev) { + struct unimac_mdio_pdata *pdata = pdev->dev.platform_data; struct unimac_mdio_priv *priv; struct device_node *np; struct mii_bus *bus; @@ -194,7 +201,16 @@ static int unimac_mdio_probe(struct platform_device *pdev) bus = priv->mii_bus; bus->priv = priv; - bus->name = "unimac MII bus"; + if (pdata) { + bus->name = pdata->bus_name; + priv->wait_func = pdata->wait_func; + priv->wait_func_data = pdata->wait_func_data; + bus->phy_mask = ~pdata->phy_mask; + } else { + bus->name = "unimac MII bus"; + priv->wait_func_data = priv; + priv->wait_func = unimac_mdio_poll; + } bus->parent = &pdev->dev; bus->read = unimac_mdio_read; bus->write = unimac_mdio_write; @@ -241,7 +257,7 @@ MODULE_DEVICE_TABLE(of, unimac_mdio_ids); static struct platform_driver unimac_mdio_driver = { .driver = { - .name = "unimac-mdio", + .name = UNIMAC_MDIO_DRV_NAME, .of_match_table = unimac_mdio_ids, }, .probe = unimac_mdio_probe, @@ -252,4 +268,4 @@ module_platform_driver(unimac_mdio_driver); MODULE_AUTHOR("Broadcom Corporation"); MODULE_DESCRIPTION("Broadcom UniMAC MDIO bus controller"); 
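For context, the sketch below shows how an integrating MAC driver (bcmgenet, per the motivation above) might supply this platform data. It is a hypothetical illustration, not code from this patch: the device instance, the mask value, and the wait callback are assumptions; only the field names and the UNIMAC_MDIO_DRV_NAME naming contract come from the patch itself (the structure appears in the new header just below).

#include <linux/platform_device.h>
#include <linux/platform_data/mdio-bcm-unimac.h>

/* Assumed completion hook: the MAC driver polls its own MDIO-done
 * status here; returning 0 means the transaction finished. */
static int genet_mdio_wait(void *wait_func_data)
{
	/* ... MAC-specific polling, elided ... */
	return 0;
}

static struct unimac_mdio_pdata genet_mdio_pdata = {
	.phy_mask	= 0x1,	/* bit set = PHY address to probe; the
				 * driver stores ~phy_mask in the mii_bus */
	.wait_func	= genet_mdio_wait,
	.wait_func_data	= NULL,	/* would normally point at MAC priv data */
	.bus_name	= "genet MII bus",
};

static struct platform_device genet_mdio_dev = {
	.name	= UNIMAC_MDIO_DRV_NAME,	/* the name contract defined above */
	.id	= -1,
	.dev	= { .platform_data = &genet_mdio_pdata, },
};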
MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:unimac-mdio"); +MODULE_ALIAS("platform:" UNIMAC_MDIO_DRV_NAME); diff --git a/include/linux/platform_data/mdio-bcm-unimac.h b/include/linux/platform_data/mdio-bcm-unimac.h new file mode 100644 index 000000000000..8a5f9f0b2c52 --- /dev/null +++ b/include/linux/platform_data/mdio-bcm-unimac.h @@ -0,0 +1,13 @@ +#ifndef __MDIO_BCM_UNIMAC_PDATA_H +#define __MDIO_BCM_UNIMAC_PDATA_H + +struct unimac_mdio_pdata { + u32 phy_mask; + int (*wait_func)(void *data); + void *wait_func_data; + const char *bus_name; +}; + +#define UNIMAC_MDIO_DRV_NAME "unimac-mdio" + +#endif /* __MDIO_BCM_UNIMAC_PDATA_H */ -- cgit v1.2.3 From a823e4589e68996436d16db4ede9a43b646332f9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jul 2017 07:24:43 -0400 Subject: mm: add file_fdatawait_range and file_write_and_wait Necessary now for gfs2_fsync and sync_file_range, but there will eventually be other callers. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton --- include/linux/fs.h | 11 ++++++++++- mm/filemap.c | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 21e7df1ad613..af592ca3d509 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2544,6 +2544,8 @@ extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); @@ -2552,12 +2554,19 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping, extern int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); extern int filemap_check_errors(struct address_space *mapping); - extern void __filemap_set_wb_err(struct address_space *mapping, int err); + +extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, + loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +static inline int file_write_and_wait(struct file *file) +{ + return file_write_and_wait_range(file, 0, LLONG_MAX); +} + /** * filemap_set_wb_err - set a writeback error on an address_space * @mapping: mapping in which to set writeback error diff --git a/mm/filemap.c b/mm/filemap.c index 72e46e6f0d9a..394bb5e96f87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -475,6 +475,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, } EXPORT_SYMBOL(filemap_fdatawait_range); +/** + * file_fdatawait_range - wait for writeback to complete + * @file: file pointing to address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the address space that file + * refers to, in the given range and wait for all of them. Check error + * status of the address space vs. the file->f_wb_err cursor and return it. + * + * Since the error status of the file is advanced by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. 
+ */ +int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) +{ + struct address_space *mapping = file->f_mapping; + + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return file_check_and_advance_wb_err(file); +} +EXPORT_SYMBOL(file_fdatawait_range); + /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for -- cgit v1.2.3 From f247037120ecd3dcbbc196b51ded8b57edf4904f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Jul 2017 00:19:05 +0800 Subject: f2fs: make max inline size changeable

This patch makes the macros below, which calculate the max inline data size and the inline dentry field sizes, account for a reserved, size-changeable space:
- MAX_INLINE_DATA
- NR_INLINE_DENTRY
- INLINE_DENTRY_BITMAP_SIZE
- INLINE_RESERVED_SIZE

Then, when the inline_{data,dentry} options are enabled, it allows us to flexibly reserve inline space of different sizes for newly introduced inode attributes.

Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 +-- fs/f2fs/f2fs.h | 48 ++++++++++++++++++++----- fs/f2fs/inline.c | 95 +++++++++++++++++++++++++------------------------ fs/f2fs/inode.c | 4 +-- fs/f2fs/super.c | 3 ++ include/linux/f2fs_fs.h | 23 +----------- 6 files changed, 96 insertions(+), 81 deletions(-) (limited to 'include') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87c1f4150c64..ca978c32ae00 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -813,7 +813,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -1857,7 +1857,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f2937986d73c..b8e63739d1c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -357,6 +357,25 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 + +static inline int get_inline_reserved_size(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * (DEF_ADDRS_PER_INODE -\ + get_inline_reserved_size(inode) -\ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ @@ -382,14 +401,19 @@ static inline void make_dentry_ptr_block(struct inode *inode, } static inline void make_dentry_ptr_inline(struct inode *inode, - struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) + struct f2fs_dentry_ptr *d, void *t) { + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + d->inode = inode; - d->max = NR_INLINE_DENTRY; - d->nr_bitmap =
INLINE_DENTRY_BITMAP_SIZE; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -543,6 +567,8 @@ struct f2fs_inode_info { struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + + int i_inline_reserved; /* reserved size in inline data */ }; static inline void get_extent_info(struct extent_info *ext, @@ -2075,11 +2101,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + int reserved_size = get_inline_reserved_size(inode); - return (void *)&(ri->i_addr[1]); + return (void *)&(ri->i_addr[reserved_size]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2170,6 +2197,11 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline int get_inline_reserved_size(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_reserved; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2082816a504a..f38be791fdf9 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -22,7 +22,7 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) @@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) + if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); if (from == 0) @@ -216,8 +217,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, 
MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); @@ -255,9 +256,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,11 +286,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -300,9 +301,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -316,19 +317,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -337,7 +338,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. 
*/ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; @@ -357,12 +358,12 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); - make_dentry_ptr_inline(NULL, &src, inline_dentry); - make_dentry_ptr_block(NULL, &dst, dentry_blk); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); /* copy data from inline dentry block to new dentry block */ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); @@ -395,14 +396,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -444,19 +444,19 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -473,9 +473,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -484,7 +484,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -500,7 +500,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *inline_dentry = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,8 +510,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { @@ -557,8 +557,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; 
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -566,8 +566,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) @@ -588,15 +588,15 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *inline_dentry; + void *inline_dentry; struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - inline_dentry = inline_data_addr(ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); @@ -612,9 +612,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -626,7 +626,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -657,7 +657,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -666,7 +666,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6cd312a17c69..32ec6b23fe01 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -87,9 +87,9 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 32e4c025e97e..2d236384f938 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -446,6 +446,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) #endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + + fi->i_inline_reserved = DEF_INLINE_RESERVED_SIZE; + return &fi->vfs_inode; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index b6feed6547ce..060b1c5b0836 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -206,9 +206,6 
@@ struct f2fs_extent { #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) - struct f2fs_inode { __le16 i_mode; /* file mode */ __u8 i_advise; /* file hints */ @@ -465,7 +462,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -501,24 +498,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, -- cgit v1.2.3 From 7a2af766af15887754f7f7a0869b4603b390876a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 19 Jul 2017 00:19:06 +0800 Subject: f2fs: enhance on-disk inode structure scalability

This patch adds a new flag, F2FS_EXTRA_ATTR, stored in inode.i_inline to indicate that the on-disk structure of the current inode has been extended. In order to extend it, we changed the inode structure a bit:

Original one:

struct f2fs_inode {
	...
	struct f2fs_extent i_ext;
	__le32 i_addr[DEF_ADDRS_PER_INODE];
	__le32 i_nid[DEF_NIDS_PER_INODE];
}

Extended one:

struct f2fs_inode {
	...
	struct f2fs_extent i_ext;
	union {
		struct {
			__le16 i_extra_isize;
			__le16 i_padding;
			__le32 i_extra_end[0];
		};
		__le32 i_addr[DEF_ADDRS_PER_INODE];
	};
	__le32 i_nid[DEF_NIDS_PER_INODE];
}

Once F2FS_EXTRA_ATTR is set, we steal four bytes at the head of the i_addr field to store i_extra_isize and i_padding. With i_extra_isize we can calculate the actual size of the reserved space in i_addr; the attribute fields available in the total extra attribute area of the current inode can be pictured as below:

+--------------------+
| .i_mode            |
| ...                |
| .i_ext             |
+--------------------+
| .i_extra_isize     |-----+
| .i_padding         |     |
| .i_prjid           |     |
| .i_atime_extra     |     |
| .i_ctime_extra     |     |
| .i_mtime_extra     |<----+
| .i_inode_cs        |<----- store blkaddr/inline from here
| .i_xattr_cs        |
| ...                |
+--------------------+
|                    |
|    block address   |
|                    |
+--------------------+
| .i_nid             |
+--------------------+
| node_footer        |
| (nid, ino, offset) |
+--------------------+

Hence, this patch enhances the scalability of the f2fs inode for storing more newly added attributes.
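To make the arithmetic behind this layout concrete, here is a short worked sketch; the numbers assume an extra area holding only the i_extra_isize/i_padding header shown above, and rely on constants that appear elsewhere in this series:

/* i_addr[] has DEF_ADDRS_PER_INODE = 923 __le32 slots.
 *   F2FS_TOTAL_EXTRA_ATTR_SIZE = sizeof(i_extra_isize) + sizeof(i_padding)
 *                              = 4 bytes (what f2fs_new_inode() stores)
 *   get_extra_isize()          = 4 / sizeof(__le32) = 1 slot stolen
 *   CUR_ADDRS_PER_INODE        = 923 - 1 = 922 block pointers remain
 * so data block 0 of an extended inode lives at i_addr[1], and inline
 * data starts at i_addr[1 + DEF_INLINE_RESERVED_SIZE] = i_addr[2]. */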
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 +++++++---- fs/f2fs/f2fs.h | 72 +++++++++++++++++++++++++++++++++++++------------ fs/f2fs/file.c | 23 ++++++++++------ fs/f2fs/gc.c | 2 +- fs/f2fs/inode.c | 32 ++++++++++++++-------- fs/f2fs/namei.c | 5 ++++ fs/f2fs/node.c | 7 +++-- fs/f2fs/recovery.c | 7 ++--- fs/f2fs/super.c | 11 +++++--- include/linux/f2fs_fs.h | 13 +++++++-- 10 files changed, 135 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ca978c32ae00..aefc2a5745d3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -460,10 +460,14 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -507,8 +511,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -755,7 +759,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; @@ -902,7 +907,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b8e63739d1c1..d0682fd4c690 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -110,9 +110,10 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 -#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -359,10 +360,10 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 - -static inline int get_inline_reserved_size(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * (DEF_ADDRS_PER_INODE -\ - get_inline_reserved_size(inode) -\ +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ F2FS_INLINE_XATTR_ADDRS)) /* for inline dir */ @@ -568,7 +569,7 @@ struct f2fs_inode_info { struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; - int i_inline_reserved; /* reserved size in inline data */ + int i_extra_isize; /* size of extra space located in i_addr */ }; static inline void get_extent_info(struct extent_info *ext, @@ 
-1792,20 +1793,38 @@ static inline bool IS_INODE(struct page *page) return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1896,6 +1915,7 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2015,6 +2035,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2031,6 +2053,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -2041,8 +2070,8 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) @@ -2104,9 +2133,9 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - int reserved_size = get_inline_reserved_size(inode); + int extra_size = get_extra_isize(inode); - return (void *)&(ri->i_addr[reserved_size]); + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2197,15 +2226,19 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline int get_inline_reserved_size(struct inode *inode) +static inline int get_extra_isize(struct inode *inode) { - return F2FS_I(inode)->i_inline_reserved; + return F2FS_I(inode)->i_extra_isize / 
sizeof(__le32); } #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + /* * file.c */ @@ -2798,6 +2831,11 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0a5faf21dd4d..d2ca4b71cb53 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -382,7 +382,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -467,9 +468,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -927,7 +932,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -1003,8 +1009,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1173,7 +1179,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1184,8 +1191,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. 
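 */

The addressing rule applied throughout the hunks above can be condensed into one helper. This is a paraphrase of the patch's offset_in_addr()/datablock_addr() pair for the inode case, not an addition to it, and it assumes the f2fs on-disk headers for block_t and struct f2fs_inode:

/* Where block address 'ofs' of an inode lives once F2FS_EXTRA_ATTR is
 * set: extra attributes come first, block addresses follow them. */
static inline block_t sketch_inode_blkaddr(const struct f2fs_inode *ri,
					   unsigned int ofs)
{
	unsigned int base = (ri->i_inline & F2FS_EXTRA_ATTR) ?
			le16_to_cpu(ri->i_extra_isize) / sizeof(__le32) : 0;

	return le32_to_cpu(ri->i_addr[base + ofs]);
}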
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 41c649c9c55c..f57cadae1a30 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -587,7 +587,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 32ec6b23fe01..0a6699a23dfb 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode) static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,16 +73,18 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } @@ -153,6 +157,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? 
+ le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -292,6 +299,9 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 760d85223c81..26b4d2b54812 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -72,6 +72,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5396611df0c3..547b84fc10f6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -655,7 +655,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -2263,7 +2264,9 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) + dst->i_extra_isize = src->i_extra_isize; new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 907d6b7dde6a..2d9b8182691f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -361,7 +361,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -414,8 +415,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d236384f938..b30097190605 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -447,8 +447,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - fi->i_inline_reserved = DEF_INLINE_RESERVED_SIZE; - return &fi->vfs_inode; } @@ -1306,9 +1304,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as 
zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 060b1c5b0836..0c79ddd40b70 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -186,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -205,6 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -232,8 +235,14 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; -- cgit v1.2.3 From 5c57132eaf5265937e46340bfbfb97ffb078c423 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 26 Jul 2017 00:01:41 +0800 Subject: f2fs: support project quota This patch adds support for plain project quota. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 1 + fs/f2fs/f2fs.h | 29 +++++++++++++++++++++++++++++ fs/f2fs/file.c | 13 ------------- fs/f2fs/inode.c | 24 +++++++++++++++++++++++- fs/f2fs/namei.c | 31 +++++++++++++++++++++++++++++++ fs/f2fs/node.c | 7 ++++++- fs/f2fs/super.c | 17 ++++++++++++++++- include/linux/f2fs_fs.h | 3 +++ 8 files changed, 109 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 273ccb26885e..b8f495a8b67d 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -164,6 +164,7 @@ io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting.
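A minimal usage sketch for the new mount option follows; the device and mount point are placeholders, and prjquota can be combined with the usrquota/grpquota options listed above:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* enable all three plain quota types at mount time */
	if (mount("/dev/sdb1", "/mnt/f2fs", "f2fs", 0,
		  "usrquota,grpquota,prjquota"))
		perror("mount");
	return 0;
}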
================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0682fd4c690..d486539086b6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -91,6 +91,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -114,6 +115,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -570,6 +572,7 @@ struct f2fs_inode_info { struct rw_semaphore i_mmap_sem; int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1887,6 +1890,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1916,6 +1933,7 @@ enum { FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2239,6 +2257,12 @@ static inline int get_extra_isize(struct inode *inode) (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + /* * file.c */ @@ -2836,6 +2860,11 @@ static inline int f2fs_sb_has_extra_attr(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); } +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d2ca4b71cb53..a0413c951458 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1518,19 +1518,6 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) -{ - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; -} - static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); diff --git 
a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0a6699a23dfb..f15e663a1a15 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -114,6 +114,7 @@ static int do_read_inode(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -173,6 +174,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -299,9 +310,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; - if (f2fs_has_extra_attr(inode)) + if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 98e13f123d87..e241c1b2c636 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto fail_drop; @@ -90,6 +97,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -209,6 +222,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -733,6 +751,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; @@ -921,6 +944,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + 
F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 547b84fc10f6..11e2d49203b3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2265,8 +2265,13 @@ retry: dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); - if (dst->i_inline & F2FS_EXTRA_ATTR) + if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b30097190605..1241739f4d54 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -109,6 +109,7 @@ enum { Opt_nolazytime, Opt_usrquota, Opt_grpquota, + Opt_prjquota, Opt_err, }; @@ -146,6 +147,7 @@ static match_table_t f2fs_tokens = { {Opt_nolazytime, "nolazytime"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, {Opt_err, NULL}, }; @@ -392,9 +394,13 @@ static int parse_options(struct super_block *sb, char *options) case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; #else case Opt_usrquota: case Opt_grpquota: + case Opt_prjquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -814,6 +820,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); #endif return 0; @@ -1172,6 +1180,12 @@ static void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = dquot_commit, @@ -1181,6 +1195,7 @@ static const struct dquot_operations f2fs_quota_operations = { .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, .get_next_id = dquot_get_next_id, }; @@ -1964,7 +1979,7 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 0c79ddd40b70..50f176683676 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -239,6 +239,7 @@ struct f2fs_inode { struct { __le16 i_extra_isize; /* extra inode attribute size */ __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ __le32 i_extra_end[0]; /* for attribute size calculation */ }; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ @@ -522,4 +523,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ -- cgit v1.2.3 From bb7c19f96012720b895111300b9d9f3f858c3a69 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 28 Jul 2017 10:28:21 -0700 Subject: tcp: add related fields into SCM_TIMESTAMPING_OPT_STATS Add the following stats into SCM_TIMESTAMPING_OPT_STATS control msg: TCP_NLA_PACING_RATE TCP_NLA_DELIVERY_RATE TCP_NLA_SND_CWND TCP_NLA_REORDERING 
TCP_NLA_MIN_RTT TCP_NLA_RECUR_RETRANS TCP_NLA_DELIVERY_RATE_APP_LMT Signed-off-by: Wei Wang Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 8 ++++++++ net/ipv4/tcp.c | 20 +++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index a5507c977497..030e594bab45 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -231,6 +231,14 @@ enum { TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent including retransmission */ TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */ + TCP_NLA_PACING_RATE, /* Pacing rate in bytes per second */ + TCP_NLA_DELIVERY_RATE, /* Delivery rate in bytes per second */ + TCP_NLA_SND_CWND, /* Sending congestion window */ + TCP_NLA_REORDERING, /* Reordering metric */ + TCP_NLA_MIN_RTT, /* minimum RTT */ + TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ + TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ + }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index acee7acdcba6..5326b50a3450 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2840,8 +2840,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + u64 rate64; + u32 rate; - stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + + 3 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -2856,6 +2860,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); + + rate64 = tcp_compute_delivery_rate(tp); + nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); + + nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); + nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); + + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); return stats; } -- cgit v1.2.3 From 6f68f0ac72087c29a94d1736324902e3ec07a4ec Mon Sep 17 00:00:00 2001 From: Binoy Jayan Date: Wed, 14 Jun 2017 12:59:51 +0530 Subject: HID: Remove the semaphore driver_lock The semaphore 'driver_lock' is used as a simple mutex and is also unnecessary, as suggested by Arnd. Hence remove it; the concurrency between probe and remove is already handled in the driver core.
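A minimal sketch of the resulting probe path (illustrative only: the function name is hypothetical, id matching and the actual probe call are elided; see the hid-core.c diff below for the real code):

#include <linux/hid.h>

/*
 * Sketch of hid_device_probe() after this patch: only
 * driver_input_lock is taken. Serializing probe against remove is
 * left to the driver core, which holds the device_lock() of the
 * underlying struct device, so the old outer driver_lock is gone.
 */
static int hid_device_probe_sketch(struct hid_device *hdev)
{
	int ret = 0;

	if (down_interruptible(&hdev->driver_input_lock))
		return -EINTR;

	hdev->io_started = false;
	/* ... match the id table and call hdrv->probe() here ... */

	if (!hdev->io_started)
		up(&hdev->driver_input_lock);
	return ret;
}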
Suggested-by: Arnd Bergmann Signed-off-by: Binoy Jayan Acked-by: Benjamin Tissoires Reviewed-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 15 ++++----------- include/linux/hid.h | 3 +-- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 9017dcc14502..96e5457c2cc3 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2487,11 +2487,9 @@ static int hid_device_probe(struct device *dev) const struct hid_device_id *id; int ret = 0; - if (down_interruptible(&hdev->driver_lock)) - return -EINTR; if (down_interruptible(&hdev->driver_input_lock)) { ret = -EINTR; - goto unlock_driver_lock; + goto end; } hdev->io_started = false; @@ -2518,8 +2516,7 @@ static int hid_device_probe(struct device *dev) unlock: if (!hdev->io_started) up(&hdev->driver_input_lock); -unlock_driver_lock: - up(&hdev->driver_lock); +end: return ret; } @@ -2529,11 +2526,9 @@ static int hid_device_remove(struct device *dev) struct hid_driver *hdrv; int ret = 0; - if (down_interruptible(&hdev->driver_lock)) - return -EINTR; if (down_interruptible(&hdev->driver_input_lock)) { ret = -EINTR; - goto unlock_driver_lock; + goto end; } hdev->io_started = false; @@ -2549,8 +2544,7 @@ static int hid_device_remove(struct device *dev) if (!hdev->io_started) up(&hdev->driver_input_lock); -unlock_driver_lock: - up(&hdev->driver_lock); +end: return ret; } @@ -3008,7 +3002,6 @@ struct hid_device *hid_allocate_device(void) init_waitqueue_head(&hdev->debug_wait); INIT_LIST_HEAD(&hdev->debug_list); spin_lock_init(&hdev->debug_list_lock); - sema_init(&hdev->driver_lock, 1); sema_init(&hdev->driver_input_lock, 1); mutex_init(&hdev->ll_open_lock); diff --git a/include/linux/hid.h b/include/linux/hid.h index 5006f9b5d837..142409fc1ff3 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -526,7 +526,6 @@ struct hid_device { /* device report descriptor */ struct hid_report_enum report_enum[HID_REPORT_TYPES]; struct work_struct led_work; /* delayed LED worker */ - struct semaphore driver_lock; /* protects the current driver, except during input */ struct semaphore driver_input_lock; /* protects the current driver */ struct device dev; /* device */ struct hid_driver *driver; @@ -551,7 +550,7 @@ struct hid_device { /* device report descriptor */ unsigned int status; /* see STAT flags above */ unsigned claimed; /* Claimed by hidinput, hiddev? */ unsigned quirks; /* Various quirks the device can pull on us */ - bool io_started; /* Protected by driver_lock. If IO has started */ + bool io_started; /* If IO has started */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ -- cgit v1.2.3 From 99d8845e756cb91e2865f430401d084cd6a8ccc9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Jul 2017 14:40:49 +0200 Subject: ACPI / PM: Split acpi_device_wakeup() To prepare for a subsequent change and make the code somewhat easier to follow, do the following in the ACPI device wakeup handling code: * Replace wakeup.flags.enabled under struct acpi_device with wakeup.enable_count as that will be necessary going forward. For now, wakeup.enable_count is not allowed to grow beyond 1, so the current behavior is retained. * Split acpi_device_wakeup() into acpi_device_wakeup_enable() and acpi_device_wakeup_disable() and modify the callers of it accordingly. 
* Introduce a new acpi_wakeup_lock mutex to protect the wakeup enabling/disabling code from races in case it is executed more than once in parallel for the same device (which may happen for bridges theoretically). Signed-off-by: Rafael J. Wysocki Reviewed-by: Mika Westerberg --- drivers/acpi/device_pm.c | 121 +++++++++++++++++++++++++++++++---------------- include/acpi/acpi_bus.h | 2 +- 2 files changed, 80 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 2ed6935d4483..4cd4bdab053d 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -682,47 +682,74 @@ static void acpi_pm_notify_work_func(struct acpi_device_wakeup_context *context) } } +static DEFINE_MUTEX(acpi_wakeup_lock); + /** - * acpi_device_wakeup - Enable/disable wakeup functionality for device. - * @adev: ACPI device to enable/disable wakeup functionality for. + * acpi_device_wakeup_enable - Enable wakeup functionality for device. + * @adev: ACPI device to enable wakeup functionality for. * @target_state: State the system is transitioning into. - * @enable: Whether to enable or disable the wakeup functionality. * - * Enable/disable the GPE associated with @adev so that it can generate - * wakeup signals for the device in response to external (remote) events and - * enable/disable device wakeup power. + * Enable the GPE associated with @adev so that it can generate wakeup signals + * for the device in response to external (remote) events and enable wakeup + * power for it. * * Callers must ensure that @adev is a valid ACPI device node before executing * this function. */ -static int acpi_device_wakeup(struct acpi_device *adev, u32 target_state, - bool enable) +static int acpi_device_wakeup_enable(struct acpi_device *adev, u32 target_state) { struct acpi_device_wakeup *wakeup = &adev->wakeup; + acpi_status status; + int error = 0; - if (enable) { - acpi_status res; - int error; + mutex_lock(&acpi_wakeup_lock); - if (adev->wakeup.flags.enabled) - return 0; + if (wakeup->enable_count > 0) + goto out; - error = acpi_enable_wakeup_device_power(adev, target_state); - if (error) - return error; + error = acpi_enable_wakeup_device_power(adev, target_state); + if (error) + goto out; - res = acpi_enable_gpe(wakeup->gpe_device, wakeup->gpe_number); - if (ACPI_FAILURE(res)) { - acpi_disable_wakeup_device_power(adev); - return -EIO; - } - adev->wakeup.flags.enabled = 1; - } else if (adev->wakeup.flags.enabled) { - acpi_disable_gpe(wakeup->gpe_device, wakeup->gpe_number); + status = acpi_enable_gpe(wakeup->gpe_device, wakeup->gpe_number); + if (ACPI_FAILURE(status)) { acpi_disable_wakeup_device_power(adev); - adev->wakeup.flags.enabled = 0; + error = -EIO; + goto out; } - return 0; + + wakeup->enable_count++; + +out: + mutex_unlock(&acpi_wakeup_lock); + return error; +} + +/** + * acpi_device_wakeup_disable - Disable wakeup functionality for device. + * @adev: ACPI device to disable wakeup functionality for. + * + * Disable the GPE associated with @adev and disable wakeup power for it. + * + * Callers must ensure that @adev is a valid ACPI device node before executing + * this function. 
+ */ +static void acpi_device_wakeup_disable(struct acpi_device *adev) +{ + struct acpi_device_wakeup *wakeup = &adev->wakeup; + + mutex_lock(&acpi_wakeup_lock); + + if (!wakeup->enable_count) + goto out; + + acpi_disable_gpe(wakeup->gpe_device, wakeup->gpe_number); + acpi_disable_wakeup_device_power(adev); + + wakeup->enable_count--; + +out: + mutex_unlock(&acpi_wakeup_lock); } /** @@ -744,9 +771,15 @@ int acpi_pm_set_device_wakeup(struct device *dev, bool enable) if (!acpi_device_can_wakeup(adev)) return -EINVAL; - error = acpi_device_wakeup(adev, acpi_target_system_state(), enable); + if (!enable) { + acpi_device_wakeup_disable(adev); + dev_dbg(dev, "Wakeup disabled by ACPI\n"); + return 0; + } + + error = acpi_device_wakeup_enable(adev, acpi_target_system_state()); if (!error) - dev_dbg(dev, "Wakeup %s by ACPI\n", enable ? "enabled" : "disabled"); + dev_dbg(dev, "Wakeup enabled by ACPI\n"); return error; } @@ -800,13 +833,15 @@ int acpi_dev_runtime_suspend(struct device *dev) remote_wakeup = dev_pm_qos_flags(dev, PM_QOS_FLAG_REMOTE_WAKEUP) > PM_QOS_FLAGS_NONE; - error = acpi_device_wakeup(adev, ACPI_STATE_S0, remote_wakeup); - if (remote_wakeup && error) - return -EAGAIN; + if (remote_wakeup) { + error = acpi_device_wakeup_enable(adev, ACPI_STATE_S0); + if (error) + return -EAGAIN; + } error = acpi_dev_pm_low_power(dev, adev, ACPI_STATE_S0); - if (error) - acpi_device_wakeup(adev, ACPI_STATE_S0, false); + if (error && remote_wakeup) + acpi_device_wakeup_disable(adev); return error; } @@ -829,7 +864,7 @@ int acpi_dev_runtime_resume(struct device *dev) return 0; error = acpi_dev_pm_full_power(adev); - acpi_device_wakeup(adev, ACPI_STATE_S0, false); + acpi_device_wakeup_disable(adev); return error; } EXPORT_SYMBOL_GPL(acpi_dev_runtime_resume); @@ -884,13 +919,15 @@ int acpi_dev_suspend_late(struct device *dev) target_state = acpi_target_system_state(); wakeup = device_may_wakeup(dev) && acpi_device_can_wakeup(adev); - error = acpi_device_wakeup(adev, target_state, wakeup); - if (wakeup && error) - return error; + if (wakeup) { + error = acpi_device_wakeup_enable(adev, target_state); + if (error) + return error; + } error = acpi_dev_pm_low_power(dev, adev, target_state); - if (error) - acpi_device_wakeup(adev, ACPI_STATE_UNKNOWN, false); + if (error && wakeup) + acpi_device_wakeup_disable(adev); return error; } @@ -913,7 +950,7 @@ int acpi_dev_resume_early(struct device *dev) return 0; error = acpi_dev_pm_full_power(adev); - acpi_device_wakeup(adev, ACPI_STATE_UNKNOWN, false); + acpi_device_wakeup_disable(adev); return error; } EXPORT_SYMBOL_GPL(acpi_dev_resume_early); @@ -1056,7 +1093,7 @@ static void acpi_dev_pm_detach(struct device *dev, bool power_off) */ dev_pm_qos_hide_latency_limit(dev); dev_pm_qos_hide_flags(dev); - acpi_device_wakeup(adev, ACPI_STATE_S0, false); + acpi_device_wakeup_disable(adev); acpi_dev_pm_low_power(dev, adev, ACPI_STATE_S0); } } @@ -1100,7 +1137,7 @@ int acpi_dev_pm_attach(struct device *dev, bool power_on) dev_pm_domain_set(dev, &acpi_general_pm_domain); if (power_on) { acpi_dev_pm_full_power(adev); - acpi_device_wakeup(adev, ACPI_STATE_S0, false); + acpi_device_wakeup_disable(adev); } dev->pm_domain->detach = acpi_dev_pm_detach; diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 68bc6be447fd..b7df95dbe7e9 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -316,7 +316,6 @@ struct acpi_device_perf { struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? 
*/ u8 notifier_present:1; /* Wake-up notify handler has been installed */ - u8 enabled:1; /* Enabled for wakeup */ }; struct acpi_device_wakeup_context { @@ -333,6 +332,7 @@ struct acpi_device_wakeup { struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; + int enable_count; }; struct acpi_device_physical_node { -- cgit v1.2.3 From 1ba51a7c1496fd8f6d5bdd58dafcb1894275b7f0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 1 Aug 2017 02:56:18 +0200 Subject: ACPI / PCI / PM: Rework acpi_pci_propagate_wakeup() The acpi_pci_propagate_wakeup() routine is there to handle cases in which PCI bridges (or PCIe ports) are expected to signal wakeup for devices below them, but currently it doesn't do that correctly. The problem is that acpi_pci_propagate_wakeup() uses acpi_pm_set_device_wakeup() for bridges and if that routine is called multiple times to disable wakeup for the same device, it will disable it on the first invocation and the next calls will have no effect (it works analogously when called to enable wakeup, but that is not a problem). Now, say acpi_pci_propagate_wakeup() has been called for two different devices under the same bridge and it has called acpi_pm_set_device_wakeup() for that bridge each time. The bridge is now enabled to generate wakeup signals. Next, suppose that one of the devices below it resumes and acpi_pci_propagate_wakeup() is called to disable wakeup for that device. It will then call acpi_pm_set_device_wakeup() for the bridge and that will effectively disable remote wakeup for all devices under it even though some of them may still be suspended and remote wakeup may be expected to work for them. To address this (arguably theoretical) issue, allow wakeup.enable_count under struct acpi_device to grow beyond 1 in certain situations. In particular, allow that to happen in acpi_pci_propagate_wakeup() when wakeup is enabled or disabled for PCI bridges, so that wakeup is actually disabled for the bridge when all devices under it resume and not when just one of them does that. Signed-off-by: Rafael J. Wysocki Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg Acked-by: Bjorn Helgaas --- drivers/acpi/device_pm.c | 72 +++++++++++++++++++++++++++++++++--------------- drivers/pci/pci-acpi.c | 4 +-- include/acpi/acpi_bus.h | 5 ++++ 3 files changed, 57 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 4cd4bdab053d..112fd6c55c2c 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -684,19 +684,8 @@ static void acpi_pm_notify_work_func(struct acpi_device_wakeup_context *context) static DEFINE_MUTEX(acpi_wakeup_lock); -/** - * acpi_device_wakeup_enable - Enable wakeup functionality for device. - * @adev: ACPI device to enable wakeup functionality for. - * @target_state: State the system is transitioning into. - * - * Enable the GPE associated with @adev so that it can generate wakeup signals - * for the device in response to external (remote) events and enable wakeup - * power for it. - * - * Callers must ensure that @adev is a valid ACPI device node before executing - * this function.
- */ -static int acpi_device_wakeup_enable(struct acpi_device *adev, u32 target_state) +static int __acpi_device_wakeup_enable(struct acpi_device *adev, + u32 target_state, int max_count) { struct acpi_device_wakeup *wakeup = &adev->wakeup; acpi_status status; @@ -704,9 +693,12 @@ static int acpi_device_wakeup_enable(struct acpi_device *adev, u32 target_state) mutex_lock(&acpi_wakeup_lock); - if (wakeup->enable_count > 0) + if (wakeup->enable_count >= max_count) goto out; + if (wakeup->enable_count > 0) + goto inc; + error = acpi_enable_wakeup_device_power(adev, target_state); if (error) goto out; @@ -718,6 +710,7 @@ static int acpi_device_wakeup_enable(struct acpi_device *adev, u32 target_state) goto out; } +inc: wakeup->enable_count++; out: @@ -725,6 +718,23 @@ out: return error; } +/** + * acpi_device_wakeup_enable - Enable wakeup functionality for device. + * @adev: ACPI device to enable wakeup functionality for. + * @target_state: State the system is transitioning into. + * + * Enable the GPE associated with @adev so that it can generate wakeup signals + * for the device in response to external (remote) events and enable wakeup + * power for it. + * + * Callers must ensure that @adev is a valid ACPI device node before executing + * this function. + */ +static int acpi_device_wakeup_enable(struct acpi_device *adev, u32 target_state) +{ + return __acpi_device_wakeup_enable(adev, target_state, 1); +} + /** * acpi_device_wakeup_disable - Disable wakeup functionality for device. * @adev: ACPI device to disable wakeup functionality for. @@ -752,12 +762,8 @@ out: mutex_unlock(&acpi_wakeup_lock); } -/** - * acpi_pm_set_device_wakeup - Enable/disable remote wakeup for given device. - * @dev: Device to enable/disable to generate wakeup events. - * @enable: Whether to enable or disable the wakeup functionality. - */ -int acpi_pm_set_device_wakeup(struct device *dev, bool enable) +static int __acpi_pm_set_device_wakeup(struct device *dev, bool enable, + int max_count) { struct acpi_device *adev; int error; @@ -777,13 +783,35 @@ int acpi_pm_set_device_wakeup(struct device *dev, bool enable) return 0; } - error = acpi_device_wakeup_enable(adev, acpi_target_system_state()); + error = __acpi_device_wakeup_enable(adev, acpi_target_system_state(), + max_count); if (!error) dev_dbg(dev, "Wakeup enabled by ACPI\n"); return error; } -EXPORT_SYMBOL(acpi_pm_set_device_wakeup); + +/** + * acpi_pm_set_device_wakeup - Enable/disable remote wakeup for given device. + * @dev: Device to enable/disable to generate wakeup events. + * @enable: Whether to enable or disable the wakeup functionality. + */ +int acpi_pm_set_device_wakeup(struct device *dev, bool enable) +{ + return __acpi_pm_set_device_wakeup(dev, enable, 1); +} +EXPORT_SYMBOL_GPL(acpi_pm_set_device_wakeup); + +/** + * acpi_pm_set_bridge_wakeup - Enable/disable remote wakeup for given bridge. + * @dev: Bridge device to enable/disable to generate wakeup events. + * @enable: Whether to enable or disable the wakeup functionality. + */ +int acpi_pm_set_bridge_wakeup(struct device *dev, bool enable) +{ + return __acpi_pm_set_device_wakeup(dev, enable, INT_MAX); +} +EXPORT_SYMBOL_GPL(acpi_pm_set_bridge_wakeup); /** * acpi_dev_pm_low_power - Put ACPI device into a low-power state. 
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index e70c1c7ba1bf..a8da543b3814 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -573,7 +573,7 @@ static int acpi_pci_propagate_wakeup(struct pci_bus *bus, bool enable) { while (bus->parent) { if (acpi_pm_device_can_wakeup(&bus->self->dev)) - return acpi_pm_set_device_wakeup(&bus->self->dev, enable); + return acpi_pm_set_bridge_wakeup(&bus->self->dev, enable); bus = bus->parent; } @@ -581,7 +581,7 @@ static int acpi_pci_propagate_wakeup(struct pci_bus *bus, bool enable) /* We have reached the root bus. */ if (bus->bridge) { if (acpi_pm_device_can_wakeup(bus->bridge)) - return acpi_pm_set_device_wakeup(bus->bridge, enable); + return acpi_pm_set_bridge_wakeup(bus->bridge, enable); } return 0; } diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b7df95dbe7e9..de7e8ac9d806 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -606,6 +606,7 @@ acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); bool acpi_pm_device_can_wakeup(struct device *dev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_set_device_wakeup(struct device *dev, bool enable); +int acpi_pm_set_bridge_wakeup(struct device *dev, bool enable); #else static inline void acpi_pm_wakeup_event(struct device *dev) { @@ -636,6 +637,10 @@ static inline int acpi_pm_set_device_wakeup(struct device *dev, bool enable) { return -ENODEV; } +static inline int acpi_pm_set_bridge_wakeup(struct device *dev, bool enable) +{ + return -ENODEV; +} #endif #ifdef CONFIG_ACPI_SLEEP -- cgit v1.2.3 From 674e75411fc260b0d4532701228cfe12fc090da8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Jul 2017 12:16:38 +0530 Subject: sched: cpufreq: Allow remote cpufreq callbacks With Android UI and benchmarks the latency of cpufreq response to certain scheduling events can become very critical. Currently, callbacks into cpufreq governors are only made from the scheduler if the target CPU of the event is the same as the current CPU. This means there are certain situations where a target CPU may not run the cpufreq governor for some time. One testcase to show this behavior is where a task starts running on CPU0, then a new task is also spawned on CPU0 by a task on CPU1. If the system is configured such that the new tasks should receive maximum demand initially, this should result in CPU0 increasing frequency immediately. But because of the above-mentioned limitation, this does not occur. This patch updates the scheduler core to call the cpufreq callbacks for remote CPUs as well. The schedutil, ondemand and conservative governors are updated to process cpufreq utilization update hooks called for remote CPUs where the remote CPU is managed by the cpufreq policy of the local CPU. The intel_pstate driver is updated to always reject remote callbacks. This was tested with a couple of use cases (Android: hackbench, recentfling, galleryfling, vellamo; Ubuntu: hackbench) on an ARM hikey board (64-bit octa-core, single policy). Only galleryfling showed minor improvements, while the others didn't show much deviation. The reason is that this patch only targets a corner case, where all of the following must be true to improve performance, which doesn't happen often with these tests: - Task is migrated to another CPU. - The task has high demand, and should take the target CPU to higher OPPs. - And the target CPU doesn't call into the cpufreq governor until the next tick. Based on initial work from Steve Muckle.
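A minimal sketch of the guard the governors gain from this change (illustrative only: struct example_gov and the hook are hypothetical, while cpufreq_can_do_remote_dvfs() is the helper added to include/linux/cpufreq.h in the diff below):

#include <linux/cpufreq.h>
#include <linux/sched/cpufreq.h>

/* Hypothetical per-policy governor data embedding the update hook. */
struct example_gov {
	struct update_util_data update_util;
	struct cpufreq_policy *policy;
};

static void example_update_util(struct update_util_data *data, u64 time,
				unsigned int flags)
{
	struct example_gov *eg = container_of(data, struct example_gov,
					      update_util);

	/*
	 * The hook may now fire on a CPU other than the one being
	 * updated; bail out if this CPU cannot do DVFS for the target
	 * policy (i.e. it does not share the policy with the target).
	 */
	if (!cpufreq_can_do_remote_dvfs(eg->policy))
		return;

	/* ... compute and request the next frequency ... */
}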
Signed-off-by: Viresh Kumar Acked-by: Saravana Kannan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 3 +++ drivers/cpufreq/intel_pstate.c | 8 ++++++++ include/linux/cpufreq.h | 9 +++++++++ kernel/sched/cpufreq_schedutil.c | 31 ++++++++++++++++++++++++++----- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 8 +++++--- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 10 ++-------- 8 files changed, 55 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 47e24b5384b3..ce5f3ec7ce71 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -275,6 +275,9 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time, struct policy_dbs_info *policy_dbs = cdbs->policy_dbs; u64 delta_ns, lst; + if (!cpufreq_can_do_remote_dvfs(policy_dbs->policy)) + return; + /* * The work may not be allowed to be queued up right now. * Possible reasons: diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6cd503525638..d299b86a5a00 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1747,6 +1747,10 @@ static void intel_pstate_update_util_pid(struct update_util_data *data, struct cpudata *cpu = container_of(data, struct cpudata, update_util); u64 delta_ns = time - cpu->sample.time; + /* Don't allow remote callbacks */ + if (smp_processor_id() != cpu->cpu) + return; + if ((s64)delta_ns < pid_params.sample_rate_ns) return; @@ -1764,6 +1768,10 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, struct cpudata *cpu = container_of(data, struct cpudata, update_util); u64 delta_ns; + /* Don't allow remote callbacks */ + if (smp_processor_id() != cpu->cpu) + return; + if (flags & SCHED_CPUFREQ_IOWAIT) { cpu->iowait_boost = int_tofp(1); } else if (cpu->iowait_boost) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index f10a9b3761cd..c4035964e6b3 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -562,6 +562,15 @@ struct governor_attr { size_t count); }; +static inline bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy) +{ + /* Allow remote callbacks only on the CPUs sharing cpufreq policy */ + if (cpumask_test_cpu(smp_processor_id(), policy->cpus)) + return true; + + return false; +} + /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ddd385f2a985..7dbc76801f86 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -52,6 +52,7 @@ struct sugov_policy { struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cpu; bool iowait_boost_pending; unsigned int iowait_boost; @@ -77,6 +78,21 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) { s64 delta_ns; + /* + * Since cpufreq_update_util() is called with rq->lock held for + * the @target_cpu, our per-cpu data is fully serialized. + * + * However, drivers cannot in general deal with cross-cpu + * requests, so while get_next_freq() will work, our + * sugov_update_commit() call may not. 
+ * + * Hence stop here for remote requests if they aren't supported + * by the hardware, as calculating the frequency is pointless if + * we cannot in fact act on it. + */ + if (!cpufreq_can_do_remote_dvfs(sg_policy->policy)) + return false; + if (sg_policy->work_in_progress) return false; @@ -155,12 +171,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max) +static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) { - struct rq *rq = this_rq(); + struct rq *rq = cpu_rq(cpu); unsigned long cfs_max; - cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id()); + cfs_max = arch_scale_cpu_capacity(NULL, cpu); *util = min(rq->cfs.avg.util_avg, cfs_max); *max = cfs_max; @@ -254,7 +270,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -316,7 +332,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max); + sugov_get_util(&util, &max, sg_cpu->cpu); raw_spin_lock(&sg_policy->update_lock); @@ -697,6 +713,11 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(sugov_cpu, cpu).cpu = cpu; + return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 755bd3f1a1a9..5c3bf4bd0327 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1136,7 +1136,7 @@ static void update_curr_dl(struct rq *rq) } /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + cpufreq_update_util(rq, SCHED_CPUFREQ_DL); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c95880e216f6..d378d02fdfcb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3278,7 +3278,9 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { - if (&this_rq()->cfs == cfs_rq) { + struct rq *rq = rq_of(cfs_rq); + + if (&rq->cfs == cfs_rq) { /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be @@ -3295,7 +3297,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * * See cpu_util(). */ - cpufreq_update_util(rq_of(cfs_rq), 0); + cpufreq_update_util(rq, 0); } } @@ -4875,7 +4877,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * passed. */ if (p->in_iowait) - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { if (se->on_rq) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 45caf937ef90..0af5ca9e3e3f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -970,7 +970,7 @@ static void update_curr_rt(struct rq *rq) return; /* Kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ - cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + cpufreq_update_util(rq, SCHED_CPUFREQ_RT); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..aa9d5b87b4f8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2070,19 +2070,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, rq_clock(rq), flags); } - -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -{ - if (cpu_of(rq) == smp_processor_id()) - cpufreq_update_util(rq, flags); -} #else static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity -- cgit v1.2.3 From 99d14d0e16fadb4de001bacc0ac0786a9ebecf2a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Jul 2017 12:16:39 +0530 Subject: cpufreq: Process remote callbacks from any CPU if the platform permits On many platforms, CPUs can do DVFS across cpufreq policies, i.e., a CPU from policy-A can change the frequency of CPUs belonging to policy-B. This is quite common on ARM platforms, where we don't configure any per-cpu register. Add a flag to identify such platforms and update cpufreq_can_do_remote_dvfs() to allow remote callbacks if this flag is set. Also enable the flag for the cpufreq-dt driver, which is currently used only on ARM platforms. Signed-off-by: Viresh Kumar Acked-by: Saravana Kannan Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 1 + include/linux/cpufreq.h | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index fef3c2160691..d83ab94d041a 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -274,6 +274,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) transition_latency = CPUFREQ_ETERNAL; policy->cpuinfo.transition_latency = transition_latency; + policy->dvfs_possible_from_any_cpu = true; return 0; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c4035964e6b3..1981a4a167ce 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -127,6 +127,15 @@ struct cpufreq_policy { */ unsigned int transition_delay_us; + /* + * Remote DVFS flag (Not added to the driver structure as we don't want + * to access another structure from scheduler hotpath). + * + * Should be set if CPUs can do DVFS on behalf of other CPUs from + * different cpufreq policies. + */ + bool dvfs_possible_from_any_cpu; + /* Cached frequency lookup from cpufreq_driver_resolve_freq.
*/ unsigned int cached_target_freq; int cached_resolved_idx; @@ -564,8 +573,13 @@ struct governor_attr { static inline bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy) { - /* Allow remote callbacks only on the CPUs sharing cpufreq policy */ - if (cpumask_test_cpu(smp_processor_id(), policy->cpus)) + /* + * Allow remote callbacks if: + * - dvfs_possible_from_any_cpu flag is set + * - the local and remote CPUs share cpufreq policy + */ + if (policy->dvfs_possible_from_any_cpu || + cpumask_test_cpu(smp_processor_id(), policy->cpus)) return true; return false; -- cgit v1.2.3 From bc2eecd7ecce40af43b6eb3d256b6076257df846 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 1 Aug 2017 00:31:32 -0400 Subject: futex: Allow for compiling out PI support This makes it possible to preserve basic futex support and compile out the PI support when RT mutexes are not available. Signed-off-by: Nicolas Pitre Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708010024190.5981@knanqh.ubzr --- include/linux/futex.h | 7 ++++++- init/Kconfig | 7 ++++++- kernel/futex.c | 22 ++++++++++++++++++++++ kernel/locking/rtmutex_common.h | 29 +++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/futex.h b/include/linux/futex.h index 7c5b694864cd..f36bfd26f998 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -54,7 +54,6 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); -extern void exit_pi_state_list(struct task_struct *curr); #ifdef CONFIG_HAVE_FUTEX_CMPXCHG #define futex_cmpxchg_enabled 1 #else @@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled; static inline void exit_robust_list(struct task_struct *curr) { } +#endif + +#ifdef CONFIG_FUTEX_PI +extern void exit_pi_state_list(struct task_struct *curr); +#else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif + #endif diff --git a/init/Kconfig b/init/Kconfig index 8514b25db21c..5f0ef850e808 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1275,12 +1275,17 @@ config BASE_FULL config FUTEX bool "Enable futex support" if EXPERT default y - select RT_MUTEXES + imply RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES + default y + config HAVE_FUTEX_CMPXCHG bool depends on FUTEX diff --git a/kernel/futex.c b/kernel/futex.c index 16dbe4c93895..ad0af4df1b9d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -875,6 +875,8 @@ static struct task_struct *futex_find_get_task(pid_t pid) return p; } +#ifdef CONFIG_FUTEX_PI + /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. @@ -932,6 +934,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +#endif + /* * We need to check the following states: * @@ -1799,6 +1803,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. 
+ */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This @@ -2594,6 +2607,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (refill_pi_state_cache()) return -ENOMEM; @@ -2773,6 +2789,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) struct futex_q *top_waiter; int ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + retry: if (get_user(uval, uaddr)) return -EFAULT; @@ -2983,6 +3002,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (uaddr == uaddr2) return -EINVAL; diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a9a794..8d039b928d61 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,6 +40,9 @@ struct rt_mutex_waiter { /* * Various helpers to access the waiters-tree: */ + +#ifdef CONFIG_RT_MUTEXES + static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters); @@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p) pi_tree_entry); } +#else + +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return false; +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + return NULL; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return false; +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return NULL; +} + +#endif + /* * lock->owner state tracking: */ -- cgit v1.2.3 From ffb959bbdf923b4f89a08a04aba2501b1b16d164 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 31 Jul 2017 10:29:38 -0400 Subject: mm: remove optimizations based on i_size in mapping writeback waits Marcelo added this i_size based optimization with a patch in 2004 (commitid is from the linux-history tree): commit 765dad09b4ac101a32d87af2bb793c3060497d3c Author: Marcelo Tosatti Date: Tue Sep 7 17:51:17 2004 -0700 small wait_on_page_writeback_range() optimization filemap_fdatawait() calls wait_on_page_writeback_range() with -1 as "end" parameter. This is not needed since we know the EOF from the inode. Use that instead. There may be races here, particularly with clustered or network filesystems. It also seems like a bit of a layering violation since we're operating on an address_space here, not an inode. Finally, it's also questionable whether this optimization really helps on workloads that we care about. Should we be optimizing for writeback vs. truncate races in a codepath where we expect to wait anyway? It doesn't seem worth the risk. Remove this optimization from the filemap_fdatawait codepaths. This means that filemap_fdatawait becomes a trivial wrapper around filemap_fdatawait_range. 
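For reference, the caller-visible pattern this affects (hypothetical caller; filemap_fdatawrite() and filemap_fdatawait() are the real APIs touched here):

#include <linux/fs.h>

/*
 * Start writeback, then wait for it. After this patch the wait
 * always covers the full range [0, LLONG_MAX] instead of being
 * clipped to i_size, so a racing truncate or extension no longer
 * changes which pages are waited on.
 */
static int example_flush_and_wait(struct address_space *mapping)
{
	int err;

	err = filemap_fdatawrite(mapping);
	if (err)
		return err;

	return filemap_fdatawait(mapping);
}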
Reviewed-by: Jan Kara Signed-off-by: Jeff Layton --- include/linux/fs.h | 7 ++++++- mm/filemap.c | 30 +----------------------------- 2 files changed, 7 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index af592ca3d509..909210bd6366 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2538,10 +2538,15 @@ extern int invalidate_inode_pages2_range(struct address_space *mapping, extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); -extern int filemap_fdatawait(struct address_space *); extern int filemap_fdatawait_keep_errors(struct address_space *mapping); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); + +static inline int filemap_fdatawait(struct address_space *mapping) +{ + return filemap_fdatawait_range(mapping, 0, LLONG_MAX); +} + extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, diff --git a/mm/filemap.c b/mm/filemap.c index 394bb5e96f87..85dfe3bee324 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -512,39 +512,11 @@ EXPORT_SYMBOL(file_fdatawait_range); */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - __filemap_fdatawait_range(mapping, 0, i_size - 1); + __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); -/** - * filemap_fdatawait - wait for all under-writeback pages to complete - * @mapping: address space structure to wait for - * - * Walk the list of under-writeback pages of the given address space - * and wait for all of them. Check error status of the address space - * and return it. - * - * Since the error status of the address space is cleared by this function, - * callers are responsible for checking the return value and handling and/or - * reporting the error. - */ -int filemap_fdatawait(struct address_space *mapping) -{ - loff_t i_size = i_size_read(mapping->host); - - if (i_size == 0) - return 0; - - return filemap_fdatawait_range(mapping, 0, i_size - 1); -} -EXPORT_SYMBOL(filemap_fdatawait); - static bool mapping_needs_writeback(struct address_space *mapping) { return (!dax_mapping(mapping) && mapping->nrpages) || -- cgit v1.2.3 From 6104c37094e729f3d4ce65797002112735d49cd1 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 1 Aug 2017 17:32:07 +0200 Subject: fbcon: Make fbcon a build-time dependency for fbdev There's a bunch of folks who're trying to make printk less contended and faster, but there's a problem: printk uses the console_lock, and the console lock has become the BKL for all things fbdev/fbcon, which in turn pulled in half the drm subsystem under that lock. That's awkward. The reasons for that are probably just a historical accident: - fbcon is a runtime option of fbdev, i.e. at runtime you can pick whether your fbdev driver instances are used as kernel consoles. Unfortunately this wasn't implemented with some module option, but through some module loading magic: As long as you don't load fbcon.ko, there's no fbdev console support, but loading it (in any order wrt fbdev drivers) will create console instances for all fbdev drivers. - This was implemented through a notifier chain.
fbcon.ko enumerates all fbdev instances at load time and also registers itself as a listener in the fbdev notifier. The fbdev core tries to register new fbdev instances with fbcon using the notifier. - On top of that, the notifier chain is also used at runtime by the fbdev subsystem to e.g. control backlights for panels. - The problem is that the notifier puts a mutex locking context between fbdev and fbcon, which mixes up the locking contexts for both the runtime usage and the register time usage to notify fbcon. And at runtime fbcon (through the fbdev core) might call into the notifier from a printk critical section while console_lock is held. - This means console_lock must be an outer lock for the entire fbdev subsystem, which also means it must be acquired when registering a new framebuffer driver as the outermost lock since we might call into fbcon (through the notifier) which would result in a locking inversion if fbcon would acquire the console_lock from its notifier callback (which it needs to register the console). - console_lock can be held anywhere, since printk can be called anywhere, and through the above story, plus drm/kms being an fbdev driver, we pull in a shocking amount of locking hierarchy underneath the console_lock. Which makes cleaning up printk really hard (not even splitting console_lock into an rwsem is all that useful due to this). There are various ways to address this, but the cleanest would be to make fbcon a compile-time option, where fbdev directly calls the fbcon register functions from register_framebuffer, or dummy static inline versions if fbcon is disabled. Maybe augmented with a runtime knob to disable fbcon, if that's needed (for debugging perhaps). But this could break some users who rely on the magic "loading fbcon.ko enables/disables fbdev framebuffers at runtime" thing, even if that's unlikely. Hence we must be careful: 1. Create a compile-time dependency between fbcon and fbdev in the most minimal way. This is what this patch does. 2. Wait at least 1 year to give possible users time to scream about how we broke their setup. Unlikely, since all distros make fbcon compile-in, and embedded platforms only compile stuff they know they need anyway. But still. 3. Convert the notifier to direct function calls, with dummy static inlines if fbcon is disabled. We'll still need the fb notifier for the other uses (like backlights), but we can probably move it into the fb core (atm it must be built-into vmlinux). 4. Push console_lock down the call-chain, until it is down in console_register again. 5. Finally start to clean up and rework the printk/console locking. For context of this saga see commit 50e244cc793d511b86adea24972f3a7264cae114 Author: Alan Cox Date: Fri Jan 25 10:28:15 2013 +1000 fb: rework locking to fix lock ordering on takeover plus the pile of commits on top that tried to make this all work without terminally upsetting lockdep. We uncovered all this when console_lock lockdep annotations were added in commit daee779718a319ff9f83e1ba3339334ac650bb22 Author: Daniel Vetter Date: Sat Sep 22 19:52:11 2012 +0200 console: implement lockdep support for console_lock On the patch itself: - Switch CONFIG_FRAMEBUFFER_CONSOLE to be a boolean, using the overall CONFIG_FB tristate to decide whether it should be a module or built-in. - At first I thought I could force the build dependency with just a dummy symbol that fbcon.ko exports and fb.ko uses. But that leads to a module dependency cycle (it works fine when built-in).
Since this tight binding is the entire goal, the simplest solution is to move all the fbcon modules (and there's a bunch of optional source files which are each modules of their own, for no good reason) into the overall fb.ko core module. That's a bit more than what I would have liked to do in this patch, but oh well. Signed-off-by: Daniel Vetter Cc: Alan Cox Cc: Sergey Senozhatsky Cc: Steven Rostedt Reviewed-by: Sean Paul Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/console/Kconfig | 2 +- drivers/video/console/Makefile | 8 - drivers/video/console/bitblit.c | 422 ---- drivers/video/console/fbcon.c | 3665 ------------------------------- drivers/video/console/fbcon.h | 265 --- drivers/video/console/fbcon_ccw.c | 424 ---- drivers/video/console/fbcon_cw.c | 407 ---- drivers/video/console/fbcon_rotate.c | 116 - drivers/video/console/fbcon_rotate.h | 96 - drivers/video/console/fbcon_ud.c | 452 ---- drivers/video/console/softcursor.c | 82 - drivers/video/console/tileblit.c | 159 -- drivers/video/fbdev/core/Makefile | 11 + drivers/video/fbdev/core/bitblit.c | 418 ++++ drivers/video/fbdev/core/fbcon.c | 3658 ++++++++++++++++++++++++++++++ drivers/video/fbdev/core/fbcon.h | 265 +++ drivers/video/fbdev/core/fbcon_ccw.c | 420 ++++ drivers/video/fbdev/core/fbcon_cw.c | 403 ++++ drivers/video/fbdev/core/fbcon_rotate.c | 112 + drivers/video/fbdev/core/fbcon_rotate.h | 96 + drivers/video/fbdev/core/fbcon_ud.c | 448 ++++ drivers/video/fbdev/core/fbmem.c | 6 + drivers/video/fbdev/core/softcursor.c | 78 + drivers/video/fbdev/core/tileblit.c | 154 ++ include/linux/fbcon.h | 12 + 25 files changed, 6082 insertions(+), 6097 deletions(-) delete mode 100644 drivers/video/console/bitblit.c delete mode 100644 drivers/video/console/fbcon.c delete mode 100644 drivers/video/console/fbcon.h delete mode 100644 drivers/video/console/fbcon_ccw.c delete mode 100644 drivers/video/console/fbcon_cw.c delete mode 100644 drivers/video/console/fbcon_rotate.c delete mode 100644 drivers/video/console/fbcon_rotate.h delete mode 100644 drivers/video/console/fbcon_ud.c delete mode 100644 drivers/video/console/softcursor.c delete mode 100644 drivers/video/console/tileblit.c create mode 100644 drivers/video/fbdev/core/bitblit.c create mode 100644 drivers/video/fbdev/core/fbcon.c create mode 100644 drivers/video/fbdev/core/fbcon.h create mode 100644 drivers/video/fbdev/core/fbcon_ccw.c create mode 100644 drivers/video/fbdev/core/fbcon_cw.c create mode 100644 drivers/video/fbdev/core/fbcon_rotate.c create mode 100644 drivers/video/fbdev/core/fbcon_rotate.h create mode 100644 drivers/video/fbdev/core/fbcon_ud.c create mode 100644 drivers/video/fbdev/core/softcursor.c create mode 100644 drivers/video/fbdev/core/tileblit.c create mode 100644 include/linux/fbcon.h (limited to 'include') diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 2111d06f8c81..7f1f1fbcef9e 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -117,7 +117,7 @@ config DUMMY_CONSOLE_ROWS Select 25 if you use a 640x480 resolution by default.
 config FRAMEBUFFER_CONSOLE
-	tristate "Framebuffer Console support"
+	bool "Framebuffer Console support"
 	depends on FB && !UML
 	select VT_HW_CONSOLE_BINDING
 	select CRC32
diff --git a/drivers/video/console/Makefile b/drivers/video/console/Makefile
index 43bfa485db96..eb2cbec52643 100644
--- a/drivers/video/console/Makefile
+++ b/drivers/video/console/Makefile
@@ -7,13 +7,5 @@ obj-$(CONFIG_SGI_NEWPORT_CONSOLE) += newport_con.o
 obj-$(CONFIG_STI_CONSOLE)         += sticon.o sticore.o
 obj-$(CONFIG_VGA_CONSOLE)         += vgacon.o
 obj-$(CONFIG_MDA_CONSOLE)         += mdacon.o
-obj-$(CONFIG_FRAMEBUFFER_CONSOLE) += fbcon.o bitblit.o softcursor.o
-ifeq ($(CONFIG_FB_TILEBLITTING),y)
-obj-$(CONFIG_FRAMEBUFFER_CONSOLE) += tileblit.o
-endif
-ifeq ($(CONFIG_FRAMEBUFFER_CONSOLE_ROTATION),y)
-obj-$(CONFIG_FRAMEBUFFER_CONSOLE) += fbcon_rotate.o fbcon_cw.o fbcon_ud.o \
-                                     fbcon_ccw.o
-endif
 
 obj-$(CONFIG_FB_STI)              += sticore.o
diff --git a/drivers/video/console/bitblit.c b/drivers/video/console/bitblit.c
deleted file mode 100644
index dbfe4eecf12e..000000000000
--- a/drivers/video/console/bitblit.c
+++ /dev/null
@@ -1,422 +0,0 @@
-/*
- *  linux/drivers/video/console/bitblit.c -- BitBlitting Operation
- *
- *  Originally from the 'accel_*' routines in drivers/video/console/fbcon.c
- *
- *  Copyright (C) 2004 Antonino Daplas
- *
- *  This file is subject to the terms and conditions of the GNU General Public
- *  License.  See the file COPYING in the main directory of this archive for
- *  more details.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/fb.h>
-#include <linux/vt_kern.h>
-#include <linux/console.h>
-#include <asm/types.h>
-#include "fbcon.h"
-
-/*
- * Accelerated handlers.
- */
-static void update_attr(u8 *dst, u8 *src, int attribute,
-                        struct vc_data *vc)
-{
-        int i, offset = (vc->vc_font.height < 10) ? 1 : 2;
-        int width = DIV_ROUND_UP(vc->vc_font.width, 8);
-        unsigned int cellsize = vc->vc_font.height * width;
-        u8 c;
-
-        offset = cellsize - (offset * width);
-        for (i = 0; i < cellsize; i++) {
-                c = src[i];
-                if (attribute & FBCON_ATTRIBUTE_UNDERLINE && i >= offset)
-                        c = 0xff;
-                if (attribute & FBCON_ATTRIBUTE_BOLD)
-                        c |= c >> 1;
-                if (attribute & FBCON_ATTRIBUTE_REVERSE)
-                        c = ~c;
-                dst[i] = c;
-        }
-}
-
-static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy,
-                      int sx, int dy, int dx, int height, int width)
-{
-        struct fb_copyarea area;
-
-        area.sx = sx * vc->vc_font.width;
-        area.sy = sy * vc->vc_font.height;
-        area.dx = dx * vc->vc_font.width;
-        area.dy = dy * vc->vc_font.height;
-        area.height = height * vc->vc_font.height;
-        area.width = width * vc->vc_font.width;
-
-        info->fbops->fb_copyarea(info, &area);
-}
-
-static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy,
-                      int sx, int height, int width)
-{
-        int bgshift = (vc->vc_hi_font_mask) ? 13 : 12;
-        struct fb_fillrect region;
-
-        region.color = attr_bgcol_ec(bgshift, vc, info);
-        region.dx = sx * vc->vc_font.width;
-        region.dy = sy * vc->vc_font.height;
-        region.width = width * vc->vc_font.width;
-        region.height = height * vc->vc_font.height;
-        region.rop = ROP_COPY;
-
-        info->fbops->fb_fillrect(info, &region);
-}
-
-static inline void bit_putcs_aligned(struct vc_data *vc, struct fb_info *info,
-                                     const u16 *s, u32 attr, u32 cnt,
-                                     u32 d_pitch, u32 s_pitch, u32 cellsize,
-                                     struct fb_image *image, u8 *buf, u8 *dst)
-{
-        u16 charmask = vc->vc_hi_font_mask ?
                                                0x1ff : 0xff;
-        u32 idx = vc->vc_font.width >> 3;
-        u8 *src;
-
-        while (cnt--) {
-                src = vc->vc_font.data + (scr_readw(s++)&
-                                          charmask)*cellsize;
-
-                if (attr) {
-                        update_attr(buf, src, attr, vc);
-                        src = buf;
-                }
-
-                if (likely(idx == 1))
-                        __fb_pad_aligned_buffer(dst, d_pitch, src, idx,
-                                                image->height);
-                else
-                        fb_pad_aligned_buffer(dst, d_pitch, src, idx,
-                                              image->height);
-
-                dst += s_pitch;
-        }
-
-        info->fbops->fb_imageblit(info, image);
-}
-
-static inline void bit_putcs_unaligned(struct vc_data *vc,
-                                       struct fb_info *info, const u16 *s,
-                                       u32 attr, u32 cnt, u32 d_pitch,
-                                       u32 s_pitch, u32 cellsize,
-                                       struct fb_image *image, u8 *buf,
-                                       u8 *dst)
-{
-        u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff;
-        u32 shift_low = 0, mod = vc->vc_font.width % 8;
-        u32 shift_high = 8;
-        u32 idx = vc->vc_font.width >> 3;
-        u8 *src;
-
-        while (cnt--) {
-                src = vc->vc_font.data + (scr_readw(s++)&
-                                          charmask)*cellsize;
-
-                if (attr) {
-                        update_attr(buf, src, attr, vc);
-                        src = buf;
-                }
-
-                fb_pad_unaligned_buffer(dst, d_pitch, src, idx,
-                                        image->height, shift_high,
-                                        shift_low, mod);
-                shift_low += mod;
-                dst += (shift_low >= 8) ? s_pitch : s_pitch - 1;
-                shift_low &= 7;
-                shift_high = 8 - shift_low;
-        }
-
-        info->fbops->fb_imageblit(info, image);
-
-}
-
-static void bit_putcs(struct vc_data *vc, struct fb_info *info,
-                      const unsigned short *s, int count, int yy, int xx,
-                      int fg, int bg)
-{
-        struct fb_image image;
-        u32 width = DIV_ROUND_UP(vc->vc_font.width, 8);
-        u32 cellsize = width * vc->vc_font.height;
-        u32 maxcnt = info->pixmap.size/cellsize;
-        u32 scan_align = info->pixmap.scan_align - 1;
-        u32 buf_align = info->pixmap.buf_align - 1;
-        u32 mod = vc->vc_font.width % 8, cnt, pitch, size;
-        u32 attribute = get_attribute(info, scr_readw(s));
-        u8 *dst, *buf = NULL;
-
-        image.fg_color = fg;
-        image.bg_color = bg;
-        image.dx = xx * vc->vc_font.width;
-        image.dy = yy * vc->vc_font.height;
-        image.height = vc->vc_font.height;
-        image.depth = 1;
-
-        if (attribute) {
-                buf = kmalloc(cellsize, GFP_ATOMIC);
-                if (!buf)
-                        return;
-        }
-
-        while (count) {
-                if (count > maxcnt)
-                        cnt = maxcnt;
-                else
-                        cnt = count;
-
-                image.width = vc->vc_font.width * cnt;
-                pitch = DIV_ROUND_UP(image.width, 8) + scan_align;
-                pitch &= ~scan_align;
-                size = pitch * image.height + buf_align;
-                size &= ~buf_align;
-                dst = fb_get_buffer_offset(info, &info->pixmap, size);
-                image.data = dst;
-
-                if (!mod)
-                        bit_putcs_aligned(vc, info, s, attribute, cnt, pitch,
-                                          width, cellsize, &image, buf, dst);
-                else
-                        bit_putcs_unaligned(vc, info, s, attribute, cnt,
-                                            pitch, width, cellsize, &image,
-                                            buf, dst);
-
-                image.dx += cnt * vc->vc_font.width;
-                count -= cnt;
-                s += cnt;
-        }
-
-        /* buf is always NULL except when in monochrome mode, so in this case
-           it's a gain to check buf against NULL even though kfree() handles
-           NULL pointers just fine */
-        if (unlikely(buf))
-                kfree(buf);
-
-}
-
-static void bit_clear_margins(struct vc_data *vc, struct fb_info *info,
-                              int bottom_only)
-{
-        unsigned int cw = vc->vc_font.width;
-        unsigned int ch = vc->vc_font.height;
-        unsigned int rw = info->var.xres - (vc->vc_cols*cw);
-        unsigned int bh = info->var.yres - (vc->vc_rows*ch);
-        unsigned int rs = info->var.xres - rw;
-        unsigned int bs = info->var.yres - bh;
-        struct fb_fillrect region;
-
-        region.color = 0;
-        region.rop = ROP_COPY;
-
-        if (rw && !bottom_only) {
-                region.dx = info->var.xoffset + rs;
-                region.dy = 0;
-                region.width = rw;
-                region.height = info->var.yres_virtual;
-                info->fbops->fb_fillrect(info, &region);
-        }
-
-        if (bh) {
-                region.dx = info->var.xoffset;
-                region.dy = info->var.yoffset + bs;
-                region.width = rs;
-                region.height = bh;
-                info->fbops->fb_fillrect(info, &region);
-        }
-}
-
-static void bit_cursor(struct vc_data *vc, struct fb_info *info, int mode,
-                       int softback_lines, int fg, int bg)
-{
-        struct fb_cursor cursor;
-        struct fbcon_ops *ops = info->fbcon_par;
-        unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff;
-        int w = DIV_ROUND_UP(vc->vc_font.width, 8), c;
-        int y = real_y(ops->p, vc->vc_y);
-        int attribute, use_sw = (vc->vc_cursor_type & 0x10);
-        int err = 1;
-        char *src;
-
-        cursor.set = 0;
-
-        if (softback_lines) {
-                if (y + softback_lines >= vc->vc_rows) {
-                        mode = CM_ERASE;
-                        ops->cursor_flash = 0;
-                        return;
-                } else
-                        y += softback_lines;
-        }
-
-        c = scr_readw((u16 *) vc->vc_pos);
-        attribute = get_attribute(info, c);
-        src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height));
-
-        if (ops->cursor_state.image.data != src ||
-            ops->cursor_reset) {
-                ops->cursor_state.image.data = src;
-                cursor.set |= FB_CUR_SETIMAGE;
-        }
-
-        if (attribute) {
-                u8 *dst;
-
-                dst = kmalloc(w * vc->vc_font.height, GFP_ATOMIC);
-                if (!dst)
-                        return;
-                kfree(ops->cursor_data);
-                ops->cursor_data = dst;
-                update_attr(dst, src, attribute, vc);
-                src = dst;
-        }
-
-        if (ops->cursor_state.image.fg_color != fg ||
-            ops->cursor_state.image.bg_color != bg ||
-            ops->cursor_reset) {
-                ops->cursor_state.image.fg_color = fg;
-                ops->cursor_state.image.bg_color = bg;
-                cursor.set |= FB_CUR_SETCMAP;
-        }
-
-        if ((ops->cursor_state.image.dx != (vc->vc_font.width * vc->vc_x)) ||
-            (ops->cursor_state.image.dy != (vc->vc_font.height * y)) ||
-            ops->cursor_reset) {
-                ops->cursor_state.image.dx = vc->vc_font.width * vc->vc_x;
-                ops->cursor_state.image.dy = vc->vc_font.height * y;
-                cursor.set |= FB_CUR_SETPOS;
-        }
-
-        if (ops->cursor_state.image.height != vc->vc_font.height ||
-            ops->cursor_state.image.width != vc->vc_font.width ||
-            ops->cursor_reset) {
-                ops->cursor_state.image.height = vc->vc_font.height;
-                ops->cursor_state.image.width = vc->vc_font.width;
-                cursor.set |= FB_CUR_SETSIZE;
-        }
-
-        if (ops->cursor_state.hot.x || ops->cursor_state.hot.y ||
-            ops->cursor_reset) {
-                ops->cursor_state.hot.x = cursor.hot.y = 0;
-                cursor.set |= FB_CUR_SETHOT;
-        }
-
-        if (cursor.set & FB_CUR_SETSIZE ||
-            vc->vc_cursor_type != ops->p->cursor_shape ||
-            ops->cursor_state.mask == NULL ||
-            ops->cursor_reset) {
-                char *mask = kmalloc(w*vc->vc_font.height, GFP_ATOMIC);
-                int cur_height, size, i = 0;
-                u8 msk = 0xff;
-
-                if (!mask)
-                        return;
-
-                kfree(ops->cursor_state.mask);
-                ops->cursor_state.mask = mask;
-
-                ops->p->cursor_shape = vc->vc_cursor_type;
-                cursor.set |= FB_CUR_SETSHAPE;
-
-                switch (ops->p->cursor_shape & CUR_HWMASK) {
-                case CUR_NONE:
-                        cur_height = 0;
-                        break;
-                case CUR_UNDERLINE:
-                        cur_height = (vc->vc_font.height < 10) ? 1 : 2;
-                        break;
-                case CUR_LOWER_THIRD:
-                        cur_height = vc->vc_font.height/3;
-                        break;
-                case CUR_LOWER_HALF:
-                        cur_height = vc->vc_font.height >> 1;
-                        break;
-                case CUR_TWO_THIRDS:
-                        cur_height = (vc->vc_font.height << 1)/3;
-                        break;
-                case CUR_BLOCK:
-                default:
-                        cur_height = vc->vc_font.height;
-                        break;
-                }
-                size = (vc->vc_font.height - cur_height) * w;
-                while (size--)
-                        mask[i++] = ~msk;
-                size = cur_height * w;
-                while (size--)
-                        mask[i++] = msk;
-        }
-
-        switch (mode) {
-        case CM_ERASE:
-                ops->cursor_state.enable = 0;
-                break;
-        case CM_DRAW:
-        case CM_MOVE:
-        default:
-                ops->cursor_state.enable = (use_sw) ?
0 : 1; - break; - } - - cursor.image.data = src; - cursor.image.fg_color = ops->cursor_state.image.fg_color; - cursor.image.bg_color = ops->cursor_state.image.bg_color; - cursor.image.dx = ops->cursor_state.image.dx; - cursor.image.dy = ops->cursor_state.image.dy; - cursor.image.height = ops->cursor_state.image.height; - cursor.image.width = ops->cursor_state.image.width; - cursor.hot.x = ops->cursor_state.hot.x; - cursor.hot.y = ops->cursor_state.hot.y; - cursor.mask = ops->cursor_state.mask; - cursor.enable = ops->cursor_state.enable; - cursor.image.depth = 1; - cursor.rop = ROP_XOR; - - if (info->fbops->fb_cursor) - err = info->fbops->fb_cursor(info, &cursor); - - if (err) - soft_cursor(info, &cursor); - - ops->cursor_reset = 0; -} - -static int bit_update_start(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - int err; - - err = fb_pan_display(info, &ops->var); - ops->var.xoffset = info->var.xoffset; - ops->var.yoffset = info->var.yoffset; - ops->var.vmode = info->var.vmode; - return err; -} - -void fbcon_set_bitops(struct fbcon_ops *ops) -{ - ops->bmove = bit_bmove; - ops->clear = bit_clear; - ops->putcs = bit_putcs; - ops->clear_margins = bit_clear_margins; - ops->cursor = bit_cursor; - ops->update_start = bit_update_start; - ops->rotate_font = NULL; - - if (ops->rotate) - fbcon_set_rotate(ops); -} - -EXPORT_SYMBOL(fbcon_set_bitops); - -MODULE_AUTHOR("Antonino Daplas "); -MODULE_DESCRIPTION("Bit Blitting Operation"); -MODULE_LICENSE("GPL"); - diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c deleted file mode 100644 index 12ded23f1aaf..000000000000 --- a/drivers/video/console/fbcon.c +++ /dev/null @@ -1,3665 +0,0 @@ -/* - * linux/drivers/video/fbcon.c -- Low level frame buffer based console driver - * - * Copyright (C) 1995 Geert Uytterhoeven - * - * - * This file is based on the original Amiga console driver (amicon.c): - * - * Copyright (C) 1993 Hamish Macdonald - * Greg Harp - * Copyright (C) 1994 David Carter [carter@compsci.bristol.ac.uk] - * - * with work by William Rucklidge (wjr@cs.cornell.edu) - * Geert Uytterhoeven - * Jes Sorensen (jds@kom.auc.dk) - * Martin Apel - * - * and on the original Atari console driver (atacon.c): - * - * Copyright (C) 1993 Bjoern Brauel - * Roman Hodek - * - * with work by Guenther Kelleter - * Martin Schaller - * Andreas Schwab - * - * Hardware cursor support added by Emmanuel Marty (core@ggi-project.org) - * Smart redraw scrolling, arbitrary font width support, 512char font support - * and software scrollback added by - * Jakub Jelinek (jj@ultra.linux.cz) - * - * Random hacking by Martin Mares - * - * 2001 - Documented with DocBook - * - Brad Douglas - * - * The low level operations for the various display memory organizations are - * now in separate source files. - * - * Currently the following organizations are supported: - * - * o afb Amiga bitplanes - * o cfb{2,4,8,16,24,32} Packed pixels - * o ilbm Amiga interleaved bitplanes - * o iplan2p[248] Atari interleaved bitplanes - * o mfb Monochrome - * o vga VGA characters/attributes - * - * To do: - * - * - Implement 16 plane mode (iplan2p16) - * - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive for - * more details. 
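
A note on the structure being removed here: bitblit.c is fbcon's default drawing backend, and fbcon_set_bitops() above publishes it by filling a per-display table of function pointers that the core console code then calls through. A minimal sketch of that dispatch pattern, with simplified hypothetical types (the real table is struct fbcon_ops in fbcon.h and carries many more hooks):

/* Hypothetical, simplified dispatch table; illustration only. */
struct draw_ops {
        void (*clear)(int sy, int sx, int height, int width);
        void (*putcs)(const unsigned short *s, int count, int y, int x);
};

static void bitmap_clear(int sy, int sx, int height, int width)
{
        /* fill the rectangle via the driver's fb_fillrect() */
}

static void bitmap_putcs(const unsigned short *s, int count, int y, int x)
{
        /* render glyphs via the driver's fb_imageblit() */
}

/* Selected once per display; every later operation is an indirect call. */
static void set_bitmap_ops(struct draw_ops *ops)
{
        ops->clear = bitmap_clear;
        ops->putcs = bitmap_putcs;
}
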
- */
-
-#undef FBCONDEBUG
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>	/* MSch: for IRQ probe */
-#include <linux/console.h>
-#include <linux/string.h>
-#include <linux/kd.h>
-#include <linux/slab.h>
-#include <linux/fb.h>
-#include <linux/vt_kern.h>
-#include <linux/selection.h>
-#include <linux/font.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/crc32.h> /* For counting font checksums */
-#include <asm/fb.h>
-#include <asm/irq.h>
-
-#include "fbcon.h"
-
-#ifdef FBCONDEBUG
-#  define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __func__ , ## args)
-#else
-#  define DPRINTK(fmt, args...)
-#endif
-
-enum {
-        FBCON_LOGO_CANSHOW      = -1,   /* the logo can be shown */
-        FBCON_LOGO_DRAW         = -2,   /* draw the logo to a console */
-        FBCON_LOGO_DONTSHOW     = -3    /* do not show the logo */
-};
-
-static struct display fb_display[MAX_NR_CONSOLES];
-
-static signed char con2fb_map[MAX_NR_CONSOLES];
-static signed char con2fb_map_boot[MAX_NR_CONSOLES];
-
-static int logo_lines;
-/* logo_shown is an index to vc_cons when >= 0; otherwise follows FBCON_LOGO
-   enums.  */
-static int logo_shown = FBCON_LOGO_CANSHOW;
-/* Software scrollback */
-static int fbcon_softback_size = 32768;
-static unsigned long softback_buf, softback_curr;
-static unsigned long softback_in;
-static unsigned long softback_top, softback_end;
-static int softback_lines;
-/* console mappings */
-static int first_fb_vc;
-static int last_fb_vc = MAX_NR_CONSOLES - 1;
-static int fbcon_is_default = 1;
-static int fbcon_has_exited;
-static int primary_device = -1;
-static int fbcon_has_console_bind;
-
-#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY
-static int map_override;
-
-static inline void fbcon_map_override(void)
-{
-        map_override = 1;
-}
-#else
-static inline void fbcon_map_override(void)
-{
-}
-#endif /* CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY */
-
-/* font data */
-static char fontname[40];
-
-/* current fb_info */
-static int info_idx = -1;
-
-/* console rotation */
-static int initial_rotation;
-static int fbcon_has_sysfs;
-
-static const struct consw fb_con;
-
-#define CM_SOFTBACK     (8)
-
-#define advance_row(p, delta) (unsigned short *)((unsigned long)(p) + (delta) * vc->vc_size_row)
-
-static int fbcon_set_origin(struct vc_data *);
-
-static int fbcon_cursor_noblink;
-
-#define divides(a, b)   ((!(a) || (b)%(a)) ?
0 : 1) - -/* - * Interface used by the world - */ - -static const char *fbcon_startup(void); -static void fbcon_init(struct vc_data *vc, int init); -static void fbcon_deinit(struct vc_data *vc); -static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, - int width); -static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos); -static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, - int count, int ypos, int xpos); -static void fbcon_clear_margins(struct vc_data *vc, int bottom_only); -static void fbcon_cursor(struct vc_data *vc, int mode); -static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, - int height, int width); -static int fbcon_switch(struct vc_data *vc); -static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); -static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); - -/* - * Internal routines - */ -static __inline__ void ywrap_up(struct vc_data *vc, int count); -static __inline__ void ywrap_down(struct vc_data *vc, int count); -static __inline__ void ypan_up(struct vc_data *vc, int count); -static __inline__ void ypan_down(struct vc_data *vc, int count); -static void fbcon_bmove_rec(struct vc_data *vc, struct display *p, int sy, int sx, - int dy, int dx, int height, int width, u_int y_break); -static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, - int unit); -static void fbcon_redraw_move(struct vc_data *vc, struct display *p, - int line, int count, int dy); -static void fbcon_modechanged(struct fb_info *info); -static void fbcon_set_all_vcs(struct fb_info *info); -static void fbcon_start(void); -static void fbcon_exit(void); -static struct device *fbcon_device; - -#ifdef CONFIG_FRAMEBUFFER_CONSOLE_ROTATION -static inline void fbcon_set_rotation(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - if (!(info->flags & FBINFO_MISC_TILEBLITTING) && - ops->p->con_rotate < 4) - ops->rotate = ops->p->con_rotate; - else - ops->rotate = 0; -} - -static void fbcon_rotate(struct fb_info *info, u32 rotate) -{ - struct fbcon_ops *ops= info->fbcon_par; - struct fb_info *fb_info; - - if (!ops || ops->currcon == -1) - return; - - fb_info = registered_fb[con2fb_map[ops->currcon]]; - - if (info == fb_info) { - struct display *p = &fb_display[ops->currcon]; - - if (rotate < 4) - p->con_rotate = rotate; - else - p->con_rotate = 0; - - fbcon_modechanged(info); - } -} - -static void fbcon_rotate_all(struct fb_info *info, u32 rotate) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct vc_data *vc; - struct display *p; - int i; - - if (!ops || ops->currcon < 0 || rotate > 3) - return; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - vc = vc_cons[i].d; - if (!vc || vc->vc_mode != KD_TEXT || - registered_fb[con2fb_map[i]] != info) - continue; - - p = &fb_display[vc->vc_num]; - p->con_rotate = rotate; - } - - fbcon_set_all_vcs(info); -} -#else -static inline void fbcon_set_rotation(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - ops->rotate = FB_ROTATE_UR; -} - -static void fbcon_rotate(struct fb_info *info, u32 rotate) -{ - return; -} - -static void fbcon_rotate_all(struct fb_info *info, u32 rotate) -{ - return; -} -#endif /* CONFIG_FRAMEBUFFER_CONSOLE_ROTATION */ - -static int fbcon_get_rotate(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - return (ops) ? 
ops->rotate : 0; -} - -static inline int fbcon_is_inactive(struct vc_data *vc, struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - return (info->state != FBINFO_STATE_RUNNING || - vc->vc_mode != KD_TEXT || ops->graphics) && - !vt_force_oops_output(vc); -} - -static int get_color(struct vc_data *vc, struct fb_info *info, - u16 c, int is_fg) -{ - int depth = fb_get_color_depth(&info->var, &info->fix); - int color = 0; - - if (console_blanked) { - unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; - - c = vc->vc_video_erase_char & charmask; - } - - if (depth != 1) - color = (is_fg) ? attr_fgcol((vc->vc_hi_font_mask) ? 9 : 8, c) - : attr_bgcol((vc->vc_hi_font_mask) ? 13 : 12, c); - - switch (depth) { - case 1: - { - int col = mono_col(info); - /* 0 or 1 */ - int fg = (info->fix.visual != FB_VISUAL_MONO01) ? col : 0; - int bg = (info->fix.visual != FB_VISUAL_MONO01) ? 0 : col; - - if (console_blanked) - fg = bg; - - color = (is_fg) ? fg : bg; - break; - } - case 2: - /* - * Scale down 16-colors to 4 colors. Default 4-color palette - * is grayscale. However, simply dividing the values by 4 - * will not work, as colors 1, 2 and 3 will be scaled-down - * to zero rendering them invisible. So empirically convert - * colors to a sane 4-level grayscale. - */ - switch (color) { - case 0: - color = 0; /* black */ - break; - case 1 ... 6: - color = 2; /* white */ - break; - case 7 ... 8: - color = 1; /* gray */ - break; - default: - color = 3; /* intense white */ - break; - } - break; - case 3: - /* - * Last 8 entries of default 16-color palette is a more intense - * version of the first 8 (i.e., same chrominance, different - * luminance). - */ - color &= 7; - break; - } - - - return color; -} - -static void fbcon_update_softback(struct vc_data *vc) -{ - int l = fbcon_softback_size / vc->vc_size_row; - - if (l > 5) - softback_end = softback_buf + l * vc->vc_size_row; - else - /* Smaller scrollback makes no sense, and 0 would screw - the operation totally */ - softback_top = 0; -} - -static void fb_flashcursor(struct work_struct *work) -{ - struct fb_info *info = container_of(work, struct fb_info, queue); - struct fbcon_ops *ops = info->fbcon_par; - struct vc_data *vc = NULL; - int c; - int mode; - int ret; - - /* FIXME: we should sort out the unbind locking instead */ - /* instead we just fail to flash the cursor if we can't get - * the lock instead of blocking fbcon deinit */ - ret = console_trylock(); - if (ret == 0) - return; - - if (ops && ops->currcon != -1) - vc = vc_cons[ops->currcon].d; - - if (!vc || !con_is_visible(vc) || - registered_fb[con2fb_map[vc->vc_num]] != info || - vc->vc_deccm != 1) { - console_unlock(); - return; - } - - c = scr_readw((u16 *) vc->vc_pos); - mode = (!ops->cursor_flash || ops->cursor_state.enable) ? 
- CM_ERASE : CM_DRAW; - ops->cursor(vc, info, mode, softback_lines, get_color(vc, info, c, 1), - get_color(vc, info, c, 0)); - console_unlock(); -} - -static void cursor_timer_handler(unsigned long dev_addr) -{ - struct fb_info *info = (struct fb_info *) dev_addr; - struct fbcon_ops *ops = info->fbcon_par; - - queue_work(system_power_efficient_wq, &info->queue); - mod_timer(&ops->cursor_timer, jiffies + ops->cur_blink_jiffies); -} - -static void fbcon_add_cursor_timer(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - if ((!info->queue.func || info->queue.func == fb_flashcursor) && - !(ops->flags & FBCON_FLAGS_CURSOR_TIMER) && - !fbcon_cursor_noblink) { - if (!info->queue.func) - INIT_WORK(&info->queue, fb_flashcursor); - - setup_timer(&ops->cursor_timer, cursor_timer_handler, - (unsigned long) info); - mod_timer(&ops->cursor_timer, jiffies + ops->cur_blink_jiffies); - ops->flags |= FBCON_FLAGS_CURSOR_TIMER; - } -} - -static void fbcon_del_cursor_timer(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - if (info->queue.func == fb_flashcursor && - ops->flags & FBCON_FLAGS_CURSOR_TIMER) { - del_timer_sync(&ops->cursor_timer); - ops->flags &= ~FBCON_FLAGS_CURSOR_TIMER; - } -} - -#ifndef MODULE -static int __init fb_console_setup(char *this_opt) -{ - char *options; - int i, j; - - if (!this_opt || !*this_opt) - return 1; - - while ((options = strsep(&this_opt, ",")) != NULL) { - if (!strncmp(options, "font:", 5)) { - strlcpy(fontname, options + 5, sizeof(fontname)); - continue; - } - - if (!strncmp(options, "scrollback:", 11)) { - options += 11; - if (*options) { - fbcon_softback_size = simple_strtoul(options, &options, 0); - if (*options == 'k' || *options == 'K') { - fbcon_softback_size *= 1024; - } - } - continue; - } - - if (!strncmp(options, "map:", 4)) { - options += 4; - if (*options) { - for (i = 0, j = 0; i < MAX_NR_CONSOLES; i++) { - if (!options[j]) - j = 0; - con2fb_map_boot[i] = - (options[j++]-'0') % FB_MAX; - } - - fbcon_map_override(); - } - continue; - } - - if (!strncmp(options, "vc:", 3)) { - options += 3; - if (*options) - first_fb_vc = simple_strtoul(options, &options, 10) - 1; - if (first_fb_vc < 0) - first_fb_vc = 0; - if (*options++ == '-') - last_fb_vc = simple_strtoul(options, &options, 10) - 1; - fbcon_is_default = 0; - continue; - } - - if (!strncmp(options, "rotate:", 7)) { - options += 7; - if (*options) - initial_rotation = simple_strtoul(options, &options, 0); - if (initial_rotation > 3) - initial_rotation = 0; - continue; - } - } - return 1; -} - -__setup("fbcon=", fb_console_setup); -#endif - -static int search_fb_in_map(int idx) -{ - int i, retval = 0; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] == idx) - retval = 1; - } - return retval; -} - -static int search_for_mapped_con(void) -{ - int i, retval = 0; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] != -1) - retval = 1; - } - return retval; -} - -static int do_fbcon_takeover(int show_logo) -{ - int err, i; - - if (!num_registered_fb) - return -ENODEV; - - if (!show_logo) - logo_shown = FBCON_LOGO_DONTSHOW; - - for (i = first_fb_vc; i <= last_fb_vc; i++) - con2fb_map[i] = info_idx; - - err = do_take_over_console(&fb_con, first_fb_vc, last_fb_vc, - fbcon_is_default); - - if (err) { - for (i = first_fb_vc; i <= last_fb_vc; i++) - con2fb_map[i] = -1; - info_idx = -1; - } else { - fbcon_has_console_bind = 1; - } - - return err; -} - -#ifdef MODULE -static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, - 
int cols, int rows, int new_cols, int new_rows) -{ - logo_shown = FBCON_LOGO_DONTSHOW; -} -#else -static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, - int cols, int rows, int new_cols, int new_rows) -{ - /* Need to make room for the logo */ - struct fbcon_ops *ops = info->fbcon_par; - int cnt, erase = vc->vc_video_erase_char, step; - unsigned short *save = NULL, *r, *q; - int logo_height; - - if (info->flags & FBINFO_MODULE) { - logo_shown = FBCON_LOGO_DONTSHOW; - return; - } - - /* - * remove underline attribute from erase character - * if black and white framebuffer. - */ - if (fb_get_color_depth(&info->var, &info->fix) == 1) - erase &= ~0x400; - logo_height = fb_prepare_logo(info, ops->rotate); - logo_lines = DIV_ROUND_UP(logo_height, vc->vc_font.height); - q = (unsigned short *) (vc->vc_origin + - vc->vc_size_row * rows); - step = logo_lines * cols; - for (r = q - logo_lines * cols; r < q; r++) - if (scr_readw(r) != vc->vc_video_erase_char) - break; - if (r != q && new_rows >= rows + logo_lines) { - save = kmalloc(logo_lines * new_cols * 2, GFP_KERNEL); - if (save) { - int i = cols < new_cols ? cols : new_cols; - scr_memsetw(save, erase, logo_lines * new_cols * 2); - r = q - step; - for (cnt = 0; cnt < logo_lines; cnt++, r += i) - scr_memcpyw(save + cnt * new_cols, r, 2 * i); - r = q; - } - } - if (r == q) { - /* We can scroll screen down */ - r = q - step - cols; - for (cnt = rows - logo_lines; cnt > 0; cnt--) { - scr_memcpyw(r + step, r, vc->vc_size_row); - r -= cols; - } - if (!save) { - int lines; - if (vc->vc_y + logo_lines >= rows) - lines = rows - vc->vc_y - 1; - else - lines = logo_lines; - vc->vc_y += lines; - vc->vc_pos += lines * vc->vc_size_row; - } - } - scr_memsetw((unsigned short *) vc->vc_origin, - erase, - vc->vc_size_row * logo_lines); - - if (con_is_visible(vc) && vc->vc_mode == KD_TEXT) { - fbcon_clear_margins(vc, 0); - update_screen(vc); - } - - if (save) { - q = (unsigned short *) (vc->vc_origin + - vc->vc_size_row * - rows); - scr_memcpyw(q, save, logo_lines * new_cols * 2); - vc->vc_y += logo_lines; - vc->vc_pos += logo_lines * vc->vc_size_row; - kfree(save); - } - - if (logo_lines > vc->vc_bottom) { - logo_shown = FBCON_LOGO_CANSHOW; - printk(KERN_INFO - "fbcon_init: disable boot-logo (boot-logo bigger than screen).\n"); - } else if (logo_shown != FBCON_LOGO_DONTSHOW) { - logo_shown = FBCON_LOGO_DRAW; - vc->vc_top = logo_lines; - } -} -#endif /* MODULE */ - -#ifdef CONFIG_FB_TILEBLITTING -static void set_blitting_type(struct vc_data *vc, struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - ops->p = &fb_display[vc->vc_num]; - - if ((info->flags & FBINFO_MISC_TILEBLITTING)) - fbcon_set_tileops(vc, info); - else { - fbcon_set_rotation(info); - fbcon_set_bitops(ops); - } -} - -static int fbcon_invalid_charcount(struct fb_info *info, unsigned charcount) -{ - int err = 0; - - if (info->flags & FBINFO_MISC_TILEBLITTING && - info->tileops->fb_get_tilemax(info) < charcount) - err = 1; - - return err; -} -#else -static void set_blitting_type(struct vc_data *vc, struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - - info->flags &= ~FBINFO_MISC_TILEBLITTING; - ops->p = &fb_display[vc->vc_num]; - fbcon_set_rotation(info); - fbcon_set_bitops(ops); -} - -static int fbcon_invalid_charcount(struct fb_info *info, unsigned charcount) -{ - return 0; -} - -#endif /* CONFIG_MISC_TILEBLITTING */ - - -static int con2fb_acquire_newinfo(struct vc_data *vc, struct fb_info *info, - int unit, int oldidx) -{ - struct fbcon_ops *ops = 
NULL; - int err = 0; - - if (!try_module_get(info->fbops->owner)) - err = -ENODEV; - - if (!err && info->fbops->fb_open && - info->fbops->fb_open(info, 0)) - err = -ENODEV; - - if (!err) { - ops = kzalloc(sizeof(struct fbcon_ops), GFP_KERNEL); - if (!ops) - err = -ENOMEM; - } - - if (!err) { - ops->cur_blink_jiffies = HZ / 5; - info->fbcon_par = ops; - - if (vc) - set_blitting_type(vc, info); - } - - if (err) { - con2fb_map[unit] = oldidx; - module_put(info->fbops->owner); - } - - return err; -} - -static int con2fb_release_oldinfo(struct vc_data *vc, struct fb_info *oldinfo, - struct fb_info *newinfo, int unit, - int oldidx, int found) -{ - struct fbcon_ops *ops = oldinfo->fbcon_par; - int err = 0, ret; - - if (oldinfo->fbops->fb_release && - oldinfo->fbops->fb_release(oldinfo, 0)) { - con2fb_map[unit] = oldidx; - if (!found && newinfo->fbops->fb_release) - newinfo->fbops->fb_release(newinfo, 0); - if (!found) - module_put(newinfo->fbops->owner); - err = -ENODEV; - } - - if (!err) { - fbcon_del_cursor_timer(oldinfo); - kfree(ops->cursor_state.mask); - kfree(ops->cursor_data); - kfree(ops->cursor_src); - kfree(ops->fontbuffer); - kfree(oldinfo->fbcon_par); - oldinfo->fbcon_par = NULL; - module_put(oldinfo->fbops->owner); - /* - If oldinfo and newinfo are driving the same hardware, - the fb_release() method of oldinfo may attempt to - restore the hardware state. This will leave the - newinfo in an undefined state. Thus, a call to - fb_set_par() may be needed for the newinfo. - */ - if (newinfo && newinfo->fbops->fb_set_par) { - ret = newinfo->fbops->fb_set_par(newinfo); - - if (ret) - printk(KERN_ERR "con2fb_release_oldinfo: " - "detected unhandled fb_set_par error, " - "error code %d\n", ret); - } - } - - return err; -} - -static void con2fb_init_display(struct vc_data *vc, struct fb_info *info, - int unit, int show_logo) -{ - struct fbcon_ops *ops = info->fbcon_par; - int ret; - - ops->currcon = fg_console; - - if (info->fbops->fb_set_par && !(ops->flags & FBCON_FLAGS_INIT)) { - ret = info->fbops->fb_set_par(info); - - if (ret) - printk(KERN_ERR "con2fb_init_display: detected " - "unhandled fb_set_par error, " - "error code %d\n", ret); - } - - ops->flags |= FBCON_FLAGS_INIT; - ops->graphics = 0; - fbcon_set_disp(info, &info->var, unit); - - if (show_logo) { - struct vc_data *fg_vc = vc_cons[fg_console].d; - struct fb_info *fg_info = - registered_fb[con2fb_map[fg_console]]; - - fbcon_prepare_logo(fg_vc, fg_info, fg_vc->vc_cols, - fg_vc->vc_rows, fg_vc->vc_cols, - fg_vc->vc_rows); - } - - update_screen(vc_cons[fg_console].d); -} - -/** - * set_con2fb_map - map console to frame buffer device - * @unit: virtual console number to map - * @newidx: frame buffer index to map virtual console to - * @user: user request - * - * Maps a virtual console @unit to a frame buffer device - * @newidx. - * - * This should be called with the console lock held. 
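
set_con2fb_map() below is the kernel side of the FBIOPUT_CON2FBMAP ioctl (the same path the con2fbmap utility uses). For orientation, a minimal user-space caller looks roughly like this; error handling is omitted for brevity:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fb.h>

int main(void)
{
        /* Map virtual console 2 onto framebuffer 1; any fb node can
         * serve as the ioctl target. */
        struct fb_con2fbmap map = { .console = 2, .framebuffer = 1 };
        int fd = open("/dev/fb0", O_RDWR);

        ioctl(fd, FBIOPUT_CON2FBMAP, &map);
        return 0;
}
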
- */ -static int set_con2fb_map(int unit, int newidx, int user) -{ - struct vc_data *vc = vc_cons[unit].d; - int oldidx = con2fb_map[unit]; - struct fb_info *info = registered_fb[newidx]; - struct fb_info *oldinfo = NULL; - int found, err = 0; - - if (oldidx == newidx) - return 0; - - if (!info) - return -EINVAL; - - if (!search_for_mapped_con() || !con_is_bound(&fb_con)) { - info_idx = newidx; - return do_fbcon_takeover(0); - } - - if (oldidx != -1) - oldinfo = registered_fb[oldidx]; - - found = search_fb_in_map(newidx); - - con2fb_map[unit] = newidx; - if (!err && !found) - err = con2fb_acquire_newinfo(vc, info, unit, oldidx); - - - /* - * If old fb is not mapped to any of the consoles, - * fbcon should release it. - */ - if (!err && oldinfo && !search_fb_in_map(oldidx)) - err = con2fb_release_oldinfo(vc, oldinfo, info, unit, oldidx, - found); - - if (!err) { - int show_logo = (fg_console == 0 && !user && - logo_shown != FBCON_LOGO_DONTSHOW); - - if (!found) - fbcon_add_cursor_timer(info); - con2fb_map_boot[unit] = newidx; - con2fb_init_display(vc, info, unit, show_logo); - } - - if (!search_fb_in_map(info_idx)) - info_idx = newidx; - - return err; -} - -/* - * Low Level Operations - */ -/* NOTE: fbcon cannot be __init: it may be called from do_take_over_console later */ -static int var_to_display(struct display *disp, - struct fb_var_screeninfo *var, - struct fb_info *info) -{ - disp->xres_virtual = var->xres_virtual; - disp->yres_virtual = var->yres_virtual; - disp->bits_per_pixel = var->bits_per_pixel; - disp->grayscale = var->grayscale; - disp->nonstd = var->nonstd; - disp->accel_flags = var->accel_flags; - disp->height = var->height; - disp->width = var->width; - disp->red = var->red; - disp->green = var->green; - disp->blue = var->blue; - disp->transp = var->transp; - disp->rotate = var->rotate; - disp->mode = fb_match_mode(var, &info->modelist); - if (disp->mode == NULL) - /* This should not happen */ - return -EINVAL; - return 0; -} - -static void display_to_var(struct fb_var_screeninfo *var, - struct display *disp) -{ - fb_videomode_to_var(var, disp->mode); - var->xres_virtual = disp->xres_virtual; - var->yres_virtual = disp->yres_virtual; - var->bits_per_pixel = disp->bits_per_pixel; - var->grayscale = disp->grayscale; - var->nonstd = disp->nonstd; - var->accel_flags = disp->accel_flags; - var->height = disp->height; - var->width = disp->width; - var->red = disp->red; - var->green = disp->green; - var->blue = disp->blue; - var->transp = disp->transp; - var->rotate = disp->rotate; -} - -static const char *fbcon_startup(void) -{ - const char *display_desc = "frame buffer device"; - struct display *p = &fb_display[fg_console]; - struct vc_data *vc = vc_cons[fg_console].d; - const struct font_desc *font = NULL; - struct module *owner; - struct fb_info *info = NULL; - struct fbcon_ops *ops; - int rows, cols; - - /* - * If num_registered_fb is zero, this is a call for the dummy part. - * The frame buffer devices weren't initialized yet. 
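
fbcon_startup() below also primes the software scrollback. The buffer is treated as a ring of vc_size_row-sized lines in [softback_buf, softback_end): softback_in is where the next line is written and softback_top is the oldest retained line (fbcon_softback_note() later in this file does the pushing). A toy model of that ring arithmetic, with hypothetical names:

/* Toy model of the softback ring; names are illustrative only. */
struct softback_ring {
        unsigned long buf, end; /* backing store: [buf, end)      */
        unsigned long in, top;  /* write position and oldest line */
};

static void softback_push_line(struct softback_ring *r, unsigned long row)
{
        r->in += row;                   /* one line was just copied in */
        if (r->in == r->end)
                r->in = r->buf;         /* wrap the write pointer      */
        if (r->in == r->top) {          /* full: drop the oldest line  */
                r->top += row;
                if (r->top == r->end)
                        r->top = r->buf;
        }
}
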
- */ - if (!num_registered_fb || info_idx == -1) - return display_desc; - /* - * Instead of blindly using registered_fb[0], we use info_idx, set by - * fb_console_init(); - */ - info = registered_fb[info_idx]; - if (!info) - return NULL; - - owner = info->fbops->owner; - if (!try_module_get(owner)) - return NULL; - if (info->fbops->fb_open && info->fbops->fb_open(info, 0)) { - module_put(owner); - return NULL; - } - - ops = kzalloc(sizeof(struct fbcon_ops), GFP_KERNEL); - if (!ops) { - module_put(owner); - return NULL; - } - - ops->currcon = -1; - ops->graphics = 1; - ops->cur_rotate = -1; - ops->cur_blink_jiffies = HZ / 5; - info->fbcon_par = ops; - p->con_rotate = initial_rotation; - set_blitting_type(vc, info); - - if (info->fix.type != FB_TYPE_TEXT) { - if (fbcon_softback_size) { - if (!softback_buf) { - softback_buf = - (unsigned long) - kmalloc(fbcon_softback_size, - GFP_KERNEL); - if (!softback_buf) { - fbcon_softback_size = 0; - softback_top = 0; - } - } - } else { - if (softback_buf) { - kfree((void *) softback_buf); - softback_buf = 0; - softback_top = 0; - } - } - if (softback_buf) - softback_in = softback_top = softback_curr = - softback_buf; - softback_lines = 0; - } - - /* Setup default font */ - if (!p->fontdata && !vc->vc_font.data) { - if (!fontname[0] || !(font = find_font(fontname))) - font = get_default_font(info->var.xres, - info->var.yres, - info->pixmap.blit_x, - info->pixmap.blit_y); - vc->vc_font.width = font->width; - vc->vc_font.height = font->height; - vc->vc_font.data = (void *)(p->fontdata = font->data); - vc->vc_font.charcount = 256; /* FIXME Need to support more fonts */ - } else { - p->fontdata = vc->vc_font.data; - } - - cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); - rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - cols /= vc->vc_font.width; - rows /= vc->vc_font.height; - vc_resize(vc, cols, rows); - - DPRINTK("mode: %s\n", info->fix.id); - DPRINTK("visual: %d\n", info->fix.visual); - DPRINTK("res: %dx%d-%d\n", info->var.xres, - info->var.yres, - info->var.bits_per_pixel); - - fbcon_add_cursor_timer(info); - fbcon_has_exited = 0; - return display_desc; -} - -static void fbcon_init(struct vc_data *vc, int init) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops; - struct vc_data **default_mode = vc->vc_display_fg; - struct vc_data *svc = *default_mode; - struct display *t, *p = &fb_display[vc->vc_num]; - int logo = 1, new_rows, new_cols, rows, cols, charcnt = 256; - int cap, ret; - - if (info_idx == -1 || info == NULL) - return; - - cap = info->flags; - - if (vc != svc || logo_shown == FBCON_LOGO_DONTSHOW || - (info->fix.type == FB_TYPE_TEXT)) - logo = 0; - - if (var_to_display(p, &info->var, info)) - return; - - if (!info->fbcon_par) - con2fb_acquire_newinfo(vc, info, vc->vc_num, -1); - - /* If we are not the first console on this - fb, copy the font from that console */ - t = &fb_display[fg_console]; - if (!p->fontdata) { - if (t->fontdata) { - struct vc_data *fvc = vc_cons[fg_console].d; - - vc->vc_font.data = (void *)(p->fontdata = - fvc->vc_font.data); - vc->vc_font.width = fvc->vc_font.width; - vc->vc_font.height = fvc->vc_font.height; - p->userfont = t->userfont; - - if (p->userfont) - REFCOUNT(p->fontdata)++; - } else { - const struct font_desc *font = NULL; - - if (!fontname[0] || !(font = find_font(fontname))) - font = get_default_font(info->var.xres, - info->var.yres, - info->pixmap.blit_x, - info->pixmap.blit_y); - vc->vc_font.width = font->width; - vc->vc_font.height = 
font->height; - vc->vc_font.data = (void *)(p->fontdata = font->data); - vc->vc_font.charcount = 256; /* FIXME Need to - support more fonts */ - } - } - - if (p->userfont) - charcnt = FNTCHARCNT(p->fontdata); - - vc->vc_panic_force_write = !!(info->flags & FBINFO_CAN_FORCE_OUTPUT); - vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1); - vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800; - if (charcnt == 256) { - vc->vc_hi_font_mask = 0; - } else { - vc->vc_hi_font_mask = 0x100; - if (vc->vc_can_do_color) - vc->vc_complement_mask <<= 1; - } - - if (!*svc->vc_uni_pagedir_loc) - con_set_default_unimap(svc); - if (!*vc->vc_uni_pagedir_loc) - con_copy_unimap(vc, svc); - - ops = info->fbcon_par; - ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); - p->con_rotate = initial_rotation; - set_blitting_type(vc, info); - - cols = vc->vc_cols; - rows = vc->vc_rows; - new_cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); - new_rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - new_cols /= vc->vc_font.width; - new_rows /= vc->vc_font.height; - - /* - * We must always set the mode. The mode of the previous console - * driver could be in the same resolution but we are using different - * hardware so we have to initialize the hardware. - * - * We need to do it in fbcon_init() to prevent screen corruption. - */ - if (con_is_visible(vc) && vc->vc_mode == KD_TEXT) { - if (info->fbops->fb_set_par && - !(ops->flags & FBCON_FLAGS_INIT)) { - ret = info->fbops->fb_set_par(info); - - if (ret) - printk(KERN_ERR "fbcon_init: detected " - "unhandled fb_set_par error, " - "error code %d\n", ret); - } - - ops->flags |= FBCON_FLAGS_INIT; - } - - ops->graphics = 0; - - if ((cap & FBINFO_HWACCEL_COPYAREA) && - !(cap & FBINFO_HWACCEL_DISABLED)) - p->scrollmode = SCROLL_MOVE; - else /* default to something safe */ - p->scrollmode = SCROLL_REDRAW; - - /* - * ++guenther: console.c:vc_allocate() relies on initializing - * vc_{cols,rows}, but we must not set those if we are only - * resizing the console. 
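
The new_cols/new_rows computation above is fbcon's core sizing rule: the character grid is the pixel resolution divided by the font cell, with the axes exchanged when the console is rotated by 90 or 270 degrees (FB_ROTATE_CW/FB_ROTATE_CCW), which is what the FBCON_SWAP() calls express. Restated as a standalone sketch (hypothetical helper name, same math):

/* Hypothetical helper restating fbcon's text-grid math. */
static void fbcon_text_grid(unsigned int xres, unsigned int yres,
                            unsigned int fontw, unsigned int fonth,
                            unsigned int rotate,
                            unsigned int *cols, unsigned int *rows)
{
        /* FB_ROTATE_CW (1) and FB_ROTATE_CCW (3) swap the axes. */
        unsigned int w = (rotate & 1) ? yres : xres;
        unsigned int h = (rotate & 1) ? xres : yres;

        *cols = w / fontw;
        *rows = h / fonth;
}
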
- */ - if (init) { - vc->vc_cols = new_cols; - vc->vc_rows = new_rows; - } else - vc_resize(vc, new_cols, new_rows); - - if (logo) - fbcon_prepare_logo(vc, info, cols, rows, new_cols, new_rows); - - if (vc == svc && softback_buf) - fbcon_update_softback(vc); - - if (ops->rotate_font && ops->rotate_font(info, vc)) { - ops->rotate = FB_ROTATE_UR; - set_blitting_type(vc, info); - } - - ops->p = &fb_display[fg_console]; -} - -static void fbcon_free_font(struct display *p, bool freefont) -{ - if (freefont && p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0)) - kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int)); - p->fontdata = NULL; - p->userfont = 0; -} - -static void set_vc_hi_font(struct vc_data *vc, bool set); - -static void fbcon_deinit(struct vc_data *vc) -{ - struct display *p = &fb_display[vc->vc_num]; - struct fb_info *info; - struct fbcon_ops *ops; - int idx; - bool free_font = true; - - idx = con2fb_map[vc->vc_num]; - - if (idx == -1) - goto finished; - - info = registered_fb[idx]; - - if (!info) - goto finished; - - if (info->flags & FBINFO_MISC_FIRMWARE) - free_font = false; - ops = info->fbcon_par; - - if (!ops) - goto finished; - - if (con_is_visible(vc)) - fbcon_del_cursor_timer(info); - - ops->flags &= ~FBCON_FLAGS_INIT; -finished: - - fbcon_free_font(p, free_font); - if (free_font) - vc->vc_font.data = NULL; - - if (vc->vc_hi_font_mask) - set_vc_hi_font(vc, false); - - if (!con_is_bound(&fb_con)) - fbcon_exit(); - - return; -} - -/* ====================================================================== */ - -/* fbcon_XXX routines - interface used by the world - * - * This system is now divided into two levels because of complications - * caused by hardware scrolling. Top level functions: - * - * fbcon_bmove(), fbcon_clear(), fbcon_putc(), fbcon_clear_margins() - * - * handles y values in range [0, scr_height-1] that correspond to real - * screen positions. y_wrap shift means that first line of bitmap may be - * anywhere on this display. These functions convert lineoffsets to - * bitmap offsets and deal with the wrap-around case by splitting blits. - * - * fbcon_bmove_physical_8() -- These functions fast implementations - * fbcon_clear_physical_8() -- of original fbcon_XXX fns. - * fbcon_putc_physical_8() -- (font width != 8) may be added later - * - * WARNING: - * - * At the moment fbcon_putc() cannot blit across vertical wrap boundary - * Implies should only really hardware scroll in rows. Only reason for - * restriction is simplicity & efficiency at the moment. - */ - -static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, - int width) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - - struct display *p = &fb_display[vc->vc_num]; - u_int y_break; - - if (fbcon_is_inactive(vc, info)) - return; - - if (!height || !width) - return; - - if (sy < vc->vc_top && vc->vc_top == logo_lines) { - vc->vc_top = 0; - /* - * If the font dimensions are not an integral of the display - * dimensions then the ops->clear below won't end up clearing - * the margins. Call clear_margins here in case the logo - * bitmap stretched into the margin area. 
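
fbcon_clear() just below splits any blit that crosses the y-wrap boundary (y_break = p->vrows - p->yscroll) so that each half addresses a contiguous region of the virtual screen. The shape of that split, isolated into a sketch (clear_rows() is a stand-in for ops->clear()):

/* Stand-in sketch of the wrap split performed in fbcon_clear() below. */
static void clear_split(unsigned int sy, unsigned int height,
                        unsigned int y_break,
                        void (*clear_rows)(unsigned int sy, unsigned int h))
{
        if (sy < y_break && sy + height > y_break) {
                unsigned int b = y_break - sy;

                clear_rows(sy, b);              /* part before the boundary */
                clear_rows(sy + b, height - b); /* wrapped remainder        */
        } else {
                clear_rows(sy, height);
        }
}
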
- */ - fbcon_clear_margins(vc, 0); - } - - /* Split blits that cross physical y_wrap boundary */ - - y_break = p->vrows - p->yscroll; - if (sy < y_break && sy + height - 1 >= y_break) { - u_int b = y_break - sy; - ops->clear(vc, info, real_y(p, sy), sx, b, width); - ops->clear(vc, info, real_y(p, sy + b), sx, height - b, - width); - } else - ops->clear(vc, info, real_y(p, sy), sx, height, width); -} - -static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, - int count, int ypos, int xpos) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct display *p = &fb_display[vc->vc_num]; - struct fbcon_ops *ops = info->fbcon_par; - - if (!fbcon_is_inactive(vc, info)) - ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, - get_color(vc, info, scr_readw(s), 1), - get_color(vc, info, scr_readw(s), 0)); -} - -static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) -{ - unsigned short chr; - - scr_writew(c, &chr); - fbcon_putcs(vc, &chr, 1, ypos, xpos); -} - -static void fbcon_clear_margins(struct vc_data *vc, int bottom_only) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - - if (!fbcon_is_inactive(vc, info)) - ops->clear_margins(vc, info, bottom_only); -} - -static void fbcon_cursor(struct vc_data *vc, int mode) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - int y; - int c = scr_readw((u16 *) vc->vc_pos); - - ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); - - if (fbcon_is_inactive(vc, info) || vc->vc_deccm != 1) - return; - - if (vc->vc_cursor_type & 0x10) - fbcon_del_cursor_timer(info); - else - fbcon_add_cursor_timer(info); - - ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1; - if (mode & CM_SOFTBACK) { - mode &= ~CM_SOFTBACK; - y = softback_lines; - } else { - if (softback_lines) - fbcon_set_origin(vc); - y = 0; - } - - ops->cursor(vc, info, mode, y, get_color(vc, info, c, 1), - get_color(vc, info, c, 0)); -} - -static int scrollback_phys_max = 0; -static int scrollback_max = 0; -static int scrollback_current = 0; - -static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, - int unit) -{ - struct display *p, *t; - struct vc_data **default_mode, *vc; - struct vc_data *svc; - struct fbcon_ops *ops = info->fbcon_par; - int rows, cols, charcnt = 256; - - p = &fb_display[unit]; - - if (var_to_display(p, var, info)) - return; - - vc = vc_cons[unit].d; - - if (!vc) - return; - - default_mode = vc->vc_display_fg; - svc = *default_mode; - t = &fb_display[svc->vc_num]; - - if (!vc->vc_font.data) { - vc->vc_font.data = (void *)(p->fontdata = t->fontdata); - vc->vc_font.width = (*default_mode)->vc_font.width; - vc->vc_font.height = (*default_mode)->vc_font.height; - p->userfont = t->userfont; - if (p->userfont) - REFCOUNT(p->fontdata)++; - } - if (p->userfont) - charcnt = FNTCHARCNT(p->fontdata); - - var->activate = FB_ACTIVATE_NOW; - info->var.activate = var->activate; - var->yoffset = info->var.yoffset; - var->xoffset = info->var.xoffset; - fb_set_var(info, var); - ops->var = info->var; - vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1); - vc->vc_complement_mask = vc->vc_can_do_color ? 
0x7700 : 0x0800; - if (charcnt == 256) { - vc->vc_hi_font_mask = 0; - } else { - vc->vc_hi_font_mask = 0x100; - if (vc->vc_can_do_color) - vc->vc_complement_mask <<= 1; - } - - if (!*svc->vc_uni_pagedir_loc) - con_set_default_unimap(svc); - if (!*vc->vc_uni_pagedir_loc) - con_copy_unimap(vc, svc); - - cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); - rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - cols /= vc->vc_font.width; - rows /= vc->vc_font.height; - vc_resize(vc, cols, rows); - - if (con_is_visible(vc)) { - update_screen(vc); - if (softback_buf) - fbcon_update_softback(vc); - } -} - -static __inline__ void ywrap_up(struct vc_data *vc, int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; - - p->yscroll += count; - if (p->yscroll >= p->vrows) /* Deal with wrap */ - p->yscroll -= p->vrows; - ops->var.xoffset = 0; - ops->var.yoffset = p->yscroll * vc->vc_font.height; - ops->var.vmode |= FB_VMODE_YWRAP; - ops->update_start(info); - scrollback_max += count; - if (scrollback_max > scrollback_phys_max) - scrollback_max = scrollback_phys_max; - scrollback_current = 0; -} - -static __inline__ void ywrap_down(struct vc_data *vc, int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; - - p->yscroll -= count; - if (p->yscroll < 0) /* Deal with wrap */ - p->yscroll += p->vrows; - ops->var.xoffset = 0; - ops->var.yoffset = p->yscroll * vc->vc_font.height; - ops->var.vmode |= FB_VMODE_YWRAP; - ops->update_start(info); - scrollback_max -= count; - if (scrollback_max < 0) - scrollback_max = 0; - scrollback_current = 0; -} - -static __inline__ void ypan_up(struct vc_data *vc, int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct display *p = &fb_display[vc->vc_num]; - struct fbcon_ops *ops = info->fbcon_par; - - p->yscroll += count; - if (p->yscroll > p->vrows - vc->vc_rows) { - ops->bmove(vc, info, p->vrows - vc->vc_rows, - 0, 0, 0, vc->vc_rows, vc->vc_cols); - p->yscroll -= p->vrows - vc->vc_rows; - } - - ops->var.xoffset = 0; - ops->var.yoffset = p->yscroll * vc->vc_font.height; - ops->var.vmode &= ~FB_VMODE_YWRAP; - ops->update_start(info); - fbcon_clear_margins(vc, 1); - scrollback_max += count; - if (scrollback_max > scrollback_phys_max) - scrollback_max = scrollback_phys_max; - scrollback_current = 0; -} - -static __inline__ void ypan_up_redraw(struct vc_data *vc, int t, int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; - - p->yscroll += count; - - if (p->yscroll > p->vrows - vc->vc_rows) { - p->yscroll -= p->vrows - vc->vc_rows; - fbcon_redraw_move(vc, p, t + count, vc->vc_rows - count, t); - } - - ops->var.xoffset = 0; - ops->var.yoffset = p->yscroll * vc->vc_font.height; - ops->var.vmode &= ~FB_VMODE_YWRAP; - ops->update_start(info); - fbcon_clear_margins(vc, 1); - scrollback_max += count; - if (scrollback_max > scrollback_phys_max) - scrollback_max = scrollback_phys_max; - scrollback_current = 0; -} - -static __inline__ void ypan_down(struct vc_data *vc, int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct display *p = &fb_display[vc->vc_num]; - struct fbcon_ops *ops = info->fbcon_par; - - p->yscroll -= count; - if (p->yscroll < 0) { - ops->bmove(vc, 
info, 0, 0, p->vrows - vc->vc_rows, - 0, vc->vc_rows, vc->vc_cols); - p->yscroll += p->vrows - vc->vc_rows; - } - - ops->var.xoffset = 0; - ops->var.yoffset = p->yscroll * vc->vc_font.height; - ops->var.vmode &= ~FB_VMODE_YWRAP; - ops->update_start(info); - fbcon_clear_margins(vc, 1); - scrollback_max -= count; - if (scrollback_max < 0) - scrollback_max = 0; - scrollback_current = 0; -} - -static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; - - p->yscroll -= count; - - if (p->yscroll < 0) { - p->yscroll += p->vrows - vc->vc_rows; - fbcon_redraw_move(vc, p, t, vc->vc_rows - count, t + count); - } - - ops->var.xoffset = 0; - ops->var.yoffset = p->yscroll * vc->vc_font.height; - ops->var.vmode &= ~FB_VMODE_YWRAP; - ops->update_start(info); - fbcon_clear_margins(vc, 1); - scrollback_max -= count; - if (scrollback_max < 0) - scrollback_max = 0; - scrollback_current = 0; -} - -static void fbcon_redraw_softback(struct vc_data *vc, struct display *p, - long delta) -{ - int count = vc->vc_rows; - unsigned short *d, *s; - unsigned long n; - int line = 0; - - d = (u16 *) softback_curr; - if (d == (u16 *) softback_in) - d = (u16 *) vc->vc_origin; - n = softback_curr + delta * vc->vc_size_row; - softback_lines -= delta; - if (delta < 0) { - if (softback_curr < softback_top && n < softback_buf) { - n += softback_end - softback_buf; - if (n < softback_top) { - softback_lines -= - (softback_top - n) / vc->vc_size_row; - n = softback_top; - } - } else if (softback_curr >= softback_top - && n < softback_top) { - softback_lines -= - (softback_top - n) / vc->vc_size_row; - n = softback_top; - } - } else { - if (softback_curr > softback_in && n >= softback_end) { - n += softback_buf - softback_end; - if (n > softback_in) { - n = softback_in; - softback_lines = 0; - } - } else if (softback_curr <= softback_in && n > softback_in) { - n = softback_in; - softback_lines = 0; - } - } - if (n == softback_curr) - return; - softback_curr = n; - s = (u16 *) softback_curr; - if (s == (u16 *) softback_in) - s = (u16 *) vc->vc_origin; - while (count--) { - unsigned short *start; - unsigned short *le; - unsigned short c; - int x = 0; - unsigned short attr = 1; - - start = s; - le = advance_row(s, 1); - do { - c = scr_readw(s); - if (attr != (c & 0xff00)) { - attr = c & 0xff00; - if (s > start) { - fbcon_putcs(vc, start, s - start, - line, x); - x += s - start; - start = s; - } - } - if (c == scr_readw(d)) { - if (s > start) { - fbcon_putcs(vc, start, s - start, - line, x); - x += s - start + 1; - start = s + 1; - } else { - x++; - start++; - } - } - s++; - d++; - } while (s < le); - if (s > start) - fbcon_putcs(vc, start, s - start, line, x); - line++; - if (d == (u16 *) softback_end) - d = (u16 *) softback_buf; - if (d == (u16 *) softback_in) - d = (u16 *) vc->vc_origin; - if (s == (u16 *) softback_end) - s = (u16 *) softback_buf; - if (s == (u16 *) softback_in) - s = (u16 *) vc->vc_origin; - } -} - -static void fbcon_redraw_move(struct vc_data *vc, struct display *p, - int line, int count, int dy) -{ - unsigned short *s = (unsigned short *) - (vc->vc_origin + vc->vc_size_row * line); - - while (count--) { - unsigned short *start = s; - unsigned short *le = advance_row(s, 1); - unsigned short c; - int x = 0; - unsigned short attr = 1; - - do { - c = scr_readw(s); - if (attr != (c & 0xff00)) { - attr = c & 0xff00; - if (s > start) { - 
fbcon_putcs(vc, start, s - start, - dy, x); - x += s - start; - start = s; - } - } - console_conditional_schedule(); - s++; - } while (s < le); - if (s > start) - fbcon_putcs(vc, start, s - start, dy, x); - console_conditional_schedule(); - dy++; - } -} - -static void fbcon_redraw_blit(struct vc_data *vc, struct fb_info *info, - struct display *p, int line, int count, int ycount) -{ - int offset = ycount * vc->vc_cols; - unsigned short *d = (unsigned short *) - (vc->vc_origin + vc->vc_size_row * line); - unsigned short *s = d + offset; - struct fbcon_ops *ops = info->fbcon_par; - - while (count--) { - unsigned short *start = s; - unsigned short *le = advance_row(s, 1); - unsigned short c; - int x = 0; - - do { - c = scr_readw(s); - - if (c == scr_readw(d)) { - if (s > start) { - ops->bmove(vc, info, line + ycount, x, - line, x, 1, s-start); - x += s - start + 1; - start = s + 1; - } else { - x++; - start++; - } - } - - scr_writew(c, d); - console_conditional_schedule(); - s++; - d++; - } while (s < le); - if (s > start) - ops->bmove(vc, info, line + ycount, x, line, x, 1, - s-start); - console_conditional_schedule(); - if (ycount > 0) - line++; - else { - line--; - /* NOTE: We subtract two lines from these pointers */ - s -= vc->vc_size_row; - d -= vc->vc_size_row; - } - } -} - -static void fbcon_redraw(struct vc_data *vc, struct display *p, - int line, int count, int offset) -{ - unsigned short *d = (unsigned short *) - (vc->vc_origin + vc->vc_size_row * line); - unsigned short *s = d + offset; - - while (count--) { - unsigned short *start = s; - unsigned short *le = advance_row(s, 1); - unsigned short c; - int x = 0; - unsigned short attr = 1; - - do { - c = scr_readw(s); - if (attr != (c & 0xff00)) { - attr = c & 0xff00; - if (s > start) { - fbcon_putcs(vc, start, s - start, - line, x); - x += s - start; - start = s; - } - } - if (c == scr_readw(d)) { - if (s > start) { - fbcon_putcs(vc, start, s - start, - line, x); - x += s - start + 1; - start = s + 1; - } else { - x++; - start++; - } - } - scr_writew(c, d); - console_conditional_schedule(); - s++; - d++; - } while (s < le); - if (s > start) - fbcon_putcs(vc, start, s - start, line, x); - console_conditional_schedule(); - if (offset > 0) - line++; - else { - line--; - /* NOTE: We subtract two lines from these pointers */ - s -= vc->vc_size_row; - d -= vc->vc_size_row; - } - } -} - -static inline void fbcon_softback_note(struct vc_data *vc, int t, - int count) -{ - unsigned short *p; - - if (vc->vc_num != fg_console) - return; - p = (unsigned short *) (vc->vc_origin + t * vc->vc_size_row); - - while (count) { - scr_memcpyw((u16 *) softback_in, p, vc->vc_size_row); - count--; - p = advance_row(p, 1); - softback_in += vc->vc_size_row; - if (softback_in == softback_end) - softback_in = softback_buf; - if (softback_in == softback_top) { - softback_top += vc->vc_size_row; - if (softback_top == softback_end) - softback_top = softback_buf; - } - } - softback_curr = softback_in; -} - -static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, - enum con_scroll dir, unsigned int count) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct display *p = &fb_display[vc->vc_num]; - int scroll_partial = info->flags & FBINFO_PARTIAL_PAN_OK; - - if (fbcon_is_inactive(vc, info)) - return true; - - fbcon_cursor(vc, CM_ERASE); - - /* - * ++Geert: Only use ywrap/ypan if the console is in text mode - * ++Andrew: Only use ypan on hardware text mode when scrolling the - * whole screen (prevents flicker). 
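
The switch statements below select among fbcon's five scroll strategies: SCROLL_MOVE (copy within the visible screen), SCROLL_WRAP_MOVE (hardware y-wrap), SCROLL_PAN_MOVE and SCROLL_PAN_REDRAW (hardware panning, with the remainder copied or redrawn), and SCROLL_REDRAW (pure software redraw). The policy itself lives in updatescrollmode() further down; condensed into a sketch, with local stand-in names for the SCROLL_* constants from fbcon.h:

enum scroll_mode { MOVE, WRAP_MOVE, PAN_MOVE, PAN_REDRAW, REDRAW };

/* Condensed restatement of the decision made in updatescrollmode(). */
static enum scroll_mode pick_scrollmode(int good_wrap, int good_pan,
                                        int reads_fast, int fast_copyarea,
                                        int fast_imageblit)
{
        if (good_wrap || good_pan) {
                if (reads_fast || fast_copyarea)
                        return good_wrap ? WRAP_MOVE : PAN_MOVE;
                return good_wrap ? REDRAW : PAN_REDRAW;
        }
        if (reads_fast || (fast_copyarea && !fast_imageblit))
                return MOVE;
        return REDRAW;
}
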
- */ - - switch (dir) { - case SM_UP: - if (count > vc->vc_rows) /* Maximum realistic size */ - count = vc->vc_rows; - if (softback_top) - fbcon_softback_note(vc, t, count); - if (logo_shown >= 0) - goto redraw_up; - switch (p->scrollmode) { - case SCROLL_MOVE: - fbcon_redraw_blit(vc, info, p, t, b - t - count, - count); - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - (b - count)), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; - break; - - case SCROLL_WRAP_MOVE: - if (b - t - count > 3 * vc->vc_rows >> 2) { - if (t > 0) - fbcon_bmove(vc, 0, 0, count, 0, t, - vc->vc_cols); - ywrap_up(vc, count); - if (vc->vc_rows - b > 0) - fbcon_bmove(vc, b - count, 0, b, 0, - vc->vc_rows - b, - vc->vc_cols); - } else if (info->flags & FBINFO_READS_FAST) - fbcon_bmove(vc, t + count, 0, t, 0, - b - t - count, vc->vc_cols); - else - goto redraw_up; - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - break; - - case SCROLL_PAN_REDRAW: - if ((p->yscroll + count <= - 2 * (p->vrows - vc->vc_rows)) - && ((!scroll_partial && (b - t == vc->vc_rows)) - || (scroll_partial - && (b - t - count > - 3 * vc->vc_rows >> 2)))) { - if (t > 0) - fbcon_redraw_move(vc, p, 0, t, count); - ypan_up_redraw(vc, t, count); - if (vc->vc_rows - b > 0) - fbcon_redraw_move(vc, p, b, - vc->vc_rows - b, b); - } else - fbcon_redraw_move(vc, p, t + count, b - t - count, t); - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - break; - - case SCROLL_PAN_MOVE: - if ((p->yscroll + count <= - 2 * (p->vrows - vc->vc_rows)) - && ((!scroll_partial && (b - t == vc->vc_rows)) - || (scroll_partial - && (b - t - count > - 3 * vc->vc_rows >> 2)))) { - if (t > 0) - fbcon_bmove(vc, 0, 0, count, 0, t, - vc->vc_cols); - ypan_up(vc, count); - if (vc->vc_rows - b > 0) - fbcon_bmove(vc, b - count, 0, b, 0, - vc->vc_rows - b, - vc->vc_cols); - } else if (info->flags & FBINFO_READS_FAST) - fbcon_bmove(vc, t + count, 0, t, 0, - b - t - count, vc->vc_cols); - else - goto redraw_up; - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - break; - - case SCROLL_REDRAW: - redraw_up: - fbcon_redraw(vc, p, t, b - t - count, - count * vc->vc_cols); - fbcon_clear(vc, b - count, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - (b - count)), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; - } - break; - - case SM_DOWN: - if (count > vc->vc_rows) /* Maximum realistic size */ - count = vc->vc_rows; - if (logo_shown >= 0) - goto redraw_down; - switch (p->scrollmode) { - case SCROLL_MOVE: - fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, - -count); - fbcon_clear(vc, t, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - t), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; - break; - - case SCROLL_WRAP_MOVE: - if (b - t - count > 3 * vc->vc_rows >> 2) { - if (vc->vc_rows - b > 0) - fbcon_bmove(vc, b, 0, b - count, 0, - vc->vc_rows - b, - vc->vc_cols); - ywrap_down(vc, count); - if (t > 0) - fbcon_bmove(vc, count, 0, 0, 0, t, - vc->vc_cols); - } else if (info->flags & FBINFO_READS_FAST) - fbcon_bmove(vc, t, 0, t + count, 0, - b - t - count, vc->vc_cols); - else - goto redraw_down; - fbcon_clear(vc, t, 0, count, vc->vc_cols); - break; - - case SCROLL_PAN_MOVE: - if ((count - p->yscroll <= p->vrows - vc->vc_rows) - && ((!scroll_partial && (b - t == vc->vc_rows)) - || (scroll_partial - && (b - t - count > - 3 * vc->vc_rows >> 2)))) { - if 
(vc->vc_rows - b > 0) - fbcon_bmove(vc, b, 0, b - count, 0, - vc->vc_rows - b, - vc->vc_cols); - ypan_down(vc, count); - if (t > 0) - fbcon_bmove(vc, count, 0, 0, 0, t, - vc->vc_cols); - } else if (info->flags & FBINFO_READS_FAST) - fbcon_bmove(vc, t, 0, t + count, 0, - b - t - count, vc->vc_cols); - else - goto redraw_down; - fbcon_clear(vc, t, 0, count, vc->vc_cols); - break; - - case SCROLL_PAN_REDRAW: - if ((count - p->yscroll <= p->vrows - vc->vc_rows) - && ((!scroll_partial && (b - t == vc->vc_rows)) - || (scroll_partial - && (b - t - count > - 3 * vc->vc_rows >> 2)))) { - if (vc->vc_rows - b > 0) - fbcon_redraw_move(vc, p, b, vc->vc_rows - b, - b - count); - ypan_down_redraw(vc, t, count); - if (t > 0) - fbcon_redraw_move(vc, p, count, t, 0); - } else - fbcon_redraw_move(vc, p, t, b - t - count, t + count); - fbcon_clear(vc, t, 0, count, vc->vc_cols); - break; - - case SCROLL_REDRAW: - redraw_down: - fbcon_redraw(vc, p, b - 1, b - t - count, - -count * vc->vc_cols); - fbcon_clear(vc, t, 0, count, vc->vc_cols); - scr_memsetw((unsigned short *) (vc->vc_origin + - vc->vc_size_row * - t), - vc->vc_video_erase_char, - vc->vc_size_row * count); - return true; - } - } - return false; -} - - -static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, - int height, int width) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct display *p = &fb_display[vc->vc_num]; - - if (fbcon_is_inactive(vc, info)) - return; - - if (!width || !height) - return; - - /* Split blits that cross physical y_wrap case. - * Pathological case involves 4 blits, better to use recursive - * code rather than unrolled case - * - * Recursive invocations don't need to erase the cursor over and - * over again, so we use fbcon_bmove_rec() - */ - fbcon_bmove_rec(vc, p, sy, sx, dy, dx, height, width, - p->vrows - p->yscroll); -} - -static void fbcon_bmove_rec(struct vc_data *vc, struct display *p, int sy, int sx, - int dy, int dx, int height, int width, u_int y_break) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - u_int b; - - if (sy < y_break && sy + height > y_break) { - b = y_break - sy; - if (dy < sy) { /* Avoid trashing self */ - fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, - y_break); - fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, - height - b, width, y_break); - } else { - fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, - height - b, width, y_break); - fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, - y_break); - } - return; - } - - if (dy < y_break && dy + height > y_break) { - b = y_break - dy; - if (dy < sy) { /* Avoid trashing self */ - fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, - y_break); - fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, - height - b, width, y_break); - } else { - fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, - height - b, width, y_break); - fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, - y_break); - } - return; - } - ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx, - height, width); -} - -static void updatescrollmode(struct display *p, - struct fb_info *info, - struct vc_data *vc) -{ - struct fbcon_ops *ops = info->fbcon_par; - int fh = vc->vc_font.height; - int cap = info->flags; - u16 t = 0; - int ypan = FBCON_SWAP(ops->rotate, info->fix.ypanstep, - info->fix.xpanstep); - int ywrap = FBCON_SWAP(ops->rotate, info->fix.ywrapstep, t); - int yres = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - int vyres = FBCON_SWAP(ops->rotate, info->var.yres_virtual, - 
info->var.xres_virtual); - int good_pan = (cap & FBINFO_HWACCEL_YPAN) && - divides(ypan, vc->vc_font.height) && vyres > yres; - int good_wrap = (cap & FBINFO_HWACCEL_YWRAP) && - divides(ywrap, vc->vc_font.height) && - divides(vc->vc_font.height, vyres) && - divides(vc->vc_font.height, yres); - int reading_fast = cap & FBINFO_READS_FAST; - int fast_copyarea = (cap & FBINFO_HWACCEL_COPYAREA) && - !(cap & FBINFO_HWACCEL_DISABLED); - int fast_imageblit = (cap & FBINFO_HWACCEL_IMAGEBLIT) && - !(cap & FBINFO_HWACCEL_DISABLED); - - p->vrows = vyres/fh; - if (yres > (fh * (vc->vc_rows + 1))) - p->vrows -= (yres - (fh * vc->vc_rows)) / fh; - if ((yres % fh) && (vyres % fh < yres % fh)) - p->vrows--; - - if (good_wrap || good_pan) { - if (reading_fast || fast_copyarea) - p->scrollmode = good_wrap ? - SCROLL_WRAP_MOVE : SCROLL_PAN_MOVE; - else - p->scrollmode = good_wrap ? SCROLL_REDRAW : - SCROLL_PAN_REDRAW; - } else { - if (reading_fast || (fast_copyarea && !fast_imageblit)) - p->scrollmode = SCROLL_MOVE; - else - p->scrollmode = SCROLL_REDRAW; - } -} - -static int fbcon_resize(struct vc_data *vc, unsigned int width, - unsigned int height, unsigned int user) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; - struct fb_var_screeninfo var = info->var; - int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; - - virt_w = FBCON_SWAP(ops->rotate, width, height); - virt_h = FBCON_SWAP(ops->rotate, height, width); - virt_fw = FBCON_SWAP(ops->rotate, vc->vc_font.width, - vc->vc_font.height); - virt_fh = FBCON_SWAP(ops->rotate, vc->vc_font.height, - vc->vc_font.width); - var.xres = virt_w * virt_fw; - var.yres = virt_h * virt_fh; - x_diff = info->var.xres - var.xres; - y_diff = info->var.yres - var.yres; - if (x_diff < 0 || x_diff > virt_fw || - y_diff < 0 || y_diff > virt_fh) { - const struct fb_videomode *mode; - - DPRINTK("attempting resize %ix%i\n", var.xres, var.yres); - mode = fb_find_best_mode(&var, &info->modelist); - if (mode == NULL) - return -EINVAL; - display_to_var(&var, p); - fb_videomode_to_var(&var, mode); - - if (virt_w > var.xres/virt_fw || virt_h > var.yres/virt_fh) - return -EINVAL; - - DPRINTK("resize now %ix%i\n", var.xres, var.yres); - if (con_is_visible(vc)) { - var.activate = FB_ACTIVATE_NOW | - FB_ACTIVATE_FORCE; - fb_set_var(info, &var); - } - var_to_display(p, &info->var, info); - ops->var = info->var; - } - updatescrollmode(p, info, vc); - return 0; -} - -static int fbcon_switch(struct vc_data *vc) -{ - struct fb_info *info, *old_info = NULL; - struct fbcon_ops *ops; - struct display *p = &fb_display[vc->vc_num]; - struct fb_var_screeninfo var; - int i, ret, prev_console, charcnt = 256; - - info = registered_fb[con2fb_map[vc->vc_num]]; - ops = info->fbcon_par; - - if (softback_top) { - if (softback_lines) - fbcon_set_origin(vc); - softback_top = softback_curr = softback_in = softback_buf; - softback_lines = 0; - fbcon_update_softback(vc); - } - - if (logo_shown >= 0) { - struct vc_data *conp2 = vc_cons[logo_shown].d; - - if (conp2->vc_top == logo_lines - && conp2->vc_bottom == conp2->vc_rows) - conp2->vc_top = 0; - logo_shown = FBCON_LOGO_CANSHOW; - } - - prev_console = ops->currcon; - if (prev_console != -1) - old_info = registered_fb[con2fb_map[prev_console]]; - /* - * FIXME: If we have multiple fbdev's loaded, we need to - * update all info->currcon. Perhaps, we can place this - * in a centralized structure, but this might break some - * drivers. 
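
updatescrollmode() condenses the scroll-method table documented in fbcon.h into a few predicates. A hedged standalone model of the selection follows; the semantics of divides() are assumed here to be "a evenly divides b", and the hardware capability flags are reduced to plain ints:

#include <stdio.h>

/* Assumed semantics: divides(a, b) == "a evenly divides b". */
static int divides(unsigned int a, unsigned int b)
{
        return a ? b % a == 0 : 0;
}

int main(void)
{
        unsigned int fh = 16, yres = 768, vyres = 1536; /* two screens of VRAM */
        unsigned int ypan = 1, ywrap = 0;               /* from fb_fix_screeninfo */
        int reads_fast = 0, copyarea = 1, imageblit = 1;
        const char *mode;

        int good_pan = ypan && divides(ypan, fh) && vyres > yres;
        int good_wrap = ywrap && divides(ywrap, fh) &&
                        divides(fh, vyres) && divides(fh, yres);

        if (good_wrap || good_pan)
                mode = (reads_fast || copyarea) ?
                        (good_wrap ? "SCROLL_WRAP_MOVE" : "SCROLL_PAN_MOVE") :
                        (good_wrap ? "SCROLL_REDRAW" : "SCROLL_PAN_REDRAW");
        else
                mode = (reads_fast || (copyarea && !imageblit)) ?
                        "SCROLL_MOVE" : "SCROLL_REDRAW";

        printf("%s, vrows = %u\n", mode, vyres / fh);   /* SCROLL_PAN_MOVE, 96 */
        return 0;
}
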
- *
- *	info->currcon = vc->vc_num;
- */
-	for (i = 0; i < FB_MAX; i++) {
-		if (registered_fb[i] != NULL && registered_fb[i]->fbcon_par) {
-			struct fbcon_ops *o = registered_fb[i]->fbcon_par;
-
-			o->currcon = vc->vc_num;
-		}
-	}
-	memset(&var, 0, sizeof(struct fb_var_screeninfo));
-	display_to_var(&var, p);
-	var.activate = FB_ACTIVATE_NOW;
-
-	/*
-	 * make sure we don't unnecessarily trip the memcmp()
-	 * in fb_set_var()
-	 */
-	info->var.activate = var.activate;
-	var.vmode |= info->var.vmode & ~FB_VMODE_MASK;
-	fb_set_var(info, &var);
-	ops->var = info->var;
-
-	if (old_info != NULL && (old_info != info ||
-				 info->flags & FBINFO_MISC_ALWAYS_SETPAR)) {
-		if (info->fbops->fb_set_par) {
-			ret = info->fbops->fb_set_par(info);
-
-			if (ret)
-				printk(KERN_ERR "fbcon_switch: detected "
-					"unhandled fb_set_par error, "
-					"error code %d\n", ret);
-		}
-
-		if (old_info != info)
-			fbcon_del_cursor_timer(old_info);
-	}
-
-	if (fbcon_is_inactive(vc, info) ||
-	    ops->blank_state != FB_BLANK_UNBLANK)
-		fbcon_del_cursor_timer(info);
-	else
-		fbcon_add_cursor_timer(info);
-
-	set_blitting_type(vc, info);
-	ops->cursor_reset = 1;
-
-	if (ops->rotate_font && ops->rotate_font(info, vc)) {
-		ops->rotate = FB_ROTATE_UR;
-		set_blitting_type(vc, info);
-	}
-
-	vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1);
-	vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800;
-
-	if (p->userfont)
-		charcnt = FNTCHARCNT(vc->vc_font.data);
-
-	if (charcnt > 256)
-		vc->vc_complement_mask <<= 1;
-
-	updatescrollmode(p, info, vc);
-
-	switch (p->scrollmode) {
-	case SCROLL_WRAP_MOVE:
-		scrollback_phys_max = p->vrows - vc->vc_rows;
-		break;
-	case SCROLL_PAN_MOVE:
-	case SCROLL_PAN_REDRAW:
-		scrollback_phys_max = p->vrows - 2 * vc->vc_rows;
-		if (scrollback_phys_max < 0)
-			scrollback_phys_max = 0;
-		break;
-	default:
-		scrollback_phys_max = 0;
-		break;
-	}
-
-	scrollback_max = 0;
-	scrollback_current = 0;
-
-	if (!fbcon_is_inactive(vc, info)) {
-		ops->var.xoffset = ops->var.yoffset = p->yscroll = 0;
-		ops->update_start(info);
-	}
-
-	fbcon_set_palette(vc, color_table);
-	fbcon_clear_margins(vc, 0);
-
-	if (logo_shown == FBCON_LOGO_DRAW) {
-
-		logo_shown = fg_console;
-		/* This is protected above by initmem_freed */
-		fb_show_logo(info, ops->rotate);
-		update_region(vc,
-			      vc->vc_origin + vc->vc_size_row * vc->vc_top,
-			      vc->vc_size_row * (vc->vc_bottom -
-						 vc->vc_top) / 2);
-		return 0;
-	}
-	return 1;
-}
-
-static void fbcon_generic_blank(struct vc_data *vc, struct fb_info *info,
-				int blank)
-{
-	struct fb_event event;
-
-	if (blank) {
-		unsigned short charmask = vc->vc_hi_font_mask ?
-			0x1ff : 0xff;
-		unsigned short oldc;
-
-		oldc = vc->vc_video_erase_char;
-		vc->vc_video_erase_char &= charmask;
-		fbcon_clear(vc, 0, 0, vc->vc_rows, vc->vc_cols);
-		vc->vc_video_erase_char = oldc;
-	}
-
-
-	if (!lock_fb_info(info))
-		return;
-	event.info = info;
-	event.data = &blank;
-	fb_notifier_call_chain(FB_EVENT_CONBLANK, &event);
-	unlock_fb_info(info);
-}
-
-static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch)
-{
-	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
-	struct fbcon_ops *ops = info->fbcon_par;
-
-	if (mode_switch) {
-		struct fb_var_screeninfo var = info->var;
-
-		ops->graphics = 1;
-
-		if (!blank) {
-			var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE;
-			fb_set_var(info, &var);
-			ops->graphics = 0;
-			ops->var = info->var;
-		}
-	}
-
-	if (!fbcon_is_inactive(vc, info)) {
-		if (ops->blank_state != blank) {
-			ops->blank_state = blank;
-			fbcon_cursor(vc, blank ?
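
The scrollmode switch above sizes the hardware scrollback history: with ywrap everything in the virtual buffer beyond one visible screen is history, while panning needs a screen of headroom at each end, hence vrows - 2 * rows clamped at zero. A small illustrative check:

#include <stdio.h>

static int phys_max_wrap(int vrows, int rows)
{
        return vrows - rows;
}

static int phys_max_pan(int vrows, int rows)
{
        int max = vrows - 2 * rows;

        return max < 0 ? 0 : max;
}

int main(void)
{
        /* 96 virtual rows, 48 visible */
        printf("wrap: %d rows of history\n", phys_max_wrap(96, 48)); /* 48 */
        printf("pan:  %d rows of history\n", phys_max_pan(96, 48));  /* 0 */
        return 0;
}
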
CM_ERASE : CM_DRAW); - ops->cursor_flash = (!blank); - - if (!(info->flags & FBINFO_MISC_USEREVENT)) - if (fb_blank(info, blank)) - fbcon_generic_blank(vc, info, blank); - } - - if (!blank) - update_screen(vc); - } - - if (mode_switch || fbcon_is_inactive(vc, info) || - ops->blank_state != FB_BLANK_UNBLANK) - fbcon_del_cursor_timer(info); - else - fbcon_add_cursor_timer(info); - - return 0; -} - -static int fbcon_debug_enter(struct vc_data *vc) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - - ops->save_graphics = ops->graphics; - ops->graphics = 0; - if (info->fbops->fb_debug_enter) - info->fbops->fb_debug_enter(info); - fbcon_set_palette(vc, color_table); - return 0; -} - -static int fbcon_debug_leave(struct vc_data *vc) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - - ops->graphics = ops->save_graphics; - if (info->fbops->fb_debug_leave) - info->fbops->fb_debug_leave(info); - return 0; -} - -static int fbcon_get_font(struct vc_data *vc, struct console_font *font) -{ - u8 *fontdata = vc->vc_font.data; - u8 *data = font->data; - int i, j; - - font->width = vc->vc_font.width; - font->height = vc->vc_font.height; - font->charcount = vc->vc_hi_font_mask ? 512 : 256; - if (!font->data) - return 0; - - if (font->width <= 8) { - j = vc->vc_font.height; - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 32 - j); - data += 32; - fontdata += j; - } - } else if (font->width <= 16) { - j = vc->vc_font.height * 2; - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 64 - j); - data += 64; - fontdata += j; - } - } else if (font->width <= 24) { - for (i = 0; i < font->charcount; i++) { - for (j = 0; j < vc->vc_font.height; j++) { - *data++ = fontdata[0]; - *data++ = fontdata[1]; - *data++ = fontdata[2]; - fontdata += sizeof(u32); - } - memset(data, 0, 3 * (32 - j)); - data += 3 * (32 - j); - } - } else { - j = vc->vc_font.height * 4; - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 128 - j); - data += 128; - fontdata += j; - } - } - return 0; -} - -/* set/clear vc_hi_font_mask and update vc attrs accordingly */ -static void set_vc_hi_font(struct vc_data *vc, bool set) -{ - if (!set) { - vc->vc_hi_font_mask = 0; - if (vc->vc_can_do_color) { - vc->vc_complement_mask >>= 1; - vc->vc_s_complement_mask >>= 1; - } - - /* ++Edmund: reorder the attribute bits */ - if (vc->vc_can_do_color) { - unsigned short *cp = - (unsigned short *) vc->vc_origin; - int count = vc->vc_screenbuf_size / 2; - unsigned short c; - for (; count > 0; count--, cp++) { - c = scr_readw(cp); - scr_writew(((c & 0xfe00) >> 1) | - (c & 0xff), cp); - } - c = vc->vc_video_erase_char; - vc->vc_video_erase_char = - ((c & 0xfe00) >> 1) | (c & 0xff); - vc->vc_attr >>= 1; - } - } else { - vc->vc_hi_font_mask = 0x100; - if (vc->vc_can_do_color) { - vc->vc_complement_mask <<= 1; - vc->vc_s_complement_mask <<= 1; - } - - /* ++Edmund: reorder the attribute bits */ - { - unsigned short *cp = - (unsigned short *) vc->vc_origin; - int count = vc->vc_screenbuf_size / 2; - unsigned short c; - for (; count > 0; count--, cp++) { - unsigned short newc; - c = scr_readw(cp); - if (vc->vc_can_do_color) - newc = - ((c & 0xff00) << 1) | (c & - 0xff); - else - newc = c & ~0x100; - scr_writew(newc, cp); - } - c = vc->vc_video_erase_char; - if (vc->vc_can_do_color) { - vc->vc_video_erase_char = - ((c & 0xff00) << 1) 
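
fbcon_get_font() above exports glyphs in the fixed console-font ABI, 32 rows per glyph regardless of the actual height; for fonts no wider than 8 pixels that means one byte per row, zero-padded to a 32-byte cell. An illustrative standalone version of that packing:

#include <stdio.h>
#include <string.h>

/* Copy each glyph's rows and zero-pad to the 32-byte cell the console
 * font ioctls expect (width <= 8 case only). */
static void export_glyphs_w8(const unsigned char *font, int height,
                             int charcount, unsigned char *out)
{
        int i;

        for (i = 0; i < charcount; i++) {
                memcpy(out, font, height);
                memset(out + height, 0, 32 - height);
                out += 32;
                font += height;
        }
}

int main(void)
{
        unsigned char font[256 * 16] = { 0x18, 0x3c, 0x66 };   /* 8x16 font */
        static unsigned char out[256 * 32];

        export_glyphs_w8(font, 16, 256, out);
        printf("%02x %02x\n", out[0], out[16]); /* 18 00: rows, then padding */
        return 0;
}
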
| (c & 0xff); - vc->vc_attr <<= 1; - } else - vc->vc_video_erase_char = c & ~0x100; - } - } -} - -static int fbcon_do_set_font(struct vc_data *vc, int w, int h, - const u8 * data, int userfont) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; - int resize; - int cnt; - char *old_data = NULL; - - if (con_is_visible(vc) && softback_lines) - fbcon_set_origin(vc); - - resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); - if (p->userfont) - old_data = vc->vc_font.data; - if (userfont) - cnt = FNTCHARCNT(data); - else - cnt = 256; - vc->vc_font.data = (void *)(p->fontdata = data); - if ((p->userfont = userfont)) - REFCOUNT(data)++; - vc->vc_font.width = w; - vc->vc_font.height = h; - if (vc->vc_hi_font_mask && cnt == 256) - set_vc_hi_font(vc, false); - else if (!vc->vc_hi_font_mask && cnt == 512) - set_vc_hi_font(vc, true); - - if (resize) { - int cols, rows; - - cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); - rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - cols /= w; - rows /= h; - vc_resize(vc, cols, rows); - if (con_is_visible(vc) && softback_buf) - fbcon_update_softback(vc); - } else if (con_is_visible(vc) - && vc->vc_mode == KD_TEXT) { - fbcon_clear_margins(vc, 0); - update_screen(vc); - } - - if (old_data && (--REFCOUNT(old_data) == 0)) - kfree(old_data - FONT_EXTRA_WORDS * sizeof(int)); - return 0; -} - -static int fbcon_copy_font(struct vc_data *vc, int con) -{ - struct display *od = &fb_display[con]; - struct console_font *f = &vc->vc_font; - - if (od->fontdata == f->data) - return 0; /* already the same font... */ - return fbcon_do_set_font(vc, f->width, f->height, od->fontdata, od->userfont); -} - -/* - * User asked to set font; we are guaranteed that - * a) width and height are in range 1..32 - * b) charcount does not exceed 512 - * but lets not assume that, since someone might someday want to use larger - * fonts. And charcount of 512 is small for unicode support. - * - * However, user space gives the font in 32 rows , regardless of - * actual font height. So a new API is needed if support for larger fonts - * is ever implemented. - */ - -static int fbcon_set_font(struct vc_data *vc, struct console_font *font, unsigned flags) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - unsigned charcount = font->charcount; - int w = font->width; - int h = font->height; - int size; - int i, csum; - u8 *new_data, *data = font->data; - int pitch = (font->width+7) >> 3; - - /* Is there a reason why fbconsole couldn't handle any charcount >256? 
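
The bit shuffling in set_vc_hi_font() moves the attribute bits up or down one position because bit 8 of the screen cell switches between "attribute" and "glyph index" as the font grows to 512 characters. A runnable round-trip of the two remappings (illustrative, not kernel code):

#include <stdio.h>

static unsigned short to_hi_font(unsigned short c)
{
        return ((c & 0xff00) << 1) | (c & 0xff);
}

static unsigned short to_lo_font(unsigned short c)
{
        return ((c & 0xfe00) >> 1) | (c & 0xff);
}

int main(void)
{
        unsigned short cell = 0x1741;   /* glyph 'A', fg 7, bg 1 */

        printf("%04x -> %04x -> %04x\n",
               cell, to_hi_font(cell), to_lo_font(to_hi_font(cell)));
        /* 1741 -> 2e41 -> 1741: the colors survive the round trip */
        return 0;
}
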
- * If not this check should be changed to charcount < 256 */ - if (charcount != 256 && charcount != 512) - return -EINVAL; - - /* Make sure drawing engine can handle the font */ - if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || - !(info->pixmap.blit_y & (1 << (font->height - 1)))) - return -EINVAL; - - /* Make sure driver can handle the font length */ - if (fbcon_invalid_charcount(info, charcount)) - return -EINVAL; - - size = h * pitch * charcount; - - new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, GFP_USER); - - if (!new_data) - return -ENOMEM; - - new_data += FONT_EXTRA_WORDS * sizeof(int); - FNTSIZE(new_data) = size; - FNTCHARCNT(new_data) = charcount; - REFCOUNT(new_data) = 0; /* usage counter */ - for (i=0; i< charcount; i++) { - memcpy(new_data + i*h*pitch, data + i*32*pitch, h*pitch); - } - - /* Since linux has a nice crc32 function use it for counting font - * checksums. */ - csum = crc32(0, new_data, size); - - FNTSUM(new_data) = csum; - /* Check if the same font is on some other console already */ - for (i = first_fb_vc; i <= last_fb_vc; i++) { - struct vc_data *tmp = vc_cons[i].d; - - if (fb_display[i].userfont && - fb_display[i].fontdata && - FNTSUM(fb_display[i].fontdata) == csum && - FNTSIZE(fb_display[i].fontdata) == size && - tmp->vc_font.width == w && - !memcmp(fb_display[i].fontdata, new_data, size)) { - kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); - new_data = (u8 *)fb_display[i].fontdata; - break; - } - } - return fbcon_do_set_font(vc, font->width, font->height, new_data, 1); -} - -static int fbcon_set_def_font(struct vc_data *vc, struct console_font *font, char *name) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - const struct font_desc *f; - - if (!name) - f = get_default_font(info->var.xres, info->var.yres, - info->pixmap.blit_x, info->pixmap.blit_y); - else if (!(f = find_font(name))) - return -ENOENT; - - font->width = f->width; - font->height = f->height; - return fbcon_do_set_font(vc, f->width, f->height, f->data, 0); -} - -static u16 palette_red[16]; -static u16 palette_green[16]; -static u16 palette_blue[16]; - -static struct fb_cmap palette_cmap = { - 0, 16, palette_red, palette_green, palette_blue, NULL -}; - -static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table) -{ - struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; - int i, j, k, depth; - u8 val; - - if (fbcon_is_inactive(vc, info)) - return; - - if (!con_is_visible(vc)) - return; - - depth = fb_get_color_depth(&info->var, &info->fix); - if (depth > 3) { - for (i = j = 0; i < 16; i++) { - k = table[i]; - val = vc->vc_palette[j++]; - palette_red[k] = (val << 8) | val; - val = vc->vc_palette[j++]; - palette_green[k] = (val << 8) | val; - val = vc->vc_palette[j++]; - palette_blue[k] = (val << 8) | val; - } - palette_cmap.len = 16; - palette_cmap.start = 0; - /* - * If framebuffer is capable of less than 16 colors, - * use default palette of fbcon. 
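
fbcon_set_font() hashes the rendered glyph data so a font already loaded on another console is shared rather than duplicated; the checksum only gates the full memcmp(). An illustrative userspace equivalent, using zlib's crc32() in place of the kernel's purely for demonstration:

#include <stdio.h>
#include <string.h>
#include <zlib.h>               /* build with: cc example.c -lz */

int main(void)
{
        unsigned char font_a[256 * 16], font_b[256 * 16];
        uLong a, b;

        memset(font_a, 0x55, sizeof(font_a));
        memset(font_b, 0x55, sizeof(font_b));

        a = crc32(0L, font_a, sizeof(font_a));
        b = crc32(0L, font_b, sizeof(font_b));

        /* cheap hash first, full memcmp() only on a hash match */
        if (a == b && !memcmp(font_a, font_b, sizeof(font_a)))
                puts("identical font: reuse the existing buffer");
        return 0;
}
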
- */ - } else - fb_copy_cmap(fb_default_cmap(1 << depth), &palette_cmap); - - fb_set_cmap(&palette_cmap, info); -} - -static u16 *fbcon_screen_pos(struct vc_data *vc, int offset) -{ - unsigned long p; - int line; - - if (vc->vc_num != fg_console || !softback_lines) - return (u16 *) (vc->vc_origin + offset); - line = offset / vc->vc_size_row; - if (line >= softback_lines) - return (u16 *) (vc->vc_origin + offset - - softback_lines * vc->vc_size_row); - p = softback_curr + offset; - if (p >= softback_end) - p += softback_buf - softback_end; - return (u16 *) p; -} - -static unsigned long fbcon_getxy(struct vc_data *vc, unsigned long pos, - int *px, int *py) -{ - unsigned long ret; - int x, y; - - if (pos >= vc->vc_origin && pos < vc->vc_scr_end) { - unsigned long offset = (pos - vc->vc_origin) / 2; - - x = offset % vc->vc_cols; - y = offset / vc->vc_cols; - if (vc->vc_num == fg_console) - y += softback_lines; - ret = pos + (vc->vc_cols - x) * 2; - } else if (vc->vc_num == fg_console && softback_lines) { - unsigned long offset = pos - softback_curr; - - if (pos < softback_curr) - offset += softback_end - softback_buf; - offset /= 2; - x = offset % vc->vc_cols; - y = offset / vc->vc_cols; - ret = pos + (vc->vc_cols - x) * 2; - if (ret == softback_end) - ret = softback_buf; - if (ret == softback_in) - ret = vc->vc_origin; - } else { - /* Should not happen */ - x = y = 0; - ret = vc->vc_origin; - } - if (px) - *px = x; - if (py) - *py = y; - return ret; -} - -/* As we might be inside of softback, we may work with non-contiguous buffer, - that's why we have to use a separate routine. */ -static void fbcon_invert_region(struct vc_data *vc, u16 * p, int cnt) -{ - while (cnt--) { - u16 a = scr_readw(p); - if (!vc->vc_can_do_color) - a ^= 0x0800; - else if (vc->vc_hi_font_mask == 0x100) - a = ((a) & 0x11ff) | (((a) & 0xe000) >> 4) | - (((a) & 0x0e00) << 4); - else - a = ((a) & 0x88ff) | (((a) & 0x7000) >> 4) | - (((a) & 0x0700) << 4); - scr_writew(a, p++); - if (p == (u16 *) softback_end) - p = (u16 *) softback_buf; - if (p == (u16 *) softback_in) - p = (u16 *) vc->vc_origin; - } -} - -static void fbcon_scrolldelta(struct vc_data *vc, int lines) -{ - struct fb_info *info = registered_fb[con2fb_map[fg_console]]; - struct fbcon_ops *ops = info->fbcon_par; - struct display *disp = &fb_display[fg_console]; - int offset, limit, scrollback_old; - - if (softback_top) { - if (vc->vc_num != fg_console) - return; - if (vc->vc_mode != KD_TEXT || !lines) - return; - if (logo_shown >= 0) { - struct vc_data *conp2 = vc_cons[logo_shown].d; - - if (conp2->vc_top == logo_lines - && conp2->vc_bottom == conp2->vc_rows) - conp2->vc_top = 0; - if (logo_shown == vc->vc_num) { - unsigned long p, q; - int i; - - p = softback_in; - q = vc->vc_origin + - logo_lines * vc->vc_size_row; - for (i = 0; i < logo_lines; i++) { - if (p == softback_top) - break; - if (p == softback_buf) - p = softback_end; - p -= vc->vc_size_row; - q -= vc->vc_size_row; - scr_memcpyw((u16 *) q, (u16 *) p, - vc->vc_size_row); - } - softback_in = softback_curr = p; - update_region(vc, vc->vc_origin, - logo_lines * vc->vc_cols); - } - logo_shown = FBCON_LOGO_CANSHOW; - } - fbcon_cursor(vc, CM_ERASE | CM_SOFTBACK); - fbcon_redraw_softback(vc, disp, lines); - fbcon_cursor(vc, CM_DRAW | CM_SOFTBACK); - return; - } - - if (!scrollback_phys_max) - return; - - scrollback_old = scrollback_current; - scrollback_current -= lines; - if (scrollback_current < 0) - scrollback_current = 0; - else if (scrollback_current > scrollback_max) - scrollback_current = 
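
fbcon_screen_pos() treats the softback buffer as a ring: an offset that runs past softback_end wraps back to softback_buf. The arithmetic in isolation (illustrative addresses):

#include <stdio.h>

static unsigned long softback_pos(unsigned long curr, unsigned long buf,
                                  unsigned long end, unsigned long offset)
{
        unsigned long p = curr + offset;

        if (p >= end)
                p += buf - end;         /* fold back into [buf, end) */
        return p;
}

int main(void)
{
        /* buffer spans [1000, 1800), current read position 1700 */
        printf("%lu\n", softback_pos(1700, 1000, 1800, 50));  /* 1750 */
        printf("%lu\n", softback_pos(1700, 1000, 1800, 200)); /* 1100: wrapped */
        return 0;
}
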
scrollback_max; - if (scrollback_current == scrollback_old) - return; - - if (fbcon_is_inactive(vc, info)) - return; - - fbcon_cursor(vc, CM_ERASE); - - offset = disp->yscroll - scrollback_current; - limit = disp->vrows; - switch (disp->scrollmode) { - case SCROLL_WRAP_MOVE: - info->var.vmode |= FB_VMODE_YWRAP; - break; - case SCROLL_PAN_MOVE: - case SCROLL_PAN_REDRAW: - limit -= vc->vc_rows; - info->var.vmode &= ~FB_VMODE_YWRAP; - break; - } - if (offset < 0) - offset += limit; - else if (offset >= limit) - offset -= limit; - - ops->var.xoffset = 0; - ops->var.yoffset = offset * vc->vc_font.height; - ops->update_start(info); - - if (!scrollback_current) - fbcon_cursor(vc, CM_DRAW); -} - -static int fbcon_set_origin(struct vc_data *vc) -{ - if (softback_lines) - fbcon_scrolldelta(vc, softback_lines); - return 0; -} - -static void fbcon_suspended(struct fb_info *info) -{ - struct vc_data *vc = NULL; - struct fbcon_ops *ops = info->fbcon_par; - - if (!ops || ops->currcon < 0) - return; - vc = vc_cons[ops->currcon].d; - - /* Clear cursor, restore saved data */ - fbcon_cursor(vc, CM_ERASE); -} - -static void fbcon_resumed(struct fb_info *info) -{ - struct vc_data *vc; - struct fbcon_ops *ops = info->fbcon_par; - - if (!ops || ops->currcon < 0) - return; - vc = vc_cons[ops->currcon].d; - - update_screen(vc); -} - -static void fbcon_modechanged(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct vc_data *vc; - struct display *p; - int rows, cols; - - if (!ops || ops->currcon < 0) - return; - vc = vc_cons[ops->currcon].d; - if (vc->vc_mode != KD_TEXT || - registered_fb[con2fb_map[ops->currcon]] != info) - return; - - p = &fb_display[vc->vc_num]; - set_blitting_type(vc, info); - - if (con_is_visible(vc)) { - var_to_display(p, &info->var, info); - cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); - rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - cols /= vc->vc_font.width; - rows /= vc->vc_font.height; - vc_resize(vc, cols, rows); - updatescrollmode(p, info, vc); - scrollback_max = 0; - scrollback_current = 0; - - if (!fbcon_is_inactive(vc, info)) { - ops->var.xoffset = ops->var.yoffset = p->yscroll = 0; - ops->update_start(info); - } - - fbcon_set_palette(vc, color_table); - update_screen(vc); - if (softback_buf) - fbcon_update_softback(vc); - } -} - -static void fbcon_set_all_vcs(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct vc_data *vc; - struct display *p; - int i, rows, cols, fg = -1; - - if (!ops || ops->currcon < 0) - return; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - vc = vc_cons[i].d; - if (!vc || vc->vc_mode != KD_TEXT || - registered_fb[con2fb_map[i]] != info) - continue; - - if (con_is_visible(vc)) { - fg = i; - continue; - } - - p = &fb_display[vc->vc_num]; - set_blitting_type(vc, info); - var_to_display(p, &info->var, info); - cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); - rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); - cols /= vc->vc_font.width; - rows /= vc->vc_font.height; - vc_resize(vc, cols, rows); - } - - if (fg != -1) - fbcon_modechanged(info); -} - -static int fbcon_mode_deleted(struct fb_info *info, - struct fb_videomode *mode) -{ - struct fb_info *fb_info; - struct display *p; - int i, j, found = 0; - - /* before deletion, ensure that mode is not in use */ - for (i = first_fb_vc; i <= last_fb_vc; i++) { - j = con2fb_map[i]; - if (j == -1) - continue; - fb_info = registered_fb[j]; - if (fb_info != info) - continue; - p = &fb_display[i]; - if 
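
The offset computed above is the pan position folded into the circular virtual screen: yscroll minus the scrollback depth, wrapped into [0, limit). As a standalone function:

#include <stdio.h>

static int pan_offset(int yscroll, int scrollback_current, int limit)
{
        int offset = yscroll - scrollback_current;

        if (offset < 0)
                offset += limit;
        else if (offset >= limit)
                offset -= limit;
        return offset;
}

int main(void)
{
        printf("%d\n", pan_offset(10, 25, 96)); /* 81: wrapped backwards */
        printf("%d\n", pan_offset(40, 5, 96));  /* 35 */
        return 0;
}
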
(!p || !p->mode) - continue; - if (fb_mode_is_equal(p->mode, mode)) { - found = 1; - break; - } - } - return found; -} - -#ifdef CONFIG_VT_HW_CONSOLE_BINDING -static int fbcon_unbind(void) -{ - int ret; - - ret = do_unbind_con_driver(&fb_con, first_fb_vc, last_fb_vc, - fbcon_is_default); - - if (!ret) - fbcon_has_console_bind = 0; - - return ret; -} -#else -static inline int fbcon_unbind(void) -{ - return -EINVAL; -} -#endif /* CONFIG_VT_HW_CONSOLE_BINDING */ - -/* called with console_lock held */ -static int fbcon_fb_unbind(int idx) -{ - int i, new_idx = -1, ret = 0; - - if (!fbcon_has_console_bind) - return 0; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] != idx && - con2fb_map[i] != -1) { - new_idx = i; - break; - } - } - - if (new_idx != -1) { - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] == idx) - set_con2fb_map(i, new_idx, 0); - } - } else { - struct fb_info *info = registered_fb[idx]; - - /* This is sort of like set_con2fb_map, except it maps - * the consoles to no device and then releases the - * oldinfo to free memory and cancel the cursor blink - * timer. I can imagine this just becoming part of - * set_con2fb_map where new_idx is -1 - */ - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] == idx) { - con2fb_map[i] = -1; - if (!search_fb_in_map(idx)) { - ret = con2fb_release_oldinfo(vc_cons[i].d, - info, NULL, i, - idx, 0); - if (ret) { - con2fb_map[i] = idx; - return ret; - } - } - } - } - ret = fbcon_unbind(); - } - - return ret; -} - -/* called with console_lock held */ -static int fbcon_fb_unregistered(struct fb_info *info) -{ - int i, idx; - - idx = info->node; - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] == idx) - con2fb_map[i] = -1; - } - - if (idx == info_idx) { - info_idx = -1; - - for (i = 0; i < FB_MAX; i++) { - if (registered_fb[i] != NULL) { - info_idx = i; - break; - } - } - } - - if (info_idx != -1) { - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map[i] == -1) - con2fb_map[i] = info_idx; - } - } - - if (primary_device == idx) - primary_device = -1; - - if (!num_registered_fb) - do_unregister_con_driver(&fb_con); - - return 0; -} - -/* called with console_lock held */ -static void fbcon_remap_all(int idx) -{ - int i; - for (i = first_fb_vc; i <= last_fb_vc; i++) - set_con2fb_map(i, idx, 0); - - if (con_is_bound(&fb_con)) { - printk(KERN_INFO "fbcon: Remapping primary device, " - "fb%i, to tty %i-%i\n", idx, - first_fb_vc + 1, last_fb_vc + 1); - info_idx = idx; - } -} - -#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY -static void fbcon_select_primary(struct fb_info *info) -{ - if (!map_override && primary_device == -1 && - fb_is_primary_device(info)) { - int i; - - printk(KERN_INFO "fbcon: %s (fb%i) is primary device\n", - info->fix.id, info->node); - primary_device = info->node; - - for (i = first_fb_vc; i <= last_fb_vc; i++) - con2fb_map_boot[i] = primary_device; - - if (con_is_bound(&fb_con)) { - printk(KERN_INFO "fbcon: Remapping primary device, " - "fb%i, to tty %i-%i\n", info->node, - first_fb_vc + 1, last_fb_vc + 1); - info_idx = primary_device; - } - } - -} -#else -static inline void fbcon_select_primary(struct fb_info *info) -{ - return; -} -#endif /* CONFIG_FRAMEBUFFER_DETECT_PRIMARY */ - -/* called with console_lock held */ -static int fbcon_fb_registered(struct fb_info *info) -{ - int ret = 0, i, idx; - - idx = info->node; - fbcon_select_primary(info); - - if (info_idx == -1) { - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map_boot[i] == 
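
The unbind path's core move is pointing every console that used the dying framebuffer at any surviving one, or -1 when none is left. A simplified, illustrative model of that remap (array sizes and names are hypothetical):

#include <stdio.h>

#define NR_CONSOLES 8

static void remap_consoles(int *con2fb_map, int idx, int new_idx)
{
        int i;

        for (i = 0; i < NR_CONSOLES; i++)
                if (con2fb_map[i] == idx)
                        con2fb_map[i] = new_idx;
}

int main(void)
{
        int map[NR_CONSOLES] = { 0, 0, 1, 0, 1, 1, 0, 0 };
        int i;

        remap_consoles(map, 0, 1);      /* fb0 unregisters, fall back to fb1 */
        for (i = 0; i < NR_CONSOLES; i++)
                printf("tty%d -> fb%d\n", i + 1, map[i]);
        return 0;
}
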
idx) { - info_idx = idx; - break; - } - } - - if (info_idx != -1) - ret = do_fbcon_takeover(1); - } else { - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (con2fb_map_boot[i] == idx) - set_con2fb_map(i, idx, 0); - } - } - - return ret; -} - -static void fbcon_fb_blanked(struct fb_info *info, int blank) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct vc_data *vc; - - if (!ops || ops->currcon < 0) - return; - - vc = vc_cons[ops->currcon].d; - if (vc->vc_mode != KD_TEXT || - registered_fb[con2fb_map[ops->currcon]] != info) - return; - - if (con_is_visible(vc)) { - if (blank) - do_blank_screen(0); - else - do_unblank_screen(0); - } - ops->blank_state = blank; -} - -static void fbcon_new_modelist(struct fb_info *info) -{ - int i; - struct vc_data *vc; - struct fb_var_screeninfo var; - const struct fb_videomode *mode; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - if (registered_fb[con2fb_map[i]] != info) - continue; - if (!fb_display[i].mode) - continue; - vc = vc_cons[i].d; - display_to_var(&var, &fb_display[i]); - mode = fb_find_nearest_mode(fb_display[i].mode, - &info->modelist); - fb_videomode_to_var(&var, mode); - fbcon_set_disp(info, &var, vc->vc_num); - } -} - -static void fbcon_get_requirement(struct fb_info *info, - struct fb_blit_caps *caps) -{ - struct vc_data *vc; - struct display *p; - - if (caps->flags) { - int i, charcnt; - - for (i = first_fb_vc; i <= last_fb_vc; i++) { - vc = vc_cons[i].d; - if (vc && vc->vc_mode == KD_TEXT && - info->node == con2fb_map[i]) { - p = &fb_display[i]; - caps->x |= 1 << (vc->vc_font.width - 1); - caps->y |= 1 << (vc->vc_font.height - 1); - charcnt = (p->userfont) ? - FNTCHARCNT(p->fontdata) : 256; - if (caps->len < charcnt) - caps->len = charcnt; - } - } - } else { - vc = vc_cons[fg_console].d; - - if (vc && vc->vc_mode == KD_TEXT && - info->node == con2fb_map[fg_console]) { - p = &fb_display[fg_console]; - caps->x = 1 << (vc->vc_font.width - 1); - caps->y = 1 << (vc->vc_font.height - 1); - caps->len = (p->userfont) ? 
- FNTCHARCNT(p->fontdata) : 256; - } - } -} - -static int fbcon_event_notify(struct notifier_block *self, - unsigned long action, void *data) -{ - struct fb_event *event = data; - struct fb_info *info = event->info; - struct fb_videomode *mode; - struct fb_con2fbmap *con2fb; - struct fb_blit_caps *caps; - int idx, ret = 0; - - /* - * ignore all events except driver registration and deregistration - * if fbcon is not active - */ - if (fbcon_has_exited && !(action == FB_EVENT_FB_REGISTERED || - action == FB_EVENT_FB_UNREGISTERED)) - goto done; - - switch(action) { - case FB_EVENT_SUSPEND: - fbcon_suspended(info); - break; - case FB_EVENT_RESUME: - fbcon_resumed(info); - break; - case FB_EVENT_MODE_CHANGE: - fbcon_modechanged(info); - break; - case FB_EVENT_MODE_CHANGE_ALL: - fbcon_set_all_vcs(info); - break; - case FB_EVENT_MODE_DELETE: - mode = event->data; - ret = fbcon_mode_deleted(info, mode); - break; - case FB_EVENT_FB_UNBIND: - idx = info->node; - ret = fbcon_fb_unbind(idx); - break; - case FB_EVENT_FB_REGISTERED: - ret = fbcon_fb_registered(info); - break; - case FB_EVENT_FB_UNREGISTERED: - ret = fbcon_fb_unregistered(info); - break; - case FB_EVENT_SET_CONSOLE_MAP: - /* called with console lock held */ - con2fb = event->data; - ret = set_con2fb_map(con2fb->console - 1, - con2fb->framebuffer, 1); - break; - case FB_EVENT_GET_CONSOLE_MAP: - con2fb = event->data; - con2fb->framebuffer = con2fb_map[con2fb->console - 1]; - break; - case FB_EVENT_BLANK: - fbcon_fb_blanked(info, *(int *)event->data); - break; - case FB_EVENT_NEW_MODELIST: - fbcon_new_modelist(info); - break; - case FB_EVENT_GET_REQ: - caps = event->data; - fbcon_get_requirement(info, caps); - break; - case FB_EVENT_REMAP_ALL_CONSOLE: - idx = info->node; - fbcon_remap_all(idx); - break; - } -done: - return ret; -} - -/* - * The console `switch' structure for the frame buffer based console - */ - -static const struct consw fb_con = { - .owner = THIS_MODULE, - .con_startup = fbcon_startup, - .con_init = fbcon_init, - .con_deinit = fbcon_deinit, - .con_clear = fbcon_clear, - .con_putc = fbcon_putc, - .con_putcs = fbcon_putcs, - .con_cursor = fbcon_cursor, - .con_scroll = fbcon_scroll, - .con_switch = fbcon_switch, - .con_blank = fbcon_blank, - .con_font_set = fbcon_set_font, - .con_font_get = fbcon_get_font, - .con_font_default = fbcon_set_def_font, - .con_font_copy = fbcon_copy_font, - .con_set_palette = fbcon_set_palette, - .con_scrolldelta = fbcon_scrolldelta, - .con_set_origin = fbcon_set_origin, - .con_invert_region = fbcon_invert_region, - .con_screen_pos = fbcon_screen_pos, - .con_getxy = fbcon_getxy, - .con_resize = fbcon_resize, - .con_debug_enter = fbcon_debug_enter, - .con_debug_leave = fbcon_debug_leave, -}; - -static struct notifier_block fbcon_event_notifier = { - .notifier_call = fbcon_event_notify, -}; - -static ssize_t store_rotate(struct device *device, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct fb_info *info; - int rotate, idx; - char **last = NULL; - - if (fbcon_has_exited) - return count; - - console_lock(); - idx = con2fb_map[fg_console]; - - if (idx == -1 || registered_fb[idx] == NULL) - goto err; - - info = registered_fb[idx]; - rotate = simple_strtoul(buf, last, 0); - fbcon_rotate(info, rotate); -err: - console_unlock(); - return count; -} - -static ssize_t store_rotate_all(struct device *device, - struct device_attribute *attr,const char *buf, - size_t count) -{ - struct fb_info *info; - int rotate, idx; - char **last = NULL; - - if (fbcon_has_exited) - return count; 
- - console_lock(); - idx = con2fb_map[fg_console]; - - if (idx == -1 || registered_fb[idx] == NULL) - goto err; - - info = registered_fb[idx]; - rotate = simple_strtoul(buf, last, 0); - fbcon_rotate_all(info, rotate); -err: - console_unlock(); - return count; -} - -static ssize_t show_rotate(struct device *device, - struct device_attribute *attr,char *buf) -{ - struct fb_info *info; - int rotate = 0, idx; - - if (fbcon_has_exited) - return 0; - - console_lock(); - idx = con2fb_map[fg_console]; - - if (idx == -1 || registered_fb[idx] == NULL) - goto err; - - info = registered_fb[idx]; - rotate = fbcon_get_rotate(info); -err: - console_unlock(); - return snprintf(buf, PAGE_SIZE, "%d\n", rotate); -} - -static ssize_t show_cursor_blink(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct fb_info *info; - struct fbcon_ops *ops; - int idx, blink = -1; - - if (fbcon_has_exited) - return 0; - - console_lock(); - idx = con2fb_map[fg_console]; - - if (idx == -1 || registered_fb[idx] == NULL) - goto err; - - info = registered_fb[idx]; - ops = info->fbcon_par; - - if (!ops) - goto err; - - blink = (ops->flags & FBCON_FLAGS_CURSOR_TIMER) ? 1 : 0; -err: - console_unlock(); - return snprintf(buf, PAGE_SIZE, "%d\n", blink); -} - -static ssize_t store_cursor_blink(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct fb_info *info; - int blink, idx; - char **last = NULL; - - if (fbcon_has_exited) - return count; - - console_lock(); - idx = con2fb_map[fg_console]; - - if (idx == -1 || registered_fb[idx] == NULL) - goto err; - - info = registered_fb[idx]; - - if (!info->fbcon_par) - goto err; - - blink = simple_strtoul(buf, last, 0); - - if (blink) { - fbcon_cursor_noblink = 0; - fbcon_add_cursor_timer(info); - } else { - fbcon_cursor_noblink = 1; - fbcon_del_cursor_timer(info); - } - -err: - console_unlock(); - return count; -} - -static struct device_attribute device_attrs[] = { - __ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate), - __ATTR(rotate_all, S_IWUSR, NULL, store_rotate_all), - __ATTR(cursor_blink, S_IRUGO|S_IWUSR, show_cursor_blink, - store_cursor_blink), -}; - -static int fbcon_init_device(void) -{ - int i, error = 0; - - fbcon_has_sysfs = 1; - - for (i = 0; i < ARRAY_SIZE(device_attrs); i++) { - error = device_create_file(fbcon_device, &device_attrs[i]); - - if (error) - break; - } - - if (error) { - while (--i >= 0) - device_remove_file(fbcon_device, &device_attrs[i]); - - fbcon_has_sysfs = 0; - } - - return 0; -} - -static void fbcon_start(void) -{ - if (num_registered_fb) { - int i; - - console_lock(); - - for (i = 0; i < FB_MAX; i++) { - if (registered_fb[i] != NULL) { - info_idx = i; - break; - } - } - - do_fbcon_takeover(0); - console_unlock(); - - } -} - -static void fbcon_exit(void) -{ - struct fb_info *info; - int i, j, mapped; - - if (fbcon_has_exited) - return; - - kfree((void *)softback_buf); - softback_buf = 0UL; - - for (i = 0; i < FB_MAX; i++) { - int pending = 0; - - mapped = 0; - info = registered_fb[i]; - - if (info == NULL) - continue; - - if (info->queue.func) - pending = cancel_work_sync(&info->queue); - DPRINTK("fbcon: %s pending work\n", (pending ? 
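
fbcon_init_device() uses the classic partial-failure unwind: if creating attribute i fails, remove exactly the i files already created, in reverse order. A runnable model of the pattern (create_file()/remove_file() are stand-ins that just log):

#include <stdio.h>

#define NR_ATTRS 3

static int create_file(int i)
{
        printf("create attr %d\n", i);
        return i == 2 ? -1 : 0;         /* simulate failure on the third */
}

static void remove_file(int i)
{
        printf("remove attr %d\n", i);
}

int main(void)
{
        int i, error = 0;

        for (i = 0; i < NR_ATTRS; i++) {
                error = create_file(i);
                if (error)
                        break;
        }
        if (error)
                while (--i >= 0)        /* unwind only what was created */
                        remove_file(i);
        return 0;
}
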
"canceled" : - "no")); - - for (j = first_fb_vc; j <= last_fb_vc; j++) { - if (con2fb_map[j] == i) { - mapped = 1; - break; - } - } - - if (mapped) { - if (info->fbops->fb_release) - info->fbops->fb_release(info, 0); - module_put(info->fbops->owner); - - if (info->fbcon_par) { - struct fbcon_ops *ops = info->fbcon_par; - - fbcon_del_cursor_timer(info); - kfree(ops->cursor_src); - kfree(ops->cursor_state.mask); - kfree(info->fbcon_par); - info->fbcon_par = NULL; - } - - if (info->queue.func == fb_flashcursor) - info->queue.func = NULL; - } - } - - fbcon_has_exited = 1; -} - -static int __init fb_console_init(void) -{ - int i; - - console_lock(); - fb_register_client(&fbcon_event_notifier); - fbcon_device = device_create(fb_class, NULL, MKDEV(0, 0), NULL, - "fbcon"); - - if (IS_ERR(fbcon_device)) { - printk(KERN_WARNING "Unable to create device " - "for fbcon; errno = %ld\n", - PTR_ERR(fbcon_device)); - fbcon_device = NULL; - } else - fbcon_init_device(); - - for (i = 0; i < MAX_NR_CONSOLES; i++) - con2fb_map[i] = -1; - - console_unlock(); - fbcon_start(); - return 0; -} - -fs_initcall(fb_console_init); - -#ifdef MODULE - -static void __exit fbcon_deinit_device(void) -{ - int i; - - if (fbcon_has_sysfs) { - for (i = 0; i < ARRAY_SIZE(device_attrs); i++) - device_remove_file(fbcon_device, &device_attrs[i]); - - fbcon_has_sysfs = 0; - } -} - -static void __exit fb_console_exit(void) -{ - console_lock(); - fb_unregister_client(&fbcon_event_notifier); - fbcon_deinit_device(); - device_destroy(fb_class, MKDEV(0, 0)); - fbcon_exit(); - do_unregister_con_driver(&fb_con); - console_unlock(); -} - -module_exit(fb_console_exit); - -#endif - -MODULE_LICENSE("GPL"); diff --git a/drivers/video/console/fbcon.h b/drivers/video/console/fbcon.h deleted file mode 100644 index 7aaa4eabbba0..000000000000 --- a/drivers/video/console/fbcon.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * linux/drivers/video/console/fbcon.h -- Low level frame buffer based console driver - * - * Copyright (C) 1997 Geert Uytterhoeven - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. 
- */ - -#ifndef _VIDEO_FBCON_H -#define _VIDEO_FBCON_H - -#include -#include -#include - -#include - -#define FBCON_FLAGS_INIT 1 -#define FBCON_FLAGS_CURSOR_TIMER 2 - - /* - * This is the interface between the low-level console driver and the - * low-level frame buffer device - */ - -struct display { - /* Filled in by the low-level console driver */ - const u_char *fontdata; - int userfont; /* != 0 if fontdata kmalloc()ed */ - u_short scrollmode; /* Scroll Method */ - u_short inverse; /* != 0 text black on white as default */ - short yscroll; /* Hardware scrolling */ - int vrows; /* number of virtual rows */ - int cursor_shape; - int con_rotate; - u32 xres_virtual; - u32 yres_virtual; - u32 height; - u32 width; - u32 bits_per_pixel; - u32 grayscale; - u32 nonstd; - u32 accel_flags; - u32 rotate; - struct fb_bitfield red; - struct fb_bitfield green; - struct fb_bitfield blue; - struct fb_bitfield transp; - const struct fb_videomode *mode; -}; - -struct fbcon_ops { - void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int dy, int dx, int height, int width); - void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width); - void (*putcs)(struct vc_data *vc, struct fb_info *info, - const unsigned short *s, int count, int yy, int xx, - int fg, int bg); - void (*clear_margins)(struct vc_data *vc, struct fb_info *info, - int bottom_only); - void (*cursor)(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg); - int (*update_start)(struct fb_info *info); - int (*rotate_font)(struct fb_info *info, struct vc_data *vc); - struct fb_var_screeninfo var; /* copy of the current fb_var_screeninfo */ - struct timer_list cursor_timer; /* Cursor timer */ - struct fb_cursor cursor_state; - struct display *p; - int currcon; /* Current VC. */ - int cur_blink_jiffies; - int cursor_flash; - int cursor_reset; - int blank_state; - int graphics; - int save_graphics; /* for debug enter/leave */ - int flags; - int rotate; - int cur_rotate; - char *cursor_data; - u8 *fontbuffer; - u8 *fontdata; - u8 *cursor_src; - u32 cursor_size; - u32 fd_size; -}; - /* - * Attribute Decoding - */ - -/* Color */ -#define attr_fgcol(fgshift,s) \ - (((s) >> (fgshift)) & 0x0f) -#define attr_bgcol(bgshift,s) \ - (((s) >> (bgshift)) & 0x0f) - -/* Monochrome */ -#define attr_bold(s) \ - ((s) & 0x200) -#define attr_reverse(s) \ - ((s) & 0x800) -#define attr_underline(s) \ - ((s) & 0x400) -#define attr_blink(s) \ - ((s) & 0x8000) - - -static inline int mono_col(const struct fb_info *info) -{ - __u32 max_len; - max_len = max(info->var.green.length, info->var.red.length); - max_len = max(info->var.blue.length, max_len); - return (~(0xfff << max_len)) & 0xff; -} - -static inline int attr_col_ec(int shift, struct vc_data *vc, - struct fb_info *info, int is_fg) -{ - int is_mono01; - int col; - int fg; - int bg; - - if (!vc) - return 0; - - if (vc->vc_can_do_color) - return is_fg ? attr_fgcol(shift,vc->vc_video_erase_char) - : attr_bgcol(shift,vc->vc_video_erase_char); - - if (!info) - return 0; - - col = mono_col(info); - is_mono01 = info->fix.visual == FB_VISUAL_MONO01; - - if (attr_reverse(vc->vc_video_erase_char)) { - fg = is_mono01 ? col : 0; - bg = is_mono01 ? 0 : col; - } - else { - fg = is_mono01 ? 0 : col; - bg = is_mono01 ? col : 0; - } - - return is_fg ? 
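
The attr_fgcol()/attr_bgcol() macros above pull 4-bit colors out of a text cell whose low bits hold the glyph; with 256 glyphs the shifts are 8 and 12, and once bit 8 joins the glyph index (vc_hi_font_mask set) they move up to 9 and 13, which is where the "? 13 : 12" in the rotation code below comes from. In isolation (illustrative):

#include <stdio.h>

static int attr_fgcol(int fgshift, unsigned short s)
{
        return (s >> fgshift) & 0x0f;
}

static int attr_bgcol(int bgshift, unsigned short s)
{
        return (s >> bgshift) & 0x0f;
}

int main(void)
{
        unsigned short cell = 0x1741;   /* glyph 0x41, fg 7, bg 1 */

        /* 256-glyph font: colors sit at bits 8 and 12 */
        printf("fg %d bg %d\n", attr_fgcol(8, cell), attr_bgcol(12, cell));
        return 0;
}
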
fg : bg; -} - -#define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) -#define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) - -/* Font */ -#define REFCOUNT(fd) (((int *)(fd))[-1]) -#define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTCHARCNT(fd) (((int *)(fd))[-3]) -#define FNTSUM(fd) (((int *)(fd))[-4]) -#define FONT_EXTRA_WORDS 4 - - /* - * Scroll Method - */ - -/* There are several methods fbcon can use to move text around the screen: - * - * Operation Pan Wrap - *--------------------------------------------- - * SCROLL_MOVE copyarea No No - * SCROLL_PAN_MOVE copyarea Yes No - * SCROLL_WRAP_MOVE copyarea No Yes - * SCROLL_REDRAW imageblit No No - * SCROLL_PAN_REDRAW imageblit Yes No - * SCROLL_WRAP_REDRAW imageblit No Yes - * - * (SCROLL_WRAP_REDRAW is not implemented yet) - * - * In general, fbcon will choose the best scrolling - * method based on the rule below: - * - * Pan/Wrap > accel imageblit > accel copyarea > - * soft imageblit > (soft copyarea) - * - * Exception to the rule: Pan + accel copyarea is - * preferred over Pan + accel imageblit. - * - * The above is typical for PCI/AGP cards. Unless - * overridden, fbcon will never use soft copyarea. - * - * If you need to override the above rule, set the - * appropriate flags in fb_info->flags. For example, - * to prefer copyarea over imageblit, set - * FBINFO_READS_FAST. - * - * Other notes: - * + use the hardware engine to move the text - * (hw-accelerated copyarea() and fillrect()) - * + use hardware-supported panning on a large virtual screen - * + amifb can not only pan, but also wrap the display by N lines - * (i.e. visible line i = physical line (i+N) % yres). - * + read what's already rendered on the screen and - * write it in a different place (this is cfb_copyarea()) - * + re-render the text to the screen - * - * Whether to use wrapping or panning can only be figured out at - * runtime (when we know whether our font height is a multiple - * of the pan/wrap step) - * - */ - -#define SCROLL_MOVE 0x001 -#define SCROLL_PAN_MOVE 0x002 -#define SCROLL_WRAP_MOVE 0x003 -#define SCROLL_REDRAW 0x004 -#define SCROLL_PAN_REDRAW 0x005 - -#ifdef CONFIG_FB_TILEBLITTING -extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); -#endif -extern void fbcon_set_bitops(struct fbcon_ops *ops); -extern int soft_cursor(struct fb_info *info, struct fb_cursor *cursor); - -#define FBCON_ATTRIBUTE_UNDERLINE 1 -#define FBCON_ATTRIBUTE_REVERSE 2 -#define FBCON_ATTRIBUTE_BOLD 4 - -static inline int real_y(struct display *p, int ypos) -{ - int rows = p->vrows; - - ypos += p->yscroll; - return ypos < rows ? ypos : ypos - rows; -} - - -static inline int get_attribute(struct fb_info *info, u16 c) -{ - int attribute = 0; - - if (fb_get_color_depth(&info->var, &info->fix) == 1) { - if (attr_underline(c)) - attribute |= FBCON_ATTRIBUTE_UNDERLINE; - if (attr_reverse(c)) - attribute |= FBCON_ATTRIBUTE_REVERSE; - if (attr_bold(c)) - attribute |= FBCON_ATTRIBUTE_BOLD; - } - - return attribute; -} - -#define FBCON_SWAP(i,r,v) ({ \ - typeof(r) _r = (r); \ - typeof(v) _v = (v); \ - (void) (&_r == &_v); \ - (i == FB_ROTATE_UR || i == FB_ROTATE_UD) ? 
_r : _v; }) - -#ifdef CONFIG_FRAMEBUFFER_CONSOLE_ROTATION -extern void fbcon_set_rotate(struct fbcon_ops *ops); -#else -#define fbcon_set_rotate(x) do {} while(0) -#endif /* CONFIG_FRAMEBUFFER_CONSOLE_ROTATION */ - -#endif /* _VIDEO_FBCON_H */ - diff --git a/drivers/video/console/fbcon_ccw.c b/drivers/video/console/fbcon_ccw.c deleted file mode 100644 index 5a3cbf6dff4d..000000000000 --- a/drivers/video/console/fbcon_ccw.c +++ /dev/null @@ -1,424 +0,0 @@ -/* - * linux/drivers/video/console/fbcon_ccw.c -- Software Rotation - 270 degrees - * - * Copyright (C) 2005 Antonino Daplas - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "fbcon.h" -#include "fbcon_rotate.h" - -/* - * Rotation 270 degrees - */ - -static void ccw_update_attr(u8 *dst, u8 *src, int attribute, - struct vc_data *vc) -{ - int i, j, offset = (vc->vc_font.height < 10) ? 1 : 2; - int width = (vc->vc_font.height + 7) >> 3; - int mod = vc->vc_font.height % 8; - u8 c, msk = ~(0xff << offset), msk1 = 0; - - if (mod) - msk <<= (8 - mod); - - if (offset > mod) - msk1 |= 0x01; - - for (i = 0; i < vc->vc_font.width; i++) { - for (j = 0; j < width; j++) { - c = *src; - - if (attribute & FBCON_ATTRIBUTE_UNDERLINE) { - if (j == width - 1) - c |= msk; - - if (msk1 && j == width - 2) - c |= msk1; - } - - if (attribute & FBCON_ATTRIBUTE_BOLD && i) - *(dst - width) |= c; - - if (attribute & FBCON_ATTRIBUTE_REVERSE) - c = ~c; - src++; - *dst++ = c; - } - } -} - - -static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int dy, int dx, int height, int width) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct fb_copyarea area; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - - area.sx = sy * vc->vc_font.height; - area.sy = vyres - ((sx + width) * vc->vc_font.width); - area.dx = dy * vc->vc_font.height; - area.dy = vyres - ((dx + width) * vc->vc_font.width); - area.width = height * vc->vc_font.height; - area.height = width * vc->vc_font.width; - - info->fbops->fb_copyarea(info, &area); -} - -static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct fb_fillrect region; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - - region.color = attr_bgcol_ec(bgshift,vc,info); - region.dx = sy * vc->vc_font.height; - region.dy = vyres - ((sx + width) * vc->vc_font.width); - region.height = width * vc->vc_font.width; - region.width = height * vc->vc_font.height; - region.rop = ROP_COPY; - - info->fbops->fb_fillrect(info, ®ion); -} - -static inline void ccw_putcs_aligned(struct vc_data *vc, struct fb_info *info, - const u16 *s, u32 attr, u32 cnt, - u32 d_pitch, u32 s_pitch, u32 cellsize, - struct fb_image *image, u8 *buf, u8 *dst) -{ - struct fbcon_ops *ops = info->fbcon_par; - u16 charmask = vc->vc_hi_font_mask ? 
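
ccw_bmove(), ccw_clear() and ccw_putcs() below all apply the same 270-degree coordinate transform: the axes swap, and the rotated y is measured down from the bottom of the virtual screen (vyres). The mapping for a single text cell, as a standalone sketch:

#include <stdio.h>

static void ccw_cell_origin(int col, int row, int fw, int fh, int vyres,
                            int *dx, int *dy)
{
        *dx = row * fh;                 /* rows become the x axis */
        *dy = vyres - (col + 1) * fw;   /* columns run up from the bottom */
}

int main(void)
{
        int dx, dy;

        ccw_cell_origin(0, 0, 8, 16, 1024, &dx, &dy);
        printf("cell (0,0) -> (%d,%d)\n", dx, dy);      /* (0,1016) */
        return 0;
}
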
0x1ff : 0xff; - u32 idx = (vc->vc_font.height + 7) >> 3; - u8 *src; - - while (cnt--) { - src = ops->fontbuffer + (scr_readw(s--) & charmask)*cellsize; - - if (attr) { - ccw_update_attr(buf, src, attr, vc); - src = buf; - } - - if (likely(idx == 1)) - __fb_pad_aligned_buffer(dst, d_pitch, src, idx, - vc->vc_font.width); - else - fb_pad_aligned_buffer(dst, d_pitch, src, idx, - vc->vc_font.width); - - dst += d_pitch * vc->vc_font.width; - } - - info->fbops->fb_imageblit(info, image); -} - -static void ccw_putcs(struct vc_data *vc, struct fb_info *info, - const unsigned short *s, int count, int yy, int xx, - int fg, int bg) -{ - struct fb_image image; - struct fbcon_ops *ops = info->fbcon_par; - u32 width = (vc->vc_font.height + 7)/8; - u32 cellsize = width * vc->vc_font.width; - u32 maxcnt = info->pixmap.size/cellsize; - u32 scan_align = info->pixmap.scan_align - 1; - u32 buf_align = info->pixmap.buf_align - 1; - u32 cnt, pitch, size; - u32 attribute = get_attribute(info, scr_readw(s)); - u8 *dst, *buf = NULL; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - - if (!ops->fontbuffer) - return; - - image.fg_color = fg; - image.bg_color = bg; - image.dx = yy * vc->vc_font.height; - image.dy = vyres - ((xx + count) * vc->vc_font.width); - image.width = vc->vc_font.height; - image.depth = 1; - - if (attribute) { - buf = kmalloc(cellsize, GFP_KERNEL); - if (!buf) - return; - } - - s += count - 1; - - while (count) { - if (count > maxcnt) - cnt = maxcnt; - else - cnt = count; - - image.height = vc->vc_font.width * cnt; - pitch = ((image.width + 7) >> 3) + scan_align; - pitch &= ~scan_align; - size = pitch * image.height + buf_align; - size &= ~buf_align; - dst = fb_get_buffer_offset(info, &info->pixmap, size); - image.data = dst; - ccw_putcs_aligned(vc, info, s, attribute, cnt, pitch, - width, cellsize, &image, buf, dst); - image.dy += image.height; - count -= cnt; - s -= cnt; - } - - /* buf is always NULL except when in monochrome mode, so in this case - it's a gain to check buf against NULL even though kfree() handles - NULL pointers just fine */ - if (unlikely(buf)) - kfree(buf); - -} - -static void ccw_clear_margins(struct vc_data *vc, struct fb_info *info, - int bottom_only) -{ - unsigned int cw = vc->vc_font.width; - unsigned int ch = vc->vc_font.height; - unsigned int rw = info->var.yres - (vc->vc_cols*cw); - unsigned int bh = info->var.xres - (vc->vc_rows*ch); - unsigned int bs = vc->vc_rows*ch; - struct fb_fillrect region; - - region.color = 0; - region.rop = ROP_COPY; - - if (rw && !bottom_only) { - region.dx = 0; - region.dy = info->var.yoffset; - region.height = rw; - region.width = info->var.xres_virtual; - info->fbops->fb_fillrect(info, ®ion); - } - - if (bh) { - region.dx = info->var.xoffset + bs; - region.dy = 0; - region.height = info->var.yres_virtual; - region.width = bh; - info->fbops->fb_fillrect(info, ®ion); - } -} - -static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) -{ - struct fb_cursor cursor; - struct fbcon_ops *ops = info->fbcon_par; - unsigned short charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; - int w = (vc->vc_font.height + 7) >> 3, c; - int y = real_y(ops->p, vc->vc_y); - int attribute, use_sw = (vc->vc_cursor_type & 0x10); - int err = 1, dx, dy; - char *src; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - - if (!ops->fontbuffer) - return; - - cursor.set = 0; - - if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - - c = scr_readw((u16 *) vc->vc_pos); - attribute = get_attribute(info, c); - src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); - - if (ops->cursor_state.image.data != src || - ops->cursor_reset) { - ops->cursor_state.image.data = src; - cursor.set |= FB_CUR_SETIMAGE; - } - - if (attribute) { - u8 *dst; - - dst = kmalloc(w * vc->vc_font.width, GFP_ATOMIC); - if (!dst) - return; - kfree(ops->cursor_data); - ops->cursor_data = dst; - ccw_update_attr(dst, src, attribute, vc); - src = dst; - } - - if (ops->cursor_state.image.fg_color != fg || - ops->cursor_state.image.bg_color != bg || - ops->cursor_reset) { - ops->cursor_state.image.fg_color = fg; - ops->cursor_state.image.bg_color = bg; - cursor.set |= FB_CUR_SETCMAP; - } - - if (ops->cursor_state.image.height != vc->vc_font.width || - ops->cursor_state.image.width != vc->vc_font.height || - ops->cursor_reset) { - ops->cursor_state.image.height = vc->vc_font.width; - ops->cursor_state.image.width = vc->vc_font.height; - cursor.set |= FB_CUR_SETSIZE; - } - - dx = y * vc->vc_font.height; - dy = vyres - ((vc->vc_x + 1) * vc->vc_font.width); - - if (ops->cursor_state.image.dx != dx || - ops->cursor_state.image.dy != dy || - ops->cursor_reset) { - ops->cursor_state.image.dx = dx; - ops->cursor_state.image.dy = dy; - cursor.set |= FB_CUR_SETPOS; - } - - if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || - ops->cursor_reset) { - ops->cursor_state.hot.x = cursor.hot.y = 0; - cursor.set |= FB_CUR_SETHOT; - } - - if (cursor.set & FB_CUR_SETSIZE || - vc->vc_cursor_type != ops->p->cursor_shape || - ops->cursor_state.mask == NULL || - ops->cursor_reset) { - char *tmp, *mask = kmalloc(w*vc->vc_font.width, GFP_ATOMIC); - int cur_height, size, i = 0; - int width = (vc->vc_font.width + 7)/8; - - if (!mask) - return; - - tmp = kmalloc(width * vc->vc_font.height, GFP_ATOMIC); - - if (!tmp) { - kfree(mask); - return; - } - - kfree(ops->cursor_state.mask); - ops->cursor_state.mask = mask; - - ops->p->cursor_shape = vc->vc_cursor_type; - cursor.set |= FB_CUR_SETSHAPE; - - switch (ops->p->cursor_shape & CUR_HWMASK) { - case CUR_NONE: - cur_height = 0; - break; - case CUR_UNDERLINE: - cur_height = (vc->vc_font.height < 10) ? 1 : 2; - break; - case CUR_LOWER_THIRD: - cur_height = vc->vc_font.height/3; - break; - case CUR_LOWER_HALF: - cur_height = vc->vc_font.height >> 1; - break; - case CUR_TWO_THIRDS: - cur_height = (vc->vc_font.height << 1)/3; - break; - case CUR_BLOCK: - default: - cur_height = vc->vc_font.height; - break; - } - - size = (vc->vc_font.height - cur_height) * width; - while (size--) - tmp[i++] = 0; - size = cur_height * width; - while (size--) - tmp[i++] = 0xff; - memset(mask, 0, w * vc->vc_font.width); - rotate_ccw(tmp, mask, vc->vc_font.width, vc->vc_font.height); - kfree(tmp); - } - - switch (mode) { - case CM_ERASE: - ops->cursor_state.enable = 0; - break; - case CM_DRAW: - case CM_MOVE: - default: - ops->cursor_state.enable = (use_sw) ? 
0 : 1; - break; - } - - cursor.image.data = src; - cursor.image.fg_color = ops->cursor_state.image.fg_color; - cursor.image.bg_color = ops->cursor_state.image.bg_color; - cursor.image.dx = ops->cursor_state.image.dx; - cursor.image.dy = ops->cursor_state.image.dy; - cursor.image.height = ops->cursor_state.image.height; - cursor.image.width = ops->cursor_state.image.width; - cursor.hot.x = ops->cursor_state.hot.x; - cursor.hot.y = ops->cursor_state.hot.y; - cursor.mask = ops->cursor_state.mask; - cursor.enable = ops->cursor_state.enable; - cursor.image.depth = 1; - cursor.rop = ROP_XOR; - - if (info->fbops->fb_cursor) - err = info->fbops->fb_cursor(info, &cursor); - - if (err) - soft_cursor(info, &cursor); - - ops->cursor_reset = 0; -} - -static int ccw_update_start(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - u32 yoffset; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - int err; - - yoffset = (vyres - info->var.yres) - ops->var.xoffset; - ops->var.xoffset = ops->var.yoffset; - ops->var.yoffset = yoffset; - err = fb_pan_display(info, &ops->var); - ops->var.xoffset = info->var.xoffset; - ops->var.yoffset = info->var.yoffset; - ops->var.vmode = info->var.vmode; - return err; -} - -void fbcon_rotate_ccw(struct fbcon_ops *ops) -{ - ops->bmove = ccw_bmove; - ops->clear = ccw_clear; - ops->putcs = ccw_putcs; - ops->clear_margins = ccw_clear_margins; - ops->cursor = ccw_cursor; - ops->update_start = ccw_update_start; -} -EXPORT_SYMBOL(fbcon_rotate_ccw); - -MODULE_AUTHOR("Antonino Daplas "); -MODULE_DESCRIPTION("Console Rotation (270 degrees) Support"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/console/fbcon_cw.c b/drivers/video/console/fbcon_cw.c deleted file mode 100644 index e7ee44db4e98..000000000000 --- a/drivers/video/console/fbcon_cw.c +++ /dev/null @@ -1,407 +0,0 @@ -/* - * linux/drivers/video/console/fbcon_ud.c -- Software Rotation - 90 degrees - * - * Copyright (C) 2005 Antonino Daplas - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "fbcon.h" -#include "fbcon_rotate.h" - -/* - * Rotation 90 degrees - */ - -static void cw_update_attr(u8 *dst, u8 *src, int attribute, - struct vc_data *vc) -{ - int i, j, offset = (vc->vc_font.height < 10) ? 
1 : 2; - int width = (vc->vc_font.height + 7) >> 3; - u8 c, msk = ~(0xff >> offset); - - for (i = 0; i < vc->vc_font.width; i++) { - for (j = 0; j < width; j++) { - c = *src; - if (attribute & FBCON_ATTRIBUTE_UNDERLINE && !j) - c |= msk; - if (attribute & FBCON_ATTRIBUTE_BOLD && i) - c |= *(src-width); - if (attribute & FBCON_ATTRIBUTE_REVERSE) - c = ~c; - src++; - *dst++ = c; - } - } -} - - -static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int dy, int dx, int height, int width) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct fb_copyarea area; - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - area.sx = vxres - ((sy + height) * vc->vc_font.height); - area.sy = sx * vc->vc_font.width; - area.dx = vxres - ((dy + height) * vc->vc_font.height); - area.dy = dx * vc->vc_font.width; - area.width = height * vc->vc_font.height; - area.height = width * vc->vc_font.width; - - info->fbops->fb_copyarea(info, &area); -} - -static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct fb_fillrect region; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - region.color = attr_bgcol_ec(bgshift,vc,info); - region.dx = vxres - ((sy + height) * vc->vc_font.height); - region.dy = sx * vc->vc_font.width; - region.height = width * vc->vc_font.width; - region.width = height * vc->vc_font.height; - region.rop = ROP_COPY; - - info->fbops->fb_fillrect(info, ®ion); -} - -static inline void cw_putcs_aligned(struct vc_data *vc, struct fb_info *info, - const u16 *s, u32 attr, u32 cnt, - u32 d_pitch, u32 s_pitch, u32 cellsize, - struct fb_image *image, u8 *buf, u8 *dst) -{ - struct fbcon_ops *ops = info->fbcon_par; - u16 charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; - u32 idx = (vc->vc_font.height + 7) >> 3; - u8 *src; - - while (cnt--) { - src = ops->fontbuffer + (scr_readw(s++) & charmask)*cellsize; - - if (attr) { - cw_update_attr(buf, src, attr, vc); - src = buf; - } - - if (likely(idx == 1)) - __fb_pad_aligned_buffer(dst, d_pitch, src, idx, - vc->vc_font.width); - else - fb_pad_aligned_buffer(dst, d_pitch, src, idx, - vc->vc_font.width); - - dst += d_pitch * vc->vc_font.width; - } - - info->fbops->fb_imageblit(info, image); -} - -static void cw_putcs(struct vc_data *vc, struct fb_info *info, - const unsigned short *s, int count, int yy, int xx, - int fg, int bg) -{ - struct fb_image image; - struct fbcon_ops *ops = info->fbcon_par; - u32 width = (vc->vc_font.height + 7)/8; - u32 cellsize = width * vc->vc_font.width; - u32 maxcnt = info->pixmap.size/cellsize; - u32 scan_align = info->pixmap.scan_align - 1; - u32 buf_align = info->pixmap.buf_align - 1; - u32 cnt, pitch, size; - u32 attribute = get_attribute(info, scr_readw(s)); - u8 *dst, *buf = NULL; - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - if (!ops->fontbuffer) - return; - - image.fg_color = fg; - image.bg_color = bg; - image.dx = vxres - ((yy + 1) * vc->vc_font.height); - image.dy = xx * vc->vc_font.width; - image.width = vc->vc_font.height; - image.depth = 1; - - if (attribute) { - buf = kmalloc(cellsize, GFP_KERNEL); - if (!buf) - return; - } - - while (count) { - if (count > maxcnt) - cnt = maxcnt; - else - cnt = count; - - image.height = vc->vc_font.width * cnt; - pitch = ((image.width + 7) >> 3) + scan_align; - pitch &= ~scan_align; - size = pitch * image.height + buf_align; - size &= ~buf_align; - dst = fb_get_buffer_offset(info, &info->pixmap, size); - image.data = dst; - cw_putcs_aligned(vc, info, s, attribute, cnt, pitch, - width, cellsize, &image, buf, dst); - image.dy += image.height; - count -= cnt; - s += cnt; - } - - /* buf is always NULL except when in monochrome mode, so in this case - it's a gain to check buf against NULL even though kfree() handles - NULL pointers just fine */ - if (unlikely(buf)) - kfree(buf); - -} - -static void cw_clear_margins(struct vc_data *vc, struct fb_info *info, - int bottom_only) -{ - unsigned int cw = vc->vc_font.width; - unsigned int ch = vc->vc_font.height; - unsigned int rw = info->var.yres - (vc->vc_cols*cw); - unsigned int bh = info->var.xres - (vc->vc_rows*ch); - unsigned int rs = info->var.yres - rw; - struct fb_fillrect region; - - region.color = 0; - region.rop = ROP_COPY; - - if (rw && !bottom_only) { - region.dx = 0; - region.dy = info->var.yoffset + rs; - region.height = rw; - region.width = info->var.xres_virtual; - info->fbops->fb_fillrect(info, ®ion); - } - - if (bh) { - region.dx = info->var.xoffset; - region.dy = info->var.yoffset; - region.height = info->var.yres; - region.width = bh; - info->fbops->fb_fillrect(info, ®ion); - } -} - -static void cw_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) -{ - struct fb_cursor cursor; - struct fbcon_ops *ops = info->fbcon_par; - unsigned short charmask = vc->vc_hi_font_mask ? 
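/*
 * Editor's aside (sketch, not part of the patch): every putcs/cursor
 * routine in this patch shares the same glyph lookup. The console
 * keeps attribute bits next to the character code in each screen word,
 * so the glyph index is masked to 8 bits for a 256-glyph font, or 9
 * bits when vc_hi_font_mask signals a 512-glyph font. Standalone
 * model:
 */
#include <stdint.h>

static const uint8_t *glyph_bitmap(const uint8_t *fontbuffer,
                                   uint16_t screen_word, int font_512,
                                   unsigned int cellsize)
{
        uint16_t charmask = font_512 ? 0x1ff : 0xff;

        return fontbuffer + (screen_word & charmask) * cellsize;
}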
0x1ff : 0xff; - int w = (vc->vc_font.height + 7) >> 3, c; - int y = real_y(ops->p, vc->vc_y); - int attribute, use_sw = (vc->vc_cursor_type & 0x10); - int err = 1, dx, dy; - char *src; - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - if (!ops->fontbuffer) - return; - - cursor.set = 0; - - if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - - c = scr_readw((u16 *) vc->vc_pos); - attribute = get_attribute(info, c); - src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); - - if (ops->cursor_state.image.data != src || - ops->cursor_reset) { - ops->cursor_state.image.data = src; - cursor.set |= FB_CUR_SETIMAGE; - } - - if (attribute) { - u8 *dst; - - dst = kmalloc(w * vc->vc_font.width, GFP_ATOMIC); - if (!dst) - return; - kfree(ops->cursor_data); - ops->cursor_data = dst; - cw_update_attr(dst, src, attribute, vc); - src = dst; - } - - if (ops->cursor_state.image.fg_color != fg || - ops->cursor_state.image.bg_color != bg || - ops->cursor_reset) { - ops->cursor_state.image.fg_color = fg; - ops->cursor_state.image.bg_color = bg; - cursor.set |= FB_CUR_SETCMAP; - } - - if (ops->cursor_state.image.height != vc->vc_font.width || - ops->cursor_state.image.width != vc->vc_font.height || - ops->cursor_reset) { - ops->cursor_state.image.height = vc->vc_font.width; - ops->cursor_state.image.width = vc->vc_font.height; - cursor.set |= FB_CUR_SETSIZE; - } - - dx = vxres - ((y * vc->vc_font.height) + vc->vc_font.height); - dy = vc->vc_x * vc->vc_font.width; - - if (ops->cursor_state.image.dx != dx || - ops->cursor_state.image.dy != dy || - ops->cursor_reset) { - ops->cursor_state.image.dx = dx; - ops->cursor_state.image.dy = dy; - cursor.set |= FB_CUR_SETPOS; - } - - if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || - ops->cursor_reset) { - ops->cursor_state.hot.x = cursor.hot.y = 0; - cursor.set |= FB_CUR_SETHOT; - } - - if (cursor.set & FB_CUR_SETSIZE || - vc->vc_cursor_type != ops->p->cursor_shape || - ops->cursor_state.mask == NULL || - ops->cursor_reset) { - char *tmp, *mask = kmalloc(w*vc->vc_font.width, GFP_ATOMIC); - int cur_height, size, i = 0; - int width = (vc->vc_font.width + 7)/8; - - if (!mask) - return; - - tmp = kmalloc(width * vc->vc_font.height, GFP_ATOMIC); - - if (!tmp) { - kfree(mask); - return; - } - - kfree(ops->cursor_state.mask); - ops->cursor_state.mask = mask; - - ops->p->cursor_shape = vc->vc_cursor_type; - cursor.set |= FB_CUR_SETSHAPE; - - switch (ops->p->cursor_shape & CUR_HWMASK) { - case CUR_NONE: - cur_height = 0; - break; - case CUR_UNDERLINE: - cur_height = (vc->vc_font.height < 10) ? 1 : 2; - break; - case CUR_LOWER_THIRD: - cur_height = vc->vc_font.height/3; - break; - case CUR_LOWER_HALF: - cur_height = vc->vc_font.height >> 1; - break; - case CUR_TWO_THIRDS: - cur_height = (vc->vc_font.height << 1)/3; - break; - case CUR_BLOCK: - default: - cur_height = vc->vc_font.height; - break; - } - - size = (vc->vc_font.height - cur_height) * width; - while (size--) - tmp[i++] = 0; - size = cur_height * width; - while (size--) - tmp[i++] = 0xff; - memset(mask, 0, w * vc->vc_font.width); - rotate_cw(tmp, mask, vc->vc_font.width, vc->vc_font.height); - kfree(tmp); - } - - switch (mode) { - case CM_ERASE: - ops->cursor_state.enable = 0; - break; - case CM_DRAW: - case CM_MOVE: - default: - ops->cursor_state.enable = (use_sw) ? 
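/*
 * Editor's aside (sketch, not part of the patch): cw_cursor() above
 * composes the cursor stencil in the font's native orientation, blank
 * rows first and then 0xff rows for the cursor body, and only then
 * runs it through rotate_cw(), so one rotation helper serves glyphs
 * and cursor masks alike. Model of the unrotated stencil:
 */
#include <string.h>

static void build_cursor_stencil(unsigned char *tmp, int font_height,
                                 int cur_height, int width_bytes)
{
        int blank = (font_height - cur_height) * width_bytes;

        memset(tmp, 0x00, blank);                            /* rows left alone */
        memset(tmp + blank, 0xff, cur_height * width_bytes); /* cursor body */
}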
0 : 1; - break; - } - - cursor.image.data = src; - cursor.image.fg_color = ops->cursor_state.image.fg_color; - cursor.image.bg_color = ops->cursor_state.image.bg_color; - cursor.image.dx = ops->cursor_state.image.dx; - cursor.image.dy = ops->cursor_state.image.dy; - cursor.image.height = ops->cursor_state.image.height; - cursor.image.width = ops->cursor_state.image.width; - cursor.hot.x = ops->cursor_state.hot.x; - cursor.hot.y = ops->cursor_state.hot.y; - cursor.mask = ops->cursor_state.mask; - cursor.enable = ops->cursor_state.enable; - cursor.image.depth = 1; - cursor.rop = ROP_XOR; - - if (info->fbops->fb_cursor) - err = info->fbops->fb_cursor(info, &cursor); - - if (err) - soft_cursor(info, &cursor); - - ops->cursor_reset = 0; -} - -static int cw_update_start(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - u32 vxres = GETVXRES(ops->p->scrollmode, info); - u32 xoffset; - int err; - - xoffset = vxres - (info->var.xres + ops->var.yoffset); - ops->var.yoffset = ops->var.xoffset; - ops->var.xoffset = xoffset; - err = fb_pan_display(info, &ops->var); - ops->var.xoffset = info->var.xoffset; - ops->var.yoffset = info->var.yoffset; - ops->var.vmode = info->var.vmode; - return err; -} - -void fbcon_rotate_cw(struct fbcon_ops *ops) -{ - ops->bmove = cw_bmove; - ops->clear = cw_clear; - ops->putcs = cw_putcs; - ops->clear_margins = cw_clear_margins; - ops->cursor = cw_cursor; - ops->update_start = cw_update_start; -} -EXPORT_SYMBOL(fbcon_rotate_cw); - -MODULE_AUTHOR("Antonino Daplas "); -MODULE_DESCRIPTION("Console Rotation (90 degrees) Support"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/console/fbcon_rotate.c b/drivers/video/console/fbcon_rotate.c deleted file mode 100644 index db6528f2d3f2..000000000000 --- a/drivers/video/console/fbcon_rotate.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * linux/drivers/video/console/fbcon_rotate.c -- Software Rotation - * - * Copyright (C) 2005 Antonino Daplas - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "fbcon.h" -#include "fbcon_rotate.h" - -static int fbcon_rotate_font(struct fb_info *info, struct vc_data *vc) -{ - struct fbcon_ops *ops = info->fbcon_par; - int len, err = 0; - int s_cellsize, d_cellsize, i; - const u8 *src; - u8 *dst; - - if (vc->vc_font.data == ops->fontdata && - ops->p->con_rotate == ops->cur_rotate) - goto finished; - - src = ops->fontdata = vc->vc_font.data; - ops->cur_rotate = ops->p->con_rotate; - len = (!ops->p->userfont) ? 
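/*
 * Editor's aside (sketch, not part of the patch): cw_update_start()
 * above is the clockwise counterpart of the pan transform, this time
 * mirroring the x offset against the virtual width. Standalone model:
 */
static void cw_pan_transform(unsigned int vxres, unsigned int xres,
                             unsigned int *xoffset, unsigned int *yoffset)
{
        unsigned int new_x = vxres - (xres + *yoffset);

        *yoffset = *xoffset;    /* axes swap under the quarter turn */
        *xoffset = new_x;       /* column offset mirrored against vxres */
}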
256 : FNTCHARCNT(src); - s_cellsize = ((vc->vc_font.width + 7)/8) * - vc->vc_font.height; - d_cellsize = s_cellsize; - - if (ops->rotate == FB_ROTATE_CW || - ops->rotate == FB_ROTATE_CCW) - d_cellsize = ((vc->vc_font.height + 7)/8) * - vc->vc_font.width; - - if (info->fbops->fb_sync) - info->fbops->fb_sync(info); - - if (ops->fd_size < d_cellsize * len) { - dst = kmalloc(d_cellsize * len, GFP_KERNEL); - - if (dst == NULL) { - err = -ENOMEM; - goto finished; - } - - ops->fd_size = d_cellsize * len; - kfree(ops->fontbuffer); - ops->fontbuffer = dst; - } - - dst = ops->fontbuffer; - memset(dst, 0, ops->fd_size); - - switch (ops->rotate) { - case FB_ROTATE_UD: - for (i = len; i--; ) { - rotate_ud(src, dst, vc->vc_font.width, - vc->vc_font.height); - - src += s_cellsize; - dst += d_cellsize; - } - break; - case FB_ROTATE_CW: - for (i = len; i--; ) { - rotate_cw(src, dst, vc->vc_font.width, - vc->vc_font.height); - src += s_cellsize; - dst += d_cellsize; - } - break; - case FB_ROTATE_CCW: - for (i = len; i--; ) { - rotate_ccw(src, dst, vc->vc_font.width, - vc->vc_font.height); - src += s_cellsize; - dst += d_cellsize; - } - break; - } - -finished: - return err; -} - -void fbcon_set_rotate(struct fbcon_ops *ops) -{ - ops->rotate_font = fbcon_rotate_font; - - switch(ops->rotate) { - case FB_ROTATE_CW: - fbcon_rotate_cw(ops); - break; - case FB_ROTATE_UD: - fbcon_rotate_ud(ops); - break; - case FB_ROTATE_CCW: - fbcon_rotate_ccw(ops); - break; - } -} -EXPORT_SYMBOL(fbcon_set_rotate); - -MODULE_AUTHOR("Antonino Daplas "); -MODULE_DESCRIPTION("Console Rotation Support"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/console/fbcon_rotate.h b/drivers/video/console/fbcon_rotate.h deleted file mode 100644 index e233444cda66..000000000000 --- a/drivers/video/console/fbcon_rotate.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * linux/drivers/video/console/fbcon_rotate.h -- Software Display Rotation - * - * Copyright (C) 2005 Antonino Daplas - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. - */ - -#ifndef _FBCON_ROTATE_H -#define _FBCON_ROTATE_H - -#define GETVYRES(s,i) ({ \ - (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ - (i)->var.yres : (i)->var.yres_virtual; }) - -#define GETVXRES(s,i) ({ \ - (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? 
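/*
 * Editor's aside (sketch, not part of the patch): fbcon_rotate_font()
 * above caches pre-rotated glyphs in ops->fontbuffer. Glyph rows are
 * padded to whole bytes, so a quarter turn changes the per-glyph
 * footprint from ceil(width/8) * height to ceil(height/8) * width
 * bytes, and the cache is reallocated only when it must grow. Model of
 * the size computation:
 */
#include <stddef.h>

static size_t glyph_cellsize(unsigned int width, unsigned int height,
                             int quarter_turn)
{
        return quarter_turn ? ((height + 7) / 8) * (size_t)width
                            : ((width + 7) / 8) * (size_t)height;
}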
\ - (i)->var.xres : (i)->var.xres_virtual; }) - - -static inline int pattern_test_bit(u32 x, u32 y, u32 pitch, const char *pat) -{ - u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; - - pat +=index; - return (*pat) & (0x80 >> bit); -} - -static inline void pattern_set_bit(u32 x, u32 y, u32 pitch, char *pat) -{ - u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; - - pat += index; - - (*pat) |= 0x80 >> bit; -} - -static inline void rotate_ud(const char *in, char *out, u32 width, u32 height) -{ - int i, j; - int shift = (8 - (width % 8)) & 7; - - width = (width + 7) & ~7; - - for (i = 0; i < height; i++) { - for (j = 0; j < width - shift; j++) { - if (pattern_test_bit(j, i, width, in)) - pattern_set_bit(width - (1 + j + shift), - height - (1 + i), - width, out); - } - - } -} - -static inline void rotate_cw(const char *in, char *out, u32 width, u32 height) -{ - int i, j, h = height, w = width; - int shift = (8 - (height % 8)) & 7; - - width = (width + 7) & ~7; - height = (height + 7) & ~7; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - if (pattern_test_bit(j, i, width, in)) - pattern_set_bit(height - 1 - i - shift, j, - height, out); - - } - } -} - -static inline void rotate_ccw(const char *in, char *out, u32 width, u32 height) -{ - int i, j, h = height, w = width; - int shift = (8 - (width % 8)) & 7; - - width = (width + 7) & ~7; - height = (height + 7) & ~7; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - if (pattern_test_bit(j, i, width, in)) - pattern_set_bit(i, width - 1 - j - shift, - height, out); - } - } -} - -extern void fbcon_rotate_cw(struct fbcon_ops *ops); -extern void fbcon_rotate_ud(struct fbcon_ops *ops); -extern void fbcon_rotate_ccw(struct fbcon_ops *ops); -#endif diff --git a/drivers/video/console/fbcon_ud.c b/drivers/video/console/fbcon_ud.c deleted file mode 100644 index 19e3714abfe8..000000000000 --- a/drivers/video/console/fbcon_ud.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * linux/drivers/video/console/fbcon_ud.c -- Software Rotation - 180 degrees - * - * Copyright (C) 2005 Antonino Daplas - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include "fbcon.h" -#include "fbcon_rotate.h" - -/* - * Rotation 180 degrees - */ - -static void ud_update_attr(u8 *dst, u8 *src, int attribute, - struct vc_data *vc) -{ - int i, offset = (vc->vc_font.height < 10) ? 
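/*
 * Editor's aside (runnable model, not part of the patch): the
 * rotate_*() helpers above re-plot every set bit of a glyph at its
 * rotated position over byte-padded pitches; the `shift` term in the
 * originals merely cancels that padding. The coordinate maps are
 *   cw : (x, y) -> (h - 1 - y, x)
 *   ccw: (x, y) -> (y, w - 1 - x)
 *   ud : (x, y) -> (w - 1 - x, h - 1 - y)
 * Standalone sketch of the clockwise case with the padding folded away:
 */
#include <stdint.h>

static int test_bit_at(uint32_t x, uint32_t y, uint32_t pitch,
                       const uint8_t *pat)
{
        uint32_t bit = y * pitch + x;

        return pat[bit / 8] & (0x80 >> (bit % 8));
}

static void set_bit_at(uint32_t x, uint32_t y, uint32_t pitch, uint8_t *pat)
{
        uint32_t bit = y * pitch + x;

        pat[bit / 8] |= 0x80 >> (bit % 8);
}

static void rotate_cw_model(const uint8_t *in, uint8_t *out,
                            uint32_t w, uint32_t h)
{
        uint32_t in_pitch = (w + 7) & ~7u;      /* bits per padded source row */
        uint32_t out_pitch = (h + 7) & ~7u;     /* bits per padded dest row */
        uint32_t x, y;

        for (y = 0; y < h; y++)
                for (x = 0; x < w; x++)
                        if (test_bit_at(x, y, in_pitch, in))
                                set_bit_at(h - 1 - y, x, out_pitch, out);
}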
1 : 2; - int width = (vc->vc_font.width + 7) >> 3; - unsigned int cellsize = vc->vc_font.height * width; - u8 c; - - offset = offset * width; - - for (i = 0; i < cellsize; i++) { - c = src[i]; - if (attribute & FBCON_ATTRIBUTE_UNDERLINE && i < offset) - c = 0xff; - if (attribute & FBCON_ATTRIBUTE_BOLD) - c |= c << 1; - if (attribute & FBCON_ATTRIBUTE_REVERSE) - c = ~c; - dst[i] = c; - } -} - - -static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int dy, int dx, int height, int width) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct fb_copyarea area; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - area.sy = vyres - ((sy + height) * vc->vc_font.height); - area.sx = vxres - ((sx + width) * vc->vc_font.width); - area.dy = vyres - ((dy + height) * vc->vc_font.height); - area.dx = vxres - ((dx + width) * vc->vc_font.width); - area.height = height * vc->vc_font.height; - area.width = width * vc->vc_font.width; - - info->fbops->fb_copyarea(info, &area); -} - -static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) -{ - struct fbcon_ops *ops = info->fbcon_par; - struct fb_fillrect region; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - region.color = attr_bgcol_ec(bgshift,vc,info); - region.dy = vyres - ((sy + height) * vc->vc_font.height); - region.dx = vxres - ((sx + width) * vc->vc_font.width); - region.width = width * vc->vc_font.width; - region.height = height * vc->vc_font.height; - region.rop = ROP_COPY; - - info->fbops->fb_fillrect(info, ®ion); -} - -static inline void ud_putcs_aligned(struct vc_data *vc, struct fb_info *info, - const u16 *s, u32 attr, u32 cnt, - u32 d_pitch, u32 s_pitch, u32 cellsize, - struct fb_image *image, u8 *buf, u8 *dst) -{ - struct fbcon_ops *ops = info->fbcon_par; - u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; - u32 idx = vc->vc_font.width >> 3; - u8 *src; - - while (cnt--) { - src = ops->fontbuffer + (scr_readw(s--) & charmask)*cellsize; - - if (attr) { - ud_update_attr(buf, src, attr, vc); - src = buf; - } - - if (likely(idx == 1)) - __fb_pad_aligned_buffer(dst, d_pitch, src, idx, - image->height); - else - fb_pad_aligned_buffer(dst, d_pitch, src, idx, - image->height); - - dst += s_pitch; - } - - info->fbops->fb_imageblit(info, image); -} - -static inline void ud_putcs_unaligned(struct vc_data *vc, - struct fb_info *info, const u16 *s, - u32 attr, u32 cnt, u32 d_pitch, - u32 s_pitch, u32 cellsize, - struct fb_image *image, u8 *buf, - u8 *dst) -{ - struct fbcon_ops *ops = info->fbcon_par; - u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; - u32 shift_low = 0, mod = vc->vc_font.width % 8; - u32 shift_high = 8; - u32 idx = vc->vc_font.width >> 3; - u8 *src; - - while (cnt--) { - src = ops->fontbuffer + (scr_readw(s--) & charmask)*cellsize; - - if (attr) { - ud_update_attr(buf, src, attr, vc); - src = buf; - } - - fb_pad_unaligned_buffer(dst, d_pitch, src, idx, - image->height, shift_high, - shift_low, mod); - shift_low += mod; - dst += (shift_low >= 8) ? 
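/*
 * Editor's aside (sketch, not part of the patch): when the font width
 * is not a multiple of 8, consecutive glyphs start at growing bit
 * offsets inside the destination; ud_putcs_unaligned() above tracks
 * that offset in shift_low/shift_high and consumes a full byte
 * whenever it wraps. The bookkeeping in isolation:
 */
static void advance_unaligned(unsigned int font_width, unsigned int s_pitch,
                              unsigned int *shift_low,
                              unsigned int *shift_high, unsigned char **dst)
{
        unsigned int mod = font_width % 8;

        *shift_low += mod;
        *dst += (*shift_low >= 8) ? s_pitch : s_pitch - 1;
        *shift_low &= 7;
        *shift_high = 8 - *shift_low;
}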
s_pitch : s_pitch - 1; - shift_low &= 7; - shift_high = 8 - shift_low; - } - - info->fbops->fb_imageblit(info, image); - -} - -static void ud_putcs(struct vc_data *vc, struct fb_info *info, - const unsigned short *s, int count, int yy, int xx, - int fg, int bg) -{ - struct fb_image image; - struct fbcon_ops *ops = info->fbcon_par; - u32 width = (vc->vc_font.width + 7)/8; - u32 cellsize = width * vc->vc_font.height; - u32 maxcnt = info->pixmap.size/cellsize; - u32 scan_align = info->pixmap.scan_align - 1; - u32 buf_align = info->pixmap.buf_align - 1; - u32 mod = vc->vc_font.width % 8, cnt, pitch, size; - u32 attribute = get_attribute(info, scr_readw(s)); - u8 *dst, *buf = NULL; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - if (!ops->fontbuffer) - return; - - image.fg_color = fg; - image.bg_color = bg; - image.dy = vyres - ((yy * vc->vc_font.height) + vc->vc_font.height); - image.dx = vxres - ((xx + count) * vc->vc_font.width); - image.height = vc->vc_font.height; - image.depth = 1; - - if (attribute) { - buf = kmalloc(cellsize, GFP_KERNEL); - if (!buf) - return; - } - - s += count - 1; - - while (count) { - if (count > maxcnt) - cnt = maxcnt; - else - cnt = count; - - image.width = vc->vc_font.width * cnt; - pitch = ((image.width + 7) >> 3) + scan_align; - pitch &= ~scan_align; - size = pitch * image.height + buf_align; - size &= ~buf_align; - dst = fb_get_buffer_offset(info, &info->pixmap, size); - image.data = dst; - - if (!mod) - ud_putcs_aligned(vc, info, s, attribute, cnt, pitch, - width, cellsize, &image, buf, dst); - else - ud_putcs_unaligned(vc, info, s, attribute, cnt, pitch, - width, cellsize, &image, - buf, dst); - - image.dx += image.width; - count -= cnt; - s -= cnt; - xx += cnt; - } - - /* buf is always NULL except when in monochrome mode, so in this case - it's a gain to check buf against NULL even though kfree() handles - NULL pointers just fine */ - if (unlikely(buf)) - kfree(buf); - -} - -static void ud_clear_margins(struct vc_data *vc, struct fb_info *info, - int bottom_only) -{ - unsigned int cw = vc->vc_font.width; - unsigned int ch = vc->vc_font.height; - unsigned int rw = info->var.xres - (vc->vc_cols*cw); - unsigned int bh = info->var.yres - (vc->vc_rows*ch); - struct fb_fillrect region; - - region.color = 0; - region.rop = ROP_COPY; - - if (rw && !bottom_only) { - region.dy = 0; - region.dx = info->var.xoffset; - region.width = rw; - region.height = info->var.yres_virtual; - info->fbops->fb_fillrect(info, ®ion); - } - - if (bh) { - region.dy = info->var.yoffset; - region.dx = info->var.xoffset; - region.height = bh; - region.width = info->var.xres; - info->fbops->fb_fillrect(info, ®ion); - } -} - -static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) -{ - struct fb_cursor cursor; - struct fbcon_ops *ops = info->fbcon_par; - unsigned short charmask = vc->vc_hi_font_mask ? 
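/*
 * Editor's aside (sketch, not part of the patch): under 180-degree
 * rotation the first character of a string lands at the highest pixel
 * address, so ud_putcs() above starts at the end of the string and
 * walks backwards while the destination advances forwards. The
 * traversal in miniature (blit_one is an illustrative callback):
 */
static void blit_string_reversed(const unsigned short *s, int count,
                                 void (*blit_one)(unsigned short ch))
{
        s += count - 1;                 /* begin at the last character */
        while (count--)
                blit_one(*s--);         /* emit right to left */
}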
0x1ff : 0xff; - int w = (vc->vc_font.width + 7) >> 3, c; - int y = real_y(ops->p, vc->vc_y); - int attribute, use_sw = (vc->vc_cursor_type & 0x10); - int err = 1, dx, dy; - char *src; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); - - if (!ops->fontbuffer) - return; - - cursor.set = 0; - - if (softback_lines) { - if (y + softback_lines >= vc->vc_rows) { - mode = CM_ERASE; - ops->cursor_flash = 0; - return; - } else - y += softback_lines; - } - - c = scr_readw((u16 *) vc->vc_pos); - attribute = get_attribute(info, c); - src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.height)); - - if (ops->cursor_state.image.data != src || - ops->cursor_reset) { - ops->cursor_state.image.data = src; - cursor.set |= FB_CUR_SETIMAGE; - } - - if (attribute) { - u8 *dst; - - dst = kmalloc(w * vc->vc_font.height, GFP_ATOMIC); - if (!dst) - return; - kfree(ops->cursor_data); - ops->cursor_data = dst; - ud_update_attr(dst, src, attribute, vc); - src = dst; - } - - if (ops->cursor_state.image.fg_color != fg || - ops->cursor_state.image.bg_color != bg || - ops->cursor_reset) { - ops->cursor_state.image.fg_color = fg; - ops->cursor_state.image.bg_color = bg; - cursor.set |= FB_CUR_SETCMAP; - } - - if (ops->cursor_state.image.height != vc->vc_font.height || - ops->cursor_state.image.width != vc->vc_font.width || - ops->cursor_reset) { - ops->cursor_state.image.height = vc->vc_font.height; - ops->cursor_state.image.width = vc->vc_font.width; - cursor.set |= FB_CUR_SETSIZE; - } - - dy = vyres - ((y * vc->vc_font.height) + vc->vc_font.height); - dx = vxres - ((vc->vc_x * vc->vc_font.width) + vc->vc_font.width); - - if (ops->cursor_state.image.dx != dx || - ops->cursor_state.image.dy != dy || - ops->cursor_reset) { - ops->cursor_state.image.dx = dx; - ops->cursor_state.image.dy = dy; - cursor.set |= FB_CUR_SETPOS; - } - - if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || - ops->cursor_reset) { - ops->cursor_state.hot.x = cursor.hot.y = 0; - cursor.set |= FB_CUR_SETHOT; - } - - if (cursor.set & FB_CUR_SETSIZE || - vc->vc_cursor_type != ops->p->cursor_shape || - ops->cursor_state.mask == NULL || - ops->cursor_reset) { - char *mask = kmalloc(w*vc->vc_font.height, GFP_ATOMIC); - int cur_height, size, i = 0; - u8 msk = 0xff; - - if (!mask) - return; - - kfree(ops->cursor_state.mask); - ops->cursor_state.mask = mask; - - ops->p->cursor_shape = vc->vc_cursor_type; - cursor.set |= FB_CUR_SETSHAPE; - - switch (ops->p->cursor_shape & CUR_HWMASK) { - case CUR_NONE: - cur_height = 0; - break; - case CUR_UNDERLINE: - cur_height = (vc->vc_font.height < 10) ? 1 : 2; - break; - case CUR_LOWER_THIRD: - cur_height = vc->vc_font.height/3; - break; - case CUR_LOWER_HALF: - cur_height = vc->vc_font.height >> 1; - break; - case CUR_TWO_THIRDS: - cur_height = (vc->vc_font.height << 1)/3; - break; - case CUR_BLOCK: - default: - cur_height = vc->vc_font.height; - break; - } - - size = cur_height * w; - - while (size--) - mask[i++] = msk; - - size = (vc->vc_font.height - cur_height) * w; - - while (size--) - mask[i++] = ~msk; - } - - switch (mode) { - case CM_ERASE: - ops->cursor_state.enable = 0; - break; - case CM_DRAW: - case CM_MOVE: - default: - ops->cursor_state.enable = (use_sw) ? 
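/*
 * Editor's aside (sketch, not part of the patch): the cursor mask
 * built in ud_cursor() above is the vertical mirror of the unrotated
 * one; the lit rows of an underline or block cursor sit at the top of
 * the cell, so the body is filled first and the untouched rows last
 * (w is bytes per font row):
 */
#include <string.h>

static void build_ud_cursor_mask(unsigned char *mask, int font_height,
                                 int cur_height, int w)
{
        memset(mask, 0xff, cur_height * w);     /* cursor body on top */
        memset(mask + cur_height * w, 0x00,
               (font_height - cur_height) * w); /* rows left alone */
}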
0 : 1; - break; - } - - cursor.image.data = src; - cursor.image.fg_color = ops->cursor_state.image.fg_color; - cursor.image.bg_color = ops->cursor_state.image.bg_color; - cursor.image.dx = ops->cursor_state.image.dx; - cursor.image.dy = ops->cursor_state.image.dy; - cursor.image.height = ops->cursor_state.image.height; - cursor.image.width = ops->cursor_state.image.width; - cursor.hot.x = ops->cursor_state.hot.x; - cursor.hot.y = ops->cursor_state.hot.y; - cursor.mask = ops->cursor_state.mask; - cursor.enable = ops->cursor_state.enable; - cursor.image.depth = 1; - cursor.rop = ROP_XOR; - - if (info->fbops->fb_cursor) - err = info->fbops->fb_cursor(info, &cursor); - - if (err) - soft_cursor(info, &cursor); - - ops->cursor_reset = 0; -} - -static int ud_update_start(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - int xoffset, yoffset; - u32 vyres = GETVYRES(ops->p->scrollmode, info); - u32 vxres = GETVXRES(ops->p->scrollmode, info); - int err; - - xoffset = vxres - info->var.xres - ops->var.xoffset; - yoffset = vyres - info->var.yres - ops->var.yoffset; - if (yoffset < 0) - yoffset += vyres; - ops->var.xoffset = xoffset; - ops->var.yoffset = yoffset; - err = fb_pan_display(info, &ops->var); - ops->var.xoffset = info->var.xoffset; - ops->var.yoffset = info->var.yoffset; - ops->var.vmode = info->var.vmode; - return err; -} - -void fbcon_rotate_ud(struct fbcon_ops *ops) -{ - ops->bmove = ud_bmove; - ops->clear = ud_clear; - ops->putcs = ud_putcs; - ops->clear_margins = ud_clear_margins; - ops->cursor = ud_cursor; - ops->update_start = ud_update_start; -} -EXPORT_SYMBOL(fbcon_rotate_ud); - -MODULE_AUTHOR("Antonino Daplas "); -MODULE_DESCRIPTION("Console Rotation (180 degrees) Support"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/console/softcursor.c b/drivers/video/console/softcursor.c deleted file mode 100644 index 46dd8f5d2e9e..000000000000 --- a/drivers/video/console/softcursor.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * linux/drivers/video/console/softcursor.c - * - * Generic software cursor for frame buffer devices - * - * Created 14 Nov 2002 by James Simmons - * - * This file is subject to the terms and conditions of the GNU General - * Public License. See the file COPYING in the main directory of this - * archive for more details. 
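/*
 * Editor's aside (sketch, not part of the patch): ud_update_start()
 * above mirrors both pan offsets against the virtual resolution, and
 * the final wrap keeps a ywrap-scrolled offset in range. Standalone
 * model:
 */
static void ud_pan_transform(int vxres, int vyres, int xres, int yres,
                             int *xoffset, int *yoffset)
{
        *xoffset = vxres - xres - *xoffset;
        *yoffset = vyres - yres - *yoffset;
        if (*yoffset < 0)               /* wrap for ywrap-style panning */
                *yoffset += vyres;
}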
- */ - -#include -#include -#include -#include - -#include - -#include "fbcon.h" - -int soft_cursor(struct fb_info *info, struct fb_cursor *cursor) -{ - struct fbcon_ops *ops = info->fbcon_par; - unsigned int scan_align = info->pixmap.scan_align - 1; - unsigned int buf_align = info->pixmap.buf_align - 1; - unsigned int i, size, dsize, s_pitch, d_pitch; - struct fb_image *image; - u8 *src, *dst; - - if (info->state != FBINFO_STATE_RUNNING) - return 0; - - s_pitch = (cursor->image.width + 7) >> 3; - dsize = s_pitch * cursor->image.height; - - if (dsize + sizeof(struct fb_image) != ops->cursor_size) { - kfree(ops->cursor_src); - ops->cursor_size = dsize + sizeof(struct fb_image); - - ops->cursor_src = kmalloc(ops->cursor_size, GFP_ATOMIC); - if (!ops->cursor_src) { - ops->cursor_size = 0; - return -ENOMEM; - } - } - - src = ops->cursor_src + sizeof(struct fb_image); - image = (struct fb_image *)ops->cursor_src; - *image = cursor->image; - d_pitch = (s_pitch + scan_align) & ~scan_align; - - size = d_pitch * image->height + buf_align; - size &= ~buf_align; - dst = fb_get_buffer_offset(info, &info->pixmap, size); - - if (cursor->enable) { - switch (cursor->rop) { - case ROP_XOR: - for (i = 0; i < dsize; i++) - src[i] = image->data[i] ^ cursor->mask[i]; - break; - case ROP_COPY: - default: - for (i = 0; i < dsize; i++) - src[i] = image->data[i] & cursor->mask[i]; - break; - } - } else - memcpy(src, image->data, dsize); - - fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, image->height); - image->data = dst; - info->fbops->fb_imageblit(info, image); - return 0; -} - -EXPORT_SYMBOL(soft_cursor); - -MODULE_AUTHOR("James Simmons "); -MODULE_DESCRIPTION("Generic software cursor"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/console/tileblit.c b/drivers/video/console/tileblit.c deleted file mode 100644 index 15e8e1a89c45..000000000000 --- a/drivers/video/console/tileblit.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * linux/drivers/video/console/tileblit.c -- Tile Blitting Operation - * - * Copyright (C) 2004 Antonino Daplas - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include "fbcon.h" - -static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int dy, int dx, int height, int width) -{ - struct fb_tilearea area; - - area.sx = sx; - area.sy = sy; - area.dx = dx; - area.dy = dy; - area.height = height; - area.width = width; - - info->tileops->fb_tilecopy(info, &area); -} - -static void tile_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) -{ - struct fb_tilerect rect; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - int fgshift = (vc->vc_hi_font_mask) ? 9 : 8; - - rect.index = vc->vc_video_erase_char & - ((vc->vc_hi_font_mask) ? 0x1ff : 0xff); - rect.fg = attr_fgcol_ec(fgshift, vc, info); - rect.bg = attr_bgcol_ec(bgshift, vc, info); - rect.sx = sx; - rect.sy = sy; - rect.width = width; - rect.height = height; - rect.rop = ROP_COPY; - - info->tileops->fb_tilefill(info, &rect); -} - -static void tile_putcs(struct vc_data *vc, struct fb_info *info, - const unsigned short *s, int count, int yy, int xx, - int fg, int bg) -{ - struct fb_tileblit blit; - unsigned short charmask = vc->vc_hi_font_mask ? 
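/*
 * Editor's aside (sketch, not part of the patch): the core of
 * soft_cursor() above is one combining loop. An XOR rop inverts the
 * glyph under the mask (a non-destructive, blink-friendly cursor); a
 * COPY rop ANDs glyph and mask into an opaque block; a disabled cursor
 * passes the glyph through untouched. The loop in isolation:
 */
#include <stddef.h>
#include <stdint.h>

static void combine_cursor(uint8_t *dst, const uint8_t *glyph,
                           const uint8_t *mask, size_t len,
                           int enabled, int rop_xor)
{
        size_t i;

        for (i = 0; i < len; i++) {
                if (!enabled)
                        dst[i] = glyph[i];              /* cursor hidden */
                else if (rop_xor)
                        dst[i] = glyph[i] ^ mask[i];    /* inverting cursor */
                else
                        dst[i] = glyph[i] & mask[i];    /* opaque cursor */
        }
}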
0x1ff : 0xff; - int size = sizeof(u32) * count, i; - - blit.sx = xx; - blit.sy = yy; - blit.width = count; - blit.height = 1; - blit.fg = fg; - blit.bg = bg; - blit.length = count; - blit.indices = (u32 *) fb_get_buffer_offset(info, &info->pixmap, size); - for (i = 0; i < count; i++) - blit.indices[i] = (u32)(scr_readw(s++) & charmask); - - info->tileops->fb_tileblit(info, &blit); -} - -static void tile_clear_margins(struct vc_data *vc, struct fb_info *info, - int bottom_only) -{ - return; -} - -static void tile_cursor(struct vc_data *vc, struct fb_info *info, int mode, - int softback_lines, int fg, int bg) -{ - struct fb_tilecursor cursor; - int use_sw = (vc->vc_cursor_type & 0x10); - - cursor.sx = vc->vc_x; - cursor.sy = vc->vc_y; - cursor.mode = (mode == CM_ERASE || use_sw) ? 0 : 1; - cursor.fg = fg; - cursor.bg = bg; - - switch (vc->vc_cursor_type & 0x0f) { - case CUR_NONE: - cursor.shape = FB_TILE_CURSOR_NONE; - break; - case CUR_UNDERLINE: - cursor.shape = FB_TILE_CURSOR_UNDERLINE; - break; - case CUR_LOWER_THIRD: - cursor.shape = FB_TILE_CURSOR_LOWER_THIRD; - break; - case CUR_LOWER_HALF: - cursor.shape = FB_TILE_CURSOR_LOWER_HALF; - break; - case CUR_TWO_THIRDS: - cursor.shape = FB_TILE_CURSOR_TWO_THIRDS; - break; - case CUR_BLOCK: - default: - cursor.shape = FB_TILE_CURSOR_BLOCK; - break; - } - - info->tileops->fb_tilecursor(info, &cursor); -} - -static int tile_update_start(struct fb_info *info) -{ - struct fbcon_ops *ops = info->fbcon_par; - int err; - - err = fb_pan_display(info, &ops->var); - ops->var.xoffset = info->var.xoffset; - ops->var.yoffset = info->var.yoffset; - ops->var.vmode = info->var.vmode; - return err; -} - -void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info) -{ - struct fb_tilemap map; - struct fbcon_ops *ops = info->fbcon_par; - - ops->bmove = tile_bmove; - ops->clear = tile_clear; - ops->putcs = tile_putcs; - ops->clear_margins = tile_clear_margins; - ops->cursor = tile_cursor; - ops->update_start = tile_update_start; - - if (ops->p) { - map.width = vc->vc_font.width; - map.height = vc->vc_font.height; - map.depth = 1; - map.length = (ops->p->userfont) ? 
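/*
 * Editor's aside (sketch, not part of the patch): tile blitting never
 * touches pixels in software; tile_putcs() above only packs one u32
 * glyph index per cell and hands the array to the driver's
 * fb_tileblit() hook. The packing step alone:
 */
#include <stdint.h>

static void pack_tile_indices(uint32_t *indices, const uint16_t *s,
                              int count, int font_512)
{
        uint16_t charmask = font_512 ? 0x1ff : 0xff;
        int i;

        for (i = 0; i < count; i++)
                indices[i] = (uint32_t)(s[i] & charmask);
}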
- FNTCHARCNT(ops->p->fontdata) : 256; - map.data = ops->p->fontdata; - info->tileops->fb_settile(info, &map); - } -} - -EXPORT_SYMBOL(fbcon_set_tileops); - -MODULE_AUTHOR("Antonino Daplas "); -MODULE_DESCRIPTION("Tile Blitting Operation"); -MODULE_LICENSE("GPL"); - diff --git a/drivers/video/fbdev/core/Makefile b/drivers/video/fbdev/core/Makefile index 9e3ddf225393..0214b863ac3f 100644 --- a/drivers/video/fbdev/core/Makefile +++ b/drivers/video/fbdev/core/Makefile @@ -4,6 +4,17 @@ obj-$(CONFIG_FB) += fb.o fb-y := fbmem.o fbmon.o fbcmap.o fbsysfs.o \ modedb.o fbcvt.o fb-$(CONFIG_FB_DEFERRED_IO) += fb_defio.o + +ifeq ($(CONFIG_FRAMEBUFFER_CONSOLE),y) +fb-y += fbcon.o bitblit.o softcursor.o +ifeq ($(CONFIG_FB_TILEBLITTING),y) +fb-y += tileblit.o +endif +ifeq ($(CONFIG_FRAMEBUFFER_CONSOLE_ROTATION),y) +fb-y += fbcon_rotate.o fbcon_cw.o fbcon_ud.o \ + fbcon_ccw.o +endif +endif fb-objs := $(fb-y) obj-$(CONFIG_FB_CFB_FILLRECT) += cfbfillrect.o diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c new file mode 100644 index 000000000000..99f3a1c3d093 --- /dev/null +++ b/drivers/video/fbdev/core/bitblit.c @@ -0,0 +1,418 @@ +/* + * linux/drivers/video/console/bitblit.c -- BitBlitting Operation + * + * Originally from the 'accel_*' routines in drivers/video/console/fbcon.c + * + * Copyright (C) 2004 Antonino Daplas + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fbcon.h" + +/* + * Accelerated handlers. + */ +static void update_attr(u8 *dst, u8 *src, int attribute, + struct vc_data *vc) +{ + int i, offset = (vc->vc_font.height < 10) ? 1 : 2; + int width = DIV_ROUND_UP(vc->vc_font.width, 8); + unsigned int cellsize = vc->vc_font.height * width; + u8 c; + + offset = cellsize - (offset * width); + for (i = 0; i < cellsize; i++) { + c = src[i]; + if (attribute & FBCON_ATTRIBUTE_UNDERLINE && i >= offset) + c = 0xff; + if (attribute & FBCON_ATTRIBUTE_BOLD) + c |= c >> 1; + if (attribute & FBCON_ATTRIBUTE_REVERSE) + c = ~c; + dst[i] = c; + } +} + +static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_copyarea area; + + area.sx = sx * vc->vc_font.width; + area.sy = sy * vc->vc_font.height; + area.dx = dx * vc->vc_font.width; + area.dy = dy * vc->vc_font.height; + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + +static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) +{ + int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + struct fb_fillrect region; + + region.color = attr_bgcol_ec(bgshift, vc, info); + region.dx = sx * vc->vc_font.width; + region.dy = sy * vc->vc_font.height; + region.width = width * vc->vc_font.width; + region.height = height * vc->vc_font.height; + region.rop = ROP_COPY; + + info->fbops->fb_fillrect(info, ®ion); +} + +static inline void bit_putcs_aligned(struct vc_data *vc, struct fb_info *info, + const u16 *s, u32 attr, u32 cnt, + u32 d_pitch, u32 s_pitch, u32 cellsize, + struct fb_image *image, u8 *buf, u8 *dst) +{ + u16 charmask = vc->vc_hi_font_mask ? 
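/*
 * Editor's aside (sketch, not part of the patch): update_attr() above
 * is the unrotated attribute pass. It works on a copy of the glyph:
 * underline forces the bottom one or two rows to 0xff, bold smears
 * each row one pixel toward the right, reverse inverts everything.
 * Standalone model:
 */
#include <stdint.h>

static void apply_attributes(uint8_t *dst, const uint8_t *src,
                             unsigned int width_bytes, unsigned int height,
                             int underline, int bold, int reverse)
{
        unsigned int cellsize = width_bytes * height;
        unsigned int ul_rows = (height < 10) ? 1 : 2;   /* underline height */
        unsigned int ul_start = cellsize - ul_rows * width_bytes;
        unsigned int i;

        for (i = 0; i < cellsize; i++) {
                uint8_t c = src[i];

                if (underline && i >= ul_start)
                        c = 0xff;
                if (bold)
                        c |= c >> 1;
                if (reverse)
                        c = ~c;
                dst[i] = c;
        }
}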
0x1ff : 0xff; + u32 idx = vc->vc_font.width >> 3; + u8 *src; + + while (cnt--) { + src = vc->vc_font.data + (scr_readw(s++)& + charmask)*cellsize; + + if (attr) { + update_attr(buf, src, attr, vc); + src = buf; + } + + if (likely(idx == 1)) + __fb_pad_aligned_buffer(dst, d_pitch, src, idx, + image->height); + else + fb_pad_aligned_buffer(dst, d_pitch, src, idx, + image->height); + + dst += s_pitch; + } + + info->fbops->fb_imageblit(info, image); +} + +static inline void bit_putcs_unaligned(struct vc_data *vc, + struct fb_info *info, const u16 *s, + u32 attr, u32 cnt, u32 d_pitch, + u32 s_pitch, u32 cellsize, + struct fb_image *image, u8 *buf, + u8 *dst) +{ + u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + u32 shift_low = 0, mod = vc->vc_font.width % 8; + u32 shift_high = 8; + u32 idx = vc->vc_font.width >> 3; + u8 *src; + + while (cnt--) { + src = vc->vc_font.data + (scr_readw(s++)& + charmask)*cellsize; + + if (attr) { + update_attr(buf, src, attr, vc); + src = buf; + } + + fb_pad_unaligned_buffer(dst, d_pitch, src, idx, + image->height, shift_high, + shift_low, mod); + shift_low += mod; + dst += (shift_low >= 8) ? s_pitch : s_pitch - 1; + shift_low &= 7; + shift_high = 8 - shift_low; + } + + info->fbops->fb_imageblit(info, image); + +} + +static void bit_putcs(struct vc_data *vc, struct fb_info *info, + const unsigned short *s, int count, int yy, int xx, + int fg, int bg) +{ + struct fb_image image; + u32 width = DIV_ROUND_UP(vc->vc_font.width, 8); + u32 cellsize = width * vc->vc_font.height; + u32 maxcnt = info->pixmap.size/cellsize; + u32 scan_align = info->pixmap.scan_align - 1; + u32 buf_align = info->pixmap.buf_align - 1; + u32 mod = vc->vc_font.width % 8, cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; + + image.fg_color = fg; + image.bg_color = bg; + image.dx = xx * vc->vc_font.width; + image.dy = yy * vc->vc_font.height; + image.height = vc->vc_font.height; + image.depth = 1; + + if (attribute) { + buf = kmalloc(cellsize, GFP_ATOMIC); + if (!buf) + return; + } + + while (count) { + if (count > maxcnt) + cnt = maxcnt; + else + cnt = count; + + image.width = vc->vc_font.width * cnt; + pitch = DIV_ROUND_UP(image.width, 8) + scan_align; + pitch &= ~scan_align; + size = pitch * image.height + buf_align; + size &= ~buf_align; + dst = fb_get_buffer_offset(info, &info->pixmap, size); + image.data = dst; + + if (!mod) + bit_putcs_aligned(vc, info, s, attribute, cnt, pitch, + width, cellsize, &image, buf, dst); + else + bit_putcs_unaligned(vc, info, s, attribute, cnt, + pitch, width, cellsize, &image, + buf, dst); + + image.dx += cnt * vc->vc_font.width; + count -= cnt; + s += cnt; + } + + /* buf is always NULL except when in monochrome mode, so in this case + it's a gain to check buf against NULL even though kfree() handles + NULL pointers just fine */ + if (unlikely(buf)) + kfree(buf); + +} + +static void bit_clear_margins(struct vc_data *vc, struct fb_info *info, + int bottom_only) +{ + unsigned int cw = vc->vc_font.width; + unsigned int ch = vc->vc_font.height; + unsigned int rw = info->var.xres - (vc->vc_cols*cw); + unsigned int bh = info->var.yres - (vc->vc_rows*ch); + unsigned int rs = info->var.xres - rw; + unsigned int bs = info->var.yres - bh; + struct fb_fillrect region; + + region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { + region.dx = info->var.xoffset + rs; + region.dy = 0; + region.width = rw; + region.height = info->var.yres_virtual; + info->fbops->fb_fillrect(info, ®ion); + } + + if (bh) { + 
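/*
 * Editor's aside (sketch, not part of the patch): bit_putcs() above
 * stages glyphs in info->pixmap, which holds at most pixmap.size /
 * cellsize glyphs, so long strings are blitted in chunks; each chunk's
 * pitch is rounded up to the pixmap's scan alignment. The rounding,
 * assuming scan_align is a power of two as the pixmap code expects:
 */
static unsigned int aligned_pitch(unsigned int image_width_px,
                                  unsigned int scan_align)
{
        unsigned int mask = scan_align - 1;

        return (((image_width_px + 7) / 8) + mask) & ~mask;
}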
region.dx = info->var.xoffset; + region.dy = info->var.yoffset + bs; + region.width = rs; + region.height = bh; + info->fbops->fb_fillrect(info, ®ion); + } +} + +static void bit_cursor(struct vc_data *vc, struct fb_info *info, int mode, + int softback_lines, int fg, int bg) +{ + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; + unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + int w = DIV_ROUND_UP(vc->vc_font.width, 8), c; + int y = real_y(ops->p, vc->vc_y); + int attribute, use_sw = (vc->vc_cursor_type & 0x10); + int err = 1; + char *src; + + cursor.set = 0; + + if (softback_lines) { + if (y + softback_lines >= vc->vc_rows) { + mode = CM_ERASE; + ops->cursor_flash = 0; + return; + } else + y += softback_lines; + } + + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height)); + + if (ops->cursor_state.image.data != src || + ops->cursor_reset) { + ops->cursor_state.image.data = src; + cursor.set |= FB_CUR_SETIMAGE; + } + + if (attribute) { + u8 *dst; + + dst = kmalloc(w * vc->vc_font.height, GFP_ATOMIC); + if (!dst) + return; + kfree(ops->cursor_data); + ops->cursor_data = dst; + update_attr(dst, src, attribute, vc); + src = dst; + } + + if (ops->cursor_state.image.fg_color != fg || + ops->cursor_state.image.bg_color != bg || + ops->cursor_reset) { + ops->cursor_state.image.fg_color = fg; + ops->cursor_state.image.bg_color = bg; + cursor.set |= FB_CUR_SETCMAP; + } + + if ((ops->cursor_state.image.dx != (vc->vc_font.width * vc->vc_x)) || + (ops->cursor_state.image.dy != (vc->vc_font.height * y)) || + ops->cursor_reset) { + ops->cursor_state.image.dx = vc->vc_font.width * vc->vc_x; + ops->cursor_state.image.dy = vc->vc_font.height * y; + cursor.set |= FB_CUR_SETPOS; + } + + if (ops->cursor_state.image.height != vc->vc_font.height || + ops->cursor_state.image.width != vc->vc_font.width || + ops->cursor_reset) { + ops->cursor_state.image.height = vc->vc_font.height; + ops->cursor_state.image.width = vc->vc_font.width; + cursor.set |= FB_CUR_SETSIZE; + } + + if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || + ops->cursor_reset) { + ops->cursor_state.hot.x = cursor.hot.y = 0; + cursor.set |= FB_CUR_SETHOT; + } + + if (cursor.set & FB_CUR_SETSIZE || + vc->vc_cursor_type != ops->p->cursor_shape || + ops->cursor_state.mask == NULL || + ops->cursor_reset) { + char *mask = kmalloc(w*vc->vc_font.height, GFP_ATOMIC); + int cur_height, size, i = 0; + u8 msk = 0xff; + + if (!mask) + return; + + kfree(ops->cursor_state.mask); + ops->cursor_state.mask = mask; + + ops->p->cursor_shape = vc->vc_cursor_type; + cursor.set |= FB_CUR_SETSHAPE; + + switch (ops->p->cursor_shape & CUR_HWMASK) { + case CUR_NONE: + cur_height = 0; + break; + case CUR_UNDERLINE: + cur_height = (vc->vc_font.height < 10) ? 1 : 2; + break; + case CUR_LOWER_THIRD: + cur_height = vc->vc_font.height/3; + break; + case CUR_LOWER_HALF: + cur_height = vc->vc_font.height >> 1; + break; + case CUR_TWO_THIRDS: + cur_height = (vc->vc_font.height << 1)/3; + break; + case CUR_BLOCK: + default: + cur_height = vc->vc_font.height; + break; + } + size = (vc->vc_font.height - cur_height) * w; + while (size--) + mask[i++] = ~msk; + size = cur_height * w; + while (size--) + mask[i++] = msk; + } + + switch (mode) { + case CM_ERASE: + ops->cursor_state.enable = 0; + break; + case CM_DRAW: + case CM_MOVE: + default: + ops->cursor_state.enable = (use_sw) ? 
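/*
 * Editor's aside (sketch, not part of the patch): every cursor handler
 * in this patch maps the console cursor shape to a row count with the
 * same table; only the placement of those rows differs per rotation.
 * The table as a function (the CUR_* constants come from the console
 * headers):
 */
static int cursor_rows(int shape, int font_height)
{
        switch (shape) {
        case CUR_NONE:          return 0;
        case CUR_UNDERLINE:     return (font_height < 10) ? 1 : 2;
        case CUR_LOWER_THIRD:   return font_height / 3;
        case CUR_LOWER_HALF:    return font_height >> 1;
        case CUR_TWO_THIRDS:    return (font_height << 1) / 3;
        case CUR_BLOCK:
        default:                return font_height;
        }
}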
0 : 1; + break; + } + + cursor.image.data = src; + cursor.image.fg_color = ops->cursor_state.image.fg_color; + cursor.image.bg_color = ops->cursor_state.image.bg_color; + cursor.image.dx = ops->cursor_state.image.dx; + cursor.image.dy = ops->cursor_state.image.dy; + cursor.image.height = ops->cursor_state.image.height; + cursor.image.width = ops->cursor_state.image.width; + cursor.hot.x = ops->cursor_state.hot.x; + cursor.hot.y = ops->cursor_state.hot.y; + cursor.mask = ops->cursor_state.mask; + cursor.enable = ops->cursor_state.enable; + cursor.image.depth = 1; + cursor.rop = ROP_XOR; + + if (info->fbops->fb_cursor) + err = info->fbops->fb_cursor(info, &cursor); + + if (err) + soft_cursor(info, &cursor); + + ops->cursor_reset = 0; +} + +static int bit_update_start(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + int err; + + err = fb_pan_display(info, &ops->var); + ops->var.xoffset = info->var.xoffset; + ops->var.yoffset = info->var.yoffset; + ops->var.vmode = info->var.vmode; + return err; +} + +void fbcon_set_bitops(struct fbcon_ops *ops) +{ + ops->bmove = bit_bmove; + ops->clear = bit_clear; + ops->putcs = bit_putcs; + ops->clear_margins = bit_clear_margins; + ops->cursor = bit_cursor; + ops->update_start = bit_update_start; + ops->rotate_font = NULL; + + if (ops->rotate) + fbcon_set_rotate(ops); +} + +EXPORT_SYMBOL(fbcon_set_bitops); + diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c new file mode 100644 index 000000000000..86b3bcbd01a8 --- /dev/null +++ b/drivers/video/fbdev/core/fbcon.c @@ -0,0 +1,3658 @@ +/* + * linux/drivers/video/fbcon.c -- Low level frame buffer based console driver + * + * Copyright (C) 1995 Geert Uytterhoeven + * + * + * This file is based on the original Amiga console driver (amicon.c): + * + * Copyright (C) 1993 Hamish Macdonald + * Greg Harp + * Copyright (C) 1994 David Carter [carter@compsci.bristol.ac.uk] + * + * with work by William Rucklidge (wjr@cs.cornell.edu) + * Geert Uytterhoeven + * Jes Sorensen (jds@kom.auc.dk) + * Martin Apel + * + * and on the original Atari console driver (atacon.c): + * + * Copyright (C) 1993 Bjoern Brauel + * Roman Hodek + * + * with work by Guenther Kelleter + * Martin Schaller + * Andreas Schwab + * + * Hardware cursor support added by Emmanuel Marty (core@ggi-project.org) + * Smart redraw scrolling, arbitrary font width support, 512char font support + * and software scrollback added by + * Jakub Jelinek (jj@ultra.linux.cz) + * + * Random hacking by Martin Mares + * + * 2001 - Documented with DocBook + * - Brad Douglas + * + * The low level operations for the various display memory organizations are + * now in separate source files. + * + * Currently the following organizations are supported: + * + * o afb Amiga bitplanes + * o cfb{2,4,8,16,24,32} Packed pixels + * o ilbm Amiga interleaved bitplanes + * o iplan2p[248] Atari interleaved bitplanes + * o mfb Monochrome + * o vga VGA characters/attributes + * + * To do: + * + * - Implement 16 plane mode (iplan2p16) + * + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. 
+ */ + +#undef FBCONDEBUG + +#include +#include +#include +#include +#include /* MSch: for IRQ probe */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For counting font checksums */ +#include +#include + +#include "fbcon.h" + +#ifdef FBCONDEBUG +# define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __func__ , ## args) +#else +# define DPRINTK(fmt, args...) +#endif + +enum { + FBCON_LOGO_CANSHOW = -1, /* the logo can be shown */ + FBCON_LOGO_DRAW = -2, /* draw the logo to a console */ + FBCON_LOGO_DONTSHOW = -3 /* do not show the logo */ +}; + +static struct display fb_display[MAX_NR_CONSOLES]; + +static signed char con2fb_map[MAX_NR_CONSOLES]; +static signed char con2fb_map_boot[MAX_NR_CONSOLES]; + +static int logo_lines; +/* logo_shown is an index to vc_cons when >= 0; otherwise follows FBCON_LOGO + enums. */ +static int logo_shown = FBCON_LOGO_CANSHOW; +/* Software scrollback */ +static int fbcon_softback_size = 32768; +static unsigned long softback_buf, softback_curr; +static unsigned long softback_in; +static unsigned long softback_top, softback_end; +static int softback_lines; +/* console mappings */ +static int first_fb_vc; +static int last_fb_vc = MAX_NR_CONSOLES - 1; +static int fbcon_is_default = 1; +static int fbcon_has_exited; +static int primary_device = -1; +static int fbcon_has_console_bind; + +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY +static int map_override; + +static inline void fbcon_map_override(void) +{ + map_override = 1; +} +#else +static inline void fbcon_map_override(void) +{ +} +#endif /* CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY */ + +/* font data */ +static char fontname[40]; + +/* current fb_info */ +static int info_idx = -1; + +/* console rotation */ +static int initial_rotation; +static int fbcon_has_sysfs; + +static const struct consw fb_con; + +#define CM_SOFTBACK (8) + +#define advance_row(p, delta) (unsigned short *)((unsigned long)(p) + (delta) * vc->vc_size_row) + +static int fbcon_set_origin(struct vc_data *); + +static int fbcon_cursor_noblink; + +#define divides(a, b) ((!(a) || (b)%(a)) ? 
0 : 1) + +/* + * Interface used by the world + */ + +static const char *fbcon_startup(void); +static void fbcon_init(struct vc_data *vc, int init); +static void fbcon_deinit(struct vc_data *vc); +static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, + int width); +static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos); +static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, + int count, int ypos, int xpos); +static void fbcon_clear_margins(struct vc_data *vc, int bottom_only); +static void fbcon_cursor(struct vc_data *vc, int mode); +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width); +static int fbcon_switch(struct vc_data *vc); +static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); +static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); + +/* + * Internal routines + */ +static __inline__ void ywrap_up(struct vc_data *vc, int count); +static __inline__ void ywrap_down(struct vc_data *vc, int count); +static __inline__ void ypan_up(struct vc_data *vc, int count); +static __inline__ void ypan_down(struct vc_data *vc, int count); +static void fbcon_bmove_rec(struct vc_data *vc, struct display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break); +static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, + int unit); +static void fbcon_redraw_move(struct vc_data *vc, struct display *p, + int line, int count, int dy); +static void fbcon_modechanged(struct fb_info *info); +static void fbcon_set_all_vcs(struct fb_info *info); +static void fbcon_start(void); +static void fbcon_exit(void); +static struct device *fbcon_device; + +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_ROTATION +static inline void fbcon_set_rotation(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + + if (!(info->flags & FBINFO_MISC_TILEBLITTING) && + ops->p->con_rotate < 4) + ops->rotate = ops->p->con_rotate; + else + ops->rotate = 0; +} + +static void fbcon_rotate(struct fb_info *info, u32 rotate) +{ + struct fbcon_ops *ops= info->fbcon_par; + struct fb_info *fb_info; + + if (!ops || ops->currcon == -1) + return; + + fb_info = registered_fb[con2fb_map[ops->currcon]]; + + if (info == fb_info) { + struct display *p = &fb_display[ops->currcon]; + + if (rotate < 4) + p->con_rotate = rotate; + else + p->con_rotate = 0; + + fbcon_modechanged(info); + } +} + +static void fbcon_rotate_all(struct fb_info *info, u32 rotate) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + struct display *p; + int i; + + if (!ops || ops->currcon < 0 || rotate > 3) + return; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + p = &fb_display[vc->vc_num]; + p->con_rotate = rotate; + } + + fbcon_set_all_vcs(info); +} +#else +static inline void fbcon_set_rotation(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + + ops->rotate = FB_ROTATE_UR; +} + +static void fbcon_rotate(struct fb_info *info, u32 rotate) +{ + return; +} + +static void fbcon_rotate_all(struct fb_info *info, u32 rotate) +{ + return; +} +#endif /* CONFIG_FRAMEBUFFER_CONSOLE_ROTATION */ + +static int fbcon_get_rotate(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + + return (ops) ? 
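/*
 * Editor's aside (sketch, not part of the patch): the divides() macro
 * defined above reads inside-out; as a function it is simply "a is
 * non-zero and evenly divides b", with a == 0 treated as false to
 * avoid a division by zero:
 */
static inline int divides_fn(unsigned int a, unsigned int b)
{
        return a != 0 && b % a == 0;
}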
ops->rotate : 0; +} + +static inline int fbcon_is_inactive(struct vc_data *vc, struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + + return (info->state != FBINFO_STATE_RUNNING || + vc->vc_mode != KD_TEXT || ops->graphics) && + !vt_force_oops_output(vc); +} + +static int get_color(struct vc_data *vc, struct fb_info *info, + u16 c, int is_fg) +{ + int depth = fb_get_color_depth(&info->var, &info->fix); + int color = 0; + + if (console_blanked) { + unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + + c = vc->vc_video_erase_char & charmask; + } + + if (depth != 1) + color = (is_fg) ? attr_fgcol((vc->vc_hi_font_mask) ? 9 : 8, c) + : attr_bgcol((vc->vc_hi_font_mask) ? 13 : 12, c); + + switch (depth) { + case 1: + { + int col = mono_col(info); + /* 0 or 1 */ + int fg = (info->fix.visual != FB_VISUAL_MONO01) ? col : 0; + int bg = (info->fix.visual != FB_VISUAL_MONO01) ? 0 : col; + + if (console_blanked) + fg = bg; + + color = (is_fg) ? fg : bg; + break; + } + case 2: + /* + * Scale down 16-colors to 4 colors. Default 4-color palette + * is grayscale. However, simply dividing the values by 4 + * will not work, as colors 1, 2 and 3 will be scaled-down + * to zero rendering them invisible. So empirically convert + * colors to a sane 4-level grayscale. + */ + switch (color) { + case 0: + color = 0; /* black */ + break; + case 1 ... 6: + color = 2; /* white */ + break; + case 7 ... 8: + color = 1; /* gray */ + break; + default: + color = 3; /* intense white */ + break; + } + break; + case 3: + /* + * Last 8 entries of default 16-color palette is a more intense + * version of the first 8 (i.e., same chrominance, different + * luminance). + */ + color &= 7; + break; + } + + + return color; +} + +static void fbcon_update_softback(struct vc_data *vc) +{ + int l = fbcon_softback_size / vc->vc_size_row; + + if (l > 5) + softback_end = softback_buf + l * vc->vc_size_row; + else + /* Smaller scrollback makes no sense, and 0 would screw + the operation totally */ + softback_top = 0; +} + +static void fb_flashcursor(struct work_struct *work) +{ + struct fb_info *info = container_of(work, struct fb_info, queue); + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc = NULL; + int c; + int mode; + int ret; + + /* FIXME: we should sort out the unbind locking instead */ + /* instead we just fail to flash the cursor if we can't get + * the lock instead of blocking fbcon deinit */ + ret = console_trylock(); + if (ret == 0) + return; + + if (ops && ops->currcon != -1) + vc = vc_cons[ops->currcon].d; + + if (!vc || !con_is_visible(vc) || + registered_fb[con2fb_map[vc->vc_num]] != info || + vc->vc_deccm != 1) { + console_unlock(); + return; + } + + c = scr_readw((u16 *) vc->vc_pos); + mode = (!ops->cursor_flash || ops->cursor_state.enable) ? 
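/*
 * Editor's aside (sketch, not part of the patch):
 * fbcon_update_softback() above sizes the scrollback ring in whole
 * screen rows and, per its own comment, turns scrollback off entirely
 * when fewer than six rows would fit. The sizing rule in isolation:
 */
static unsigned int softback_rows(unsigned int softback_size,
                                  unsigned int bytes_per_row)
{
        unsigned int rows = softback_size / bytes_per_row;

        return (rows > 5) ? rows : 0;   /* 0 means scrollback disabled */
}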
+ CM_ERASE : CM_DRAW; + ops->cursor(vc, info, mode, softback_lines, get_color(vc, info, c, 1), + get_color(vc, info, c, 0)); + console_unlock(); +} + +static void cursor_timer_handler(unsigned long dev_addr) +{ + struct fb_info *info = (struct fb_info *) dev_addr; + struct fbcon_ops *ops = info->fbcon_par; + + queue_work(system_power_efficient_wq, &info->queue); + mod_timer(&ops->cursor_timer, jiffies + ops->cur_blink_jiffies); +} + +static void fbcon_add_cursor_timer(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + + if ((!info->queue.func || info->queue.func == fb_flashcursor) && + !(ops->flags & FBCON_FLAGS_CURSOR_TIMER) && + !fbcon_cursor_noblink) { + if (!info->queue.func) + INIT_WORK(&info->queue, fb_flashcursor); + + setup_timer(&ops->cursor_timer, cursor_timer_handler, + (unsigned long) info); + mod_timer(&ops->cursor_timer, jiffies + ops->cur_blink_jiffies); + ops->flags |= FBCON_FLAGS_CURSOR_TIMER; + } +} + +static void fbcon_del_cursor_timer(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + + if (info->queue.func == fb_flashcursor && + ops->flags & FBCON_FLAGS_CURSOR_TIMER) { + del_timer_sync(&ops->cursor_timer); + ops->flags &= ~FBCON_FLAGS_CURSOR_TIMER; + } +} + +#ifndef MODULE +static int __init fb_console_setup(char *this_opt) +{ + char *options; + int i, j; + + if (!this_opt || !*this_opt) + return 1; + + while ((options = strsep(&this_opt, ",")) != NULL) { + if (!strncmp(options, "font:", 5)) { + strlcpy(fontname, options + 5, sizeof(fontname)); + continue; + } + + if (!strncmp(options, "scrollback:", 11)) { + options += 11; + if (*options) { + fbcon_softback_size = simple_strtoul(options, &options, 0); + if (*options == 'k' || *options == 'K') { + fbcon_softback_size *= 1024; + } + } + continue; + } + + if (!strncmp(options, "map:", 4)) { + options += 4; + if (*options) { + for (i = 0, j = 0; i < MAX_NR_CONSOLES; i++) { + if (!options[j]) + j = 0; + con2fb_map_boot[i] = + (options[j++]-'0') % FB_MAX; + } + + fbcon_map_override(); + } + continue; + } + + if (!strncmp(options, "vc:", 3)) { + options += 3; + if (*options) + first_fb_vc = simple_strtoul(options, &options, 10) - 1; + if (first_fb_vc < 0) + first_fb_vc = 0; + if (*options++ == '-') + last_fb_vc = simple_strtoul(options, &options, 10) - 1; + fbcon_is_default = 0; + continue; + } + + if (!strncmp(options, "rotate:", 7)) { + options += 7; + if (*options) + initial_rotation = simple_strtoul(options, &options, 0); + if (initial_rotation > 3) + initial_rotation = 0; + continue; + } + } + return 1; +} + +__setup("fbcon=", fb_console_setup); +#endif + +static int search_fb_in_map(int idx) +{ + int i, retval = 0; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == idx) + retval = 1; + } + return retval; +} + +static int search_for_mapped_con(void) +{ + int i, retval = 0; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] != -1) + retval = 1; + } + return retval; +} + +static int do_fbcon_takeover(int show_logo) +{ + int err, i; + + if (!num_registered_fb) + return -ENODEV; + + if (!show_logo) + logo_shown = FBCON_LOGO_DONTSHOW; + + for (i = first_fb_vc; i <= last_fb_vc; i++) + con2fb_map[i] = info_idx; + + err = do_take_over_console(&fb_con, first_fb_vc, last_fb_vc, + fbcon_is_default); + + if (err) { + for (i = first_fb_vc; i <= last_fb_vc; i++) + con2fb_map[i] = -1; + info_idx = -1; + } else { + fbcon_has_console_bind = 1; + } + + return err; +} + +#ifdef MODULE +static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, + 
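/*
 * Editor's aside (not part of the patch): fb_console_setup() above
 * defines the grammar of the fbcon= boot parameter. A combined example
 * exercising each branch of the parser (values are illustrative):
 *
 *     fbcon=font:VGA8x16,scrollback:128k,map:0123,vc:2-6,rotate:1
 *
 * font: selects a built-in font by name; scrollback: takes an optional
 * k/K suffix for KiB; map: assigns framebuffer digits to consoles,
 * cycling when the string is shorter than the console list; vc:a-b
 * limits fbcon to that console range; rotate: accepts 0-3 (normal, 90,
 * 180, 270 degrees), with larger values reset to 0.
 */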
int cols, int rows, int new_cols, int new_rows)
+{
+	logo_shown = FBCON_LOGO_DONTSHOW;
+}
+#else
+static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info,
+			       int cols, int rows, int new_cols, int new_rows)
+{
+	/* Need to make room for the logo */
+	struct fbcon_ops *ops = info->fbcon_par;
+	int cnt, erase = vc->vc_video_erase_char, step;
+	unsigned short *save = NULL, *r, *q;
+	int logo_height;
+
+	if (info->flags & FBINFO_MODULE) {
+		logo_shown = FBCON_LOGO_DONTSHOW;
+		return;
+	}
+
+	/*
+	 * Remove the underline attribute from the erase character
+	 * if this is a black-and-white framebuffer.
+	 */
+	if (fb_get_color_depth(&info->var, &info->fix) == 1)
+		erase &= ~0x400;
+	logo_height = fb_prepare_logo(info, ops->rotate);
+	logo_lines = DIV_ROUND_UP(logo_height, vc->vc_font.height);
+	q = (unsigned short *) (vc->vc_origin +
+				vc->vc_size_row * rows);
+	step = logo_lines * cols;
+	for (r = q - logo_lines * cols; r < q; r++)
+		if (scr_readw(r) != vc->vc_video_erase_char)
+			break;
+	if (r != q && new_rows >= rows + logo_lines) {
+		save = kmalloc(logo_lines * new_cols * 2, GFP_KERNEL);
+		if (save) {
+			int i = cols < new_cols ? cols : new_cols;
+			scr_memsetw(save, erase, logo_lines * new_cols * 2);
+			r = q - step;
+			for (cnt = 0; cnt < logo_lines; cnt++, r += i)
+				scr_memcpyw(save + cnt * new_cols, r, 2 * i);
+			r = q;
+		}
+	}
+	if (r == q) {
+		/* We can scroll screen down */
+		r = q - step - cols;
+		for (cnt = rows - logo_lines; cnt > 0; cnt--) {
+			scr_memcpyw(r + step, r, vc->vc_size_row);
+			r -= cols;
+		}
+		if (!save) {
+			int lines;
+			if (vc->vc_y + logo_lines >= rows)
+				lines = rows - vc->vc_y - 1;
+			else
+				lines = logo_lines;
+			vc->vc_y += lines;
+			vc->vc_pos += lines * vc->vc_size_row;
+		}
+	}
+	scr_memsetw((unsigned short *) vc->vc_origin,
+		    erase,
+		    vc->vc_size_row * logo_lines);
+
+	if (con_is_visible(vc) && vc->vc_mode == KD_TEXT) {
+		fbcon_clear_margins(vc, 0);
+		update_screen(vc);
+	}
+
+	if (save) {
+		q = (unsigned short *) (vc->vc_origin +
+					vc->vc_size_row *
+					rows);
+		scr_memcpyw(q, save, logo_lines * new_cols * 2);
+		vc->vc_y += logo_lines;
+		vc->vc_pos += logo_lines * vc->vc_size_row;
+		kfree(save);
+	}
+
+	if (logo_lines > vc->vc_bottom) {
+		logo_shown = FBCON_LOGO_CANSHOW;
+		printk(KERN_INFO
+		       "fbcon_init: disabling boot logo (logo bigger than screen).\n");
+	} else if (logo_shown != FBCON_LOGO_DONTSHOW) {
+		logo_shown = FBCON_LOGO_DRAW;
+		vc->vc_top = logo_lines;
+	}
+}
+#endif /* MODULE */
+
+#ifdef CONFIG_FB_TILEBLITTING
+static void set_blitting_type(struct vc_data *vc, struct fb_info *info)
+{
+	struct fbcon_ops *ops = info->fbcon_par;
+
+	ops->p = &fb_display[vc->vc_num];
+
+	if ((info->flags & FBINFO_MISC_TILEBLITTING))
+		fbcon_set_tileops(vc, info);
+	else {
+		fbcon_set_rotation(info);
+		fbcon_set_bitops(ops);
+	}
+}
+
+static int fbcon_invalid_charcount(struct fb_info *info, unsigned charcount)
+{
+	int err = 0;
+
+	if (info->flags & FBINFO_MISC_TILEBLITTING &&
+	    info->tileops->fb_get_tilemax(info) < charcount)
+		err = 1;
+
+	return err;
+}
+#else
+static void set_blitting_type(struct vc_data *vc, struct fb_info *info)
+{
+	struct fbcon_ops *ops = info->fbcon_par;
+
+	info->flags &= ~FBINFO_MISC_TILEBLITTING;
+	ops->p = &fb_display[vc->vc_num];
+	fbcon_set_rotation(info);
+	fbcon_set_bitops(ops);
+}
+
+static int fbcon_invalid_charcount(struct fb_info *info, unsigned charcount)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FB_TILEBLITTING */
+
+static int con2fb_acquire_newinfo(struct vc_data *vc, struct fb_info *info,
+				  int unit, int oldidx)
+{
+	struct fbcon_ops *ops =
NULL; + int err = 0; + + if (!try_module_get(info->fbops->owner)) + err = -ENODEV; + + if (!err && info->fbops->fb_open && + info->fbops->fb_open(info, 0)) + err = -ENODEV; + + if (!err) { + ops = kzalloc(sizeof(struct fbcon_ops), GFP_KERNEL); + if (!ops) + err = -ENOMEM; + } + + if (!err) { + ops->cur_blink_jiffies = HZ / 5; + info->fbcon_par = ops; + + if (vc) + set_blitting_type(vc, info); + } + + if (err) { + con2fb_map[unit] = oldidx; + module_put(info->fbops->owner); + } + + return err; +} + +static int con2fb_release_oldinfo(struct vc_data *vc, struct fb_info *oldinfo, + struct fb_info *newinfo, int unit, + int oldidx, int found) +{ + struct fbcon_ops *ops = oldinfo->fbcon_par; + int err = 0, ret; + + if (oldinfo->fbops->fb_release && + oldinfo->fbops->fb_release(oldinfo, 0)) { + con2fb_map[unit] = oldidx; + if (!found && newinfo->fbops->fb_release) + newinfo->fbops->fb_release(newinfo, 0); + if (!found) + module_put(newinfo->fbops->owner); + err = -ENODEV; + } + + if (!err) { + fbcon_del_cursor_timer(oldinfo); + kfree(ops->cursor_state.mask); + kfree(ops->cursor_data); + kfree(ops->cursor_src); + kfree(ops->fontbuffer); + kfree(oldinfo->fbcon_par); + oldinfo->fbcon_par = NULL; + module_put(oldinfo->fbops->owner); + /* + If oldinfo and newinfo are driving the same hardware, + the fb_release() method of oldinfo may attempt to + restore the hardware state. This will leave the + newinfo in an undefined state. Thus, a call to + fb_set_par() may be needed for the newinfo. + */ + if (newinfo && newinfo->fbops->fb_set_par) { + ret = newinfo->fbops->fb_set_par(newinfo); + + if (ret) + printk(KERN_ERR "con2fb_release_oldinfo: " + "detected unhandled fb_set_par error, " + "error code %d\n", ret); + } + } + + return err; +} + +static void con2fb_init_display(struct vc_data *vc, struct fb_info *info, + int unit, int show_logo) +{ + struct fbcon_ops *ops = info->fbcon_par; + int ret; + + ops->currcon = fg_console; + + if (info->fbops->fb_set_par && !(ops->flags & FBCON_FLAGS_INIT)) { + ret = info->fbops->fb_set_par(info); + + if (ret) + printk(KERN_ERR "con2fb_init_display: detected " + "unhandled fb_set_par error, " + "error code %d\n", ret); + } + + ops->flags |= FBCON_FLAGS_INIT; + ops->graphics = 0; + fbcon_set_disp(info, &info->var, unit); + + if (show_logo) { + struct vc_data *fg_vc = vc_cons[fg_console].d; + struct fb_info *fg_info = + registered_fb[con2fb_map[fg_console]]; + + fbcon_prepare_logo(fg_vc, fg_info, fg_vc->vc_cols, + fg_vc->vc_rows, fg_vc->vc_cols, + fg_vc->vc_rows); + } + + update_screen(vc_cons[fg_console].d); +} + +/** + * set_con2fb_map - map console to frame buffer device + * @unit: virtual console number to map + * @newidx: frame buffer index to map virtual console to + * @user: user request + * + * Maps a virtual console @unit to a frame buffer device + * @newidx. + * + * This should be called with the console lock held. 
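+ *
+ * As a usage sketch (the values here are illustrative, not from this
+ * patch): a caller that wants virtual console 1 driven by fb0 would,
+ * with the console lock held, do
+ *
+ *	err = set_con2fb_map(1, 0, 0);
+ *
+ * and check for a 0 (success) or negative errno return.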
+ */ +static int set_con2fb_map(int unit, int newidx, int user) +{ + struct vc_data *vc = vc_cons[unit].d; + int oldidx = con2fb_map[unit]; + struct fb_info *info = registered_fb[newidx]; + struct fb_info *oldinfo = NULL; + int found, err = 0; + + if (oldidx == newidx) + return 0; + + if (!info) + return -EINVAL; + + if (!search_for_mapped_con() || !con_is_bound(&fb_con)) { + info_idx = newidx; + return do_fbcon_takeover(0); + } + + if (oldidx != -1) + oldinfo = registered_fb[oldidx]; + + found = search_fb_in_map(newidx); + + con2fb_map[unit] = newidx; + if (!err && !found) + err = con2fb_acquire_newinfo(vc, info, unit, oldidx); + + + /* + * If old fb is not mapped to any of the consoles, + * fbcon should release it. + */ + if (!err && oldinfo && !search_fb_in_map(oldidx)) + err = con2fb_release_oldinfo(vc, oldinfo, info, unit, oldidx, + found); + + if (!err) { + int show_logo = (fg_console == 0 && !user && + logo_shown != FBCON_LOGO_DONTSHOW); + + if (!found) + fbcon_add_cursor_timer(info); + con2fb_map_boot[unit] = newidx; + con2fb_init_display(vc, info, unit, show_logo); + } + + if (!search_fb_in_map(info_idx)) + info_idx = newidx; + + return err; +} + +/* + * Low Level Operations + */ +/* NOTE: fbcon cannot be __init: it may be called from do_take_over_console later */ +static int var_to_display(struct display *disp, + struct fb_var_screeninfo *var, + struct fb_info *info) +{ + disp->xres_virtual = var->xres_virtual; + disp->yres_virtual = var->yres_virtual; + disp->bits_per_pixel = var->bits_per_pixel; + disp->grayscale = var->grayscale; + disp->nonstd = var->nonstd; + disp->accel_flags = var->accel_flags; + disp->height = var->height; + disp->width = var->width; + disp->red = var->red; + disp->green = var->green; + disp->blue = var->blue; + disp->transp = var->transp; + disp->rotate = var->rotate; + disp->mode = fb_match_mode(var, &info->modelist); + if (disp->mode == NULL) + /* This should not happen */ + return -EINVAL; + return 0; +} + +static void display_to_var(struct fb_var_screeninfo *var, + struct display *disp) +{ + fb_videomode_to_var(var, disp->mode); + var->xres_virtual = disp->xres_virtual; + var->yres_virtual = disp->yres_virtual; + var->bits_per_pixel = disp->bits_per_pixel; + var->grayscale = disp->grayscale; + var->nonstd = disp->nonstd; + var->accel_flags = disp->accel_flags; + var->height = disp->height; + var->width = disp->width; + var->red = disp->red; + var->green = disp->green; + var->blue = disp->blue; + var->transp = disp->transp; + var->rotate = disp->rotate; +} + +static const char *fbcon_startup(void) +{ + const char *display_desc = "frame buffer device"; + struct display *p = &fb_display[fg_console]; + struct vc_data *vc = vc_cons[fg_console].d; + const struct font_desc *font = NULL; + struct module *owner; + struct fb_info *info = NULL; + struct fbcon_ops *ops; + int rows, cols; + + /* + * If num_registered_fb is zero, this is a call for the dummy part. + * The frame buffer devices weren't initialized yet. 
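+	 * (Likewise for info_idx == -1: no framebuffer has been chosen
+	 * to drive the console yet, so there is nothing to set up.)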
+ */ + if (!num_registered_fb || info_idx == -1) + return display_desc; + /* + * Instead of blindly using registered_fb[0], we use info_idx, set by + * fb_console_init(); + */ + info = registered_fb[info_idx]; + if (!info) + return NULL; + + owner = info->fbops->owner; + if (!try_module_get(owner)) + return NULL; + if (info->fbops->fb_open && info->fbops->fb_open(info, 0)) { + module_put(owner); + return NULL; + } + + ops = kzalloc(sizeof(struct fbcon_ops), GFP_KERNEL); + if (!ops) { + module_put(owner); + return NULL; + } + + ops->currcon = -1; + ops->graphics = 1; + ops->cur_rotate = -1; + ops->cur_blink_jiffies = HZ / 5; + info->fbcon_par = ops; + p->con_rotate = initial_rotation; + set_blitting_type(vc, info); + + if (info->fix.type != FB_TYPE_TEXT) { + if (fbcon_softback_size) { + if (!softback_buf) { + softback_buf = + (unsigned long) + kmalloc(fbcon_softback_size, + GFP_KERNEL); + if (!softback_buf) { + fbcon_softback_size = 0; + softback_top = 0; + } + } + } else { + if (softback_buf) { + kfree((void *) softback_buf); + softback_buf = 0; + softback_top = 0; + } + } + if (softback_buf) + softback_in = softback_top = softback_curr = + softback_buf; + softback_lines = 0; + } + + /* Setup default font */ + if (!p->fontdata && !vc->vc_font.data) { + if (!fontname[0] || !(font = find_font(fontname))) + font = get_default_font(info->var.xres, + info->var.yres, + info->pixmap.blit_x, + info->pixmap.blit_y); + vc->vc_font.width = font->width; + vc->vc_font.height = font->height; + vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.charcount = 256; /* FIXME Need to support more fonts */ + } else { + p->fontdata = vc->vc_font.data; + } + + cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); + rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + cols /= vc->vc_font.width; + rows /= vc->vc_font.height; + vc_resize(vc, cols, rows); + + DPRINTK("mode: %s\n", info->fix.id); + DPRINTK("visual: %d\n", info->fix.visual); + DPRINTK("res: %dx%d-%d\n", info->var.xres, + info->var.yres, + info->var.bits_per_pixel); + + fbcon_add_cursor_timer(info); + fbcon_has_exited = 0; + return display_desc; +} + +static void fbcon_init(struct vc_data *vc, int init) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops; + struct vc_data **default_mode = vc->vc_display_fg; + struct vc_data *svc = *default_mode; + struct display *t, *p = &fb_display[vc->vc_num]; + int logo = 1, new_rows, new_cols, rows, cols, charcnt = 256; + int cap, ret; + + if (info_idx == -1 || info == NULL) + return; + + cap = info->flags; + + if (vc != svc || logo_shown == FBCON_LOGO_DONTSHOW || + (info->fix.type == FB_TYPE_TEXT)) + logo = 0; + + if (var_to_display(p, &info->var, info)) + return; + + if (!info->fbcon_par) + con2fb_acquire_newinfo(vc, info, vc->vc_num, -1); + + /* If we are not the first console on this + fb, copy the font from that console */ + t = &fb_display[fg_console]; + if (!p->fontdata) { + if (t->fontdata) { + struct vc_data *fvc = vc_cons[fg_console].d; + + vc->vc_font.data = (void *)(p->fontdata = + fvc->vc_font.data); + vc->vc_font.width = fvc->vc_font.width; + vc->vc_font.height = fvc->vc_font.height; + p->userfont = t->userfont; + + if (p->userfont) + REFCOUNT(p->fontdata)++; + } else { + const struct font_desc *font = NULL; + + if (!fontname[0] || !(font = find_font(fontname))) + font = get_default_font(info->var.xres, + info->var.yres, + info->pixmap.blit_x, + info->pixmap.blit_y); + vc->vc_font.width = font->width; + vc->vc_font.height = 
font->height; + vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.charcount = 256; /* FIXME Need to + support more fonts */ + } + } + + if (p->userfont) + charcnt = FNTCHARCNT(p->fontdata); + + vc->vc_panic_force_write = !!(info->flags & FBINFO_CAN_FORCE_OUTPUT); + vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1); + vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800; + if (charcnt == 256) { + vc->vc_hi_font_mask = 0; + } else { + vc->vc_hi_font_mask = 0x100; + if (vc->vc_can_do_color) + vc->vc_complement_mask <<= 1; + } + + if (!*svc->vc_uni_pagedir_loc) + con_set_default_unimap(svc); + if (!*vc->vc_uni_pagedir_loc) + con_copy_unimap(vc, svc); + + ops = info->fbcon_par; + ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); + p->con_rotate = initial_rotation; + set_blitting_type(vc, info); + + cols = vc->vc_cols; + rows = vc->vc_rows; + new_cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); + new_rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + new_cols /= vc->vc_font.width; + new_rows /= vc->vc_font.height; + + /* + * We must always set the mode. The mode of the previous console + * driver could be in the same resolution but we are using different + * hardware so we have to initialize the hardware. + * + * We need to do it in fbcon_init() to prevent screen corruption. + */ + if (con_is_visible(vc) && vc->vc_mode == KD_TEXT) { + if (info->fbops->fb_set_par && + !(ops->flags & FBCON_FLAGS_INIT)) { + ret = info->fbops->fb_set_par(info); + + if (ret) + printk(KERN_ERR "fbcon_init: detected " + "unhandled fb_set_par error, " + "error code %d\n", ret); + } + + ops->flags |= FBCON_FLAGS_INIT; + } + + ops->graphics = 0; + + if ((cap & FBINFO_HWACCEL_COPYAREA) && + !(cap & FBINFO_HWACCEL_DISABLED)) + p->scrollmode = SCROLL_MOVE; + else /* default to something safe */ + p->scrollmode = SCROLL_REDRAW; + + /* + * ++guenther: console.c:vc_allocate() relies on initializing + * vc_{cols,rows}, but we must not set those if we are only + * resizing the console. 
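+	 * Put differently: on the initial call (init != 0) the sizes are
+	 * assigned directly below, while a later mode change must go
+	 * through vc_resize() so the VT layer can reallocate its buffers.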
+ */ + if (init) { + vc->vc_cols = new_cols; + vc->vc_rows = new_rows; + } else + vc_resize(vc, new_cols, new_rows); + + if (logo) + fbcon_prepare_logo(vc, info, cols, rows, new_cols, new_rows); + + if (vc == svc && softback_buf) + fbcon_update_softback(vc); + + if (ops->rotate_font && ops->rotate_font(info, vc)) { + ops->rotate = FB_ROTATE_UR; + set_blitting_type(vc, info); + } + + ops->p = &fb_display[fg_console]; +} + +static void fbcon_free_font(struct display *p, bool freefont) +{ + if (freefont && p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0)) + kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int)); + p->fontdata = NULL; + p->userfont = 0; +} + +static void set_vc_hi_font(struct vc_data *vc, bool set); + +static void fbcon_deinit(struct vc_data *vc) +{ + struct display *p = &fb_display[vc->vc_num]; + struct fb_info *info; + struct fbcon_ops *ops; + int idx; + bool free_font = true; + + idx = con2fb_map[vc->vc_num]; + + if (idx == -1) + goto finished; + + info = registered_fb[idx]; + + if (!info) + goto finished; + + if (info->flags & FBINFO_MISC_FIRMWARE) + free_font = false; + ops = info->fbcon_par; + + if (!ops) + goto finished; + + if (con_is_visible(vc)) + fbcon_del_cursor_timer(info); + + ops->flags &= ~FBCON_FLAGS_INIT; +finished: + + fbcon_free_font(p, free_font); + if (free_font) + vc->vc_font.data = NULL; + + if (vc->vc_hi_font_mask) + set_vc_hi_font(vc, false); + + if (!con_is_bound(&fb_con)) + fbcon_exit(); + + return; +} + +/* ====================================================================== */ + +/* fbcon_XXX routines - interface used by the world + * + * This system is now divided into two levels because of complications + * caused by hardware scrolling. Top level functions: + * + * fbcon_bmove(), fbcon_clear(), fbcon_putc(), fbcon_clear_margins() + * + * handles y values in range [0, scr_height-1] that correspond to real + * screen positions. y_wrap shift means that first line of bitmap may be + * anywhere on this display. These functions convert lineoffsets to + * bitmap offsets and deal with the wrap-around case by splitting blits. + * + * fbcon_bmove_physical_8() -- These functions fast implementations + * fbcon_clear_physical_8() -- of original fbcon_XXX fns. + * fbcon_putc_physical_8() -- (font width != 8) may be added later + * + * WARNING: + * + * At the moment fbcon_putc() cannot blit across vertical wrap boundary + * Implies should only really hardware scroll in rows. Only reason for + * restriction is simplicity & efficiency at the moment. + */ + +static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, + int width) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + + struct display *p = &fb_display[vc->vc_num]; + u_int y_break; + + if (fbcon_is_inactive(vc, info)) + return; + + if (!height || !width) + return; + + if (sy < vc->vc_top && vc->vc_top == logo_lines) { + vc->vc_top = 0; + /* + * If the font dimensions are not an integral of the display + * dimensions then the ops->clear below won't end up clearing + * the margins. Call clear_margins here in case the logo + * bitmap stretched into the margin area. 
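+		 * For example (numbers purely illustrative): an 8x16 font
+		 * on a 1280x1024 mode gives 1024/16 = 64 full character
+		 * rows, but on 800x600 only 37 rows fit, leaving an
+		 * 8-pixel strip at the bottom that ops->clear never
+		 * touches.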
+ */ + fbcon_clear_margins(vc, 0); + } + + /* Split blits that cross physical y_wrap boundary */ + + y_break = p->vrows - p->yscroll; + if (sy < y_break && sy + height - 1 >= y_break) { + u_int b = y_break - sy; + ops->clear(vc, info, real_y(p, sy), sx, b, width); + ops->clear(vc, info, real_y(p, sy + b), sx, height - b, + width); + } else + ops->clear(vc, info, real_y(p, sy), sx, height, width); +} + +static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, + int count, int ypos, int xpos) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + if (!fbcon_is_inactive(vc, info)) + ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, + get_color(vc, info, scr_readw(s), 1), + get_color(vc, info, scr_readw(s), 0)); +} + +static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) +{ + unsigned short chr; + + scr_writew(c, &chr); + fbcon_putcs(vc, &chr, 1, ypos, xpos); +} + +static void fbcon_clear_margins(struct vc_data *vc, int bottom_only) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + + if (!fbcon_is_inactive(vc, info)) + ops->clear_margins(vc, info, bottom_only); +} + +static void fbcon_cursor(struct vc_data *vc, int mode) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + int y; + int c = scr_readw((u16 *) vc->vc_pos); + + ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); + + if (fbcon_is_inactive(vc, info) || vc->vc_deccm != 1) + return; + + if (vc->vc_cursor_type & 0x10) + fbcon_del_cursor_timer(info); + else + fbcon_add_cursor_timer(info); + + ops->cursor_flash = (mode == CM_ERASE) ? 0 : 1; + if (mode & CM_SOFTBACK) { + mode &= ~CM_SOFTBACK; + y = softback_lines; + } else { + if (softback_lines) + fbcon_set_origin(vc); + y = 0; + } + + ops->cursor(vc, info, mode, y, get_color(vc, info, c, 1), + get_color(vc, info, c, 0)); +} + +static int scrollback_phys_max = 0; +static int scrollback_max = 0; +static int scrollback_current = 0; + +static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, + int unit) +{ + struct display *p, *t; + struct vc_data **default_mode, *vc; + struct vc_data *svc; + struct fbcon_ops *ops = info->fbcon_par; + int rows, cols, charcnt = 256; + + p = &fb_display[unit]; + + if (var_to_display(p, var, info)) + return; + + vc = vc_cons[unit].d; + + if (!vc) + return; + + default_mode = vc->vc_display_fg; + svc = *default_mode; + t = &fb_display[svc->vc_num]; + + if (!vc->vc_font.data) { + vc->vc_font.data = (void *)(p->fontdata = t->fontdata); + vc->vc_font.width = (*default_mode)->vc_font.width; + vc->vc_font.height = (*default_mode)->vc_font.height; + p->userfont = t->userfont; + if (p->userfont) + REFCOUNT(p->fontdata)++; + } + if (p->userfont) + charcnt = FNTCHARCNT(p->fontdata); + + var->activate = FB_ACTIVATE_NOW; + info->var.activate = var->activate; + var->yoffset = info->var.yoffset; + var->xoffset = info->var.xoffset; + fb_set_var(info, var); + ops->var = info->var; + vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix)!=1); + vc->vc_complement_mask = vc->vc_can_do_color ? 
0x7700 : 0x0800; + if (charcnt == 256) { + vc->vc_hi_font_mask = 0; + } else { + vc->vc_hi_font_mask = 0x100; + if (vc->vc_can_do_color) + vc->vc_complement_mask <<= 1; + } + + if (!*svc->vc_uni_pagedir_loc) + con_set_default_unimap(svc); + if (!*vc->vc_uni_pagedir_loc) + con_copy_unimap(vc, svc); + + cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); + rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + cols /= vc->vc_font.width; + rows /= vc->vc_font.height; + vc_resize(vc, cols, rows); + + if (con_is_visible(vc)) { + update_screen(vc); + if (softback_buf) + fbcon_update_softback(vc); + } +} + +static __inline__ void ywrap_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + if (p->yscroll >= p->vrows) /* Deal with wrap */ + p->yscroll -= p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ywrap_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + if (p->yscroll < 0) /* Deal with wrap */ + p->yscroll += p->vrows; + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode |= FB_VMODE_YWRAP; + ops->update_start(info); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_up(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll += count; + if (p->yscroll > p->vrows - vc->vc_rows) { + ops->bmove(vc, info, p->vrows - vc->vc_rows, + 0, 0, 0, vc->vc_rows, vc->vc_cols); + p->yscroll -= p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_up_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *p = &fb_display[vc->vc_num]; + + p->yscroll += count; + + if (p->yscroll > p->vrows - vc->vc_rows) { + p->yscroll -= p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t + count, vc->vc_rows - count, t); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max += count; + if (scrollback_max > scrollback_phys_max) + scrollback_max = scrollback_phys_max; + scrollback_current = 0; +} + +static __inline__ void ypan_down(struct vc_data *vc, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct display *p = &fb_display[vc->vc_num]; + struct fbcon_ops *ops = info->fbcon_par; + + p->yscroll -= count; + if (p->yscroll < 0) { + ops->bmove(vc, 
info, 0, 0, p->vrows - vc->vc_rows, + 0, vc->vc_rows, vc->vc_cols); + p->yscroll += p->vrows - vc->vc_rows; + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *p = &fb_display[vc->vc_num]; + + p->yscroll -= count; + + if (p->yscroll < 0) { + p->yscroll += p->vrows - vc->vc_rows; + fbcon_redraw_move(vc, p, t, vc->vc_rows - count, t + count); + } + + ops->var.xoffset = 0; + ops->var.yoffset = p->yscroll * vc->vc_font.height; + ops->var.vmode &= ~FB_VMODE_YWRAP; + ops->update_start(info); + fbcon_clear_margins(vc, 1); + scrollback_max -= count; + if (scrollback_max < 0) + scrollback_max = 0; + scrollback_current = 0; +} + +static void fbcon_redraw_softback(struct vc_data *vc, struct display *p, + long delta) +{ + int count = vc->vc_rows; + unsigned short *d, *s; + unsigned long n; + int line = 0; + + d = (u16 *) softback_curr; + if (d == (u16 *) softback_in) + d = (u16 *) vc->vc_origin; + n = softback_curr + delta * vc->vc_size_row; + softback_lines -= delta; + if (delta < 0) { + if (softback_curr < softback_top && n < softback_buf) { + n += softback_end - softback_buf; + if (n < softback_top) { + softback_lines -= + (softback_top - n) / vc->vc_size_row; + n = softback_top; + } + } else if (softback_curr >= softback_top + && n < softback_top) { + softback_lines -= + (softback_top - n) / vc->vc_size_row; + n = softback_top; + } + } else { + if (softback_curr > softback_in && n >= softback_end) { + n += softback_buf - softback_end; + if (n > softback_in) { + n = softback_in; + softback_lines = 0; + } + } else if (softback_curr <= softback_in && n > softback_in) { + n = softback_in; + softback_lines = 0; + } + } + if (n == softback_curr) + return; + softback_curr = n; + s = (u16 *) softback_curr; + if (s == (u16 *) softback_in) + s = (u16 *) vc->vc_origin; + while (count--) { + unsigned short *start; + unsigned short *le; + unsigned short c; + int x = 0; + unsigned short attr = 1; + + start = s; + le = advance_row(s, 1); + do { + c = scr_readw(s); + if (attr != (c & 0xff00)) { + attr = c & 0xff00; + if (s > start) { + fbcon_putcs(vc, start, s - start, + line, x); + x += s - start; + start = s; + } + } + if (c == scr_readw(d)) { + if (s > start) { + fbcon_putcs(vc, start, s - start, + line, x); + x += s - start + 1; + start = s + 1; + } else { + x++; + start++; + } + } + s++; + d++; + } while (s < le); + if (s > start) + fbcon_putcs(vc, start, s - start, line, x); + line++; + if (d == (u16 *) softback_end) + d = (u16 *) softback_buf; + if (d == (u16 *) softback_in) + d = (u16 *) vc->vc_origin; + if (s == (u16 *) softback_end) + s = (u16 *) softback_buf; + if (s == (u16 *) softback_in) + s = (u16 *) vc->vc_origin; + } +} + +static void fbcon_redraw_move(struct vc_data *vc, struct display *p, + int line, int count, int dy) +{ + unsigned short *s = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + unsigned short attr = 1; + + do { + c = scr_readw(s); + if (attr != (c & 0xff00)) { + attr = c & 0xff00; + if (s > start) { + 
fbcon_putcs(vc, start, s - start, + dy, x); + x += s - start; + start = s; + } + } + console_conditional_schedule(); + s++; + } while (s < le); + if (s > start) + fbcon_putcs(vc, start, s - start, dy, x); + console_conditional_schedule(); + dy++; + } +} + +static void fbcon_redraw_blit(struct vc_data *vc, struct fb_info *info, + struct display *p, int line, int count, int ycount) +{ + int offset = ycount * vc->vc_cols; + unsigned short *d = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + unsigned short *s = d + offset; + struct fbcon_ops *ops = info->fbcon_par; + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + + do { + c = scr_readw(s); + + if (c == scr_readw(d)) { + if (s > start) { + ops->bmove(vc, info, line + ycount, x, + line, x, 1, s-start); + x += s - start + 1; + start = s + 1; + } else { + x++; + start++; + } + } + + scr_writew(c, d); + console_conditional_schedule(); + s++; + d++; + } while (s < le); + if (s > start) + ops->bmove(vc, info, line + ycount, x, line, x, 1, + s-start); + console_conditional_schedule(); + if (ycount > 0) + line++; + else { + line--; + /* NOTE: We subtract two lines from these pointers */ + s -= vc->vc_size_row; + d -= vc->vc_size_row; + } + } +} + +static void fbcon_redraw(struct vc_data *vc, struct display *p, + int line, int count, int offset) +{ + unsigned short *d = (unsigned short *) + (vc->vc_origin + vc->vc_size_row * line); + unsigned short *s = d + offset; + + while (count--) { + unsigned short *start = s; + unsigned short *le = advance_row(s, 1); + unsigned short c; + int x = 0; + unsigned short attr = 1; + + do { + c = scr_readw(s); + if (attr != (c & 0xff00)) { + attr = c & 0xff00; + if (s > start) { + fbcon_putcs(vc, start, s - start, + line, x); + x += s - start; + start = s; + } + } + if (c == scr_readw(d)) { + if (s > start) { + fbcon_putcs(vc, start, s - start, + line, x); + x += s - start + 1; + start = s + 1; + } else { + x++; + start++; + } + } + scr_writew(c, d); + console_conditional_schedule(); + s++; + d++; + } while (s < le); + if (s > start) + fbcon_putcs(vc, start, s - start, line, x); + console_conditional_schedule(); + if (offset > 0) + line++; + else { + line--; + /* NOTE: We subtract two lines from these pointers */ + s -= vc->vc_size_row; + d -= vc->vc_size_row; + } + } +} + +static inline void fbcon_softback_note(struct vc_data *vc, int t, + int count) +{ + unsigned short *p; + + if (vc->vc_num != fg_console) + return; + p = (unsigned short *) (vc->vc_origin + t * vc->vc_size_row); + + while (count) { + scr_memcpyw((u16 *) softback_in, p, vc->vc_size_row); + count--; + p = advance_row(p, 1); + softback_in += vc->vc_size_row; + if (softback_in == softback_end) + softback_in = softback_buf; + if (softback_in == softback_top) { + softback_top += vc->vc_size_row; + if (softback_top == softback_end) + softback_top = softback_buf; + } + } + softback_curr = softback_in; +} + +static bool fbcon_scroll(struct vc_data *vc, unsigned int t, unsigned int b, + enum con_scroll dir, unsigned int count) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct display *p = &fb_display[vc->vc_num]; + int scroll_partial = info->flags & FBINFO_PARTIAL_PAN_OK; + + if (fbcon_is_inactive(vc, info)) + return true; + + fbcon_cursor(vc, CM_ERASE); + + /* + * ++Geert: Only use ywrap/ypan if the console is in text mode + * ++Andrew: Only use ypan on hardware text mode when scrolling the + * whole screen (prevents flicker). 
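+	 * As a rough guide to the p->scrollmode cases below: SCROLL_MOVE
+	 * blits lines into place (fbcon_redraw_blit), SCROLL_WRAP_MOVE
+	 * rotates the display start via ywrap_up/ywrap_down,
+	 * SCROLL_PAN_MOVE and SCROLL_PAN_REDRAW pan the display start
+	 * and patch up the remainder by blitting or redrawing, and
+	 * SCROLL_REDRAW repaints every affected line in software.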
+ */ + + switch (dir) { + case SM_UP: + if (count > vc->vc_rows) /* Maximum realistic size */ + count = vc->vc_rows; + if (softback_top) + fbcon_softback_note(vc, t, count); + if (logo_shown >= 0) + goto redraw_up; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, t, b - t - count, + count); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + break; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ywrap_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_redraw_move(vc, p, 0, t, count); + ypan_up_redraw(vc, t, count); + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, + vc->vc_rows - b, b); + } else + fbcon_redraw_move(vc, p, t + count, b - t - count, t); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((p->yscroll + count <= + 2 * (p->vrows - vc->vc_rows)) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (t > 0) + fbcon_bmove(vc, 0, 0, count, 0, t, + vc->vc_cols); + ypan_up(vc, count); + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b - count, 0, b, 0, + vc->vc_rows - b, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t + count, 0, t, 0, + b - t - count, vc->vc_cols); + else + goto redraw_up; + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_up: + fbcon_redraw(vc, p, t, b - t - count, + count * vc->vc_cols); + fbcon_clear(vc, b - count, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + (b - count)), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } + break; + + case SM_DOWN: + if (count > vc->vc_rows) /* Maximum realistic size */ + count = vc->vc_rows; + if (logo_shown >= 0) + goto redraw_down; + switch (p->scrollmode) { + case SCROLL_MOVE: + fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, + -count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + break; + + case SCROLL_WRAP_MOVE: + if (b - t - count > 3 * vc->vc_rows >> 2) { + if (vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ywrap_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_MOVE: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if 
(vc->vc_rows - b > 0) + fbcon_bmove(vc, b, 0, b - count, 0, + vc->vc_rows - b, + vc->vc_cols); + ypan_down(vc, count); + if (t > 0) + fbcon_bmove(vc, count, 0, 0, 0, t, + vc->vc_cols); + } else if (info->flags & FBINFO_READS_FAST) + fbcon_bmove(vc, t, 0, t + count, 0, + b - t - count, vc->vc_cols); + else + goto redraw_down; + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_PAN_REDRAW: + if ((count - p->yscroll <= p->vrows - vc->vc_rows) + && ((!scroll_partial && (b - t == vc->vc_rows)) + || (scroll_partial + && (b - t - count > + 3 * vc->vc_rows >> 2)))) { + if (vc->vc_rows - b > 0) + fbcon_redraw_move(vc, p, b, vc->vc_rows - b, + b - count); + ypan_down_redraw(vc, t, count); + if (t > 0) + fbcon_redraw_move(vc, p, count, t, 0); + } else + fbcon_redraw_move(vc, p, t, b - t - count, t + count); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + break; + + case SCROLL_REDRAW: + redraw_down: + fbcon_redraw(vc, p, b - 1, b - t - count, + -count * vc->vc_cols); + fbcon_clear(vc, t, 0, count, vc->vc_cols); + scr_memsetw((unsigned short *) (vc->vc_origin + + vc->vc_size_row * + t), + vc->vc_video_erase_char, + vc->vc_size_row * count); + return true; + } + } + return false; +} + + +static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, + int height, int width) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct display *p = &fb_display[vc->vc_num]; + + if (fbcon_is_inactive(vc, info)) + return; + + if (!width || !height) + return; + + /* Split blits that cross physical y_wrap case. + * Pathological case involves 4 blits, better to use recursive + * code rather than unrolled case + * + * Recursive invocations don't need to erase the cursor over and + * over again, so we use fbcon_bmove_rec() + */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, height, width, + p->vrows - p->yscroll); +} + +static void fbcon_bmove_rec(struct vc_data *vc, struct display *p, int sy, int sx, + int dy, int dx, int height, int width, u_int y_break) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + u_int b; + + if (sy < y_break && sy + height > y_break) { + b = y_break - sy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + + if (dy < y_break && dy + height > y_break) { + b = y_break - dy; + if (dy < sy) { /* Avoid trashing self */ + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + } else { + fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, + height - b, width, y_break); + fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, + y_break); + } + return; + } + ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx, + height, width); +} + +static void updatescrollmode(struct display *p, + struct fb_info *info, + struct vc_data *vc) +{ + struct fbcon_ops *ops = info->fbcon_par; + int fh = vc->vc_font.height; + int cap = info->flags; + u16 t = 0; + int ypan = FBCON_SWAP(ops->rotate, info->fix.ypanstep, + info->fix.xpanstep); + int ywrap = FBCON_SWAP(ops->rotate, info->fix.ywrapstep, t); + int yres = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + int vyres = FBCON_SWAP(ops->rotate, info->var.yres_virtual, + 
info->var.xres_virtual); + int good_pan = (cap & FBINFO_HWACCEL_YPAN) && + divides(ypan, vc->vc_font.height) && vyres > yres; + int good_wrap = (cap & FBINFO_HWACCEL_YWRAP) && + divides(ywrap, vc->vc_font.height) && + divides(vc->vc_font.height, vyres) && + divides(vc->vc_font.height, yres); + int reading_fast = cap & FBINFO_READS_FAST; + int fast_copyarea = (cap & FBINFO_HWACCEL_COPYAREA) && + !(cap & FBINFO_HWACCEL_DISABLED); + int fast_imageblit = (cap & FBINFO_HWACCEL_IMAGEBLIT) && + !(cap & FBINFO_HWACCEL_DISABLED); + + p->vrows = vyres/fh; + if (yres > (fh * (vc->vc_rows + 1))) + p->vrows -= (yres - (fh * vc->vc_rows)) / fh; + if ((yres % fh) && (vyres % fh < yres % fh)) + p->vrows--; + + if (good_wrap || good_pan) { + if (reading_fast || fast_copyarea) + p->scrollmode = good_wrap ? + SCROLL_WRAP_MOVE : SCROLL_PAN_MOVE; + else + p->scrollmode = good_wrap ? SCROLL_REDRAW : + SCROLL_PAN_REDRAW; + } else { + if (reading_fast || (fast_copyarea && !fast_imageblit)) + p->scrollmode = SCROLL_MOVE; + else + p->scrollmode = SCROLL_REDRAW; + } +} + +static int fbcon_resize(struct vc_data *vc, unsigned int width, + unsigned int height, unsigned int user) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *p = &fb_display[vc->vc_num]; + struct fb_var_screeninfo var = info->var; + int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; + + virt_w = FBCON_SWAP(ops->rotate, width, height); + virt_h = FBCON_SWAP(ops->rotate, height, width); + virt_fw = FBCON_SWAP(ops->rotate, vc->vc_font.width, + vc->vc_font.height); + virt_fh = FBCON_SWAP(ops->rotate, vc->vc_font.height, + vc->vc_font.width); + var.xres = virt_w * virt_fw; + var.yres = virt_h * virt_fh; + x_diff = info->var.xres - var.xres; + y_diff = info->var.yres - var.yres; + if (x_diff < 0 || x_diff > virt_fw || + y_diff < 0 || y_diff > virt_fh) { + const struct fb_videomode *mode; + + DPRINTK("attempting resize %ix%i\n", var.xres, var.yres); + mode = fb_find_best_mode(&var, &info->modelist); + if (mode == NULL) + return -EINVAL; + display_to_var(&var, p); + fb_videomode_to_var(&var, mode); + + if (virt_w > var.xres/virt_fw || virt_h > var.yres/virt_fh) + return -EINVAL; + + DPRINTK("resize now %ix%i\n", var.xres, var.yres); + if (con_is_visible(vc)) { + var.activate = FB_ACTIVATE_NOW | + FB_ACTIVATE_FORCE; + fb_set_var(info, &var); + } + var_to_display(p, &info->var, info); + ops->var = info->var; + } + updatescrollmode(p, info, vc); + return 0; +} + +static int fbcon_switch(struct vc_data *vc) +{ + struct fb_info *info, *old_info = NULL; + struct fbcon_ops *ops; + struct display *p = &fb_display[vc->vc_num]; + struct fb_var_screeninfo var; + int i, ret, prev_console, charcnt = 256; + + info = registered_fb[con2fb_map[vc->vc_num]]; + ops = info->fbcon_par; + + if (softback_top) { + if (softback_lines) + fbcon_set_origin(vc); + softback_top = softback_curr = softback_in = softback_buf; + softback_lines = 0; + fbcon_update_softback(vc); + } + + if (logo_shown >= 0) { + struct vc_data *conp2 = vc_cons[logo_shown].d; + + if (conp2->vc_top == logo_lines + && conp2->vc_bottom == conp2->vc_rows) + conp2->vc_top = 0; + logo_shown = FBCON_LOGO_CANSHOW; + } + + prev_console = ops->currcon; + if (prev_console != -1) + old_info = registered_fb[con2fb_map[prev_console]]; + /* + * FIXME: If we have multiple fbdev's loaded, we need to + * update all info->currcon. Perhaps, we can place this + * in a centralized structure, but this might break some + * drivers. 
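+	 * (The loop just below works around this for now by walking all
+	 * registered framebuffers and updating each fbcon_ops instance;
+	 * the line that follows shows the hypothetical centralized form.)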
+	 *
+	 * info->currcon = vc->vc_num;
+	 */
+	for (i = 0; i < FB_MAX; i++) {
+		if (registered_fb[i] != NULL && registered_fb[i]->fbcon_par) {
+			struct fbcon_ops *o = registered_fb[i]->fbcon_par;
+
+			o->currcon = vc->vc_num;
+		}
+	}
+	memset(&var, 0, sizeof(struct fb_var_screeninfo));
+	display_to_var(&var, p);
+	var.activate = FB_ACTIVATE_NOW;
+
+	/*
+	 * make sure we don't unnecessarily trip the memcmp()
+	 * in fb_set_var()
+	 */
+	info->var.activate = var.activate;
+	var.vmode |= info->var.vmode & ~FB_VMODE_MASK;
+	fb_set_var(info, &var);
+	ops->var = info->var;
+
+	if (old_info != NULL && (old_info != info ||
+				 info->flags & FBINFO_MISC_ALWAYS_SETPAR)) {
+		if (info->fbops->fb_set_par) {
+			ret = info->fbops->fb_set_par(info);
+
+			if (ret)
+				printk(KERN_ERR "fbcon_switch: detected "
+				       "unhandled fb_set_par error, "
+				       "error code %d\n", ret);
+		}
+
+		if (old_info != info)
+			fbcon_del_cursor_timer(old_info);
+	}
+
+	if (fbcon_is_inactive(vc, info) ||
+	    ops->blank_state != FB_BLANK_UNBLANK)
+		fbcon_del_cursor_timer(info);
+	else
+		fbcon_add_cursor_timer(info);
+
+	set_blitting_type(vc, info);
+	ops->cursor_reset = 1;
+
+	if (ops->rotate_font && ops->rotate_font(info, vc)) {
+		ops->rotate = FB_ROTATE_UR;
+		set_blitting_type(vc, info);
+	}
+
+	vc->vc_can_do_color = (fb_get_color_depth(&info->var, &info->fix) != 1);
+	vc->vc_complement_mask = vc->vc_can_do_color ? 0x7700 : 0x0800;
+
+	if (p->userfont)
+		charcnt = FNTCHARCNT(vc->vc_font.data);
+
+	if (charcnt > 256)
+		vc->vc_complement_mask <<= 1;
+
+	updatescrollmode(p, info, vc);
+
+	switch (p->scrollmode) {
+	case SCROLL_WRAP_MOVE:
+		scrollback_phys_max = p->vrows - vc->vc_rows;
+		break;
+	case SCROLL_PAN_MOVE:
+	case SCROLL_PAN_REDRAW:
+		scrollback_phys_max = p->vrows - 2 * vc->vc_rows;
+		if (scrollback_phys_max < 0)
+			scrollback_phys_max = 0;
+		break;
+	default:
+		scrollback_phys_max = 0;
+		break;
+	}
+
+	scrollback_max = 0;
+	scrollback_current = 0;
+
+	if (!fbcon_is_inactive(vc, info)) {
+		ops->var.xoffset = ops->var.yoffset = p->yscroll = 0;
+		ops->update_start(info);
+	}
+
+	fbcon_set_palette(vc, color_table);
+	fbcon_clear_margins(vc, 0);
+
+	if (logo_shown == FBCON_LOGO_DRAW) {
+		logo_shown = fg_console;
+		/* This is protected above by initmem_freed */
+		fb_show_logo(info, ops->rotate);
+		update_region(vc,
+			      vc->vc_origin + vc->vc_size_row * vc->vc_top,
+			      vc->vc_size_row * (vc->vc_bottom -
+						 vc->vc_top) / 2);
+		return 0;
+	}
+	return 1;
+}
+
+static void fbcon_generic_blank(struct vc_data *vc, struct fb_info *info,
+				int blank)
+{
+	struct fb_event event;
+
+	if (blank) {
+		unsigned short charmask = vc->vc_hi_font_mask ?
+			0x1ff : 0xff;
+		unsigned short oldc;
+
+		oldc = vc->vc_video_erase_char;
+		vc->vc_video_erase_char &= charmask;
+		fbcon_clear(vc, 0, 0, vc->vc_rows, vc->vc_cols);
+		vc->vc_video_erase_char = oldc;
+	}
+
+	if (!lock_fb_info(info))
+		return;
+	event.info = info;
+	event.data = &blank;
+	fb_notifier_call_chain(FB_EVENT_CONBLANK, &event);
+	unlock_fb_info(info);
+}
+
+static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch)
+{
+	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
+	struct fbcon_ops *ops = info->fbcon_par;
+
+	if (mode_switch) {
+		struct fb_var_screeninfo var = info->var;
+
+		ops->graphics = 1;
+
+		if (!blank) {
+			var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE;
+			fb_set_var(info, &var);
+			ops->graphics = 0;
+			ops->var = info->var;
+		}
+	}
+
+	if (!fbcon_is_inactive(vc, info)) {
+		if (ops->blank_state != blank) {
+			ops->blank_state = blank;
+			fbcon_cursor(vc, blank ?
CM_ERASE : CM_DRAW); + ops->cursor_flash = (!blank); + + if (!(info->flags & FBINFO_MISC_USEREVENT)) + if (fb_blank(info, blank)) + fbcon_generic_blank(vc, info, blank); + } + + if (!blank) + update_screen(vc); + } + + if (mode_switch || fbcon_is_inactive(vc, info) || + ops->blank_state != FB_BLANK_UNBLANK) + fbcon_del_cursor_timer(info); + else + fbcon_add_cursor_timer(info); + + return 0; +} + +static int fbcon_debug_enter(struct vc_data *vc) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + + ops->save_graphics = ops->graphics; + ops->graphics = 0; + if (info->fbops->fb_debug_enter) + info->fbops->fb_debug_enter(info); + fbcon_set_palette(vc, color_table); + return 0; +} + +static int fbcon_debug_leave(struct vc_data *vc) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + + ops->graphics = ops->save_graphics; + if (info->fbops->fb_debug_leave) + info->fbops->fb_debug_leave(info); + return 0; +} + +static int fbcon_get_font(struct vc_data *vc, struct console_font *font) +{ + u8 *fontdata = vc->vc_font.data; + u8 *data = font->data; + int i, j; + + font->width = vc->vc_font.width; + font->height = vc->vc_font.height; + font->charcount = vc->vc_hi_font_mask ? 512 : 256; + if (!font->data) + return 0; + + if (font->width <= 8) { + j = vc->vc_font.height; + for (i = 0; i < font->charcount; i++) { + memcpy(data, fontdata, j); + memset(data + j, 0, 32 - j); + data += 32; + fontdata += j; + } + } else if (font->width <= 16) { + j = vc->vc_font.height * 2; + for (i = 0; i < font->charcount; i++) { + memcpy(data, fontdata, j); + memset(data + j, 0, 64 - j); + data += 64; + fontdata += j; + } + } else if (font->width <= 24) { + for (i = 0; i < font->charcount; i++) { + for (j = 0; j < vc->vc_font.height; j++) { + *data++ = fontdata[0]; + *data++ = fontdata[1]; + *data++ = fontdata[2]; + fontdata += sizeof(u32); + } + memset(data, 0, 3 * (32 - j)); + data += 3 * (32 - j); + } + } else { + j = vc->vc_font.height * 4; + for (i = 0; i < font->charcount; i++) { + memcpy(data, fontdata, j); + memset(data + j, 0, 128 - j); + data += 128; + fontdata += j; + } + } + return 0; +} + +/* set/clear vc_hi_font_mask and update vc attrs accordingly */ +static void set_vc_hi_font(struct vc_data *vc, bool set) +{ + if (!set) { + vc->vc_hi_font_mask = 0; + if (vc->vc_can_do_color) { + vc->vc_complement_mask >>= 1; + vc->vc_s_complement_mask >>= 1; + } + + /* ++Edmund: reorder the attribute bits */ + if (vc->vc_can_do_color) { + unsigned short *cp = + (unsigned short *) vc->vc_origin; + int count = vc->vc_screenbuf_size / 2; + unsigned short c; + for (; count > 0; count--, cp++) { + c = scr_readw(cp); + scr_writew(((c & 0xfe00) >> 1) | + (c & 0xff), cp); + } + c = vc->vc_video_erase_char; + vc->vc_video_erase_char = + ((c & 0xfe00) >> 1) | (c & 0xff); + vc->vc_attr >>= 1; + } + } else { + vc->vc_hi_font_mask = 0x100; + if (vc->vc_can_do_color) { + vc->vc_complement_mask <<= 1; + vc->vc_s_complement_mask <<= 1; + } + + /* ++Edmund: reorder the attribute bits */ + { + unsigned short *cp = + (unsigned short *) vc->vc_origin; + int count = vc->vc_screenbuf_size / 2; + unsigned short c; + for (; count > 0; count--, cp++) { + unsigned short newc; + c = scr_readw(cp); + if (vc->vc_can_do_color) + newc = + ((c & 0xff00) << 1) | (c & + 0xff); + else + newc = c & ~0x100; + scr_writew(newc, cp); + } + c = vc->vc_video_erase_char; + if (vc->vc_can_do_color) { + vc->vc_video_erase_char = + ((c & 0xff00) << 1) 
| (c & 0xff); + vc->vc_attr <<= 1; + } else + vc->vc_video_erase_char = c & ~0x100; + } + } +} + +static int fbcon_do_set_font(struct vc_data *vc, int w, int h, + const u8 * data, int userfont) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *p = &fb_display[vc->vc_num]; + int resize; + int cnt; + char *old_data = NULL; + + if (con_is_visible(vc) && softback_lines) + fbcon_set_origin(vc); + + resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); + if (p->userfont) + old_data = vc->vc_font.data; + if (userfont) + cnt = FNTCHARCNT(data); + else + cnt = 256; + vc->vc_font.data = (void *)(p->fontdata = data); + if ((p->userfont = userfont)) + REFCOUNT(data)++; + vc->vc_font.width = w; + vc->vc_font.height = h; + if (vc->vc_hi_font_mask && cnt == 256) + set_vc_hi_font(vc, false); + else if (!vc->vc_hi_font_mask && cnt == 512) + set_vc_hi_font(vc, true); + + if (resize) { + int cols, rows; + + cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); + rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + cols /= w; + rows /= h; + vc_resize(vc, cols, rows); + if (con_is_visible(vc) && softback_buf) + fbcon_update_softback(vc); + } else if (con_is_visible(vc) + && vc->vc_mode == KD_TEXT) { + fbcon_clear_margins(vc, 0); + update_screen(vc); + } + + if (old_data && (--REFCOUNT(old_data) == 0)) + kfree(old_data - FONT_EXTRA_WORDS * sizeof(int)); + return 0; +} + +static int fbcon_copy_font(struct vc_data *vc, int con) +{ + struct display *od = &fb_display[con]; + struct console_font *f = &vc->vc_font; + + if (od->fontdata == f->data) + return 0; /* already the same font... */ + return fbcon_do_set_font(vc, f->width, f->height, od->fontdata, od->userfont); +} + +/* + * User asked to set font; we are guaranteed that + * a) width and height are in range 1..32 + * b) charcount does not exceed 512 + * but lets not assume that, since someone might someday want to use larger + * fonts. And charcount of 512 is small for unicode support. + * + * However, user space gives the font in 32 rows , regardless of + * actual font height. So a new API is needed if support for larger fonts + * is ever implemented. + */ + +static int fbcon_set_font(struct vc_data *vc, struct console_font *font, unsigned flags) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + unsigned charcount = font->charcount; + int w = font->width; + int h = font->height; + int size; + int i, csum; + u8 *new_data, *data = font->data; + int pitch = (font->width+7) >> 3; + + /* Is there a reason why fbconsole couldn't handle any charcount >256? 
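+	 * (Today, 512-glyph fonts already work by borrowing bit 8 of each
+	 * character cell as a ninth glyph-index bit -- see set_vc_hi_font()
+	 * above -- which is also why the attribute masks get shifted there.)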
+ * If not this check should be changed to charcount < 256 */ + if (charcount != 256 && charcount != 512) + return -EINVAL; + + /* Make sure drawing engine can handle the font */ + if (!(info->pixmap.blit_x & (1 << (font->width - 1))) || + !(info->pixmap.blit_y & (1 << (font->height - 1)))) + return -EINVAL; + + /* Make sure driver can handle the font length */ + if (fbcon_invalid_charcount(info, charcount)) + return -EINVAL; + + size = h * pitch * charcount; + + new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, GFP_USER); + + if (!new_data) + return -ENOMEM; + + new_data += FONT_EXTRA_WORDS * sizeof(int); + FNTSIZE(new_data) = size; + FNTCHARCNT(new_data) = charcount; + REFCOUNT(new_data) = 0; /* usage counter */ + for (i=0; i< charcount; i++) { + memcpy(new_data + i*h*pitch, data + i*32*pitch, h*pitch); + } + + /* Since linux has a nice crc32 function use it for counting font + * checksums. */ + csum = crc32(0, new_data, size); + + FNTSUM(new_data) = csum; + /* Check if the same font is on some other console already */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + struct vc_data *tmp = vc_cons[i].d; + + if (fb_display[i].userfont && + fb_display[i].fontdata && + FNTSUM(fb_display[i].fontdata) == csum && + FNTSIZE(fb_display[i].fontdata) == size && + tmp->vc_font.width == w && + !memcmp(fb_display[i].fontdata, new_data, size)) { + kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); + new_data = (u8 *)fb_display[i].fontdata; + break; + } + } + return fbcon_do_set_font(vc, font->width, font->height, new_data, 1); +} + +static int fbcon_set_def_font(struct vc_data *vc, struct console_font *font, char *name) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + const struct font_desc *f; + + if (!name) + f = get_default_font(info->var.xres, info->var.yres, + info->pixmap.blit_x, info->pixmap.blit_y); + else if (!(f = find_font(name))) + return -ENOENT; + + font->width = f->width; + font->height = f->height; + return fbcon_do_set_font(vc, f->width, f->height, f->data, 0); +} + +static u16 palette_red[16]; +static u16 palette_green[16]; +static u16 palette_blue[16]; + +static struct fb_cmap palette_cmap = { + 0, 16, palette_red, palette_green, palette_blue, NULL +}; + +static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table) +{ + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + int i, j, k, depth; + u8 val; + + if (fbcon_is_inactive(vc, info)) + return; + + if (!con_is_visible(vc)) + return; + + depth = fb_get_color_depth(&info->var, &info->fix); + if (depth > 3) { + for (i = j = 0; i < 16; i++) { + k = table[i]; + val = vc->vc_palette[j++]; + palette_red[k] = (val << 8) | val; + val = vc->vc_palette[j++]; + palette_green[k] = (val << 8) | val; + val = vc->vc_palette[j++]; + palette_blue[k] = (val << 8) | val; + } + palette_cmap.len = 16; + palette_cmap.start = 0; + /* + * If framebuffer is capable of less than 16 colors, + * use default palette of fbcon. 
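+	 * E.g. at depth 3 the display can show at most 1 << 3 = 8 colors,
+	 * so the 8-entry default colormap is copied instead of the
+	 * console palette.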
+ */ + } else + fb_copy_cmap(fb_default_cmap(1 << depth), &palette_cmap); + + fb_set_cmap(&palette_cmap, info); +} + +static u16 *fbcon_screen_pos(struct vc_data *vc, int offset) +{ + unsigned long p; + int line; + + if (vc->vc_num != fg_console || !softback_lines) + return (u16 *) (vc->vc_origin + offset); + line = offset / vc->vc_size_row; + if (line >= softback_lines) + return (u16 *) (vc->vc_origin + offset - + softback_lines * vc->vc_size_row); + p = softback_curr + offset; + if (p >= softback_end) + p += softback_buf - softback_end; + return (u16 *) p; +} + +static unsigned long fbcon_getxy(struct vc_data *vc, unsigned long pos, + int *px, int *py) +{ + unsigned long ret; + int x, y; + + if (pos >= vc->vc_origin && pos < vc->vc_scr_end) { + unsigned long offset = (pos - vc->vc_origin) / 2; + + x = offset % vc->vc_cols; + y = offset / vc->vc_cols; + if (vc->vc_num == fg_console) + y += softback_lines; + ret = pos + (vc->vc_cols - x) * 2; + } else if (vc->vc_num == fg_console && softback_lines) { + unsigned long offset = pos - softback_curr; + + if (pos < softback_curr) + offset += softback_end - softback_buf; + offset /= 2; + x = offset % vc->vc_cols; + y = offset / vc->vc_cols; + ret = pos + (vc->vc_cols - x) * 2; + if (ret == softback_end) + ret = softback_buf; + if (ret == softback_in) + ret = vc->vc_origin; + } else { + /* Should not happen */ + x = y = 0; + ret = vc->vc_origin; + } + if (px) + *px = x; + if (py) + *py = y; + return ret; +} + +/* As we might be inside of softback, we may work with non-contiguous buffer, + that's why we have to use a separate routine. */ +static void fbcon_invert_region(struct vc_data *vc, u16 * p, int cnt) +{ + while (cnt--) { + u16 a = scr_readw(p); + if (!vc->vc_can_do_color) + a ^= 0x0800; + else if (vc->vc_hi_font_mask == 0x100) + a = ((a) & 0x11ff) | (((a) & 0xe000) >> 4) | + (((a) & 0x0e00) << 4); + else + a = ((a) & 0x88ff) | (((a) & 0x7000) >> 4) | + (((a) & 0x0700) << 4); + scr_writew(a, p++); + if (p == (u16 *) softback_end) + p = (u16 *) softback_buf; + if (p == (u16 *) softback_in) + p = (u16 *) vc->vc_origin; + } +} + +static void fbcon_scrolldelta(struct vc_data *vc, int lines) +{ + struct fb_info *info = registered_fb[con2fb_map[fg_console]]; + struct fbcon_ops *ops = info->fbcon_par; + struct display *disp = &fb_display[fg_console]; + int offset, limit, scrollback_old; + + if (softback_top) { + if (vc->vc_num != fg_console) + return; + if (vc->vc_mode != KD_TEXT || !lines) + return; + if (logo_shown >= 0) { + struct vc_data *conp2 = vc_cons[logo_shown].d; + + if (conp2->vc_top == logo_lines + && conp2->vc_bottom == conp2->vc_rows) + conp2->vc_top = 0; + if (logo_shown == vc->vc_num) { + unsigned long p, q; + int i; + + p = softback_in; + q = vc->vc_origin + + logo_lines * vc->vc_size_row; + for (i = 0; i < logo_lines; i++) { + if (p == softback_top) + break; + if (p == softback_buf) + p = softback_end; + p -= vc->vc_size_row; + q -= vc->vc_size_row; + scr_memcpyw((u16 *) q, (u16 *) p, + vc->vc_size_row); + } + softback_in = softback_curr = p; + update_region(vc, vc->vc_origin, + logo_lines * vc->vc_cols); + } + logo_shown = FBCON_LOGO_CANSHOW; + } + fbcon_cursor(vc, CM_ERASE | CM_SOFTBACK); + fbcon_redraw_softback(vc, disp, lines); + fbcon_cursor(vc, CM_DRAW | CM_SOFTBACK); + return; + } + + if (!scrollback_phys_max) + return; + + scrollback_old = scrollback_current; + scrollback_current -= lines; + if (scrollback_current < 0) + scrollback_current = 0; + else if (scrollback_current > scrollback_max) + scrollback_current = 
scrollback_max; + if (scrollback_current == scrollback_old) + return; + + if (fbcon_is_inactive(vc, info)) + return; + + fbcon_cursor(vc, CM_ERASE); + + offset = disp->yscroll - scrollback_current; + limit = disp->vrows; + switch (disp->scrollmode) { + case SCROLL_WRAP_MOVE: + info->var.vmode |= FB_VMODE_YWRAP; + break; + case SCROLL_PAN_MOVE: + case SCROLL_PAN_REDRAW: + limit -= vc->vc_rows; + info->var.vmode &= ~FB_VMODE_YWRAP; + break; + } + if (offset < 0) + offset += limit; + else if (offset >= limit) + offset -= limit; + + ops->var.xoffset = 0; + ops->var.yoffset = offset * vc->vc_font.height; + ops->update_start(info); + + if (!scrollback_current) + fbcon_cursor(vc, CM_DRAW); +} + +static int fbcon_set_origin(struct vc_data *vc) +{ + if (softback_lines) + fbcon_scrolldelta(vc, softback_lines); + return 0; +} + +static void fbcon_suspended(struct fb_info *info) +{ + struct vc_data *vc = NULL; + struct fbcon_ops *ops = info->fbcon_par; + + if (!ops || ops->currcon < 0) + return; + vc = vc_cons[ops->currcon].d; + + /* Clear cursor, restore saved data */ + fbcon_cursor(vc, CM_ERASE); +} + +static void fbcon_resumed(struct fb_info *info) +{ + struct vc_data *vc; + struct fbcon_ops *ops = info->fbcon_par; + + if (!ops || ops->currcon < 0) + return; + vc = vc_cons[ops->currcon].d; + + update_screen(vc); +} + +static void fbcon_modechanged(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + struct display *p; + int rows, cols; + + if (!ops || ops->currcon < 0) + return; + vc = vc_cons[ops->currcon].d; + if (vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[ops->currcon]] != info) + return; + + p = &fb_display[vc->vc_num]; + set_blitting_type(vc, info); + + if (con_is_visible(vc)) { + var_to_display(p, &info->var, info); + cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); + rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + cols /= vc->vc_font.width; + rows /= vc->vc_font.height; + vc_resize(vc, cols, rows); + updatescrollmode(p, info, vc); + scrollback_max = 0; + scrollback_current = 0; + + if (!fbcon_is_inactive(vc, info)) { + ops->var.xoffset = ops->var.yoffset = p->yscroll = 0; + ops->update_start(info); + } + + fbcon_set_palette(vc, color_table); + update_screen(vc); + if (softback_buf) + fbcon_update_softback(vc); + } +} + +static void fbcon_set_all_vcs(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + struct display *p; + int i, rows, cols, fg = -1; + + if (!ops || ops->currcon < 0) + return; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (con_is_visible(vc)) { + fg = i; + continue; + } + + p = &fb_display[vc->vc_num]; + set_blitting_type(vc, info); + var_to_display(p, &info->var, info); + cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres); + rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres); + cols /= vc->vc_font.width; + rows /= vc->vc_font.height; + vc_resize(vc, cols, rows); + } + + if (fg != -1) + fbcon_modechanged(info); +} + +static int fbcon_mode_deleted(struct fb_info *info, + struct fb_videomode *mode) +{ + struct fb_info *fb_info; + struct display *p; + int i, j, found = 0; + + /* before deletion, ensure that mode is not in use */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + j = con2fb_map[i]; + if (j == -1) + continue; + fb_info = registered_fb[j]; + if (fb_info != info) + continue; + p = &fb_display[i]; + if 
(!p || !p->mode) + continue; + if (fb_mode_is_equal(p->mode, mode)) { + found = 1; + break; + } + } + return found; +} + +#ifdef CONFIG_VT_HW_CONSOLE_BINDING +static int fbcon_unbind(void) +{ + int ret; + + ret = do_unbind_con_driver(&fb_con, first_fb_vc, last_fb_vc, + fbcon_is_default); + + if (!ret) + fbcon_has_console_bind = 0; + + return ret; +} +#else +static inline int fbcon_unbind(void) +{ + return -EINVAL; +} +#endif /* CONFIG_VT_HW_CONSOLE_BINDING */ + +/* called with console_lock held */ +static int fbcon_fb_unbind(int idx) +{ + int i, new_idx = -1, ret = 0; + + if (!fbcon_has_console_bind) + return 0; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] != idx && + con2fb_map[i] != -1) { + new_idx = i; + break; + } + } + + if (new_idx != -1) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == idx) + set_con2fb_map(i, new_idx, 0); + } + } else { + struct fb_info *info = registered_fb[idx]; + + /* This is sort of like set_con2fb_map, except it maps + * the consoles to no device and then releases the + * oldinfo to free memory and cancel the cursor blink + * timer. I can imagine this just becoming part of + * set_con2fb_map where new_idx is -1 + */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == idx) { + con2fb_map[i] = -1; + if (!search_fb_in_map(idx)) { + ret = con2fb_release_oldinfo(vc_cons[i].d, + info, NULL, i, + idx, 0); + if (ret) { + con2fb_map[i] = idx; + return ret; + } + } + } + } + ret = fbcon_unbind(); + } + + return ret; +} + +/* called with console_lock held */ +static int fbcon_fb_unregistered(struct fb_info *info) +{ + int i, idx; + + idx = info->node; + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == idx) + con2fb_map[i] = -1; + } + + if (idx == info_idx) { + info_idx = -1; + + for (i = 0; i < FB_MAX; i++) { + if (registered_fb[i] != NULL) { + info_idx = i; + break; + } + } + } + + if (info_idx != -1) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map[i] == -1) + con2fb_map[i] = info_idx; + } + } + + if (primary_device == idx) + primary_device = -1; + + if (!num_registered_fb) + do_unregister_con_driver(&fb_con); + + return 0; +} + +/* called with console_lock held */ +static void fbcon_remap_all(int idx) +{ + int i; + for (i = first_fb_vc; i <= last_fb_vc; i++) + set_con2fb_map(i, idx, 0); + + if (con_is_bound(&fb_con)) { + printk(KERN_INFO "fbcon: Remapping primary device, " + "fb%i, to tty %i-%i\n", idx, + first_fb_vc + 1, last_fb_vc + 1); + info_idx = idx; + } +} + +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY +static void fbcon_select_primary(struct fb_info *info) +{ + if (!map_override && primary_device == -1 && + fb_is_primary_device(info)) { + int i; + + printk(KERN_INFO "fbcon: %s (fb%i) is primary device\n", + info->fix.id, info->node); + primary_device = info->node; + + for (i = first_fb_vc; i <= last_fb_vc; i++) + con2fb_map_boot[i] = primary_device; + + if (con_is_bound(&fb_con)) { + printk(KERN_INFO "fbcon: Remapping primary device, " + "fb%i, to tty %i-%i\n", info->node, + first_fb_vc + 1, last_fb_vc + 1); + info_idx = primary_device; + } + } + +} +#else +static inline void fbcon_select_primary(struct fb_info *info) +{ + return; +} +#endif /* CONFIG_FRAMEBUFFER_DETECT_PRIMARY */ + +/* called with console_lock held */ +static int fbcon_fb_registered(struct fb_info *info) +{ + int ret = 0, i, idx; + + idx = info->node; + fbcon_select_primary(info); + + if (info_idx == -1) { + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map_boot[i] == 
idx) { + info_idx = idx; + break; + } + } + + if (info_idx != -1) + ret = do_fbcon_takeover(1); + } else { + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (con2fb_map_boot[i] == idx) + set_con2fb_map(i, idx, 0); + } + } + + return ret; +} + +static void fbcon_fb_blanked(struct fb_info *info, int blank) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + + if (!ops || ops->currcon < 0) + return; + + vc = vc_cons[ops->currcon].d; + if (vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[ops->currcon]] != info) + return; + + if (con_is_visible(vc)) { + if (blank) + do_blank_screen(0); + else + do_unblank_screen(0); + } + ops->blank_state = blank; +} + +static void fbcon_new_modelist(struct fb_info *info) +{ + int i; + struct vc_data *vc; + struct fb_var_screeninfo var; + const struct fb_videomode *mode; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + if (registered_fb[con2fb_map[i]] != info) + continue; + if (!fb_display[i].mode) + continue; + vc = vc_cons[i].d; + display_to_var(&var, &fb_display[i]); + mode = fb_find_nearest_mode(fb_display[i].mode, + &info->modelist); + fb_videomode_to_var(&var, mode); + fbcon_set_disp(info, &var, vc->vc_num); + } +} + +static void fbcon_get_requirement(struct fb_info *info, + struct fb_blit_caps *caps) +{ + struct vc_data *vc; + struct display *p; + + if (caps->flags) { + int i, charcnt; + + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (vc && vc->vc_mode == KD_TEXT && + info->node == con2fb_map[i]) { + p = &fb_display[i]; + caps->x |= 1 << (vc->vc_font.width - 1); + caps->y |= 1 << (vc->vc_font.height - 1); + charcnt = (p->userfont) ? + FNTCHARCNT(p->fontdata) : 256; + if (caps->len < charcnt) + caps->len = charcnt; + } + } + } else { + vc = vc_cons[fg_console].d; + + if (vc && vc->vc_mode == KD_TEXT && + info->node == con2fb_map[fg_console]) { + p = &fb_display[fg_console]; + caps->x = 1 << (vc->vc_font.width - 1); + caps->y = 1 << (vc->vc_font.height - 1); + caps->len = (p->userfont) ? 
+ FNTCHARCNT(p->fontdata) : 256; + } + } +} + +static int fbcon_event_notify(struct notifier_block *self, + unsigned long action, void *data) +{ + struct fb_event *event = data; + struct fb_info *info = event->info; + struct fb_videomode *mode; + struct fb_con2fbmap *con2fb; + struct fb_blit_caps *caps; + int idx, ret = 0; + + /* + * ignore all events except driver registration and deregistration + * if fbcon is not active + */ + if (fbcon_has_exited && !(action == FB_EVENT_FB_REGISTERED || + action == FB_EVENT_FB_UNREGISTERED)) + goto done; + + switch(action) { + case FB_EVENT_SUSPEND: + fbcon_suspended(info); + break; + case FB_EVENT_RESUME: + fbcon_resumed(info); + break; + case FB_EVENT_MODE_CHANGE: + fbcon_modechanged(info); + break; + case FB_EVENT_MODE_CHANGE_ALL: + fbcon_set_all_vcs(info); + break; + case FB_EVENT_MODE_DELETE: + mode = event->data; + ret = fbcon_mode_deleted(info, mode); + break; + case FB_EVENT_FB_UNBIND: + idx = info->node; + ret = fbcon_fb_unbind(idx); + break; + case FB_EVENT_FB_REGISTERED: + ret = fbcon_fb_registered(info); + break; + case FB_EVENT_FB_UNREGISTERED: + ret = fbcon_fb_unregistered(info); + break; + case FB_EVENT_SET_CONSOLE_MAP: + /* called with console lock held */ + con2fb = event->data; + ret = set_con2fb_map(con2fb->console - 1, + con2fb->framebuffer, 1); + break; + case FB_EVENT_GET_CONSOLE_MAP: + con2fb = event->data; + con2fb->framebuffer = con2fb_map[con2fb->console - 1]; + break; + case FB_EVENT_BLANK: + fbcon_fb_blanked(info, *(int *)event->data); + break; + case FB_EVENT_NEW_MODELIST: + fbcon_new_modelist(info); + break; + case FB_EVENT_GET_REQ: + caps = event->data; + fbcon_get_requirement(info, caps); + break; + case FB_EVENT_REMAP_ALL_CONSOLE: + idx = info->node; + fbcon_remap_all(idx); + break; + } +done: + return ret; +} + +/* + * The console `switch' structure for the frame buffer based console + */ + +static const struct consw fb_con = { + .owner = THIS_MODULE, + .con_startup = fbcon_startup, + .con_init = fbcon_init, + .con_deinit = fbcon_deinit, + .con_clear = fbcon_clear, + .con_putc = fbcon_putc, + .con_putcs = fbcon_putcs, + .con_cursor = fbcon_cursor, + .con_scroll = fbcon_scroll, + .con_switch = fbcon_switch, + .con_blank = fbcon_blank, + .con_font_set = fbcon_set_font, + .con_font_get = fbcon_get_font, + .con_font_default = fbcon_set_def_font, + .con_font_copy = fbcon_copy_font, + .con_set_palette = fbcon_set_palette, + .con_scrolldelta = fbcon_scrolldelta, + .con_set_origin = fbcon_set_origin, + .con_invert_region = fbcon_invert_region, + .con_screen_pos = fbcon_screen_pos, + .con_getxy = fbcon_getxy, + .con_resize = fbcon_resize, + .con_debug_enter = fbcon_debug_enter, + .con_debug_leave = fbcon_debug_leave, +}; + +static struct notifier_block fbcon_event_notifier = { + .notifier_call = fbcon_event_notify, +}; + +static ssize_t store_rotate(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct fb_info *info; + int rotate, idx; + char **last = NULL; + + if (fbcon_has_exited) + return count; + + console_lock(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + rotate = simple_strtoul(buf, last, 0); + fbcon_rotate(info, rotate); +err: + console_unlock(); + return count; +} + +static ssize_t store_rotate_all(struct device *device, + struct device_attribute *attr,const char *buf, + size_t count) +{ + struct fb_info *info; + int rotate, idx; + char **last = NULL; + + if (fbcon_has_exited) + return count; 
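+	/* Like the other fbcon sysfs handlers, everything below runs
+	 * under console_lock to serialize against framebuffer
+	 * (un)registration and console switches. */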
+ + console_lock(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + rotate = simple_strtoul(buf, last, 0); + fbcon_rotate_all(info, rotate); +err: + console_unlock(); + return count; +} + +static ssize_t show_rotate(struct device *device, + struct device_attribute *attr,char *buf) +{ + struct fb_info *info; + int rotate = 0, idx; + + if (fbcon_has_exited) + return 0; + + console_lock(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + rotate = fbcon_get_rotate(info); +err: + console_unlock(); + return snprintf(buf, PAGE_SIZE, "%d\n", rotate); +} + +static ssize_t show_cursor_blink(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct fb_info *info; + struct fbcon_ops *ops; + int idx, blink = -1; + + if (fbcon_has_exited) + return 0; + + console_lock(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + ops = info->fbcon_par; + + if (!ops) + goto err; + + blink = (ops->flags & FBCON_FLAGS_CURSOR_TIMER) ? 1 : 0; +err: + console_unlock(); + return snprintf(buf, PAGE_SIZE, "%d\n", blink); +} + +static ssize_t store_cursor_blink(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct fb_info *info; + int blink, idx; + char **last = NULL; + + if (fbcon_has_exited) + return count; + + console_lock(); + idx = con2fb_map[fg_console]; + + if (idx == -1 || registered_fb[idx] == NULL) + goto err; + + info = registered_fb[idx]; + + if (!info->fbcon_par) + goto err; + + blink = simple_strtoul(buf, last, 0); + + if (blink) { + fbcon_cursor_noblink = 0; + fbcon_add_cursor_timer(info); + } else { + fbcon_cursor_noblink = 1; + fbcon_del_cursor_timer(info); + } + +err: + console_unlock(); + return count; +} + +static struct device_attribute device_attrs[] = { + __ATTR(rotate, S_IRUGO|S_IWUSR, show_rotate, store_rotate), + __ATTR(rotate_all, S_IWUSR, NULL, store_rotate_all), + __ATTR(cursor_blink, S_IRUGO|S_IWUSR, show_cursor_blink, + store_cursor_blink), +}; + +static int fbcon_init_device(void) +{ + int i, error = 0; + + fbcon_has_sysfs = 1; + + for (i = 0; i < ARRAY_SIZE(device_attrs); i++) { + error = device_create_file(fbcon_device, &device_attrs[i]); + + if (error) + break; + } + + if (error) { + while (--i >= 0) + device_remove_file(fbcon_device, &device_attrs[i]); + + fbcon_has_sysfs = 0; + } + + return 0; +} + +static void fbcon_start(void) +{ + if (num_registered_fb) { + int i; + + console_lock(); + + for (i = 0; i < FB_MAX; i++) { + if (registered_fb[i] != NULL) { + info_idx = i; + break; + } + } + + do_fbcon_takeover(0); + console_unlock(); + + } +} + +static void fbcon_exit(void) +{ + struct fb_info *info; + int i, j, mapped; + + if (fbcon_has_exited) + return; + + kfree((void *)softback_buf); + softback_buf = 0UL; + + for (i = 0; i < FB_MAX; i++) { + int pending = 0; + + mapped = 0; + info = registered_fb[i]; + + if (info == NULL) + continue; + + if (info->queue.func) + pending = cancel_work_sync(&info->queue); + DPRINTK("fbcon: %s pending work\n", (pending ? 
"canceled" : + "no")); + + for (j = first_fb_vc; j <= last_fb_vc; j++) { + if (con2fb_map[j] == i) { + mapped = 1; + break; + } + } + + if (mapped) { + if (info->fbops->fb_release) + info->fbops->fb_release(info, 0); + module_put(info->fbops->owner); + + if (info->fbcon_par) { + struct fbcon_ops *ops = info->fbcon_par; + + fbcon_del_cursor_timer(info); + kfree(ops->cursor_src); + kfree(ops->cursor_state.mask); + kfree(info->fbcon_par); + info->fbcon_par = NULL; + } + + if (info->queue.func == fb_flashcursor) + info->queue.func = NULL; + } + } + + fbcon_has_exited = 1; +} + +void __init fb_console_init(void) +{ + int i; + + console_lock(); + fb_register_client(&fbcon_event_notifier); + fbcon_device = device_create(fb_class, NULL, MKDEV(0, 0), NULL, + "fbcon"); + + if (IS_ERR(fbcon_device)) { + printk(KERN_WARNING "Unable to create device " + "for fbcon; errno = %ld\n", + PTR_ERR(fbcon_device)); + fbcon_device = NULL; + } else + fbcon_init_device(); + + for (i = 0; i < MAX_NR_CONSOLES; i++) + con2fb_map[i] = -1; + + console_unlock(); + fbcon_start(); +} + +#ifdef MODULE + +static void __exit fbcon_deinit_device(void) +{ + int i; + + if (fbcon_has_sysfs) { + for (i = 0; i < ARRAY_SIZE(device_attrs); i++) + device_remove_file(fbcon_device, &device_attrs[i]); + + fbcon_has_sysfs = 0; + } +} + +void __exit fb_console_exit(void) +{ + console_lock(); + fb_unregister_client(&fbcon_event_notifier); + fbcon_deinit_device(); + device_destroy(fb_class, MKDEV(0, 0)); + fbcon_exit(); + do_unregister_con_driver(&fb_con); + console_unlock(); +} +#endif diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h new file mode 100644 index 000000000000..7aaa4eabbba0 --- /dev/null +++ b/drivers/video/fbdev/core/fbcon.h @@ -0,0 +1,265 @@ +/* + * linux/drivers/video/console/fbcon.h -- Low level frame buffer based console driver + * + * Copyright (C) 1997 Geert Uytterhoeven + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. 
+ */
+
+#ifndef _VIDEO_FBCON_H
+#define _VIDEO_FBCON_H
+
+#include <linux/types.h>
+#include <linux/vt_buffer.h>
+#include <linux/vt_kern.h>
+
+#include <asm/io.h>
+
+#define FBCON_FLAGS_INIT	1
+#define FBCON_FLAGS_CURSOR_TIMER	2
+
+   /*
+    *    This is the interface between the low-level console driver and the
+    *    low-level frame buffer device
+    */
+
+struct display {
+    /* Filled in by the low-level console driver */
+    const u_char *fontdata;
+    int userfont;		/* != 0 if fontdata kmalloc()ed */
+    u_short scrollmode;		/* Scroll Method */
+    u_short inverse;		/* != 0 text black on white as default */
+    short yscroll;		/* Hardware scrolling */
+    int vrows;			/* number of virtual rows */
+    int cursor_shape;
+    int con_rotate;
+    u32 xres_virtual;
+    u32 yres_virtual;
+    u32 height;
+    u32 width;
+    u32 bits_per_pixel;
+    u32 grayscale;
+    u32 nonstd;
+    u32 accel_flags;
+    u32 rotate;
+    struct fb_bitfield red;
+    struct fb_bitfield green;
+    struct fb_bitfield blue;
+    struct fb_bitfield transp;
+    const struct fb_videomode *mode;
+};
+
+struct fbcon_ops {
+	void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy,
+		      int sx, int dy, int dx, int height, int width);
+	void (*clear)(struct vc_data *vc, struct fb_info *info, int sy,
+		      int sx, int height, int width);
+	void (*putcs)(struct vc_data *vc, struct fb_info *info,
+		      const unsigned short *s, int count, int yy, int xx,
+		      int fg, int bg);
+	void (*clear_margins)(struct vc_data *vc, struct fb_info *info,
+			      int bottom_only);
+	void (*cursor)(struct vc_data *vc, struct fb_info *info, int mode,
+		       int softback_lines, int fg, int bg);
+	int  (*update_start)(struct fb_info *info);
+	int  (*rotate_font)(struct fb_info *info, struct vc_data *vc);
+	struct fb_var_screeninfo var;  /* copy of the current fb_var_screeninfo */
+	struct timer_list cursor_timer; /* Cursor timer   */
+	struct fb_cursor cursor_state;
+	struct display *p;
+	int    currcon;	                /* Current VC. */
+	int    cur_blink_jiffies;
+	int    cursor_flash;
+	int    cursor_reset;
+	int    blank_state;
+	int    graphics;
+	int    save_graphics; /* for debug enter/leave */
+	int    flags;
+	int    rotate;
+	int    cur_rotate;
+	char  *cursor_data;
+	u8    *fontbuffer;
+	u8    *fontdata;
+	u8    *cursor_src;
+	u32    cursor_size;
+	u32    fd_size;
+};
+    /*
+     *  Attribute Decoding
+     */
+
+/* Color */
+#define attr_fgcol(fgshift,s)    \
+	(((s) >> (fgshift)) & 0x0f)
+#define attr_bgcol(bgshift,s)    \
+	(((s) >> (bgshift)) & 0x0f)
+
+/* Monochrome */
+#define attr_bold(s) \
+	((s) & 0x200)
+#define attr_reverse(s) \
+	((s) & 0x800)
+#define attr_underline(s) \
+	((s) & 0x400)
+#define attr_blink(s) \
+	((s) & 0x8000)
+
+
+static inline int mono_col(const struct fb_info *info)
+{
+	__u32 max_len;
+	max_len = max(info->var.green.length, info->var.red.length);
+	max_len = max(info->var.blue.length, max_len);
+	return (~(0xfff << max_len)) & 0xff;
+}
+
+static inline int attr_col_ec(int shift, struct vc_data *vc,
+			      struct fb_info *info, int is_fg)
+{
+	int is_mono01;
+	int col;
+	int fg;
+	int bg;
+
+	if (!vc)
+		return 0;
+
+	if (vc->vc_can_do_color)
+		return is_fg ? attr_fgcol(shift,vc->vc_video_erase_char)
+			     : attr_bgcol(shift,vc->vc_video_erase_char);
+
+	if (!info)
+		return 0;
+
+	col = mono_col(info);
+	is_mono01 = info->fix.visual == FB_VISUAL_MONO01;
+
+	if (attr_reverse(vc->vc_video_erase_char)) {
+		fg = is_mono01 ? col : 0;
+		bg = is_mono01 ? 0 : col;
+	}
+	else {
+		fg = is_mono01 ? 0 : col;
+		bg = is_mono01 ? col : 0;
+	}
+
+	return is_fg ?
fg : bg; +} + +#define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) +#define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) + +/* Font */ +#define REFCOUNT(fd) (((int *)(fd))[-1]) +#define FNTSIZE(fd) (((int *)(fd))[-2]) +#define FNTCHARCNT(fd) (((int *)(fd))[-3]) +#define FNTSUM(fd) (((int *)(fd))[-4]) +#define FONT_EXTRA_WORDS 4 + + /* + * Scroll Method + */ + +/* There are several methods fbcon can use to move text around the screen: + * + * Operation Pan Wrap + *--------------------------------------------- + * SCROLL_MOVE copyarea No No + * SCROLL_PAN_MOVE copyarea Yes No + * SCROLL_WRAP_MOVE copyarea No Yes + * SCROLL_REDRAW imageblit No No + * SCROLL_PAN_REDRAW imageblit Yes No + * SCROLL_WRAP_REDRAW imageblit No Yes + * + * (SCROLL_WRAP_REDRAW is not implemented yet) + * + * In general, fbcon will choose the best scrolling + * method based on the rule below: + * + * Pan/Wrap > accel imageblit > accel copyarea > + * soft imageblit > (soft copyarea) + * + * Exception to the rule: Pan + accel copyarea is + * preferred over Pan + accel imageblit. + * + * The above is typical for PCI/AGP cards. Unless + * overridden, fbcon will never use soft copyarea. + * + * If you need to override the above rule, set the + * appropriate flags in fb_info->flags. For example, + * to prefer copyarea over imageblit, set + * FBINFO_READS_FAST. + * + * Other notes: + * + use the hardware engine to move the text + * (hw-accelerated copyarea() and fillrect()) + * + use hardware-supported panning on a large virtual screen + * + amifb can not only pan, but also wrap the display by N lines + * (i.e. visible line i = physical line (i+N) % yres). + * + read what's already rendered on the screen and + * write it in a different place (this is cfb_copyarea()) + * + re-render the text to the screen + * + * Whether to use wrapping or panning can only be figured out at + * runtime (when we know whether our font height is a multiple + * of the pan/wrap step) + * + */ + +#define SCROLL_MOVE 0x001 +#define SCROLL_PAN_MOVE 0x002 +#define SCROLL_WRAP_MOVE 0x003 +#define SCROLL_REDRAW 0x004 +#define SCROLL_PAN_REDRAW 0x005 + +#ifdef CONFIG_FB_TILEBLITTING +extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); +#endif +extern void fbcon_set_bitops(struct fbcon_ops *ops); +extern int soft_cursor(struct fb_info *info, struct fb_cursor *cursor); + +#define FBCON_ATTRIBUTE_UNDERLINE 1 +#define FBCON_ATTRIBUTE_REVERSE 2 +#define FBCON_ATTRIBUTE_BOLD 4 + +static inline int real_y(struct display *p, int ypos) +{ + int rows = p->vrows; + + ypos += p->yscroll; + return ypos < rows ? ypos : ypos - rows; +} + + +static inline int get_attribute(struct fb_info *info, u16 c) +{ + int attribute = 0; + + if (fb_get_color_depth(&info->var, &info->fix) == 1) { + if (attr_underline(c)) + attribute |= FBCON_ATTRIBUTE_UNDERLINE; + if (attr_reverse(c)) + attribute |= FBCON_ATTRIBUTE_REVERSE; + if (attr_bold(c)) + attribute |= FBCON_ATTRIBUTE_BOLD; + } + + return attribute; +} + +#define FBCON_SWAP(i,r,v) ({ \ + typeof(r) _r = (r); \ + typeof(v) _v = (v); \ + (void) (&_r == &_v); \ + (i == FB_ROTATE_UR || i == FB_ROTATE_UD) ? 
_r : _v; }) + +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_ROTATION +extern void fbcon_set_rotate(struct fbcon_ops *ops); +#else +#define fbcon_set_rotate(x) do {} while(0) +#endif /* CONFIG_FRAMEBUFFER_CONSOLE_ROTATION */ + +#endif /* _VIDEO_FBCON_H */ + diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c new file mode 100644 index 000000000000..2eeefa97bd4a --- /dev/null +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -0,0 +1,420 @@ +/* + * linux/drivers/video/console/fbcon_ccw.c -- Software Rotation - 270 degrees + * + * Copyright (C) 2005 Antonino Daplas + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fbcon.h" +#include "fbcon_rotate.h" + +/* + * Rotation 270 degrees + */ + +static void ccw_update_attr(u8 *dst, u8 *src, int attribute, + struct vc_data *vc) +{ + int i, j, offset = (vc->vc_font.height < 10) ? 1 : 2; + int width = (vc->vc_font.height + 7) >> 3; + int mod = vc->vc_font.height % 8; + u8 c, msk = ~(0xff << offset), msk1 = 0; + + if (mod) + msk <<= (8 - mod); + + if (offset > mod) + msk1 |= 0x01; + + for (i = 0; i < vc->vc_font.width; i++) { + for (j = 0; j < width; j++) { + c = *src; + + if (attribute & FBCON_ATTRIBUTE_UNDERLINE) { + if (j == width - 1) + c |= msk; + + if (msk1 && j == width - 2) + c |= msk1; + } + + if (attribute & FBCON_ATTRIBUTE_BOLD && i) + *(dst - width) |= c; + + if (attribute & FBCON_ATTRIBUTE_REVERSE) + c = ~c; + src++; + *dst++ = c; + } + } +} + + +static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + area.sx = sy * vc->vc_font.height; + area.sy = vyres - ((sx + width) * vc->vc_font.width); + area.dx = dy * vc->vc_font.height; + area.dy = vyres - ((dx + width) * vc->vc_font.width); + area.width = height * vc->vc_font.height; + area.height = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + +static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_fillrect region; + int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + region.color = attr_bgcol_ec(bgshift,vc,info); + region.dx = sy * vc->vc_font.height; + region.dy = vyres - ((sx + width) * vc->vc_font.width); + region.height = width * vc->vc_font.width; + region.width = height * vc->vc_font.height; + region.rop = ROP_COPY; + + info->fbops->fb_fillrect(info, ®ion); +} + +static inline void ccw_putcs_aligned(struct vc_data *vc, struct fb_info *info, + const u16 *s, u32 attr, u32 cnt, + u32 d_pitch, u32 s_pitch, u32 cellsize, + struct fb_image *image, u8 *buf, u8 *dst) +{ + struct fbcon_ops *ops = info->fbcon_par; + u16 charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; + u32 idx = (vc->vc_font.height + 7) >> 3; + u8 *src; + + while (cnt--) { + src = ops->fontbuffer + (scr_readw(s--) & charmask)*cellsize; + + if (attr) { + ccw_update_attr(buf, src, attr, vc); + src = buf; + } + + if (likely(idx == 1)) + __fb_pad_aligned_buffer(dst, d_pitch, src, idx, + vc->vc_font.width); + else + fb_pad_aligned_buffer(dst, d_pitch, src, idx, + vc->vc_font.width); + + dst += d_pitch * vc->vc_font.width; + } + + info->fbops->fb_imageblit(info, image); +} + +static void ccw_putcs(struct vc_data *vc, struct fb_info *info, + const unsigned short *s, int count, int yy, int xx, + int fg, int bg) +{ + struct fb_image image; + struct fbcon_ops *ops = info->fbcon_par; + u32 width = (vc->vc_font.height + 7)/8; + u32 cellsize = width * vc->vc_font.width; + u32 maxcnt = info->pixmap.size/cellsize; + u32 scan_align = info->pixmap.scan_align - 1; + u32 buf_align = info->pixmap.buf_align - 1; + u32 cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + if (!ops->fontbuffer) + return; + + image.fg_color = fg; + image.bg_color = bg; + image.dx = yy * vc->vc_font.height; + image.dy = vyres - ((xx + count) * vc->vc_font.width); + image.width = vc->vc_font.height; + image.depth = 1; + + if (attribute) { + buf = kmalloc(cellsize, GFP_KERNEL); + if (!buf) + return; + } + + s += count - 1; + + while (count) { + if (count > maxcnt) + cnt = maxcnt; + else + cnt = count; + + image.height = vc->vc_font.width * cnt; + pitch = ((image.width + 7) >> 3) + scan_align; + pitch &= ~scan_align; + size = pitch * image.height + buf_align; + size &= ~buf_align; + dst = fb_get_buffer_offset(info, &info->pixmap, size); + image.data = dst; + ccw_putcs_aligned(vc, info, s, attribute, cnt, pitch, + width, cellsize, &image, buf, dst); + image.dy += image.height; + count -= cnt; + s -= cnt; + } + + /* buf is always NULL except when in monochrome mode, so in this case + it's a gain to check buf against NULL even though kfree() handles + NULL pointers just fine */ + if (unlikely(buf)) + kfree(buf); + +} + +static void ccw_clear_margins(struct vc_data *vc, struct fb_info *info, + int bottom_only) +{ + unsigned int cw = vc->vc_font.width; + unsigned int ch = vc->vc_font.height; + unsigned int rw = info->var.yres - (vc->vc_cols*cw); + unsigned int bh = info->var.xres - (vc->vc_rows*ch); + unsigned int bs = vc->vc_rows*ch; + struct fb_fillrect region; + + region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { + region.dx = 0; + region.dy = info->var.yoffset; + region.height = rw; + region.width = info->var.xres_virtual; + info->fbops->fb_fillrect(info, ®ion); + } + + if (bh) { + region.dx = info->var.xoffset + bs; + region.dy = 0; + region.height = info->var.yres_virtual; + region.width = bh; + info->fbops->fb_fillrect(info, ®ion); + } +} + +static void ccw_cursor(struct vc_data *vc, struct fb_info *info, int mode, + int softback_lines, int fg, int bg) +{ + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; + unsigned short charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; + int w = (vc->vc_font.height + 7) >> 3, c; + int y = real_y(ops->p, vc->vc_y); + int attribute, use_sw = (vc->vc_cursor_type & 0x10); + int err = 1, dx, dy; + char *src; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + + if (!ops->fontbuffer) + return; + + cursor.set = 0; + + if (softback_lines) { + if (y + softback_lines >= vc->vc_rows) { + mode = CM_ERASE; + ops->cursor_flash = 0; + return; + } else + y += softback_lines; + } + + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); + + if (ops->cursor_state.image.data != src || + ops->cursor_reset) { + ops->cursor_state.image.data = src; + cursor.set |= FB_CUR_SETIMAGE; + } + + if (attribute) { + u8 *dst; + + dst = kmalloc(w * vc->vc_font.width, GFP_ATOMIC); + if (!dst) + return; + kfree(ops->cursor_data); + ops->cursor_data = dst; + ccw_update_attr(dst, src, attribute, vc); + src = dst; + } + + if (ops->cursor_state.image.fg_color != fg || + ops->cursor_state.image.bg_color != bg || + ops->cursor_reset) { + ops->cursor_state.image.fg_color = fg; + ops->cursor_state.image.bg_color = bg; + cursor.set |= FB_CUR_SETCMAP; + } + + if (ops->cursor_state.image.height != vc->vc_font.width || + ops->cursor_state.image.width != vc->vc_font.height || + ops->cursor_reset) { + ops->cursor_state.image.height = vc->vc_font.width; + ops->cursor_state.image.width = vc->vc_font.height; + cursor.set |= FB_CUR_SETSIZE; + } + + dx = y * vc->vc_font.height; + dy = vyres - ((vc->vc_x + 1) * vc->vc_font.width); + + if (ops->cursor_state.image.dx != dx || + ops->cursor_state.image.dy != dy || + ops->cursor_reset) { + ops->cursor_state.image.dx = dx; + ops->cursor_state.image.dy = dy; + cursor.set |= FB_CUR_SETPOS; + } + + if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || + ops->cursor_reset) { + ops->cursor_state.hot.x = cursor.hot.y = 0; + cursor.set |= FB_CUR_SETHOT; + } + + if (cursor.set & FB_CUR_SETSIZE || + vc->vc_cursor_type != ops->p->cursor_shape || + ops->cursor_state.mask == NULL || + ops->cursor_reset) { + char *tmp, *mask = kmalloc(w*vc->vc_font.width, GFP_ATOMIC); + int cur_height, size, i = 0; + int width = (vc->vc_font.width + 7)/8; + + if (!mask) + return; + + tmp = kmalloc(width * vc->vc_font.height, GFP_ATOMIC); + + if (!tmp) { + kfree(mask); + return; + } + + kfree(ops->cursor_state.mask); + ops->cursor_state.mask = mask; + + ops->p->cursor_shape = vc->vc_cursor_type; + cursor.set |= FB_CUR_SETSHAPE; + + switch (ops->p->cursor_shape & CUR_HWMASK) { + case CUR_NONE: + cur_height = 0; + break; + case CUR_UNDERLINE: + cur_height = (vc->vc_font.height < 10) ? 1 : 2; + break; + case CUR_LOWER_THIRD: + cur_height = vc->vc_font.height/3; + break; + case CUR_LOWER_HALF: + cur_height = vc->vc_font.height >> 1; + break; + case CUR_TWO_THIRDS: + cur_height = (vc->vc_font.height << 1)/3; + break; + case CUR_BLOCK: + default: + cur_height = vc->vc_font.height; + break; + } + + size = (vc->vc_font.height - cur_height) * width; + while (size--) + tmp[i++] = 0; + size = cur_height * width; + while (size--) + tmp[i++] = 0xff; + memset(mask, 0, w * vc->vc_font.width); + rotate_ccw(tmp, mask, vc->vc_font.width, vc->vc_font.height); + kfree(tmp); + } + + switch (mode) { + case CM_ERASE: + ops->cursor_state.enable = 0; + break; + case CM_DRAW: + case CM_MOVE: + default: + ops->cursor_state.enable = (use_sw) ? 
0 : 1;
+		break;
+	}
+
+	cursor.image.data = src;
+	cursor.image.fg_color = ops->cursor_state.image.fg_color;
+	cursor.image.bg_color = ops->cursor_state.image.bg_color;
+	cursor.image.dx = ops->cursor_state.image.dx;
+	cursor.image.dy = ops->cursor_state.image.dy;
+	cursor.image.height = ops->cursor_state.image.height;
+	cursor.image.width = ops->cursor_state.image.width;
+	cursor.hot.x = ops->cursor_state.hot.x;
+	cursor.hot.y = ops->cursor_state.hot.y;
+	cursor.mask = ops->cursor_state.mask;
+	cursor.enable = ops->cursor_state.enable;
+	cursor.image.depth = 1;
+	cursor.rop = ROP_XOR;
+
+	if (info->fbops->fb_cursor)
+		err = info->fbops->fb_cursor(info, &cursor);
+
+	if (err)
+		soft_cursor(info, &cursor);
+
+	ops->cursor_reset = 0;
+}
+
+static int ccw_update_start(struct fb_info *info)
+{
+	struct fbcon_ops *ops = info->fbcon_par;
+	u32 yoffset;
+	u32 vyres = GETVYRES(ops->p->scrollmode, info);
+	int err;
+
+	yoffset = (vyres - info->var.yres) - ops->var.xoffset;
+	ops->var.xoffset = ops->var.yoffset;
+	ops->var.yoffset = yoffset;
+	err = fb_pan_display(info, &ops->var);
+	ops->var.xoffset = info->var.xoffset;
+	ops->var.yoffset = info->var.yoffset;
+	ops->var.vmode = info->var.vmode;
+	return err;
+}
+
+void fbcon_rotate_ccw(struct fbcon_ops *ops)
+{
+	ops->bmove = ccw_bmove;
+	ops->clear = ccw_clear;
+	ops->putcs = ccw_putcs;
+	ops->clear_margins = ccw_clear_margins;
+	ops->cursor = ccw_cursor;
+	ops->update_start = ccw_update_start;
+}
+EXPORT_SYMBOL(fbcon_rotate_ccw);
diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c
new file mode 100644
index 000000000000..c321f7c59e5c
--- /dev/null
+++ b/drivers/video/fbdev/core/fbcon_cw.c
@@ -0,0 +1,403 @@
+/*
+ *  linux/drivers/video/console/fbcon_cw.c -- Software Rotation - 90 degrees
+ *
+ *      Copyright (C) 2005 Antonino Daplas
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of this archive for
+ *  more details.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/fb.h>
+#include <linux/vt_kern.h>
+#include <linux/console.h>
+#include <asm/types.h>
+#include "fbcon.h"
+#include "fbcon_rotate.h"
+
+/*
+ * Rotation 90 degrees
+ */
+
+static void cw_update_attr(u8 *dst, u8 *src, int attribute,
+			   struct vc_data *vc)
+{
+	int i, j, offset = (vc->vc_font.height < 10) ? 1 : 2;
+	int width = (vc->vc_font.height + 7) >> 3;
+	u8 c, msk = ~(0xff >> offset);
+
+	for (i = 0; i < vc->vc_font.width; i++) {
+		for (j = 0; j < width; j++) {
+			c = *src;
+			if (attribute & FBCON_ATTRIBUTE_UNDERLINE && !j)
+				c |= msk;
+			if (attribute & FBCON_ATTRIBUTE_BOLD && i)
+				c |= *(src-width);
+			if (attribute & FBCON_ATTRIBUTE_REVERSE)
+				c = ~c;
+			src++;
+			*dst++ = c;
+		}
+	}
+}
+
+
+static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy,
+		     int sx, int dy, int dx, int height, int width)
+{
+	struct fbcon_ops *ops = info->fbcon_par;
+	struct fb_copyarea area;
+	u32 vxres = GETVXRES(ops->p->scrollmode, info);
+
+	area.sx = vxres - ((sy + height) * vc->vc_font.height);
+	area.sy = sx * vc->vc_font.width;
+	area.dx = vxres - ((dy + height) * vc->vc_font.height);
+	area.dy = dx * vc->vc_font.width;
+	area.width = height * vc->vc_font.height;
+	area.height = width * vc->vc_font.width;
+
+	info->fbops->fb_copyarea(info, &area);
+}
+
+static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy,
+		     int sx, int height, int width)
+{
+	struct fbcon_ops *ops = info->fbcon_par;
+	struct fb_fillrect region;
+	int bgshift = (vc->vc_hi_font_mask) ?
13 : 12; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + region.color = attr_bgcol_ec(bgshift,vc,info); + region.dx = vxres - ((sy + height) * vc->vc_font.height); + region.dy = sx * vc->vc_font.width; + region.height = width * vc->vc_font.width; + region.width = height * vc->vc_font.height; + region.rop = ROP_COPY; + + info->fbops->fb_fillrect(info, ®ion); +} + +static inline void cw_putcs_aligned(struct vc_data *vc, struct fb_info *info, + const u16 *s, u32 attr, u32 cnt, + u32 d_pitch, u32 s_pitch, u32 cellsize, + struct fb_image *image, u8 *buf, u8 *dst) +{ + struct fbcon_ops *ops = info->fbcon_par; + u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + u32 idx = (vc->vc_font.height + 7) >> 3; + u8 *src; + + while (cnt--) { + src = ops->fontbuffer + (scr_readw(s++) & charmask)*cellsize; + + if (attr) { + cw_update_attr(buf, src, attr, vc); + src = buf; + } + + if (likely(idx == 1)) + __fb_pad_aligned_buffer(dst, d_pitch, src, idx, + vc->vc_font.width); + else + fb_pad_aligned_buffer(dst, d_pitch, src, idx, + vc->vc_font.width); + + dst += d_pitch * vc->vc_font.width; + } + + info->fbops->fb_imageblit(info, image); +} + +static void cw_putcs(struct vc_data *vc, struct fb_info *info, + const unsigned short *s, int count, int yy, int xx, + int fg, int bg) +{ + struct fb_image image; + struct fbcon_ops *ops = info->fbcon_par; + u32 width = (vc->vc_font.height + 7)/8; + u32 cellsize = width * vc->vc_font.width; + u32 maxcnt = info->pixmap.size/cellsize; + u32 scan_align = info->pixmap.scan_align - 1; + u32 buf_align = info->pixmap.buf_align - 1; + u32 cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + if (!ops->fontbuffer) + return; + + image.fg_color = fg; + image.bg_color = bg; + image.dx = vxres - ((yy + 1) * vc->vc_font.height); + image.dy = xx * vc->vc_font.width; + image.width = vc->vc_font.height; + image.depth = 1; + + if (attribute) { + buf = kmalloc(cellsize, GFP_KERNEL); + if (!buf) + return; + } + + while (count) { + if (count > maxcnt) + cnt = maxcnt; + else + cnt = count; + + image.height = vc->vc_font.width * cnt; + pitch = ((image.width + 7) >> 3) + scan_align; + pitch &= ~scan_align; + size = pitch * image.height + buf_align; + size &= ~buf_align; + dst = fb_get_buffer_offset(info, &info->pixmap, size); + image.data = dst; + cw_putcs_aligned(vc, info, s, attribute, cnt, pitch, + width, cellsize, &image, buf, dst); + image.dy += image.height; + count -= cnt; + s += cnt; + } + + /* buf is always NULL except when in monochrome mode, so in this case + it's a gain to check buf against NULL even though kfree() handles + NULL pointers just fine */ + if (unlikely(buf)) + kfree(buf); + +} + +static void cw_clear_margins(struct vc_data *vc, struct fb_info *info, + int bottom_only) +{ + unsigned int cw = vc->vc_font.width; + unsigned int ch = vc->vc_font.height; + unsigned int rw = info->var.yres - (vc->vc_cols*cw); + unsigned int bh = info->var.xres - (vc->vc_rows*ch); + unsigned int rs = info->var.yres - rw; + struct fb_fillrect region; + + region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { + region.dx = 0; + region.dy = info->var.yoffset + rs; + region.height = rw; + region.width = info->var.xres_virtual; + info->fbops->fb_fillrect(info, ®ion); + } + + if (bh) { + region.dx = info->var.xoffset; + region.dy = info->var.yoffset; + region.height = info->var.yres; + region.width = bh; + info->fbops->fb_fillrect(info, ®ion); + } +} + +static void cw_cursor(struct 
vc_data *vc, struct fb_info *info, int mode, + int softback_lines, int fg, int bg) +{ + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; + unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + int w = (vc->vc_font.height + 7) >> 3, c; + int y = real_y(ops->p, vc->vc_y); + int attribute, use_sw = (vc->vc_cursor_type & 0x10); + int err = 1, dx, dy; + char *src; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + if (!ops->fontbuffer) + return; + + cursor.set = 0; + + if (softback_lines) { + if (y + softback_lines >= vc->vc_rows) { + mode = CM_ERASE; + ops->cursor_flash = 0; + return; + } else + y += softback_lines; + } + + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.width)); + + if (ops->cursor_state.image.data != src || + ops->cursor_reset) { + ops->cursor_state.image.data = src; + cursor.set |= FB_CUR_SETIMAGE; + } + + if (attribute) { + u8 *dst; + + dst = kmalloc(w * vc->vc_font.width, GFP_ATOMIC); + if (!dst) + return; + kfree(ops->cursor_data); + ops->cursor_data = dst; + cw_update_attr(dst, src, attribute, vc); + src = dst; + } + + if (ops->cursor_state.image.fg_color != fg || + ops->cursor_state.image.bg_color != bg || + ops->cursor_reset) { + ops->cursor_state.image.fg_color = fg; + ops->cursor_state.image.bg_color = bg; + cursor.set |= FB_CUR_SETCMAP; + } + + if (ops->cursor_state.image.height != vc->vc_font.width || + ops->cursor_state.image.width != vc->vc_font.height || + ops->cursor_reset) { + ops->cursor_state.image.height = vc->vc_font.width; + ops->cursor_state.image.width = vc->vc_font.height; + cursor.set |= FB_CUR_SETSIZE; + } + + dx = vxres - ((y * vc->vc_font.height) + vc->vc_font.height); + dy = vc->vc_x * vc->vc_font.width; + + if (ops->cursor_state.image.dx != dx || + ops->cursor_state.image.dy != dy || + ops->cursor_reset) { + ops->cursor_state.image.dx = dx; + ops->cursor_state.image.dy = dy; + cursor.set |= FB_CUR_SETPOS; + } + + if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || + ops->cursor_reset) { + ops->cursor_state.hot.x = cursor.hot.y = 0; + cursor.set |= FB_CUR_SETHOT; + } + + if (cursor.set & FB_CUR_SETSIZE || + vc->vc_cursor_type != ops->p->cursor_shape || + ops->cursor_state.mask == NULL || + ops->cursor_reset) { + char *tmp, *mask = kmalloc(w*vc->vc_font.width, GFP_ATOMIC); + int cur_height, size, i = 0; + int width = (vc->vc_font.width + 7)/8; + + if (!mask) + return; + + tmp = kmalloc(width * vc->vc_font.height, GFP_ATOMIC); + + if (!tmp) { + kfree(mask); + return; + } + + kfree(ops->cursor_state.mask); + ops->cursor_state.mask = mask; + + ops->p->cursor_shape = vc->vc_cursor_type; + cursor.set |= FB_CUR_SETSHAPE; + + switch (ops->p->cursor_shape & CUR_HWMASK) { + case CUR_NONE: + cur_height = 0; + break; + case CUR_UNDERLINE: + cur_height = (vc->vc_font.height < 10) ? 
1 : 2; + break; + case CUR_LOWER_THIRD: + cur_height = vc->vc_font.height/3; + break; + case CUR_LOWER_HALF: + cur_height = vc->vc_font.height >> 1; + break; + case CUR_TWO_THIRDS: + cur_height = (vc->vc_font.height << 1)/3; + break; + case CUR_BLOCK: + default: + cur_height = vc->vc_font.height; + break; + } + + size = (vc->vc_font.height - cur_height) * width; + while (size--) + tmp[i++] = 0; + size = cur_height * width; + while (size--) + tmp[i++] = 0xff; + memset(mask, 0, w * vc->vc_font.width); + rotate_cw(tmp, mask, vc->vc_font.width, vc->vc_font.height); + kfree(tmp); + } + + switch (mode) { + case CM_ERASE: + ops->cursor_state.enable = 0; + break; + case CM_DRAW: + case CM_MOVE: + default: + ops->cursor_state.enable = (use_sw) ? 0 : 1; + break; + } + + cursor.image.data = src; + cursor.image.fg_color = ops->cursor_state.image.fg_color; + cursor.image.bg_color = ops->cursor_state.image.bg_color; + cursor.image.dx = ops->cursor_state.image.dx; + cursor.image.dy = ops->cursor_state.image.dy; + cursor.image.height = ops->cursor_state.image.height; + cursor.image.width = ops->cursor_state.image.width; + cursor.hot.x = ops->cursor_state.hot.x; + cursor.hot.y = ops->cursor_state.hot.y; + cursor.mask = ops->cursor_state.mask; + cursor.enable = ops->cursor_state.enable; + cursor.image.depth = 1; + cursor.rop = ROP_XOR; + + if (info->fbops->fb_cursor) + err = info->fbops->fb_cursor(info, &cursor); + + if (err) + soft_cursor(info, &cursor); + + ops->cursor_reset = 0; +} + +static int cw_update_start(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + u32 vxres = GETVXRES(ops->p->scrollmode, info); + u32 xoffset; + int err; + + xoffset = vxres - (info->var.xres + ops->var.yoffset); + ops->var.yoffset = ops->var.xoffset; + ops->var.xoffset = xoffset; + err = fb_pan_display(info, &ops->var); + ops->var.xoffset = info->var.xoffset; + ops->var.yoffset = info->var.yoffset; + ops->var.vmode = info->var.vmode; + return err; +} + +void fbcon_rotate_cw(struct fbcon_ops *ops) +{ + ops->bmove = cw_bmove; + ops->clear = cw_clear; + ops->putcs = cw_putcs; + ops->clear_margins = cw_clear_margins; + ops->cursor = cw_cursor; + ops->update_start = cw_update_start; +} +EXPORT_SYMBOL(fbcon_rotate_cw); diff --git a/drivers/video/fbdev/core/fbcon_rotate.c b/drivers/video/fbdev/core/fbcon_rotate.c new file mode 100644 index 000000000000..8a51e4d95cc5 --- /dev/null +++ b/drivers/video/fbdev/core/fbcon_rotate.c @@ -0,0 +1,112 @@ +/* + * linux/drivers/video/console/fbcon_rotate.c -- Software Rotation + * + * Copyright (C) 2005 Antonino Daplas + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fbcon.h" +#include "fbcon_rotate.h" + +static int fbcon_rotate_font(struct fb_info *info, struct vc_data *vc) +{ + struct fbcon_ops *ops = info->fbcon_par; + int len, err = 0; + int s_cellsize, d_cellsize, i; + const u8 *src; + u8 *dst; + + if (vc->vc_font.data == ops->fontdata && + ops->p->con_rotate == ops->cur_rotate) + goto finished; + + src = ops->fontdata = vc->vc_font.data; + ops->cur_rotate = ops->p->con_rotate; + len = (!ops->p->userfont) ? 
256 : FNTCHARCNT(src); + s_cellsize = ((vc->vc_font.width + 7)/8) * + vc->vc_font.height; + d_cellsize = s_cellsize; + + if (ops->rotate == FB_ROTATE_CW || + ops->rotate == FB_ROTATE_CCW) + d_cellsize = ((vc->vc_font.height + 7)/8) * + vc->vc_font.width; + + if (info->fbops->fb_sync) + info->fbops->fb_sync(info); + + if (ops->fd_size < d_cellsize * len) { + dst = kmalloc(d_cellsize * len, GFP_KERNEL); + + if (dst == NULL) { + err = -ENOMEM; + goto finished; + } + + ops->fd_size = d_cellsize * len; + kfree(ops->fontbuffer); + ops->fontbuffer = dst; + } + + dst = ops->fontbuffer; + memset(dst, 0, ops->fd_size); + + switch (ops->rotate) { + case FB_ROTATE_UD: + for (i = len; i--; ) { + rotate_ud(src, dst, vc->vc_font.width, + vc->vc_font.height); + + src += s_cellsize; + dst += d_cellsize; + } + break; + case FB_ROTATE_CW: + for (i = len; i--; ) { + rotate_cw(src, dst, vc->vc_font.width, + vc->vc_font.height); + src += s_cellsize; + dst += d_cellsize; + } + break; + case FB_ROTATE_CCW: + for (i = len; i--; ) { + rotate_ccw(src, dst, vc->vc_font.width, + vc->vc_font.height); + src += s_cellsize; + dst += d_cellsize; + } + break; + } + +finished: + return err; +} + +void fbcon_set_rotate(struct fbcon_ops *ops) +{ + ops->rotate_font = fbcon_rotate_font; + + switch(ops->rotate) { + case FB_ROTATE_CW: + fbcon_rotate_cw(ops); + break; + case FB_ROTATE_UD: + fbcon_rotate_ud(ops); + break; + case FB_ROTATE_CCW: + fbcon_rotate_ccw(ops); + break; + } +} +EXPORT_SYMBOL(fbcon_set_rotate); diff --git a/drivers/video/fbdev/core/fbcon_rotate.h b/drivers/video/fbdev/core/fbcon_rotate.h new file mode 100644 index 000000000000..e233444cda66 --- /dev/null +++ b/drivers/video/fbdev/core/fbcon_rotate.h @@ -0,0 +1,96 @@ +/* + * linux/drivers/video/console/fbcon_rotate.h -- Software Display Rotation + * + * Copyright (C) 2005 Antonino Daplas + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + */ + +#ifndef _FBCON_ROTATE_H +#define _FBCON_ROTATE_H + +#define GETVYRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ + (i)->var.yres : (i)->var.yres_virtual; }) + +#define GETVXRES(s,i) ({ \ + (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? 
\ + (i)->var.xres : (i)->var.xres_virtual; }) + + +static inline int pattern_test_bit(u32 x, u32 y, u32 pitch, const char *pat) +{ + u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; + + pat +=index; + return (*pat) & (0x80 >> bit); +} + +static inline void pattern_set_bit(u32 x, u32 y, u32 pitch, char *pat) +{ + u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; + + pat += index; + + (*pat) |= 0x80 >> bit; +} + +static inline void rotate_ud(const char *in, char *out, u32 width, u32 height) +{ + int i, j; + int shift = (8 - (width % 8)) & 7; + + width = (width + 7) & ~7; + + for (i = 0; i < height; i++) { + for (j = 0; j < width - shift; j++) { + if (pattern_test_bit(j, i, width, in)) + pattern_set_bit(width - (1 + j + shift), + height - (1 + i), + width, out); + } + + } +} + +static inline void rotate_cw(const char *in, char *out, u32 width, u32 height) +{ + int i, j, h = height, w = width; + int shift = (8 - (height % 8)) & 7; + + width = (width + 7) & ~7; + height = (height + 7) & ~7; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + if (pattern_test_bit(j, i, width, in)) + pattern_set_bit(height - 1 - i - shift, j, + height, out); + + } + } +} + +static inline void rotate_ccw(const char *in, char *out, u32 width, u32 height) +{ + int i, j, h = height, w = width; + int shift = (8 - (width % 8)) & 7; + + width = (width + 7) & ~7; + height = (height + 7) & ~7; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + if (pattern_test_bit(j, i, width, in)) + pattern_set_bit(i, width - 1 - j - shift, + height, out); + } + } +} + +extern void fbcon_rotate_cw(struct fbcon_ops *ops); +extern void fbcon_rotate_ud(struct fbcon_ops *ops); +extern void fbcon_rotate_ccw(struct fbcon_ops *ops); +#endif diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c new file mode 100644 index 000000000000..c0b605d49cb3 --- /dev/null +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -0,0 +1,448 @@ +/* + * linux/drivers/video/console/fbcon_ud.c -- Software Rotation - 180 degrees + * + * Copyright (C) 2005 Antonino Daplas + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "fbcon.h" +#include "fbcon_rotate.h" + +/* + * Rotation 180 degrees + */ + +static void ud_update_attr(u8 *dst, u8 *src, int attribute, + struct vc_data *vc) +{ + int i, offset = (vc->vc_font.height < 10) ? 
1 : 2; + int width = (vc->vc_font.width + 7) >> 3; + unsigned int cellsize = vc->vc_font.height * width; + u8 c; + + offset = offset * width; + + for (i = 0; i < cellsize; i++) { + c = src[i]; + if (attribute & FBCON_ATTRIBUTE_UNDERLINE && i < offset) + c = 0xff; + if (attribute & FBCON_ATTRIBUTE_BOLD) + c |= c << 1; + if (attribute & FBCON_ATTRIBUTE_REVERSE) + c = ~c; + dst[i] = c; + } +} + + +static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_copyarea area; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + area.sy = vyres - ((sy + height) * vc->vc_font.height); + area.sx = vxres - ((sx + width) * vc->vc_font.width); + area.dy = vyres - ((dy + height) * vc->vc_font.height); + area.dx = vxres - ((dx + width) * vc->vc_font.width); + area.height = height * vc->vc_font.height; + area.width = width * vc->vc_font.width; + + info->fbops->fb_copyarea(info, &area); +} + +static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct fb_fillrect region; + int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + region.color = attr_bgcol_ec(bgshift,vc,info); + region.dy = vyres - ((sy + height) * vc->vc_font.height); + region.dx = vxres - ((sx + width) * vc->vc_font.width); + region.width = width * vc->vc_font.width; + region.height = height * vc->vc_font.height; + region.rop = ROP_COPY; + + info->fbops->fb_fillrect(info, ®ion); +} + +static inline void ud_putcs_aligned(struct vc_data *vc, struct fb_info *info, + const u16 *s, u32 attr, u32 cnt, + u32 d_pitch, u32 s_pitch, u32 cellsize, + struct fb_image *image, u8 *buf, u8 *dst) +{ + struct fbcon_ops *ops = info->fbcon_par; + u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + u32 idx = vc->vc_font.width >> 3; + u8 *src; + + while (cnt--) { + src = ops->fontbuffer + (scr_readw(s--) & charmask)*cellsize; + + if (attr) { + ud_update_attr(buf, src, attr, vc); + src = buf; + } + + if (likely(idx == 1)) + __fb_pad_aligned_buffer(dst, d_pitch, src, idx, + image->height); + else + fb_pad_aligned_buffer(dst, d_pitch, src, idx, + image->height); + + dst += s_pitch; + } + + info->fbops->fb_imageblit(info, image); +} + +static inline void ud_putcs_unaligned(struct vc_data *vc, + struct fb_info *info, const u16 *s, + u32 attr, u32 cnt, u32 d_pitch, + u32 s_pitch, u32 cellsize, + struct fb_image *image, u8 *buf, + u8 *dst) +{ + struct fbcon_ops *ops = info->fbcon_par; + u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; + u32 shift_low = 0, mod = vc->vc_font.width % 8; + u32 shift_high = 8; + u32 idx = vc->vc_font.width >> 3; + u8 *src; + + while (cnt--) { + src = ops->fontbuffer + (scr_readw(s--) & charmask)*cellsize; + + if (attr) { + ud_update_attr(buf, src, attr, vc); + src = buf; + } + + fb_pad_unaligned_buffer(dst, d_pitch, src, idx, + image->height, shift_high, + shift_low, mod); + shift_low += mod; + dst += (shift_low >= 8) ? 
s_pitch : s_pitch - 1; + shift_low &= 7; + shift_high = 8 - shift_low; + } + + info->fbops->fb_imageblit(info, image); + +} + +static void ud_putcs(struct vc_data *vc, struct fb_info *info, + const unsigned short *s, int count, int yy, int xx, + int fg, int bg) +{ + struct fb_image image; + struct fbcon_ops *ops = info->fbcon_par; + u32 width = (vc->vc_font.width + 7)/8; + u32 cellsize = width * vc->vc_font.height; + u32 maxcnt = info->pixmap.size/cellsize; + u32 scan_align = info->pixmap.scan_align - 1; + u32 buf_align = info->pixmap.buf_align - 1; + u32 mod = vc->vc_font.width % 8, cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + if (!ops->fontbuffer) + return; + + image.fg_color = fg; + image.bg_color = bg; + image.dy = vyres - ((yy * vc->vc_font.height) + vc->vc_font.height); + image.dx = vxres - ((xx + count) * vc->vc_font.width); + image.height = vc->vc_font.height; + image.depth = 1; + + if (attribute) { + buf = kmalloc(cellsize, GFP_KERNEL); + if (!buf) + return; + } + + s += count - 1; + + while (count) { + if (count > maxcnt) + cnt = maxcnt; + else + cnt = count; + + image.width = vc->vc_font.width * cnt; + pitch = ((image.width + 7) >> 3) + scan_align; + pitch &= ~scan_align; + size = pitch * image.height + buf_align; + size &= ~buf_align; + dst = fb_get_buffer_offset(info, &info->pixmap, size); + image.data = dst; + + if (!mod) + ud_putcs_aligned(vc, info, s, attribute, cnt, pitch, + width, cellsize, &image, buf, dst); + else + ud_putcs_unaligned(vc, info, s, attribute, cnt, pitch, + width, cellsize, &image, + buf, dst); + + image.dx += image.width; + count -= cnt; + s -= cnt; + xx += cnt; + } + + /* buf is always NULL except when in monochrome mode, so in this case + it's a gain to check buf against NULL even though kfree() handles + NULL pointers just fine */ + if (unlikely(buf)) + kfree(buf); + +} + +static void ud_clear_margins(struct vc_data *vc, struct fb_info *info, + int bottom_only) +{ + unsigned int cw = vc->vc_font.width; + unsigned int ch = vc->vc_font.height; + unsigned int rw = info->var.xres - (vc->vc_cols*cw); + unsigned int bh = info->var.yres - (vc->vc_rows*ch); + struct fb_fillrect region; + + region.color = 0; + region.rop = ROP_COPY; + + if (rw && !bottom_only) { + region.dy = 0; + region.dx = info->var.xoffset; + region.width = rw; + region.height = info->var.yres_virtual; + info->fbops->fb_fillrect(info, ®ion); + } + + if (bh) { + region.dy = info->var.yoffset; + region.dx = info->var.xoffset; + region.height = bh; + region.width = info->var.xres; + info->fbops->fb_fillrect(info, ®ion); + } +} + +static void ud_cursor(struct vc_data *vc, struct fb_info *info, int mode, + int softback_lines, int fg, int bg) +{ + struct fb_cursor cursor; + struct fbcon_ops *ops = info->fbcon_par; + unsigned short charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; + int w = (vc->vc_font.width + 7) >> 3, c; + int y = real_y(ops->p, vc->vc_y); + int attribute, use_sw = (vc->vc_cursor_type & 0x10); + int err = 1, dx, dy; + char *src; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + + if (!ops->fontbuffer) + return; + + cursor.set = 0; + + if (softback_lines) { + if (y + softback_lines >= vc->vc_rows) { + mode = CM_ERASE; + ops->cursor_flash = 0; + return; + } else + y += softback_lines; + } + + c = scr_readw((u16 *) vc->vc_pos); + attribute = get_attribute(info, c); + src = ops->fontbuffer + ((c & charmask) * (w * vc->vc_font.height)); + + if (ops->cursor_state.image.data != src || + ops->cursor_reset) { + ops->cursor_state.image.data = src; + cursor.set |= FB_CUR_SETIMAGE; + } + + if (attribute) { + u8 *dst; + + dst = kmalloc(w * vc->vc_font.height, GFP_ATOMIC); + if (!dst) + return; + kfree(ops->cursor_data); + ops->cursor_data = dst; + ud_update_attr(dst, src, attribute, vc); + src = dst; + } + + if (ops->cursor_state.image.fg_color != fg || + ops->cursor_state.image.bg_color != bg || + ops->cursor_reset) { + ops->cursor_state.image.fg_color = fg; + ops->cursor_state.image.bg_color = bg; + cursor.set |= FB_CUR_SETCMAP; + } + + if (ops->cursor_state.image.height != vc->vc_font.height || + ops->cursor_state.image.width != vc->vc_font.width || + ops->cursor_reset) { + ops->cursor_state.image.height = vc->vc_font.height; + ops->cursor_state.image.width = vc->vc_font.width; + cursor.set |= FB_CUR_SETSIZE; + } + + dy = vyres - ((y * vc->vc_font.height) + vc->vc_font.height); + dx = vxres - ((vc->vc_x * vc->vc_font.width) + vc->vc_font.width); + + if (ops->cursor_state.image.dx != dx || + ops->cursor_state.image.dy != dy || + ops->cursor_reset) { + ops->cursor_state.image.dx = dx; + ops->cursor_state.image.dy = dy; + cursor.set |= FB_CUR_SETPOS; + } + + if (ops->cursor_state.hot.x || ops->cursor_state.hot.y || + ops->cursor_reset) { + ops->cursor_state.hot.x = cursor.hot.y = 0; + cursor.set |= FB_CUR_SETHOT; + } + + if (cursor.set & FB_CUR_SETSIZE || + vc->vc_cursor_type != ops->p->cursor_shape || + ops->cursor_state.mask == NULL || + ops->cursor_reset) { + char *mask = kmalloc(w*vc->vc_font.height, GFP_ATOMIC); + int cur_height, size, i = 0; + u8 msk = 0xff; + + if (!mask) + return; + + kfree(ops->cursor_state.mask); + ops->cursor_state.mask = mask; + + ops->p->cursor_shape = vc->vc_cursor_type; + cursor.set |= FB_CUR_SETSHAPE; + + switch (ops->p->cursor_shape & CUR_HWMASK) { + case CUR_NONE: + cur_height = 0; + break; + case CUR_UNDERLINE: + cur_height = (vc->vc_font.height < 10) ? 1 : 2; + break; + case CUR_LOWER_THIRD: + cur_height = vc->vc_font.height/3; + break; + case CUR_LOWER_HALF: + cur_height = vc->vc_font.height >> 1; + break; + case CUR_TWO_THIRDS: + cur_height = (vc->vc_font.height << 1)/3; + break; + case CUR_BLOCK: + default: + cur_height = vc->vc_font.height; + break; + } + + size = cur_height * w; + + while (size--) + mask[i++] = msk; + + size = (vc->vc_font.height - cur_height) * w; + + while (size--) + mask[i++] = ~msk; + } + + switch (mode) { + case CM_ERASE: + ops->cursor_state.enable = 0; + break; + case CM_DRAW: + case CM_MOVE: + default: + ops->cursor_state.enable = (use_sw) ? 
0 : 1; + break; + } + + cursor.image.data = src; + cursor.image.fg_color = ops->cursor_state.image.fg_color; + cursor.image.bg_color = ops->cursor_state.image.bg_color; + cursor.image.dx = ops->cursor_state.image.dx; + cursor.image.dy = ops->cursor_state.image.dy; + cursor.image.height = ops->cursor_state.image.height; + cursor.image.width = ops->cursor_state.image.width; + cursor.hot.x = ops->cursor_state.hot.x; + cursor.hot.y = ops->cursor_state.hot.y; + cursor.mask = ops->cursor_state.mask; + cursor.enable = ops->cursor_state.enable; + cursor.image.depth = 1; + cursor.rop = ROP_XOR; + + if (info->fbops->fb_cursor) + err = info->fbops->fb_cursor(info, &cursor); + + if (err) + soft_cursor(info, &cursor); + + ops->cursor_reset = 0; +} + +static int ud_update_start(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + int xoffset, yoffset; + u32 vyres = GETVYRES(ops->p->scrollmode, info); + u32 vxres = GETVXRES(ops->p->scrollmode, info); + int err; + + xoffset = vxres - info->var.xres - ops->var.xoffset; + yoffset = vyres - info->var.yres - ops->var.yoffset; + if (yoffset < 0) + yoffset += vyres; + ops->var.xoffset = xoffset; + ops->var.yoffset = yoffset; + err = fb_pan_display(info, &ops->var); + ops->var.xoffset = info->var.xoffset; + ops->var.yoffset = info->var.yoffset; + ops->var.vmode = info->var.vmode; + return err; +} + +void fbcon_rotate_ud(struct fbcon_ops *ops) +{ + ops->bmove = ud_bmove; + ops->clear = ud_clear; + ops->putcs = ud_putcs; + ops->clear_margins = ud_clear_margins; + ops->cursor = ud_cursor; + ops->update_start = ud_update_start; +} +EXPORT_SYMBOL(fbcon_rotate_ud); diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 7a42238db446..231bb7e48c0d 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -1880,6 +1881,9 @@ fbmem_init(void) fb_class = NULL; goto err_class; } + + fb_console_init(); + return 0; err_class: @@ -1894,6 +1898,8 @@ module_init(fbmem_init); static void __exit fbmem_exit(void) { + fb_console_exit(); + remove_proc_entry("fb", NULL); class_destroy(fb_class); unregister_chrdev(FB_MAJOR, "fb"); diff --git a/drivers/video/fbdev/core/softcursor.c b/drivers/video/fbdev/core/softcursor.c new file mode 100644 index 000000000000..fc93f254498e --- /dev/null +++ b/drivers/video/fbdev/core/softcursor.c @@ -0,0 +1,78 @@ +/* + * linux/drivers/video/console/softcursor.c + * + * Generic software cursor for frame buffer devices + * + * Created 14 Nov 2002 by James Simmons + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file COPYING in the main directory of this + * archive for more details. 
+ */ + +#include +#include +#include +#include + +#include + +#include "fbcon.h" + +int soft_cursor(struct fb_info *info, struct fb_cursor *cursor) +{ + struct fbcon_ops *ops = info->fbcon_par; + unsigned int scan_align = info->pixmap.scan_align - 1; + unsigned int buf_align = info->pixmap.buf_align - 1; + unsigned int i, size, dsize, s_pitch, d_pitch; + struct fb_image *image; + u8 *src, *dst; + + if (info->state != FBINFO_STATE_RUNNING) + return 0; + + s_pitch = (cursor->image.width + 7) >> 3; + dsize = s_pitch * cursor->image.height; + + if (dsize + sizeof(struct fb_image) != ops->cursor_size) { + kfree(ops->cursor_src); + ops->cursor_size = dsize + sizeof(struct fb_image); + + ops->cursor_src = kmalloc(ops->cursor_size, GFP_ATOMIC); + if (!ops->cursor_src) { + ops->cursor_size = 0; + return -ENOMEM; + } + } + + src = ops->cursor_src + sizeof(struct fb_image); + image = (struct fb_image *)ops->cursor_src; + *image = cursor->image; + d_pitch = (s_pitch + scan_align) & ~scan_align; + + size = d_pitch * image->height + buf_align; + size &= ~buf_align; + dst = fb_get_buffer_offset(info, &info->pixmap, size); + + if (cursor->enable) { + switch (cursor->rop) { + case ROP_XOR: + for (i = 0; i < dsize; i++) + src[i] = image->data[i] ^ cursor->mask[i]; + break; + case ROP_COPY: + default: + for (i = 0; i < dsize; i++) + src[i] = image->data[i] & cursor->mask[i]; + break; + } + } else + memcpy(src, image->data, dsize); + + fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, image->height); + image->data = dst; + info->fbops->fb_imageblit(info, image); + return 0; +} + +EXPORT_SYMBOL(soft_cursor); diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c new file mode 100644 index 000000000000..4636a6110c27 --- /dev/null +++ b/drivers/video/fbdev/core/tileblit.c @@ -0,0 +1,154 @@ +/* + * linux/drivers/video/console/tileblit.c -- Tile Blitting Operation + * + * Copyright (C) 2004 Antonino Daplas + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include "fbcon.h" + +static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int dy, int dx, int height, int width) +{ + struct fb_tilearea area; + + area.sx = sx; + area.sy = sy; + area.dx = dx; + area.dy = dy; + area.height = height; + area.width = width; + + info->tileops->fb_tilecopy(info, &area); +} + +static void tile_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) +{ + struct fb_tilerect rect; + int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; + int fgshift = (vc->vc_hi_font_mask) ? 9 : 8; + + rect.index = vc->vc_video_erase_char & + ((vc->vc_hi_font_mask) ? 0x1ff : 0xff); + rect.fg = attr_fgcol_ec(fgshift, vc, info); + rect.bg = attr_bgcol_ec(bgshift, vc, info); + rect.sx = sx; + rect.sy = sy; + rect.width = width; + rect.height = height; + rect.rop = ROP_COPY; + + info->tileops->fb_tilefill(info, &rect); +} + +static void tile_putcs(struct vc_data *vc, struct fb_info *info, + const unsigned short *s, int count, int yy, int xx, + int fg, int bg) +{ + struct fb_tileblit blit; + unsigned short charmask = vc->vc_hi_font_mask ? 
0x1ff : 0xff; + int size = sizeof(u32) * count, i; + + blit.sx = xx; + blit.sy = yy; + blit.width = count; + blit.height = 1; + blit.fg = fg; + blit.bg = bg; + blit.length = count; + blit.indices = (u32 *) fb_get_buffer_offset(info, &info->pixmap, size); + for (i = 0; i < count; i++) + blit.indices[i] = (u32)(scr_readw(s++) & charmask); + + info->tileops->fb_tileblit(info, &blit); +} + +static void tile_clear_margins(struct vc_data *vc, struct fb_info *info, + int bottom_only) +{ + return; +} + +static void tile_cursor(struct vc_data *vc, struct fb_info *info, int mode, + int softback_lines, int fg, int bg) +{ + struct fb_tilecursor cursor; + int use_sw = (vc->vc_cursor_type & 0x10); + + cursor.sx = vc->vc_x; + cursor.sy = vc->vc_y; + cursor.mode = (mode == CM_ERASE || use_sw) ? 0 : 1; + cursor.fg = fg; + cursor.bg = bg; + + switch (vc->vc_cursor_type & 0x0f) { + case CUR_NONE: + cursor.shape = FB_TILE_CURSOR_NONE; + break; + case CUR_UNDERLINE: + cursor.shape = FB_TILE_CURSOR_UNDERLINE; + break; + case CUR_LOWER_THIRD: + cursor.shape = FB_TILE_CURSOR_LOWER_THIRD; + break; + case CUR_LOWER_HALF: + cursor.shape = FB_TILE_CURSOR_LOWER_HALF; + break; + case CUR_TWO_THIRDS: + cursor.shape = FB_TILE_CURSOR_TWO_THIRDS; + break; + case CUR_BLOCK: + default: + cursor.shape = FB_TILE_CURSOR_BLOCK; + break; + } + + info->tileops->fb_tilecursor(info, &cursor); +} + +static int tile_update_start(struct fb_info *info) +{ + struct fbcon_ops *ops = info->fbcon_par; + int err; + + err = fb_pan_display(info, &ops->var); + ops->var.xoffset = info->var.xoffset; + ops->var.yoffset = info->var.yoffset; + ops->var.vmode = info->var.vmode; + return err; +} + +void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info) +{ + struct fb_tilemap map; + struct fbcon_ops *ops = info->fbcon_par; + + ops->bmove = tile_bmove; + ops->clear = tile_clear; + ops->putcs = tile_putcs; + ops->clear_margins = tile_clear_margins; + ops->cursor = tile_cursor; + ops->update_start = tile_update_start; + + if (ops->p) { + map.width = vc->vc_font.width; + map.height = vc->vc_font.height; + map.depth = 1; + map.length = (ops->p->userfont) ? + FNTCHARCNT(ops->p->fontdata) : 256; + map.data = ops->p->fontdata; + info->tileops->fb_settile(info, &map); + } +} + +EXPORT_SYMBOL(fbcon_set_tileops); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h new file mode 100644 index 000000000000..0fac6305d51c --- /dev/null +++ b/include/linux/fbcon.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_FBCON_H +#define _LINUX_FBCON_H + +#ifdef CONFIG_FRAMEBUFFER_CONSOLE +void __init fb_console_init(void); +void __exit fb_console_exit(void); +#else +static void fb_console_init(void) {} +static void fb_console_exit(void) {} +#endif + +#endif /* _LINUX_FBCON_H */ -- cgit v1.2.3 From 376b3ff54c9a1cd54d71b1102e061fccf56d535e Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 1 Aug 2017 17:33:02 +0200 Subject: fbdev: Nuke FBINFO_MODULE Instead check info->ops->owner, which amounts to the same. Spotted because I want to remove the pile of broken and cargo-culted fb_info->flags assignments in drm drivers. 
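The substitution works because a loadable driver's fb_ops carries a non-NULL .owner (drivers set it to THIS_MODULE, which evaluates to NULL for built-in code). A minimal sketch of the before/after test; the helper names here are illustrative only and not part of this patch:

	#include <linux/fb.h>

	/* Before: relies on the driver having set FBINFO_MODULE in info->flags. */
	static bool logo_suppressed_old(struct fb_info *info)
	{
		return info->flags & FBINFO_MODULE;
	}

	/* After: derive the same answer from the fb_ops owner, which is
	 * non-NULL exactly when the low-level driver is a module. */
	static bool logo_suppressed_new(struct fb_info *info)
	{
		return info->fbops->owner != NULL;
	}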
Signed-off-by: Daniel Vetter Reviewed-by: Sean Paul Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fbcon.c | 2 +- drivers/video/fbdev/core/fbmem.c | 4 ++-- drivers/video/fbdev/matrox/matroxfb_base.c | 4 +--- include/linux/fb.h | 10 +--------- 4 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 86b3bcbd01a8..431a1533a2fe 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -564,7 +564,7 @@ static void fbcon_prepare_logo(struct vc_data *vc, struct fb_info *info, unsigned short *save = NULL, *r, *q; int logo_height; - if (info->flags & FBINFO_MODULE) { + if (info->fbops->owner) { logo_shown = FBCON_LOGO_DONTSHOW; return; } diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 231bb7e48c0d..f8ced1434edf 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -463,7 +463,7 @@ static int fb_show_logo_line(struct fb_info *info, int rotate, /* Return if the frame buffer is not mapped or suspended */ if (logo == NULL || info->state != FBINFO_STATE_RUNNING || - info->flags & FBINFO_MODULE) + info->fbops->owner) return 0; image.depth = 8; @@ -601,7 +601,7 @@ int fb_prepare_logo(struct fb_info *info, int rotate) memset(&fb_logo, 0, sizeof(struct logo_data)); if (info->flags & FBINFO_MISC_TILEBLITTING || - info->flags & FBINFO_MODULE) + info->fbops->owner) return 0; if (info->fix.visual == FB_VISUAL_DIRECTCOLOR) { diff --git a/drivers/video/fbdev/matrox/matroxfb_base.c b/drivers/video/fbdev/matrox/matroxfb_base.c index dce12a2cf48f..fd1589fcdf15 100644 --- a/drivers/video/fbdev/matrox/matroxfb_base.c +++ b/drivers/video/fbdev/matrox/matroxfb_base.c @@ -1794,9 +1794,7 @@ static int initMatrox2(struct matrox_fb_info *minfo, struct board *b) minfo->fbops = matroxfb_ops; minfo->fbcon.fbops = &minfo->fbops; minfo->fbcon.pseudo_palette = minfo->cmap; - /* after __init time we are like module... no logo */ - minfo->fbcon.flags = hotplug ? FBINFO_FLAG_MODULE : FBINFO_FLAG_DEFAULT; - minfo->fbcon.flags |= FBINFO_PARTIAL_PAN_OK | /* Prefer panning for scroll under MC viewer/edit */ + minfo->fbcon.flags = FBINFO_PARTIAL_PAN_OK | /* Prefer panning for scroll under MC viewer/edit */ FBINFO_HWACCEL_COPYAREA | /* We have hw-assisted bmove */ FBINFO_HWACCEL_FILLRECT | /* And fillrect */ FBINFO_HWACCEL_IMAGEBLIT | /* And imageblit */ diff --git a/include/linux/fb.h b/include/linux/fb.h index a964d076b4dc..f4386b0ccf40 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -400,7 +400,7 @@ struct fb_tile_ops { #endif /* CONFIG_FB_TILEBLITTING */ /* FBINFO_* = fb_info.flags bit flags */ -#define FBINFO_MODULE 0x0001 /* Low-level driver is a module */ +#define FBINFO_DEFAULT 0 #define FBINFO_HWACCEL_DISABLED 0x0002 /* When FBINFO_HWACCEL_DISABLED is set: * Hardware acceleration is turned off. 
Software implementations @@ -533,14 +533,6 @@ static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { return a; } -#ifdef MODULE -#define FBINFO_DEFAULT FBINFO_MODULE -#else -#define FBINFO_DEFAULT 0 -#endif - -// This will go away -#define FBINFO_FLAG_MODULE FBINFO_MODULE #define FBINFO_FLAG_DEFAULT FBINFO_DEFAULT /* This will go away -- cgit v1.2.3 From ddb4a1442def2a78b91a85b4251fb712ef23662b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Jul 2017 15:25:23 -0700 Subject: exec: Rename bprm->cred_prepared to called_set_creds The cred_prepared bprm flag has a misleading name. It has nothing to do with the bprm_prepare_cred hook, and actually tracks if bprm_set_creds has been called. Rename this flag and improve its comment. Cc: David Howells Cc: Stephen Smalley Cc: Casey Schaufler Signed-off-by: Kees Cook Acked-by: John Johansen Acked-by: James Morris Acked-by: Paul Moore Acked-by: Serge Hallyn --- fs/binfmt_flat.c | 2 +- fs/exec.c | 2 +- include/linux/binfmts.h | 8 ++++++-- security/apparmor/domain.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- security/tomoyo/tomoyo.c | 2 +- 7 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1e6860b6f46..604a176df0c2 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -890,7 +890,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) * as we're past the point of no return and are dealing with shared * libraries. */ - bprm.cred_prepared = 1; + bprm.called_set_creds = 1; res = prepare_binprm(&bprm); diff --git a/fs/exec.c b/fs/exec.c index 62175cbcc801..a0fff86269e4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1548,7 +1548,7 @@ int prepare_binprm(struct linux_binprm *bprm) retval = security_bprm_set_creds(bprm); if (retval) return retval; - bprm->cred_prepared = 1; + bprm->called_set_creds = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 3ae9013eeaaa..9023e1d2d5cd 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -25,8 +25,12 @@ struct linux_binprm { struct mm_struct *mm; unsigned long p; /* current top of mem */ unsigned int - cred_prepared:1,/* true if creds already prepared (multiple - * preps happen for interpreters) */ + /* + * True after the bprm_set_creds hook has been called once + * (multiple calls can be made via prepare_binprm() for + * binfmt_script/misc). 
+ */ + called_set_creds:1, cap_effective:1;/* true if has elevated effective capabilities, * false if not; except for init which inherits * its parent's caps anyway */ diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index d0594446ae3f..67ec52cfc523 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -758,7 +758,7 @@ int apparmor_bprm_set_creds(struct linux_binprm *bprm) file_inode(bprm->file)->i_mode }; - if (bprm->cred_prepared) + if (bprm->called_set_creds) return 0; ctx = cred_ctx(bprm->cred); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 33fd061305c4..1db40195d178 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2356,7 +2356,7 @@ static int selinux_bprm_set_creds(struct linux_binprm *bprm) /* SELinux context only depends on initial program or script and not * the script interpreter */ - if (bprm->cred_prepared) + if (bprm->called_set_creds) return 0; old_tsec = current_security(); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 463af86812c7..db8e16ec223b 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -917,7 +917,7 @@ static int smack_bprm_set_creds(struct linux_binprm *bprm) struct superblock_smack *sbsp; int rc; - if (bprm->cred_prepared) + if (bprm->called_set_creds) return 0; isp = inode->i_security; diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 130b4fa4f65f..d25b705360e0 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -76,7 +76,7 @@ static int tomoyo_bprm_set_creds(struct linux_binprm *bprm) * Do only if this function is called for the first time of an execve * operation. */ - if (bprm->cred_prepared) + if (bprm->called_set_creds) return 0; #ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER /* -- cgit v1.2.3 From c425e189ffd7720c881fe9ccd7143cea577f6d03 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Jul 2017 15:25:22 -0700 Subject: binfmt: Introduce secureexec flag The bprm_secureexec hook can be moved earlier. Right now, it is called during create_elf_tables(), via load_binary(), via search_binary_handler(), via exec_binprm(). Nearly all (see exception below) state used by bprm_secureexec is created during the bprm_set_creds hook, called from prepare_binprm(). For all LSMs (except commoncaps described next), only the first execution of bprm_set_creds takes any effect (they all check bprm->called_set_creds which prepare_binprm() sets after the first call to the bprm_set_creds hook). However, all these LSMs also only do anything with bprm_secureexec when they detected a secure state during their first run of bprm_set_creds. Therefore, it is functionally identical to move the detection into bprm_set_creds, since the results from secureexec here only need to be based on the first call to the LSM's bprm_set_creds hook. The single exception is that the commoncaps secureexec hook also examines euid/uid and egid/gid differences which are controlled by bprm_fill_uid(), via prepare_binprm(), which can be called multiple times (e.g. binfmt_script, binfmt_misc), and may clear the euid/egid for the final load (i.e. the script interpreter). However, while commoncaps specifically ignores bprm->cred_prepared, and runs its bprm_set_creds hook each time prepare_binprm() may get called, it needs to base the secureexec decision on the final call to bprm_set_creds. As a result, it will need special handling. 
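Most LSMs make their bprm_set_creds hook idempotent with an early return keyed on bprm->called_set_creds, which is why folding the secureexec decision into that hook is safe for them; commoncap deliberately omits the early return and therefore needs the special handling described above. A schematic sketch of the common pattern (not literal code from any one LSM):

	#include <linux/binfmts.h>

	/* Typical LSM hook: only the first invocation per execve() acts,
	 * because prepare_binprm() sets bprm->called_set_creds after the
	 * first call; later calls for binfmt_script/misc interpreters
	 * return immediately. */
	static int example_bprm_set_creds(struct linux_binprm *bprm)
	{
		if (bprm->called_set_creds)
			return 0;
		/* ... compute the credential transition and, with this
		 * series, the secureexec decision, exactly once ... */
		return 0;
	}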
To begin this refactoring, this adds the secureexec flag to the bprm struct, and calls the secureexec hook during setup_new_exec(). This is safe since all the cred work is finished (and past the point of no return). This explicit call will be removed in later patches once the hook has been removed. Cc: David Howells Signed-off-by: Kees Cook Reviewed-by: John Johansen Acked-by: Serge Hallyn Reviewed-by: James Morris --- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/exec.c | 2 ++ include/linux/binfmts.h | 8 +++++++- 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 879ff9c7ffd0..3b7dda91b07b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -252,7 +252,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); - NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index cf93a4fad012..5aa9199dfb13 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -650,7 +650,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); - NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_EXECFN, bprm->exec); #ifdef ARCH_DLINFO diff --git a/fs/exec.c b/fs/exec.c index 26b98072be50..0f361115c88f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1343,6 +1343,8 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { + bprm->secureexec |= security_bprm_secureexec(bprm); + arch_pick_mmap_layout(current->mm); current->sas_ss_sp = current->sas_ss_size = 0; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 9023e1d2d5cd..16838ba7ee75 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -31,9 +31,15 @@ struct linux_binprm { * binfmt_script/misc). */ called_set_creds:1, - cap_effective:1;/* true if has elevated effective capabilities, + cap_effective:1,/* true if has elevated effective capabilities, * false if not; except for init which inherits * its parent's caps anyway */ + /* + * Set by bprm_set_creds hook to indicate a privilege-gaining + * exec has happened. Used to sanitize execution environment + * and to set AT_SECURE auxv for glibc. + */ + secureexec:1; #ifdef __alpha__ unsigned int taso:1; #endif -- cgit v1.2.3 From 46d98eb4e1d2bc225f661879e0e157a952107598 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Jul 2017 15:25:27 -0700 Subject: commoncap: Refactor to remove bprm_secureexec hook The commoncap implementation of the bprm_secureexec hook is the only LSM that depends on the final call to its bprm_set_creds hook (since it may be called for multiple files, it ignores bprm->called_set_creds). As a result, it cannot safely _clear_ bprm->secureexec since other LSMs may have set it. Instead, remove the bprm_secureexec hook by introducing a new flag to bprm specific to commoncap: cap_elevated. 
This is similar to cap_effective, but that is used for a specific subset of elevated privileges, and exists solely to track state from bprm_set_creds to bprm_secureexec. As such, it will be removed in the next patch. Here, set the new bprm->cap_elevated flag when setuid/setgid has happened from bprm_fill_uid() or fscapabilities have been prepared. This temporarily moves the bprm_secureexec hook to a static inline. The helper will be removed in the next patch; this makes the step easier to review and bisect, since this does not introduce any changes to inputs nor outputs to the "elevated privileges" calculation. The new flag is merged with the bprm->secureexec flag in setup_new_exec() since this marks the end of any further prepare_binprm() calls. Cc: Andy Lutomirski Signed-off-by: Kees Cook Reviewed-by: Andy Lutomirski Acked-by: James Morris Acked-by: Serge Hallyn --- fs/exec.c | 7 +++++++ include/linux/binfmts.h | 7 +++++++ include/linux/security.h | 3 +-- security/commoncap.c | 12 ++++++++---- 4 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 0f361115c88f..1536bc4502cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1345,6 +1345,13 @@ void setup_new_exec(struct linux_binprm * bprm) { bprm->secureexec |= security_bprm_secureexec(bprm); + /* + * Once here, prepare_binrpm() will not be called any more, so + * the final state of setuid/setgid/fscaps can be merged into the + * secureexec flag. + */ + bprm->secureexec |= bprm->cap_elevated; + arch_pick_mmap_layout(current->mm); current->sas_ss_sp = current->sas_ss_size = 0; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 16838ba7ee75..213c61fa3780 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -34,6 +34,13 @@ struct linux_binprm { cap_effective:1,/* true if has elevated effective capabilities, * false if not; except for init which inherits * its parent's caps anyway */ + /* + * True if most recent call to the commoncaps bprm_set_creds + * hook (due to multiple prepare_binprm() calls from the + * binfmt_script/misc handlers) resulted in elevated + * privileges. + */ + cap_elevated:1, /* * Set by bprm_set_creds hook to indicate a privilege-gaining * exec has happened. Used to sanitize execution environment diff --git a/include/linux/security.h b/include/linux/security.h index b6ea1dc9cc9d..f89832ccdf55 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -85,7 +85,6 @@ extern int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *inheritable, const kernel_cap_t *permitted); extern int cap_bprm_set_creds(struct linux_binprm *bprm); -extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int cap_inode_removexattr(struct dentry *dentry, const char *name); @@ -543,7 +542,7 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm) static inline int security_bprm_secureexec(struct linux_binprm *bprm) { - return cap_bprm_secureexec(bprm); + return 0; } static inline int security_sb_alloc(struct super_block *sb) diff --git a/security/commoncap.c b/security/commoncap.c index 7abebd782d5e..abb6050c8083 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -481,6 +481,8 @@ out: return rc; } +static int is_secureexec(struct linux_binprm *bprm); + /** * cap_bprm_set_creds - Set up the proposed credentials for execve(). 
* @bprm: The execution parameters, including the proposed creds @@ -614,11 +616,14 @@ skip: if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; + /* Check for privilege-elevated exec. */ + bprm->cap_elevated = is_secureexec(bprm); + return 0; } /** - * cap_bprm_secureexec - Determine whether a secure execution is required + * is_secureexec - Determine whether a secure execution is required * @bprm: The execution parameters * * Determine whether a secure execution is required, return 1 if it is, and 0 @@ -627,9 +632,9 @@ skip: * The credentials have been committed by this point, and so are no longer * available through @bprm->cred. */ -int cap_bprm_secureexec(struct linux_binprm *bprm) +static int is_secureexec(struct linux_binprm *bprm) { - const struct cred *cred = current_cred(); + const struct cred *cred = bprm->cred; kuid_t root_uid = make_kuid(cred->user_ns, 0); if (!uid_eq(cred->uid, root_uid)) { @@ -1079,7 +1084,6 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_set_creds, cap_bprm_set_creds), - LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), -- cgit v1.2.3 From ee67ae7ef6ff499137292ac8a9dfe86096796283 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Jul 2017 15:25:28 -0700 Subject: commoncap: Move cap_elevated calculation into bprm_set_creds Instead of a separate function, open-code the cap_elevated test, which lets us entirely remove bprm->cap_effective (to use the local "effective" variable instead), and more accurately examine euid/egid changes via the existing local "is_setid". The following LTP tests were run to validate the changes: # ./runltp -f syscalls -s cap # ./runltp -f securebits # ./runltp -f cap_bounds # ./runltp -f filecaps All kernel selftests for capabilities and exec continue to pass as well. Signed-off-by: Kees Cook Reviewed-by: James Morris Acked-by: Serge Hallyn Reviewed-by: Andy Lutomirski --- include/linux/binfmts.h | 3 --- security/commoncap.c | 52 ++++++++++--------------------------------------- 2 files changed, 10 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 213c61fa3780..fb44d6180ca0 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -31,9 +31,6 @@ struct linux_binprm { * binfmt_script/misc). */ called_set_creds:1, - cap_effective:1,/* true if has elevated effective capabilities, - * false if not; except for init which inherits - * its parent's caps anyway */ /* * True if most recent call to the commoncaps bprm_set_creds * hook (due to multiple prepare_binprm() calls from the diff --git a/security/commoncap.c b/security/commoncap.c index abb6050c8083..d8e26fb9781d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -285,15 +285,6 @@ int cap_capset(struct cred *new, return 0; } -/* - * Clear proposed capability sets for execve(). 
- */ -static inline void bprm_clear_caps(struct linux_binprm *bprm) -{ - cap_clear(bprm->cred->cap_permitted); - bprm->cap_effective = false; -} - /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV @@ -443,7 +434,7 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c int rc = 0; struct cpu_vfs_cap_data vcaps; - bprm_clear_caps(bprm); + cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; @@ -476,13 +467,11 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c out: if (rc) - bprm_clear_caps(bprm); + cap_clear(bprm->cred->cap_permitted); return rc; } -static int is_secureexec(struct linux_binprm *bprm); - /** * cap_bprm_set_creds - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds @@ -587,8 +576,6 @@ skip: if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; - bprm->cap_effective = effective; - /* * Audit candidate if current->cap_effective is set * @@ -617,35 +604,16 @@ skip: return -EPERM; /* Check for privilege-elevated exec. */ - bprm->cap_elevated = is_secureexec(bprm); - - return 0; -} - -/** - * is_secureexec - Determine whether a secure execution is required - * @bprm: The execution parameters - * - * Determine whether a secure execution is required, return 1 if it is, and 0 - * if it is not. - * - * The credentials have been committed by this point, and so are no longer - * available through @bprm->cred. - */ -static int is_secureexec(struct linux_binprm *bprm) -{ - const struct cred *cred = bprm->cred; - kuid_t root_uid = make_kuid(cred->user_ns, 0); - - if (!uid_eq(cred->uid, root_uid)) { - if (bprm->cap_effective) - return 1; - if (!cap_issubset(cred->cap_permitted, cred->cap_ambient)) - return 1; + bprm->cap_elevated = 0; + if (is_setid) { + bprm->cap_elevated = 1; + } else if (!uid_eq(new->uid, root_uid)) { + if (effective || + !cap_issubset(new->cap_permitted, new->cap_ambient)) + bprm->cap_elevated = 1; } - return (!uid_eq(cred->euid, cred->uid) || - !gid_eq(cred->egid, cred->gid)); + return 0; } /** -- cgit v1.2.3 From 2af622802696e1dbe28d81c8ea6355dc30800396 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Jul 2017 15:25:29 -0700 Subject: LSM: drop bprm_secureexec hook This removes the bprm_secureexec hook since the logic has been folded into the bprm_set_creds hook for all LSMs now. Cc: Eric W. Biederman Signed-off-by: Kees Cook Reviewed-by: John Johansen Acked-by: James Morris Acked-by: Serge Hallyn --- fs/exec.c | 2 -- include/linux/lsm_hooks.h | 14 +++++--------- include/linux/security.h | 6 ------ security/security.c | 5 ----- 4 files changed, 5 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 1536bc4502cc..eca0cb550a06 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1343,8 +1343,6 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { - bprm->secureexec |= security_bprm_secureexec(bprm); - /* * Once here, prepare_binrpm() will not be called any more, so * the final state of setuid/setgid/fscaps can be merged into the diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 3a90febadbe2..d1c7bef25691 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -43,7 +43,11 @@ * interpreters. The hook can tell whether it has already been called by * checking to see if @bprm->security is non-NULL. 
If so, then the hook * may decide either to retain the security information saved earlier or - * to replace it. + * to replace it. The hook must set @bprm->secureexec to 1 if a "secure + * exec" has happened as a result of this hook call. The flag is used to + * indicate the need for a sanitized execution environment, and is also + * passed in the ELF auxiliary table on the initial stack to indicate + * whether libc should enable secure mode. * @bprm contains the linux_binprm structure. * Return 0 if the hook is successful and permission is granted. * @bprm_check_security: @@ -71,12 +75,6 @@ * linux_binprm structure. This hook is a good place to perform state * changes on the process such as clearing out non-inheritable signal * state. This is called immediately after commit_creds(). - * @bprm_secureexec: - * Return a boolean value (0 or 1) indicating whether a "secure exec" - * is required. The flag is passed in the auxiliary table - * on the initial stack to the ELF interpreter to indicate whether libc - * should enable secure mode. - * @bprm contains the linux_binprm structure. * * Security hooks for filesystem operations. * @@ -1388,7 +1386,6 @@ union security_list_options { int (*bprm_set_creds)(struct linux_binprm *bprm); int (*bprm_check_security)(struct linux_binprm *bprm); - int (*bprm_secureexec)(struct linux_binprm *bprm); void (*bprm_committing_creds)(struct linux_binprm *bprm); void (*bprm_committed_creds)(struct linux_binprm *bprm); @@ -1710,7 +1707,6 @@ struct security_hook_heads { struct list_head vm_enough_memory; struct list_head bprm_set_creds; struct list_head bprm_check_security; - struct list_head bprm_secureexec; struct list_head bprm_committing_creds; struct list_head bprm_committed_creds; struct list_head sb_alloc_security; diff --git a/include/linux/security.h b/include/linux/security.h index f89832ccdf55..974bb9b0996c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -231,7 +231,6 @@ int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); void security_bprm_committed_creds(struct linux_binprm *bprm); -int security_bprm_secureexec(struct linux_binprm *bprm); int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); int security_sb_copy_data(char *orig, char *copy); @@ -540,11 +539,6 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { } -static inline int security_bprm_secureexec(struct linux_binprm *bprm) -{ - return 0; -} - static inline int security_sb_alloc(struct super_block *sb) { return 0; diff --git a/security/security.c b/security/security.c index 30132378d103..afc34f46c6c5 100644 --- a/security/security.c +++ b/security/security.c @@ -351,11 +351,6 @@ void security_bprm_committed_creds(struct linux_binprm *bprm) call_void_hook(bprm_committed_creds, bprm); } -int security_bprm_secureexec(struct linux_binprm *bprm) -{ - return call_int_hook(bprm_secureexec, 0, bprm); -} - int security_sb_alloc(struct super_block *sb) { return call_int_hook(sb_alloc_security, 0, sb); -- cgit v1.2.3 From c4b3eacc1dfef5f36dbdf9a99be37834d3e23ed0 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Mon, 17 Jul 2017 17:54:07 +0200 Subject: mtd: spi-nor: Recover from Spansion/Cypress errors S25FL{128|256|512}S datasheets say: "When P_ERR or E_ERR bits are set to one, the WIP bit will remain set to one indicating the device remains busy and unable to receive new operation 
commands. A Clear Status Register (CLSR) command must be received to return the device to standby mode." Current spi-nor code works until first error occurs, but write/erase errors are not just rare hardware failures, they also occur if user tries to flash write-protected areas. After such attempt no SPI command can be executed any more and even read fails. This patch adds support for P_ERR and E_ERR bits in Status Register 1 (so that operation fails immediately and not after a long timeout) and proper recovery from the error condition. Tested on Spansion S25FS128S, which is supported by S25FL129P entry. Signed-off-by: Alexander Sverdlin Signed-off-by: Cyrille Pitchen --- drivers/mtd/spi-nor/spi-nor.c | 29 +++++++++++++++++++++-------- include/linux/mtd/spi-nor.h | 5 +++++ 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 196b52f083ae..c70c2ebc8661 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -88,6 +88,7 @@ struct flash_info { */ #define NO_CHIP_ERASE BIT(12) /* Chip does not support chip erase */ #define SPI_NOR_SKIP_SFDP BIT(13) /* Skip parsing of SFDP tables */ +#define USE_CLSR BIT(14) /* use CLSR command */ }; #define JEDEC_MFR(info) ((info)->id[0]) @@ -308,8 +309,18 @@ static inline int spi_nor_sr_ready(struct spi_nor *nor) int sr = read_sr(nor); if (sr < 0) return sr; - else - return !(sr & SR_WIP); + + if (nor->flags & SNOR_F_USE_CLSR && sr & (SR_E_ERR | SR_P_ERR)) { + if (sr & SR_E_ERR) + dev_err(nor->dev, "Erase Error occurred\n"); + else + dev_err(nor->dev, "Programming Error occurred\n"); + + nor->write_reg(nor, SPINOR_OP_CLSR, NULL, 0); + return -EIO; + } + + return !(sr & SR_WIP); } static inline int spi_nor_fsr_ready(struct spi_nor *nor) @@ -1043,15 +1054,15 @@ static const struct flash_info spi_nor_ids[] = { */ { "s25sl032p", INFO(0x010215, 0x4d00, 64 * 1024, 64, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, { "s25sl064p", INFO(0x010216, 0x4d00, 64 * 1024, 128, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, - { "s25fl256s0", INFO(0x010219, 0x4d00, 256 * 1024, 128, 0) }, - { "s25fl256s1", INFO(0x010219, 0x4d01, 64 * 1024, 512, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, - { "s25fl512s", INFO(0x010220, 0x4d00, 256 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, + { "s25fl256s0", INFO(0x010219, 0x4d00, 256 * 1024, 128, USE_CLSR) }, + { "s25fl256s1", INFO(0x010219, 0x4d01, 64 * 1024, 512, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | USE_CLSR) }, + { "s25fl512s", INFO(0x010220, 0x4d00, 256 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | USE_CLSR) }, { "s70fl01gs", INFO(0x010221, 0x4d00, 256 * 1024, 256, 0) }, { "s25sl12800", INFO(0x012018, 0x0300, 256 * 1024, 64, 0) }, { "s25sl12801", INFO(0x012018, 0x0301, 64 * 1024, 256, 0) }, - { "s25fl128s", INFO6(0x012018, 0x4d0180, 64 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, - { "s25fl129p0", INFO(0x012018, 0x4d00, 256 * 1024, 64, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, - { "s25fl129p1", INFO(0x012018, 0x4d01, 64 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) }, + { "s25fl128s", INFO6(0x012018, 0x4d0180, 64 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | USE_CLSR) }, + { "s25fl129p0", INFO(0x012018, 0x4d00, 256 * 1024, 64, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | USE_CLSR) }, + { "s25fl129p1", INFO(0x012018, 0x4d01, 64 * 1024, 256, SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ | USE_CLSR) }, { "s25sl004a", INFO(0x010212, 0, 64 * 1024, 8, 0) }, { "s25sl008a", INFO(0x010213, 0, 64 * 1024, 
16, 0) }, { "s25sl016a", INFO(0x010214, 0, 64 * 1024, 32, 0) }, @@ -2707,6 +2718,8 @@ int spi_nor_scan(struct spi_nor *nor, const char *name, nor->flags |= SNOR_F_HAS_SR_TB; if (info->flags & NO_CHIP_ERASE) nor->flags |= SNOR_F_NO_OP_CHIP_ERASE; + if (info->flags & USE_CLSR) + nor->flags |= SNOR_F_USE_CLSR; if (info->flags & SPI_NOR_NO_ERASE) mtd->flags |= MTD_NO_ERASE; diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 0df3638ff0b8..1f0a7fc7772f 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -105,6 +105,7 @@ /* Used for Spansion flashes only. */ #define SPINOR_OP_BRWR 0x17 /* Bank register write */ +#define SPINOR_OP_CLSR 0x30 /* Clear status register 1 */ /* Used for Micron flashes only. */ #define SPINOR_OP_RD_EVCR 0x65 /* Read EVCR register */ @@ -119,6 +120,9 @@ #define SR_BP2 BIT(4) /* Block protect 2 */ #define SR_TB BIT(5) /* Top/Bottom protect */ #define SR_SRWD BIT(7) /* SR write protect */ +/* Spansion/Cypress specific status bits */ +#define SR_E_ERR BIT(5) +#define SR_P_ERR BIT(6) #define SR_QUAD_EN_MX BIT(6) /* Macronix Quad I/O */ @@ -224,6 +228,7 @@ enum spi_nor_option_flags { SNOR_F_NO_OP_CHIP_ERASE = BIT(2), SNOR_F_S3AN_ADDR_DEFAULT = BIT(3), SNOR_F_READY_XSR_RDY = BIT(4), + SNOR_F_USE_CLSR = BIT(5), }; /** -- cgit v1.2.3 From d31ae2548142b7cd12404929ef3a13ae27c9d961 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Aug 2017 12:00:39 -0400 Subject: sunrpc: Const-ify all instances of struct rpc_xprt_ops After transport instance creation, these function pointers never change. Mark them as constant to prevent their use as an attack vector for code injections. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 4 ++-- net/sunrpc/xprtsock.c | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index eab1c749e192..f60c55ca2e2e 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -174,7 +174,7 @@ enum xprt_transports { struct rpc_xprt { struct kref kref; /* Reference count */ - struct rpc_xprt_ops * ops; /* transport methods */ + const struct rpc_xprt_ops *ops; /* transport methods */ const struct rpc_timeout *timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c676ed0efb5a..ca41e28d2b36 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -266,7 +266,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xprt_rdma_bc_procs = { +static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d1c458e5ec4d..42752e4cc996 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,7 +149,7 @@ static struct ctl_table sunrpc_table[] = { #endif -static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ +static const struct rpc_xprt_ops xprt_rdma_procs; static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -811,7 +811,7 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt) * Plumbing for rpc 
transport switch and kernel module */ -static struct rpc_xprt_ops xprt_rdma_procs = { +static const struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4f154d388748..5cf17001f0e2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2724,7 +2724,7 @@ static void bc_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xs_local_ops = { +static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_alloc_slot, @@ -2742,7 +2742,7 @@ static struct rpc_xprt_ops xs_local_ops = { .disable_swap = xs_disable_swap, }; -static struct rpc_xprt_ops xs_udp_ops = { +static const struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, @@ -2764,7 +2764,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .inject_disconnect = xs_inject_disconnect, }; -static struct rpc_xprt_ops xs_tcp_ops = { +static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_lock_and_alloc_slot, @@ -2795,7 +2795,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { * The rpc_xprt_ops for the server backchannel */ -static struct rpc_xprt_ops bc_tcp_ops = { +static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, -- cgit v1.2.3 From c39a0e2c8850f08249383f2425dbd8dbe4baad69 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:20 -0700 Subject: x86/perf/cqm: Wipe out perf based cqm 'perf cqm' never worked due to the incompatibility between perf infrastructure and cqm hardware support. The hardware uses RMIDs to track the llc occupancy of tasks and these RMIDs are per package. This makes monitoring a hierarchy like cgroup along with monitoring of tasks separately difficult and several patches sent to lkml to fix them were NACKed. Further more, the following issues in the current perf cqm make it almost unusable: 1. No support to monitor the same group of tasks for which we do allocation using resctrl. 2. It gives random and inaccurate data (mostly 0s) once we run out of RMIDs due to issues in Recycling. 3. Recycling results in inaccuracy of data because we cannot guarantee that the RMID was stolen from a task when it was not pulling data into cache or even when it pulled the least data. Also for monitoring llc_occupancy, if we stop using an RMID_x and then start using an RMID_y after we reclaim an RMID from an other event, we miss accounting all the occupancy that was tagged to RMID_x at a later perf_count. 2. Recycling code makes the monitoring code complex including scheduling because the event can lose RMID any time. Since MBM counters count bandwidth for a period of time by taking snap shot of total bytes at two different times, recycling complicates the way we count MBM in a hierarchy. Also we need a spin lock while we do the processing to account for MBM counter overflow. We also currently use a spin lock in scheduling to prevent the RMID from being taken away. 4. Lack of support when we run different kind of event like task, system-wide and cgroup events together. Data mostly prints 0s. 
This is also because we can have only one RMID tied to a cpu as defined by the cqm hardware but a perf can at the same time tie multiple events during one sched_in. 5. No support of monitoring a group of tasks. There is partial support for cgroup but it does not work once there is a hierarchy of cgroups or if we want to monitor a task in a cgroup and the cgroup itself. 6. No support for monitoring tasks for the lifetime without perf overhead. 7. It reported the aggregate cache occupancy or memory bandwidth over all sockets. But most cloud and VMM based use cases want to know the individual per-socket usage. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-2-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/events/intel/Makefile | 2 +- arch/x86/events/intel/cqm.c | 1766 ------------------------------- arch/x86/include/asm/intel_rdt_common.h | 2 - arch/x86/kernel/cpu/intel_rdt.c | 8 + include/linux/perf_event.h | 18 - kernel/events/core.c | 14 +- 6 files changed, 10 insertions(+), 1800 deletions(-) delete mode 100644 arch/x86/events/intel/cqm.c (limited to 'include') diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..e9d8520a801a 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 2521f771f2f5..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include -#include -#include -#include -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. 
- * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. 
- */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? 
- */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now.
- */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is an mbm event - * because there could be multiple types of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. - */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted.
- * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. - */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list.
- */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There are problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensures that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. - */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stabilize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value.
- * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. - */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. 
- * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to hold the cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads.
- */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values, except in the case - * of multiple events in the same group, where we still need to - * read the other events. This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busy performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read.
- */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. 
- */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work.
- */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. 
- */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that 
not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. - */ - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE, - "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); -out: - cpus_read_unlock(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h index b31081b89407..c95321870842 100644 --- a/arch/x86/include/asm/intel_rdt_common.h +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -7,7 +7,6 @@ * struct intel_pqr_state - State cache for the PQR MSR * @rmid: The cached Resource Monitoring ID * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid * * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always @@ -19,7 +18,6 @@ struct intel_pqr_state { u32 rmid; u32 closid; - int rmid_usecnt; }; DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..989a997fce47 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -40,6 +40,14 @@ DEFINE_MUTEX(rdtgroup_mutex); DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. 
Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); + /* * Used to store the max resource name width and max resource data width * to display the schemata in a tabular format diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3b873fc59e4..4572dbabd4e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -139,14 +139,6 @@ struct hw_perf_event { /* for tp_event->class */ struct list_head tp_list; }; - struct { /* intel_cqm */ - int cqm_state; - u32 cqm_rmid; - int is_group_event; - struct list_head cqm_events_entry; - struct list_head cqm_groups_entry; - struct list_head cqm_group_entry; - }; struct { /* itrace */ int itrace_started; }; @@ -416,11 +408,6 @@ struct pmu { size_t task_ctx_size; - /* - * Return the count value for a counter. - */ - u64 (*count) (struct perf_event *event); /*optional*/ - /* * Set up pmu-private data structures for an AUX area */ @@ -1111,11 +1098,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, __perf_event_task_sched_out(prev, next); } -static inline u64 __perf_event_count(struct perf_event *event) -{ - return local64_read(&event->count) + atomic64_read(&event->child_count); -} - extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..6e171540c0af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3625,10 +3625,7 @@ unlock: static inline u64 perf_event_count(struct perf_event *event) { - if (event->pmu->count) - return event->pmu->count(event); - - return __perf_event_count(event); + return local64_read(&event->count) + atomic64_read(&event->child_count); } /* @@ -3659,15 +3656,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } - /* - * It must not have a pmu::count method, those are not - * NMI safe. - */ - if (event->pmu->count) { - ret = -EOPNOTSUPP; - goto out; - } - /* If this is a per-task event, it must be for current */ if ((event->attach_state & PERF_ATTACH_TASK) && event->hw.target != current) { -- cgit v1.2.3 From f01d7d51f577b5dc0fa5919ab8a9228e2bf49f3e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:22 -0700 Subject: x86/intel_rdt: Introduce a common compile option for RDT We currently have a CONFIG_RDT_A which is for RDT(Resource directory technology) allocation based resctrl filesystem interface. As a preparation to add support for RDT monitoring as well into the same resctrl filesystem, change the config option to be CONFIG_RDT which would include both RDT allocation and monitoring code. No functional change. 
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-4-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/Kconfig | 12 ++++++------ arch/x86/include/asm/intel_rdt.h | 4 ++-- arch/x86/kernel/cpu/Makefile | 2 +- include/linux/sched.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..b1abda70c2d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -424,16 +424,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 597dc4995678..ae1efc3609d3 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_INTEL_RDT_H #define _ASM_X86_INTEL_RDT_H -#ifdef CONFIG_INTEL_RDT_A +#ifdef CONFIG_INTEL_RDT #include #include @@ -282,5 +282,5 @@ static inline void intel_rdt_sched_in(void) static inline void intel_rdt_sched_in(void) {} -#endif /* CONFIG_INTEL_RDT_A */ +#endif /* CONFIG_INTEL_RDT */ #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..3622ca2b1555 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..20b2ff2f4fde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -898,7 +898,7 @@ struct task_struct { /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif -#ifdef CONFIG_INTEL_RDT_A +#ifdef CONFIG_INTEL_RDT int closid; #endif #ifdef CONFIG_FUTEX -- cgit v1.2.3 From 0734ded1abee9439b0c5d7b62af1ead78aab895b Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:33 -0700 Subject: x86/intel_rdt: Change closid type from int to u32 OS associates a CLOSid(Class of service id) to a task by writing the high 32 bits of per CPU IA32_PQR_ASSOC MSR when a task is scheduled in. CPUID.(EAX=10H, ECX=1):EDX[15:0] enumerates the max CLOSID supported and it is zero indexed. Hence change the type to u32 from int. 
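For context, here is a rough sketch of what this type change means at the hardware boundary (an illustrative helper, not code from this patch; per the intel_rdt_common.h comment above, the MSR carries the RMID in its lower bits and the CLOSID in the upper 32 bits, and wrmsr() takes the low word first):

	/*
	 * Sketch only: compose MSR_IA32_PQR_ASSOC from the two per-task IDs.
	 * Both are naturally unsigned 32-bit values.
	 */
	static inline void pqr_assoc_write(u32 rmid, u32 closid)
	{
		wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid); /* EAX = rmid, EDX = closid */
	}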
Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-15-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 2 +- arch/x86/kernel/cpu/intel_rdt.h | 2 +- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- include/linux/sched.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 4dee77b6de07..1d8d45a773c2 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -46,7 +46,7 @@ static inline void intel_rdt_sched_in(void) { if (static_branch_likely(&rdt_alloc_enable_key)) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; + u32 closid; /* * If this task has a closid assigned, use it. diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index e8fb08f9ea89..b2a2de360961 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -72,7 +72,7 @@ struct mongroup { struct rdtgroup { struct kernfs_node *kn; struct list_head rdtgroup_list; - int closid; + u32 closid; struct cpumask cpu_mask; int flags; atomic_t waitcount; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index bee9f885dd20..840405421841 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -77,7 +77,7 @@ static void closid_init(void) static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; diff --git a/include/linux/sched.h b/include/linux/sched.h index 20b2ff2f4fde..8839fd09540c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -899,7 +899,7 @@ struct task_struct { struct list_head cg_list; #endif #ifdef CONFIG_INTEL_RDT - int closid; + u32 closid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; -- cgit v1.2.3 From d6aaba615a482ce7d3ec218cf7b8d02d0d5753b8 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:34 -0700 Subject: x86/intel_rdt/cqm: Add tasks file support The root directory, ctrl_mon and monitor groups are populated with a read/write file named "tasks". When read, it shows all the task IDs assigned to the resource group. Tasks can be added to groups by writing the PID to the file. A task can be present in one "ctrl_mon" group "and" one "monitor" group. IOW a PID_x can be seen in a ctrl_mon group and a monitor group at the same time. When a task is added to a ctrl_mon group, it is automatically removed from the previous ctrl_mon group where it belonged. Similarly, if a task is moved to a monitor group, it is removed from the previous monitor group. Also, since monitor groups can only hold a subset of the tasks of their parent ctrl_mon group, a task can be moved to a monitor group only if it's already present in the parent ctrl_mon group. Task membership is indicated by a new field in the task_struct "u32 rmid" which holds the RMID for the task. RMID=0 is reserved for the default root group, to which tasks belong at mount.
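The move policy described above condenses to the following sketch (a hypothetical helper summarizing the __rdtgroup_move_task() change in the patch below; locking and the task_work plumbing are omitted):

	/*
	 * Sketch only: ctrl_mon moves update both IDs; monitor moves update
	 * the RMID alone, and only within the parent ctrl_mon group.
	 */
	static int rdt_move_task_sketch(struct task_struct *tsk,
					struct rdtgroup *rdtgrp)
	{
		if (rdtgrp->type == RDTCTRL_GROUP) {
			tsk->closid = rdtgrp->closid;
			tsk->rmid = rdtgrp->mon.rmid;
			return 0;
		}
		if (rdtgrp->mon.parent->closid != tsk->closid)
			return -EINVAL;	/* not in the parent ctrl_mon group */
		tsk->rmid = rdtgrp->mon.rmid;
		return 0;
	}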
[tony: zero the rmid if rdtgroup was deleted when task was being moved] Signed-off-by: Tony Luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-16-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 19 +++++++++++++++++-- include/linux/sched.h | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 840405421841..aedad1d17880 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -314,6 +314,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -352,7 +353,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -432,7 +446,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8839fd09540c..067a41ac5fd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -900,6 +900,7 @@ struct task_struct { #endif #ifdef CONFIG_INTEL_RDT u32 closid; + u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; -- cgit v1.2.3 From 306b13eb3cf9515a8214bbf5d69d811371d05792 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 28 Jul 2017 16:22:41 -0700 Subject: proto_ops: Add locked held versions of sendmsg and sendpage Add new proto_ops sendmsg_locked and sendpage_locked that can be called when the socket lock is already held. Correspondingly, add kernel_sendmsg_locked and kernel_sendpage_locked as front end functions. These functions will be used in zero proxy so that we can take the socket lock in a ULP sendmsg/sendpage and then directly call the backend transport proto_ops functions. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/net.h | 12 ++++++++++++ include/net/sock.h | 3 +++ include/net/tcp.h | 3 +++ net/core/sock.c | 22 ++++++++++++++++++++++ net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp.c | 39 ++++++++++++++++++++++++++------------- net/socket.c | 27 +++++++++++++++++++++++++++ 7 files changed, 95 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index dda2cc939a53..b5c15b31709b 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -190,8 +190,16 @@ struct proto_ops { struct pipe_inode_info *pipe, size_t len, unsigned int flags); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); + + /* The following functions are called internally by kernel with + * sock lock already held. + */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); + int (*sendpage_locked)(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, + size_t size); }; #define DECLARE_SOCKADDR(type, dst, src) \ @@ -279,6 +287,8 @@ do { \ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); +int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); @@ -297,6 +307,8 @@ int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen); int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); +int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags); int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); diff --git a/include/net/sock.h b/include/net/sock.h index 7c0632c7e870..393c38e9f6aa 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1582,11 +1582,14 @@ int sock_no_shutdown(struct socket *, int); int sock_no_getsockopt(struct socket *, int , int, char __user *, int __user *); int sock_no_setsockopt(struct socket *, int, int, char __user *, unsigned int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); +int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); +ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags); /* * Functions to fill in entries in struct proto_ops when a protocol diff --git a/include/net/tcp.h b/include/net/tcp.h index 3ecb62811004..bb1881b4ce48 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -350,8 +350,11 @@ int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int 
flags); void tcp_release_cb(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index ac2a404c73eb..742f68c9c84a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2500,6 +2500,12 @@ int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) } EXPORT_SYMBOL(sock_no_sendmsg); +int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_sendmsg_locked); + int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { @@ -2528,6 +2534,22 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz } EXPORT_SYMBOL(sock_no_sendpage); +ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); + kunmap(page); + return res; +} +EXPORT_SYMBOL(sock_no_sendpage_locked); + /* * Default Socket Callbacks */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ce44fb7d498..f0103ffe1cdb 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -944,6 +944,8 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .sendmsg_locked = tcp_sendmsg_locked, + .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5326b50a3450..9dd6f4dba9b1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1046,23 +1046,29 @@ out_err: } EXPORT_SYMBOL_GPL(do_tcp_sendpages); -int tcp_sendpage(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) { - ssize_t res; - if (!(sk->sk_route_caps & NETIF_F_SG) || !sk_check_csum_caps(sk)) return sock_no_sendpage(sk->sk_socket, page, offset, size, flags); - lock_sock(sk); - tcp_rate_check_app_limited(sk); /* is sending application-limited? 
*/ - res = do_tcp_sendpages(sk, page, offset, size, flags); + return do_tcp_sendpages(sk, page, offset, size, flags); +} + +int tcp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendpage_locked(sk, page, offset, size, flags); release_sock(sk); - return res; + + return ret; } EXPORT_SYMBOL(tcp_sendpage); @@ -1156,7 +1162,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return err; } -int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1167,8 +1173,6 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool sg; long timeo; - lock_sock(sk); - flags = msg->msg_flags; if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); @@ -1377,7 +1381,6 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: - release_sock(sk); return copied + copied_syn; do_fault: @@ -1401,9 +1404,19 @@ out_err: sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } - release_sock(sk); return err; } + +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int ret; + + lock_sock(sk); + ret = tcp_sendmsg_locked(sk, msg, size); + release_sock(sk); + + return ret; +} EXPORT_SYMBOL(tcp_sendmsg); /* diff --git a/net/socket.c b/net/socket.c index cb0fdf799f40..b332d1e8e4e4 100644 --- a/net/socket.c +++ b/net/socket.c @@ -652,6 +652,20 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_sendmsg); +int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + struct socket *sock = sk->sk_socket; + + if (!sock->ops->sendmsg_locked) + return sock_no_sendmsg_locked(sk, msg, size); + + iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + + return sock->ops->sendmsg_locked(sk, msg, msg_data_left(msg)); +} +EXPORT_SYMBOL(kernel_sendmsg_locked); + static bool skb_is_err_queue(const struct sk_buff *skb) { /* pkt_type of skbs enqueued on the error queue are set to @@ -3376,6 +3390,19 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset, } EXPORT_SYMBOL(kernel_sendpage); +int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct socket *sock = sk->sk_socket; + + if (sock->ops->sendpage_locked) + return sock->ops->sendpage_locked(sk, page, offset, size, + flags); + + return sock_no_sendpage_locked(sk, page, offset, size, flags); +} +EXPORT_SYMBOL(kernel_sendpage_locked); + int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg) { mm_segment_t oldfs = get_fs(); -- cgit v1.2.3 From 20bf50de3028cb15fa81e1d1e63ab6e0c85257fc Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 28 Jul 2017 16:22:42 -0700 Subject: skbuff: Function to send an skbuff on a socket Add skb_send_sock to send an skbuff on a socket within the kernel. Arguments include an offset so that an skbuff might be sent in multiple calls (e.g. send buffer limit is hit). Signed-off-by: Tom Herbert Signed-off-by: David S.
--- include/linux/skbuff.h | 3 ++ net/core/skbuff.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4093552be1de..18e76bf9574e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3113,6 +3113,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, unsigned int flags); +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len); +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c27da51d14e4..9c0e015ff3fe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1982,6 +1982,107 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); +/* Send skb data on a socket. Socket must be locked. */ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + unsigned int orig_len = len; + struct sk_buff *head = skb; + unsigned short fragidx; + int slen, ret; + +do_frag_list: + + /* Deal with head data */ + while (offset < skb_headlen(skb) && len) { + struct kvec kv; + struct msghdr msg; + + slen = min_t(int, len, skb_headlen(skb) - offset); + kv.iov_base = skb->data + offset; + kv.iov_len = slen; + memset(&msg, 0, sizeof(msg)); + + ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); + if (ret <= 0) + goto error; + + offset += ret; + len -= ret; + } + + /* All the data was skb head? */ + if (!len) + goto out; + + /* Make offset relative to start of frags */ + offset -= skb_headlen(skb); + + /* Find where we are in frag list */ + for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; + + if (offset < frag->size) + break; + + offset -= frag->size; + } + + for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; + + slen = min_t(size_t, len, frag->size - offset); + + while (slen) { + ret = kernel_sendpage_locked(sk, frag->page.p, + frag->page_offset + offset, + slen, MSG_DONTWAIT); + if (ret <= 0) + goto error; + + len -= ret; + offset += ret; + slen -= ret; + } + + offset = 0; + } + + if (len) { + /* Process any frag lists */ + + if (skb == head) { + if (skb_has_frag_list(skb)) { + skb = skb_shinfo(skb)->frag_list; + goto do_frag_list; + } + } else if (skb->next) { + skb = skb->next; + goto do_frag_list; + } + } + +out: + return orig_len - len; + +error: + return orig_len == len ? ret : orig_len - len; +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked); + +/* Send skb data on a socket.
*/ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + int ret = 0; + + lock_sock(sk); + ret = skb_send_sock_locked(sk, skb, offset, len); + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL_GPL(skb_send_sock); + /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.3 From bbb03029a899679d73e62d7e6ae80348cc5d0054 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 28 Jul 2017 16:22:43 -0700 Subject: strparser: Generalize strparser Generalize strparser beyond just being used in conjunction with read_sock. strparser will also be used in the send path with zero proxy. The primary change is to create a strp_process function that performs the critical processing on skbs. The documentation is also updated to reflect the new uses. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/strparser.txt | 207 +++++++++++++++------- include/net/strparser.h | 119 +++++++------ net/kcm/kcmproc.c | 34 ++-- net/kcm/kcmsock.c | 38 ++-- net/strparser/strparser.c | 313 ++++++++++++++++++++------------- 5 files changed, 424 insertions(+), 287 deletions(-) (limited to 'include') diff --git a/Documentation/networking/strparser.txt b/Documentation/networking/strparser.txt index a0bf573dfa61..fe01302471ae 100644 --- a/Documentation/networking/strparser.txt +++ b/Documentation/networking/strparser.txt @@ -1,45 +1,107 @@ -Stream Parser ------------- +Stream Parser (strparser) + +Introduction +============ The stream parser (strparser) is a utility that parses messages of an -application layer protocol running over a TCP connection. The stream +application layer protocol running over a data stream. The stream parser works in conjunction with an upper layer in the kernel to provide kernel support for application layer messages. For instance, Kernel Connection Multiplexor (KCM) uses the Stream Parser to parse messages using a BPF program. +The strparser works in one of two modes: receive callback or general +mode. + +In receive callback mode, the strparser is called from the data_ready +callback of a TCP socket. Messages are parsed and delivered as they are +received on the socket. + +In general mode, a sequence of skbs is fed to strparser from an +outside source. Messages are parsed and delivered as the sequence is +processed. This mode allows strparser to be applied to arbitrary +streams of data. + Interface ---------- +========= The API includes a context structure, a set of callbacks, utility -functions, and a data_ready function. The callbacks include -a parse_msg function that is called to perform parsing (e.g. -BPF parsing in case of KCM), and a rcv_msg function that is called -when a full message has been completed. +functions, and a data_ready function for receive callback mode. The +callbacks include a parse_msg function that is called to perform +parsing (e.g. BPF parsing in case of KCM), and a rcv_msg function +that is called when a full message has been completed. -A stream parser can be instantiated for a TCP connection. This is done -by: +Functions +========= -strp_init(struct strparser *strp, struct sock *csk, +strp_init(struct strparser *strp, struct sock *sk, struct strp_callbacks *cb) -strp is a struct of type strparser that is allocated by the upper layer. -csk is the TCP socket associated with the stream parser. Callbacks are -called by the stream parser. + Called to initialize a stream parser. strp is a struct of type + strparser that is allocated by the upper layer.
sk is the TCP + socket associated with the stream parser for use with receive + callback mode; in general mode this is set to NULL. Callbacks + are called by the stream parser (the callbacks are listed below). + +void strp_pause(struct strparser *strp) + + Temporarily pause a stream parser. Message parsing is suspended + and no new messages are delivered to the upper layer. + +void strp_unpause(struct strparser *strp) + + Unpause a paused stream parser. + +void strp_stop(struct strparser *strp); + + strp_stop is called to completely stop stream parser operations. + This is called internally when the stream parser encounters an + error, and it is called from the upper layer to stop parsing + operations. + +void strp_done(struct strparser *strp); + + strp_done is called to release any resources held by the stream + parser instance. This must be called after the stream processor + has been stopped. + +int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) + + strp_process is called in general mode for a stream parser to + parse an sk_buff. The number of bytes processed or a negative + error number is returned. Note that strp_process does not + consume the sk_buff. max_msg_size is the maximum message size + the stream parser will parse. timeo is the timeout for completing + a message. + +void strp_data_ready(struct strparser *strp); + + The upper layer calls strp_data_ready when data is ready on + the lower socket for strparser to process. This should be called + from a data_ready callback that is set on the socket. Note that + the maximum message size is the limit of the receive socket + buffer and the message timeout is the receive timeout for the socket. + +void strp_check_rcv(struct strparser *strp); + + strp_check_rcv is called to check for new messages on the socket. + This is normally called at initialization of a stream parser + instance or after strp_unpause. Callbacks ---------- +========= -There are four callbacks: +There are six callbacks: int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); parse_msg is called to determine the length of the next message in the stream. The upper layer must implement this function. It should parse the sk_buff as containing the headers for the - next application layer messages in the stream. + next application layer message in the stream. - The skb->cb in the input skb is a struct strp_rx_msg. Only + The skb->cb in the input skb is a struct strp_msg. Only the offset field is relevant in parse_msg and gives the offset where the message starts in the skb. @@ -50,26 +112,41 @@ int (*parse_msg)(struct strparser *strp, struct sk_buff *skb); -ESTRPIPE : current message should not be processed by the kernel, return control of the socket to userspace which can proceed to read the messages itself - other < 0 : Error is parsing, give control back to userspace + other < 0 : Error in parsing, give control back to userspace assuming that synchronization is lost and the stream is unrecoverable (application expected to close TCP socket) In the case that an error is returned (return value is less than - zero) the stream parser will set the error on TCP socket and wake - it up. If parse_msg returned -ESTRPIPE and the stream parser had - previously read some bytes for the current message, then the error - set on the attached socket is ENODATA since the stream is - unrecoverable in that case.
+ zero) and the parser is in receive callback mode, then it will set + the error on the TCP socket and wake it up. If parse_msg returned + -ESTRPIPE and the stream parser had previously read some bytes for + the current message, then the error set on the attached socket is + ENODATA since the stream is unrecoverable in that case. + +void (*lock)(struct strparser *strp) + + The lock callback is called to lock the strp structure when + the strparser is performing an asynchronous operation (such as + processing a timeout). In receive callback mode the default + function is lock_sock for the associated socket. In general + mode the callback must be set appropriately. + +void (*unlock)(struct strparser *strp) + + The unlock callback is called to release the lock obtained + by the lock callback. In receive callback mode the default + function is release_sock for the associated socket. In general + mode the callback must be set appropriately. void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); rcv_msg is called when a full message has been received and is queued. The callee must consume the sk_buff; it can call strp_pause to prevent any further messages from being - received in rcv_msg (see strp_pause below). This callback + received in rcv_msg (see strp_pause above). This callback must be set. - The skb->cb in the input skb is a struct strp_rx_msg. This + The skb->cb in the input skb is a struct strp_msg. This struct contains two fields: offset and full_len. Offset is where the message starts in the skb, and full_len is the length of the message. skb->len - offset may be greater @@ -78,59 +155,53 @@ void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock_done)(struct strparser *strp, int err); read_sock_done is called when the stream parser is done reading - the TCP socket. The stream parser may read multiple messages - in a loop and this function allows cleanup to occur when existing - the loop. If the callback is not set (NULL in strp_init) a - default function is used. + the TCP socket in receive callback mode. The stream parser may + read multiple messages in a loop and this function allows cleanup + to occur when exiting the loop. If the callback is not set (NULL + in strp_init) a default function is used. void (*abort_parser)(struct strparser *strp, int err); This function is called when the stream parser encounters an error - in parsing. The default function stops the stream parser for the - TCP socket and sets the error in the socket. The default function - can be changed by setting the callback to non-NULL in strp_init. + in parsing. The default function stops the stream parser and + sets the error in the socket if the parser is in receive callback + mode. The default function can be changed by setting the callback + to non-NULL in strp_init. -Functions ---------- +Statistics +========== -The upper layer calls strp_tcp_data_ready when data is ready on the lower -socket for strparser to process. This should be called from a data_ready -callback that is set on the socket. +Various counters are kept for each stream parser instance. These are in +the strp_stats structure. strp_aggr_stats is a convenience structure for +accumulating statistics for multiple stream parser instances. +save_strp_stats and aggregate_strp_stats are helper functions to save +and aggregate statistics. -strp_stop is called to completely stop stream parser operations.
This -is called internally when the stream parser encounters an error, and -it is called from the upper layer when unattaching a TCP socket. +Message assembly limits +======================= -strp_done is called to unattach the stream parser from the TCP socket. -This must be called after the stream processor has be stopped. +The stream parser provide mechanisms to limit the resources consumed by +message assembly. -strp_check_rcv is called to check for new messages on the socket. This -is normally called at initialization of the a stream parser instance -of after strp_unpause. +A timer is set when assembly starts for a new message. In receive +callback mode the message timeout is taken from rcvtime for the +associated TCP socket. In general mode, the timeout is passed as an +argument in strp_process. If the timer fires before assembly completes +the stream parser is aborted and the ETIMEDOUT error is set on the TCP +socket if in receive callback mode. -Statistics ----------- +In receive callback mode, message length is limited to the receive +buffer size of the associated TCP socket. If the length returned by +parse_msg is greater than the socket buffer size then the stream parser +is aborted with EMSGSIZE error set on the TCP socket. Note that this +makes the maximum size of receive skbuffs for a socket with a stream +parser to be 2*sk_rcvbuf of the TCP socket. -Various counters are kept for each stream parser for a TCP socket. -These are in the strp_stats structure. strp_aggr_stats is a convenience -structure for accumulating statistics for multiple stream parser -instances. save_strp_stats and aggregate_strp_stats are helper functions -to save and aggregate statistics. +In general mode the message length limit is passed in as an argument +to strp_process. -Message assembly limits ------------------------ +Author +====== -The stream parser provide mechanisms to limit the resources consumed by -message assembly. +Tom Herbert (tom@quantonium.net) -A timer is set when assembly starts for a new message. The message -timeout is taken from rcvtime for the associated TCP socket. If the -timer fires before assembly completes the stream parser is aborted -and the ETIMEDOUT error is set on the TCP socket. - -Message length is limited to the receive buffer size of the associated -TCP socket. If the length returned by parse_msg is greater than -the socket buffer size then the stream parser is aborted with -EMSGSIZE error set on the TCP socket. Note that this makes the -maximum size of receive skbuffs for a socket with a stream parser -to be 2*sk_rcvbuf of the TCP socket. 
diff --git a/include/net/strparser.h b/include/net/strparser.h index 0c28ad97c52f..4fe966a0ad92 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -18,26 +18,26 @@ #define STRP_STATS_INCR(stat) ((stat)++) struct strp_stats { - unsigned long long rx_msgs; - unsigned long long rx_bytes; - unsigned int rx_mem_fail; - unsigned int rx_need_more_hdr; - unsigned int rx_msg_too_big; - unsigned int rx_msg_timeouts; - unsigned int rx_bad_hdr_len; + unsigned long long msgs; + unsigned long long bytes; + unsigned int mem_fail; + unsigned int need_more_hdr; + unsigned int msg_too_big; + unsigned int msg_timeouts; + unsigned int bad_hdr_len; }; struct strp_aggr_stats { - unsigned long long rx_msgs; - unsigned long long rx_bytes; - unsigned int rx_mem_fail; - unsigned int rx_need_more_hdr; - unsigned int rx_msg_too_big; - unsigned int rx_msg_timeouts; - unsigned int rx_bad_hdr_len; - unsigned int rx_aborts; - unsigned int rx_interrupted; - unsigned int rx_unrecov_intr; + unsigned long long msgs; + unsigned long long bytes; + unsigned int mem_fail; + unsigned int need_more_hdr; + unsigned int msg_too_big; + unsigned int msg_timeouts; + unsigned int bad_hdr_len; + unsigned int aborts; + unsigned int interrupted; + unsigned int unrecov_intr; }; struct strparser; @@ -48,16 +48,18 @@ struct strp_callbacks { void (*rcv_msg)(struct strparser *strp, struct sk_buff *skb); int (*read_sock_done)(struct strparser *strp, int err); void (*abort_parser)(struct strparser *strp, int err); + void (*lock)(struct strparser *strp); + void (*unlock)(struct strparser *strp); }; -struct strp_rx_msg { +struct strp_msg { int full_len; int offset; }; -static inline struct strp_rx_msg *strp_rx_msg(struct sk_buff *skb) +static inline struct strp_msg *strp_msg(struct sk_buff *skb) { - return (struct strp_rx_msg *)((void *)skb->cb + + return (struct strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)); } @@ -65,18 +67,18 @@ static inline struct strp_rx_msg *strp_rx_msg(struct sk_buff *skb) struct strparser { struct sock *sk; - u32 rx_stopped : 1; - u32 rx_paused : 1; - u32 rx_aborted : 1; - u32 rx_interrupted : 1; - u32 rx_unrecov_intr : 1; - - struct sk_buff **rx_skb_nextp; - struct timer_list rx_msg_timer; - struct sk_buff *rx_skb_head; - unsigned int rx_need_bytes; - struct delayed_work rx_delayed_work; - struct work_struct rx_work; + u32 stopped : 1; + u32 paused : 1; + u32 aborted : 1; + u32 interrupted : 1; + u32 unrecov_intr : 1; + + struct sk_buff **skb_nextp; + struct timer_list msg_timer; + struct sk_buff *skb_head; + unsigned int need_bytes; + struct delayed_work delayed_work; + struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; }; @@ -84,7 +86,7 @@ struct strparser { /* Must be called with lock held for attached socket */ static inline void strp_pause(struct strparser *strp) { - strp->rx_paused = 1; + strp->paused = 1; } /* May be called without holding lock for attached socket */ @@ -97,37 +99,37 @@ static inline void save_strp_stats(struct strparser *strp, #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += \ strp->stats._stat) - SAVE_PSOCK_STATS(rx_msgs); - SAVE_PSOCK_STATS(rx_bytes); - SAVE_PSOCK_STATS(rx_mem_fail); - SAVE_PSOCK_STATS(rx_need_more_hdr); - SAVE_PSOCK_STATS(rx_msg_too_big); - SAVE_PSOCK_STATS(rx_msg_timeouts); - SAVE_PSOCK_STATS(rx_bad_hdr_len); + SAVE_PSOCK_STATS(msgs); + SAVE_PSOCK_STATS(bytes); + SAVE_PSOCK_STATS(mem_fail); + SAVE_PSOCK_STATS(need_more_hdr); + SAVE_PSOCK_STATS(msg_too_big); + SAVE_PSOCK_STATS(msg_timeouts); + 
SAVE_PSOCK_STATS(bad_hdr_len); #undef SAVE_PSOCK_STATS - if (strp->rx_aborted) - agg_stats->rx_aborts++; - if (strp->rx_interrupted) - agg_stats->rx_interrupted++; - if (strp->rx_unrecov_intr) - agg_stats->rx_unrecov_intr++; + if (strp->aborted) + agg_stats->aborts++; + if (strp->interrupted) + agg_stats->interrupted++; + if (strp->unrecov_intr) + agg_stats->unrecov_intr++; } static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, struct strp_aggr_stats *agg_stats) { #define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) - SAVE_PSOCK_STATS(rx_msgs); - SAVE_PSOCK_STATS(rx_bytes); - SAVE_PSOCK_STATS(rx_mem_fail); - SAVE_PSOCK_STATS(rx_need_more_hdr); - SAVE_PSOCK_STATS(rx_msg_too_big); - SAVE_PSOCK_STATS(rx_msg_timeouts); - SAVE_PSOCK_STATS(rx_bad_hdr_len); - SAVE_PSOCK_STATS(rx_aborts); - SAVE_PSOCK_STATS(rx_interrupted); - SAVE_PSOCK_STATS(rx_unrecov_intr); + SAVE_PSOCK_STATS(msgs); + SAVE_PSOCK_STATS(bytes); + SAVE_PSOCK_STATS(mem_fail); + SAVE_PSOCK_STATS(need_more_hdr); + SAVE_PSOCK_STATS(msg_too_big); + SAVE_PSOCK_STATS(msg_timeouts); + SAVE_PSOCK_STATS(bad_hdr_len); + SAVE_PSOCK_STATS(aborts); + SAVE_PSOCK_STATS(interrupted); + SAVE_PSOCK_STATS(unrecov_intr); #undef SAVE_PSOCK_STATS } @@ -135,8 +137,11 @@ static inline void aggregate_strp_stats(struct strp_aggr_stats *stats, void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); -int strp_init(struct strparser *strp, struct sock *csk, +int strp_init(struct strparser *strp, struct sock *sk, struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); +int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo); #endif /* __NET_STRPARSER_H_ */ diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index c343ac60bf50..c748e8a6a72c 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, - psock->strp.stats.rx_msgs, - psock->strp.stats.rx_bytes, + psock->strp.stats.msgs, + psock->strp.stats.bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, @@ -170,22 +170,22 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, if (psock->tx_stopped) seq_puts(seq, "TxStop "); - if (psock->strp.rx_stopped) + if (psock->strp.stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); - if (!psock->strp.rx_paused && !psock->ready_rx_msg) { + if (!psock->strp.paused && !psock->ready_rx_msg) { if (psock->sk->sk_receive_queue.qlen) { - if (psock->strp.rx_need_bytes) + if (psock->strp.need_bytes) seq_printf(seq, "RxWait=%u ", - psock->strp.rx_need_bytes); + psock->strp.need_bytes); else seq_printf(seq, "RxWait "); } } else { - if (psock->strp.rx_paused) + if (psock->strp.paused) seq_puts(seq, "RxPause "); if (psock->ready_rx_msg) @@ -371,20 +371,20 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", - strp_stats.rx_msgs, - strp_stats.rx_bytes, + strp_stats.msgs, + strp_stats.bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, - strp_stats.rx_aborts, - strp_stats.rx_interrupted, - 
strp_stats.rx_unrecov_intr, - strp_stats.rx_mem_fail, - strp_stats.rx_need_more_hdr, - strp_stats.rx_bad_hdr_len, - strp_stats.rx_msg_too_big, - strp_stats.rx_msg_timeouts, + strp_stats.aborts, + strp_stats.interrupted, + strp_stats.unrecov_intr, + strp_stats.mem_fail, + strp_stats.need_more_hdr, + strp_stats.bad_hdr_len, + strp_stats.msg_too_big, + strp_stats.msg_timeouts, psock_stats.tx_aborts); return 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index da49191f7ad0..88ce73288247 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -96,12 +96,12 @@ static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { STRP_STATS_ADD(mux->stats.rx_bytes, - psock->strp.stats.rx_bytes - + psock->strp.stats.bytes - psock->saved_rx_bytes); mux->stats.rx_msgs += - psock->strp.stats.rx_msgs - psock->saved_rx_msgs; - psock->saved_rx_msgs = psock->strp.stats.rx_msgs; - psock->saved_rx_bytes = psock->strp.stats.rx_bytes; + psock->strp.stats.msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->strp.stats.msgs; + psock->saved_rx_bytes = psock->strp.stats.bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, @@ -1118,7 +1118,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, struct kcm_sock *kcm = kcm_sk(sk); int err = 0; long timeo; - struct strp_rx_msg *rxm; + struct strp_msg *stm; int copied = 0; struct sk_buff *skb; @@ -1132,26 +1132,26 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, /* Okay, have a message on the receive queue */ - rxm = strp_rx_msg(skb); + stm = strp_msg(skb); - if (len > rxm->full_len) - len = rxm->full_len; + if (len > stm->full_len) + len = stm->full_len; - err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); + err = skb_copy_datagram_msg(skb, stm->offset, msg, len); if (err < 0) goto out; copied = len; if (likely(!(flags & MSG_PEEK))) { KCM_STATS_ADD(kcm->stats.rx_bytes, copied); - if (copied < rxm->full_len) { + if (copied < stm->full_len) { if (sock->type == SOCK_DGRAM) { /* Truncated message */ msg->msg_flags |= MSG_TRUNC; goto msg_finished; } - rxm->offset += copied; - rxm->full_len -= copied; + stm->offset += copied; + stm->full_len -= copied; } else { msg_finished: /* Finished with message */ @@ -1175,7 +1175,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); long timeo; - struct strp_rx_msg *rxm; + struct strp_msg *stm; int err = 0; ssize_t copied; struct sk_buff *skb; @@ -1192,12 +1192,12 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Okay, have a message on the receive queue */ - rxm = strp_rx_msg(skb); + stm = strp_msg(skb); - if (len > rxm->full_len) - len = rxm->full_len; + if (len > stm->full_len) + len = stm->full_len; - copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags); + copied = skb_splice_bits(skb, sk, stm->offset, pipe, len, flags); if (copied < 0) { err = copied; goto err_out; @@ -1205,8 +1205,8 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, KCM_STATS_ADD(kcm->stats.rx_bytes, copied); - rxm->offset += copied; - rxm->full_len -= copied; + stm->offset += copied; + stm->full_len -= copied; /* We have no way to return MSG_EOR. If all the bytes have been * read we still leave the message in the receive socket buffer. 
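The rename from strp_rx_msg to strp_msg above is mechanical, but it also shows the consumer half of the API. As a hedged sketch of that pattern (demo_consume is invented here; kcm_recvmsg above is the real thing), an upper layer typically copies out of the delivered skb and advances the window kept in skb->cb:

/* Copy one parsed message out of a delivered skb into a msghdr.
 * strp_msg() exposes the message window (offset/full_len) in skb->cb.
 */
static int demo_consume(struct sk_buff *skb, struct msghdr *msg, size_t len)
{
	struct strp_msg *stm = strp_msg(skb);
	int err;

	if (len > stm->full_len)
		len = stm->full_len;

	err = skb_copy_datagram_msg(skb, stm->offset, msg, len);
	if (err < 0)
		return err;

	/* On a partial read, advance the window for the next call. */
	stm->offset += len;
	stm->full_len -= len;

	return len;
}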
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index b5c279b22680..0d18fbc6f870 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -29,44 +29,46 @@ static struct workqueue_struct *strp_wq; -struct _strp_rx_msg { - /* Internal cb structure. struct strp_rx_msg must be first for passing +struct _strp_msg { + /* Internal cb structure. struct strp_msg must be first for passing * to upper layer. */ - struct strp_rx_msg strp; + struct strp_msg strp; int accum_len; int early_eaten; }; -static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb) +static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) { - return (struct _strp_rx_msg *)((void *)skb->cb + + return (struct _strp_msg *)((void *)skb->cb + offsetof(struct qdisc_skb_cb, data)); } /* Lower lock held */ -static void strp_abort_rx_strp(struct strparser *strp, int err) +static void strp_abort_strp(struct strparser *strp, int err) { - struct sock *csk = strp->sk; - /* Unrecoverable error in receive */ - del_timer(&strp->rx_msg_timer); + del_timer(&strp->msg_timer); - if (strp->rx_stopped) + if (strp->stopped) return; - strp->rx_stopped = 1; + strp->stopped = 1; + + if (strp->sk) { + struct sock *sk = strp->sk; - /* Report an error on the lower socket */ - csk->sk_err = err; - csk->sk_error_report(csk); + /* Report an error on the lower socket */ + sk->sk_err = err; + sk->sk_error_report(sk); + } } -static void strp_start_rx_timer(struct strparser *strp) +static void strp_start_timer(struct strparser *strp, long timeo) { - if (strp->sk->sk_rcvtimeo) - mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo); + if (timeo) + mod_timer(&strp->msg_timer, timeo); } /* Lower lock held */ @@ -74,46 +76,55 @@ static void strp_parser_err(struct strparser *strp, int err, read_descriptor_t *desc) { desc->error = err; - kfree_skb(strp->rx_skb_head); - strp->rx_skb_head = NULL; + kfree_skb(strp->skb_head); + strp->skb_head = NULL; strp->cb.abort_parser(strp, err); } static inline int strp_peek_len(struct strparser *strp) { - struct socket *sock = strp->sk->sk_socket; + if (strp->sk) { + struct socket *sock = strp->sk->sk_socket; + + return sock->ops->peek_len(sock); + } + + /* If we don't have an associated socket there's nothing to peek. + * Return int max to avoid stopping the strparser. + */ - return sock->ops->peek_len(sock); + return INT_MAX; } /* Lower socket lock held */ -static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len) +static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) { struct strparser *strp = (struct strparser *)desc->arg.data; - struct _strp_rx_msg *rxm; + struct _strp_msg *stm; struct sk_buff *head, *skb; size_t eaten = 0, cand_len; ssize_t extra; int err; bool cloned_orig = false; - if (strp->rx_paused) + if (strp->paused) return 0; - head = strp->rx_skb_head; + head = strp->skb_head; if (head) { /* Message already in progress */ - rxm = _strp_rx_msg(head); - if (unlikely(rxm->early_eaten)) { + stm = _strp_msg(head); + if (unlikely(stm->early_eaten)) { /* Already some number of bytes on the receive sock - * data saved in rx_skb_head, just indicate they + * data saved in skb_head, just indicate they * are consumed. */ - eaten = orig_len <= rxm->early_eaten ? - orig_len : rxm->early_eaten; - rxm->early_eaten -= eaten; + eaten = orig_len <= stm->early_eaten ? 
+ orig_len : stm->early_eaten; + stm->early_eaten -= eaten; return eaten; } @@ -126,12 +137,12 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, */ orig_skb = skb_clone(orig_skb, GFP_ATOMIC); if (!orig_skb) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } if (!pskb_pull(orig_skb, orig_offset)) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); kfree_skb(orig_skb); desc->error = -ENOMEM; return 0; @@ -140,13 +151,13 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, orig_offset = 0; } - if (!strp->rx_skb_nextp) { + if (!strp->skb_nextp) { /* We are going to append to the frags_list of head. * Need to unshare the frag_list. */ err = skb_unclone(head, GFP_ATOMIC); if (err) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = err; return 0; } @@ -165,20 +176,20 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, skb = alloc_skb(0, GFP_ATOMIC); if (!skb) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; return 0; } skb->len = head->len; skb->data_len = head->len; skb->truesize = head->truesize; - *_strp_rx_msg(skb) = *_strp_rx_msg(head); - strp->rx_skb_nextp = &head->next; + *_strp_msg(skb) = *_strp_msg(head); + strp->skb_nextp = &head->next; skb_shinfo(skb)->frag_list = head; - strp->rx_skb_head = skb; + strp->skb_head = skb; head = skb; } else { - strp->rx_skb_nextp = + strp->skb_nextp = &skb_shinfo(head)->frag_list; } } @@ -188,112 +199,112 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, /* Always clone since we will consume something */ skb = skb_clone(orig_skb, GFP_ATOMIC); if (!skb) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = -ENOMEM; break; } cand_len = orig_len - eaten; - head = strp->rx_skb_head; + head = strp->skb_head; if (!head) { head = skb; - strp->rx_skb_head = head; - /* Will set rx_skb_nextp on next packet if needed */ - strp->rx_skb_nextp = NULL; - rxm = _strp_rx_msg(head); - memset(rxm, 0, sizeof(*rxm)); - rxm->strp.offset = orig_offset + eaten; + strp->skb_head = head; + /* Will set skb_nextp on next packet if needed */ + strp->skb_nextp = NULL; + stm = _strp_msg(head); + memset(stm, 0, sizeof(*stm)); + stm->strp.offset = orig_offset + eaten; } else { /* Unclone since we may be appending to an skb that we * already share a frag_list with. 
*/ err = skb_unclone(skb, GFP_ATOMIC); if (err) { - STRP_STATS_INCR(strp->stats.rx_mem_fail); + STRP_STATS_INCR(strp->stats.mem_fail); desc->error = err; break; } - rxm = _strp_rx_msg(head); - *strp->rx_skb_nextp = skb; - strp->rx_skb_nextp = &skb->next; + stm = _strp_msg(head); + *strp->skb_nextp = skb; + strp->skb_nextp = &skb->next; head->data_len += skb->len; head->len += skb->len; head->truesize += skb->truesize; } - if (!rxm->strp.full_len) { + if (!stm->strp.full_len) { ssize_t len; len = (*strp->cb.parse_msg)(strp, head); if (!len) { /* Need more header to determine length */ - if (!rxm->accum_len) { + if (!stm->accum_len) { /* Start RX timer for new message */ - strp_start_rx_timer(strp); + strp_start_timer(strp, timeo); } - rxm->accum_len += cand_len; + stm->accum_len += cand_len; eaten += cand_len; - STRP_STATS_INCR(strp->stats.rx_need_more_hdr); + STRP_STATS_INCR(strp->stats.need_more_hdr); WARN_ON(eaten != orig_len); break; } else if (len < 0) { - if (len == -ESTRPIPE && rxm->accum_len) { + if (len == -ESTRPIPE && stm->accum_len) { len = -ENODATA; - strp->rx_unrecov_intr = 1; + strp->unrecov_intr = 1; } else { - strp->rx_interrupted = 1; + strp->interrupted = 1; } strp_parser_err(strp, len, desc); break; - } else if (len > strp->sk->sk_rcvbuf) { + } else if (len > max_msg_size) { /* Message length exceeds maximum allowed */ - STRP_STATS_INCR(strp->stats.rx_msg_too_big); + STRP_STATS_INCR(strp->stats.msg_too_big); strp_parser_err(strp, -EMSGSIZE, desc); break; } else if (len <= (ssize_t)head->len - - skb->len - rxm->strp.offset) { + skb->len - stm->strp.offset) { /* Length must be into new skb (and also * greater than zero) */ - STRP_STATS_INCR(strp->stats.rx_bad_hdr_len); + STRP_STATS_INCR(strp->stats.bad_hdr_len); strp_parser_err(strp, -EPROTO, desc); break; } - rxm->strp.full_len = len; + stm->strp.full_len = len; } - extra = (ssize_t)(rxm->accum_len + cand_len) - - rxm->strp.full_len; + extra = (ssize_t)(stm->accum_len + cand_len) - + stm->strp.full_len; if (extra < 0) { /* Message not complete yet. */ - if (rxm->strp.full_len - rxm->accum_len > + if (stm->strp.full_len - stm->accum_len > strp_peek_len(strp)) { - /* Don't have the whole messages in the socket - * buffer. Set strp->rx_need_bytes to wait for + /* Don't have the whole message in the socket + * buffer. Set strp->need_bytes to wait for * the rest of the message. Also, set "early * eaten" since we've already buffered the skb * but don't consume yet per strp_read_sock. */ - if (!rxm->accum_len) { + if (!stm->accum_len) { /* Start RX timer for new message */ - strp_start_rx_timer(strp); + strp_start_timer(strp, timeo); } - strp->rx_need_bytes = rxm->strp.full_len - - rxm->accum_len; - rxm->accum_len += cand_len; - rxm->early_eaten = cand_len; - STRP_STATS_ADD(strp->stats.rx_bytes, cand_len); + strp->need_bytes = stm->strp.full_len - + stm->accum_len; + stm->accum_len += cand_len; + stm->early_eaten = cand_len; + STRP_STATS_ADD(strp->stats.bytes, cand_len); desc->count = 0; /* Stop reading socket */ break; } - rxm->accum_len += cand_len; + stm->accum_len += cand_len; eaten += cand_len; WARN_ON(eaten != orig_len); break; @@ -308,14 +319,14 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, eaten += (cand_len - extra); /* Hurray, we have a new message! 
*/ - del_timer(&strp->rx_msg_timer); - strp->rx_skb_head = NULL; - STRP_STATS_INCR(strp->stats.rx_msgs); + del_timer(&strp->msg_timer); + strp->skb_head = NULL; + STRP_STATS_INCR(strp->stats.msgs); /* Give skb to upper layer */ strp->cb.rcv_msg(strp, head); - if (unlikely(strp->rx_paused)) { + if (unlikely(strp->paused)) { /* Upper layer paused strp */ break; } @@ -324,11 +335,33 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, if (cloned_orig) kfree_skb(orig_skb); - STRP_STATS_ADD(strp->stats.rx_bytes, eaten); + STRP_STATS_ADD(strp->stats.bytes, eaten); return eaten; } +int strp_process(struct strparser *strp, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len, + size_t max_msg_size, long timeo) +{ + read_descriptor_t desc; /* Dummy arg to strp_recv */ + + desc.arg.data = strp; + + return __strp_recv(&desc, orig_skb, orig_offset, orig_len, + max_msg_size, timeo); +} +EXPORT_SYMBOL_GPL(strp_process); + +static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct strparser *strp = (struct strparser *)desc->arg.data; + + return __strp_recv(desc, orig_skb, orig_offset, orig_len, + strp->sk->sk_rcvbuf, strp->sk->sk_rcvtimeo); +} + static int default_read_sock_done(struct strparser *strp, int err) { return err; @@ -355,101 +388,129 @@ static int strp_read_sock(struct strparser *strp) /* Lower sock lock held */ void strp_data_ready(struct strparser *strp) { - if (unlikely(strp->rx_stopped)) + if (unlikely(strp->stopped)) return; - /* This check is needed to synchronize with do_strp_rx_work. - * do_strp_rx_work acquires a process lock (lock_sock) whereas + /* This check is needed to synchronize with do_strp_work. + * do_strp_work acquires a process lock (lock_sock) whereas * the lock held here is bh_lock_sock. The two locks can be * held by different threads at the same time, but bh_lock_sock * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ if (sock_owned_by_user(strp->sk)) { - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); return; } - if (strp->rx_paused) + if (strp->paused) return; - if (strp->rx_need_bytes) { - if (strp_peek_len(strp) >= strp->rx_need_bytes) - strp->rx_need_bytes = 0; + if (strp->need_bytes) { + if (strp_peek_len(strp) >= strp->need_bytes) + strp->need_bytes = 0; else return; } if (strp_read_sock(strp) == -ENOMEM) - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_data_ready); -static void do_strp_rx_work(struct strparser *strp) +static void do_strp_work(struct strparser *strp) { read_descriptor_t rd_desc; - struct sock *csk = strp->sk; /* We need the read lock to synchronize with strp_data_ready. We * need the socket lock for calling strp_read_sock. 
*/ - lock_sock(csk); + strp->cb.lock(strp); - if (unlikely(strp->rx_stopped)) + if (unlikely(strp->stopped)) goto out; - if (strp->rx_paused) + if (strp->paused) goto out; rd_desc.arg.data = strp; if (strp_read_sock(strp) == -ENOMEM) - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); out: - release_sock(csk); + strp->cb.unlock(strp); } -static void strp_rx_work(struct work_struct *w) +static void strp_work(struct work_struct *w) { - do_strp_rx_work(container_of(w, struct strparser, rx_work)); + do_strp_work(container_of(w, struct strparser, work)); } -static void strp_rx_msg_timeout(unsigned long arg) +static void strp_msg_timeout(unsigned long arg) { struct strparser *strp = (struct strparser *)arg; /* Message assembly timed out */ - STRP_STATS_INCR(strp->stats.rx_msg_timeouts); - lock_sock(strp->sk); + STRP_STATS_INCR(strp->stats.msg_timeouts); + strp->cb.lock(strp); strp->cb.abort_parser(strp, ETIMEDOUT); + strp->cb.unlock(strp); +} + +static void strp_sock_lock(struct strparser *strp) +{ + lock_sock(strp->sk); +} + +static void strp_sock_unlock(struct strparser *strp) +{ release_sock(strp->sk); } -int strp_init(struct strparser *strp, struct sock *csk, +int strp_init(struct strparser *strp, struct sock *sk, struct strp_callbacks *cb) { - struct socket *sock = csk->sk_socket; if (!cb || !cb->rcv_msg || !cb->parse_msg) return -EINVAL; - if (!sock->ops->read_sock || !sock->ops->peek_len) - return -EAFNOSUPPORT; + /* The sk (sock) arg determines the mode of the stream parser. + * + * If the sock is set then the strparser is in receive callback mode. + * The upper layer calls strp_data_ready to kick receive processing + * and strparser calls the read_sock function on the socket to + * get packets. + * + * If the sock is not set then the strparser is in general mode. + * The upper layer calls strp_process for each skb to be parsed. + */ - memset(strp, 0, sizeof(*strp)); + if (sk) { + struct socket *sock = sk->sk_socket; - strp->sk = csk; + if (!sock->ops->read_sock || !sock->ops->peek_len) + return -EAFNOSUPPORT; + } else { + if (!cb->lock || !cb->unlock) + return -EINVAL; + } - setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout, - (unsigned long)strp); + memset(strp, 0, sizeof(*strp)); - INIT_WORK(&strp->rx_work, strp_rx_work); + strp->sk = sk; + strp->cb.lock = cb->lock ? : strp_sock_lock; + strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; - strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp; + strp->cb.abort_parser = cb->abort_parser ? 
: strp_abort_strp; + + setup_timer(&strp->msg_timer, strp_msg_timeout, + (unsigned long)strp); + + INIT_WORK(&strp->work, strp_work); return 0; } @@ -457,12 +518,12 @@ EXPORT_SYMBOL_GPL(strp_init); void strp_unpause(struct strparser *strp) { - strp->rx_paused = 0; + strp->paused = 0; - /* Sync setting rx_paused with RX work */ + /* Sync setting paused with RX work */ smp_mb(); - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_unpause); @@ -471,27 +532,27 @@ EXPORT_SYMBOL_GPL(strp_unpause); */ void strp_done(struct strparser *strp) { - WARN_ON(!strp->rx_stopped); + WARN_ON(!strp->stopped); - del_timer_sync(&strp->rx_msg_timer); - cancel_work_sync(&strp->rx_work); + del_timer_sync(&strp->msg_timer); + cancel_work_sync(&strp->work); - if (strp->rx_skb_head) { - kfree_skb(strp->rx_skb_head); - strp->rx_skb_head = NULL; + if (strp->skb_head) { + kfree_skb(strp->skb_head); + strp->skb_head = NULL; } } EXPORT_SYMBOL_GPL(strp_done); void strp_stop(struct strparser *strp) { - strp->rx_stopped = 1; + strp->stopped = 1; } EXPORT_SYMBOL_GPL(strp_stop); void strp_check_rcv(struct strparser *strp) { - queue_work(strp_wq, &strp->rx_work); + queue_work(strp_wq, &strp->work); } EXPORT_SYMBOL_GPL(strp_check_rcv); -- cgit v1.2.3 From c613c209c3f351d47158f728271d0c73b6dd24c6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 31 Jul 2017 08:15:47 -0400 Subject: net: add skb_frag_foreach_page and use with kmap_atomic Skb frags may contain compound pages. Various operations map frags temporarily using kmap_atomic, but this function works on single pages, not whole compound pages. The distinction is only relevant for highmem pages that require temporary mappings. Introduce a looping mechanism that, for compound highmem pages, maps one page at a time and does not change behavior on other pages. Use the loop in the kmap_atomic callers in net/core/skbuff.c. Verified by triggering skb_copy_bits with tcpdump -n -c 100 -i ${DEV} -w /dev/null & netperf -t TCP_STREAM -H ${HOST} and by triggering __skb_checksum with ethtool -K ${DEV} tx off. The tests were repeated with looping on a non-highmem platform (x86_64) by making skb_frag_must_loop always return true. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 36 +++++++++++++++++++++ net/core/skbuff.c | 88 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 95 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 18e76bf9574e..6f9f1b2715ec 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,6 +345,42 @@ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) frag->size -= delta; } +static inline bool skb_frag_must_loop(struct page *p) +{ +#if defined(CONFIG_HIGHMEM) + if (PageHighMem(p)) + return true; +#endif + return false; +} + +/** + * skb_frag_foreach_page - loop over pages in a fragment + * + * @f: skb frag to operate on + * @f_off: offset from start of f->page.p + * @f_len: length from f_off to loop over + * @p: (temp var) current page + * @p_off: (temp var) offset from start of current page, + * non-zero only on first page. + * @p_len: (temp var) length in current page, + * < PAGE_SIZE only on first and last page. + * @copied: (temp var) length so far, excluding current p_len. + * + * A fragment can hold a compound page, in which case per-page + * operations, notably kmap_atomic, must be called for each + * regular page.
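 *
 * An illustrative sketch (buf and vaddr are assumed locals; the loop
 * variables match the parameters above): copy a whole frag linearly,
 * one regular page at a time:
 *
 *	skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
 *			      p, p_off, p_len, copied) {
 *		vaddr = kmap_atomic(p);
 *		memcpy(buf + copied, vaddr + p_off, p_len);
 *		kunmap_atomic(vaddr);
 *	}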
+ */ +#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied) \ + for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT), \ + p_off = (f_off) & (PAGE_SIZE - 1), \ + p_len = skb_frag_must_loop(p) ? \ + min_t(u32, f_len, PAGE_SIZE - p_off) : f_len, \ + copied = 0; \ + copied < f_len; \ + copied += p_len, p++, p_off = 0, \ + p_len = min_t(u32, f_len - copied, PAGE_SIZE)) \ + #define HAVE_HW_TIME_STAMP /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9c0e015ff3fe..0f0933b338d7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -938,8 +938,10 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; for (i = 0; i < num_frags; i++) { - u8 *vaddr; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; page = alloc_page(gfp_mask); if (!page) { @@ -950,10 +952,15 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } return -ENOMEM; } - vaddr = kmap_atomic(skb_frag_page(f)); - memcpy(page_address(page), - vaddr + f->page_offset, skb_frag_size(f)); - kunmap_atomic(vaddr); + + skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f), + p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(page_address(page) + copied, vaddr + p_off, + p_len); + kunmap_atomic(vaddr); + } + set_page_private(page, (unsigned long)head); head = page; } @@ -1753,16 +1760,20 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) end = start + skb_frag_size(f); if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(f)); - memcpy(to, - vaddr + f->page_offset + offset - start, - copy); - kunmap_atomic(vaddr); + skb_frag_foreach_page(f, + f->page_offset + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(to + copied, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } if ((len -= copy) == 0) return 0; @@ -2122,15 +2133,20 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(frag)); - memcpy(vaddr + frag->page_offset + offset - start, - from, copy); - kunmap_atomic(vaddr); + skb_frag_foreach_page(frag, + frag->page_offset + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + memcpy(vaddr + p_off, from + copied, p_len); + kunmap_atomic(vaddr); + } if ((len -= copy) == 0) return 0; @@ -2195,20 +2211,27 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + u32 p_off, p_len, copied; + struct page *p; __wsum csum2; u8 *vaddr; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = ops->update(vaddr + frag->page_offset + - offset - start, copy, 0); - kunmap_atomic(vaddr); - csum = ops->combine(csum, csum2, pos, copy); + + skb_frag_foreach_page(frag, + frag->page_offset + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + csum2 = ops->update(vaddr + p_off, p_len, 0); + kunmap_atomic(vaddr); + csum = ops->combine(csum, csum2, pos, p_len); + pos += p_len; + } + if (!(len -= copy)) return csum; offset += copy; - pos += copy; } start = end; } @@ -2281,24 +2304,31 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, end = start + 
skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; __wsum csum2; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (copy > len) copy = len; - vaddr = kmap_atomic(skb_frag_page(frag)); - csum2 = csum_partial_copy_nocheck(vaddr + - frag->page_offset + - offset - start, to, - copy, 0); - kunmap_atomic(vaddr); - csum = csum_block_add(csum, csum2, pos); + + skb_frag_foreach_page(frag, + frag->page_offset + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + csum2 = csum_partial_copy_nocheck(vaddr + p_off, + to + copied, + p_len, 0); + kunmap_atomic(vaddr); + csum = csum_block_add(csum, csum2, pos); + pos += p_len; + } + if (!(len -= copy)) return csum; offset += copy; to += copy; - pos += copy; } start = end; } -- cgit v1.2.3 From 46587e4a312780a63132483bc8678c397eca6aff Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 1 Aug 2017 16:32:39 -0400 Subject: net: dsa: remove PHY device argument from .set_eee The DSA switch operations for EEE are only meant to configure a port's MAC EEE settings. The port's PHY EEE settings are accessed by the DSA layer and must be made available via a proper PHY driver. In order to reduce this confusion, remove the phy_device argument from the .set_eee operation. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 1 - drivers/net/dsa/mv88e6xxx/chip.c | 2 +- drivers/net/dsa/qca8k.c | 14 +++----------- include/net/dsa.h | 1 - net/dsa/slave.c | 2 +- 5 files changed, 5 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 9d10aac8f241..ce886345d8d2 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -353,7 +353,6 @@ static int bcm_sf2_sw_get_eee(struct dsa_switch *ds, int port, } static int bcm_sf2_sw_set_eee(struct dsa_switch *ds, int port, - struct phy_device *phydev, struct ethtool_eee *e) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 647d5d45c1d6..aaa96487f21f 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -850,7 +850,7 @@ static int mv88e6xxx_get_eee(struct dsa_switch *ds, int port, } static int mv88e6xxx_set_eee(struct dsa_switch *ds, int port, - struct phy_device *phydev, struct ethtool_eee *e) + struct ethtool_eee *e) { struct mv88e6xxx_chip *chip = ds->priv; int err; diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index bfe0172ae6cc..e209e229ed4c 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -637,8 +637,8 @@ qca8k_get_sset_count(struct dsa_switch *ds) return ARRAY_SIZE(ar8327_mib); } -static void -qca8k_eee_enable_set(struct dsa_switch *ds, int port, bool enable) +static int +qca8k_set_eee(struct dsa_switch *ds, int port, struct ethtool_eee *eee) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; u32 lpi_en = QCA8K_REG_EEE_CTRL_LPI_EN(port); @@ -646,20 +646,12 @@ qca8k_eee_enable_set(struct dsa_switch *ds, int port, bool enable) mutex_lock(&priv->reg_mutex); reg = qca8k_read(priv, QCA8K_REG_EEE_CTRL); - if (enable) + if (eee->eee_enabled) reg |= lpi_en; else reg &= ~lpi_en; qca8k_write(priv, QCA8K_REG_EEE_CTRL, reg); mutex_unlock(&priv->reg_mutex); -} - -static int -qca8k_set_eee(struct dsa_switch *ds, int port, - struct phy_device *phydev, - struct ethtool_eee 
*e) -{ - qca8k_eee_enable_set(ds, port, e->eee_enabled); return 0; } diff --git a/include/net/dsa.h b/include/net/dsa.h index 88da272d20d0..ce46db323394 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -335,7 +335,6 @@ struct dsa_switch_ops { * EEE setttings */ int (*set_eee)(struct dsa_switch *ds, int port, - struct phy_device *phydev, struct ethtool_eee *e); int (*get_eee)(struct dsa_switch *ds, int port, struct ethtool_eee *e); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ad5caaf384d7..9ddc584e70b0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -655,7 +655,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) if (!ds->ops->set_eee) return -EOPNOTSUPP; - ret = ds->ops->set_eee(ds, p->dp->index, p->phy, e); + ret = ds->ops->set_eee(ds, p->dp->index, e); if (ret) return ret; -- cgit v1.2.3 From 08f500610f39809c107f206cba1f799c98c38054 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 1 Aug 2017 16:32:41 -0400 Subject: net: dsa: rename switch EEE ops To avoid confusion with the PHY EEE settings, rename the .set_eee and .get_eee ops to respectively .set_mac_eee and .get_mac_eee. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 12 ++++++------ drivers/net/dsa/mv88e6xxx/chip.c | 12 ++++++------ drivers/net/dsa/qca8k.c | 9 ++++----- include/net/dsa.h | 10 +++++----- net/dsa/slave.c | 8 ++++---- 5 files changed, 25 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index ce886345d8d2..6bbfa6ea1efb 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -338,8 +338,8 @@ static int bcm_sf2_eee_init(struct dsa_switch *ds, int port, return 1; } -static int bcm_sf2_sw_get_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) +static int bcm_sf2_sw_get_mac_eee(struct dsa_switch *ds, int port, + struct ethtool_eee *e) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); struct ethtool_eee *p = &priv->port_sts[port].eee; @@ -352,8 +352,8 @@ static int bcm_sf2_sw_get_eee(struct dsa_switch *ds, int port, return 0; } -static int bcm_sf2_sw_set_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) +static int bcm_sf2_sw_set_mac_eee(struct dsa_switch *ds, int port, + struct ethtool_eee *e) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); struct ethtool_eee *p = &priv->port_sts[port].eee; @@ -1011,8 +1011,8 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .set_wol = bcm_sf2_sw_set_wol, .port_enable = bcm_sf2_port_setup, .port_disable = bcm_sf2_port_disable, - .get_eee = bcm_sf2_sw_get_eee, - .set_eee = bcm_sf2_sw_set_eee, + .get_mac_eee = bcm_sf2_sw_get_mac_eee, + .set_mac_eee = bcm_sf2_sw_set_mac_eee, .port_bridge_join = b53_br_join, .port_bridge_leave = b53_br_leave, .port_stp_state_set = b53_br_set_stp_state, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index aa0c5493fb9d..521738c4cd17 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -810,15 +810,15 @@ static void mv88e6xxx_get_regs(struct dsa_switch *ds, int port, mutex_unlock(&chip->reg_lock); } -static int mv88e6xxx_get_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) +static int mv88e6xxx_get_mac_eee(struct dsa_switch *ds, int port, + struct ethtool_eee *e) { /* Nothing to do on the port's MAC */ return 0; } -static int mv88e6xxx_set_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) +static int mv88e6xxx_set_mac_eee(struct 
dsa_switch *ds, int port, + struct ethtool_eee *e) { /* Nothing to do on the port's MAC */ return 0; @@ -3890,8 +3890,8 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .get_sset_count = mv88e6xxx_get_sset_count, .port_enable = mv88e6xxx_port_enable, .port_disable = mv88e6xxx_port_disable, - .set_eee = mv88e6xxx_set_eee, - .get_eee = mv88e6xxx_get_eee, + .get_mac_eee = mv88e6xxx_get_mac_eee, + .set_mac_eee = mv88e6xxx_set_mac_eee, .get_eeprom_len = mv88e6xxx_get_eeprom_len, .get_eeprom = mv88e6xxx_get_eeprom, .set_eeprom = mv88e6xxx_set_eeprom, diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index e209e229ed4c..36c169b0c705 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -638,7 +638,7 @@ qca8k_get_sset_count(struct dsa_switch *ds) } static int -qca8k_set_eee(struct dsa_switch *ds, int port, struct ethtool_eee *eee) +qca8k_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *eee) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; u32 lpi_en = QCA8K_REG_EEE_CTRL_LPI_EN(port); @@ -657,8 +657,7 @@ qca8k_set_eee(struct dsa_switch *ds, int port, struct ethtool_eee *eee) } static int -qca8k_get_eee(struct dsa_switch *ds, int port, - struct ethtool_eee *e) +qca8k_get_mac_eee(struct dsa_switch *ds, int port, struct ethtool_eee *e) { /* Nothing to do on the port's MAC */ return 0; @@ -863,8 +862,8 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .phy_write = qca8k_phy_write, .get_ethtool_stats = qca8k_get_ethtool_stats, .get_sset_count = qca8k_get_sset_count, - .get_eee = qca8k_get_eee, - .set_eee = qca8k_set_eee, + .get_mac_eee = qca8k_get_mac_eee, + .set_mac_eee = qca8k_set_mac_eee, .port_enable = qca8k_port_enable, .port_disable = qca8k_port_disable, .port_stp_state_set = qca8k_port_stp_state_set, diff --git a/include/net/dsa.h b/include/net/dsa.h index ce46db323394..0b1a0622b33c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -332,12 +332,12 @@ struct dsa_switch_ops { struct phy_device *phy); /* - * EEE setttings + * Port's MAC EEE settings */ - int (*set_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); - int (*get_eee)(struct dsa_switch *ds, int port, - struct ethtool_eee *e); + int (*set_mac_eee)(struct dsa_switch *ds, int port, + struct ethtool_eee *e); + int (*get_mac_eee)(struct dsa_switch *ds, int port, + struct ethtool_eee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9ddc584e70b0..cc4bad3dadb4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -652,10 +652,10 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) if (!p->phy) return -ENODEV; - if (!ds->ops->set_eee) + if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; - ret = ds->ops->set_eee(ds, p->dp->index, e); + ret = ds->ops->set_mac_eee(ds, p->dp->index, e); if (ret) return ret; @@ -678,10 +678,10 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) if (!p->phy) return -ENODEV; - if (!ds->ops->get_eee) + if (!ds->ops->get_mac_eee) return -EOPNOTSUPP; - ret = ds->ops->get_eee(ds, p->dp->index, e); + ret = ds->ops->get_mac_eee(ds, p->dp->index, e); if (ret) return ret; -- cgit v1.2.3 From 581c4484769e692eade761c17c22549aaefe6749 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 1 Aug 2017 15:38:01 -0700 Subject: HID: input: map digitizer battery usage We already mapped battery strength reports from the generic device control page, but we did not update capacity from input reports, nor did we map the battery strength report from the digitizer page, so let's implement this now. Batteries driven by the input reports will now start in the "unknown" state, and will get updated once we receive the first report containing battery strength from the device. Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina
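The capacity scaling this patch factors out is a plain linear rescale of [battery_min, battery_max] onto [0, 100]. A hedged sanity check follows; demo_scale and the min/max/report values are assumed for illustration only.

/* Mirror of hidinput_scale_battery_capacity() below, illustrative only:
 * demo_scale(128, 0, 255) == (128 * 100) / 255 == 50 (percent).
 * Out-of-range values or a degenerate min/max leave the value untouched.
 */
static inline int demo_scale(int value, int min, int max)
{
	if (min < max && value >= min && value <= max)
		value = ((value - min) * 100) / (max - min);
	return value;
}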
the battery strength report from the digitizer page, so let's implement this now. Batteries driven by the input reports will now start in "unknown" state, and will get updated once we receive first report containing battery strength from the device. Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 184 ++++++++++++++++++++++++++++++++---------------- include/linux/hid.h | 2 + 2 files changed, 127 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index ccdff1ee1f0c..2158ec766dd5 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -340,13 +340,45 @@ static unsigned find_battery_quirk(struct hid_device *hdev) return quirks; } +static int hidinput_scale_battery_capacity(struct hid_device *dev, + int value) +{ + if (dev->battery_min < dev->battery_max && + value >= dev->battery_min && value <= dev->battery_max) + value = ((value - dev->battery_min) * 100) / + (dev->battery_max - dev->battery_min); + + return value; +} + +static int hidinput_query_battery_capacity(struct hid_device *dev) +{ + u8 *buf; + int ret; + + buf = kmalloc(2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = hid_hw_raw_request(dev, dev->battery_report_id, buf, 2, + dev->battery_report_type, HID_REQ_GET_REPORT); + if (ret != 2) { + kfree(buf); + return -ENODATA; + } + + ret = hidinput_scale_battery_capacity(dev, buf[1]); + kfree(buf); + return ret; +} + static int hidinput_get_battery_property(struct power_supply *psy, enum power_supply_property prop, union power_supply_propval *val) { struct hid_device *dev = power_supply_get_drvdata(psy); + int value; int ret = 0; - __u8 *buf; switch (prop) { case POWER_SUPPLY_PROP_PRESENT: @@ -355,29 +387,15 @@ static int hidinput_get_battery_property(struct power_supply *psy, break; case POWER_SUPPLY_PROP_CAPACITY: - - buf = kmalloc(2 * sizeof(__u8), GFP_KERNEL); - if (!buf) { - ret = -ENOMEM; - break; - } - ret = hid_hw_raw_request(dev, dev->battery_report_id, buf, 2, - dev->battery_report_type, - HID_REQ_GET_REPORT); - - if (ret != 2) { - ret = -ENODATA; - kfree(buf); - break; + if (dev->battery_report_type == HID_FEATURE_REPORT) { + value = hidinput_query_battery_capacity(dev); + if (value < 0) + return value; + } else { + value = dev->battery_capacity; } - ret = 0; - if (dev->battery_min < dev->battery_max && - buf[1] >= dev->battery_min && - buf[1] <= dev->battery_max) - val->intval = (100 * (buf[1] - dev->battery_min)) / - (dev->battery_max - dev->battery_min); - kfree(buf); + val->intval = value; break; case POWER_SUPPLY_PROP_MODEL_NAME: @@ -385,7 +403,22 @@ static int hidinput_get_battery_property(struct power_supply *psy, break; case POWER_SUPPLY_PROP_STATUS: - val->intval = POWER_SUPPLY_STATUS_DISCHARGING; + if (!dev->battery_reported && + dev->battery_report_type == HID_FEATURE_REPORT) { + value = hidinput_query_battery_capacity(dev); + if (value < 0) + return value; + + dev->battery_capacity = value; + dev->battery_reported = true; + } + + if (!dev->battery_reported) + val->intval = POWER_SUPPLY_STATUS_UNKNOWN; + else if (dev->battery_capacity == 100) + val->intval = POWER_SUPPLY_STATUS_FULL; + else + val->intval = POWER_SUPPLY_STATUS_DISCHARGING; break; case POWER_SUPPLY_PROP_SCOPE: @@ -400,18 +433,16 @@ static int hidinput_get_battery_property(struct power_supply *psy, return ret; } -static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type, struct hid_field *field) +static int hidinput_setup_battery(struct hid_device *dev, 
unsigned report_type, struct hid_field *field) { - struct power_supply_desc *psy_desc = NULL; + struct power_supply_desc *psy_desc; struct power_supply_config psy_cfg = { .drv_data = dev, }; unsigned quirks; s32 min, max; + int error; - if (field->usage->hid != HID_DC_BATTERYSTRENGTH) - return false; /* no match */ - - if (dev->battery != NULL) - goto out; /* already initialized? */ + if (dev->battery) + return 0; /* already initialized? */ quirks = find_battery_quirk(dev); @@ -419,16 +450,16 @@ static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type, dev->bus, dev->vendor, dev->product, dev->version, quirks); if (quirks & HID_BATTERY_QUIRK_IGNORE) - goto out; + return 0; psy_desc = kzalloc(sizeof(*psy_desc), GFP_KERNEL); - if (psy_desc == NULL) - goto out; + if (!psy_desc) + return -ENOMEM; psy_desc->name = kasprintf(GFP_KERNEL, "hid-%s-battery", dev->uniq); - if (psy_desc->name == NULL) { - kfree(psy_desc); - goto out; + if (!psy_desc->name) { + error = -ENOMEM; + goto err_free_mem; } psy_desc->type = POWER_SUPPLY_TYPE_BATTERY; @@ -455,17 +486,20 @@ static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type, dev->battery = power_supply_register(&dev->dev, psy_desc, &psy_cfg); if (IS_ERR(dev->battery)) { - hid_warn(dev, "can't register power supply: %ld\n", - PTR_ERR(dev->battery)); - kfree(psy_desc->name); - kfree(psy_desc); - dev->battery = NULL; - } else { - power_supply_powers(dev->battery, &dev->dev); + error = PTR_ERR(dev->battery); + hid_warn(dev, "can't register power supply: %d\n", error); + goto err_free_name; } -out: - return true; + power_supply_powers(dev->battery, &dev->dev); + return 0; + +err_free_name: + kfree(psy_desc->name); +err_free_mem: + kfree(psy_desc); + dev->battery = NULL; + return error; } static void hidinput_cleanup_battery(struct hid_device *dev) @@ -481,16 +515,33 @@ static void hidinput_cleanup_battery(struct hid_device *dev) kfree(psy_desc); dev->battery = NULL; } + +static void hidinput_update_battery(struct hid_device *dev, int value) +{ + if (!dev->battery) + return; + + if (value == 0 || value < dev->battery_min || value > dev->battery_max) + return; + + dev->battery_capacity = hidinput_scale_battery_capacity(dev, value); + dev->battery_reported = true; + power_supply_changed(dev->battery); +} #else /* !CONFIG_HID_BATTERY_STRENGTH */ -static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type, - struct hid_field *field) +static int hidinput_setup_battery(struct hid_device *dev, unsigned report_type, + struct hid_field *field) { - return false; + return 0; } static void hidinput_cleanup_battery(struct hid_device *dev) { } + +static void hidinput_update_battery(struct hid_device *dev, int value) +{ +} #endif /* CONFIG_HID_BATTERY_STRENGTH */ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_field *field, @@ -710,6 +761,11 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel } break; + case 0x3b: /* Battery Strength */ + hidinput_setup_battery(device, HID_INPUT_REPORT, field); + usage->type = EV_PWR; + goto ignore; + case 0x3c: /* Invert */ map_key_clear(BTN_TOOL_RUBBER); break; @@ -944,11 +1000,13 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel break; case HID_UP_GENDEVCTRLS: - if (hidinput_setup_battery(device, HID_INPUT_REPORT, field)) + switch (usage->hid) { + case HID_DC_BATTERYSTRENGTH: + hidinput_setup_battery(device, HID_INPUT_REPORT, field); + usage->type = EV_PWR; goto ignore; - else - 
goto unknown; - break; + } + goto unknown; case HID_UP_HPVENDOR: /* Reported on a Dutch layout HP5308 */ set_bit(EV_REP, input->evbit); @@ -1031,7 +1089,6 @@ mapped: if (usage->code > max) goto ignore; - if (usage->type == EV_ABS) { int a = field->logical_minimum; @@ -1090,14 +1147,19 @@ void hidinput_hid_event(struct hid_device *hid, struct hid_field *field, struct struct input_dev *input; unsigned *quirks = &hid->quirks; - if (!field->hidinput) + if (!usage->type) return; - input = field->hidinput->input; + if (usage->type == EV_PWR) { + hidinput_update_battery(hid, value); + return; + } - if (!usage->type) + if (!field->hidinput) return; + input = field->hidinput->input; + if (usage->hat_min < usage->hat_max || usage->hat_dir) { int hat_dir = usage->hat_dir; if (!hat_dir) @@ -1373,6 +1435,7 @@ static void report_features(struct hid_device *hid) struct hid_driver *drv = hid->driver; struct hid_report_enum *rep_enum; struct hid_report *rep; + struct hid_usage *usage; int i, j; rep_enum = &hid->report_enum[HID_FEATURE_REPORT]; @@ -1383,12 +1446,15 @@ static void report_features(struct hid_device *hid) continue; for (j = 0; j < rep->field[i]->maxusage; j++) { + usage = &rep->field[i]->usage[j]; + /* Verify if Battery Strength feature is available */ - hidinput_setup_battery(hid, HID_FEATURE_REPORT, rep->field[i]); + if (usage->hid == HID_DC_BATTERYSTRENGTH) + hidinput_setup_battery(hid, HID_FEATURE_REPORT, + rep->field[i]); if (drv->feature_mapping) - drv->feature_mapping(hid, rep->field[i], - rep->field[i]->usage + j); + drv->feature_mapping(hid, rep->field[i], usage); } } } diff --git a/include/linux/hid.h b/include/linux/hid.h index 5006f9b5d837..281d1ffcbe02 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -542,10 +542,12 @@ struct hid_device { /* device report descriptor */ * battery is non-NULL. */ struct power_supply *battery; + __s32 battery_capacity; __s32 battery_min; __s32 battery_max; __s32 battery_report_type; __s32 battery_report_id; + bool battery_reported; #endif unsigned int status; /* see STAT flags above */ -- cgit v1.2.3 From ffdb5211da1c20354f1b40c204b6cf6c29c68161 Mon Sep 17 00:00:00 2001 From: Ilan Tayari Date: Tue, 1 Aug 2017 12:49:08 +0300 Subject: xfrm: Auto-load xfrm offload modules IPSec crypto offload depends on the protocol-specific offload module (such as esp_offload.ko). 
When the user installs an SA with crypto-offload, load the offload module automatically, in the same way that the protocol module is loaded (such as esp.ko) Signed-off-by: Ilan Tayari Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 4 +++- net/ipv4/esp4_offload.c | 1 + net/ipv6/esp6_offload.c | 1 + net/xfrm/xfrm_device.c | 2 +- net/xfrm/xfrm_state.c | 16 ++++++++++++---- net/xfrm/xfrm_user.c | 2 +- 6 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index afb4929d7232..5a360100136c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -43,6 +43,8 @@ MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) #define MODULE_ALIAS_XFRM_TYPE(family, proto) \ MODULE_ALIAS("xfrm-type-" __stringify(family) "-" __stringify(proto)) +#define MODULE_ALIAS_XFRM_OFFLOAD_TYPE(family, proto) \ + MODULE_ALIAS("xfrm-offload-" __stringify(family) "-" __stringify(proto)) #ifdef CONFIG_XFRM_STATISTICS #define XFRM_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.xfrm_statistics, field) @@ -1558,7 +1560,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); int xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay); +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 05831dea00f4..aca1c85f0795 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -305,3 +305,4 @@ module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert "); +MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index eec3add177fe..8d4e2ba9163d 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -334,3 +334,4 @@ module_init(esp6_offload_init); module_exit(esp6_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert "); +MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 5cd7a244e88d..1904127f5fb8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -63,7 +63,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xfrm_address_t *daddr; if (!x->type_offload) - return 0; + return -EINVAL; /* We don't yet support UDP encapsulation, TFC padding and ESN. 
*/ if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 82cbbce69b79..a41e2ef789c0 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -296,12 +296,14 @@ int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } EXPORT_SYMBOL(xfrm_unregister_type_offload); -static const struct xfrm_type_offload *xfrm_get_type_offload(u8 proto, unsigned short family) +static const struct xfrm_type_offload * +xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) { struct xfrm_state_afinfo *afinfo; const struct xfrm_type_offload **typemap; const struct xfrm_type_offload *type; +retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; @@ -311,6 +313,12 @@ static const struct xfrm_type_offload *xfrm_get_type_offload(u8 proto, unsigned if ((type && !try_module_get(type->owner))) type = NULL; + if (!type && try_load) { + request_module("xfrm-offload-%d-%d", family, proto); + try_load = 0; + goto retry; + } + rcu_read_unlock(); return type; } @@ -2165,7 +2173,7 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu) return mtu - x->props.header_len; } -int __xfrm_init_state(struct xfrm_state *x, bool init_replay) +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { struct xfrm_state_afinfo *afinfo; struct xfrm_mode *inner_mode; @@ -2230,7 +2238,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay) if (x->type == NULL) goto error; - x->type_offload = xfrm_get_type_offload(x->id.proto, family); + x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload); err = x->type->init_state(x); if (err) @@ -2258,7 +2266,7 @@ EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { - return __xfrm_init_state(x, true); + return __xfrm_init_state(x, true, false); } EXPORT_SYMBOL(xfrm_init_state); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1b539b7dcfab..ffe8d5ef09eb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -584,7 +584,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); - err = __xfrm_init_state(x, false); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) goto error; -- cgit v1.2.3 From f70f250a77313b542531e1ff7a449cd0ccd83ec0 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 1 Aug 2017 12:49:10 +0300 Subject: net: Allow IPsec GSO for local sockets This patch allows local sockets to make use of XFRM GSO code path. 
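
A closing note on the auto-load commit above: the scheme works only because the string request_module() builds at runtime matches the alias the offload modules now declare at build time. A sketch of how the two sides line up, assuming the usual uapi values (AF_INET is 2, XFRM_PROTO_ESP is IPPROTO_ESP, i.e. 50); the comments are illustrative and not part of the diff:

/* Declared in esp4_offload.c by the patch: */
MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
/*
 * __stringify() expands macro arguments before stringifying, so the
 * module ends up tagged with MODULE_ALIAS("xfrm-offload-2-50").
 */

/* Built at runtime in xfrm_get_type_offload(): */
request_module("xfrm-offload-%d-%d", family, proto);
/*
 * With family == AF_INET and proto == XFRM_PROTO_ESP this also yields
 * "xfrm-offload-2-50", so modprobe pulls in esp4_offload.ko; try_load
 * is cleared before the goto, so the table lookup is retried exactly once.
 */
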
Signed-off-by: Steffen Klassert Signed-off-by: Ilan Tayari --- include/net/xfrm.h | 19 +++++++++++++++++++ net/core/sock.c | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5a360100136c..18d7de34a5c3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1858,6 +1858,20 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) +{ + struct xfrm_state *x = dst->xfrm; + + if (!x || !x->type_offload) + return false; + + if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) && + !dst->child->xfrm) + return true; + + return false; +} + static inline void xfrm_dev_state_delete(struct xfrm_state *x) { struct xfrm_state_offload *xso = &x->xso; @@ -1900,6 +1914,11 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x { return false; } + +static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) +{ + return false; +} #endif static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) diff --git a/net/core/sock.c b/net/core/sock.c index 742f68c9c84a..564f835f408a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1757,7 +1757,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; sk->sk_route_caps &= ~sk->sk_route_nocaps; if (sk_can_gso(sk)) { - if (dst->header_len) { + if (dst->header_len && !xfrm_dst_offload_ok(dst)) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; -- cgit v1.2.3 From 278982b50b28d654ff8cca0efb1ff6e835c6cc4c Mon Sep 17 00:00:00 2001 From: "oder_chiou@realtek.com" Date: Wed, 2 Aug 2017 16:01:27 +0800 Subject: ASoC: rt5663: Separate the DC offset between headphone and headset The patch separates the DC offset between headphone and headset. Signed-off-by: Oder Chiou Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/rt5663.txt | 5 ++- include/sound/rt5663.h | 2 + sound/soc/codecs/rt5663.c | 52 ++++++++++++++++------ 3 files changed, 44 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/rt5663.txt b/Documentation/devicetree/bindings/sound/rt5663.txt index f7c7cc513ee8..ff381718c517 100644 --- a/Documentation/devicetree/bindings/sound/rt5663.txt +++ b/Documentation/devicetree/bindings/sound/rt5663.txt @@ -14,8 +14,11 @@ Optional properties: - "realtek,dc_offset_l_manual" - "realtek,dc_offset_r_manual" +- "realtek,dc_offset_l_manual_mic" +- "realtek,dc_offset_r_manual_mic" Based on the different PCB layout, add the manual offset value to - compensate the DC offset for each L and R channel. + compensate the DC offset for each L and R channel, and they are different + between headphone and headset.
Pins on the device (for linking into audio routes) for RT5663: diff --git a/include/sound/rt5663.h b/include/sound/rt5663.h index be6f1b2d8fbd..7d00e5849706 100644 --- a/include/sound/rt5663.h +++ b/include/sound/rt5663.h @@ -14,6 +14,8 @@ struct rt5663_platform_data { unsigned int dc_offset_l_manual; unsigned int dc_offset_r_manual; + unsigned int dc_offset_l_manual_mic; + unsigned int dc_offset_r_manual_mic; }; #endif diff --git a/sound/soc/codecs/rt5663.c b/sound/soc/codecs/rt5663.c index c1f0a72c0133..be93231b0f58 100644 --- a/sound/soc/codecs/rt5663.c +++ b/sound/soc/codecs/rt5663.c @@ -1570,9 +1570,43 @@ static int rt5663_jack_detect(struct snd_soc_codec *codec, int jack_insert) case 2: rt5663->jack_type = SND_JACK_HEADSET; rt5663_enable_push_button_irq(codec, true); + + if (rt5663->pdata.dc_offset_l_manual_mic) { + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_2, + rt5663->pdata.dc_offset_l_manual_mic >> + 16); + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_3, + rt5663->pdata.dc_offset_l_manual_mic & + 0xffff); + } + + if (rt5663->pdata.dc_offset_r_manual_mic) { + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_5, + rt5663->pdata.dc_offset_r_manual_mic >> + 16); + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_6, + rt5663->pdata.dc_offset_r_manual_mic & + 0xffff); + } break; default: rt5663->jack_type = SND_JACK_HEADPHONE; + + if (rt5663->pdata.dc_offset_l_manual) { + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_2, + rt5663->pdata.dc_offset_l_manual >> 16); + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_3, + rt5663->pdata.dc_offset_l_manual & + 0xffff); + } + + if (rt5663->pdata.dc_offset_r_manual) { + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_5, + rt5663->pdata.dc_offset_r_manual >> 16); + regmap_write(rt5663->regmap, RT5663_MIC_DECRO_6, + rt5663->pdata.dc_offset_r_manual & + 0xffff); + } break; } } else { @@ -3133,6 +3167,10 @@ static int rt5663_parse_dp(struct rt5663_priv *rt5663, struct device *dev) &rt5663->pdata.dc_offset_l_manual); device_property_read_u32(dev, "realtek,dc_offset_r_manual", &rt5663->pdata.dc_offset_r_manual); + device_property_read_u32(dev, "realtek,dc_offset_l_manual_mic", + &rt5663->pdata.dc_offset_l_manual_mic); + device_property_read_u32(dev, "realtek,dc_offset_r_manual_mic", + &rt5663->pdata.dc_offset_r_manual_mic); return 0; } @@ -3221,20 +3259,6 @@ static int rt5663_i2c_probe(struct i2c_client *i2c, dev_err(&i2c->dev, "%s:Unknown codec type\n", __func__); } - if (rt5663->pdata.dc_offset_l_manual) { - regmap_write(rt5663->regmap, RT5663_MIC_DECRO_2, - rt5663->pdata.dc_offset_l_manual >> 16); - regmap_write(rt5663->regmap, RT5663_MIC_DECRO_3, - rt5663->pdata.dc_offset_l_manual & 0xffff); - } - - if (rt5663->pdata.dc_offset_r_manual) { - regmap_write(rt5663->regmap, RT5663_MIC_DECRO_5, - rt5663->pdata.dc_offset_r_manual >> 16); - regmap_write(rt5663->regmap, RT5663_MIC_DECRO_6, - rt5663->pdata.dc_offset_r_manual & 0xffff); - } - /* GPIO1 as IRQ */ regmap_update_bits(rt5663->regmap, RT5663_GPIO_1, RT5663_GP1_PIN_MASK, RT5663_GP1_PIN_IRQ); -- cgit v1.2.3 From 2a04aabf5c96c9e25df488949b21223bcc623815 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 1 Aug 2017 12:25:01 +0200 Subject: netfilter: constify nf_conntrack_l3/4proto parameters When a nf_conntrack_l3/4proto parameter is not on the left hand side of an assignment, its address is not taken, and it is not passed to a function that may modify its fields, then it can be declared as const. 
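
To make the effect of the constification concrete, a small sketch (a hypothetical helper, not code from this patch):

#include <linux/printk.h>
#include <net/netfilter/nf_conntrack_l4proto.h>

/* Reads through the const pointer are unaffected... */
static void dump_l4proto(const struct nf_conntrack_l4proto *l4proto)
{
	pr_info("l3proto %u, l4proto %u\n",
		l4proto->l3proto, l4proto->l4proto);
}

/*
 * ...but an accidental store now fails to build:
 *
 *	l4proto->l4proto = 0;
 *	error: assignment of member 'l4proto' in read-only object
 */
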
This change is useful from a documentation point of view, and can possibly facilitate making some nf_conntrack_l3/4proto structures const subsequently. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 14 +++++++------- include/net/netfilter/nf_conntrack_timeout.h | 2 +- net/netfilter/nf_conntrack_core.c | 8 ++++---- net/netfilter/nf_conntrack_netlink.c | 6 +++--- net/netfilter/nf_conntrack_proto.c | 24 ++++++++++++------------ net/netfilter/nfnetlink_cttimeout.c | 5 +++-- 6 files changed, 30 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 7032e044bbe2..b6e27cafb1d9 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -125,23 +125,23 @@ struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u_int16_t l3proto, struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto); -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p); +void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p); /* Protocol pernet registration. */ int nf_ct_l4proto_pernet_register_one(struct net *net, - struct nf_conntrack_l4proto *proto); + const struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_pernet_unregister_one(struct net *net, - struct nf_conntrack_l4proto *proto); + const struct nf_conntrack_l4proto *proto); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *proto[], + struct nf_conntrack_l4proto *const proto[], unsigned int num_proto); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *proto[], - unsigned int num_proto); + struct nf_conntrack_l4proto *const proto[], + unsigned int num_proto); /* Protocol global registration. 
*/ int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto); -void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *proto); +void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto); int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[], unsigned int num_proto); void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[], diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index d40b89355fdd..b222957062b5 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -68,7 +68,7 @@ struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, static inline unsigned int * nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2bc499186186..f2f00eaf217d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1176,8 +1176,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto, - struct nf_conntrack_l4proto *l4proto, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, unsigned int dataoff, u32 hash) { @@ -1288,8 +1288,8 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, - struct nf_conntrack_l3proto *l3proto, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4922c8aefb2a..f4ca48817f66 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -61,8 +61,8 @@ MODULE_LICENSE("GPL"); static char __initdata version[] = "0.93"; static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; @@ -86,7 +86,7 @@ nla_put_failure: static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto) + const struct nf_conntrack_l3proto *l3proto) { int ret = 0; struct nlattr *nest_parms; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 7c89dade6fd3..27810cf816a6 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -188,7 +188,7 @@ nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) } EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) +void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p) { module_put(p->me); } @@ -257,7 +257,7 @@ void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { 
if (l4proto->get_net_proto) { /* statically built-in protocols use static per-net */ @@ -272,7 +272,7 @@ static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, static int nf_ct_l4proto_register_sysctl(struct net *net, struct nf_proto_net *pn, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { int err = 0; @@ -295,8 +295,8 @@ int nf_ct_l4proto_register_sysctl(struct net *net, static void nf_ct_l4proto_unregister_sysctl(struct net *net, - struct nf_proto_net *pn, - struct nf_conntrack_l4proto *l4proto) + struct nf_proto_net *pn, + const struct nf_conntrack_l4proto *l4proto) { #ifdef CONFIG_SYSCTL if (pn->ctl_table_header != NULL) @@ -366,7 +366,7 @@ out_unlock: EXPORT_SYMBOL_GPL(nf_ct_l4proto_register_one); int nf_ct_l4proto_pernet_register_one(struct net *net, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nf_proto_net *pn = NULL; @@ -391,7 +391,7 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); -static void __nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) +static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) { BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)); @@ -404,7 +404,7 @@ static void __nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) &nf_conntrack_l4proto_generic); } -void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) +void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) { mutex_lock(&nf_ct_proto_mutex); __nf_ct_l4proto_unregister_one(l4proto); @@ -415,7 +415,7 @@ void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *l4proto) EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one); void nf_ct_l4proto_pernet_unregister_one(struct net *net, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto); @@ -449,7 +449,7 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[], EXPORT_SYMBOL_GPL(nf_ct_l4proto_register); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *l4proto[], + struct nf_conntrack_l4proto *const l4proto[], unsigned int num_proto) { int ret = -EINVAL; @@ -485,8 +485,8 @@ void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *l4proto[], EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *l4proto[], - unsigned int num_proto) + struct nf_conntrack_l4proto *const l4proto[], + unsigned int num_proto) { while (num_proto-- != 0) nf_ct_l4proto_pernet_unregister_one(net, l4proto[num_proto]); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index f4fb6d4dd0b9..fcabccc99f0d 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -47,7 +47,8 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { }; static int -ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, +ctnl_timeout_parse_policy(void *timeouts, + const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { int ret = 0; @@ -401,7 +402,7 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, - struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; -- cgit v1.2.3 From 
b2f9d432deebab5096aad5942c2f2b1ec2865f5a Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 1 Aug 2017 13:18:09 -0700 Subject: flow_dissector: remove unused functions They are introduced by commit f70ea018da06 ("net: Add functions to get skb->hash based on flow structures") but never gets used in tree. Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ---------------- net/core/flow_dissector.c | 45 --------------------------------------------- 2 files changed, 61 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f9f1b2715ec..be76082f48aa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1158,8 +1158,6 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6); - static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { @@ -1172,20 +1170,6 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 return skb->hash; } -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl); - -static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) -{ - if (!skb->l4_hash && !skb->sw_hash) { - struct flow_keys keys; - __u32 hash = __get_hash_from_flowi4(fl4, &keys); - - __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); - } - - return skb->hash; -} - __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index fc5fc4594c90..0cc672aba1f0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -998,51 +998,6 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) -{ - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - memcpy(&keys.addrs.v6addrs.src, &fl6->saddr, - sizeof(keys.addrs.v6addrs.src)); - memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr, - sizeof(keys.addrs.v6addrs.dst)); - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys.ports.src = fl6->fl6_sport; - keys.ports.dst = fl6->fl6_dport; - keys.keyid.keyid = fl6->fl6_gre_key; - keys.tags.flow_label = (__force u32)fl6->flowlabel; - keys.basic.ip_proto = fl6->flowi6_proto; - - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), - flow_keys_have_l4(&keys)); - - return skb->hash; -} -EXPORT_SYMBOL(__skb_get_hash_flowi6); - -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) -{ - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - keys.addrs.v4addrs.src = fl4->saddr; - keys.addrs.v4addrs.dst = fl4->daddr; - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys.ports.src = fl4->fl4_sport; - keys.ports.dst = fl4->fl4_dport; - keys.keyid.keyid = fl4->fl4_gre_key; - keys.basic.ip_proto = fl4->flowi4_proto; - - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), - flow_keys_have_l4(&keys)); - - return skb->hash; -} -EXPORT_SYMBOL(__skb_get_hash_flowi4); - u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { -- cgit v1.2.3 From 0679dee03c6d706d57145ea92c23d08fa10a1999 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 2 Aug 2017 17:55:29 +0100 Subject: cgroup: keep track of number of descent cgroups Keep track of the 
number of online and dying descendant cgroups. This data will be used later to add an ability to control cgroup hierarchy (limit the depth and the number of descendant cgroups) and display hierarchy stats. Signed-off-by: Roman Gushchin Suggested-by: Tejun Heo Signed-off-by: Tejun Heo Cc: Zefan Li Cc: Waiman Long Cc: Johannes Weiner Cc: kernel-team@fb.com Cc: cgroups@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- include/linux/cgroup-defs.h | 8 ++++++++ kernel/cgroup/cgroup.c | 19 +++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 9d741959f218..58b4c425a155 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -273,6 +273,14 @@ struct cgroup { */ int level; + /* + * Keep track of total numbers of visible and dying descent cgroups. + * Dying cgroups are cgroups which were deleted by a user, + * but are still existing because someone else is holding a reference. + */ + int nr_descendants; + int nr_dying_descendants; + /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 85f6a112344b..cfdbb1e780de 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4408,9 +4408,15 @@ static void css_release_work_fn(struct work_struct *work) if (ss->css_released) ss->css_released(css); } else { + struct cgroup *tcgrp; + /* cgroup release path */ trace_cgroup_release(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; + tcgrp = cgroup_parent(tcgrp)) + tcgrp->nr_dying_descendants--; + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4609,9 +4615,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->root = root; cgrp->level = level; - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + if (tcgrp != cgrp) + tcgrp->nr_descendants++; + } + if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4817,7 +4827,7 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; @@ -4865,6 +4875,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + tcgrp->nr_descendants--; + tcgrp->nr_dying_descendants++; + } + cgroup1_check_for_release(cgroup_parent(cgrp)); /* put the base reference */ -- cgit v1.2.3 From 1a926e0bbab83bae8207d05a533173425e0496d1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Jul 2017 18:28:44 +0100 Subject: cgroup: implement hierarchy limits Creating cgroup hierarchies of unreasonable size can affect overall system performance. A user might want to limit the size of cgroup hierarchy. This is especially important if a user is delegating some cgroup sub-tree. To address this issue, introduce an ability to control the size of cgroup hierarchy. The cgroup.max.descendants control file allows setting the maximum allowed number of descendant cgroups (a brief userspace sketch of both knobs follows below).
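
A brief userspace sketch of the new interface (assumptions for illustration only: cgroup v2 mounted at /sys/fs/cgroup and a hypothetical delegated "jobs" cgroup; cgroup.max.depth is the companion knob described next in the changelog):

#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* at most 100 live (non-dying) descendant cgroups */
	write_knob("/sys/fs/cgroup/jobs/cgroup.max.descendants", "100\n");
	/* allow children and grandchildren, but nothing deeper */
	write_knob("/sys/fs/cgroup/jobs/cgroup.max.depth", "2\n");
	/* a mkdir that would exceed either limit now fails with EAGAIN */
	return 0;
}
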
The cgroup.max.depth file controls the maximum depth of the cgroup tree. Both are single value r/w files, with "max" default value. The control files exist on each hierarchy level (including root). When a new cgroup is created, we check the total descendants and depth limits on each level, and if none of them are exceeded, a new cgroup is created. Only alive cgroups are counted, removed (dying) cgroups are ignored. Signed-off-by: Roman Gushchin Suggested-by: Tejun Heo Signed-off-by: Tejun Heo Cc: Zefan Li Cc: Waiman Long Cc: Johannes Weiner Cc: kernel-team@fb.com Cc: cgroups@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- Documentation/cgroup-v2.txt | 14 +++++ include/linux/cgroup-defs.h | 5 ++ kernel/cgroup/cgroup.c | 126 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+) (limited to 'include') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index dec5afdaa36d..46ec3f76211c 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -854,6 +854,20 @@ All cgroup core files are prefixed with "cgroup." 1 if the cgroup or its descendants contains any live processes; otherwise, 0. + cgroup.max.descendants + A read-write single value files. The default is "max". + + Maximum allowed number of descent cgroups. + If the actual number of descendants is equal or larger, + an attempt to create a new cgroup in the hierarchy will fail. + + cgroup.max.depth + A read-write single value files. The default is "max". + + Maximum allowed descent depth below the current cgroup. + If the actual descent depth is equal or larger, + an attempt to create a new child cgroup will fail. + Controllers =========== diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 58b4c425a155..59e4ad9e7bac 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -273,13 +273,18 @@ struct cgroup { */ int level; + /* Maximum allowed descent tree depth */ + int max_depth; + /* * Keep track of total numbers of visible and dying descent cgroups. * Dying cgroups are cgroups which were deleted by a user, * but are still existing because someone else is holding a reference. + * max_descendants is a maximum allowed number of descent cgroups. 
*/ int nr_descendants; int nr_dying_descendants; + int max_descendants; /* * Each non-empty css_set associated with this cgroup contributes diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cfdbb1e780de..0fd9134e1720 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1827,6 +1827,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; + cgrp->max_descendants = INT_MAX; + cgrp->max_depth = INT_MAX; for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3209,6 +3211,92 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static int cgroup_max_descendants_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int descendants = READ_ONCE(cgrp->max_descendants); + + if (descendants == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", descendants); + + return 0; +} + +static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + int descendants; + ssize_t ret; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + descendants = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &descendants); + if (ret) + return ret; + } + + if (descendants < 0 || descendants > INT_MAX) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_descendants = descendants; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + +static int cgroup_max_depth_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int depth = READ_ONCE(cgrp->max_depth); + + if (depth == INT_MAX) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%d\n", depth); + + return 0; +} + +static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int depth; + + buf = strstrip(buf); + if (!strcmp(buf, "max")) { + depth = INT_MAX; + } else { + ret = kstrtoint(buf, 0, &depth); + if (ret) + return ret; + } + + if (depth < 0 || depth > INT_MAX) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgrp->max_depth = depth; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", @@ -4309,6 +4397,16 @@ static struct cftype cgroup_base_files[] = { .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, + { + .name = "cgroup.max.descendants", + .seq_show = cgroup_max_descendants_show, + .write = cgroup_max_descendants_write, + }, + { + .name = "cgroup.max.depth", + .seq_show = cgroup_max_depth_show, + .write = cgroup_max_depth_write, + }, { } /* terminate */ }; @@ -4662,6 +4760,29 @@ out_free_cgrp: return ERR_PTR(ret); } +static bool cgroup_check_hierarchy_limits(struct cgroup *parent) +{ + struct cgroup *cgroup; + int ret = false; + int level = 1; + + lockdep_assert_held(&cgroup_mutex); + + for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { + if (cgroup->nr_descendants >= cgroup->max_descendants) + goto fail; + + if (level > cgroup->max_depth) + goto fail; + + level++; + } + + ret = true; +fail: + return ret; +} + int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; @@ -4676,6 +4797,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, 
const char *name, umode_t mode) if (!parent) return -ENODEV; + if (!cgroup_check_hierarchy_limits(parent)) { + ret = -EAGAIN; + goto out_unlock; + } + cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); -- cgit v1.2.3 From 1e140df0496541c473e5d40b0da5d2d626b2e343 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 24 Jul 2017 22:56:43 -0700 Subject: remoteproc: qcom: Add support for SSR notifications This adds the remoteproc part of subsystem restart, which is responsible for emitting notifications to other processors in the system about a dying remoteproc instance. These notifications are propagated to the various communication systems in the various remote processors to shut down communication links that was left in a dangling state as the remoteproc was stopped (or crashed). Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_adsp_pil.c | 6 +++ drivers/remoteproc/qcom_common.c | 71 +++++++++++++++++++++++++++++++++++ drivers/remoteproc/qcom_common.h | 10 +++++ drivers/remoteproc/qcom_q6v5_pil.c | 3 ++ include/linux/remoteproc/qcom_rproc.h | 22 +++++++++++ 5 files changed, 112 insertions(+) create mode 100644 include/linux/remoteproc/qcom_rproc.h (limited to 'include') diff --git a/drivers/remoteproc/qcom_adsp_pil.c b/drivers/remoteproc/qcom_adsp_pil.c index 49fe2f807e1d..a41d399766fd 100644 --- a/drivers/remoteproc/qcom_adsp_pil.c +++ b/drivers/remoteproc/qcom_adsp_pil.c @@ -38,6 +38,7 @@ struct adsp_data { const char *firmware_name; int pas_id; bool has_aggre2_clk; + const char *ssr_name; }; struct qcom_adsp { @@ -72,6 +73,7 @@ struct qcom_adsp { size_t mem_size; struct qcom_rproc_subdev smd_subdev; + struct qcom_rproc_ssr ssr_subdev; }; static int adsp_load(struct rproc *rproc, const struct firmware *fw) @@ -402,6 +404,7 @@ static int adsp_probe(struct platform_device *pdev) } qcom_add_smd_subdev(rproc, &adsp->smd_subdev); + qcom_add_ssr_subdev(rproc, &adsp->ssr_subdev, desc->ssr_name); ret = rproc_add(rproc); if (ret) @@ -423,6 +426,7 @@ static int adsp_remove(struct platform_device *pdev) rproc_del(adsp->rproc); qcom_remove_smd_subdev(adsp->rproc, &adsp->smd_subdev); + qcom_remove_ssr_subdev(adsp->rproc, &adsp->ssr_subdev); rproc_free(adsp->rproc); return 0; @@ -433,6 +437,7 @@ static const struct adsp_data adsp_resource_init = { .firmware_name = "adsp.mdt", .pas_id = 1, .has_aggre2_clk = false, + .ssr_name = "lpass", }; static const struct adsp_data slpi_resource_init = { @@ -440,6 +445,7 @@ static const struct adsp_data slpi_resource_init = { .firmware_name = "slpi.mdt", .pas_id = 12, .has_aggre2_clk = true, + .ssr_name = "dsps", }; static const struct of_device_id adsp_of_match[] = { diff --git a/drivers/remoteproc/qcom_common.c b/drivers/remoteproc/qcom_common.c index bb90481215c6..31b8291dfc8b 100644 --- a/drivers/remoteproc/qcom_common.c +++ b/drivers/remoteproc/qcom_common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,9 @@ #include "qcom_common.h" #define to_smd_subdev(d) container_of(d, struct qcom_rproc_subdev, subdev) +#define to_ssr_subdev(d) container_of(d, struct qcom_rproc_ssr, subdev) + +BLOCKING_NOTIFIER_HEAD(ssr_notifiers); /** * qcom_mdt_find_rsc_table() - provide dummy resource table for remoteproc @@ -92,5 +96,72 @@ void qcom_remove_smd_subdev(struct rproc *rproc, struct qcom_rproc_subdev *smd) } EXPORT_SYMBOL_GPL(qcom_remove_smd_subdev); +/** + * qcom_register_ssr_notifier() - register SSR notification handler + * @nb: notifier_block to notify for restart notifications + * + * Returns 0 on 
success, negative errno on failure. + * + * This register the @notify function as handler for restart notifications. As + * remote processors are stopped this function will be called, with the SSR + * name passed as a parameter. + */ +int qcom_register_ssr_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&ssr_notifiers, nb); +} +EXPORT_SYMBOL_GPL(qcom_register_ssr_notifier); + +/** + * qcom_unregister_ssr_notifier() - unregister SSR notification handler + * @nb: notifier_block to unregister + */ +void qcom_unregister_ssr_notifier(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&ssr_notifiers, nb); +} +EXPORT_SYMBOL_GPL(qcom_unregister_ssr_notifier); + +static int ssr_notify_start(struct rproc_subdev *subdev) +{ + return 0; +} + +static void ssr_notify_stop(struct rproc_subdev *subdev) +{ + struct qcom_rproc_ssr *ssr = to_ssr_subdev(subdev); + + blocking_notifier_call_chain(&ssr_notifiers, 0, (void *)ssr->name); +} + +/** + * qcom_add_ssr_subdev() - register subdevice as restart notification source + * @rproc: rproc handle + * @ssr: SSR subdevice handle + * @ssr_name: identifier to use for notifications originating from @rproc + * + * As the @ssr is registered with the @rproc SSR events will be sent to all + * registered listeners in the system as the remoteproc is shut down. + */ +void qcom_add_ssr_subdev(struct rproc *rproc, struct qcom_rproc_ssr *ssr, + const char *ssr_name) +{ + ssr->name = ssr_name; + + rproc_add_subdev(rproc, &ssr->subdev, ssr_notify_start, ssr_notify_stop); +} +EXPORT_SYMBOL_GPL(qcom_add_ssr_subdev); + +/** + * qcom_remove_ssr_subdev() - remove subdevice as restart notification source + * @rproc: rproc handle + * @ssr: SSR subdevice handle + */ +void qcom_remove_ssr_subdev(struct rproc *rproc, struct qcom_rproc_ssr *ssr) +{ + rproc_remove_subdev(rproc, &ssr->subdev); +} +EXPORT_SYMBOL_GPL(qcom_remove_ssr_subdev); + MODULE_DESCRIPTION("Qualcomm Remoteproc helper driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/remoteproc/qcom_common.h b/drivers/remoteproc/qcom_common.h index db5c826d5cd4..fab28b64b8ea 100644 --- a/drivers/remoteproc/qcom_common.h +++ b/drivers/remoteproc/qcom_common.h @@ -12,6 +12,12 @@ struct qcom_rproc_subdev { struct qcom_smd_edge *edge; }; +struct qcom_rproc_ssr { + struct rproc_subdev subdev; + + const char *name; +}; + struct resource_table *qcom_mdt_find_rsc_table(struct rproc *rproc, const struct firmware *fw, int *tablesz); @@ -19,4 +25,8 @@ struct resource_table *qcom_mdt_find_rsc_table(struct rproc *rproc, void qcom_add_smd_subdev(struct rproc *rproc, struct qcom_rproc_subdev *smd); void qcom_remove_smd_subdev(struct rproc *rproc, struct qcom_rproc_subdev *smd); +void qcom_add_ssr_subdev(struct rproc *rproc, struct qcom_rproc_ssr *ssr, + const char *ssr_name); +void qcom_remove_ssr_subdev(struct rproc *rproc, struct qcom_rproc_ssr *ssr); + #endif diff --git a/drivers/remoteproc/qcom_q6v5_pil.c b/drivers/remoteproc/qcom_q6v5_pil.c index 8fd697a3cf8f..fc3ef1e81433 100644 --- a/drivers/remoteproc/qcom_q6v5_pil.c +++ b/drivers/remoteproc/qcom_q6v5_pil.c @@ -153,6 +153,7 @@ struct q6v5 { size_t mpss_size; struct qcom_rproc_subdev smd_subdev; + struct qcom_rproc_ssr ssr_subdev; }; static int q6v5_regulator_init(struct device *dev, struct reg_info *regs, @@ -1038,6 +1039,7 @@ static int q6v5_probe(struct platform_device *pdev) } qcom_add_smd_subdev(rproc, &qproc->smd_subdev); + qcom_add_ssr_subdev(rproc, &qproc->ssr_subdev, "mpss"); ret = rproc_add(rproc); if (ret) @@ -1058,6 +1060,7 @@ 
static int q6v5_remove(struct platform_device *pdev) rproc_del(qproc->rproc); qcom_remove_smd_subdev(qproc->rproc, &qproc->smd_subdev); + qcom_remove_ssr_subdev(qproc->rproc, &qproc->ssr_subdev); rproc_free(qproc->rproc); return 0; diff --git a/include/linux/remoteproc/qcom_rproc.h b/include/linux/remoteproc/qcom_rproc.h new file mode 100644 index 000000000000..fa8e38681b4b --- /dev/null +++ b/include/linux/remoteproc/qcom_rproc.h @@ -0,0 +1,22 @@ +#ifndef __QCOM_RPROC_H__ +#define __QCOM_RPROC_H__ + +struct notifier_block; + +#if IS_ENABLED(CONFIG_QCOM_RPROC_COMMON) + +int qcom_register_ssr_notifier(struct notifier_block *nb); +void qcom_unregister_ssr_notifier(struct notifier_block *nb); + +#else + +static inline int qcom_register_ssr_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline void qcom_unregister_ssr_notifier(struct notifier_block *nb) {} + +#endif + +#endif -- cgit v1.2.3 From 2202e35d47fff379fc744e6bd3c111b018cc77df Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 Aug 2017 09:56:06 +0200 Subject: ipv4: fib: Remove unused functions Previous patches converted users of these functions to provide offload indication using the nexthop's flags instead of the FIB info's. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 41d580c6185f..ef8992d49bc3 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -124,7 +124,6 @@ struct fib_info { #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; #endif - unsigned int fib_offload_cnt; struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev @@ -177,18 +176,6 @@ struct fib_result_nl { __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); -static inline void fib_info_offload_inc(struct fib_info *fi) -{ - fi->fib_offload_cnt++; - fi->fib_flags |= RTNH_F_OFFLOAD; -} - -static inline void fib_info_offload_dec(struct fib_info *fi) -{ - if (--fi->fib_offload_cnt == 0) - fi->fib_flags &= ~RTNH_F_OFFLOAD; -} - #define FIB_RES_SADDR(net, res) \ ((FIB_RES_NH(res).nh_saddr_genid == \ atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ -- cgit v1.2.3 From 957b8dffa4e3d191f0f1571d006d0e520790dcb9 Mon Sep 17 00:00:00 2001 From: João Paulo Rechi Vita Date: Mon, 24 Jul 2017 14:22:25 -0700 Subject: HID: multitouch: Support Asus T304UA media keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Asus T304UA convertible sports a magnetic detachable keyboard with touchpad, which is connected over USB. 
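
Stepping back to the SSR commit above: a minimal consumer of the new notifier API might look like the following sketch (a hypothetical module, not from the patch; the callback has the standard notifier signature, and data carries the ssr_name string passed by ssr_notify_stop()):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/remoteproc/qcom_rproc.h>

static int my_ssr_cb(struct notifier_block *nb, unsigned long event,
		     void *data)
{
	const char *ssr_name = data;

	/* tear down IPC links to the "mpss"/"lpass"/"dsps" remote here */
	pr_info("remoteproc %s stopped\n", ssr_name);
	return NOTIFY_OK;
}

static struct notifier_block my_ssr_nb = {
	.notifier_call = my_ssr_cb,
};

static int __init my_ssr_init(void)
{
	return qcom_register_ssr_notifier(&my_ssr_nb);
}
module_init(my_ssr_init);

static void __exit my_ssr_exit(void)
{
	qcom_unregister_ssr_notifier(&my_ssr_nb);
}
module_exit(my_ssr_exit);
MODULE_LICENSE("GPL v2");
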
Most of the keyboard hotkeys are exposed through the same USB interface as the touchpad, defined in the report descriptor as follows: 0x06, 0x31, 0xFF, // Usage Page (Vendor Defined 0xFF31) 0x09, 0x76, // Usage (0x76) 0xA1, 0x01, // Collection (Application) 0x05, 0xFF, // Usage Page (Reserved 0xFF) 0x85, 0x5A, // Report ID (90) 0x19, 0x00, // Usage Minimum (0x00) 0x2A, 0xFF, 0x00, // Usage Maximum (0xFF) 0x15, 0x00, // Logical Minimum (0) 0x26, 0xFF, 0x00, // Logical Maximum (255) 0x75, 0x08, // Report Size (8) 0x95, 0x0F, // Report Count (15) 0xB1, 0x02, // Feature (Data,Var,Abs,No Wrap,Linear,Preferred State,No Null Position,Non-volatile) 0x05, 0xFF, // Usage Page (Reserved 0xFF) 0x85, 0x5A, // Report ID (90) 0x19, 0x00, // Usage Minimum (0x00) 0x2A, 0xFF, 0x00, // Usage Maximum (0xFF) 0x15, 0x00, // Logical Minimum (0) 0x26, 0xFF, 0x00, // Logical Maximum (255) 0x75, 0x08, // Report Size (8) 0x95, 0x02, // Report Count (2) 0x81, 0x02, // Input (Data,Var,Abs,No Wrap,Linear,Preferred State,No Null Position) 0xC0, // End Collection This UsagePage is declared as a variable, but we need to treat it as an array to be able to map each Usage we care about to its corresponding input key. Signed-off-by: João Paulo Rechi Vita Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-multitouch.c | 44 +++++++++++++++++++++++++++++++++++++++++++- include/linux/hid.h | 2 ++ 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index c9ba4c6db74c..90fcf7457908 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -176,6 +176,7 @@ #define USB_DEVICE_ID_ASUSTEK_LCM 0x1726 #define USB_DEVICE_ID_ASUSTEK_LCM2 0x175b #define USB_DEVICE_ID_ASUSTEK_T100_KEYBOARD 0x17e0 +#define USB_DEVICE_ID_ASUSTEK_T304_KEYBOARD 0x184a #define USB_DEVICE_ID_ASUSTEK_I2C_KEYBOARD 0x8585 #define USB_DEVICE_ID_ASUSTEK_I2C_TOUCHPAD 0x0101 #define USB_DEVICE_ID_ASUSTEK_ROG_KEYBOARD1 0x1854 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index 152d33120a55..6b3de7b01571 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -72,6 +72,7 @@ MODULE_LICENSE("GPL"); #define MT_QUIRK_FIX_CONST_CONTACT_ID BIT(14) #define MT_QUIRK_TOUCH_SIZE_SCALING BIT(15) #define MT_QUIRK_STICKY_FINGERS BIT(16) +#define MT_QUIRK_ASUS_CUSTOM_UP BIT(17) #define MT_INPUTMODE_TOUCHSCREEN 0x02 #define MT_INPUTMODE_TOUCHPAD 0x03 @@ -169,6 +170,7 @@ static void mt_post_parse(struct mt_device *td); #define MT_CLS_GENERALTOUCH_TWOFINGERS 0x0108 #define MT_CLS_GENERALTOUCH_PWT_TENFINGERS 0x0109 #define MT_CLS_LG 0x010a +#define MT_CLS_ASUS 0x010b #define MT_CLS_VTL 0x0110 #define MT_CLS_GOOGLE 0x0111 @@ -290,6 +292,10 @@ static struct mt_class mt_classes[] = { MT_QUIRK_IGNORE_DUPLICATES | MT_QUIRK_HOVERING | MT_QUIRK_CONTACT_CNT_ACCURATE }, + { .name = MT_CLS_ASUS, + .quirks = MT_QUIRK_ALWAYS_VALID | + MT_QUIRK_CONTACT_CNT_ACCURATE | + MT_QUIRK_ASUS_CUSTOM_UP }, { .name = MT_CLS_VTL, .quirks = MT_QUIRK_ALWAYS_VALID | MT_QUIRK_CONTACT_CNT_ACCURATE | @@ -905,6 +911,8 @@ static int mt_touch_input_configured(struct hid_device *hdev, return 0; } +#define mt_map_key_clear(c) hid_map_usage_clear(hi, usage, bit, \ + max, EV_KEY, (c)) static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) @@ -923,9 +931,34 @@ static int mt_input_mapping(struct hid_device *hdev, struct hid_input *hi, field->application != HID_DG_TOUCHPAD 
&& field->application != HID_GD_KEYBOARD && field->application != HID_CP_CONSUMER_CONTROL && - field->application != HID_GD_WIRELESS_RADIO_CTLS) + field->application != HID_GD_WIRELESS_RADIO_CTLS && + !(field->application == HID_VD_ASUS_CUSTOM_MEDIA_KEYS && + td->mtclass.quirks & MT_QUIRK_ASUS_CUSTOM_UP)) return -1; + /* + * Some Asus keyboard+touchpad devices have the hotkeys defined in the + * touchpad report descriptor. We need to treat these as an array to + * map usages to input keys. + */ + if (field->application == 0xff310076 && + td->mtclass.quirks & MT_QUIRK_ASUS_CUSTOM_UP && + (usage->hid & HID_USAGE_PAGE) == HID_UP_CUSTOM) { + set_bit(EV_REP, hi->input->evbit); + if (field->flags & HID_MAIN_ITEM_VARIABLE) + field->flags &= ~HID_MAIN_ITEM_VARIABLE; + switch (usage->hid & HID_USAGE) { + case 0x10: mt_map_key_clear(KEY_BRIGHTNESSDOWN); break; + case 0x20: mt_map_key_clear(KEY_BRIGHTNESSUP); break; + case 0x35: mt_map_key_clear(KEY_DISPLAY_OFF); break; + case 0x6b: mt_map_key_clear(KEY_F21); break; + case 0x6c: mt_map_key_clear(KEY_SLEEP); break; + default: + return -1; + } + return 1; + } + /* * some egalax touchscreens have "application == HID_DG_TOUCHSCREEN" * for the stylus. @@ -1137,6 +1170,9 @@ static int mt_input_configured(struct hid_device *hdev, struct hid_input *hi) case HID_GD_WIRELESS_RADIO_CTLS: suffix = "Wireless Radio Control"; break; + case HID_VD_ASUS_CUSTOM_MEDIA_KEYS: + suffix = "Custom Media Keys"; + break; default: suffix = "UNKNOWN"; break; @@ -1388,6 +1424,12 @@ static const struct hid_device_id mt_devices[] = { MT_USB_DEVICE(USB_VENDOR_ID_ANTON, USB_DEVICE_ID_ANTON_TOUCH_PAD) }, + /* Asus T304UA */ + { .driver_data = MT_CLS_ASUS, + HID_DEVICE(BUS_USB, HID_GROUP_MULTITOUCH_WIN_8, + USB_VENDOR_ID_ASUSTEK, + USB_DEVICE_ID_ASUSTEK_T304_KEYBOARD) }, + /* Atmel panels */ { .driver_data = MT_CLS_SERIAL, MT_USB_DEVICE(USB_VENDOR_ID_ATMEL, diff --git a/include/linux/hid.h b/include/linux/hid.h index 5006f9b5d837..a08e6b15d98d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -173,6 +173,7 @@ struct hid_item { #define HID_UP_LOGIVENDOR3 0xff430000 #define HID_UP_LNVENDOR 0xffa00000 #define HID_UP_SENSOR 0x00200000 +#define HID_UP_ASUSVENDOR 0xff310000 #define HID_USAGE 0x0000ffff @@ -292,6 +293,7 @@ struct hid_item { #define HID_DG_BARRELSWITCH2 0x000d005a #define HID_DG_TOOLSERIALNUMBER 0x000d005b +#define HID_VD_ASUS_CUSTOM_MEDIA_KEYS 0xff310076 /* * HID report types --- Ouch! HID spec says 1 2 3! */ -- cgit v1.2.3 From d4cb37e71662dac72049bf7c55a9038bd7d2bcb5 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 16 May 2017 00:17:42 +0200 Subject: mtd: nand: Remove support for block locking/unlocking Commit 7d70f334ad2b ("mtd: nand: add lock/unlock routines") introduced support for the Micron LOCK/UNLOCK commands but no one ever used the nand_lock/unlock() functions. Remove support for these vendor-specific operations from the core. If one ever wants to add them back they should be put in nand_micron.c and mtd->_lock/_unlock should be directly assigned from there instead of exporting the functions. 
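
A sketch of the direction the changelog suggests (hypothetical code, not in the tree; micron_nand_lock() stands in for a vendor-specific implementation using the generic mtd hook prototype):

static int micron_nand_lock(struct mtd_info *mtd, loff_t ofs,
			    uint64_t len)
{
	/* vendor-specific LOCK/UNLOCK command sequence would live here */
	return -EOPNOTSUPP;
}

static void micron_nand_init_mtd(struct nand_chip *chip)
{
	struct mtd_info *mtd = nand_to_mtd(chip);

	/* assigned from nand_micron.c instead of exported by the core */
	mtd->_lock = micron_nand_lock;
	/* mtd->_unlock would be wired up the same way */
}
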
Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 173 ------------------------------------------- include/linux/mtd/nand.h | 10 --- 2 files changed, 183 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 5fa5ddc94834..54d6194fb180 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1239,179 +1239,6 @@ int nand_reset(struct nand_chip *chip, int chipnr) return 0; } -/** - * __nand_unlock - [REPLACEABLE] unlocks specified locked blocks - * @mtd: mtd info - * @ofs: offset to start unlock from - * @len: length to unlock - * @invert: - * - when = 0, unlock the range of blocks within the lower and - * upper boundary address - * - when = 1, unlock the range of blocks outside the boundaries - * of the lower and upper boundary address - * - * Returs unlock status. - */ -static int __nand_unlock(struct mtd_info *mtd, loff_t ofs, - uint64_t len, int invert) -{ - int ret = 0; - int status, page; - struct nand_chip *chip = mtd_to_nand(mtd); - - /* Submit address of first page to unlock */ - page = ofs >> chip->page_shift; - chip->cmdfunc(mtd, NAND_CMD_UNLOCK1, -1, page & chip->pagemask); - - /* Submit address of last page to unlock */ - page = (ofs + len) >> chip->page_shift; - chip->cmdfunc(mtd, NAND_CMD_UNLOCK2, -1, - (page | invert) & chip->pagemask); - - /* Call wait ready function */ - status = chip->waitfunc(mtd, chip); - /* See if device thinks it succeeded */ - if (status & NAND_STATUS_FAIL) { - pr_debug("%s: error status = 0x%08x\n", - __func__, status); - ret = -EIO; - } - - return ret; -} - -/** - * nand_unlock - [REPLACEABLE] unlocks specified locked blocks - * @mtd: mtd info - * @ofs: offset to start unlock from - * @len: length to unlock - * - * Returns unlock status. - */ -int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) -{ - int ret = 0; - int chipnr; - struct nand_chip *chip = mtd_to_nand(mtd); - - pr_debug("%s: start = 0x%012llx, len = %llu\n", - __func__, (unsigned long long)ofs, len); - - if (check_offs_len(mtd, ofs, len)) - return -EINVAL; - - /* Align to last block address if size addresses end of the device */ - if (ofs + len == mtd->size) - len -= mtd->erasesize; - - nand_get_device(mtd, FL_UNLOCKING); - - /* Shift to get chip number */ - chipnr = ofs >> chip->chip_shift; - - /* - * Reset the chip. - * If we want to check the WP through READ STATUS and check the bit 7 - * we must reset the chip - * some operation can also clear the bit 7 of status register - * eg. erase/program a locked block - */ - nand_reset(chip, chipnr); - - chip->select_chip(mtd, chipnr); - - /* Check, if it is write protected */ - if (nand_check_wp(mtd)) { - pr_debug("%s: device is write protected!\n", - __func__); - ret = -EIO; - goto out; - } - - ret = __nand_unlock(mtd, ofs, len, 0); - -out: - chip->select_chip(mtd, -1); - nand_release_device(mtd); - - return ret; -} -EXPORT_SYMBOL(nand_unlock); - -/** - * nand_lock - [REPLACEABLE] locks all blocks present in the device - * @mtd: mtd info - * @ofs: offset to start unlock from - * @len: length to unlock - * - * This feature is not supported in many NAND parts. 'Micron' NAND parts do - * have this feature, but it allows only to lock all blocks, not for specified - * range for block. Implementing 'lock' feature by making use of 'unlock', for - * now. - * - * Returns lock status. 
- */ -int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) -{ - int ret = 0; - int chipnr, status, page; - struct nand_chip *chip = mtd_to_nand(mtd); - - pr_debug("%s: start = 0x%012llx, len = %llu\n", - __func__, (unsigned long long)ofs, len); - - if (check_offs_len(mtd, ofs, len)) - return -EINVAL; - - nand_get_device(mtd, FL_LOCKING); - - /* Shift to get chip number */ - chipnr = ofs >> chip->chip_shift; - - /* - * Reset the chip. - * If we want to check the WP through READ STATUS and check the bit 7 - * we must reset the chip - * some operation can also clear the bit 7 of status register - * eg. erase/program a locked block - */ - nand_reset(chip, chipnr); - - chip->select_chip(mtd, chipnr); - - /* Check, if it is write protected */ - if (nand_check_wp(mtd)) { - pr_debug("%s: device is write protected!\n", - __func__); - status = MTD_ERASE_FAILED; - ret = -EIO; - goto out; - } - - /* Submit address of first page to lock */ - page = ofs >> chip->page_shift; - chip->cmdfunc(mtd, NAND_CMD_LOCK, -1, page & chip->pagemask); - - /* Call wait ready function */ - status = chip->waitfunc(mtd, chip); - /* See if device thinks it succeeded */ - if (status & NAND_STATUS_FAIL) { - pr_debug("%s: error status = 0x%08x\n", - __func__, status); - ret = -EIO; - goto out; - } - - ret = __nand_unlock(mtd, ofs, len, 0x1); - -out: - chip->select_chip(mtd, -1); - nand_release_device(mtd); - - return ret; -} -EXPORT_SYMBOL(nand_lock); - /** * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data * @buf: buffer to test diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 892148c448cc..8a6522e82f03 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -44,12 +44,6 @@ void nand_release(struct mtd_info *mtd); /* Internal helper for board drivers which need to override command function */ void nand_wait_ready(struct mtd_info *mtd); -/* locks all blocks present in the device */ -int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); - -/* unlocks specified locked blocks */ -int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); - /* The maximum number of NAND chips in an array */ #define NAND_MAX_CHIPS 8 @@ -89,10 +83,6 @@ int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); #define NAND_CMD_SET_FEATURES 0xef #define NAND_CMD_RESET 0xff -#define NAND_CMD_LOCK 0x2a -#define NAND_CMD_UNLOCK1 0x23 -#define NAND_CMD_UNLOCK2 0x24 - /* Extended commands for large page devices */ #define NAND_CMD_READSTART 0x30 #define NAND_CMD_RNDOUTSTART 0xE0 -- cgit v1.2.3 From e61e4055b165f4c645ce2a85890b313abf841f67 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:09 +0800 Subject: sctp: remove the typedef sctp_shutdownhdr_t This patch is to remove the typedef sctp_shutdownhdr_t, and replace with struct sctp_shutdownhdr in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_make_chunk.c | 4 ++-- net/sctp/sm_statefuns.c | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index ba007163acfd..7a586ba7dcd4 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -416,13 +416,13 @@ struct sctp_abort_chunk { /* For the graceful shutdown we must carry the tag (in common header) * and the highest consecutive acking value. 
 */
-typedef struct sctp_shutdownhdr {
+struct sctp_shutdownhdr {
 	__be32 cum_tsn_ack;
-} sctp_shutdownhdr_t;
+};
 
 struct sctp_shutdown_chunk_t {
 	struct sctp_chunkhdr chunk_hdr;
-	sctp_shutdownhdr_t shutdown_hdr;
+	struct sctp_shutdownhdr shutdown_hdr;
 };
 
 /* RFC 2960. Section 3.3.10 Operation Error (ERROR) (9) */

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 163004e7047c..8f1c6b639ac1 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -857,15 +857,15 @@ nodata:
 struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
 				      const struct sctp_chunk *chunk)
 {
+	struct sctp_shutdownhdr shut;
 	struct sctp_chunk *retval;
-	sctp_shutdownhdr_t shut;
 	__u32 ctsn;
 
 	ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map);
 	shut.cum_tsn_ack = htonl(ctsn);
 
 	retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
-				   sizeof(sctp_shutdownhdr_t), GFP_ATOMIC);
+				   sizeof(shut), GFP_ATOMIC);
 	if (!retval)
 		goto nodata;

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index dc0c2c4188d8..5b95e2d8c227 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2656,8 +2656,8 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net,
 					   sctp_cmd_seq_t *commands)
 {
 	struct sctp_chunk *chunk = arg;
-	sctp_shutdownhdr_t *sdh;
 	sctp_disposition_t disposition;
+	struct sctp_shutdownhdr *sdh;
 	struct sctp_ulpevent *ev;
 	__u32 ctsn;
 
@@ -2671,8 +2671,8 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net,
 					  commands);
 
 	/* Convert the elaborate header. */
-	sdh = (sctp_shutdownhdr_t *)chunk->skb->data;
-	skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t));
+	sdh = (struct sctp_shutdownhdr *)chunk->skb->data;
+	skb_pull(chunk->skb, sizeof(*sdh));
 	chunk->subh.shutdown_hdr = sdh;
 	ctsn = ntohl(sdh->cum_tsn_ack);
 
@@ -2746,7 +2746,7 @@ sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net,
 					    sctp_cmd_seq_t *commands)
 {
 	struct sctp_chunk *chunk = arg;
-	sctp_shutdownhdr_t *sdh;
+	struct sctp_shutdownhdr *sdh;
 	__u32 ctsn;
 
 	if (!sctp_vtag_verify(chunk, asoc))
@@ -2758,7 +2758,7 @@ sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net,
 		return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
 						  commands);
 
-	sdh = (sctp_shutdownhdr_t *)chunk->skb->data;
+	sdh = (struct sctp_shutdownhdr *)chunk->skb->data;
 	ctsn = ntohl(sdh->cum_tsn_ack);
 
 	if (TSN_lt(ctsn, asoc->ctsn_ack_point)) {

-- cgit v1.2.3

From ac23e68133f4570f40ec0910286ced08ced2d378 Mon Sep 17 00:00:00 2001
From: Xin Long
Date: Thu, 3 Aug 2017 15:42:10 +0800
Subject: sctp: fix the name of struct sctp_shutdown_chunk_t

This patch is to fix the name of struct sctp_shutdown_chunk_t, replacing it with struct sctp_shutdown_chunk in the places where it's using it. It is also to fix some indent problems.

Signed-off-by: Xin Long
Signed-off-by: David S.
Miller --- include/linux/sctp.h | 2 +- net/sctp/sm_statefuns.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 7a586ba7dcd4..553020cbf6f7 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -420,7 +420,7 @@ struct sctp_shutdownhdr { __be32 cum_tsn_ack; }; -struct sctp_shutdown_chunk_t { +struct sctp_shutdown_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_shutdownhdr shutdown_hdr; }; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5b95e2d8c227..d4d8fab52a11 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2665,8 +2665,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the SHUTDOWN chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, - sizeof(struct sctp_shutdown_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -2753,8 +2752,7 @@ sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the SHUTDOWN chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, - sizeof(struct sctp_shutdown_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -5419,12 +5417,14 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( */ if (chunk) { if (!sctp_vtag_verify(chunk, asoc)) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, + commands); /* Make sure that the SHUTDOWN chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk_t))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + if (!sctp_chunk_length_valid( + chunk, sizeof(struct sctp_shutdown_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, + arg, commands); } /* If it has no more outstanding DATA chunks, the SHUTDOWN receiver -- cgit v1.2.3 From d8238d9dab8fbea22dd04f4e77639c7f7b83eef7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:11 +0800 Subject: sctp: remove the typedef sctp_errhdr_t This patch is to remove the typedef sctp_errhdr_t, and replace with struct sctp_errhdr in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- include/net/sctp/sctp.h | 8 ++++---- net/sctp/sm_make_chunk.c | 38 ++++++++++++++++++++------------------ net/sctp/sm_sideeffect.c | 2 +- net/sctp/sm_statefuns.c | 29 +++++++++++++++-------------- net/sctp/ulpevent.c | 10 +++++----- 6 files changed, 48 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 553020cbf6f7..d35bdd30fa0f 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -427,15 +427,15 @@ struct sctp_shutdown_chunk { /* RFC 2960. 
Section 3.3.10 Operation Error (ERROR) (9) */ -typedef struct sctp_errhdr { +struct sctp_errhdr { __be16 cause; __be16 length; __u8 variable[0]; -} sctp_errhdr_t; +}; typedef struct sctp_operr_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_errhdr_t err_hdr; + struct sctp_errhdr err_hdr; } sctp_operr_chunk_t; /* RFC 2960 3.3.10 - Operation Error diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 45fd4c6056b5..84650fed1e6a 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -479,13 +479,13 @@ for (pos.v = chunk->member;\ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ -for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ +for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \ sizeof(struct sctp_chunkhdr));\ - ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <=\ + ((void *)err + offsetof(struct sctp_errhdr, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ - ntohs(err->length) >= sizeof(sctp_errhdr_t); \ - err = (sctp_errhdr_t *)((void *)err + SCTP_PAD4(ntohs(err->length)))) + ntohs(err->length) >= sizeof(struct sctp_errhdr); \ + err = (struct sctp_errhdr *)((void *)err + SCTP_PAD4(ntohs(err->length)))) #define sctp_walk_fwdtsn(pos, chunk)\ _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk)) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8f1c6b639ac1..0b2298bbb84e 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -135,14 +135,14 @@ static const struct sctp_paramhdr prsctp_param = { void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { - sctp_errhdr_t err; + struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. */ err.cause = cause_code; - len = sizeof(sctp_errhdr_t) + paylen; + len = sizeof(err) + paylen; err.length = htons(len); - chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(sctp_errhdr_t), &err); + chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); } /* A helper to initialize an op error inside a @@ -153,19 +153,19 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { - sctp_errhdr_t err; + struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. 
*/ err.cause = cause_code; - len = sizeof(sctp_errhdr_t) + paylen; + len = sizeof(err) + paylen; err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; - chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, - sizeof(sctp_errhdr_t), - &err); + + chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err); + return 0; } /* 3.3.2 Initiation (INIT) (1) @@ -979,8 +979,8 @@ struct sctp_chunk *sctp_make_abort_no_data( struct sctp_chunk *retval; __be32 payload; - retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) - + sizeof(tsn)); + retval = sctp_make_abort(asoc, chunk, + sizeof(struct sctp_errhdr) + sizeof(tsn)); if (!retval) goto no_mem; @@ -1015,7 +1015,8 @@ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, void *payload = NULL; int err; - retval = sctp_make_abort(asoc, NULL, sizeof(sctp_errhdr_t) + paylen); + retval = sctp_make_abort(asoc, NULL, + sizeof(struct sctp_errhdr) + paylen); if (!retval) goto err_chunk; @@ -1080,8 +1081,8 @@ struct sctp_chunk *sctp_make_abort_violation( struct sctp_chunk *retval; struct sctp_paramhdr phdr; - retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + paylen + - sizeof(phdr)); + retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + + paylen + sizeof(phdr)); if (!retval) goto end; @@ -1104,7 +1105,7 @@ struct sctp_chunk *sctp_make_violation_paramlen( { struct sctp_chunk *retval; static const char error[] = "The following parameter had invalid length:"; - size_t payload_len = sizeof(error) + sizeof(sctp_errhdr_t) + + size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); retval = sctp_make_abort(asoc, chunk, payload_len); @@ -1126,7 +1127,7 @@ struct sctp_chunk *sctp_make_violation_max_retrans( { struct sctp_chunk *retval; static const char error[] = "Association exceeded its max_retans count"; - size_t payload_len = sizeof(error) + sizeof(sctp_errhdr_t); + size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) @@ -1209,7 +1210,8 @@ static struct sctp_chunk *sctp_make_op_error_space( struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, - sizeof(sctp_errhdr_t) + size, GFP_ATOMIC); + sizeof(struct sctp_errhdr) + size, + GFP_ATOMIC); if (!retval) goto nodata; @@ -2966,7 +2968,7 @@ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, __be16 err_code, sctp_addip_param_t *asconf_param) { sctp_addip_param_t ack_param; - sctp_errhdr_t err_param; + struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; __be16 response_type; @@ -3351,7 +3353,7 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, int no_err) { sctp_addip_param_t *asconf_ack_param; - sctp_errhdr_t *err_param; + struct sctp_errhdr *err_param; int length; int asconf_ack_len; __be16 err_code; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index d6e5e9e0fd6d..5dda8c42b5f6 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -828,7 +828,7 @@ static void sctp_cmd_assoc_update(sctp_cmd_seq_t *cmds, if (!sctp_assoc_update(asoc, new)) return; - abort = sctp_make_abort(asoc, NULL, sizeof(sctp_errhdr_t)); + abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr)); if (abort) { sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d4d8fab52a11..7a2ba4c187d0 100644 --- 
a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1233,7 +1233,7 @@ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa, union sctp_addr_param *addrparm; struct sctp_errhdr *errhdr; struct sctp_endpoint *ep; - char buffer[sizeof(struct sctp_errhdr)+sizeof(union sctp_addr_param)]; + char buffer[sizeof(*errhdr) + sizeof(*addrparm)]; struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family); /* Build the error on the stack. We are way to malloc crazy @@ -1244,7 +1244,7 @@ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa, /* Copy into a parm format. */ len = af->to_addr_param(ssa, addrparm); - len += sizeof(sctp_errhdr_t); + len += sizeof(*errhdr); errhdr->cause = SCTP_ERROR_RESTART; errhdr->length = htons(len); @@ -2270,7 +2270,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - sctp_errhdr_t *err; + struct sctp_errhdr *err; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2337,7 +2337,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, struct sctp_chunk *chunk = arg, *reply; struct sctp_cookie_preserve_param bht; struct sctp_bind_addr *bp; - sctp_errhdr_t *err; + struct sctp_errhdr *err; u32 stale; if (attempts > asoc->max_init_attempts) { @@ -2348,7 +2348,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, return SCTP_DISPOSITION_DELETE_TCB; } - err = (sctp_errhdr_t *)(chunk->skb->data); + err = (struct sctp_errhdr *)(chunk->skb->data); /* When calculating the time extension, an implementation * SHOULD use the RTT information measured based on the @@ -2364,7 +2364,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, * to give ample time to retransmit the new cookie and thus * yield a higher probability of success on the reattempt. */ - stale = ntohl(*(__be32 *)((u8 *)err + sizeof(sctp_errhdr_t))); + stale = ntohl(*(__be32 *)((u8 *)err + sizeof(*err))); stale = (stale * 2) / 1000; bht.param_hdr.type = SCTP_PARAM_COOKIE_PRESERVATIVE; @@ -2499,13 +2499,14 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, /* See if we have an error cause code in the chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) { + struct sctp_errhdr *err; - sctp_errhdr_t *err; sctp_walk_errors(err, chunk->chunk_hdr); if ((void *)err != (void *)chunk->chunk_end) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, + commands); - error = ((sctp_errhdr_t *)chunk->skb->data)->cause; + error = ((struct sctp_errhdr *)chunk->skb->data)->cause; } sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET)); @@ -2552,7 +2553,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, /* See if we have an error cause code in the chunk. 
*/ len = ntohs(chunk->chunk_hdr->length); if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) - error = ((sctp_errhdr_t *)chunk->skb->data)->cause; + error = ((struct sctp_errhdr *)chunk->skb->data)->cause; return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED, asoc, chunk->transport); @@ -3310,7 +3311,7 @@ sctp_disposition_t sctp_sf_operr_notify(struct net *net, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; - sctp_errhdr_t *err; + struct sctp_errhdr *err; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -3433,7 +3434,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, struct sctp_chunk *chunk = arg; struct sk_buff *skb = chunk->skb; struct sctp_chunkhdr *ch; - sctp_errhdr_t *err; + struct sctp_errhdr *err; __u8 *ch_end; int ootb_shut_ack = 0; int ootb_cookie_ack = 0; @@ -3776,7 +3777,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, if (ADDIP_SERIAL_gte(rcvd_serial, sent_serial + 1) && !(asoc->addip_last_asconf)) { abort = sctp_make_abort(asoc, asconf_ack, - sizeof(sctp_errhdr_t)); + sizeof(struct sctp_errhdr)); if (abort) { sctp_init_cause(abort, SCTP_ERROR_ASCONF_ACK, 0); sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, @@ -3812,7 +3813,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, } abort = sctp_make_abort(asoc, asconf_ack, - sizeof(sctp_errhdr_t)); + sizeof(struct sctp_errhdr)); if (abort) { sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 5f86c5062a98..67abc0194f30 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -371,19 +371,19 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, struct sctp_chunk *chunk, __u16 flags, gfp_t gfp) { - struct sctp_ulpevent *event; struct sctp_remote_error *sre; + struct sctp_ulpevent *event; + struct sctp_errhdr *ch; struct sk_buff *skb; - sctp_errhdr_t *ch; __be16 cause; int elen; - ch = (sctp_errhdr_t *)(chunk->skb->data); + ch = (struct sctp_errhdr *)(chunk->skb->data); cause = ch->cause; - elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(*ch); /* Pull off the ERROR header. */ - skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); + skb_pull(chunk->skb, sizeof(*ch)); /* Copy the skb to a new skb with room for us to prepend * notification with. -- cgit v1.2.3 From 87caeba7914979ef6a5765dfa3163b51a30133ab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:12 +0800 Subject: sctp: remove the typedef sctp_operr_chunk_t This patch is to remove the typedef sctp_operr_chunk_t, and replace with struct sctp_operr_chunk in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_statefuns.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index d35bdd30fa0f..07c12e98fa85 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -433,10 +433,10 @@ struct sctp_errhdr { __u8 variable[0]; }; -typedef struct sctp_operr_chunk { +struct sctp_operr_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_errhdr err_hdr; -} sctp_operr_chunk_t; +}; /* RFC 2960 3.3.10 - Operation Error * diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7a2ba4c187d0..3b121d2ca309 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2278,7 +2278,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, /* Make sure that the ERROR chunk has a valid length. * The parameter walking depends on this as well. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_operr_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3317,7 +3317,7 @@ sctp_disposition_t sctp_sf_operr_notify(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the ERROR chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_operr_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); sctp_walk_errors(err, chunk->chunk_hdr); -- cgit v1.2.3 From 2a4932167772874c5bc4b3dfebf61cfadb5554b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:13 +0800 Subject: sctp: remove the typedef sctp_error_t This patch is to remove the typedef sctp_error_t, and replace with enum sctp_error in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_statefuns.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 07c12e98fa85..c74ea93e36eb 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -457,7 +457,7 @@ struct sctp_operr_chunk { * 9 No User Data * 10 Cookie Received While Shutting Down */ -typedef enum { +enum sctp_error { SCTP_ERROR_NO_ERROR = cpu_to_be16(0x00), SCTP_ERROR_INV_STRM = cpu_to_be16(0x01), @@ -512,7 +512,7 @@ typedef enum { * 0x0105 Unsupported HMAC Identifier */ SCTP_ERROR_UNSUP_HMAC = cpu_to_be16(0x0105) -} sctp_error_t; +}; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3b121d2ca309..1d1249962993 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -528,7 +528,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, (struct sctp_init_chunk *)chunk->chunk_hdr, chunk, &err_chunk)) { - sctp_error_t error = SCTP_ERROR_NO_RESOURCE; + enum sctp_error error = SCTP_ERROR_NO_RESOURCE; /* This chunk contains fatal error. It is to be discarded. * Send an ABORT, with causes. If there are no causes, -- cgit v1.2.3 From 1fb6d83bd37dc7d0949b8a8005f7c24dddc3ee1e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:14 +0800 Subject: sctp: remove the typedef sctp_ecnehdr_t This patch is to remove the typedef sctp_ecnehdr_t, and replace with struct sctp_ecnehdr in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_make_chunk.c | 4 ++-- net/sctp/sm_statefuns.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index c74ea93e36eb..5ea739b99f8d 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -519,13 +519,13 @@ enum sctp_error { /* RFC 2960. Appendix A. Explicit Congestion Notification. * Explicit Congestion Notification Echo (ECNE) (12) */ -typedef struct sctp_ecnehdr { +struct sctp_ecnehdr { __be32 lowest_tsn; -} sctp_ecnehdr_t; +}; typedef struct sctp_ecne_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_ecnehdr_t ence_hdr; + struct sctp_ecnehdr ence_hdr; } sctp_ecne_chunk_t; /* RFC 2960. Appendix A. Explicit Congestion Notification. diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0b2298bbb84e..1c7cc6a48bde 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -697,11 +697,11 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; - sctp_ecnehdr_t ecne; + struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, - sizeof(sctp_ecnehdr_t), GFP_ATOMIC); + sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 1d1249962993..de3e5bf88484 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2918,8 +2918,8 @@ sctp_disposition_t sctp_sf_do_ecne(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - sctp_ecnehdr_t *ecne; struct sctp_chunk *chunk = arg; + struct sctp_ecnehdr *ecne; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2928,8 +2928,8 @@ sctp_disposition_t sctp_sf_do_ecne(struct net *net, return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - ecne = (sctp_ecnehdr_t *) chunk->skb->data; - skb_pull(chunk->skb, sizeof(sctp_ecnehdr_t)); + ecne = (struct sctp_ecnehdr *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(*ecne)); /* If this is a newer ECNE than the last CWR packet we sent out */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_ECNE, -- cgit v1.2.3 From b515fd27591dea06bc9f0178fba361b43b76ab6b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:15 +0800 Subject: sctp: remove the typedef sctp_ecne_chunk_t This patch is to remove the typedef sctp_ecne_chunk_t, and replace with struct sctp_ecne_chunk in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- net/sctp/sm_statefuns.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 5ea739b99f8d..026bbdfcaf44 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -523,10 +523,10 @@ struct sctp_ecnehdr { __be32 lowest_tsn; }; -typedef struct sctp_ecne_chunk { +struct sctp_ecne_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ecnehdr ence_hdr; -} sctp_ecne_chunk_t; +}; /* RFC 2960. Appendix A. Explicit Congestion Notification. 
* Congestion Window Reduced (CWR) (13) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index de3e5bf88484..286dce14c5cc 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2869,7 +2869,7 @@ sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_ecne_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -2924,7 +2924,7 @@ sctp_disposition_t sctp_sf_do_ecne(struct net *net, if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_ecne_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); -- cgit v1.2.3 From 65f77105438a7793836d7ba2d9a05fa585b8caf9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:16 +0800 Subject: sctp: remove the typedef sctp_cwrhdr_t This patch is to remove the typedef sctp_cwrhdr_t, and replace with struct sctp_cwrhdr in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_make_chunk.c | 4 ++-- net/sctp/sm_statefuns.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 026bbdfcaf44..3c8c418425e6 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -531,13 +531,13 @@ struct sctp_ecne_chunk { /* RFC 2960. Appendix A. Explicit Congestion Notification. 
* Congestion Window Reduced (CWR) (13) */ -typedef struct sctp_cwrhdr { +struct sctp_cwrhdr { __be32 lowest_tsn; -} sctp_cwrhdr_t; +}; typedef struct sctp_cwr_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_cwrhdr_t cwr_hdr; + struct sctp_cwrhdr cwr_hdr; } sctp_cwr_chunk_t; /* PR-SCTP diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 1c7cc6a48bde..e8e506522193 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -663,11 +663,11 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; - sctp_cwrhdr_t cwr; + struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, - sizeof(sctp_cwrhdr_t), GFP_ATOMIC); + sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 286dce14c5cc..e13c83f9a6ee 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2862,8 +2862,8 @@ sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - sctp_cwrhdr_t *cwr; struct sctp_chunk *chunk = arg; + struct sctp_cwrhdr *cwr; u32 lowest_tsn; if (!sctp_vtag_verify(chunk, asoc)) @@ -2873,8 +2873,8 @@ sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - cwr = (sctp_cwrhdr_t *) chunk->skb->data; - skb_pull(chunk->skb, sizeof(sctp_cwrhdr_t)); + cwr = (struct sctp_cwrhdr *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(*cwr)); lowest_tsn = ntohl(cwr->lowest_tsn); -- cgit v1.2.3 From 05b25d0ba68d5b6d5d06f7c6cc3fec49ffa2df38 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:17 +0800 Subject: sctp: remove the typedef sctp_cwr_chunk_t Remove this typedef including the struct, there is even no places using it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 3c8c418425e6..2a97639eb035 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -535,11 +535,6 @@ struct sctp_cwrhdr { __be32 lowest_tsn; }; -typedef struct sctp_cwr_chunk { - struct sctp_chunkhdr chunk_hdr; - struct sctp_cwrhdr cwr_hdr; -} sctp_cwr_chunk_t; - /* PR-SCTP * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN) * -- cgit v1.2.3 From 8b32f2348a0441ef202562618b13d1bb494f3a47 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:18 +0800 Subject: sctp: remove the typedef sctp_addip_param_t This patch is to remove the typedef sctp_addip_param_t, and replace with struct sctp_addip_param in the places where it's using this typedef. It is to use sizeof(variable) instead of sizeof(type), and also fix some indent problems. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 8 ++++---- net/sctp/sm_make_chunk.c | 39 ++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 2a97639eb035..5b7d6d9d3fc5 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -629,10 +629,10 @@ struct sctp_fwdtsn_chunk { * The ASCONF Parameter Response is used in the ASCONF-ACK to * report status of ASCONF processing. 
*/ -typedef struct sctp_addip_param { - struct sctp_paramhdr param_hdr; - __be32 crr_id; -} sctp_addip_param_t; +struct sctp_addip_param { + struct sctp_paramhdr param_hdr; + __be32 crr_id; +}; typedef struct sctp_addiphdr { __be32 serial; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e8e506522193..bab3354ecdc3 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2615,7 +2615,7 @@ do_addr_param: if (!net->sctp.addip_enable) goto fall_through; - addr_param = param.v + sizeof(sctp_addip_param_t); + addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (af == NULL) @@ -2810,7 +2810,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, int addrcnt, __be16 flags) { - sctp_addip_param_t param; + struct sctp_addip_param param; struct sctp_chunk *retval; union sctp_addr_param addr_param; union sctp_addr *addr; @@ -2896,7 +2896,7 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { - sctp_addip_param_t param; + struct sctp_addip_param param; struct sctp_chunk *retval; int len = sizeof(param); union sctp_addr_param addrparam; @@ -2965,9 +2965,10 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as /* Add response parameters to an ASCONF_ACK chunk. */ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, - __be16 err_code, sctp_addip_param_t *asconf_param) + __be16 err_code, + struct sctp_addip_param *asconf_param) { - sctp_addip_param_t ack_param; + struct sctp_addip_param ack_param; struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; @@ -3006,15 +3007,15 @@ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, /* Process a asconf parameter. */ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, - struct sctp_chunk *asconf, - sctp_addip_param_t *asconf_param) + struct sctp_chunk *asconf, + struct sctp_addip_param *asconf_param) { struct sctp_transport *peer; struct sctp_af *af; union sctp_addr addr; union sctp_addr_param *addr_param; - addr_param = (void *)asconf_param + sizeof(sctp_addip_param_t); + addr_param = (void *)asconf_param + sizeof(*asconf_param); if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && @@ -3174,13 +3175,13 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, if (addr_param_needed && !addr_param_seen) return false; length = ntohs(param.addip->param_hdr.length); - if (length < sizeof(sctp_addip_param_t) + + if (length < sizeof(struct sctp_addip_param) + sizeof(**errp)) return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: - if (length != sizeof(sctp_addip_param_t)) + if (length != sizeof(struct sctp_addip_param)) return false; break; default: @@ -3289,7 +3290,7 @@ done: /* Process a asconf parameter that is successfully acked. 
*/ static void sctp_asconf_param_success(struct sctp_association *asoc, - sctp_addip_param_t *asconf_param) + struct sctp_addip_param *asconf_param) { struct sctp_af *af; union sctp_addr addr; @@ -3298,7 +3299,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_transport *transport; struct sctp_sockaddr_entry *saddr; - addr_param = (void *)asconf_param + sizeof(sctp_addip_param_t); + addr_param = (void *)asconf_param + sizeof(*asconf_param); /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); @@ -3349,10 +3350,10 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, * specific success indication is present for the parameter. */ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, - sctp_addip_param_t *asconf_param, - int no_err) + struct sctp_addip_param *asconf_param, + int no_err) { - sctp_addip_param_t *asconf_ack_param; + struct sctp_addip_param *asconf_ack_param; struct sctp_errhdr *err_param; int length; int asconf_ack_len; @@ -3370,8 +3371,8 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, * the first asconf_ack parameter. */ length = sizeof(sctp_addiphdr_t); - asconf_ack_param = (sctp_addip_param_t *)(asconf_ack->skb->data + - length); + asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + + length); asconf_ack_len -= length; while (asconf_ack_len > 0) { @@ -3380,7 +3381,7 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: - length = sizeof(sctp_addip_param_t); + length = sizeof(*asconf_ack_param); err_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; if (asconf_ack_len > 0) @@ -3407,7 +3408,7 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, { struct sctp_chunk *asconf = asoc->addip_last_asconf; union sctp_addr_param *addr_param; - sctp_addip_param_t *asconf_param; + struct sctp_addip_param *asconf_param; int length = 0; int asconf_len = asconf->skb->len; int all_param_pass = 0; -- cgit v1.2.3 From 65205cc465e9b37abbdbb3d595c46081b97e35bc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:19 +0800 Subject: sctp: remove the typedef sctp_addiphdr_t This patch is to remove the typedef sctp_addiphdr_t, and replace with struct sctp_addiphdr in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_make_chunk.c | 14 +++++++------- net/sctp/sm_statefuns.c | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 5b7d6d9d3fc5..83dac9b0b4bd 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -634,14 +634,14 @@ struct sctp_addip_param { __be32 crr_id; }; -typedef struct sctp_addiphdr { +struct sctp_addiphdr { __be32 serial; __u8 params[0]; -} sctp_addiphdr_t; +}; typedef struct sctp_addip_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_addiphdr_t addip_hdr; + struct sctp_addiphdr addip_hdr; } sctp_addip_chunk_t; /* AUTH diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index bab3354ecdc3..0e71e5054ead 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2752,7 +2752,7 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, union sctp_addr *addr, int vparam_len) { - sctp_addiphdr_t asconf; + struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; union sctp_addr_param addrparam; @@ -2945,7 +2945,7 @@ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { - sctp_addiphdr_t asconf; + struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; @@ -3210,7 +3210,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) asconf->chunk_hdr; bool all_param_pass = true; union sctp_params param; - sctp_addiphdr_t *hdr; + struct sctp_addiphdr *hdr; union sctp_addr_param *addr_param; struct sctp_chunk *asconf_ack; __be16 err_code; @@ -3220,11 +3220,11 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); - hdr = (sctp_addiphdr_t *)asconf->skb->data; + hdr = (struct sctp_addiphdr *)asconf->skb->data; serial = ntohl(hdr->serial); /* Skip the addiphdr and store a pointer to address parameter. */ - length = sizeof(sctp_addiphdr_t); + length = sizeof(*hdr); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); chunk_len -= length; @@ -3370,7 +3370,7 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, /* Skip the addiphdr from the asconf_ack chunk and store a pointer to * the first asconf_ack parameter. */ - length = sizeof(sctp_addiphdr_t); + length = sizeof(struct sctp_addiphdr); asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + length); asconf_ack_len -= length; @@ -3435,7 +3435,7 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, * failures are indicated, then all request(s) are considered * successful. */ - if (asconf_ack->skb->len == sizeof(sctp_addiphdr_t)) + if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr)) all_param_pass = 1; /* Process the TLVs contained in the last sent ASCONF chunk. 
*/ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index e13c83f9a6ee..d722f380b36e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3613,7 +3613,7 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, struct sctp_chunk *chunk = arg; struct sctp_chunk *asconf_ack = NULL; struct sctp_paramhdr *err_param = NULL; - sctp_addiphdr_t *hdr; + struct sctp_addiphdr *hdr; __u32 serial; if (!sctp_vtag_verify(chunk, asoc)) { @@ -3636,7 +3636,7 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - hdr = (sctp_addiphdr_t *)chunk->skb->data; + hdr = (struct sctp_addiphdr *)chunk->skb->data; serial = ntohl(hdr->serial); /* Verify the ASCONF chunk before processing it. */ @@ -3730,7 +3730,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, struct sctp_chunk *last_asconf = asoc->addip_last_asconf; struct sctp_chunk *abort; struct sctp_paramhdr *err_param = NULL; - sctp_addiphdr_t *addip_hdr; + struct sctp_addiphdr *addip_hdr; __u32 sent_serial, rcvd_serial; if (!sctp_vtag_verify(asconf_ack, asoc)) { @@ -3753,7 +3753,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - addip_hdr = (sctp_addiphdr_t *)asconf_ack->skb->data; + addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data; rcvd_serial = ntohl(addip_hdr->serial); /* Verify the ASCONF-ACK chunk before processing it. */ @@ -3762,7 +3762,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, (void *)err_param, commands); if (last_asconf) { - addip_hdr = (sctp_addiphdr_t *)last_asconf->subh.addip_hdr; + addip_hdr = (struct sctp_addiphdr *)last_asconf->subh.addip_hdr; sent_serial = ntohl(addip_hdr->serial); } else { sent_serial = asoc->addip_serial - 1; -- cgit v1.2.3 From 68d75469468620c37cd58dc352f1dcec8c3896b6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:20 +0800 Subject: sctp: remove the typedef sctp_addip_chunk_t This patch is to remove the typedef sctp_addip_chunk_t, and replace with struct sctp_addip_chunk in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/input.c | 2 +- net/sctp/sm_make_chunk.c | 8 +++++--- net/sctp/sm_statefuns.c | 11 +++++++---- 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 83dac9b0b4bd..e7b439ed6371 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -639,10 +639,10 @@ struct sctp_addiphdr { __u8 params[0]; }; -typedef struct sctp_addip_chunk { +struct sctp_addip_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_addiphdr addip_hdr; -} sctp_addip_chunk_t; +}; /* AUTH * Section 4.1 Authentication Chunk (AUTH) diff --git a/net/sctp/input.c b/net/sctp/input.c index 41eb2ec10460..92a07141fd07 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1111,7 +1111,7 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( __be16 peer_port, struct sctp_transport **transportp) { - sctp_addip_chunk_t *asconf = (struct sctp_addip_chunk *)ch; + struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; union sctp_addr_param *param; union sctp_addr paddr; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 0e71e5054ead..ae54c6ecd26a 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3140,10 +3140,11 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp) { - sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) chunk->chunk_hdr; + struct sctp_addip_chunk *addip; union sctp_params param; bool addr_param_seen = false; + addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip, addip_hdr.params) { size_t length = ntohs(param.p->length); @@ -3207,7 +3208,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { - sctp_addip_chunk_t *addip = (sctp_addip_chunk_t *) asconf->chunk_hdr; + struct sctp_addip_chunk *addip; bool all_param_pass = true; union sctp_params param; struct sctp_addiphdr *hdr; @@ -3218,6 +3219,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, int chunk_len; __u32 serial; + addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); hdr = (struct sctp_addiphdr *)asconf->skb->data; @@ -3419,7 +3421,7 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. */ - length = sizeof(sctp_addip_chunk_t); + length = sizeof(struct sctp_addip_chunk); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); asconf_len -= length; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d722f380b36e..9c235bb7aafe 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3629,10 +3629,11 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, * described in [I-D.ietf-tsvwg-sctp-auth]. */ if (!net->sctp.addip_noauth && !chunk->auth) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_discard_chunk(net, ep, asoc, type, arg, + commands); /* Make sure that the ASCONF ADDIP chunk has a valid length. 
*/ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_addip_chunk_t))) + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -3746,10 +3747,12 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, * described in [I-D.ietf-tsvwg-sctp-auth]. */ if (!net->sctp.addip_noauth && !asconf_ack->auth) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_discard_chunk(net, ep, asoc, type, arg, + commands); /* Make sure that the ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(asconf_ack, sizeof(sctp_addip_chunk_t))) + if (!sctp_chunk_length_valid(asconf_ack, + sizeof(struct sctp_addip_chunk))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); -- cgit v1.2.3 From 96f7ef4d58100d7168e0c4d01a59fbd8589f5756 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:21 +0800 Subject: sctp: remove the typedef sctp_authhdr_t This patch is to remove the typedef sctp_authhdr_t, and replace with struct sctp_authhdr in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- net/sctp/sm_make_chunk.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index e7b439ed6371..b7603f5efebe 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -693,15 +693,15 @@ struct sctp_addip_chunk { * HMAC: n bytes (unsigned integer) This hold the result of the HMAC * calculation. */ -typedef struct sctp_authhdr { +struct sctp_authhdr { __be16 shkey_id; __be16 hmac_id; __u8 hmac[0]; -} sctp_authhdr_t; +}; typedef struct sctp_auth_chunk { struct sctp_chunkhdr chunk_hdr; - sctp_authhdr_t auth_hdr; + struct sctp_authhdr auth_hdr; } sctp_auth_chunk_t; struct sctp_infox { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ae54c6ecd26a..d17e8d1f2ed9 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1282,16 +1282,16 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, - hmac_desc->hmac_len + sizeof(sctp_authhdr_t), - GFP_ATOMIC); + hmac_desc->hmac_len + sizeof(auth_hdr), + GFP_ATOMIC); if (!retval) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); auth_hdr.shkey_id = htons(asoc->active_key_id); - retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(sctp_authhdr_t), - &auth_hdr); + retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), + &auth_hdr); hmac = skb_put_zero(retval->skb, hmac_desc->hmac_len); -- cgit v1.2.3 From bb96dec74543bb3ceb4ac5caf39341dadb4cb559 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 15:42:22 +0800 Subject: sctp: remove the typedef sctp_auth_chunk_t This patch is to remove the typedef sctp_auth_chunk_t, and replace with struct sctp_auth_chunk in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/linux/sctp.h | 4 ++-- net/sctp/chunk.c | 2 +- net/sctp/sm_statefuns.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index b7603f5efebe..82b171e1aa0b 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -699,10 +699,10 @@ struct sctp_authhdr { __u8 hmac[0]; }; -typedef struct sctp_auth_chunk { +struct sctp_auth_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_authhdr auth_hdr; -} sctp_auth_chunk_t; +}; struct sctp_infox { struct sctp_info *sctpinfo; diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 681b181e7ae3..3afac275ee82 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -201,7 +201,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) - max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) + + max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 9c235bb7aafe..8af90a5f23cd 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4093,7 +4093,7 @@ static sctp_ierror_t sctp_sf_authenticate(struct net *net, /* Pull in the auth header, so we can do some more verification */ auth_hdr = (struct sctp_authhdr *)chunk->skb->data; chunk->subh.auth_hdr = auth_hdr; - skb_pull(chunk->skb, sizeof(struct sctp_authhdr)); + skb_pull(chunk->skb, sizeof(*auth_hdr)); /* Make sure that we support the HMAC algorithm from the auth * chunk. @@ -4112,7 +4112,8 @@ static sctp_ierror_t sctp_sf_authenticate(struct net *net, /* Make sure that the length of the signature matches what * we expect. */ - sig_len = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_auth_chunk_t); + sig_len = ntohs(chunk->chunk_hdr->length) - + sizeof(struct sctp_auth_chunk); hmac = sctp_auth_get_hmac(ntohs(auth_hdr->hmac_id)); if (sig_len != hmac->hmac_len) return SCTP_IERROR_PROTO_VIOLATION; @@ -4134,8 +4135,8 @@ static sctp_ierror_t sctp_sf_authenticate(struct net *net, memset(digest, 0, sig_len); sctp_auth_calculate_hmac(asoc, chunk->skb, - (struct sctp_auth_chunk *)chunk->chunk_hdr, - GFP_ATOMIC); + (struct sctp_auth_chunk *)chunk->chunk_hdr, + GFP_ATOMIC); /* Discard the packet if the digests do not match */ if (memcmp(save_digest, digest, sig_len)) { -- cgit v1.2.3 From 630b3aff8a51c90ef15b59c9560ac35e40e7ec09 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 1 Aug 2017 14:10:41 +0200 Subject: treewide: Consolidate Apple DMI checks We're about to amend ACPI bus scan with DMI checks whether we're running on a Mac to support Apple device properties in AML. The DMI checks are performed for every single device, adding overhead for everything x86 that isn't Apple, which is the majority. Rafael and Andy therefore request to perform the DMI match only once and cache the result. Outside of ACPI various other Apple DMI checks exist and it seems reasonable to use the cached value there as well. Rafael, Andy and Darren suggest performing the DMI check in arch code and making it available with a header in include/linux/platform_data/x86/. To this end, add early_platform_quirks() to arch/x86/kernel/quirks.c to perform the DMI check and invoke it from setup_arch(). Switch over all existing Apple DMI checks, thereby fixing two deficiencies: * They are now #defined to false on non-x86 arches and can thus be optimized away if they're located in cross-arch code. * Some of them only match "Apple Inc." 
but not "Apple Computer, Inc.", which is used by BIOSes released between January 2006 (when the first x86 Macs started shipping) and January 2007 (when the company name changed upon introduction of the iPhone). Suggested-by: Andy Shevchenko Suggested-by: Rafael J. Wysocki Suggested-by: Darren Hart Signed-off-by: Lukas Wunner Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/setup.h | 1 + arch/x86/kernel/early-quirks.c | 4 ++-- arch/x86/kernel/quirks.c | 10 +++++++++ arch/x86/kernel/setup.c | 2 ++ drivers/acpi/osi.c | 37 +++++++-------------------------- drivers/acpi/pci_root.c | 4 ++-- drivers/acpi/sbs.c | 25 ++-------------------- drivers/firmware/efi/apple-properties.c | 5 ++--- drivers/pci/quirks.c | 5 +++-- drivers/thunderbolt/icm.c | 13 ++++-------- drivers/thunderbolt/tb.c | 4 ++-- include/linux/platform_data/x86/apple.h | 13 ++++++++++++ 12 files changed, 51 insertions(+), 72 deletions(-) create mode 100644 include/linux/platform_data/x86/apple.h (limited to 'include') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index e4585a393965..a65cf544686a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,7 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index d907c3d8633f..a70a65ae798a 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,10 +12,10 @@ #include #include #include -#include #include #include #include +#include #include #include #include @@ -593,7 +593,7 @@ static void __init apple_airport_reset(int bus, int slot, int func) u64 addr; int i; - if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; /* Card may have been put into PCI_D3hot by grub quirk */ diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 0bee04d41bed..eaa591cfd98b 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -1,6 +1,7 @@ /* * This file contains work-arounds for x86 and x86_64 platform bugs. */ +#include #include #include @@ -656,3 +657,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif #endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d0498800..77b3c3af7cba 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1206,6 +1206,8 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); + early_platform_quirks(); + /* * Parse the ACPI tables for possible boot-time SMP configuration. 
*/ diff --git a/drivers/acpi/osi.c b/drivers/acpi/osi.c index 723bee58bbcf..19cdd8a783a9 100644 --- a/drivers/acpi/osi.c +++ b/drivers/acpi/osi.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "internal.h" @@ -257,12 +258,11 @@ bool acpi_osi_is_win8(void) } EXPORT_SYMBOL(acpi_osi_is_win8); -static void __init acpi_osi_dmi_darwin(bool enable, - const struct dmi_system_id *d) +static void __init acpi_osi_dmi_darwin(void) { - pr_notice("DMI detected to setup _OSI(\"Darwin\"): %s\n", d->ident); + pr_notice("DMI detected to setup _OSI(\"Darwin\"): Apple hardware\n"); osi_config.darwin_dmi = 1; - __acpi_osi_setup_darwin(enable); + __acpi_osi_setup_darwin(true); } static void __init acpi_osi_dmi_linux(bool enable, @@ -273,13 +273,6 @@ static void __init acpi_osi_dmi_linux(bool enable, __acpi_osi_setup_linux(enable); } -static int __init dmi_enable_osi_darwin(const struct dmi_system_id *d) -{ - acpi_osi_dmi_darwin(true, d); - - return 0; -} - static int __init dmi_enable_osi_linux(const struct dmi_system_id *d) { acpi_osi_dmi_linux(true, d); @@ -481,30 +474,16 @@ static struct dmi_system_id acpi_osi_dmi_table[] __initdata = { DMI_MATCH(DMI_PRODUCT_NAME, "1015PX"), }, }, - - /* - * Enable _OSI("Darwin") for all apple platforms. - */ - { - .callback = dmi_enable_osi_darwin, - .ident = "Apple hardware", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), - }, - }, - { - .callback = dmi_enable_osi_darwin, - .ident = "Apple hardware", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Computer, Inc."), - }, - }, {} }; static __init void acpi_osi_dmi_blacklisted(void) { dmi_check_system(acpi_osi_dmi_table); + + /* Enable _OSI("Darwin") for Apple platforms. */ + if (x86_apple_machine) + acpi_osi_dmi_darwin(); } int __init early_acpi_osi_init(void) diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 9eec3095e6c3..6fc204a52493 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -33,6 +33,7 @@ #include #include #include +#include #include /* for acpi_hest_init() */ #include "internal.h" @@ -431,8 +432,7 @@ static void negotiate_os_control(struct acpi_pci_root *root, int *no_aspm) * been called successfully. 
We know the feature set supported by the * platform, so avoid calling _OSC at all */ - - if (dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) { + if (x86_apple_machine) { root->osc_control_set = ~OSC_PCI_EXPRESS_PME_CONTROL; decode_osc_control(root, "OS assumes control of", root->osc_control_set); diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index ad0b13ad4bbb..a18463799ad7 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "sbshc.h" #include "battery.h" @@ -58,8 +58,6 @@ static unsigned int cache_time = 1000; module_param(cache_time, uint, 0644); MODULE_PARM_DESC(cache_time, "cache time in milliseconds"); -static bool sbs_manager_broken; - #define MAX_SBS_BAT 4 #define ACPI_SBS_BLOCK_MAX 32 @@ -632,31 +630,12 @@ static void acpi_sbs_callback(void *context) } } -static int disable_sbs_manager(const struct dmi_system_id *d) -{ - sbs_manager_broken = true; - return 0; -} - -static struct dmi_system_id acpi_sbs_dmi_table[] = { - { - .callback = disable_sbs_manager, - .ident = "Apple", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc.") - }, - }, - { }, -}; - static int acpi_sbs_add(struct acpi_device *device) { struct acpi_sbs *sbs; int result = 0; int id; - dmi_check_system(acpi_sbs_dmi_table); - sbs = kzalloc(sizeof(struct acpi_sbs), GFP_KERNEL); if (!sbs) { result = -ENOMEM; @@ -677,7 +656,7 @@ static int acpi_sbs_add(struct acpi_device *device) result = 0; - if (!sbs_manager_broken) { + if (!x86_apple_machine) { result = acpi_manager_get_info(sbs); if (!result) { sbs->manager_present = 1; diff --git a/drivers/firmware/efi/apple-properties.c b/drivers/firmware/efi/apple-properties.c index c473f4c5ca34..9f6bcf173b0e 100644 --- a/drivers/firmware/efi/apple-properties.c +++ b/drivers/firmware/efi/apple-properties.c @@ -18,8 +18,8 @@ #define pr_fmt(fmt) "apple-properties: " fmt #include -#include #include +#include #include #include #include @@ -191,8 +191,7 @@ static int __init map_properties(void) u64 pa_data; int ret; - if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.") && - !dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc.")) + if (!x86_apple_machine) return 0; pa_data = boot_params.hdr.setup_data; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b4cf6b..ba7b7c64379e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* isa_dma_bridge_buggy */ #include "pci.h" @@ -3447,7 +3448,7 @@ static void quirk_apple_poweroff_thunderbolt(struct pci_dev *dev) { acpi_handle bridge, SXIO, SXFP, SXLV; - if (!dmi_match(DMI_BOARD_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; if (pci_pcie_type(dev) != PCI_EXP_TYPE_UPSTREAM) return; @@ -3492,7 +3493,7 @@ static void quirk_apple_wait_for_thunderbolt(struct pci_dev *dev) struct pci_dev *sibling = NULL; struct pci_dev *nhi = NULL; - if (!dmi_match(DMI_BOARD_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; if (pci_pcie_type(dev) != PCI_EXP_TYPE_DOWNSTREAM) return; diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 8ee340290219..17eba072e1ed 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -13,9 +13,9 @@ */ #include -#include #include #include +#include #include #include #include @@ -102,11 +102,6 @@ static inline u64 get_route(u32 route_hi, u32 route_lo) return (u64)route_hi << 32 | route_lo; } -static inline bool is_apple(void) -{ - return dmi_match(DMI_BOARD_VENDOR, "Apple Inc."); -} - static bool icm_match(const struct 
tb_cfg_request *req, const struct ctl_pkg *pkg) { @@ -176,7 +171,7 @@ static int icm_request(struct tb *tb, const void *request, size_t request_size, static bool icm_fr_is_supported(struct tb *tb) { - return !is_apple(); + return !x86_apple_machine; } static inline int icm_fr_get_switch_index(u32 port) @@ -517,7 +512,7 @@ static bool icm_ar_is_supported(struct tb *tb) * Starting from Alpine Ridge we can use ICM on Apple machines * as well. We just need to reset and re-enable it first. */ - if (!is_apple()) + if (!x86_apple_machine) return true; /* @@ -1004,7 +999,7 @@ static int icm_start(struct tb *tb) * don't provide images publicly either. To be on the safe side * prevent root switch NVM upgrade on Macs for now. */ - tb->root_switch->no_nvm_upgrade = is_apple(); + tb->root_switch->no_nvm_upgrade = x86_apple_machine; ret = tb_switch_add(tb->root_switch); if (ret) diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index 1b02ca0b6129..0b22ad9d68b4 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "tb.h" #include "tb_regs.h" @@ -453,7 +453,7 @@ struct tb *tb_probe(struct tb_nhi *nhi) struct tb_cm *tcm; struct tb *tb; - if (!dmi_match(DMI_BOARD_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return NULL; tb = tb_domain_alloc(nhi, sizeof(*tcm)); diff --git a/include/linux/platform_data/x86/apple.h b/include/linux/platform_data/x86/apple.h new file mode 100644 index 000000000000..079e816c3c21 --- /dev/null +++ b/include/linux/platform_data/x86/apple.h @@ -0,0 +1,13 @@ +#ifndef PLATFORM_DATA_X86_APPLE_H +#define PLATFORM_DATA_X86_APPLE_H + +#ifdef CONFIG_X86 +/** + * x86_apple_machine - whether the machine is an x86 Apple Macintosh + */ +extern bool x86_apple_machine; +#else +#define x86_apple_machine false +#endif + +#endif -- cgit v1.2.3 From 004cb784018eaa2a078912180bd9999707c3c483 Mon Sep 17 00:00:00 2001 From: Matthew Minter Date: Mon, 31 Jul 2017 17:37:57 +0100 Subject: PCI: Remove unused pci_fixup_irqs() function Now we have removed all callers of pci_fixup_irqs() and migrated everything to pci_assign_irq(), delete the pci_fixup_irqs() function completely. Signed-off-by: Matthew Minter [lorenzo.pieralisi@arm.com: updated commit log] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas --- drivers/pci/setup-irq.c | 24 ------------------------ include/linux/pci.h | 2 -- 2 files changed, 26 deletions(-) (limited to 'include') diff --git a/drivers/pci/setup-irq.c b/drivers/pci/setup-irq.c index 81eda3d93a5d..69e3b56c32a4 100644 --- a/drivers/pci/setup-irq.c +++ b/drivers/pci/setup-irq.c @@ -67,27 +67,3 @@ void pci_assign_irq(struct pci_dev *dev) the real IRQ to use; the device does not use it. */ pcibios_update_irq(dev, irq); } - -void pci_fixup_irqs(u8 (*swizzle)(struct pci_dev *, u8 *), - int (*map_irq)(const struct pci_dev *, u8, u8)) -{ - /* - * Implement pci_fixup_irqs() through pci_assign_irq(). - * This code should be remove eventually, it is a wrapper - * around pci_assign_irq() interface to keep current - * pci_fixup_irqs() behaviour unchanged on architecture - * code still relying on its interface. 
- */ - struct pci_dev *dev = NULL; - struct pci_host_bridge *hbrg = NULL; - - for_each_pci_dev(dev) { - hbrg = pci_find_host_bridge(dev->bus); - hbrg->swizzle_irq = swizzle; - hbrg->map_irq = map_irq; - pci_assign_irq(dev); - hbrg->swizzle_irq = NULL; - hbrg->map_irq = NULL; - } -} -EXPORT_SYMBOL_GPL(pci_fixup_irqs); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..69034ab8a68e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1160,8 +1160,6 @@ void pci_assign_unassigned_bus_resources(struct pci_bus *bus); void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus); void pdev_enable_device(struct pci_dev *); int pci_enable_resources(struct pci_dev *, int mask); -void pci_fixup_irqs(u8 (*)(struct pci_dev *, u8 *), - int (*)(const struct pci_dev *, u8, u8)); void pci_assign_irq(struct pci_dev *dev); struct resource *pci_find_resource(struct pci_dev *dev, struct resource *res); #define HAVE_PCI_REQ_REGIONS 2 -- cgit v1.2.3 From 4eebedd8f1a6609739c2e9a9b020791b23cbcceb Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Thu, 3 Aug 2017 14:26:19 +0800 Subject: ACPICA: Divergences: reduce access size definitions ACPICA commit cf27b3c98883d2a15d932016792fcb8272ace96d The following commit introduces definition of access width to ACPICA. Commit: 2bece49394872d36bbc5767fd643deac05920c55 Subject: ACPI: SPCR: Use access width to determine mmio usage Actually the access bit width can be calculated via access width. It would be better to define a macro calculating bit width rather than defining fixed values. This patch thus cleans up the definitions to reduce divergences. Link: https://github.com/acpica/acpica/commit/cf27b3c9 Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/hwregs.c | 2 +- drivers/acpi/spcr.c | 9 +++++---- include/acpi/acrestyp.h | 7 ------- include/acpi/actypes.h | 7 +++++++ 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index de74a4c25085..acb417b58bbb 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -107,7 +107,7 @@ acpi_hw_get_access_bit_width(u64 address, ACPI_IS_ALIGNED(reg->bit_width, 8)) { access_bit_width = reg->bit_width; } else if (reg->access_width) { - access_bit_width = (1 << (reg->access_width + 2)); + access_bit_width = ACPI_ACCESS_BIT_WIDTH(reg->access_width); } else { access_bit_width = ACPI_ROUND_UP_POWER_OF_TWO_8(reg->bit_offset + diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c index 4ac3e06b41d8..1230f969256b 100644 --- a/drivers/acpi/spcr.c +++ b/drivers/acpi/spcr.c @@ -95,16 +95,17 @@ int __init parse_spcr(bool earlycon) } if (table->serial_port.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { - switch (table->serial_port.access_width) { + switch (ACPI_ACCESS_BIT_WIDTH(( + table->serial_port.access_width))) { default: pr_err("Unexpected SPCR Access Width. 
Defaulting to byte size\n"); - case ACPI_ACCESS_SIZE_BYTE: + case 8: iotype = "mmio"; break; - case ACPI_ACCESS_SIZE_WORD: + case 16: iotype = "mmio16"; break; - case ACPI_ACCESS_SIZE_DWORD: + case 32: iotype = "mmio32"; break; } diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h index 4f7f39a02820..343dbdcef20c 100644 --- a/include/acpi/acrestyp.h +++ b/include/acpi/acrestyp.h @@ -377,13 +377,6 @@ struct acpi_resource_generic_register { u64 address; }; -/* Generic Address Space Access Sizes */ -#define ACPI_ACCESS_SIZE_UNDEFINED 0 -#define ACPI_ACCESS_SIZE_BYTE 1 -#define ACPI_ACCESS_SIZE_WORD 2 -#define ACPI_ACCESS_SIZE_DWORD 3 -#define ACPI_ACCESS_SIZE_QWORD 4 - struct acpi_resource_gpio { u8 revision_id; u8 connection_type; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 2fcbaec8b368..0260be595d40 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -554,6 +554,13 @@ typedef u64 acpi_integer; #define ACPI_VALIDATE_RSDP_SIG(a) (!strncmp (ACPI_CAST_PTR (char, (a)), ACPI_SIG_RSDP, 8)) #define ACPI_MAKE_RSDP_SIG(dest) (memcpy (ACPI_CAST_PTR (char, (dest)), ACPI_SIG_RSDP, 8)) +/* + * Algorithm to obtain access bit width. + * Can be used with access_width of struct acpi_generic_address and access_size of + * struct acpi_resource_generic_register. + */ +#define ACPI_ACCESS_BIT_WIDTH(size) (1 << ((size) + 2)) + /******************************************************************************* * * Miscellaneous constants -- cgit v1.2.3 From 3bd38469df0a0f4b3c73e490538c8de104914a4a Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 3 Aug 2017 14:26:25 +0800 Subject: ACPICA: iASL: Add support for the SDEI table ACPICA commit 5ad4f0b7bf9e7ba175bd320cf7950f3b38799ff3 ACPI 6.2 adds support for the Software Delegated Exception Interface, which is described by "Software Delegated Exception Interface (SDEI)" ARM DEN0054A. Add the necessary types in the ACPICA header files and support for compiling/decompiling the table. Link: https://github.com/acpica/acpica/commit/5ad4f0b7 Signed-off-by: James Morse Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl2.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index d568948d3a82..686b6f8c09dc 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -76,6 +76,7 @@ #define ACPI_SIG_MCHI "MCHI" /* Management Controller Host Interface table */ #define ACPI_SIG_MSDM "MSDM" /* Microsoft Data Management Table */ #define ACPI_SIG_MTMR "MTMR" /* MID Timer table */ +#define ACPI_SIG_SDEI "SDEI" /* Software Delegated Exception Interface Table */ #define ACPI_SIG_SLIC "SLIC" /* Software Licensing Description Table */ #define ACPI_SIG_SPCR "SPCR" /* Serial Port Console Redirection table */ #define ACPI_SIG_SPMI "SPMI" /* Server Platform Management Interface table */ @@ -1132,6 +1133,19 @@ struct acpi_mtmr_entry { u32 irq; }; +/******************************************************************************* + * + * SDEI - Software Delegated Exception Interface Descriptor Table + * + * Conforms to "Software Delegated Exception Interface (SDEI)" ARM DEN0054A, + * May 8th, 2017. Copyright 2017 ARM Ltd. 
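+ *
+ * [Editor's note, not part of the original commit: the table body is
+ * only the common header so far, so a consumer would merely probe for
+ * the table's presence. A hypothetical caller might look like:
+ *
+ *	struct acpi_table_sdei *sdei;
+ *	acpi_status status;
+ *
+ *	status = acpi_get_table(ACPI_SIG_SDEI, 0,
+ *				(struct acpi_table_header **)&sdei);
+ *	if (ACPI_SUCCESS(status))
+ *		(firmware advertises the SDEI interface)
+ * ]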
+ * + ******************************************************************************/ + +struct acpi_table_sdei { + struct acpi_table_header header; /* Common ACPI table header */ +}; + /******************************************************************************* * * SLIC - Software Licensing Description Table -- cgit v1.2.3 From d5f23fe192b5b549a85e98ee9f0a3b8a0f9c04c3 Mon Sep 17 00:00:00 2001 From: Shao Ming Date: Thu, 3 Aug 2017 14:26:44 +0800 Subject: ACPICA: EFI/EDK2: Sort acpi.h inclusion order ACPICA commit 43ff22215f0fcd8ca80abec5712a07d2cc9a801d acpi.h inclusion order need to be changed to build EDK-II ports of acpidump on Windows as va_list is used before it's definition in that environment. As we only need to ensure order of acenv.h/acenvex.h to be pre/post ACPICA type definitions, inclusion order is changed to make MSVC builds happy. Lv Zheng. Link: https://github.com/acpica/acpica/commit/43ff2221 Signed-off-by: "Shao, Ming" Signed-off-by: Lv Zheng Signed-off-by: Shao Ming Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h index ca036620703c..0887d7cbb7e1 100644 --- a/include/acpi/acpi.h +++ b/include/acpi/acpi.h @@ -58,10 +58,10 @@ #include /* ACPICA data types and structures */ #include /* ACPICA exceptions */ #include /* ACPI table definitions */ -#include /* Error output and Debug macros */ #include /* Resource Descriptor structs */ +#include /* Extra environment-specific items */ +#include /* Error output and Debug macros */ #include /* OSL interfaces (ACPICA-to-OS) */ #include /* ACPI core subsystem external interfaces */ -#include /* Extra environment-specific items */ #endif /* __ACPI_H__ */ -- cgit v1.2.3 From 65082bfcb486ef06f93b7603a9045cdf0c0022e8 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Thu, 3 Aug 2017 14:26:50 +0800 Subject: ACPICA: CLib: Add short multiply/shift support ACPICA commit 01b8f5a2350b9cc329cd8402ac8faec36fc501f5 In order to build ACPICA EFI tools with EDK-II on Windows, 64-bit multiply/shift supports are also required to be implemented. Otherwise, MSVC complains: acpidump.lib(utstrtoul64.obj) : error LNK2001: unresolved external symbol __allmul acpidump.lib(uthex.obj) : error LNK2001: unresolved external symbol __aullshr Note: 1. This patch also splits _EDK2_EFI from _GNU_EFI as they might have different math64 supports. 2. Support of gcc math64 is not included in this patch. 3. Support of EDK2 arch independent math64 is done via linking to base_lib. This patch fixes this issue. Reported by Shao Ming, fixed by Lv Zheng. For Linux kernel, this patch is a functional no-op. Link: https://github.com/acpica/acpica/commit/01b8f5a2 Tested-by: "Shao, Ming" Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpica/acutils.h | 7 + drivers/acpi/acpica/uthex.c | 4 +- drivers/acpi/acpica/utmath.c | 222 ++++++++++++++++++++++++++++-- drivers/acpi/acpica/utprint.c | 2 +- drivers/acpi/acpica/utstrtoul64.c | 9 +- include/acpi/actypes.h | 1 + include/acpi/platform/acgcc.h | 4 + include/acpi/platform/aclinux.h | 2 + tools/power/acpi/tools/acpidump/apfiles.c | 2 +- tools/power/acpi/tools/acpidump/apmain.c | 2 +- 10 files changed, 237 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h index 2a3cc4296481..0a5601405584 100644 --- a/drivers/acpi/acpica/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -538,6 +538,13 @@ acpi_status acpi_ut_short_divide(u64 in_dividend, u32 divisor, u64 *out_quotient, u32 *out_remainder); +acpi_status +acpi_ut_short_multiply(u64 in_multiplicand, u32 multiplier, u64 *outproduct); + +acpi_status acpi_ut_short_shift_left(u64 operand, u32 count, u64 *out_result); + +acpi_status acpi_ut_short_shift_right(u64 operand, u32 count, u64 *out_result); + /* * utmisc */ diff --git a/drivers/acpi/acpica/uthex.c b/drivers/acpi/acpica/uthex.c index 6600bc257516..fb406daf47fa 100644 --- a/drivers/acpi/acpica/uthex.c +++ b/drivers/acpi/acpica/uthex.c @@ -69,8 +69,10 @@ static const char acpi_gbl_hex_to_ascii[] = { char acpi_ut_hex_to_ascii_char(u64 integer, u32 position) { + u64 index; - return (acpi_gbl_hex_to_ascii[(integer >> position) & 0xF]); + acpi_ut_short_shift_right(integer, position, &index); + return (acpi_gbl_hex_to_ascii[index & 0xF]); } /******************************************************************************* diff --git a/drivers/acpi/acpica/utmath.c b/drivers/acpi/acpica/utmath.c index aa0502d1d019..5f9c680076c4 100644 --- a/drivers/acpi/acpica/utmath.c +++ b/drivers/acpi/acpica/utmath.c @@ -47,15 +47,6 @@ #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmath") -/* - * Optional support for 64-bit double-precision integer divide. This code - * is configurable and is implemented in order to support 32-bit kernel - * environments where a 64-bit double-precision math library is not available. - * - * Support for a more normal 64-bit divide/modulo (with check for a divide- - * by-zero) appears after this optional section of code. - */ -#ifndef ACPI_USE_NATIVE_DIVIDE /* Structures used only for 64-bit divide */ typedef struct uint64_struct { u32 lo; @@ -69,6 +60,217 @@ typedef union uint64_overlay { } uint64_overlay; +/* + * Optional support for 64-bit double-precision integer multiply and shift. + * This code is configurable and is implemented in order to support 32-bit + * kernel environments where a 64-bit double-precision math library is not + * available. + */ +#ifndef ACPI_USE_NATIVE_MATH64 + +/******************************************************************************* + * + * FUNCTION: acpi_ut_short_multiply + * + * PARAMETERS: multiplicand - 64-bit multiplicand + * multiplier - 32-bit multiplier + * out_product - Pointer to where the product is returned + * + * DESCRIPTION: Perform a short multiply. + * + ******************************************************************************/ + +acpi_status +acpi_ut_short_multiply(u64 multiplicand, u32 multiplier, u64 *out_product) +{ + union uint64_overlay multiplicand_ovl; + union uint64_overlay product; + u32 carry32; + + ACPI_FUNCTION_TRACE(ut_short_multiply); + + multiplicand_ovl.full = multiplicand; + + /* + * The Product is 64 bits, the carry is always 32 bits, + * and is generated by the second multiply. 
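+ *
+ * [Editor's note, not part of the original commit: the identity used
+ * here is
+ *
+ *	(hi * 2^32 + lo) * m = (hi * m) * 2^32 + (lo * m)
+ *
+ * Each ACPI_MUL_64_BY_32 below yields one 32 x 32 -> 64 partial
+ * product; the upper half of (lo * m) comes back in carry32 and is
+ * folded into product.part.hi afterwards, while any carry out of
+ * (hi * m) would lie beyond bit 63 of the product and is dropped.]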
+ */ + ACPI_MUL_64_BY_32(0, multiplicand_ovl.part.hi, multiplier, + product.part.hi, carry32); + + ACPI_MUL_64_BY_32(0, multiplicand_ovl.part.lo, multiplier, + product.part.lo, carry32); + + product.part.hi += carry32; + + /* Return only what was requested */ + + if (out_product) { + *out_product = product.full; + } + + return_ACPI_STATUS(AE_OK); +} + +/******************************************************************************* + * + * FUNCTION: acpi_ut_short_shift_left + * + * PARAMETERS: operand - 64-bit shift operand + * count - 32-bit shift count + * out_result - Pointer to where the result is returned + * + * DESCRIPTION: Perform a short left shift. + * + ******************************************************************************/ + +acpi_status acpi_ut_short_shift_left(u64 operand, u32 count, u64 *out_result) +{ + union uint64_overlay operand_ovl; + + ACPI_FUNCTION_TRACE(ut_short_shift_left); + + operand_ovl.full = operand; + + if ((count & 63) >= 32) { + operand_ovl.part.hi = operand_ovl.part.lo; + operand_ovl.part.lo ^= operand_ovl.part.lo; + count = (count & 63) - 32; + } + ACPI_SHIFT_LEFT_64_BY_32(operand_ovl.part.hi, + operand_ovl.part.lo, count); + + /* Return only what was requested */ + + if (out_result) { + *out_result = operand_ovl.full; + } + + return_ACPI_STATUS(AE_OK); +} + +/******************************************************************************* + * + * FUNCTION: acpi_ut_short_shift_right + * + * PARAMETERS: operand - 64-bit shift operand + * count - 32-bit shift count + * out_result - Pointer to where the result is returned + * + * DESCRIPTION: Perform a short right shift. + * + ******************************************************************************/ + +acpi_status acpi_ut_short_shift_right(u64 operand, u32 count, u64 *out_result) +{ + union uint64_overlay operand_ovl; + + ACPI_FUNCTION_TRACE(ut_short_shift_right); + + operand_ovl.full = operand; + + if ((count & 63) >= 32) { + operand_ovl.part.lo = operand_ovl.part.hi; + operand_ovl.part.hi ^= operand_ovl.part.hi; + count = (count & 63) - 32; + } + ACPI_SHIFT_RIGHT_64_BY_32(operand_ovl.part.hi, + operand_ovl.part.lo, count); + + /* Return only what was requested */ + + if (out_result) { + *out_result = operand_ovl.full; + } + + return_ACPI_STATUS(AE_OK); +} +#else + +/******************************************************************************* + * + * FUNCTION: acpi_ut_short_multiply + * + * PARAMETERS: See function headers above + * + * DESCRIPTION: Native version of the ut_short_multiply function. + * + ******************************************************************************/ + +acpi_status +acpi_ut_short_multiply(u64 multiplicand, u32 multiplier, u64 *out_product) +{ + + ACPI_FUNCTION_TRACE(ut_short_multiply); + + /* Return only what was requested */ + + if (out_product) { + *out_product = multiplicand * multiplier; + } + + return_ACPI_STATUS(AE_OK); +} + +/******************************************************************************* + * + * FUNCTION: acpi_ut_short_shift_left + * + * PARAMETERS: See function headers above + * + * DESCRIPTION: Native version of the ut_short_shift_left function. 
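+ *
+ * [Editor's note, not part of the original commit: whether these
+ * native variants or the uint64_overlay emulation above gets built
+ * is selected per host:
+ *
+ *	#define ACPI_USE_NATIVE_MATH64	(acgcc.h and aclinux.h,
+ *					 added in this patch)
+ *
+ * Hosts that leave it undefined, such as 32-bit MSVC/EDK2 builds
+ * lacking the __allmul/__aullshr intrinsics, fall back to the
+ * emulated versions.]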
+ * + ******************************************************************************/ + +acpi_status acpi_ut_short_shift_left(u64 operand, u32 count, u64 *out_result) +{ + + ACPI_FUNCTION_TRACE(ut_short_shift_left); + + /* Return only what was requested */ + + if (out_result) { + *out_result = operand << count; + } + + return_ACPI_STATUS(AE_OK); +} + +/******************************************************************************* + * + * FUNCTION: acpi_ut_short_shift_right + * + * PARAMETERS: See function headers above + * + * DESCRIPTION: Native version of the ut_short_shift_right function. + * + ******************************************************************************/ + +acpi_status acpi_ut_short_shift_right(u64 operand, u32 count, u64 *out_result) +{ + + ACPI_FUNCTION_TRACE(ut_short_shift_right); + + /* Return only what was requested */ + + if (out_result) { + *out_result = operand >> count; + } + + return_ACPI_STATUS(AE_OK); +} +#endif + +/* + * Optional support for 64-bit double-precision integer divide. This code + * is configurable and is implemented in order to support 32-bit kernel + * environments where a 64-bit double-precision math library is not available. + * + * Support for a more normal 64-bit divide/modulo (with check for a divide- + * by-zero) appears after this optional section of code. + */ +#ifndef ACPI_USE_NATIVE_DIVIDE + /******************************************************************************* * * FUNCTION: acpi_ut_short_divide @@ -258,6 +460,7 @@ acpi_ut_divide(u64 in_dividend, } #else + /******************************************************************************* * * FUNCTION: acpi_ut_short_divide, acpi_ut_divide @@ -272,6 +475,7 @@ acpi_ut_divide(u64 in_dividend, * perform the divide. * ******************************************************************************/ + acpi_status acpi_ut_short_divide(u64 in_dividend, u32 divisor, u64 *out_quotient, u32 *out_remainder) diff --git a/drivers/acpi/acpica/utprint.c b/drivers/acpi/acpica/utprint.c index 7e6e1ae6140f..23cb472e5dc8 100644 --- a/drivers/acpi/acpica/utprint.c +++ b/drivers/acpi/acpica/utprint.c @@ -176,7 +176,7 @@ const char *acpi_ut_scan_number(const char *string, u64 *number_ptr) u64 number = 0; while (isdigit((int)*string)) { - number *= 10; + acpi_ut_short_multiply(number, 10, &number); number += *(string++) - '0'; } diff --git a/drivers/acpi/acpica/utstrtoul64.c b/drivers/acpi/acpica/utstrtoul64.c index f42be01d99fd..9633ee142855 100644 --- a/drivers/acpi/acpica/utstrtoul64.c +++ b/drivers/acpi/acpica/utstrtoul64.c @@ -276,8 +276,8 @@ static u64 acpi_ut_strtoul_base10(char *string, u32 flags) /* Convert and insert (add) the decimal digit */ - next_value = - (return_value * 10) + (ascii_digit - ACPI_ASCII_ZERO); + acpi_ut_short_multiply(return_value, 10, &next_value); + next_value += (ascii_digit - ACPI_ASCII_ZERO); /* Check for overflow (32 or 64 bit) - return current converted value */ @@ -335,9 +335,8 @@ static u64 acpi_ut_strtoul_base16(char *string, u32 flags) /* Convert and insert the hex digit */ - return_value = - (return_value << 4) | - acpi_ut_ascii_char_to_hex(ascii_digit); + acpi_ut_short_shift_left(return_value, 4, &return_value); + return_value |= acpi_ut_ascii_char_to_hex(ascii_digit); string++; valid_digits++; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 0260be595d40..dc3bb153420a 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -166,6 +166,7 @@ typedef u64 acpi_physical_address; #define ACPI_SIZE_MAX ACPI_UINT64_MAX #define 
ACPI_USE_NATIVE_DIVIDE /* Has native 64-bit integer support */ +#define ACPI_USE_NATIVE_MATH64 /* Has native 64-bit integer support */ /* * In the case of the Itanium Processor Family (IPF), the hardware does not diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index 97a7e21cfbe0..9c8f8b79644e 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -84,4 +84,8 @@ typedef __builtin_va_list va_list; #define COMPILER_VA_MACRO 1 +/* GCC supports native multiply/shift on 32-bit platforms */ + +#define ACPI_USE_NATIVE_MATH64 + #endif /* __ACGCC_H__ */ diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index afd95f28c8f0..1b473efd9eb6 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -128,6 +128,7 @@ /* Host-dependent types and defines for in-kernel ACPICA */ #define ACPI_MACHINE_WIDTH BITS_PER_LONG +#define ACPI_USE_NATIVE_MATH64 #define ACPI_EXPORT_SYMBOL(symbol) EXPORT_SYMBOL(symbol); #define strtoul simple_strtoul @@ -216,6 +217,7 @@ #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long #define ACPI_USE_NATIVE_DIVIDE +#define ACPI_USE_NATIVE_MATH64 #endif #ifndef __cdecl diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c index 31b5a7f74015..d686e11936c4 100644 --- a/tools/power/acpi/tools/acpidump/apfiles.c +++ b/tools/power/acpi/tools/acpidump/apfiles.c @@ -61,7 +61,7 @@ static int ap_is_existing_file(char *pathname); static int ap_is_existing_file(char *pathname) { -#ifndef _GNU_EFI +#if !defined(_GNU_EFI) && !defined(_EDK2_EFI) struct stat stat_info; if (!stat(pathname, &stat_info)) { diff --git a/tools/power/acpi/tools/acpidump/apmain.c b/tools/power/acpi/tools/acpidump/apmain.c index dd82afa897bd..943b6b614683 100644 --- a/tools/power/acpi/tools/acpidump/apmain.c +++ b/tools/power/acpi/tools/acpidump/apmain.c @@ -300,7 +300,7 @@ static int ap_do_options(int argc, char **argv) * ******************************************************************************/ -#ifndef _GNU_EFI +#if !defined(_GNU_EFI) && !defined(_EDK2_EFI) int ACPI_SYSTEM_XFACE main(int argc, char *argv[]) #else int ACPI_SYSTEM_XFACE acpi_main(int argc, char *argv[]) -- cgit v1.2.3 From 137c78352ea98db70d0f2aa648f723bb444faee8 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 3 Aug 2017 14:27:35 +0800 Subject: ACPICA: Resources: Allow _DMA method in walk resources ACPICA commit af661c00afac7aa481a961fa48c6540a99ad64a6 The _DMA object contains a resource template, this change adds support for the walk resources function so that ACPI devices containing a _DMA object can actually parse it to detect DMA ranges for the respective bus. Link: https://github.com/acpica/acpica/commit/af661c00 Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/rsxface.c | 7 ++++--- include/acpi/acnames.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/acpi/acpica/rsxface.c b/drivers/acpi/acpica/rsxface.c index 59a4f9ed06a7..be65e65e216e 100644 --- a/drivers/acpi/acpica/rsxface.c +++ b/drivers/acpi/acpica/rsxface.c @@ -615,7 +615,7 @@ ACPI_EXPORT_SYMBOL(acpi_walk_resource_buffer) * device we are querying * name - Method name of the resources we want. 
* (METHOD_NAME__CRS, METHOD_NAME__PRS, or - METHOD_NAME__AEI) + METHOD_NAME__AEI or METHOD_NAME__DMA) * user_function - Called for each resource * context - Passed to user_function * @@ -641,11 +641,12 @@ acpi_walk_resources(acpi_handle device_handle, if (!device_handle || !user_function || !name || (!ACPI_COMPARE_NAME(name, METHOD_NAME__CRS) && !ACPI_COMPARE_NAME(name, METHOD_NAME__PRS) && - !ACPI_COMPARE_NAME(name, METHOD_NAME__AEI))) { + !ACPI_COMPARE_NAME(name, METHOD_NAME__AEI) && + !ACPI_COMPARE_NAME(name, METHOD_NAME__DMA))) { return_ACPI_STATUS(AE_BAD_PARAMETER); } - /* Get the _CRS/_PRS/_AEI resource list */ + /* Get the _CRS/_PRS/_AEI/_DMA resource list */ buffer.length = ACPI_ALLOCATE_LOCAL_BUFFER; status = acpi_rs_get_method_data(device_handle, name, &buffer); diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h index b421584033a5..d8dd3bf51ca7 100644 --- a/include/acpi/acnames.h +++ b/include/acpi/acnames.h @@ -54,6 +54,7 @@ #define METHOD_NAME__CLS "_CLS" #define METHOD_NAME__CRS "_CRS" #define METHOD_NAME__DDN "_DDN" +#define METHOD_NAME__DMA "_DMA" #define METHOD_NAME__HID "_HID" #define METHOD_NAME__INI "_INI" #define METHOD_NAME__PLD "_PLD" -- cgit v1.2.3 From 4a9673dd50c2362f5bd2d03c89099267b49d400a Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Thu, 3 Aug 2017 14:27:48 +0800 Subject: ACPICA: Update version to 20170728 ACPICA commit 50f61636543a55d143c792a4814da7700a7fafc2 Version 20170728. Link: https://github.com/acpica/acpica/commit/50f61636 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 06b87adc97ef..53c5e2f7bcec 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -46,7 +46,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20170629 +#define ACPI_CA_VERSION 0x20170728 #include #include -- cgit v1.2.3 From 04b1d4e50e82536c12da00ee04a77510c459c844 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:11 +0200 Subject: net: core: Make the FIB notification chain generic The FIB notification chain is currently solely used by IPv4 code. However, we're going to introduce IPv6 FIB offload support, which requires these notifications as well. As explained in commit c3852ef7f2f8 ("ipv4: fib: Replay events when registering FIB notifier"), upon registration to the chain, the callee receives a full dump of the FIB tables and rules by traversing all the net namespaces. The integrity of the dump is ensured by a per-namespace sequence counter that is incremented whenever a change to the tables or rules occurs. In order to allow more address families to use the chain, each family is expected to register its fib_notifier_ops in its pernet init. These operations allow the common code to read the family's sequence counter as well as dump its tables and rules in the given net namespace. Additionally, a 'family' parameter is added to sent notifications, so that listeners can distinguish between the different families. Implement the common code that allows listeners to register to the chain and for address families to register their fib_notifier_ops. Subsequent patches will implement these operations in IPv6. In the future, ipmr and ip6mr will be extended to provide these notifications as well. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 1 + drivers/net/ethernet/rocker/rocker_main.c | 1 + include/net/fib_notifier.h | 44 ++++++ include/net/ip_fib.h | 30 +--- include/net/net_namespace.h | 1 + include/net/netns/ipv4.h | 1 + net/core/Makefile | 3 +- net/core/fib_notifier.c | 164 +++++++++++++++++++++ net/ipv4/fib_frontend.c | 17 ++- net/ipv4/fib_notifier.c | 94 +++++------- net/ipv4/fib_rules.c | 5 +- net/ipv4/fib_semantics.c | 9 +- net/ipv4/fib_trie.c | 5 +- 13 files changed, 282 insertions(+), 93 deletions(-) create mode 100644 include/net/fib_notifier.h create mode 100644 net/core/fib_notifier.c (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 2f03c7e71584..b79f9b67f285 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "spectrum.h" #include "core.h" diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index b1e5c07099fa..ef38c1a41bdd 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h new file mode 100644 index 000000000000..241475224f74 --- /dev/null +++ b/include/net/fib_notifier.h @@ -0,0 +1,44 @@ +#ifndef __NET_FIB_NOTIFIER_H +#define __NET_FIB_NOTIFIER_H + +#include +#include +#include + +struct fib_notifier_info { + struct net *net; + int family; +}; + +enum fib_event_type { + FIB_EVENT_ENTRY_REPLACE, + FIB_EVENT_ENTRY_APPEND, + FIB_EVENT_ENTRY_ADD, + FIB_EVENT_ENTRY_DEL, + FIB_EVENT_RULE_ADD, + FIB_EVENT_RULE_DEL, + FIB_EVENT_NH_ADD, + FIB_EVENT_NH_DEL, +}; + +struct fib_notifier_ops { + int family; + struct list_head list; + unsigned int (*fib_seq_read)(struct net *net); + int (*fib_dump)(struct net *net, struct notifier_block *nb); + struct rcu_head rcu; +}; + +int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info); +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info); +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)); +int unregister_fib_notifier(struct notifier_block *nb); +struct fib_notifier_ops * +fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net); +void fib_notifier_ops_unregister(struct fib_notifier_ops *ops); + +#endif diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ef8992d49bc3..c0295c3ec5f3 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -188,10 +189,6 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); #define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? 
: \ FIB_RES_SADDR(net, res)) -struct fib_notifier_info { - struct net *net; -}; - struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; @@ -212,25 +209,14 @@ struct fib_nh_notifier_info { struct fib_nh *fib_nh; }; -enum fib_event_type { - FIB_EVENT_ENTRY_REPLACE, - FIB_EVENT_ENTRY_APPEND, - FIB_EVENT_ENTRY_ADD, - FIB_EVENT_ENTRY_DEL, - FIB_EVENT_RULE_ADD, - FIB_EVENT_RULE_DEL, - FIB_EVENT_NH_ADD, - FIB_EVENT_NH_DEL, -}; - -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)); -int unregister_fib_notifier(struct notifier_block *nb); -int call_fib_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_notifier_info *info); -int call_fib_notifiers(struct net *net, enum fib_event_type event_type, +int call_fib4_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, struct fib_notifier_info *info); +int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info); + +int __net_init fib4_notifier_init(struct net *net); +void __net_exit fib4_notifier_exit(struct net *net); void fib_notify(struct net *net, struct notifier_block *nb); #ifdef CONFIG_IP_MULTIPLE_TABLES diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1c401bd4c2e0..57faa375eab9 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -88,6 +88,7 @@ struct net { /* core fib_rules */ struct list_head rules_ops; + struct list_head fib_notifier_ops; /* protected by net_mutex */ struct net_device *loopback_dev; /* The loopback */ struct netns_core core; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9a14a0850b0e..20d061c805e3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -159,6 +159,7 @@ struct netns_ipv4 { int sysctl_fib_multipath_hash_policy; #endif + struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* protected by rtnl_mutex */ atomic_t rt_genid; diff --git a/net/core/Makefile b/net/core/Makefile index d501c4278015..56d771a887b6 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,8 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ - sock_diag.o dev_ioctl.o tso.o sock_reuseport.o + sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ + fib_notifier.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c new file mode 100644 index 000000000000..292aab83702f --- /dev/null +++ b/net/core/fib_notifier.c @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include +#include +#include + +static ATOMIC_NOTIFIER_HEAD(fib_chain); + +int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return nb->notifier_call(nb, event_type, info); +} +EXPORT_SYMBOL(call_fib_notifier); + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return atomic_notifier_call_chain(&fib_chain, event_type, info); +} +EXPORT_SYMBOL(call_fib_notifiers); + +static unsigned int fib_seq_sum(void) +{ + struct fib_notifier_ops *ops; + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) { + list_for_each_entry(ops, &net->fib_notifier_ops, list) + fib_seq += 
ops->fib_seq_read(net); + } + rtnl_unlock(); + + return fib_seq; +} + +static int fib_net_dump(struct net *net, struct notifier_block *nb) +{ + struct fib_notifier_ops *ops; + + list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { + int err = ops->fib_dump(net, nb); + + if (err) + return err; + } + + return 0; +} + +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + int err; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + rcu_read_lock(); + for_each_net_rcu(net) { + err = fib_net_dump(net, nb); + if (err) + goto err_fib_net_dump; + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; + +err_fib_net_dump: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); + +static int __fib_notifier_ops_register(struct fib_notifier_ops *ops, + struct net *net) +{ + struct fib_notifier_ops *o; + + list_for_each_entry(o, &net->fib_notifier_ops, list) + if (ops->family == o->family) + return -EEXIST; + list_add_tail_rcu(&ops->list, &net->fib_notifier_ops); + return 0; +} + +struct fib_notifier_ops * +fib_notifier_ops_register(const struct fib_notifier_ops *tmpl, struct net *net) +{ + struct fib_notifier_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (!ops) + return ERR_PTR(-ENOMEM); + + err = __fib_notifier_ops_register(ops, net); + if (err) + goto err_register; + + return ops; + +err_register: + kfree(ops); + return ERR_PTR(err); +} +EXPORT_SYMBOL(fib_notifier_ops_register); + +void fib_notifier_ops_unregister(struct fib_notifier_ops *ops) +{ + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); +} +EXPORT_SYMBOL(fib_notifier_ops_unregister); + +static int __net_init fib_notifier_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->fib_notifier_ops); + return 0; +} + +static struct pernet_operations fib_notifier_net_ops = { + .init = fib_notifier_net_init, +}; + +static int __init fib_notifier_init(void) +{ + return register_pernet_subsys(&fib_notifier_net_ops); +} + +subsys_initcall(fib_notifier_init); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 044d2a159a3c..2cba559f14df 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1247,22 +1247,28 @@ static int __net_init ip_fib_net_init(struct net *net) int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; - net->ipv4.fib_seq = 0; + err = fib4_notifier_init(net); + if (err) + return err; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); - if (!net->ipv4.fib_table_hash) - return -ENOMEM; + if (!net->ipv4.fib_table_hash) { + err = -ENOMEM; + goto err_table_hash_alloc; + } err = fib4_rules_init(net); if (err < 0) - goto fail; + goto err_rules_init; return 0; -fail: +err_rules_init: kfree(net->ipv4.fib_table_hash); +err_table_hash_alloc: + 
fib4_notifier_exit(net); return err; } @@ -1292,6 +1298,7 @@ static void ip_fib_net_exit(struct net *net) #endif rtnl_unlock(); kfree(net->ipv4.fib_table_hash); + fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index e0714d975947..7cf1954bbadc 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -1,86 +1,66 @@ #include #include -#include +#include #include #include +#include #include #include -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -int call_fib_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_notifier_info *info) +int call_fib4_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) { - info->net = net; - return nb->notifier_call(nb, event_type, info); + info->family = AF_INET; + return call_fib_notifier(nb, net, event_type, info); } -int call_fib_notifiers(struct net *net, enum fib_event_type event_type, - struct fib_notifier_info *info) +int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) { + ASSERT_RTNL(); + + info->family = AF_INET; net->ipv4.fib_seq++; - info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); + return call_fib_notifiers(net, event_type, info); } -static unsigned int fib_seq_sum(void) +static unsigned int fib4_seq_read(struct net *net) { - unsigned int fib_seq = 0; - struct net *net; + ASSERT_RTNL(); - rtnl_lock(); - for_each_net(net) - fib_seq += net->ipv4.fib_seq; - rtnl_unlock(); - - return fib_seq; + return net->ipv4.fib_seq; } -static bool fib_dump_is_consistent(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb), - unsigned int fib_seq) +static int fib4_dump(struct net *net, struct notifier_block *nb) { - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) - return true; - atomic_notifier_chain_unregister(&fib_chain, nb); - if (cb) - cb(nb); - return false; + fib_rules_notify(net, nb); + fib_notify(net, nb); + + return 0; } -#define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) -{ - int retries = 0; +static const struct fib_notifier_ops fib4_notifier_ops_template = { + .family = AF_INET, + .fib_seq_read = fib4_seq_read, + .fib_dump = fib4_dump, +}; - do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; +int __net_init fib4_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; - /* Mutex semantics guarantee that every change done to - * FIB tries before we read the change sequence counter - * is now visible to us. 
- */ - rcu_read_lock(); - for_each_net_rcu(net) { - fib_rules_notify(net, nb); - fib_notify(net, nb); - } - rcu_read_unlock(); + net->ipv4.fib_seq = 0; - if (fib_dump_is_consistent(nb, cb, fib_seq)) - return 0; - } while (++retries < FIB_DUMP_MAX_RETRIES); + ops = fib_notifier_ops_register(&fib4_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.notifier_ops = ops; - return -EBUSY; + return 0; } -EXPORT_SYMBOL(register_fib_notifier); -int unregister_fib_notifier(struct notifier_block *nb) +void __net_exit fib4_notifier_exit(struct net *net) { - return atomic_notifier_chain_unregister(&fib_chain, nb); + fib_notifier_ops_unregister(net->ipv4.notifier_ops); } -EXPORT_SYMBOL(unregister_fib_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 778ecf977eb2..acdbf5a24ac9 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -32,6 +32,7 @@ #include #include #include +#include struct fib4_rule { struct fib_rule common; @@ -193,7 +194,7 @@ static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, .rule = rule, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_rule_notifiers(struct net *net, @@ -204,7 +205,7 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - return call_fib_notifiers(net, event_type, &info.info); + return call_fib4_notifiers(net, event_type, &info.info); } /* Called with rcu_read_lock() */ diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f62dc2463280..632b454ce77c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "fib_lookup.h" @@ -1451,14 +1452,14 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) break; - return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, - &info.info); + return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type, + &info.info); case FIB_EVENT_NH_DEL: if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) - return call_fib_notifiers(dev_net(fib_nh->nh_dev), - event_type, &info.info); + return call_fib4_notifiers(dev_net(fib_nh->nh_dev), + event_type, &info.info); default: break; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 64668c69dda6..1a6ffb0dab9c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include "fib_lookup.h" @@ -97,7 +98,7 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, .type = type, .tb_id = tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, net, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, @@ -113,7 +114,7 @@ static int call_fib_entry_notifiers(struct net *net, .type = type, .tb_id = tb_id, }; - return call_fib_notifiers(net, event_type, &info.info); + return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 -- cgit v1.2.3 From 1b2a4440858857f2f93bb2ec5bb3a60f4fcc25be Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:14 +0200 Subject: net: fib_rules: Implement notification logic in core Unlike the routing tables, the FIB rules share a common core, so instead of replicating the same logic for each address 
family we can simply dump the rules and send notifications from the core itself. To protect the integrity of the dump, a rules-specific sequence counter is added for each address family and incremented whenever a rule is added or deleted (under RTNL). Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/fib_rules.h | 9 +++++++ include/net/ip_fib.h | 24 +++++++++---------- net/core/fib_rules.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/fib_notifier.c | 9 +++++-- net/ipv4/fib_rules.c | 45 ++++++++--------------------------- 5 files changed, 101 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index c487bfa2f479..3d7f1cefc6f5 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -8,6 +8,7 @@ #include #include #include +#include struct fib_kuid_range { kuid_t start; @@ -57,6 +58,7 @@ struct fib_rules_ops { int addr_size; int unresolved_rules; int nr_goto_rules; + unsigned int fib_rules_seq; int (*action)(struct fib_rule *, struct flowi *, int, @@ -89,6 +91,11 @@ struct fib_rules_ops { struct rcu_head rcu; }; +struct fib_rule_notifier_info { + struct fib_notifier_info info; /* must be first */ + struct fib_rule *rule; +}; + #define FRA_GENERIC_POLICY \ [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \ [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \ @@ -143,6 +150,8 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family); +unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index c0295c3ec5f3..1a7f7e424320 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -199,11 +199,6 @@ struct fib_entry_notifier_info { u32 tb_id; }; -struct fib_rule_notifier_info { - struct fib_notifier_info info; /* must be first */ - struct fib_rule *rule; -}; - struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; @@ -219,13 +214,6 @@ int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_notify(struct net *net, struct notifier_block *nb); -#ifdef CONFIG_IP_MULTIPLE_TABLES -void fib_rules_notify(struct net *net, struct notifier_block *nb); -#else -static inline void fib_rules_notify(struct net *net, struct notifier_block *nb) -{ -} -#endif struct fib_table { struct hlist_node tb_hlist; @@ -298,6 +286,16 @@ static inline bool fib4_rule_default(const struct fib_rule *rule) return true; } +static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static inline unsigned int fib4_rules_seq_read(struct net *net) +{ + return 0; +} + #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); @@ -343,6 +341,8 @@ out: } bool fib4_rule_default(const struct fib_rule *rule); +int fib4_rules_dump(struct net *net, struct notifier_block *nb); +unsigned int fib4_rules_seq_read(struct net *net); #endif /* CONFIG_IP_MULTIPLE_TABLES */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index fdcb1bcd2afa..fc0b65093417 100644 --- 
a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -299,6 +299,67 @@ out: } EXPORT_SYMBOL_GPL(fib_rules_lookup); +static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule, int family) +{ + struct fib_rule_notifier_info info = { + .info.family = family, + .rule = rule, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_fib_rule_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule, + struct fib_rules_ops *ops) +{ + struct fib_rule_notifier_info info = { + .info.family = ops->family, + .rule = rule, + }; + + ops->fib_rules_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +/* Called with rcu_read_lock() */ +int fib_rules_dump(struct net *net, struct notifier_block *nb, int family) +{ + struct fib_rules_ops *ops; + struct fib_rule *rule; + + ops = lookup_rules_ops(net, family); + if (!ops) + return -EAFNOSUPPORT; + list_for_each_entry_rcu(rule, &ops->rules_list, list) + call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule, + family); + rules_ops_put(ops); + + return 0; +} +EXPORT_SYMBOL_GPL(fib_rules_dump); + +unsigned int fib_rules_seq_read(struct net *net, int family) +{ + unsigned int fib_rules_seq; + struct fib_rules_ops *ops; + + ASSERT_RTNL(); + + ops = lookup_rules_ops(net, family); + if (!ops) + return 0; + fib_rules_seq = ops->fib_rules_seq; + rules_ops_put(ops); + + return fib_rules_seq; +} +EXPORT_SYMBOL_GPL(fib_rules_seq_read); + static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rules_ops *ops) { @@ -548,6 +609,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -687,6 +749,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 7cf1954bbadc..5d7afb145562 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -29,12 +29,17 @@ static unsigned int fib4_seq_read(struct net *net) { ASSERT_RTNL(); - return net->ipv4.fib_seq; + return net->ipv4.fib_seq + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb) { - fib_rules_notify(net, nb); + int err; + + err = fib4_rules_dump(net, nb); + if (err) + return err; + fib_notify(net, nb); return 0; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index acdbf5a24ac9..35d646a62ad4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -32,7 +32,6 @@ #include #include #include -#include struct fib4_rule { struct fib_rule common; @@ -69,6 +68,16 @@ bool fib4_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib4_rule_default); +int fib4_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, AF_INET); +} + +unsigned int fib4_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, AF_INET); +} + int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { @@ -186,38 +195,6 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } -static int call_fib_rule_notifier(struct notifier_block 
*nb, struct net *net, - enum fib_event_type event_type, - struct fib_rule *rule) -{ - struct fib_rule_notifier_info info = { - .rule = rule, - }; - - return call_fib4_notifier(nb, net, event_type, &info.info); -} - -static int call_fib_rule_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib_rule *rule) -{ - struct fib_rule_notifier_info info = { - .rule = rule, - }; - - return call_fib4_notifiers(net, event_type, &info.info); -} - -/* Called with rcu_read_lock() */ -void fib_rules_notify(struct net *net, struct notifier_block *nb) -{ - struct fib_rules_ops *ops = net->ipv4.rules_ops; - struct fib_rule *rule; - - list_for_each_entry_rcu(rule, &ops->rules_list, list) - call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule); -} - static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { FRA_GENERIC_POLICY, [FRA_FLOW] = { .type = NLA_U32 }, @@ -274,7 +251,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule); err = 0; errout: @@ -296,7 +272,6 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule); errout: return err; } -- cgit v1.2.3 From e3ea973159d53559c5ae9a9dbc824da9aba6cac0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:15 +0200 Subject: ipv6: fib_rules: Check if rule is a default rule As explained in commit 3c71006d15fd ("ipv4: fib_rules: Check if rule is a default rule"), drivers supporting IPv6 FIB offload need to be able to sanitize the rules they don't support and potentially flush their tables. Add an IPv6 helper to check if a FIB rule is a default rule. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 5 +++++ net/ipv6/fib6_rules.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1a88008cc6f5..6000b0dc51ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -295,6 +295,7 @@ int ipv6_route_open(struct inode *inode, struct file *file); #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); +bool fib6_rule_default(const struct fib_rule *rule); #else static inline int fib6_rules_init(void) { @@ -304,5 +305,9 @@ static inline void fib6_rules_cleanup(void) { return ; } +static inline bool fib6_rule_default(const struct fib_rule *rule) +{ + return true; +} #endif #endif diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ec849d88a662..ef1fcee6bf16 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -29,6 +29,26 @@ struct fib6_rule { u8 tclass; }; +static bool fib6_rule_matchall(const struct fib_rule *rule) +{ + struct fib6_rule *r = container_of(rule, struct fib6_rule, common); + + if (r->dst.plen || r->src.plen || r->tclass) + return false; + return fib_rule_matchall(rule); +} + +bool fib6_rule_default(const struct fib_rule *rule) +{ + if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || + rule->l3mdev) + return false; + if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib6_rule_default); + struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { -- cgit v1.2.3 From 16ab6d7d4d8cc037bb4be12c2b849ac92787e1ff Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:16 +0200 Subject: ipv6: fib: Add FIB notifiers callbacks We're about to add IPv6 FIB offload support, so implement the necessary callbacks in IPv6 code, which will later allow us to add routes and rules notifications. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 11 ++++++++++ include/net/netns/ipv6.h | 1 + net/ipv6/Makefile | 2 +- net/ipv6/fib6_notifier.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/ip6_fib.c | 7 ++++++ 5 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 net/ipv6/fib6_notifier.c (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6000b0dc51ee..be8ddf3253dc 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -16,10 +16,12 @@ #include #include #include +#include #include #include #include #include +#include #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 @@ -292,6 +294,15 @@ int fib6_init(void); int ipv6_route_open(struct inode *inode, struct file *file); +int call_fib6_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info); +int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info); + +int __net_init fib6_notifier_init(struct net *net); +void __net_exit fib6_notifier_exit(struct net *net); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index de7745e2edcc..abdf3b40303b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -86,6 +86,7 @@ struct netns_ipv6 { atomic_t dev_addr_genid; atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; + struct fib_notifier_ops *notifier_ops; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 217e9ff0e24b..f8b24c2e0d77 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o + udp_offload.o seg6.o fib6_notifier.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c new file mode 100644 index 000000000000..c2bb1ab5b5eb --- /dev/null +++ b/net/ipv6/fib6_notifier.c @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include +#include +#include + +int call_fib6_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->family = AF_INET6; + return call_fib_notifier(nb, net, event_type, info); +} + +int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->family = AF_INET6; + return call_fib_notifiers(net, event_type, info); +} + +static unsigned int fib6_seq_read(struct net *net) +{ + return 0; +} + +static int fib6_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static const struct fib_notifier_ops fib6_notifier_ops_template = { + .family = AF_INET6, + .fib_seq_read = fib6_seq_read, + .fib_dump = fib6_dump, +}; + +int __net_init fib6_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv6.notifier_ops = ops; + + return 0; +} + +void __net_exit fib6_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv6.notifier_ops); +} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ebb299cf72b7..f93976e3f65c 100644 --- 
a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1839,6 +1839,11 @@ static void fib6_gc_timer_cb(unsigned long arg) static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + int err; + + err = fib6_notifier_init(net); + if (err) + return err; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); @@ -1891,6 +1896,7 @@ out_fib_table_hash: out_rt6_stats: kfree(net->ipv6.rt6_stats); out_timer: + fib6_notifier_exit(net); return -ENOMEM; } @@ -1907,6 +1913,7 @@ static void fib6_net_exit(struct net *net) kfree(net->ipv6.fib6_main_tbl); kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); + fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { -- cgit v1.2.3 From df77fe4d9865c6354372876632bcbceeda84f6c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:17 +0200 Subject: ipv6: fib: Add in-kernel notifications for route add / delete As with IPv4, allow listeners of the FIB notification chain to receive notifications whenever a route is added, replaced or deleted. This is done by placing calls to the FIB notification chain in the two lowest level functions that end up performing these operations - namely, fib6_add_rt2node() and fib6_del_route(). Unlike IPv4, APPEND notifications aren't sent as the kernel doesn't distinguish between "append" (NLM_F_CREATE|NLM_F_APPEND) and "prepend" (NLM_F_CREATE). If NLM_F_EXCL isn't set, duplicate routes are always added after the existing duplicate routes. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 +++++ net/ipv6/ip6_fib.c | 17 +++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index be8ddf3253dc..e2b292b79e99 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -258,6 +258,11 @@ typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, struct flowi6 *, int); +struct fib6_entry_notifier_info { + struct fib_notifier_info info; /* must be first */ + struct rt6_info *rt; +}; + /* * exported functions */ diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f93976e3f65c..595a57cbbc7b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -302,6 +303,17 @@ static void __net_init fib6_tables_init(struct net *net) #endif +static int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct rt6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + }; + + return call_fib6_notifiers(net, event_type, &info.info); +} + static int fib6_dump_node(struct fib6_walker *w) { int res; @@ -879,6 +891,8 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); + call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, + rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -906,6 +920,8 @@ add: rt->rt6i_node = fn; rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); + call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, + rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { @@ -1459,6 +1475,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, fib6_purge_rt(rt, fn, net); + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt); if 
(!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); rt6_release(rt); -- cgit v1.2.3 From dcb18f762f6ac83a6dc9cdc26dd694dcc167beb7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:18 +0200 Subject: ipv6: fib_rules: Dump rules during registration to FIB chain Allow users of the FIB notification chain to receive a complete view of the IPv6 FIB rules upon registration to the chain. The integrity of the dump is ensured by a per-family sequence counter that is incremented (under RTNL) whenever a rule is added or deleted. All the sequence counters are read (under RTNL) and summed, prior to and after the dump. If the counters differ, the dump is either restarted or the registration fails. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 10 ++++++++++ net/ipv6/fib6_notifier.c | 4 ++-- net/ipv6/fib6_rules.c | 11 +++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e2b292b79e99..dbe5537809f5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -312,6 +312,8 @@ void __net_exit fib6_notifier_exit(struct net *net); int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); +int fib6_rules_dump(struct net *net, struct notifier_block *nb); +unsigned int fib6_rules_seq_read(struct net *net); #else static inline int fib6_rules_init(void) { @@ -325,5 +327,13 @@ static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } +static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} +static inline unsigned int fib6_rules_seq_read(struct net *net) +{ + return 0; +} #endif #endif diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index c2bb1ab5b5eb..298efc678f3b 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -23,12 +23,12 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, static unsigned int fib6_seq_read(struct net *net) { - return 0; + return fib6_rules_seq_read(net); } static int fib6_dump(struct net *net, struct notifier_block *nb) { - return 0; + return fib6_rules_dump(net, nb); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index ef1fcee6bf16..2f29e4e33bd3 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -14,6 +14,7 @@ */ #include +#include #include #include @@ -49,6 +50,16 @@ bool fib6_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL_GPL(fib6_rule_default); +int fib6_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, AF_INET6); +} + +unsigned int fib6_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, AF_INET6); +} + struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { -- cgit v1.2.3 From e1ee0a5ba35d999caef94d659b4cb842e63aeb68 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:19 +0200 Subject: ipv6: fib: Dump tables during registration to FIB chain Dump all the FIB tables in each net namespace upon registration to the FIB notification chain so that the callee will have a complete view of the tables. The integrity of the dump is ensured by a per-table sequence counter that is incremented (under write lock) whenever a route is added or deleted from the table.
All the sequence counters are read (under each table's read lock) and summed, prior to and after the dump. If the counters differ, the dump is either restarted or the registration fails. While it's possible for a table to be modified after its counter has been read, this isn't really a problem. If it happened before it was read the second time, the comparison at the end will fail. If it happened afterwards, we're guaranteed to be notified about the change, as the notification block is registered prior to the second read. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 +++ net/ipv6/fib6_notifier.c | 10 ++++-- net/ipv6/ip6_fib.c | 92 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index dbe5537809f5..0b3052157e6b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -235,6 +235,7 @@ struct fib6_table { struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; + unsigned int fib_seq; #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -308,6 +309,9 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); +unsigned int fib6_tables_seq_read(struct net *net); +int fib6_tables_dump(struct net *net, struct notifier_block *nb); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 298efc678f3b..66a103ef7e86 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -23,12 +23,18 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, static unsigned int fib6_seq_read(struct net *net) { - return fib6_rules_seq_read(net); + return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } static int fib6_dump(struct net *net, struct notifier_block *nb) { - return fib6_rules_dump(net, nb); + int err; + + err = fib6_rules_dump(net, nb); + if (err) + return err; + + return fib6_tables_dump(net, nb); } static const struct fib_notifier_ops fib6_notifier_ops_template = { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 595a57cbbc7b..719c10480c74 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -303,6 +303,37 @@ static void __net_init fib6_tables_init(struct net *net) #endif +unsigned int fib6_tables_seq_read(struct net *net) +{ + unsigned int h, fib_seq = 0; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + struct fib6_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + read_lock_bh(&tb->tb6_lock); + fib_seq += tb->fib_seq; + read_unlock_bh(&tb->tb6_lock); + } + } + rcu_read_unlock(); + + return fib_seq; +} + +static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct rt6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + }; + + return call_fib6_notifier(nb, net, event_type, &info.info); +} + static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct rt6_info *rt) @@ -311,9 +342,70 @@ static int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; + rt->rt6i_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } +struct fib6_dump_arg { + struct net
*net; + struct notifier_block *nb; +}; + +static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) +{ + if (rt == arg->net->ipv6.ip6_null_entry) + return; + call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); +} + +static int fib6_node_dump(struct fib6_walker *w) +{ + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + fib6_rt_dump(rt, w->args); + w->leaf = NULL; + return 0; +} + +static void fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) +{ + w->root = &tb->tb6_root; + read_lock_bh(&tb->tb6_lock); + fib6_walk(net, w); + read_unlock_bh(&tb->tb6_lock); +} + +/* Called with rcu_read_lock() */ +int fib6_tables_dump(struct net *net, struct notifier_block *nb) +{ + struct fib6_dump_arg arg; + struct fib6_walker *w; + unsigned int h; + + w = kzalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return -ENOMEM; + + w->func = fib6_node_dump; + arg.net = net; + arg.nb = nb; + w->args = &arg; + + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + struct fib6_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb6_hlist) + fib6_table_dump(net, tb, w); + } + + kfree(w); + + return 0; +} + static int fib6_dump_node(struct fib6_walker *w) { int res; -- cgit v1.2.3 From 61e4d01e16acddadb9723143637a20417fa67ac9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:20 +0200 Subject: ipv6: fib: Add offload indication to routes Allow user space applications to see which routes are offloaded and which aren't by setting the RTNH_F_OFFLOAD flag when dumping them. To be consistent with IPv4, offload indication is provided on a per-nexthop basis. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 1 + net/ipv6/route.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02e14bc..33e2a5732bd1 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,6 +35,7 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 +#define RTF_OFFLOAD 0x20000000 /* offloaded route */ #define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d30c96a819d..aba07fce67fb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1820,6 +1820,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + if (cfg->fc_flags & RTF_OFFLOAD) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -3327,6 +3332,9 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + if (rt->rt6i_flags & RTF_OFFLOAD) + *flags |= RTNH_F_OFFLOAD; + /* not needed for multipath encoding b/c it has a rtnexthop struct */ if (!skip_oif && rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) -- cgit v1.2.3 From a460aa83963b185a32a6377eb486b6e613ac8e38 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:25 +0200 Subject: ipv6: fib: Add helpers to hold / drop a reference on rt6_info Similar to commit 1c677b3d2828 ("ipv4: fib: Add fib_info_hold() helper") and commit b423cb10807b ("ipv4: fib: Export free_fib_info()") add a helper to hold a reference on rt6_info and export rt6_release() to drop it
and potentially release the route. This is needed so that drivers capable of FIB offload could hold a reference on the route before queueing it for offload and drop it after the route has been programmed to the device's tables. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 16 ++++++++++++++++ net/ipv6/ip6_fib.c | 12 ++---------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0b3052157e6b..1d790ea40ea7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -187,6 +187,22 @@ static inline void ip6_rt_put(struct rt6_info *rt) dst_release(&rt->dst); } +void rt6_free_pcpu(struct rt6_info *non_pcpu_rt); + +static inline void rt6_hold(struct rt6_info *rt) +{ + atomic_inc(&rt->rt6i_ref); +} + +static inline void rt6_release(struct rt6_info *rt) +{ + if (atomic_dec_and_test(&rt->rt6i_ref)) { + rt6_free_pcpu(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); + } +} + enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index fe193b48ef61..69ed0043d117 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -154,7 +154,7 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } -static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) +void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -177,15 +177,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) free_percpu(non_pcpu_rt->rt6i_pcpu); non_pcpu_rt->rt6i_pcpu = NULL; } - -static void rt6_release(struct rt6_info *rt) -{ - if (atomic_dec_and_test(&rt->rt6i_ref)) { - rt6_free_pcpu(rt); - dst_dev_put(&rt->dst); - dst_release(&rt->dst); - } -} +EXPORT_SYMBOL_GPL(rt6_free_pcpu); static void fib6_link_table(struct net *net, struct fib6_table *tb) { -- cgit v1.2.3 From a7c391f04fe3259fb0417d71fec78ae28f25780e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:03 +0100 Subject: crypto: algapi - use separate dst and src operands for __crypto_xor() In preparation for introducing crypto_xor_cpy(), which will use separate operands for input and output, modify the __crypto_xor() implementation, which it will share with the existing crypto_xor(), and which provides the actual functionality when the inline version is not used. Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/algapi.c | 25 +++++++++++++++---------- include/crypto/algapi.h | 4 ++-- 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/crypto/algapi.c b/crypto/algapi.c index e4cc7615a139..aa699ff6c876 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -975,13 +975,15 @@ void crypto_inc(u8 *a, unsigned int size) } EXPORT_SYMBOL_GPL(crypto_inc); -void __crypto_xor(u8 *dst, const u8 *src, unsigned int len) +void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len) { int relalign = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { int size = sizeof(unsigned long); - int d = ((unsigned long)dst ^ (unsigned long)src) & (size - 1); + int d = (((unsigned long)dst ^ (unsigned long)src1) | + ((unsigned long)dst ^ (unsigned long)src2)) & + (size - 1); relalign = d ? 1 << __ffs(d) : size; @@ -992,34 +994,37 @@ void __crypto_xor(u8 *dst, const u8 *src, unsigned int len) * process the remainder of the input using optimal strides.
*/ while (((unsigned long)dst & (relalign - 1)) && len > 0) { - *dst++ ^= *src++; + *dst++ = *src1++ ^ *src2++; len--; } } while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) { - *(u64 *)dst ^= *(u64 *)src; + *(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2; dst += 8; - src += 8; + src1 += 8; + src2 += 8; len -= 8; } while (len >= 4 && !(relalign & 3)) { - *(u32 *)dst ^= *(u32 *)src; + *(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2; dst += 4; - src += 4; + src1 += 4; + src2 += 4; len -= 4; } while (len >= 2 && !(relalign & 1)) { - *(u16 *)dst ^= *(u16 *)src; + *(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2; dst += 2; - src += 2; + src1 += 2; + src2 += 2; len -= 2; } while (len--) - *dst++ ^= *src++; + *dst++ = *src1++ ^ *src2++; } EXPORT_SYMBOL_GPL(__crypto_xor); diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 436c4c2683c7..fd547f946bf8 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -192,7 +192,7 @@ static inline unsigned int crypto_queue_len(struct crypto_queue *queue) } void crypto_inc(u8 *a, unsigned int size); -void __crypto_xor(u8 *dst, const u8 *src, unsigned int size); +void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int size); static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size) { @@ -207,7 +207,7 @@ static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size) size -= sizeof(unsigned long); } } else { - __crypto_xor(dst, src, size); + __crypto_xor(dst, dst, src, size); } } -- cgit v1.2.3 From 45fe93dff2fb58b22de04c729f8447ba0f773d93 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 24 Jul 2017 11:28:04 +0100 Subject: crypto: algapi - make crypto_xor() take separate dst and src arguments There are quite a number of occurrences in the kernel of the pattern if (dst != src) memcpy(dst, src, walk.total % AES_BLOCK_SIZE); crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); or crypto_xor(keystream, src, nbytes); memcpy(dst, keystream, nbytes); where crypto_xor() is preceded or followed by a memcpy() invocation that is only there because crypto_xor() uses its output parameter as one of the inputs. To avoid having to add new instances of this pattern in the arm64 code, which will be refactored to implement non-SIMD fallbacks, add an alternative implementation called crypto_xor_cpy(), taking separate input and output arguments. This removes the need for the separate memcpy(). 
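In caller terms, the conversion collapses a copy-then-XOR pair into one call. A minimal sketch of the two shapes (illustrative buffer names only; the stand-in helpers below mirror the byte-wise semantics of crypto_xor() and crypto_xor_cpy(), not the optimized kernel implementations):

#include <string.h>

typedef unsigned char u8;

/* Byte-wise stand-ins with the same semantics as the kernel helpers. */
static void crypto_xor_ref(u8 *dst, const u8 *src, unsigned int n)
{
	while (n--)
		*dst++ ^= *src++;	/* in place: dst is also an input */
}

static void crypto_xor_cpy_ref(u8 *dst, const u8 *src1, const u8 *src2,
			       unsigned int n)
{
	while (n--)
		*dst++ = *src1++ ^ *src2++;	/* separate output */
}

/* Old caller shape: copy first when the buffers differ, then XOR. */
static void ctr_final_old(u8 *dst, const u8 *src, const u8 *keystream,
			  unsigned int nbytes)
{
	if (dst != src)
		memcpy(dst, src, nbytes);
	crypto_xor_ref(dst, keystream, nbytes);
}

/* New caller shape: one call, no intermediate copy. */
static void ctr_final_new(u8 *dst, const u8 *src, const u8 *keystream,
			  unsigned int nbytes)
{
	crypto_xor_cpy_ref(dst, src, keystream, nbytes);
}

Both compute dst = src ^ keystream; the second form simply avoids writing dst twice.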
Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/aes-ce-glue.c | 4 +--- arch/arm/crypto/aes-neonbs-glue.c | 5 ++--- arch/arm64/crypto/aes-glue.c | 4 +--- arch/arm64/crypto/aes-neonbs-glue.c | 5 ++--- arch/sparc/crypto/aes_glue.c | 3 +-- arch/x86/crypto/aesni-intel_glue.c | 4 ++-- arch/x86/crypto/blowfish_glue.c | 3 +-- arch/x86/crypto/cast5_avx_glue.c | 3 +-- arch/x86/crypto/des3_ede_glue.c | 3 +-- crypto/ctr.c | 3 +-- crypto/pcbc.c | 12 ++++-------- drivers/crypto/vmx/aes_ctr.c | 3 +-- drivers/md/dm-crypt.c | 11 +++++------ include/crypto/algapi.h | 19 +++++++++++++++++++ 14 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c index 0f966a8ca1ce..d0a9cec73707 100644 --- a/arch/arm/crypto/aes-ce-glue.c +++ b/arch/arm/crypto/aes-ce-glue.c @@ -285,9 +285,7 @@ static int ctr_encrypt(struct skcipher_request *req) ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, num_rounds(ctx), blocks, walk.iv); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index c76377961444..18768f330449 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -221,9 +221,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index bcf596b0197e..0da30e3b0e4b 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -241,9 +241,7 @@ static int ctr_encrypt(struct skcipher_request *req) aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds, blocks, walk.iv, first); - if (tdst != tsrc) - memcpy(tdst, tsrc, nbytes); - crypto_xor(tdst, tail, nbytes); + crypto_xor_cpy(tdst, tsrc, tail, nbytes); err = skcipher_walk_done(&walk, 0); } kernel_neon_end(); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index db2501d93550..9001aec16007 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -224,9 +224,8 @@ static int ctr_encrypt(struct skcipher_request *req) u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; - if (dst != src) - memcpy(dst, src, walk.total % AES_BLOCK_SIZE); - crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE); + crypto_xor_cpy(dst, src, final, + walk.total % AES_BLOCK_SIZE); err = skcipher_walk_done(&walk, 0); break; diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c index c90930de76ba..3cd4f6b198b6 100644 --- a/arch/sparc/crypto/aes_glue.c +++ b/arch/sparc/crypto/aes_glue.c @@ -344,8 +344,7 @@ static void ctr_crypt_final(struct crypto_sparc64_aes_ctx *ctx, ctx->ops->ecb_encrypt(&ctx->key[0], (const u64 *)ctrblk, keystream, AES_BLOCK_SIZE); - crypto_xor((u8 *) keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, (u8 *) keystream, src, nbytes); crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/aesni-intel_glue.c 
b/arch/x86/crypto/aesni-intel_glue.c index 4a55cdcdc008..5c15d6b57329 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -475,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, unsigned int nbytes = walk->nbytes; aesni_enc(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 17c05531dfd1..f9eca34301e2 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) unsigned int nbytes = walk->nbytes; blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, BF_BLOCK_SIZE); } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 8648158f3916..dbea6020ffe7 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index d6fc59aaaadf..30c0a37f4882 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, unsigned int nbytes = walk->nbytes; des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } diff --git a/crypto/ctr.c b/crypto/ctr.c index 477d9226ccaa..854d924f9d8e 100644 --- a/crypto/ctr.c +++ b/crypto/ctr.c @@ -65,8 +65,7 @@ static void crypto_ctr_crypt_final(struct blkcipher_walk *walk, unsigned int nbytes = walk->nbytes; crypto_cipher_encrypt_one(tfm, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, bsize); } diff --git a/crypto/pcbc.c b/crypto/pcbc.c index 29dd2b4a3b85..d9e45a958720 100644 --- a/crypto/pcbc.c +++ b/crypto/pcbc.c @@ -55,8 +55,7 @@ static int crypto_pcbc_encrypt_segment(struct skcipher_request *req, do { crypto_xor(iv, src, bsize); crypto_cipher_encrypt_one(tfm, dst, iv); - memcpy(iv, dst, bsize); - crypto_xor(iv, src, bsize); + crypto_xor_cpy(iv, dst, src, bsize); src += bsize; dst += bsize; @@ -79,8 +78,7 @@ static int crypto_pcbc_encrypt_inplace(struct skcipher_request *req, memcpy(tmpbuf, src, bsize); crypto_xor(iv, src, bsize); crypto_cipher_encrypt_one(tfm, src, iv); - memcpy(iv, tmpbuf, bsize); - crypto_xor(iv, src, bsize); + crypto_xor_cpy(iv, tmpbuf, src, bsize); src += bsize; } while ((nbytes -= bsize) >= bsize); @@ -127,8 +125,7 @@ static int crypto_pcbc_decrypt_segment(struct skcipher_request *req, do { crypto_cipher_decrypt_one(tfm, dst, src); crypto_xor(dst, iv, bsize); - memcpy(iv, src, bsize); - crypto_xor(iv, dst, bsize); + crypto_xor_cpy(iv, dst, src, bsize); src += bsize; dst += bsize; @@ -153,8 +150,7 @@ static int crypto_pcbc_decrypt_inplace(struct 
skcipher_request *req, memcpy(tmpbuf, src, bsize); crypto_cipher_decrypt_one(tfm, src, src); crypto_xor(src, iv, bsize); - memcpy(iv, tmpbuf, bsize); - crypto_xor(iv, src, bsize); + crypto_xor_cpy(iv, src, tmpbuf, bsize); src += bsize; } while ((nbytes -= bsize) >= bsize); diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c index 9c26d9e8dbea..17d84217dd76 100644 --- a/drivers/crypto/vmx/aes_ctr.c +++ b/drivers/crypto/vmx/aes_ctr.c @@ -104,8 +104,7 @@ static void p8_aes_ctr_final(struct p8_aes_ctr_ctx *ctx, pagefault_enable(); preempt_enable(); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, AES_BLOCK_SIZE); } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cdf6b1e12460..fa17e5452796 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -758,9 +758,8 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, int i, r; /* xor whitening with sector number */ - memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE); - crypto_xor(buf, (u8 *)§or, 8); - crypto_xor(&buf[8], (u8 *)§or, 8); + crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); + crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ desc->tfm = tcw->crc32_tfm; @@ -805,10 +804,10 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, } /* Calculate IV */ - memcpy(iv, tcw->iv_seed, cc->iv_size); - crypto_xor(iv, (u8 *)§or, 8); + crypto_xor_cpy(iv, tcw->iv_seed, (u8 *)§or, 8); if (cc->iv_size > 8) - crypto_xor(&iv[8], (u8 *)§or, cc->iv_size - 8); + crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, + cc->iv_size - 8); return r; } diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index fd547f946bf8..e3cebf640c00 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -211,6 +211,25 @@ static inline void crypto_xor(u8 *dst, const u8 *src, unsigned int size) } } +static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2, + unsigned int size) +{ + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + __builtin_constant_p(size) && + (size % sizeof(unsigned long)) == 0) { + unsigned long *d = (unsigned long *)dst; + unsigned long *s1 = (unsigned long *)src1; + unsigned long *s2 = (unsigned long *)src2; + + while (size > 0) { + *d++ = *s1++ ^ *s2++; + size -= sizeof(unsigned long); + } + } else { + __crypto_xor(dst, src1, src2, size); + } +} + int blkcipher_walk_done(struct blkcipher_desc *desc, struct blkcipher_walk *walk, int err); int blkcipher_walk_virt(struct blkcipher_desc *desc, -- cgit v1.2.3 From e652399edba99a5497f0d80f240c9075d3b43493 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Tue, 25 Jul 2017 14:12:11 -0500 Subject: crypto: ccp - Fix XTS-AES-128 support on v5 CCPs Version 5 CCPs have some new requirements for XTS-AES: the type field must be specified, and the key requires 512 bits, with each part occupying 256 bits and padded with zeroes. 
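To make the key layout concrete, here is a standalone sketch of how a 128-bit XTS key pair would be packed for a version 5 device (a hypothetical helper, not driver code; SLOT_BYTES stands in for the 32-byte storage-block slot, and key is assumed to hold the AES key followed by the tweak key):

#include <string.h>

#define SLOT_BYTES 32	/* one 256-bit storage-block slot (assumed) */

/* Each 16-byte key half goes at the end of its own 256-bit slot,
 * with the leading bytes zeroed, forming the 512-bit vector the
 * commit message describes.
 */
static void xts128_v5_key_layout(unsigned char dst[2 * SLOT_BYTES],
				 const unsigned char *key,
				 unsigned int half_len)
{
	unsigned int pad = SLOT_BYTES - half_len;	/* 16 for AES-128 */

	memset(dst, 0, 2 * SLOT_BYTES);
	memcpy(dst + pad, key, half_len);		/* cipher key half */
	memcpy(dst + SLOT_BYTES + pad,			/* tweak key half  */
	       key + half_len, half_len);
}

A version 3 device, by contrast, fits both halves into a single 32-byte slot, which is why the patch below gates the layout on the device version.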
cc: # 4.9.x+ Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-aes-xts.c | 4 ++- drivers/crypto/ccp/ccp-dev-v5.c | 2 ++ drivers/crypto/ccp/ccp-dev.h | 2 ++ drivers/crypto/ccp/ccp-ops.c | 43 ++++++++++++++++++++++++++------- include/linux/ccp.h | 3 ++- 5 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/crypto/ccp/ccp-crypto-aes-xts.c b/drivers/crypto/ccp/ccp-crypto-aes-xts.c index 58a4244b4752..3f26a415ef44 100644 --- a/drivers/crypto/ccp/ccp-crypto-aes-xts.c +++ b/drivers/crypto/ccp/ccp-crypto-aes-xts.c @@ -1,8 +1,9 @@ /* * AMD Cryptographic Coprocessor (CCP) AES XTS crypto API support * - * Copyright (C) 2013 Advanced Micro Devices, Inc. + * Copyright (C) 2013,2017 Advanced Micro Devices, Inc. * + * Author: Gary R Hook * Author: Tom Lendacky * * This program is free software; you can redistribute it and/or modify @@ -164,6 +165,7 @@ static int ccp_aes_xts_crypt(struct ablkcipher_request *req, memset(&rctx->cmd, 0, sizeof(rctx->cmd)); INIT_LIST_HEAD(&rctx->cmd.entry); rctx->cmd.engine = CCP_ENGINE_XTS_AES_128; + rctx->cmd.u.xts.type = CCP_AES_TYPE_128; rctx->cmd.u.xts.action = (encrypt) ? CCP_AES_ACTION_ENCRYPT : CCP_AES_ACTION_DECRYPT; rctx->cmd.u.xts.unit_size = unit_size; diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c index 6ace6dd5a239..65604fc65e8f 100644 --- a/drivers/crypto/ccp/ccp-dev-v5.c +++ b/drivers/crypto/ccp/ccp-dev-v5.c @@ -145,6 +145,7 @@ union ccp_function { #define CCP_AES_MODE(p) ((p)->aes.mode) #define CCP_AES_TYPE(p) ((p)->aes.type) #define CCP_XTS_SIZE(p) ((p)->aes_xts.size) +#define CCP_XTS_TYPE(p) ((p)->aes_xts.type) #define CCP_XTS_ENCRYPT(p) ((p)->aes_xts.encrypt) #define CCP_DES3_SIZE(p) ((p)->des3.size) #define CCP_DES3_ENCRYPT(p) ((p)->des3.encrypt) @@ -344,6 +345,7 @@ static int ccp5_perform_xts_aes(struct ccp_op *op) CCP5_CMD_PROT(&desc) = 0; function.raw = 0; + CCP_XTS_TYPE(&function) = op->u.xts.type; CCP_XTS_ENCRYPT(&function) = op->u.xts.action; CCP_XTS_SIZE(&function) = op->u.xts.unit_size; CCP5_CMD_FUNCTION(&desc) = function.raw; diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h index 1f6deee117a7..6810b65c1939 100644 --- a/drivers/crypto/ccp/ccp-dev.h +++ b/drivers/crypto/ccp/ccp-dev.h @@ -194,6 +194,7 @@ #define CCP_AES_CTX_SB_COUNT 1 #define CCP_XTS_AES_KEY_SB_COUNT 1 +#define CCP5_XTS_AES_KEY_SB_COUNT 2 #define CCP_XTS_AES_CTX_SB_COUNT 1 #define CCP_DES3_KEY_SB_COUNT 1 @@ -498,6 +499,7 @@ struct ccp_aes_op { }; struct ccp_xts_aes_op { + enum ccp_aes_type type; enum ccp_aes_action action; enum ccp_xts_aes_unit_size unit_size; }; diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 40c062ad8726..10e3be8d93ce 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -1038,6 +1038,8 @@ static int ccp_run_xts_aes_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_op op; unsigned int unit_size, dm_offset; bool in_place = false; + unsigned int sb_count; + enum ccp_aes_type aestype; int ret; switch (xts->unit_size) { @@ -1061,7 +1063,9 @@ static int ccp_run_xts_aes_cmd(struct ccp_cmd_queue *cmd_q, return -EINVAL; } - if (xts->key_len != AES_KEYSIZE_128) + if (xts->key_len == AES_KEYSIZE_128) + aestype = CCP_AES_TYPE_128; + else return -EINVAL; if (!xts->final && (xts->src_len & (AES_BLOCK_SIZE - 1))) @@ -1083,23 +1087,44 @@ static int ccp_run_xts_aes_cmd(struct ccp_cmd_queue *cmd_q, op.sb_key = cmd_q->sb_key; op.sb_ctx = cmd_q->sb_ctx; op.init = 1; + op.u.xts.type = aestype; 
op.u.xts.action = xts->action; op.u.xts.unit_size = xts->unit_size; - /* All supported key sizes fit in a single (32-byte) SB entry - * and must be in little endian format. Use the 256-bit byte - * swap passthru option to convert from big endian to little - * endian. + /* A version 3 device only supports 128-bit keys, which fits into a + * single SB entry. A version 5 device uses a 512-bit vector, so two + * SB entries. */ + if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) + sb_count = CCP_XTS_AES_KEY_SB_COUNT; + else + sb_count = CCP5_XTS_AES_KEY_SB_COUNT; ret = ccp_init_dm_workarea(&key, cmd_q, - CCP_XTS_AES_KEY_SB_COUNT * CCP_SB_BYTES, + sb_count * CCP_SB_BYTES, DMA_TO_DEVICE); if (ret) return ret; - dm_offset = CCP_SB_BYTES - AES_KEYSIZE_128; - ccp_set_dm_area(&key, dm_offset, xts->key, 0, xts->key_len); - ccp_set_dm_area(&key, 0, xts->key, dm_offset, xts->key_len); + if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) { + /* All supported key sizes must be in little endian format. + * Use the 256-bit byte swap passthru option to convert from + * big endian to little endian. + */ + dm_offset = CCP_SB_BYTES - AES_KEYSIZE_128; + ccp_set_dm_area(&key, dm_offset, xts->key, 0, xts->key_len); + ccp_set_dm_area(&key, 0, xts->key, xts->key_len, xts->key_len); + } else { + /* Version 5 CCPs use a 512-bit space for the key: each portion + * occupies 256 bits, or one entire slot, and is zero-padded. + */ + unsigned int pad; + + dm_offset = CCP_SB_BYTES; + pad = dm_offset - xts->key_len; + ccp_set_dm_area(&key, pad, xts->key, 0, xts->key_len); + ccp_set_dm_area(&key, dm_offset + pad, xts->key, xts->key_len, + xts->key_len); + } ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key, CCP_PASSTHRU_BYTESWAP_256BIT); if (ret) { diff --git a/include/linux/ccp.h b/include/linux/ccp.h index 3fd9a6dac344..7e9c991c95e0 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -1,7 +1,7 @@ /* * AMD Cryptographic Coprocessor (CCP) driver * - * Copyright (C) 2013,2016 Advanced Micro Devices, Inc. + * Copyright (C) 2013,2017 Advanced Micro Devices, Inc. * * Author: Tom Lendacky * Author: Gary R Hook @@ -229,6 +229,7 @@ enum ccp_xts_aes_unit_size { * AES operation the new IV overwrites the old IV. */ struct ccp_xts_aes_engine { + enum ccp_aes_type type; enum ccp_aes_action action; enum ccp_xts_aes_unit_size unit_size; -- cgit v1.2.3 From 704956ecf5bcdc14d14650f39f2b545b34c96265 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 31 Jul 2017 20:19:09 +0800 Subject: f2fs: support inode checksum This patch adds support for inode checksums in f2fs.
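A compact sketch of the checksum chaining the diff below implements (a generic bitwise CRC-32 stands in for the crypto_shash-based f2fs_chksum(); all names and prototypes here are illustrative, not the f2fs API):

#include <stdint.h>
#include <stddef.h>

/* Generic reflected CRC-32, used only as a stand-in for f2fs_chksum(). */
static uint32_t crc_chain(uint32_t seed, const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t c = seed;

	while (len--) {
		c ^= *p++;
		for (int i = 0; i < 8; i++)
			c = (c >> 1) ^ (0xedb88320u & -(c & 1u));
	}
	return c;
}

/* Seed once from the FS UUID at mount time; per inode, mix in the
 * inode number and generation, then checksum the on-disk inode with
 * the checksum field itself counted as zero.
 */
static uint32_t inode_chksum(uint32_t uuid_seed, uint32_t ino, uint32_t gen,
			     const uint8_t *ri, size_t blk_size, size_t cs_off)
{
	uint32_t zero = 0, c;

	c = crc_chain(uuid_seed, &ino, sizeof(ino));
	c = crc_chain(c, &gen, sizeof(gen));
	c = crc_chain(c, ri, cs_off);			/* bytes before field */
	c = crc_chain(c, &zero, sizeof(zero));		/* field counted as 0 */
	c = crc_chain(c, ri + cs_off + sizeof(zero),	/* bytes after field  */
		      blk_size - cs_off - sizeof(zero));
	return c;
}

Verification recomputes this value and compares it with the stored i_inode_checksum; on mismatch the node page read fails with -EBADMSG, as the node.c hunk below shows.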
Signed-off-by: Chao Yu [Jaegeuk Kim: fix verification flow] Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 32 ++++++++++++++++++++++ fs/f2fs/inode.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/node.c | 7 +++++ fs/f2fs/segment.c | 5 +++- fs/f2fs/super.c | 5 ++++ include/linux/f2fs_fs.h | 1 + 6 files changed, 119 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1c31e4e7bde6..7c1244ba92a8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -116,6 +116,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_ATOMIC_WRITE 0x0004 #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -1085,6 +1086,9 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; @@ -1176,6 +1180,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -2285,6 +2310,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * inode.c */ void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); @@ -2869,6 +2896,11 @@ static inline int f2fs_sb_has_project_quota(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); } +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f15e663a1a15..b4c401d456e7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -108,6 +108,76 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, 
struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 14b79a396a3f..0176035d8ced 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1171,6 +1171,11 @@ repeat: err = -EIO; goto out_err; } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " @@ -2275,6 +2280,8 @@ retry: F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; + + f2fs_inode_chksum_set(sbi, ipage); } new_ni = old_ni; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2a5672a20a62..8911f50ddef6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2214,9 +2214,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + if (add_list) { struct f2fs_bio_info *io; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c28576c5b30..fc757c8861b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1993,6 +1993,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. 
This is optional for host-aware diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 50f176683676..2a0c453d7235 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -240,6 +240,7 @@ struct f2fs_inode { __le16 i_extra_isize; /* extra inode attribute size */ __le16 i_padding; /* padding */ __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ }; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ -- cgit v1.2.3 From 98ba0bd5505dcbb90322a4be07bcfe6b8a18c73f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:37 -0400 Subject: sock: allocate skbs from optmem Add sock_omalloc and sock_ofree to be able to allocate control skbs, for instance for looping errors onto sk_error_queue. The transmit budget (sk_wmem_alloc) is involved in transmit skb shaping, most notably in TCP Small Queues. Using this budget for control packets would impact transmission. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 27 +++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 393c38e9f6aa..0f778d3c4300 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1531,6 +1531,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); diff --git a/net/core/sock.c b/net/core/sock.c index 742f68c9c84a..1261880bdcc8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1923,6 +1923,33 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } EXPORT_SYMBOL(sock_wmalloc); +static void sock_ofree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_omem_alloc); +} + +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority) +{ + struct sk_buff *skb; + + /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ + if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > + sysctl_optmem_max) + return NULL; + + skb = alloc_skb(size, priority); + if (!skb) + return NULL; + + atomic_add(skb->truesize, &sk->sk_omem_alloc); + skb->sk = sk; + skb->destructor = sock_ofree; + return skb; +} + /* * Allocate a memory block from the socket's option memory buffer. */ -- cgit v1.2.3 From 3ece782693c4b64d588dd217868558ab9a19bfe7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:38 -0400 Subject: sock: skb_copy_ubufs support for compound pages Refine skb_copy_ubufs to support compound pages. With upcoming TCP zerocopy sendmsg, such fragments may appear. The existing code replaces each page one for one. Splitting each compound page into an independent number of regular pages can result in exceeding the MAX_SKB_FRAGS limit if data is not exactly page aligned. Instead, fill all destination pages but the last to PAGE_SIZE. Split the existing alloc + copy loop into separate stages: 1. compute byte length and minimum number of pages to store this. 2. allocate 3. copy, filling each page except the last to PAGE_SIZE bytes 4.
update skb frag array Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++-- net/core/skbuff.c | 53 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index be76082f48aa..2f64e2bbb592 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1796,13 +1796,18 @@ static inline unsigned int skb_headlen(const struct sk_buff *skb) return skb->len - skb->data_len; } -static inline unsigned int skb_pagelen(const struct sk_buff *skb) +static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); - return len + skb_headlen(skb); + return len; +} + +static inline unsigned int skb_pagelen(const struct sk_buff *skb) +{ + return skb_headlen(skb) + __skb_pagelen(skb); } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0f0933b338d7..a95877a8ac8b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -932,17 +932,20 @@ EXPORT_SYMBOL_GPL(skb_morph); */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - int i; + struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; + int i, new_frags; + u32 d_off; - for (i = 0; i < num_frags; i++) { - skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - u32 p_off, p_len, copied; - struct page *p; - u8 *vaddr; + if (!num_frags) + return 0; + + if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) + return -EINVAL; + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); if (!page) { while (head) { @@ -952,17 +955,36 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } return -ENOMEM; } + set_page_private(page, (unsigned long)head); + head = page; + } + + page = head; + d_off = 0; + for (i = 0; i < num_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f), p, p_off, p_len, copied) { + u32 copy, done = 0; vaddr = kmap_atomic(p); - memcpy(page_address(page) + copied, vaddr + p_off, - p_len); + + while (done < p_len) { + if (d_off == PAGE_SIZE) { + d_off = 0; + page = (struct page *)page_private(page); + } + copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); + memcpy(page_address(page) + d_off, + vaddr + p_off + done, copy); + done += copy; + d_off += copy; + } kunmap_atomic(vaddr); } - - set_page_private(page, (unsigned long)head); - head = page; } /* skb frags release userspace buffers */ @@ -972,11 +994,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) uarg->callback(uarg, false); /* skb frags point to kernel buffers */ - for (i = num_frags - 1; i >= 0; i--) { - __skb_fill_page_desc(skb, i, head, 0, - skb_shinfo(skb)->frags[i].size); + for (i = 0; i < new_frags - 1; i++) { + __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); head = (struct page *)page_private(head); } + __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + skb_shinfo(skb)->nr_frags = new_frags; skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; return 0; -- cgit v1.2.3 From 52267790ef52d7513879238ca9fac22c1733e0e3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:39 -0400 Subject: 
sock: add MSG_ZEROCOPY The kernel supports zerocopy sendmsg in virtio and tap. Expand the infrastructure to support other socket types. Introduce a completion notification channel over the socket error queue. Notifications are returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid blocking the send/recv path on receiving notifications. Add reference counting, to support the skb split, merge, resize and clone operations possible with SOCK_STREAM and other socket types. The patch does not yet modify any datapaths. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 60 +++++++++++++++++++ include/linux/socket.h | 1 + include/net/sock.h | 2 + include/uapi/linux/errqueue.h | 3 + net/core/datagram.c | 55 ++++++++++------- net/core/skbuff.c | 133 ++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 2 + 7 files changed, 235 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2f64e2bbb592..59cff7aa494e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -429,6 +429,7 @@ enum { SKBTX_SCHED_TSTAMP = 1 << 6, }; +#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG) #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) @@ -445,8 +446,28 @@ struct ubuf_info { void (*callback)(struct ubuf_info *, bool zerocopy_success); void *ctx; unsigned long desc; + u16 zerocopy:1; + atomic_t refcnt; }; +#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) + +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); + +static inline void sock_zerocopy_get(struct ubuf_info *uarg) +{ + atomic_inc(&uarg->refcnt); +} + +void sock_zerocopy_put(struct ubuf_info *uarg); +void sock_zerocopy_put_abort(struct ubuf_info *uarg); + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg); + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -1214,6 +1235,45 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) return &skb_shinfo(skb)->hwtstamps; } +static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) +{ + bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY; + + return is_zcopy ? 
skb_uarg(skb) : NULL; +} + +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +{ + if (skb && uarg && !skb_zcopy(skb)) { + sock_zerocopy_get(uarg); + skb_shinfo(skb)->destructor_arg = uarg; + skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + } +} + +/* Release a reference on a zerocopy structure */ +static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + uarg->zerocopy = uarg->zerocopy && zerocopy; + sock_zerocopy_put(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + +/* Abort a zerocopy operation and revert zckey on error in send syscall */ +static inline void skb_zcopy_abort(struct sk_buff *skb) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + sock_zerocopy_put_abort(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b13db5163cc..8ad963cdc88c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -287,6 +287,7 @@ struct ucred { #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN +#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through diff --git a/include/net/sock.h b/include/net/sock.h index 0f778d3c4300..fe1a0bc25cd3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -294,6 +294,7 @@ struct sock_common { * @sk_stamp: time stamp of last packet received * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag @@ -462,6 +463,7 @@ struct sock { u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; + atomic_t sk_zckey; struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 07bdce1f444a..78fdf52d6b2f 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -18,10 +18,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP 2 #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 +#define SO_EE_ORIGIN_ZEROCOPY 5 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) +#define SO_EE_CODE_ZEROCOPY_COPIED 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3..2f3277945d35 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,27 +573,12 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -/** - * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter - * @skb: buffer to copy - * @from: the source to copy from - * - * The function will first copy up to headlen, and then pin the userspace - * pages and build frags through them. - * - * Returns 0, -EFAULT or -EMSGSIZE. 
- */ -int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) { - int len = iov_iter_count(from); - int copy = min_t(int, skb_headlen(skb), len); - int frag = 0; + int frag = skb_shinfo(skb)->nr_frags; - /* copy up to skb headlen */ - if (skb_copy_datagram_from_iter(skb, 0, from, copy)) - return -EFAULT; - - while (iov_iter_count(from)) { + while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; size_t start; ssize_t copied; @@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - copied = iov_iter_get_pages(from, pages, ~0U, + copied = iov_iter_get_pages(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; iov_iter_advance(from, copied); + length -= copied; truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - refcount_add(truesize, &skb->sk->sk_wmem_alloc); + if (sk && sk->sk_type == SOCK_STREAM) { + sk->sk_wmem_queued += truesize; + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } while (copied) { int size = min_t(int, copied, PAGE_SIZE - start); skb_fill_page_desc(skb, frag++, pages[n], start, size); @@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } return 0; } +EXPORT_SYMBOL(__zerocopy_sg_from_iter); + +/** + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter + * @skb: buffer to copy + * @from: the source to copy from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. 
+ */ +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; + + return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); +} EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a95877a8ac8b..0603e44950da 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -915,6 +915,139 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +{ + struct ubuf_info *uarg; + struct sk_buff *skb; + + WARN_ON_ONCE(!in_task()); + + skb = sock_omalloc(sk, 0, GFP_KERNEL); + if (!skb) + return NULL; + + BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); + uarg = (void *)skb->cb; + + uarg->callback = sock_zerocopy_callback; + uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1; + uarg->zerocopy = 1; + atomic_set(&uarg->refcnt, 0); + sock_hold(sk); + + return uarg; +} +EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); + +static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +{ + return container_of((void *)uarg, struct sk_buff, cb); +} + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +{ + struct sk_buff *skb = skb_from_uarg(uarg); + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + u16 id = uarg->desc; + + if (sock_flag(sk, SOCK_DEAD)) + goto release; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; + serr->ee.ee_data = id; + if (!success) + serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; + + skb_queue_tail(&sk->sk_error_queue, skb); + skb = NULL; + + sk->sk_error_report(sk); + +release: + consume_skb(skb); + sock_put(sk); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_callback); + +void sock_zerocopy_put(struct ubuf_info *uarg) +{ + if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + if (uarg->callback) + uarg->callback(uarg, uarg->zerocopy); + else + consume_skb(skb_from_uarg(uarg)); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put); + +void sock_zerocopy_put_abort(struct ubuf_info *uarg) +{ + if (uarg) { + struct sock *sk = skb_from_uarg(uarg)->sk; + + atomic_dec(&sk->sk_zckey); + + /* sock_zerocopy_put expects a ref. Most sockets take one per + * skb, which is zero on abort. tcp_sendmsg holds one extra, to + * avoid an skb send inside the main loop triggering uarg free. + */ + if (sk->sk_type != SOCK_STREAM) + atomic_inc(&uarg->refcnt); + + sock_zerocopy_put(uarg); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); + +extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg) +{ + struct iov_iter orig_iter = msg->msg_iter; + int err, orig_len = skb->len; + + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); + if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + /* Streams do not free skb on error. Reset to prev state. 
*/ + msg->msg_iter = orig_iter; + ___pskb_trim(skb, orig_len); + return err; + } + + skb_zcopy_set(skb, uarg); + return skb->len - orig_len; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); + +/* unused only until next patch in the series; will remove attribute */ +static int __attribute__((unused)) + skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, + gfp_t gfp_mask) +{ + if (skb_zcopy(orig)) { + if (skb_zcopy(nskb)) { + /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ + if (!gfp_mask) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + if (skb_uarg(nskb) == skb_uarg(orig)) + return 0; + if (skb_copy_ubufs(nskb, GFP_ATOMIC)) + return -EIO; + } + skb_zcopy_set(nskb, skb_uarg(orig)); + } + return 0; +} + /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify diff --git a/net/core/sock.c b/net/core/sock.c index 1261880bdcc8..e8b696858cad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1670,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); @@ -2722,6 +2723,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; + atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; -- cgit v1.2.3 From 76851d1212c11365362525e1e2c0a18c97478e6b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:40 -0400 Subject: sock: add SOCK_ZEROCOPY sockopt The send call ignores unknown flags. Legacy applications may already unwittingly pass MSG_ZEROCOPY. Continue to ignore this flag unless a socket opts in to zerocopy. Introduce socket option SO_ZEROCOPY to enable MSG_ZEROCOPY processing. Processes can also query this socket option to detect kernel support for the feature. Older kernels will return ENOPROTOOPT. Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ net/core/skbuff.c | 3 +++ net/core/sock.c | 18 ++++++++++++++++++ 13 files changed, 43 insertions(+) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 7b285dd4fe05..c6133a045352 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -109,4 +109,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index f1e3b20dce9f..9abf02d6855a 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -102,5 +102,7 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 5dd5c5d0d642..002eb85a6941 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -111,4 +111,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index f8f7b47e247f..e268e51a38d1 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 882823bec153..6c755bc07975 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -120,4 +120,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index c710db354ff2..ac82a3f26dbf 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index a0d4dc9f4eb2..3b2bf7ae703b 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_PEERGROUPS 0x4034 +#define SO_ZEROCOPY 0x4035 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 52a63f4175cb..a56916c83565 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -108,4 +108,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 186fd8199f54..b2f5c50d0947 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -98,6 +98,8 @@ #define SO_PEERGROUPS 0x003d +#define SO_ZEROCOPY 0x003e + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define 
SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 3eed2761c149..220059999e74 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -113,4 +113,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 9861be8da65e..e47c9e436221 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -104,4 +104,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0603e44950da..29e34bc6a17c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -922,6 +922,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) WARN_ON_ONCE(!in_task()); + if (!sock_flag(sk, SOCK_ZEROCOPY)) + return NULL; + skb = sock_omalloc(sk, 0, GFP_KERNEL); if (!skb) return NULL; diff --git a/net/core/sock.c b/net/core/sock.c index e8b696858cad..9ea988d25b0a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1055,6 +1055,20 @@ set_rcvbuf: if (val == 1) dst_negative_advice(sk); break; + + case SO_ZEROCOPY: + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + ret = -ENOTSUPP; + else if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + else if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -1383,6 +1397,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val64 = sock_gen_cookie(sk); break; + case SO_ZEROCOPY: + v.val = sock_flag(sk, SOCK_ZEROCOPY); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit v1.2.3 From 1f8b977ab32dc5d148f103326e80d9097f1cefb5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:41 -0400 Subject: sock: enable MSG_ZEROCOPY Prepare the datapath for refcounted ubuf_info. Clone ubuf_info with skb_zerocopy_clone() wherever needed due to skb split, merge, resize or clone. Split skb_orphan_frags into two variants. The split, merge, .. paths support reference counted zerocopy buffers, so do not do a deep copy. Add skb_orphan_frags_rx for paths that may loop packets to receive sockets. That is not allowed, as it may cause unbounded latency. Deep copy all zerocopy buffers, ref-counted or not, in this path. The exact locations to modify were chosen by exhaustively searching through all code that might modify skb_frag references and/or the SKBTX_DEV_ZEROCOPY tx_flags bit. The changes err on the safe side, in two ways. (1) legacy ubuf_info paths virtio and tap are not modified. They keep a 1:1 ubuf_info to sk_buff relationship. Calls to skb_orphan_frags still call skb_copy_ubufs and thus copy frags in this case. (2) not all copies deep in the stack are addressed yet. skb_shift, skb_split and skb_try_coalesce can be refined to avoid copying. These are not in the hot path and this patch is hairy enough as is, so that is left for future refinement. Signed-off-by: Willem de Bruijn Signed-off-by: David S.
Miller --- drivers/net/tun.c | 2 +- drivers/vhost/net.c | 1 + include/linux/skbuff.h | 14 +++++++++++++- net/core/dev.c | 4 ++-- net/core/skbuff.c | 48 +++++++++++++++++++----------------------------- 5 files changed, 36 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 68add55f8460..d21510d47aa2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -892,7 +892,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; skb_tx_timestamp(skb); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 06d044862e58..ba08b78ed630 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -533,6 +533,7 @@ static void handle_tx(struct vhost_net *net) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; + atomic_set(&ubuf->refcnt, 1); msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59cff7aa494e..e5387932c266 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2512,7 +2512,17 @@ static inline void skb_orphan(struct sk_buff *skb) */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { - if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))) + if (likely(!skb_zcopy(skb))) + return 0; + if (skb_uarg(skb)->callback == sock_zerocopy_callback) + return 0; + return skb_copy_ubufs(skb, gfp_mask); +} + +/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ +static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) +{ + if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } @@ -2944,6 +2954,8 @@ static inline int skb_add_data(struct sk_buff *skb, static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { + if (skb_zcopy(skb)) + return false; if (i) { const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; diff --git a/net/core/dev.c b/net/core/dev.c index 8ea6b4b42611..1d75499add72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@ -4412,7 +4412,7 @@ skip_classify: } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29e34bc6a17c..74d3c36f8419 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = shinfo->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } - if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); + skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -695,14 
+684,7 @@ EXPORT_SYMBOL(kfree_skb_list); */ void skb_tx_error(struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; - } + skb_zcopy_clear(skb, true); } EXPORT_SYMBOL(skb_tx_error); @@ -1029,9 +1011,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); -/* unused only until next patch in the series; will remove attribute */ -static int __attribute__((unused)) - skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { @@ -1068,7 +1048,6 @@ static int __attribute__((unused)) */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, new_frags; @@ -1127,8 +1106,6 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg, false); - /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); @@ -1137,7 +1114,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); @@ -1298,7 +1275,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_orphan_frags(skb, gfp_mask)) { + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; @@ -1375,9 +1353,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, * be since all we did is relocate the values */ if (skb_cloned(skb)) { - /* copy this zero copy skb frags */ if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; + if (skb_zcopy(skb)) + atomic_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); @@ -1872,6 +1851,9 @@ end: skb->tail += delta; skb->data_len -= delta; + if (!skb->data_len) + skb_zcopy_clear(skb, false); + return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); @@ -2627,6 +2609,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_tx_error(from); return -ENOMEM; } + skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { if (!len) @@ -2924,6 +2907,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. 
*/ @@ -2967,6 +2951,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_headlen(skb)) return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; todo = shiftlen; from = 0; @@ -3540,6 +3526,8 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) + goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -4663,6 +4651,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_has_frag_list(to) || skb_has_frag_list(from)) return false; + if (skb_zcopy(to) || skb_zcopy(from)) + return false; if (skb_headlen(from) != 0) { struct page *page; -- cgit v1.2.3 From 4ab6c99d99bb1bf0fbba8ff4e52114c66109992f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:42 -0400 Subject: sock: MSG_ZEROCOPY notification coalescing In the simple case, each sendmsg() call generates data and eventually a zerocopy ready notification N, where N indicates the Nth successful invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket. TCP and corked sockets can cause send() calls to append new data to an existing sk_buff and, thus, ubuf_info. In that case the notification must hold a range. Modify ubuf_info to store an inclusive range [N..N+m] and add skb_zerocopy_realloc() to optionally extend an existing range. Also coalesce notifications in this common case: if a notification [1, 1] is about to be queued while [0, 0] is the queue tail, just modify the head of the queue to read [0, 1]. Coalescing is limited to a few TSO frames worth of data to bound notification latency. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 +++++++-- net/core/skbuff.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e5387932c266..f5bdd93a87da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -444,15 +444,26 @@ enum { */ struct ubuf_info { void (*callback)(struct ubuf_info *, bool zerocopy_success); - void *ctx; - unsigned long desc; - u16 zerocopy:1; + union { + struct { + unsigned long desc; + void *ctx; + }; + struct { + u32 id; + u16 len; + u16 zerocopy:1; + u32 bytelen; + }; + }; atomic_t refcnt; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg); static inline void sock_zerocopy_get(struct ubuf_info *uarg) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74d3c36f8419..dcee0f64f1fa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -915,7 +915,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->callback = sock_zerocopy_callback; - uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1; + uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; + uarg->len = 1; + uarg->bytelen = size; uarg->zerocopy = 1; atomic_set(&uarg->refcnt, 0); sock_hold(sk); @@ -929,26 +931,101 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) return container_of((void *)uarg, struct sk_buff, cb); } +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) +{ + if (uarg) { + const u32 byte_limit = 1 << 19; /* limit to a few TSO */ + u32 bytelen, next; + + /* realloc
only when socket is locked (TCP, UDP cork), + * so uarg->len and sk_zckey access is serialized + */ + if (!sock_owned_by_user(sk)) { + WARN_ON_ONCE(1); + return NULL; + } + + bytelen = uarg->bytelen + size; + if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { + /* TCP can create new skb to attach new uarg */ + if (sk->sk_type == SOCK_STREAM) + goto new_alloc; + return NULL; + } + + next = (u32)atomic_read(&sk->sk_zckey); + if ((u32)(uarg->id + uarg->len) == next) { + uarg->len++; + uarg->bytelen = bytelen; + atomic_set(&sk->sk_zckey, ++next); + return uarg; + } + } + +new_alloc: + return sock_zerocopy_alloc(sk, size); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); + +static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 old_lo, old_hi; + u64 sum_len; + + old_lo = serr->ee.ee_info; + old_hi = serr->ee.ee_data; + sum_len = old_hi - old_lo + 1ULL + len; + + if (sum_len >= (1ULL << 32)) + return false; + + if (lo != old_hi + 1) + return false; + + serr->ee.ee_data += len; + return true; +} + void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) { - struct sk_buff *skb = skb_from_uarg(uarg); + struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; struct sock *sk = skb->sk; - u16 id = uarg->desc; + struct sk_buff_head *q; + unsigned long flags; + u32 lo, hi; + u16 len; - if (sock_flag(sk, SOCK_DEAD)) + /* if !len, there was only 1 call, and it was aborted + * so do not queue a completion notification + */ + if (!uarg->len || sock_flag(sk, SOCK_DEAD)) goto release; + len = uarg->len; + lo = uarg->id; + hi = uarg->id + len - 1; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = 0; serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; - serr->ee.ee_data = id; + serr->ee.ee_data = hi; + serr->ee.ee_info = lo; if (!success) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; - skb_queue_tail(&sk->sk_error_queue, skb); - skb = NULL; + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || + !skb_zerocopy_notify_extend(tail, lo, len)) { + __skb_queue_tail(q, skb); + skb = NULL; + } + spin_unlock_irqrestore(&q->lock, flags); sk->sk_error_report(sk); @@ -975,6 +1052,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) struct sock *sk = skb_from_uarg(uarg)->sk; atomic_dec(&sk->sk_zckey); + uarg->len--; /* sock_zerocopy_put expects a ref. Most sockets take one per * skb, which is zero on abort. tcp_sendmsg holds one extra, to @@ -995,9 +1073,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) { + struct ubuf_info *orig_uarg = skb_zcopy(skb); struct iov_iter orig_iter = msg->msg_iter; int err, orig_len = skb->len; + /* An skb can only point to one uarg. This edge case happens when + * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { /* Streams do not free skb on error. Reset to prev state. */ -- cgit v1.2.3 From a91dbff551a6f1865b68fa82b654591490b59901 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:43 -0400 Subject: sock: ulimit on MSG_ZEROCOPY pages Bound the number of pages that a user may pin. 
Follow the lead of perf tools and maintain a per-user bound on locked memory pages; see commit 789f90fcf6b0 ("perf_counter: per user mlock gift"). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/sched/user.h | 3 ++- include/linux/skbuff.h | 5 +++++ net/core/skbuff.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 5d5415e129d4..3c07e4135127 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -36,7 +36,8 @@ struct user_struct { struct hlist_node uidhash_node; kuid_t uid; -#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ + defined(CONFIG_NET) atomic_long_t locked_vm; #endif }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f5bdd93a87da..8c0708d2e5e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -457,6 +457,11 @@ struct ubuf_info { }; }; atomic_t refcnt; + + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dcee0f64f1fa..42b62c716a33 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -897,6 +897,44 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +{ + unsigned long max_pg, num_pg, new_pg, old_pg; + struct user_struct *user; + + if (capable(CAP_IPC_LOCK) || !size) + return 0; + + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ + max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + user = mmp->user ?
: current_user(); + + do { + old_pg = atomic_long_read(&user->locked_vm); + new_pg = old_pg + num_pg; + if (new_pg > max_pg) + return -ENOBUFS; + } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != + old_pg); + + if (!mmp->user) { + mmp->user = get_uid(user); + mmp->num_pg = num_pg; + } else { + mmp->num_pg += num_pg; + } + + return 0; +} + +static void mm_unaccount_pinned_pages(struct mmpin *mmp) +{ + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + } +} + struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; @@ -913,6 +951,12 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); uarg = (void *)skb->cb; + uarg->mmp.user = NULL; + + if (mm_account_pinned_pages(&uarg->mmp, size)) { + kfree_skb(skb); + return NULL; + } uarg->callback = sock_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; @@ -956,6 +1000,8 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg->id + uarg->len) == next) { + if (mm_account_pinned_pages(&uarg->mmp, size)) + return NULL; uarg->len++; uarg->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); @@ -1038,6 +1084,8 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); void sock_zerocopy_put(struct ubuf_info *uarg) { if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + mm_unaccount_pinned_pages(&uarg->mmp); + if (uarg->callback) uarg->callback(uarg, uarg->zerocopy); else -- cgit v1.2.3 From 999616b8536cf3b9a1d0d74d5542ea009df482ff Mon Sep 17 00:00:00 2001 From: Jerome Forissier Date: Wed, 31 May 2017 13:21:05 +0200 Subject: tee: add forward declaration for struct device tee_drv.h references struct device, but does not include device.h nor platform_device.h. Therefore, if tee_drv.h is included by some file that does not pull device.h nor platform_device.h beforehand, we have a compile warning. Fix this by adding a forward declaration. 
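As an illustration, here is a minimal, hypothetical sketch of the pattern (the names below are invented, not taken from tee_drv.h): an incomplete declaration is enough for pointer members and function prototypes, so a header using it compiles even when device.h was never pulled in first.

	struct device;                            /* forward (incomplete) declaration */

	struct example_shm {                      /* hypothetical tee-style struct */
		struct device *dev;               /* pointer to incomplete type: OK */
		unsigned long size;
	};

	int example_register(struct device *dev); /* prototype: OK */

Only code that dereferences the pointer or needs sizeof(struct device) must include the full definition.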
Signed-off-by: Jerome Forissier Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 0f175b8f6456..cb889afe576b 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -28,6 +28,7 @@ #define TEE_SHM_MAPPED 0x1 /* Memory mapped by the kernel */ #define TEE_SHM_DMA_BUF 0x2 /* Memory with dma-buf handle */ +struct device; struct tee_device; struct tee_shm; struct tee_shm_pool; -- cgit v1.2.3 From 059cf566e123ca7eb7434285c6455d7afafb4e02 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Thu, 16 Feb 2017 09:07:02 +0100 Subject: tee: indicate privileged dev in gen_caps Mirrors the TEE_DESC_PRIVILEGED bit of struct tee_desc:flags into struct tee_ioctl_version_data:gen_caps as TEE_GEN_CAP_PRIVILEGED in tee_ioctl_version(). Reviewed-by: Jerome Forissier Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 5 +++++ include/uapi/linux/tee.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 5c60bf4423e6..58a5009eacc3 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -90,8 +90,13 @@ static int tee_ioctl_version(struct tee_context *ctx, struct tee_ioctl_version_data vers; ctx->teedev->desc->ops->get_version(ctx->teedev, &vers); + + if (ctx->teedev->desc->flags & TEE_DESC_PRIVILEGED) + vers.gen_caps |= TEE_GEN_CAP_PRIVILEGED; + if (copy_to_user(uvers, &vers, sizeof(vers))) return -EFAULT; + return 0; } diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 370d8845ab21..688782e90140 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -49,6 +49,7 @@ #define TEE_MAX_ARG_SIZE 1024 #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ +#define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ /* * TEE Implementation ID -- cgit v1.2.3 From 30d8340b5857550007ab6fe9744e4384dfa6e55c Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 3 Aug 2017 20:20:42 +0900 Subject: ALSA: control: obsolete user_ctl_lock In a previous commit, concurrent requests for TLV data were made mutually exclusive between read requests and write/command requests. TLV callback handlers in each driver therefore run no risk of concurrent reference or modification. In the current implementation, 'struct snd_card' has a mutex to control concurrent accesses to user-defined element sets. This commit obsoletes it.
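For illustration, a hedged sketch of the resulting shape of the read path (invented names, not the ALSA code itself): once the caller serializes all TLV requests, the handler can use plain early returns instead of a lock/unlock pair around every exit.

	#include <errno.h>
	#include <string.h>

	struct user_elem_sketch {
		size_t  tlv_size;
		void   *tlv_data;
	};

	/* Caller is assumed to serialize access; no inner mutex needed. */
	static int sketch_tlv_read(const struct user_elem_sketch *ue,
				   void *dst, size_t dst_size)
	{
		if (!ue->tlv_size || !ue->tlv_data)
			return -ENXIO;   /* nothing stored yet */
		if (dst_size < ue->tlv_size)
			return -ENOSPC;  /* destination buffer too small */
		memcpy(dst, ue->tlv_data, ue->tlv_size);
		return 0;
	}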
Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai --- include/sound/core.h | 2 -- sound/core/control.c | 37 +++++++++++++------------------------ sound/core/init.c | 1 - 3 files changed, 13 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/sound/core.h b/include/sound/core.h index 55385588eefa..357f36b5ee80 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -118,8 +118,6 @@ struct snd_card { int user_ctl_count; /* count of all user controls */ struct list_head controls; /* all controls for this card */ struct list_head ctl_files; /* active control files */ - struct mutex user_ctl_lock; /* protects user controls against - concurrent access */ struct snd_info_entry *proc_root; /* root for soundcard specific files */ struct snd_info_entry *proc_id; /* the card id */ diff --git a/sound/core/control.c b/sound/core/control.c index f3bd9bdba9a7..989292fe33c2 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1095,9 +1095,7 @@ static int snd_ctl_elem_user_get(struct snd_kcontrol *kcontrol, char *src = ue->elem_data + snd_ctl_get_ioff(kcontrol, &ucontrol->id) * size; - mutex_lock(&ue->card->user_ctl_lock); memcpy(&ucontrol->value, src, size); - mutex_unlock(&ue->card->user_ctl_lock); return 0; } @@ -1110,11 +1108,9 @@ static int snd_ctl_elem_user_put(struct snd_kcontrol *kcontrol, char *dst = ue->elem_data + snd_ctl_get_ioff(kcontrol, &ucontrol->id) * size; - mutex_lock(&ue->card->user_ctl_lock); change = memcmp(&ucontrol->value, dst, size) != 0; if (change) memcpy(dst, &ucontrol->value, size); - mutex_unlock(&ue->card->user_ctl_lock); return change; } @@ -1124,44 +1120,37 @@ static int snd_ctl_elem_user_tlv(struct snd_kcontrol *kcontrol, unsigned int __user *tlv) { struct user_element *ue = kcontrol->private_data; - int change = 0; - void *new_data; if (op_flag == SNDRV_CTL_TLV_OP_WRITE) { + int change; + void *new_data; + if (size > 1024 * 128) /* sane value */ return -EINVAL; new_data = memdup_user(tlv, size); if (IS_ERR(new_data)) return PTR_ERR(new_data); - mutex_lock(&ue->card->user_ctl_lock); change = ue->tlv_data_size != size; if (!change) change = memcmp(ue->tlv_data, new_data, size); kfree(ue->tlv_data); ue->tlv_data = new_data; ue->tlv_data_size = size; - mutex_unlock(&ue->card->user_ctl_lock); + + return change; } else { - int ret = 0; + if (!ue->tlv_data_size || !ue->tlv_data) + return -ENXIO; + + if (size < ue->tlv_data_size) + return -ENOSPC; - mutex_lock(&ue->card->user_ctl_lock); - if (!ue->tlv_data_size || !ue->tlv_data) { - ret = -ENXIO; - goto err_unlock; - } - if (size < ue->tlv_data_size) { - ret = -ENOSPC; - goto err_unlock; - } if (copy_to_user(tlv, ue->tlv_data, ue->tlv_data_size)) - ret = -EFAULT; -err_unlock: - mutex_unlock(&ue->card->user_ctl_lock); - if (ret) - return ret; + return -EFAULT; + + return 0; } - return change; } static int snd_ctl_elem_init_enum_names(struct user_element *ue) diff --git a/sound/core/init.c b/sound/core/init.c index b4365bcf28a7..6e219dc23f96 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -248,7 +248,6 @@ int snd_card_new(struct device *parent, int idx, const char *xid, INIT_LIST_HEAD(&card->devices); init_rwsem(&card->controls_rwsem); rwlock_init(&card->ctl_files_rwlock); - mutex_init(&card->user_ctl_lock); INIT_LIST_HEAD(&card->controls); INIT_LIST_HEAD(&card->ctl_files); spin_lock_init(&card->files_lock); -- cgit v1.2.3 From 31128822ce46eda32b69a3f5ff5d29b05bc1facb Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 31 Jul 2017 13:38:31 +0200 Subject: clk: 
meson8b: expose every clock in the bindings Expose all clocks which may be used as DT bindings. Only clock ids internal to the controller remain un-exposed (none on this particular controller at the moment). Acked-by: Neil Armstrong Acked-by: Arnd Bergmann Signed-off-by: Jerome Brunet Signed-off-by: Neil Armstrong --- drivers/clk/meson/meson8b.h | 103 ++----------------------------- include/dt-bindings/clock/meson8b-clkc.h | 70 +++++++++++++++++++++ 2 files changed, 74 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/drivers/clk/meson/meson8b.h b/drivers/clk/meson/meson8b.h index a687e02547dc..c139bb3273ca 100644 --- a/drivers/clk/meson/meson8b.h +++ b/drivers/clk/meson/meson8b.h @@ -60,107 +60,12 @@ * CLKID index values * * These indices are entirely contrived and do not map onto the hardware. - * Migrate them out of this header and into the DT header file when they need - * to be exposed to client nodes in DT: include/dt-bindings/clock/meson8b-clkc.h + * It has now been decided to expose everything by default in the DT header: + * include/dt-bindings/clock/meson8b-clkc.h. Only the clock ids we don't want + * to expose, such as the internal muxes and dividers of composite clocks, + * will remain defined here. */ -/* CLKID_UNUSED */ -/* CLKID_XTAL */ -/* CLKID_PLL_FIXED */ -/* CLKID_PLL_VID */ -/* CLKID_PLL_SYS */ -/* CLKID_FCLK_DIV2 */ -/* CLKID_FCLK_DIV3 */ -/* CLKID_FCLK_DIV4 */ -/* CLKID_FCLK_DIV5 */ -/* CLKID_FCLK_DIV7 */ -/* CLKID_CLK81 */ -/* CLKID_MALI */ -/* CLKID_CPUCLK */ -/* CLKID_ZERO */ -/* CLKID_MPEG_SEL */ -/* CLKID_MPEG_DIV */ -#define CLKID_DDR 16 -#define CLKID_DOS 17 -#define CLKID_ISA 18 -#define CLKID_PL301 19 -#define CLKID_PERIPHS 20 -#define CLKID_SPICC 21 -#define CLKID_I2C 22 -/* #define CLKID_SAR_ADC */ -#define CLKID_SMART_CARD 24 -/* #define CLKID_RNG0 */ -#define CLKID_UART0 26 -#define CLKID_SDHC 27 -#define CLKID_STREAM 28 -#define CLKID_ASYNC_FIFO 29 -/* #define CLKID_SDIO */ -#define CLKID_ABUF 31 -#define CLKID_HIU_IFACE 32 -#define CLKID_ASSIST_MISC 33 -#define CLKID_SPI 34 -#define CLKID_I2S_SPDIF 35 -/* #define CLKID_ETH */ -#define CLKID_DEMUX 37 -#define CLKID_AIU_GLUE 38 -#define CLKID_IEC958 39 -#define CLKID_I2S_OUT 40 -#define CLKID_AMCLK 41 -#define CLKID_AIFIFO2 42 -#define CLKID_MIXER 43 -#define CLKID_MIXER_IFACE 44 -#define CLKID_ADC 45 -#define CLKID_BLKMV 46 -#define CLKID_AIU 47 -#define CLKID_UART1 48 -#define CLKID_G2D 49 -/* #define CLKID_USB0 */ -/* #define CLKID_USB1 */ -#define CLKID_RESET 52 -#define CLKID_NAND 53 -#define CLKID_DOS_PARSER 54 -/* #define CLKID_USB */ -#define CLKID_VDIN1 56 -#define CLKID_AHB_ARB0 57 -#define CLKID_EFUSE 58 -#define CLKID_BOOT_ROM 59 -#define CLKID_AHB_DATA_BUS 60 -#define CLKID_AHB_CTRL_BUS 61 -#define CLKID_HDMI_INTR_SYNC 62 -#define CLKID_HDMI_PCLK 63 -/* CLKID_USB1_DDR_BRIDGE */ -/* CLKID_USB0_DDR_BRIDGE */ -#define CLKID_MMC_PCLK 66 -#define CLKID_DVIN 67 -#define CLKID_UART2 68 -/* #define CLKID_SANA */ -#define CLKID_VPU_INTR 70 -#define CLKID_SEC_AHB_AHB3_BRIDGE 71 -#define CLKID_CLK81_A9 72 -#define CLKID_VCLK2_VENCI0 73 -#define CLKID_VCLK2_VENCI1 74 -#define CLKID_VCLK2_VENCP0 75 -#define CLKID_VCLK2_VENCP1 76 -#define CLKID_GCLK_VENCI_INT 77 -#define CLKID_GCLK_VENCP_INT 78 -#define CLKID_DAC_CLK 79 -#define CLKID_AOCLK_GATE 80 -#define CLKID_IEC958_GATE 81 -#define CLKID_ENC480P 82 -#define CLKID_RNG1 83 -#define CLKID_GCLK_VENCL_INT 84 -#define CLKID_VCLK2_VENCLMCC 85 -#define CLKID_VCLK2_VENCL 86 -#define CLKID_VCLK2_OTHER 87 -#define CLKID_EDP 88 -#define
CLKID_AO_MEDIA_CPU 89 -#define CLKID_AO_AHB_SRAM 90 -#define CLKID_AO_AHB_BUS 91 -#define CLKID_AO_IFACE 92 -#define CLKID_MPLL0 93 -#define CLKID_MPLL1 94 -#define CLKID_MPLL2 95 - #define CLK_NR_CLKS 96 /* include the CLKIDs that have been made part of the stable DT binding */ #include diff --git a/include/dt-bindings/clock/meson8b-clkc.h b/include/dt-bindings/clock/meson8b-clkc.h index e29227fb52a1..a9c0306330b6 100644 --- a/include/dt-bindings/clock/meson8b-clkc.h +++ b/include/dt-bindings/clock/meson8b-clkc.h @@ -21,15 +21,85 @@ #define CLKID_ZERO 13 #define CLKID_MPEG_SEL 14 #define CLKID_MPEG_DIV 15 +#define CLKID_DDR 16 +#define CLKID_DOS 17 +#define CLKID_ISA 18 +#define CLKID_PL301 19 +#define CLKID_PERIPHS 20 +#define CLKID_SPICC 21 +#define CLKID_I2C 22 #define CLKID_SAR_ADC 23 +#define CLKID_SMART_CARD 24 #define CLKID_RNG0 25 +#define CLKID_UART0 26 +#define CLKID_SDHC 27 +#define CLKID_STREAM 28 +#define CLKID_ASYNC_FIFO 29 #define CLKID_SDIO 30 +#define CLKID_ABUF 31 +#define CLKID_HIU_IFACE 32 +#define CLKID_ASSIST_MISC 33 +#define CLKID_SPI 34 +#define CLKID_I2S_SPDIF 35 #define CLKID_ETH 36 +#define CLKID_DEMUX 37 +#define CLKID_AIU_GLUE 38 +#define CLKID_IEC958 39 +#define CLKID_I2S_OUT 40 +#define CLKID_AMCLK 41 +#define CLKID_AIFIFO2 42 +#define CLKID_MIXER 43 +#define CLKID_MIXER_IFACE 44 +#define CLKID_ADC 45 +#define CLKID_BLKMV 46 +#define CLKID_AIU 47 +#define CLKID_UART1 48 +#define CLKID_G2D 49 #define CLKID_USB0 50 #define CLKID_USB1 51 +#define CLKID_RESET 52 +#define CLKID_NAND 53 +#define CLKID_DOS_PARSER 54 #define CLKID_USB 55 +#define CLKID_VDIN1 56 +#define CLKID_AHB_ARB0 57 +#define CLKID_EFUSE 58 +#define CLKID_BOOT_ROM 59 +#define CLKID_AHB_DATA_BUS 60 +#define CLKID_AHB_CTRL_BUS 61 +#define CLKID_HDMI_INTR_SYNC 62 +#define CLKID_HDMI_PCLK 63 #define CLKID_USB1_DDR_BRIDGE 64 #define CLKID_USB0_DDR_BRIDGE 65 +#define CLKID_MMC_PCLK 66 +#define CLKID_DVIN 67 +#define CLKID_UART2 68 #define CLKID_SANA 69 +#define CLKID_VPU_INTR 70 +#define CLKID_SEC_AHB_AHB3_BRIDGE 71 +#define CLKID_CLK81_A9 72 +#define CLKID_VCLK2_VENCI0 73 +#define CLKID_VCLK2_VENCI1 74 +#define CLKID_VCLK2_VENCP0 75 +#define CLKID_VCLK2_VENCP1 76 +#define CLKID_GCLK_VENCI_INT 77 +#define CLKID_GCLK_VENCP_INT 78 +#define CLKID_DAC_CLK 79 +#define CLKID_AOCLK_GATE 80 +#define CLKID_IEC958_GATE 81 +#define CLKID_ENC480P 82 +#define CLKID_RNG1 83 +#define CLKID_GCLK_VENCL_INT 84 +#define CLKID_VCLK2_VENCLMCC 85 +#define CLKID_VCLK2_VENCL 86 +#define CLKID_VCLK2_OTHER 87 +#define CLKID_EDP 88 +#define CLKID_AO_MEDIA_CPU 89 +#define CLKID_AO_AHB_SRAM 90 +#define CLKID_AO_AHB_BUS 91 +#define CLKID_AO_IFACE 92 +#define CLKID_MPLL0 93 +#define CLKID_MPLL1 94 +#define CLKID_MPLL2 95 #endif /* __MESON8B_CLKC_H */ -- cgit v1.2.3 From 90640fd05ef27c7feb9da503ca6e14f16911bd6f Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 31 Jul 2017 13:38:32 +0200 Subject: clk: meson-gxbb: expose almost every clock in the bindings Expose all clocks which may be used as DT bindings. Only clock ids internal to the controller remain un-exposed. Acked-by: Neil Armstrong Acked-by: Arnd Bergmann Signed-off-by: Jerome Brunet Signed-off-by: Neil Armstrong --- drivers/clk/meson/gxbb.h | 117 ++-------------------------------- include/dt-bindings/clock/gxbb-clkc.h | 60 +++++++++++++++++ 2 files changed, 67 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/drivers/clk/meson/gxbb.h b/drivers/clk/meson/gxbb.h index d63e77e8433d..2c8986d3232c 100644 --- a/drivers/clk/meson/gxbb.h +++ b/drivers/clk/meson/gxbb.h @@
-167,130 +167,27 @@ * CLKID index values * * These indices are entirely contrived and do not map onto the hardware. - * Migrate them out of this header and into the DT header file when they need - * to be exposed to client nodes in DT: include/dt-bindings/clock/gxbb-clkc.h + * It has now been decided to expose everything by default in the DT header: + * include/dt-bindings/clock/gxbb-clkc.h. Only the clocks ids we don't want + * to expose, such as the internal muxes and dividers of composite clocks, + * will remain defined here. */ -#define CLKID_SYS_PLL 0 /* ID 1 is unused (it was used by the non-existing CLKID_CPUCLK before) */ -/* CLKID_HDMI_PLL */ -#define CLKID_FIXED_PLL 3 -/* CLKID_FCLK_DIV2 */ -/* CLKID_FCLK_DIV3 */ -/* CLKID_FCLK_DIV4 */ -#define CLKID_FCLK_DIV5 7 -#define CLKID_FCLK_DIV7 8 -/* CLKID_GP0_PLL */ #define CLKID_MPEG_SEL 10 #define CLKID_MPEG_DIV 11 -/* CLKID_CLK81 */ -#define CLKID_MPLL0 13 -#define CLKID_MPLL1 14 -/* CLKID_MPLL2 */ -#define CLKID_DDR 16 -#define CLKID_DOS 17 -#define CLKID_ISA 18 -#define CLKID_PL301 19 -#define CLKID_PERIPHS 20 -/* CLKID_SPICC */ -/* CLKID_I2C */ -/* #define CLKID_SAR_ADC */ -#define CLKID_SMART_CARD 24 -/* CLKID_RNG0 */ -/* CLKID_UART0 */ -#define CLKID_SDHC 27 -#define CLKID_STREAM 28 -#define CLKID_ASYNC_FIFO 29 -#define CLKID_SDIO 30 -#define CLKID_ABUF 31 -#define CLKID_HIU_IFACE 32 -#define CLKID_ASSIST_MISC 33 -/* CLKID_SPI */ -#define CLKID_I2S_SPDIF 35 -/* CLKID_ETH */ -#define CLKID_DEMUX 37 -/* CLKID_AIU_GLUE */ -/* CLKID_IEC958 */ -/* CLKID_I2S_OUT */ -#define CLKID_AMCLK 41 -#define CLKID_AIFIFO2 42 -#define CLKID_MIXER 43 -/* CLKID_MIXER_IFACE */ -#define CLKID_ADC 45 -#define CLKID_BLKMV 46 -/* CLKID_AIU */ -/* CLKID_UART1 */ -#define CLKID_G2D 49 -/* CLKID_USB0 */ -/* CLKID_USB1 */ -#define CLKID_RESET 52 -#define CLKID_NAND 53 -#define CLKID_DOS_PARSER 54 -/* CLKID_USB */ -#define CLKID_VDIN1 56 -#define CLKID_AHB_ARB0 57 -#define CLKID_EFUSE 58 -#define CLKID_BOOT_ROM 59 -#define CLKID_AHB_DATA_BUS 60 -#define CLKID_AHB_CTRL_BUS 61 -#define CLKID_HDMI_INTR_SYNC 62 -/* CLKID_HDMI_PCLK */ -/* CLKID_USB1_DDR_BRIDGE */ -/* CLKID_USB0_DDR_BRIDGE */ -#define CLKID_MMC_PCLK 66 -#define CLKID_DVIN 67 -/* CLKID_UART2 */ -/* #define CLKID_SANA */ -#define CLKID_VPU_INTR 70 -#define CLKID_SEC_AHB_AHB3_BRIDGE 71 -#define CLKID_CLK81_A53 72 -#define CLKID_VCLK2_VENCI0 73 -#define CLKID_VCLK2_VENCI1 74 -#define CLKID_VCLK2_VENCP0 75 -#define CLKID_VCLK2_VENCP1 76 -/* CLKID_GCLK_VENCI_INT0 */ -#define CLKID_GCLK_VENCI_INT 78 -#define CLKID_DAC_CLK 79 -/* CLKID_AOCLK_GATE */ -/* CLKID_IEC958_GATE */ -#define CLKID_ENC480P 82 -#define CLKID_RNG1 83 -#define CLKID_GCLK_VENCI_INT1 84 -#define CLKID_VCLK2_VENCLMCC 85 -#define CLKID_VCLK2_VENCL 86 -#define CLKID_VCLK_OTHER 87 -#define CLKID_EDP 88 -#define CLKID_AO_MEDIA_CPU 89 -#define CLKID_AO_AHB_SRAM 90 -#define CLKID_AO_AHB_BUS 91 -#define CLKID_AO_IFACE 92 -/* CLKID_AO_I2C */ -/* CLKID_SD_EMMC_A */ -/* CLKID_SD_EMMC_B */ -/* CLKID_SD_EMMC_C */ -/* CLKID_SAR_ADC_CLK */ -/* CLKID_SAR_ADC_SEL */ #define CLKID_SAR_ADC_DIV 99 -/* CLKID_MALI_0_SEL */ -#define CLKID_MALI_0_DIV 101 -/* CLKID_MALI_0 */ -/* CLKID_MALI_1_SEL */ -#define CLKID_MALI_1_DIV 104 -/* CLKID_MALI_1 */ -/* CLKID_MALI */ -/* CLKID_CTS_AMCLK */ +#define CLKID_MALI_0_DIV 101 +#define CLKID_MALI_1_DIV 104 #define CLKID_CTS_AMCLK_SEL 108 #define CLKID_CTS_AMCLK_DIV 109 -/* CLKID_CTS_MCLK_I958 */ #define CLKID_CTS_MCLK_I958_SEL 111 #define CLKID_CTS_MCLK_I958_DIV 112 -/* CLKID_CTS_I958 */ -#define CLKID_32K_CLK 114 
#define CLKID_32K_CLK_SEL 115 #define CLKID_32K_CLK_DIV 116 #define NR_CLKS 117 -/* include the CLKIDs that have been made part of the stable DT binding */ +/* include the CLKIDs that have been made part of the DT binding */ #include #endif /* __GXBB_H */ diff --git a/include/dt-bindings/clock/gxbb-clkc.h b/include/dt-bindings/clock/gxbb-clkc.h index e3e9f7919c31..e07fea011ebd 100644 --- a/include/dt-bindings/clock/gxbb-clkc.h +++ b/include/dt-bindings/clock/gxbb-clkc.h @@ -5,37 +5,96 @@ #ifndef __GXBB_CLKC_H #define __GXBB_CLKC_H +#define CLKID_SYS_PLL 0 #define CLKID_HDMI_PLL 2 +#define CLKID_FIXED_PLL 3 #define CLKID_FCLK_DIV2 4 #define CLKID_FCLK_DIV3 5 #define CLKID_FCLK_DIV4 6 +#define CLKID_FCLK_DIV5 7 +#define CLKID_FCLK_DIV7 8 #define CLKID_GP0_PLL 9 #define CLKID_CLK81 12 +#define CLKID_MPLL0 13 +#define CLKID_MPLL1 14 #define CLKID_MPLL2 15 +#define CLKID_DDR 16 +#define CLKID_DOS 17 +#define CLKID_ISA 18 +#define CLKID_PL301 19 +#define CLKID_PERIPHS 20 #define CLKID_SPICC 21 #define CLKID_I2C 22 #define CLKID_SAR_ADC 23 +#define CLKID_SMART_CARD 24 #define CLKID_RNG0 25 #define CLKID_UART0 26 +#define CLKID_SDHC 27 +#define CLKID_STREAM 28 +#define CLKID_ASYNC_FIFO 29 +#define CLKID_SDIO 30 +#define CLKID_ABUF 31 +#define CLKID_HIU_IFACE 32 +#define CLKID_ASSIST_MISC 33 #define CLKID_SPI 34 #define CLKID_ETH 36 +#define CLKID_I2S_SPDIF 35 +#define CLKID_DEMUX 37 #define CLKID_AIU_GLUE 38 #define CLKID_IEC958 39 #define CLKID_I2S_OUT 40 +#define CLKID_AMCLK 41 +#define CLKID_AIFIFO2 42 +#define CLKID_MIXER 43 #define CLKID_MIXER_IFACE 44 +#define CLKID_ADC 45 +#define CLKID_BLKMV 46 #define CLKID_AIU 47 #define CLKID_UART1 48 +#define CLKID_G2D 49 #define CLKID_USB0 50 #define CLKID_USB1 51 +#define CLKID_RESET 52 +#define CLKID_NAND 53 +#define CLKID_DOS_PARSER 54 #define CLKID_USB 55 +#define CLKID_VDIN1 56 +#define CLKID_AHB_ARB0 57 +#define CLKID_EFUSE 58 +#define CLKID_BOOT_ROM 59 +#define CLKID_AHB_DATA_BUS 60 +#define CLKID_AHB_CTRL_BUS 61 +#define CLKID_HDMI_INTR_SYNC 62 #define CLKID_HDMI_PCLK 63 #define CLKID_USB1_DDR_BRIDGE 64 #define CLKID_USB0_DDR_BRIDGE 65 +#define CLKID_MMC_PCLK 66 +#define CLKID_DVIN 67 #define CLKID_UART2 68 #define CLKID_SANA 69 +#define CLKID_VPU_INTR 70 +#define CLKID_SEC_AHB_AHB3_BRIDGE 71 +#define CLKID_CLK81_A53 72 +#define CLKID_VCLK2_VENCI0 73 +#define CLKID_VCLK2_VENCI1 74 +#define CLKID_VCLK2_VENCP0 75 +#define CLKID_VCLK2_VENCP1 76 #define CLKID_GCLK_VENCI_INT0 77 +#define CLKID_GCLK_VENCI_INT 78 +#define CLKID_DAC_CLK 79 #define CLKID_AOCLK_GATE 80 #define CLKID_IEC958_GATE 81 +#define CLKID_ENC480P 82 +#define CLKID_RNG1 83 +#define CLKID_GCLK_VENCI_INT1 84 +#define CLKID_VCLK2_VENCLMCC 85 +#define CLKID_VCLK2_VENCL 86 +#define CLKID_VCLK_OTHER 87 +#define CLKID_EDP 88 +#define CLKID_AO_MEDIA_CPU 89 +#define CLKID_AO_AHB_SRAM 90 +#define CLKID_AO_AHB_BUS 91 +#define CLKID_AO_IFACE 92 #define CLKID_AO_I2C 93 #define CLKID_SD_EMMC_A 94 #define CLKID_SD_EMMC_B 95 @@ -50,5 +109,6 @@ #define CLKID_CTS_AMCLK 107 #define CLKID_CTS_MCLK_I958 110 #define CLKID_CTS_I958 113 +#define CLKID_32K_CLK 114 #endif /* __GXBB_CLKC_H */ -- cgit v1.2.3 From a5841de6911fb7759514325869d11c6c9d7df51a Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 31 Jul 2017 13:56:02 +0200 Subject: clk: meson: gxbb: Add sd_emmc clk0 clkids Add the clkids for the clocks feeding the input0 of the mmc controllers Signed-off-by: Jerome Brunet Signed-off-by: Neil Armstrong --- drivers/clk/meson/gxbb.h | 10 ++++++++-- include/dt-bindings/clock/gxbb-clkc.h | 3 +++ 2 
files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/clk/meson/gxbb.h b/drivers/clk/meson/gxbb.h index 2c8986d3232c..5b1d4b374d1c 100644 --- a/drivers/clk/meson/gxbb.h +++ b/drivers/clk/meson/gxbb.h @@ -184,8 +184,14 @@ #define CLKID_CTS_MCLK_I958_DIV 112 #define CLKID_32K_CLK_SEL 115 #define CLKID_32K_CLK_DIV 116 - -#define NR_CLKS 117 +#define CLKID_SD_EMMC_A_CLK0_SEL 117 +#define CLKID_SD_EMMC_A_CLK0_DIV 118 +#define CLKID_SD_EMMC_B_CLK0_SEL 120 +#define CLKID_SD_EMMC_B_CLK0_DIV 121 +#define CLKID_SD_EMMC_C_CLK0_SEL 123 +#define CLKID_SD_EMMC_C_CLK0_DIV 124 + +#define NR_CLKS 126 /* include the CLKIDs that have been made part of the DT binding */ #include diff --git a/include/dt-bindings/clock/gxbb-clkc.h b/include/dt-bindings/clock/gxbb-clkc.h index e07fea011ebd..c04a76d8facf 100644 --- a/include/dt-bindings/clock/gxbb-clkc.h +++ b/include/dt-bindings/clock/gxbb-clkc.h @@ -110,5 +110,8 @@ #define CLKID_CTS_MCLK_I958 110 #define CLKID_CTS_I958 113 #define CLKID_32K_CLK 114 +#define CLKID_SD_EMMC_A_CLK0 119 +#define CLKID_SD_EMMC_B_CLK0 122 +#define CLKID_SD_EMMC_C_CLK0 125 #endif /* __GXBB_CLKC_H */ -- cgit v1.2.3 From 596f2b78da7b658b5e7758d18e8135401ca12fad Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 1 Aug 2017 13:56:58 +0200 Subject: dt-bindings: clock: gxbb-aoclk: Add CEC 32k clock This patch adds the clock binding entry for the CEC 32K AO Clock. Signed-off-by: Neil Armstrong --- include/dt-bindings/clock/gxbb-aoclkc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/gxbb-aoclkc.h b/include/dt-bindings/clock/gxbb-aoclkc.h index 31751482d13c..9d15e2221fdb 100644 --- a/include/dt-bindings/clock/gxbb-aoclkc.h +++ b/include/dt-bindings/clock/gxbb-aoclkc.h @@ -62,5 +62,6 @@ #define CLKID_AO_UART1 3 #define CLKID_AO_UART2 4 #define CLKID_AO_IR_BLASTER 5 +#define CLKID_AO_CEC_32K 6 #endif -- cgit v1.2.3 From 4ebc1e3cfcd8778e2150bdb799b19e85348b8efa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:28:57 +0200 Subject: net: sched: remove unneeded tcf_em_tree_change Since tcf_em_tree_validate can always be called on a newly created filter, there is no need for this change function. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 21 --------------------- net/sched/cls_basic.c | 4 +--- net/sched/cls_cgroup.c | 4 +--- net/sched/cls_flow.c | 12 +++++------- 4 files changed, 7 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 537d0a0ad4c4..f4462ec8b2f4 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -332,26 +332,6 @@ int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int); int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, struct tcf_pkt_info *); -/** - * tcf_em_tree_change - replace ematch tree of a running classifier - * - * @tp: classifier kind handle - * @dst: destination ematch tree variable - * @src: source ematch tree (temporary tree from tcf_em_tree_validate) - * - * This functions replaces the ematch tree in @dst with the ematch - * tree in @src. The classifier in charge of the ematch tree may be - * running.
- */ -static inline void tcf_em_tree_change(struct tcf_proto *tp, - struct tcf_ematch_tree *dst, - struct tcf_ematch_tree *src) -{ - tcf_tree_lock(tp); - memcpy(dst, src, sizeof(*dst)); - tcf_tree_unlock(tp); -} - /** * tcf_em_tree_match - evaulate an ematch tree * @@ -386,7 +366,6 @@ struct tcf_ematch_tree { #define tcf_em_tree_validate(tp, tb, t) ((void)(t), 0) #define tcf_em_tree_destroy(t) do { (void)(t); } while(0) #define tcf_em_tree_dump(skb, t, tlv) (0) -#define tcf_em_tree_change(tp, dst, src) do { } while(0) #define tcf_em_tree_match(skb, t, info) ((void)(info), 1) #endif /* CONFIG_NET_EMATCH */ diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index c4fd63a068f9..979cd2683b46 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -130,7 +130,6 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, { int err; struct tcf_exts e; - struct tcf_ematch_tree t; err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); if (err < 0) @@ -139,7 +138,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, if (err < 0) goto errout; - err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &f->ematches); if (err < 0) goto errout; @@ -149,7 +148,6 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, } tcf_exts_change(tp, &f->exts, &e); - tcf_em_tree_change(tp, &f->ematches, &t); f->tp = tp; return 0; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 12ce547eea04..ce7d38beab95 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -76,7 +76,6 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = rtnl_dereference(tp->root); struct cls_cgroup_head *new; - struct tcf_ematch_tree t; struct tcf_exts e; int err; @@ -112,14 +111,13 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, goto errout; } - err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); + err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &new->ematches); if (err < 0) { tcf_exts_destroy(&e); goto errout; } tcf_exts_change(tp, &new->exts, &e); - tcf_em_tree_change(tp, &new->ematches, &t); rcu_assign_pointer(tp->root, new); if (head) diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 3065752b9cda..71fd1af01726 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -389,7 +389,6 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FLOW_MAX + 1]; struct tcf_exts e; - struct tcf_ematch_tree t; unsigned int nkeys = 0; unsigned int perturb_period = 0; u32 baseclass = 0; @@ -432,13 +431,13 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err1; - err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); - if (err < 0) - goto err1; - err = -ENOBUFS; fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); if (!fnew) + goto err1; + + err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &fnew->ematches); + if (err < 0) goto err2; err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); @@ -512,7 +511,6 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, (unsigned long)fnew); tcf_exts_change(tp, &fnew->exts, &e); - tcf_em_tree_change(tp, &fnew->ematches, &t); netif_keep_dst(qdisc_dev(tp->q)); @@ -554,8 +552,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, err3: tcf_exts_destroy(&fnew->exts); + 
tcf_em_tree_destroy(&fnew->ematches); err2: - tcf_em_tree_destroy(&t); kfree(fnew); err1: tcf_exts_destroy(&e); -- cgit v1.2.3 From 3bcc0cec818fa969fe555b44443347211ed787a3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:28:58 +0200 Subject: net: sched: change names of action number helpers to be aligned with the rest The rest of the helpers are named tcf_exts_*, so change the name of the action number helpers to be aligned. While at it, change to inline functions. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 +-- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- .../net/ethernet/mellanox/mlxsw/spectrum_flower.c | 2 +- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 +-- include/net/pkt_cls.h | 36 ++++++++++++++++------ net/dsa/slave.c | 2 +- net/sched/cls_api.c | 2 +- 9 files changed, 37 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c index ef06ce8247ab..6f734c52ef25 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c @@ -96,7 +96,7 @@ static int fill_action_fields(struct adapter *adap, LIST_HEAD(actions); exts = cls->knode.exts; - if (tc_no_actions(exts)) + if (!tcf_exts_has_actions(exts)) return -EINVAL; tcf_exts_to_list(exts, &actions); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 96606e3eb965..091fcc7e6e43 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8953,7 +8953,7 @@ static int parse_tc_actions(struct ixgbe_adapter *adapter, LIST_HEAD(actions); int err; - if (tc_no_actions(exts)) + if (!tcf_exts_has_actions(exts)) return -EINVAL; tcf_exts_to_list(exts, &actions); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3c536f560dd2..78f50d9f621d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1326,7 +1326,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, LIST_HEAD(actions); int err; - if (tc_no_actions(exts)) + if (!tcf_exts_has_actions(exts)) return -EINVAL; attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; @@ -1839,7 +1839,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, bool encap = false; int err = 0; - if (tc_no_actions(exts)) + if (!tcf_exts_has_actions(exts)) return -EINVAL; memset(attr, 0, sizeof(*attr)); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 88b668ba0d8a..66d511d45c25 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1626,7 +1626,7 @@ static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, LIST_HEAD(actions); int err; - if (!tc_single_action(cls->exts)) { + if (!tcf_exts_has_one_action(cls->exts)) { netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 400ad4081660..9be48d2e43ca 100644 --- 
a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -53,7 +53,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, LIST_HEAD(actions); int err; - if (tc_no_actions(exts)) + if (!tcf_exts_has_actions(exts)) return 0; /* Count action is inserted first */ diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 78d80a364edb..a88bb5bc0082 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -115,14 +115,14 @@ nfp_net_bpf_get_act(struct nfp_net *nn, struct tc_cls_bpf_offload *cls_bpf) /* TC direct action */ if (cls_bpf->exts_integrated) { - if (tc_no_actions(cls_bpf->exts)) + if (!tcf_exts_has_actions(cls_bpf->exts)) return NN_ACT_DIRECT; return -EOPNOTSUPP; } /* TC legacy mode */ - if (!tc_single_action(cls_bpf->exts)) + if (!tcf_exts_has_one_action(cls_bpf->exts)) return -EOPNOTSUPP; tcf_exts_to_list(cls_bpf->exts, &actions); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f4462ec8b2f4..7f2563636df0 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -199,17 +199,35 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, return 0; } +/** + * tcf_exts_has_actions - check if at least one action is present + * @exts: tc filter extensions handle + * + * Returns true if at least one action is present. + */ +static inline bool tcf_exts_has_actions(struct tcf_exts *exts) +{ #ifdef CONFIG_NET_CLS_ACT + return exts->nr_actions; +#else + return false; +#endif +} -#define tc_no_actions(_exts) ((_exts)->nr_actions == 0) -#define tc_single_action(_exts) ((_exts)->nr_actions == 1) - -#else /* CONFIG_NET_CLS_ACT */ - -#define tc_no_actions(_exts) true -#define tc_single_action(_exts) false - -#endif /* CONFIG_NET_CLS_ACT */ +/** + * tcf_exts_has_one_action - check if exactly one action is present + * @exts: tc filter extensions handle + * + * Returns true if exactly one action is present. + */ +static inline bool tcf_exts_has_one_action(struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + return exts->nr_actions == 1; +#else + return false; +#endif +} int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e196562035b1..83252e8426d7 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -779,7 +779,7 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, if (!ds->ops->port_mirror_add) return err; - if (!tc_single_action(cls->exts)) + if (!tcf_exts_has_one_action(cls->exts)) return err; tcf_exts_to_list(cls->exts, &actions); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 39da0c5801c9..287ae6cbf73b 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -972,7 +972,7 @@ int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, const struct tc_action *a; LIST_HEAD(actions); - if (tc_no_actions(exts)) + if (!tcf_exts_has_actions(exts)) return -EINVAL; tcf_exts_to_list(exts, &actions); -- cgit v1.2.3 From af69afc551eb9f9b1f2cc3295e2dfcdaa1dc948d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:28:59 +0200 Subject: net: sched: use tcf_exts_has_actions in tcf_exts_exec Use the tcf_exts_has_actions helper instead of directly testing exts->nr_actions in tcf_exts_exec. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/pkt_cls.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 7f2563636df0..322a2823cc6a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -176,29 +176,6 @@ tcf_exts_stats_update(const struct tcf_exts *exts, #endif } -/** - * tcf_exts_exec - execute tc filter extensions - * @skb: socket buffer - * @exts: tc filter extensions handle - * @res: desired result - * - * Executes all configured extensions. Returns 0 on a normal execution, - * a negative number if the filter must be considered unmatched or - * a positive action code (TC_ACT_*) which must be returned to the - * underlying layer. - */ -static inline int -tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, - struct tcf_result *res) -{ -#ifdef CONFIG_NET_CLS_ACT - if (exts->nr_actions) - return tcf_action_exec(skb, exts->actions, exts->nr_actions, - res); -#endif - return 0; -} - /** * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle @@ -229,6 +206,29 @@ static inline bool tcf_exts_has_one_action(struct tcf_exts *exts) #endif } +/** + * tcf_exts_exec - execute tc filter extensions + * @skb: socket buffer + * @exts: tc filter extensions handle + * @res: desired result + * + * Executes all configured extensions. Returns 0 on a normal execution, + * a negative number if the filter must be considered unmatched or + * a positive action code (TC_ACT_*) which must be returned to the + * underlying layer. + */ +static inline int +tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, + struct tcf_result *res) +{ +#ifdef CONFIG_NET_CLS_ACT + if (tcf_exts_has_actions(exts)) + return tcf_action_exec(skb, exts->actions, exts->nr_actions, + res); +#endif + return 0; +} + int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr); -- cgit v1.2.3 From 6fc6d06e5371507e68c6904a3423622b0e465b64 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:29:00 +0200 Subject: net: sched: remove redundant helpers tcf_exts_is_predicative and tcf_exts_is_available These two helpers are doing the same as tcf_exts_has_actions, so remove them and use tcf_exts_has_actions instead. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 30 ------------------------------ net/sched/cls_fw.c | 2 +- net/sched/cls_route.c | 2 +- net/sched/cls_tcindex.c | 2 +- 4 files changed, 3 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 322a2823cc6a..817badf733b5 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -113,36 +113,6 @@ static inline int tcf_exts_init(struct tcf_exts *exts, int action, int police) return 0; } -/** - * tcf_exts_is_predicative - check if a predicative extension is present - * @exts: tc filter extensions handle - * - * Returns 1 if a predicative extension is present, i.e. an extension which - * might cause further actions and thus overrule the regular tcf_result. - */ -static inline int -tcf_exts_is_predicative(struct tcf_exts *exts) -{ -#ifdef CONFIG_NET_CLS_ACT - return exts->nr_actions; -#else - return 0; -#endif -} - -/** - * tcf_exts_is_available - check if at least one extension is present - * @exts: tc filter extensions handle - * - * Returns 1 if at least one extension is present. 
- */ -static inline int -tcf_exts_is_available(struct tcf_exts *exts) -{ - /* All non-predicative extensions must be added here. */ - return tcf_exts_is_predicative(exts); -} - static inline void tcf_exts_to_list(const struct tcf_exts *exts, struct list_head *actions) { diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index d3885362e017..a53fa75dfc7b 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -387,7 +387,7 @@ static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, t->tcm_handle = f->id; - if (!f->res.classid && !tcf_exts_is_available(&f->exts)) + if (!f->res.classid && !tcf_exts_has_actions(&f->exts)) return skb->len; nest = nla_nest_start(skb, TCA_OPTIONS); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index d63d5502ee02..26f863634862 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -113,7 +113,7 @@ static inline int route4_hash_wild(void) #define ROUTE4_APPLY_RESULT() \ { \ *res = f->res; \ - if (tcf_exts_is_available(&f->exts)) { \ + if (tcf_exts_has_actions(&f->exts)) { \ int r = tcf_exts_exec(skb, &f->exts, res); \ if (r < 0) { \ dont_cache = 1; \ diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 8a8a58357c39..66924d147e97 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -52,7 +52,7 @@ struct tcindex_data { static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { - return tcf_exts_is_predicative(&r->exts) || r->res.classid; + return tcf_exts_has_actions(&r->exts) || r->res.classid; } static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, -- cgit v1.2.3 From af089e701adfb5898fee00a56ea4bb421edc308d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:29:01 +0200 Subject: net: sched: fix return value of tcf_exts_exec Return the defined TC_ACT_OK instead of 0. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 817badf733b5..61ce521688b2 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -182,7 +182,7 @@ static inline bool tcf_exts_has_one_action(struct tcf_exts *exts) * @exts: tc filter extensions handle * @res: desired result * - * Executes all configured extensions. Returns 0 on a normal execution, + * Executes all configured extensions. Returns TC_ACT_OK on a normal execution, * a negative number if the filter must be considered unmatched or * a positive action code (TC_ACT_*) which must be returned to the * underlying layer. @@ -196,7 +196,7 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif - return 0; + return TC_ACT_OK; } int tcf_exts_validate(struct net *net, struct tcf_proto *tp, -- cgit v1.2.3 From ec1a9cca0e13391167567964fd04e61a39d6a4ae Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:29:02 +0200 Subject: net: sched: remove check for number of actions in tcf_exts_exec Leave it to tcf_action_exec to return TC_ACT_OK in case there is no action present. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 4 +--- net/sched/act_api.c | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 61ce521688b2..b8959c9a190d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -192,9 +192,7 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { #ifdef CONFIG_NET_CLS_ACT - if (tcf_exts_has_actions(exts)) - return tcf_action_exec(skb, exts->actions, exts->nr_actions, - res); + return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); #endif return TC_ACT_OK; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f19b118df414..a2915d958279 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -473,9 +473,10 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { - int ret = -1, i; u32 jmp_prgcnt = 0; u32 jmp_ttl = TCA_ACT_MAX_PRIO; /*matches actions per filter */ + int i; + int ret = TC_ACT_OK; if (skb_skip_tc_classify(skb)) return TC_ACT_OK; -- cgit v1.2.3 From 9b0d4446b56904b59ae3809913b0ac760fa941a6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 4 Aug 2017 14:29:15 +0200 Subject: net: sched: avoid atomic swap in tcf_exts_change tcf_exts_change is always called on newly created exts, which are not used on fastpath. Therefore, simple struct copy is enough. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +-- net/sched/cls_api.c | 10 ++-------- net/sched/cls_rsvp.h | 4 ++-- net/sched/cls_tcindex.c | 6 +++--- 4 files changed, 8 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index b8959c9a190d..e0c54f111467 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -201,8 +201,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr); void tcf_exts_destroy(struct tcf_exts *exts); -void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, - struct tcf_exts *src); +void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 735d556a5283..e655221c654e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -883,18 +883,12 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, } EXPORT_SYMBOL(tcf_exts_validate); -void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, - struct tcf_exts *src) +void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src) { #ifdef CONFIG_NET_CLS_ACT struct tcf_exts old = *dst; - tcf_tree_lock(tp); - dst->nr_actions = src->nr_actions; - dst->actions = src->actions; - dst->type = src->type; - tcf_tree_unlock(tp); - + *dst = *src; tcf_exts_destroy(&old); #endif } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 0d9d07798699..4adb67a73491 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -518,7 +518,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, tcf_bind_filter(tp, &n->res, base); } - tcf_exts_change(tp, &n->exts, &e); + tcf_exts_change(&n->exts, &e); rsvp_replace(tp, n, handle); return 0; } @@ -591,7 +591,7 
@@ insert: if (f->tunnelhdr == 0) tcf_bind_filter(tp, &f->res, base); - tcf_exts_change(tp, &f->exts, &e); + tcf_exts_change(&f->exts, &e); fp = &s->ht[h2]; for (nfp = rtnl_dereference(*fp); nfp; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 66924d147e97..d69f828f3fed 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -419,9 +419,9 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (old_r) - tcf_exts_change(tp, &r->exts, &e); + tcf_exts_change(&r->exts, &e); else - tcf_exts_change(tp, &cr.exts, &e); + tcf_exts_change(&cr.exts, &e); if (old_r && old_r != r) { err = tcindex_filter_result_init(old_r); @@ -439,7 +439,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; - tcf_exts_change(tp, &f->result.exts, &r->exts); + tcf_exts_change(&f->result.exts, &r->exts); fp = cp->h + (handle % cp->hash); for (nfp = rtnl_dereference(*fp); -- cgit v1.2.3 From 56ce097c1caede1f9c191a7c9699b950e7c36ad9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 4 Aug 2017 08:24:05 -0700 Subject: net: comment fixes against BPF devmap helper calls Update BPF comments to accurately reflect XDP usage. Fixes: 97f91a7cf04ff ("bpf: add bpf_redirect_map helper routine") Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1106a8c4cd36..1d06be1569b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -345,14 +345,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT - * int bpf_redirect_map(key, map, flags) + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) * redirect to endpoint in map + * @map: pointer to dev map * @key: index in map to lookup - * @map: fd of map to do lookup in * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid -- cgit v1.2.3 From 8eede5bc4ef04281fbba7ddfe157052d0e76075d Mon Sep 17 00:00:00 2001 From: Nate Watterson Date: Thu, 20 Jul 2017 15:26:24 -0400 Subject: ata: ahci_platform: Add shutdown handler The newly introduced ahci_platform_shutdown() method is called during system shutdown to disable host controller DMA and interrupts in order to avoid potentially corrupting or otherwise interfering with a new kernel being started with kexec. 
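A minimal sketch of how another libahci_platform-based driver could opt into the new handler; the "foo" names and the stub probe body are hypothetical, while ahci_platform_shutdown() and ata_platform_remove_one() are the real library helpers:

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/libata.h>
#include <linux/ahci_platform.h>

static int foo_ahci_probe(struct platform_device *pdev)
{
	/* hypothetical stub; a real driver would call
	 * ahci_platform_get_resources() and ahci_platform_init_host()
	 */
	return 0;
}

static struct platform_driver foo_ahci_driver = {
	.probe = foo_ahci_probe,
	.remove = ata_platform_remove_one,
	/* new: quiesce port DMA and IRQs so a kexec'd kernel starts clean */
	.shutdown = ahci_platform_shutdown,
	.driver = {
		.name = "foo-ahci",
	},
};
module_platform_driver(foo_ahci_driver);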
Signed-off-by: Nate Watterson Signed-off-by: Tejun Heo --- drivers/ata/ahci_platform.c | 1 + drivers/ata/libahci_platform.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/ahci_platform.h | 2 ++ 3 files changed, 37 insertions(+) (limited to 'include') diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 62a04c8fb5c9..99f9a895a459 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -93,6 +93,7 @@ MODULE_DEVICE_TABLE(acpi, ahci_acpi_match); static struct platform_driver ahci_driver = { .probe = ahci_probe, .remove = ata_platform_remove_one, + .shutdown = ahci_platform_shutdown, .driver = { .name = DRV_NAME, .of_match_table = ahci_of_match, diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index cd2eab6aa92e..a270a1173c8c 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -602,6 +602,40 @@ static void ahci_host_stop(struct ata_host *host) ahci_platform_disable_resources(hpriv); } +/** + * ahci_platform_shutdown - Disable interrupts and stop DMA for host ports + * @dev: platform device pointer for the host + * + * This function is called during system shutdown and performs the minimal + * deconfiguration required to ensure that an ahci_platform host cannot + * corrupt or otherwise interfere with a new kernel being started with kexec. + */ +void ahci_platform_shutdown(struct platform_device *pdev) +{ + struct ata_host *host = platform_get_drvdata(pdev); + struct ahci_host_priv *hpriv = host->private_data; + void __iomem *mmio = hpriv->mmio; + int i; + + for (i = 0; i < host->n_ports; i++) { + struct ata_port *ap = host->ports[i]; + + /* Disable port interrupts */ + if (ap->ops->freeze) + ap->ops->freeze(ap); + + /* Stop the port DMA engines */ + if (ap->ops->port_stop) + ap->ops->port_stop(ap); + } + + /* Disable and clear host interrupts */ + writel(readl(mmio + HOST_CTL) & ~HOST_IRQ_EN, mmio + HOST_CTL); + readl(mmio + HOST_CTL); /* flush */ + writel(GENMASK(host->n_ports, 0), mmio + HOST_IRQ_STAT); +} +EXPORT_SYMBOL_GPL(ahci_platform_shutdown); + #ifdef CONFIG_PM_SLEEP /** * ahci_platform_suspend_host - Suspend an ahci-platform host diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index a270f25ee7c7..1b0a17b22cd3 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -36,6 +36,8 @@ int ahci_platform_init_host(struct platform_device *pdev, const struct ata_port_info *pi_template, struct scsi_host_template *sht); +void ahci_platform_shutdown(struct platform_device *pdev); + int ahci_platform_suspend_host(struct device *dev); int ahci_platform_resume_host(struct device *dev); int ahci_platform_suspend(struct device *dev); -- cgit v1.2.3 From e870c6c87cf9484090d28f2a68aa29e008960c93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 31 Jul 2017 23:43:18 +0200 Subject: ACPI / PM: Prefer suspend-to-idle over S3 on some systems Modify the ACPI system sleep support setup code to select suspend-to-idle as the default system sleep state if (1) the ACPI_FADT_LOW_POWER_S0 flag is set in the FADT and (2) the Low Power Idle S0 _DSM interface has been discovered and (3) the default sleep state was not selected from the kernel command line. The main motivation for this change is that systems where the (1) and (2) conditions are met typically ship with OSes that don't exercise the S3 path in the platform firmware which remains untested and turns out to be non-functional at least in some cases. Signed-off-by: Rafael J. 
Wysocki Tested-by: Mario Limonciello --- Documentation/power/states.txt | 4 +++- drivers/acpi/sleep.c | 6 ++++++ include/linux/suspend.h | 3 +++ kernel/power/power.h | 1 - kernel/power/suspend.c | 4 ++-- 5 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index bc4548245a24..205e45ad7c65 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -35,7 +35,9 @@ only one way to cause the system to go into the Suspend-To-RAM state (write The default suspend mode (ie. the one to be used without writing anything into /sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or "s2idle", but it can be overridden by the value of the "mem_sleep_default" -parameter in the kernel command line. +parameter in the kernel command line. On some ACPI-based systems, depending on +the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM +is supported. The properties of all of the sleep states are described below. diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index be17664736b2..b363283dfcd9 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -714,6 +714,12 @@ static int lps0_device_attach(struct acpi_device *adev, if ((bitmask & ACPI_S2IDLE_FUNC_MASK) == ACPI_S2IDLE_FUNC_MASK) { lps0_dsm_func_mask = bitmask; lps0_device_handle = adev->handle; + /* + * Use suspend-to-idle by default if the default + * suspend mode was not set from the command line. + */ + if (mem_sleep_default > PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_FREEZE; } acpi_handle_debug(adev->handle, "_DSM function mask: 0x%x\n", diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 97e394feabdb..8c3b0b1e6786 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -196,6 +196,9 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND +extern suspend_state_t mem_sleep_current; +extern suspend_state_t mem_sleep_default; + /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. 
diff --git a/kernel/power/power.h b/kernel/power/power.h index 268c1b0afc28..1d2d761e3c25 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -192,7 +192,6 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); extern const char * const pm_labels[]; extern const char *pm_states[]; extern const char *mem_sleep_states[]; -extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4bce46ddc2cd..0639d3a79852 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -48,7 +48,7 @@ static const char * const mem_sleep_labels[] = { const char *mem_sleep_states[PM_SUSPEND_MAX]; suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; -static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; suspend_state_t pm_suspend_target_state; EXPORT_SYMBOL_GPL(pm_suspend_target_state); @@ -216,7 +216,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) } if (valid_state(PM_SUSPEND_MEM)) { mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; - if (mem_sleep_default == PM_SUSPEND_MEM) + if (mem_sleep_default >= PM_SUSPEND_MEM) mem_sleep_current = PM_SUSPEND_MEM; } -- cgit v1.2.3 From 401c0a19c6c22efcaff85d5a64a396f9130da2ca Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 4 Aug 2017 17:20:16 -0700 Subject: nfit, libnvdimm, region: export 'position' in mapping info It is useful to be able to know the position of a DIMM in an interleave-set. Consider the case where the order of the DIMMs changes causing a namespace to be invalidated because the interleave-set cookie no longer matches. If the before and after state of each DIMM position is known, this state can be debugged by the system owner.
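With the extra field, a region's mappingN sysfs attribute prints "<dimm>,<start>,<size>,<position>". A minimal userspace sketch for reading the position back; the /sys/bus/nd/devices/region0/mapping0 path is an assumption about one particular system's device naming:

#include <stdio.h>

int main(void)
{
	char dimm[64];
	unsigned long long start, size;
	int position;
	FILE *f = fopen("/sys/bus/nd/devices/region0/mapping0", "r");

	if (!f)
		return 1;
	/* format after this change: "<dimm>,<start>,<size>,<position>" */
	if (fscanf(f, "%63[^,],%llu,%llu,%d", dimm, &start, &size,
		   &position) == 4)
		printf("%s sits at interleave position %d\n", dimm, position);
	fclose(f);
	return 0;
}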
Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 24 ++++++++++++++++++++++++ drivers/nvdimm/nd.h | 1 + drivers/nvdimm/region_devs.c | 6 ++++-- include/linux/libnvdimm.h | 1 + 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 19182d091587..be231a549eb0 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1835,6 +1835,30 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, cmp_map_compat, NULL); nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); + /* record the result of the sort for the mapping position */ + for (i = 0; i < nr; i++) { + struct nfit_set_info_map2 *map2 = &info2->mapping[i]; + int j; + + for (j = 0; j < nr; j++) { + struct nd_mapping_desc *mapping = &ndr_desc->mapping[j]; + struct nvdimm *nvdimm = mapping->nvdimm; + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + + if (map2->serial_number + == nfit_mem->dcr->serial_number && + map2->vendor_id + == nfit_mem->dcr->vendor_id && + map2->manufacturing_date + == nfit_mem->dcr->manufacturing_date && + map2->manufacturing_location + == nfit_mem->dcr->manufacturing_location) { + mapping->position = i; + break; + } + } + } + ndr_desc->nd_set = nd_set; devm_kfree(dev, info); devm_kfree(dev, info2); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e9fa9e84b364..a08fc2e24fb3 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -134,6 +134,7 @@ struct nd_mapping { struct nvdimm *nvdimm; u64 start; u64 size; + int position; struct list_head labels; struct mutex lock; /* diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 5954cfbea3fc..829d760f651c 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -723,8 +723,9 @@ static ssize_t mappingN(struct device *dev, char *buf, int n) nd_mapping = &nd_region->mapping[n]; nvdimm = nd_mapping->nvdimm; - return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev), - nd_mapping->start, nd_mapping->size); + return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size, + nd_mapping->position); } #define REGION_MAPPING(idx) \ @@ -965,6 +966,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->mapping[i].nvdimm = nvdimm; nd_region->mapping[i].start = mapping->start; nd_region->mapping[i].size = mapping->size; + nd_region->mapping[i].position = mapping->position; INIT_LIST_HEAD(&nd_region->mapping[i].labels); mutex_init(&nd_region->mapping[i].lock); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index f3d3e6af8838..9b8d81a7b80e 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -87,6 +87,7 @@ struct nd_mapping_desc { struct nvdimm *nvdimm; u64 start; u64 size; + int position; }; struct nd_region_desc { -- cgit v1.2.3 From 472b46c352c9ff0b6fa57dbf85d77c51901a3368 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 6 Aug 2017 18:44:27 +0200 Subject: uapi linux/kfd_ioctl.h: only use __u32 and __u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include instead of which on Linux includes and on non-Linux platforms defines __u32 etc types. 
Fixes user space compilation errors like: linux/kfd_ioctl.h:33:2: error: unknown type name ‘uint32_t’ uint32_t major_version; /* from KFD */ ^~~~~~~~ Signed-off-by: Mikko Rapeli Acked-by: Arnd Bergmann Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 172 ++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7b4567bacfc2..26283fefdf5f 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -23,15 +23,15 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include #define KFD_IOCTL_MAJOR_VERSION 1 #define KFD_IOCTL_MINOR_VERSION 1 struct kfd_ioctl_get_version_args { - uint32_t major_version; /* from KFD */ - uint32_t minor_version; /* from KFD */ + __u32 major_version; /* from KFD */ + __u32 minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -43,36 +43,36 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - uint64_t ring_base_address; /* to KFD */ - uint64_t write_pointer_address; /* from KFD */ - uint64_t read_pointer_address; /* from KFD */ - uint64_t doorbell_offset; /* from KFD */ - - uint32_t ring_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t queue_type; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ - uint32_t queue_id; /* from KFD */ - - uint64_t eop_buffer_address; /* to KFD */ - uint64_t eop_buffer_size; /* to KFD */ - uint64_t ctx_save_restore_address; /* to KFD */ - uint64_t ctx_save_restore_size; /* to KFD */ + __u64 ring_base_address; /* to KFD */ + __u64 write_pointer_address; /* from KFD */ + __u64 read_pointer_address; /* from KFD */ + __u64 doorbell_offset; /* from KFD */ + + __u32 ring_size; /* to KFD */ + __u32 gpu_id; /* to KFD */ + __u32 queue_type; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ + __u32 queue_id; /* from KFD */ + + __u64 eop_buffer_address; /* to KFD */ + __u64 eop_buffer_size; /* to KFD */ + __u64 ctx_save_restore_address; /* to KFD */ + __u64 ctx_save_restore_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - uint32_t queue_id; /* to KFD */ - uint32_t pad; + __u32 queue_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_update_queue_args { - uint64_t ring_base_address; /* to KFD */ + __u64 ring_base_address; /* to KFD */ - uint32_t queue_id; /* to KFD */ - uint32_t ring_size; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ + __u32 queue_id; /* to KFD */ + __u32 ring_size; /* to KFD */ + __u32 queue_percentage; /* to KFD */ + __u32 queue_priority; /* to KFD */ }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -80,13 +80,13 @@ struct kfd_ioctl_update_queue_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - uint64_t alternate_aperture_base; /* to KFD */ - uint64_t alternate_aperture_size; /* to KFD */ + __u64 alternate_aperture_base; /* to KFD */ + __u64 alternate_aperture_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t default_policy; /* to KFD */ - uint32_t alternate_policy; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 default_policy; /* to KFD */ + __u32 alternate_policy; /* to KFD */ + __u32 pad; }; /* @@ -97,26 +97,26 @@ struct kfd_ioctl_set_memory_policy_args { */ struct 
kfd_ioctl_get_clock_counters_args { - uint64_t gpu_clock_counter; /* from KFD */ - uint64_t cpu_clock_counter; /* from KFD */ - uint64_t system_clock_counter; /* from KFD */ - uint64_t system_clock_freq; /* from KFD */ + __u64 gpu_clock_counter; /* from KFD */ + __u64 cpu_clock_counter; /* from KFD */ + __u64 system_clock_counter; /* from KFD */ + __u64 system_clock_freq; /* from KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; #define NUM_OF_SUPPORTED_GPUS 7 struct kfd_process_device_apertures { - uint64_t lds_base; /* from KFD */ - uint64_t lds_limit; /* from KFD */ - uint64_t scratch_base; /* from KFD */ - uint64_t scratch_limit; /* from KFD */ - uint64_t gpuvm_base; /* from KFD */ - uint64_t gpuvm_limit; /* from KFD */ - uint32_t gpu_id; /* from KFD */ - uint32_t pad; + __u64 lds_base; /* from KFD */ + __u64 lds_limit; /* from KFD */ + __u64 scratch_base; /* from KFD */ + __u64 scratch_limit; /* from KFD */ + __u64 gpuvm_base; /* from KFD */ + __u64 gpuvm_limit; /* from KFD */ + __u32 gpu_id; /* from KFD */ + __u32 pad; }; struct kfd_ioctl_get_process_apertures_args { @@ -124,8 +124,8 @@ struct kfd_ioctl_get_process_apertures_args { process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - uint32_t num_of_nodes; - uint32_t pad; + __u32 num_of_nodes; + __u32 pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -133,25 +133,25 @@ struct kfd_ioctl_get_process_apertures_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_unregister_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; + __u32 gpu_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_dbg_address_watch_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ + __u64 content_ptr; /* a pointer to the actual content */ + __u32 gpu_id; /* to KFD */ + __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ }; /* Matching HSA_EVENTTYPE */ @@ -172,44 +172,44 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_SIGNAL_EVENT_LIMIT 256 struct kfd_ioctl_create_event_args { - uint64_t event_page_offset; /* from KFD */ - uint32_t event_trigger_data; /* from KFD - signal events only */ - uint32_t event_type; /* to KFD */ - uint32_t auto_reset; /* to KFD */ - uint32_t node_id; /* to KFD - only valid for certain + __u64 event_page_offset; /* from KFD */ + __u32 event_trigger_data; /* from KFD - signal events only */ + __u32 event_type; /* to KFD */ + __u32 auto_reset; /* to KFD */ + __u32 node_id; /* to KFD - only valid for certain event types */ - uint32_t event_id; /* from KFD */ - uint32_t event_slot_index; /* from KFD */ + __u32 event_id; /* from KFD */ + __u32 event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_set_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; 
/* to KFD */ + __u32 pad; }; struct kfd_ioctl_reset_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_memory_exception_failure { - uint32_t NotPresent; /* Page not present or supervisor privilege */ - uint32_t ReadOnly; /* Write access to a read-only page */ - uint32_t NoExecute; /* Execute access to a page marked NX */ - uint32_t pad; + __u32 NotPresent; /* Page not present or supervisor privilege */ + __u32 ReadOnly; /* Write access to a read-only page */ + __u32 NoExecute; /* Execute access to a page marked NX */ + __u32 pad; }; /* memory exception data*/ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - uint64_t va; - uint32_t gpu_id; - uint32_t pad; + __u64 va; + __u32 gpu_id; + __u32 pad; }; /* Event data*/ @@ -217,19 +217,19 @@ struct kfd_event_data { union { struct kfd_hsa_memory_exception_data memory_exception_data; }; /* From KFD */ - uint64_t kfd_event_data_ext; /* pointer to an extension structure + __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - uint32_t event_id; /* to KFD */ - uint32_t pad; + __u32 event_id; /* to KFD */ + __u32 pad; }; struct kfd_ioctl_wait_events_args { - uint64_t events_ptr; /* pointed to struct + __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - uint32_t num_events; /* to KFD */ - uint32_t wait_for_all; /* to KFD */ - uint32_t timeout; /* to KFD */ - uint32_t wait_result; /* from KFD */ + __u32 num_events; /* to KFD */ + __u32 wait_for_all; /* to KFD */ + __u32 timeout; /* to KFD */ + __u32 wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { -- cgit v1.2.3 From cbbd6c2f551c27199c90f680851609f888a3048b Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Wed, 2 Aug 2017 16:28:39 +0800 Subject: clk: rockchip: add more clk ids for rv1108 Add new clk ids for the peripherals on rv1108 soc. 
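For context on how such binding IDs are consumed: a device tree node references a clock as <&cru SCLK_SARADC>, and the clock controller driver publishes a lookup table indexed by the same constants. A rough sketch of the provider side, assuming a hypothetical registration helper rather than the actual rockchip driver code:

#include <linux/clk-provider.h>
#include <linux/of.h>
#include <linux/slab.h>
#include <dt-bindings/clock/rv1108-cru.h>

static struct clk_hw_onecell_data *clk_data; /* hypothetical provider state */

static void example_register_provider(struct device_node *np)
{
	clk_data = kzalloc(sizeof(*clk_data) +
			   CLK_NR_CLKS * sizeof(struct clk_hw *), GFP_KERNEL);
	if (!clk_data)
		return;
	clk_data->num = CLK_NR_CLKS;
	/* a real driver fills clk_data->hws[SCLK_SARADC] etc. before this */
	of_clk_add_hw_provider(np, of_clk_hw_onecell_get, clk_data);
}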
Signed-off-by: Elaine Zhang Signed-off-by: Andy Yan Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rv1108-cru.h | 93 +++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/rv1108-cru.h b/include/dt-bindings/clock/rv1108-cru.h index ae26f8105914..fe013cf51e2b 100644 --- a/include/dt-bindings/clock/rv1108-cru.h +++ b/include/dt-bindings/clock/rv1108-cru.h @@ -43,12 +43,73 @@ #define SCLK_SDMMC_SAMPLE 84 #define SCLK_SDIO_SAMPLE 85 #define SCLK_EMMC_SAMPLE 86 +#define SCLK_VENC_CORE 87 +#define SCLK_HEVC_CORE 88 +#define SCLK_HEVC_CABAC 89 +#define SCLK_PWM0_PMU 90 +#define SCLK_I2C0_PMU 91 +#define SCLK_WIFI 92 +#define SCLK_CIFOUT 93 +#define SCLK_MIPI_CSI_OUT 94 +#define SCLK_CIF0 95 +#define SCLK_CIF1 96 +#define SCLK_CIF2 97 +#define SCLK_CIF3 98 +#define SCLK_DSP 99 +#define SCLK_DSP_IOP 100 +#define SCLK_DSP_EPP 101 +#define SCLK_DSP_EDP 102 +#define SCLK_DSP_EDAP 103 +#define SCLK_CVBS_HOST 104 +#define SCLK_HDMI_SFR 105 +#define SCLK_HDMI_CEC 106 +#define SCLK_CRYPTO 107 +#define SCLK_SPI 108 +#define SCLK_SARADC 109 +#define SCLK_TSADC 110 +#define SCLK_MACPHY_PRE 111 +#define SCLK_MACPHY 112 +#define SCLK_MACPHY_RX 113 +#define SCLK_MAC_REF 114 +#define SCLK_MAC_REFOUT 115 +#define SCLK_DSP_PFM 116 +#define SCLK_RGA 117 +#define SCLK_I2C1 118 +#define SCLK_I2C2 119 +#define SCLK_I2C3 120 +#define SCLK_PWM 121 +#define SCLK_ISP 122 +#define SCLK_USBPHY 123 +#define SCLK_I2S0_SRC 124 +#define SCLK_I2S1_SRC 125 +#define SCLK_I2S2_SRC 126 +#define SCLK_UART0_SRC 127 +#define SCLK_UART1_SRC 128 +#define SCLK_UART2_SRC 129 + +#define DCLK_VOP_SRC 185 +#define DCLK_HDMIPHY 186 +#define DCLK_VOP 187 /* aclk gates */ #define ACLK_DMAC 192 #define ACLK_PRE 193 #define ACLK_CORE 194 #define ACLK_ENMCORE 195 +#define ACLK_RKVENC 196 +#define ACLK_RKVDEC 197 +#define ACLK_VPU 198 +#define ACLK_CIF0 199 +#define ACLK_VIO0 200 +#define ACLK_VIO1 201 +#define ACLK_VOP 202 +#define ACLK_IEP 203 +#define ACLK_RGA 204 +#define ACLK_ISP 205 +#define ACLK_CIF1 206 +#define ACLK_CIF2 207 +#define ACLK_CIF3 208 +#define ACLK_PERI 209 /* pclk gates */ #define PCLK_GPIO1 256 @@ -67,6 +128,19 @@ #define PCLK_PWM 269 #define PCLK_TIMER 270 #define PCLK_PERI 271 +#define PCLK_GPIO0_PMU 272 +#define PCLK_I2C0_PMU 273 +#define PCLK_PWM0_PMU 274 +#define PCLK_ISP 275 +#define PCLK_VIO 276 +#define PCLK_MIPI_DSI 277 +#define PCLK_HDMI_CTRL 278 +#define PCLK_SARADC 279 +#define PCLK_DSP_CFG 280 +#define PCLK_BUS 281 +#define PCLK_EFUSE0 282 +#define PCLK_EFUSE1 283 +#define PCLK_WDT 284 /* hclk gates */ #define HCLK_I2S0_8CH 320 @@ -78,8 +152,25 @@ #define HCLK_EMMC 326 #define HCLK_PERI 327 #define HCLK_SFC 328 +#define HCLK_RKVENC 329 +#define HCLK_RKVDEC 330 +#define HCLK_CIF0 331 +#define HCLK_VIO 332 +#define HCLK_VOP 333 +#define HCLK_IEP 334 +#define HCLK_RGA 335 +#define HCLK_ISP 336 +#define HCLK_CRYPTO_MST 337 +#define HCLK_CRYPTO_SLV 338 +#define HCLK_HOST0 339 +#define HCLK_OTG 340 +#define HCLK_CIF1 341 +#define HCLK_CIF2 342 +#define HCLK_CIF3 343 +#define HCLK_BUS 344 +#define HCLK_VPU 345 -#define CLK_NR_CLKS (HCLK_SFC + 1) +#define CLK_NR_CLKS (HCLK_VPU + 1) /* reset id */ #define SRST_CORE_PO_AD 0 -- cgit v1.2.3 From 1b6428a286ae3577b2ba3328a054115d561b8650 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Wed, 2 Aug 2017 16:29:48 +0800 Subject: clk: rockchip: rename the clk id for HCLK_I2S1_2CH i2s1 has 2 channels but not 8 channels. 
Signed-off-by: Elaine Zhang Signed-off-by: Andy Yan [and the clock id hasn't been used in either clock-driver nor dts, so is safe to rename] Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rv1108-cru.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/rv1108-cru.h b/include/dt-bindings/clock/rv1108-cru.h index fe013cf51e2b..e8aa6d56990a 100644 --- a/include/dt-bindings/clock/rv1108-cru.h +++ b/include/dt-bindings/clock/rv1108-cru.h @@ -144,7 +144,7 @@ /* hclk gates */ #define HCLK_I2S0_8CH 320 -#define HCLK_I2S1_8CH 321 +#define HCLK_I2S1_2CH 321 #define HCLK_I2S2_2CH 322 #define HCLK_NANDC 323 #define HCLK_SDMMC 324 -- cgit v1.2.3 From a376a4b0453ffac35ba6215dd1d1e53e37c5e810 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Wed, 2 Aug 2017 16:30:33 +0800 Subject: clk: rockchip: fix up indentation of some RV1108 clock-ids Make the code look better. Signed-off-by: Elaine Zhang Signed-off-by: Andy Yan Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rv1108-cru.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/rv1108-cru.h b/include/dt-bindings/clock/rv1108-cru.h index e8aa6d56990a..f269d833e41a 100644 --- a/include/dt-bindings/clock/rv1108-cru.h +++ b/include/dt-bindings/clock/rv1108-cru.h @@ -173,16 +173,16 @@ #define CLK_NR_CLKS (HCLK_VPU + 1) /* reset id */ -#define SRST_CORE_PO_AD 0 +#define SRST_CORE_PO_AD 0 #define SRST_CORE_AD 1 #define SRST_L2_AD 2 -#define SRST_CPU_NIU_AD 3 +#define SRST_CPU_NIU_AD 3 #define SRST_CORE_PO 4 #define SRST_CORE 5 -#define SRST_L2 6 +#define SRST_L2 6 #define SRST_CORE_DBG 8 #define PRST_DBG 9 -#define RST_DAP 10 +#define RST_DAP 10 #define PRST_DBG_NIU 11 #define ARST_STRC_SYS_AD 15 @@ -249,9 +249,9 @@ #define HRST_SYSBUS 75 #define PRST_USBGRF 76 -#define ARST_PERIPH_NIU 80 -#define HRST_PERIPH_NIU 81 -#define PRST_PERIPH_NIU 82 +#define ARST_PERIPH_NIU 80 +#define HRST_PERIPH_NIU 81 +#define PRST_PERIPH_NIU 82 #define HRST_PERIPH 83 #define HRST_SDMMC 84 #define HRST_SDIO 85 @@ -269,7 +269,7 @@ #define HRST_HOST0_AUX 96 #define HRST_HOST0_ARB 97 #define SRST_HOST0_EHCIPHY 98 -#define SRST_HOST0_UTMI 99 +#define SRST_HOST0_UTMI 99 #define SRST_USBPOR 100 #define SRST_UTMI0 101 #define SRST_UTMI1 102 @@ -316,21 +316,21 @@ #define HRST_VPU_NIU 141 #define ARST_VPU 142 #define HRST_VPU 143 -#define ARST_RKVDEC_NIU 144 -#define HRST_RKVDEC_NIU 145 +#define ARST_RKVDEC_NIU 144 +#define HRST_RKVDEC_NIU 145 #define ARST_RKVDEC 146 #define HRST_RKVDEC 147 #define SRST_RKVDEC_CABAC 148 #define SRST_RKVDEC_CORE 149 -#define ARST_RKVENC_NIU 150 -#define HRST_RKVENC_NIU 151 +#define ARST_RKVENC_NIU 150 +#define HRST_RKVENC_NIU 151 #define ARST_RKVENC 152 #define HRST_RKVENC 153 #define SRST_RKVENC_CORE 154 #define SRST_DSP_CORE 156 #define SRST_DSP_SYS 157 -#define SRST_DSP_GLOBAL 158 +#define SRST_DSP_GLOBAL 158 #define SRST_DSP_OECM 159 #define PRST_DSP_IOP_NIU 160 #define ARST_DSP_EPP_NIU 161 @@ -348,7 +348,7 @@ #define SRST_PMU_I2C0 173 #define PRST_PMU_I2C0 174 #define PRST_PMU_GPIO0 175 -#define PRST_PMU_INTMEM 176 +#define PRST_PMU_INTMEM 176 #define PRST_PMU_PWM0 177 #define SRST_PMU_PWM0 178 #define PRST_PMU_GRF 179 -- cgit v1.2.3 From 91ed1e666a4ea2e260452a7d7d311ac5ae852cba Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 3 Aug 2017 18:07:06 +0200 Subject: ip/options: explicitly provide net ns to __ip_options_echo() __ip_options_echo() uses the current network 
namespace, and currently retrieves it via skb->dst->dev. This commit adds an explicit 'net' argument to __ip_options_echo() and updates all the call sites to provide it, usually via a simpler sock_net(). After this change, __ip_options_echo() no longer needs to access skb->dst, and we can drop a couple of hacks that preserved such info in the rx path. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/ip.h | 9 +++++---- include/net/tcp.h | 5 +++-- net/ipv4/icmp.c | 4 ++-- net/ipv4/ip_options.c | 6 +++--- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 7 ++++--- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- 8 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 821cedcc8e73..9e59dcf1787a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -567,11 +567,12 @@ int ip_forward(struct sk_buff *skb); void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); -int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, - const struct ip_options *sopt); -static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +int __ip_options_echo(struct net *net, struct ip_options *dopt, + struct sk_buff *skb, const struct ip_options *sopt); +static inline int ip_options_echo(struct net *net, struct ip_options *dopt, + struct sk_buff *skb) { - return __ip_options_echo(dopt, skb, &IPCB(skb)->opt); + return __ip_options_echo(net, dopt, skb, &IPCB(skb)->opt); } void ip_options_fragment(struct sk_buff *skb); diff --git a/include/net/tcp.h b/include/net/tcp.h index bb1881b4ce48..5173fecde495 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1885,7 +1885,8 @@ extern void tcp_rack_reo_timeout(struct sock *sk); /* * Save and compile IPv4 options, return a pointer to it */ -static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) +static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, + struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; @@ -1894,7 +1895,7 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c2be26b98b5f..681e33998e03 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -412,7 +412,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; /* Needed by both icmp_global_allow and icmp_xmit_lock */ @@ -694,7 +694,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) + if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index fdda97308c0b..525ae88d1e58 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -86,8 +86,8 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb.
*/ -int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, - const struct ip_options *sopt) +int __ip_options_echo(struct net *net, struct ip_options *dopt, + struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; @@ -140,7 +140,7 @@ int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, __be32 addr; memcpy(&addr, dptr+soffset-1, 4); - if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) { + if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b631ec685d77..73b0b15245b6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1525,7 +1525,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, int err; int oif; - if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) + if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ecc4b4a2413e..1c3354d028a4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -80,7 +80,8 @@ static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) } -static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, + struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; @@ -88,7 +89,7 @@ static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) if (IPCB(skb)->opt.optlen == 0) return; - if (ip_options_echo(opt, skb)) { + if (ip_options_echo(net, opt, skb)) { msg->msg_flags |= MSG_CTRUNC; return; } @@ -204,7 +205,7 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, } if (flags & IP_CMSG_RETOPTS) { - ip_cmsg_recv_retopts(msg, skb); + ip_cmsg_recv_retopts(sock_net(sk), msg, skb); flags &= ~IP_CMSG_RETOPTS; if (!flags) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 03ad8778c395..b1bb1b3a1082 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -355,7 +355,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) */ - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk), skb); if (security_inet_conn_request(sk, skb, req)) { reqsk_free(req); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9b51663cd5a4..5f708c85110e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1267,7 +1267,7 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, -- cgit v1.2.3 From da4625ac2637e4e5249dc08a10f8dce7643603d2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 25 Jul 2017 15:02:42 +0100 Subject: net: phy: split out PHY speed and duplex string generation Other code would like to make use of this, so make the speed and duplex string generation visible, and place it in a separate file. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-core.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy.c | 38 +---------------------------------- include/linux/phy.h | 3 +++ 3 files changed, 53 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 6739b738bbaf..bf01a24f21ce 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -9,6 +9,55 @@ #include #include +const char *phy_speed_to_str(int speed) +{ + switch (speed) { + case SPEED_10: + return "10Mbps"; + case SPEED_100: + return "100Mbps"; + case SPEED_1000: + return "1Gbps"; + case SPEED_2500: + return "2.5Gbps"; + case SPEED_5000: + return "5Gbps"; + case SPEED_10000: + return "10Gbps"; + case SPEED_14000: + return "14Gbps"; + case SPEED_20000: + return "20Gbps"; + case SPEED_25000: + return "25Gbps"; + case SPEED_40000: + return "40Gbps"; + case SPEED_50000: + return "50Gbps"; + case SPEED_56000: + return "56Gbps"; + case SPEED_100000: + return "100Gbps"; + case SPEED_UNKNOWN: + return "Unknown"; + default: + return "Unsupported (update phy-core.c)"; + } +} +EXPORT_SYMBOL_GPL(phy_speed_to_str); + +const char *phy_duplex_to_str(unsigned int duplex) +{ + if (duplex == DUPLEX_HALF) + return "Half"; + if (duplex == DUPLEX_FULL) + return "Full"; + if (duplex == DUPLEX_UNKNOWN) + return "Unknown"; + return "Unsupported (update phy-core.c)"; +} +EXPORT_SYMBOL_GPL(phy_duplex_to_str); + static void mmd_phy_indirect(struct mii_bus *bus, int phy_addr, int devad, u16 regnum) { diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index abae9167b5fc..7b7fe6beae7e 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -38,42 +38,6 @@ #include -static const char *phy_speed_to_str(int speed) -{ - switch (speed) { - case SPEED_10: - return "10Mbps"; - case SPEED_100: - return "100Mbps"; - case SPEED_1000: - return "1Gbps"; - case SPEED_2500: - return "2.5Gbps"; - case SPEED_5000: - return "5Gbps"; - case SPEED_10000: - return "10Gbps"; - case SPEED_14000: - return "14Gbps"; - case SPEED_20000: - return "20Gbps"; - case SPEED_25000: - return "25Gbps"; - case SPEED_40000: - return "40Gbps"; - case SPEED_50000: - return "50Gbps"; - case SPEED_56000: - return "56Gbps"; - case SPEED_100000: - return "100Gbps"; - case SPEED_UNKNOWN: - return "Unknown"; - default: - return "Unsupported (update phy.c)"; - } -} - #define PHY_STATE_STR(_state) \ case PHY_##_state: \ return __stringify(_state); \ @@ -109,7 +73,7 @@ void phy_print_status(struct phy_device *phydev) netdev_info(phydev->attached_dev, "Link is Up - %s/%s - flow control %s\n", phy_speed_to_str(phydev->speed), - DUPLEX_FULL == phydev->duplex ? "Full" : "Half", + phy_duplex_to_str(phydev->duplex), phydev->pause ? "rx/tx" : "off"); } else { netdev_info(phydev->attached_dev, "Link is Down\n"); diff --git a/include/linux/phy.h b/include/linux/phy.h index 0bb5b212ab42..e8264c78b75b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -667,6 +667,9 @@ struct phy_fixup { int (*run)(struct phy_device *phydev); }; +const char *phy_speed_to_str(int speed); +const char *phy_duplex_to_str(unsigned int duplex); + /** * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. 
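With phy_speed_to_str() and phy_duplex_to_str() now exported, callers outside phylib can format link state the same way phy_print_status() does. A minimal sketch of such a caller (hypothetical code, not part of this patch; my_print_link is an illustrative name):

#include <linux/netdevice.h>
#include <linux/phy.h>

static void my_print_link(struct phy_device *phydev)
{
	/* the helpers return strings such as "1Gbps" and "Full" */
	netdev_info(phydev->attached_dev, "link %s/%s\n",
		    phy_speed_to_str(phydev->speed),
		    phy_duplex_to_str(phydev->duplex));
}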
-- cgit v1.2.3 From 0ccb4fc65d2799a315d5ee8732d75f35a114379c Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 25 Jul 2017 15:02:47 +0100 Subject: net: phy: move phy_lookup_setting() and guts of phy_supported_speeds() to phy-core phy_lookup_setting() provides useful functionality in ethtool code outside phylib. Move it to phy-core and allow it to be re-used (eg, in phylink) rather than duplicated elsewhere. Note that this supports the larger linkmode space. As we move the phy settings table, we also need to move the guts of phy_supported_speeds() as well. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 126 +++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy.c | 130 +-------------------------------------------- include/linux/phy.h | 15 ++++++ 3 files changed, 142 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index bf01a24f21ce..c07845e77004 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -58,6 +58,132 @@ const char *phy_duplex_to_str(unsigned int duplex) } EXPORT_SYMBOL_GPL(phy_duplex_to_str); +/* A mapping of all SUPPORTED settings to speed/duplex. This table + * must be grouped by speed and sorted in descending match priority + * - iow, descending speed. */ +static const struct phy_setting settings[] = { + { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + }, + { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + }, + { + .speed = SPEED_10000, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_10000baseT_Full_BIT, + }, + { + .speed = SPEED_2500, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_2500baseX_Full_BIT, + }, + { + .speed = SPEED_1000, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + }, + { + .speed = SPEED_1000, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + }, + { + .speed = SPEED_1000, + .duplex = DUPLEX_HALF, + .bit = ETHTOOL_LINK_MODE_1000baseT_Half_BIT, + }, + { + .speed = SPEED_100, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_100baseT_Full_BIT, + }, + { + .speed = SPEED_100, + .duplex = DUPLEX_HALF, + .bit = ETHTOOL_LINK_MODE_100baseT_Half_BIT, + }, + { + .speed = SPEED_10, + .duplex = DUPLEX_FULL, + .bit = ETHTOOL_LINK_MODE_10baseT_Full_BIT, + }, + { + .speed = SPEED_10, + .duplex = DUPLEX_HALF, + .bit = ETHTOOL_LINK_MODE_10baseT_Half_BIT, + }, +}; + +/** + * phy_lookup_setting - lookup a PHY setting + * @speed: speed to match + * @duplex: duplex to match + * @mask: allowed link modes + * @maxbit: bit size of link modes + * @exact: an exact match is required + * + * Search the settings array for a setting that matches the speed and + * duplex, and which is supported. + * + * If @exact is unset, either an exact match or %NULL for no match will + * be returned. + * + * If @exact is set, an exact match, the fastest supported setting at + * or below the specified speed, the slowest supported setting, or if + * they all fail, %NULL will be returned. 
+ */ +const struct phy_setting * +phy_lookup_setting(int speed, int duplex, const unsigned long *mask, + size_t maxbit, bool exact) +{ + const struct phy_setting *p, *match = NULL, *last = NULL; + int i; + + for (i = 0, p = settings; i < ARRAY_SIZE(settings); i++, p++) { + if (p->bit < maxbit && test_bit(p->bit, mask)) { + last = p; + if (p->speed == speed && p->duplex == duplex) { + /* Exact match for speed and duplex */ + match = p; + break; + } else if (!exact) { + if (!match && p->speed <= speed) + /* Candidate */ + match = p; + + if (p->speed < speed) + break; + } + } + } + + if (!match && !exact) + match = last; + + return match; +} +EXPORT_SYMBOL_GPL(phy_lookup_setting); + +size_t phy_speeds(unsigned int *speeds, size_t size, + unsigned long *mask, size_t maxbit) +{ + size_t count; + int i; + + for (i = 0, count = 0; i < ARRAY_SIZE(settings) && count < size; i++) + if (settings[i].bit < maxbit && + test_bit(settings[i].bit, mask) && + (count == 0 || speeds[count - 1] != settings[i].speed)) + speeds[count++] = settings[i].speed; + + return count; +} + static void mmd_phy_indirect(struct mii_bus *bus, int phy_addr, int devad, u16 regnum) { diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 7b7fe6beae7e..71c64a774856 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -157,125 +157,6 @@ int phy_aneg_done(struct phy_device *phydev) } EXPORT_SYMBOL(phy_aneg_done); -/* A structure for mapping a particular speed and duplex - * combination to a particular SUPPORTED and ADVERTISED value - */ -struct phy_setting { - int speed; - int duplex; - int bit; -}; - -/* A mapping of all SUPPORTED settings to speed/duplex. This table - * must be grouped by speed and sorted in descending match priority - * - iow, descending speed. */ -static const struct phy_setting settings[] = { - { - .speed = SPEED_10000, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, - }, - { - .speed = SPEED_10000, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, - }, - { - .speed = SPEED_10000, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_10000baseT_Full_BIT, - }, - { - .speed = SPEED_2500, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_2500baseX_Full_BIT, - }, - { - .speed = SPEED_1000, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, - }, - { - .speed = SPEED_1000, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_1000baseT_Full_BIT, - }, - { - .speed = SPEED_1000, - .duplex = DUPLEX_HALF, - .bit = ETHTOOL_LINK_MODE_1000baseT_Half_BIT, - }, - { - .speed = SPEED_100, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_100baseT_Full_BIT, - }, - { - .speed = SPEED_100, - .duplex = DUPLEX_HALF, - .bit = ETHTOOL_LINK_MODE_100baseT_Half_BIT, - }, - { - .speed = SPEED_10, - .duplex = DUPLEX_FULL, - .bit = ETHTOOL_LINK_MODE_10baseT_Full_BIT, - }, - { - .speed = SPEED_10, - .duplex = DUPLEX_HALF, - .bit = ETHTOOL_LINK_MODE_10baseT_Half_BIT, - }, -}; - -/** - * phy_lookup_setting - lookup a PHY setting - * @speed: speed to match - * @duplex: duplex to match - * @mask: allowed link modes - * @maxbit: bit size of link modes - * @exact: an exact match is required - * - * Search the settings array for a setting that matches the speed and - * duplex, and which is supported. - * - * If @exact is unset, either an exact match or %NULL for no match will - * be returned. 
- * - * If @exact is set, an exact match, the fastest supported setting at - * or below the specified speed, the slowest supported setting, or if - * they all fail, %NULL will be returned. - */ -static const struct phy_setting * -phy_lookup_setting(int speed, int duplex, const unsigned long *mask, - size_t maxbit, bool exact) -{ - const struct phy_setting *p, *match = NULL, *last = NULL; - int i; - - for (i = 0, p = settings; i < ARRAY_SIZE(settings); i++, p++) { - if (p->bit < maxbit && test_bit(p->bit, mask)) { - last = p; - if (p->speed == speed && p->duplex == duplex) { - /* Exact match for speed and duplex */ - match = p; - break; - } else if (!exact) { - if (!match && p->speed <= speed) - /* Candidate */ - match = p; - - if (p->speed < speed) - break; - } - } - } - - if (!match && !exact) - match = last; - - return match; -} - /** * phy_find_valid - find a PHY setting that matches the requested parameters * @speed: desired speed @@ -312,17 +193,8 @@ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int size) { unsigned long supported = phy->supported; - unsigned int count = 0; - unsigned int idx = 0; - - for (idx = 0; idx < ARRAY_SIZE(settings) && count < size; idx++) - /* Assumes settings are grouped by speed */ - if (settings[idx].bit < BITS_PER_LONG && - !test_bit(settings[idx].bit, &supported) && - (count == 0 || speeds[count - 1] != settings[idx].speed)) - speeds[count++] = settings[idx].speed; - return count; + return phy_speeds(speeds, size, &supported, BITS_PER_LONG); } /** diff --git a/include/linux/phy.h b/include/linux/phy.h index e8264c78b75b..8a280257778c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -670,6 +670,21 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +/* A structure for mapping a particular speed and duplex + * combination to a particular SUPPORTED and ADVERTISED value + */ +struct phy_setting { + u32 speed; + u8 duplex; + u8 bit; +}; + +const struct phy_setting * +phy_lookup_setting(int speed, int duplex, const unsigned long *mask, + size_t maxbit, bool exact); +size_t phy_speeds(unsigned int *speeds, size_t size, + unsigned long *mask, size_t maxbit); + /** * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. -- cgit v1.2.3 From a81497bee70eb15039594b3116913133aa9c9b29 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 25 Jul 2017 15:02:58 +0100 Subject: net: phy: provide a hook for link up/link down events Sometimes, we need to do additional work between the PHY coming up and marking the carrier present - for example, we may need to wait for the PHY to MAC link to finish negotiation. This changes phylib to provide a notification function pointer which avoids the built-in netif_carrier_on() and netif_carrier_off() functions. Standard ->adjust_link functionality is provided by hooking a helper into the new ->phy_link_change method. Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- drivers/net/phy/phy.c | 42 ++++++++++++++++++++---------------------- drivers/net/phy/phy_device.c | 14 ++++++++++++++ include/linux/phy.h | 1 + 3 files changed, 35 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 71c64a774856..d5f2af2c5ddb 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -865,9 +865,15 @@ void phy_start(struct phy_device *phydev) } EXPORT_SYMBOL(phy_start); -static void phy_adjust_link(struct phy_device *phydev) +static void phy_link_up(struct phy_device *phydev) { - phydev->adjust_link(phydev->attached_dev); + phydev->phy_link_change(phydev, true, true); + phy_led_trigger_change_speed(phydev); +} + +static void phy_link_down(struct phy_device *phydev, bool do_carrier) +{ + phydev->phy_link_change(phydev, false, do_carrier); phy_led_trigger_change_speed(phydev); } @@ -912,8 +918,7 @@ void phy_state_machine(struct work_struct *work) /* If the link is down, give up on negotiation for now */ if (!phydev->link) { phydev->state = PHY_NOLINK; - netif_carrier_off(phydev->attached_dev); - phy_adjust_link(phydev); + phy_link_down(phydev, true); break; } @@ -925,9 +930,7 @@ void phy_state_machine(struct work_struct *work) /* If AN is done, we're running */ if (err > 0) { phydev->state = PHY_RUNNING; - netif_carrier_on(phydev->attached_dev); - phy_adjust_link(phydev); - + phy_link_up(phydev); } else if (0 == phydev->link_timeout--) needs_aneg = true; break; @@ -952,8 +955,7 @@ void phy_state_machine(struct work_struct *work) } } phydev->state = PHY_RUNNING; - netif_carrier_on(phydev->attached_dev); - phy_adjust_link(phydev); + phy_link_up(phydev); } break; case PHY_FORCING: @@ -963,13 +965,12 @@ void phy_state_machine(struct work_struct *work) if (phydev->link) { phydev->state = PHY_RUNNING; - netif_carrier_on(phydev->attached_dev); + phy_link_up(phydev); } else { if (0 == phydev->link_timeout--) needs_aneg = true; + phy_link_down(phydev, false); } - - phy_adjust_link(phydev); break; case PHY_RUNNING: /* Only register a CHANGE if we are polling and link changed @@ -1001,14 +1002,12 @@ void phy_state_machine(struct work_struct *work) if (phydev->link) { phydev->state = PHY_RUNNING; - netif_carrier_on(phydev->attached_dev); + phy_link_up(phydev); } else { phydev->state = PHY_NOLINK; - netif_carrier_off(phydev->attached_dev); + phy_link_down(phydev, true); } - phy_adjust_link(phydev); - if (phy_interrupt_is_valid(phydev)) err = phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED); @@ -1016,8 +1015,7 @@ void phy_state_machine(struct work_struct *work) case PHY_HALTED: if (phydev->link) { phydev->link = 0; - netif_carrier_off(phydev->attached_dev); - phy_adjust_link(phydev); + phy_link_down(phydev, true); do_suspend = true; } break; @@ -1037,11 +1035,11 @@ void phy_state_machine(struct work_struct *work) if (phydev->link) { phydev->state = PHY_RUNNING; - netif_carrier_on(phydev->attached_dev); + phy_link_up(phydev); } else { phydev->state = PHY_NOLINK; + phy_link_down(phydev, false); } - phy_adjust_link(phydev); } else { phydev->state = PHY_AN; phydev->link_timeout = PHY_AN_TIMEOUT; @@ -1053,11 +1051,11 @@ void phy_state_machine(struct work_struct *work) if (phydev->link) { phydev->state = PHY_RUNNING; - netif_carrier_on(phydev->attached_dev); + phy_link_up(phydev); } else { phydev->state = PHY_NOLINK; + phy_link_down(phydev, false); } - phy_adjust_link(phydev); } break; } diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1790f7fec125..d536a9a7cd2b 100644 --- 
a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -688,6 +688,19 @@ struct phy_device *phy_find_first(struct mii_bus *bus) } EXPORT_SYMBOL(phy_find_first); +static void phy_link_change(struct phy_device *phydev, bool up, bool do_carrier) +{ + struct net_device *netdev = phydev->attached_dev; + + if (do_carrier) { + if (up) + netif_carrier_on(netdev); + else + netif_carrier_off(netdev); + } + phydev->adjust_link(netdev); +} + /** * phy_prepare_link - prepares the PHY layer to monitor link status * @phydev: target phy_device struct @@ -951,6 +964,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, goto error; } + phydev->phy_link_change = phy_link_change; phydev->attached_dev = dev; dev->phydev = phydev; diff --git a/include/linux/phy.h b/include/linux/phy.h index 8a280257778c..0a5e8e62c9e0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -474,6 +474,7 @@ struct phy_device { u8 mdix; u8 mdix_ctrl; + void (*phy_link_change)(struct phy_device *, bool up, bool do_carrier); void (*adjust_link)(struct net_device *dev); }; #define to_phy_device(d) container_of(to_mdio_device(d), \ -- cgit v1.2.3 From 9525ae83959b60c6061fe2f2caabdc8f69a48bc6 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 25 Jul 2017 15:03:13 +0100 Subject: phylink: add phylink infrastructure The link between the ethernet MAC and its PHY has become more complex as the interface evolves. This is especially true with serdes links, where part of the PHY is effectively integrated into the MAC. Serdes links can be connected to a variety of devices, including SFF modules soldered down onto the board with the MAC, an SFP cage with a hotpluggable SFP module which may contain a PHY or directly modulate the serdes signals onto optical media with or without a PHY, or even a classical PHY connection. Moreover, the negotiation information on serdes links comes in two varieties - SGMII mode, where the PHY provides its speed/duplex/flow control information to the MAC, and 1000base-X mode, where both ends exchange their abilities and each resolves the link capabilities. This means we need a more flexible way to support these arrangements, particularly with the hotpluggable nature of SFP, where the PHY can be attached or detached after the network device has been brought up. Ethtool information can come from multiple sources: - we may have a PHY operating in either SGMII or 1000base-X mode, in which case we take ethtool/mii data directly from the PHY. - we may have an optical SFP module without a PHY, with the MAC operating in 1000base-X mode - the ethtool/mii data needs to come from the MAC. - we may have a copper SFP module with a PHY which can't be accessed, which means we need to take ethtool/mii data from the MAC. Phylink aims to solve this by providing an intermediary between the MAC and PHY, providing a safe way for PHYs to be hotplugged, and allowing an SFP driver to reconfigure the serdes connection. Phylink also takes over support of fixed link connections, where the speed/duplex/flow control are fixed, but link status may be controlled by a GPIO signal. By avoiding the fixed-phy implementation, phylink can provide a faster response to link events: fixed-phy has to wait for phylib to operate its state machine, which can take several seconds. In comparison, phylink takes milliseconds.
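To make the shape of the new API concrete, the following is a minimal sketch of a MAC driver hooking into phylink (hypothetical code, not part of this patch; all my_* names are illustrative, and the ops match the phylink_mac_ops added below):

#include <linux/netdevice.h>
#include <linux/phylink.h>

static void my_validate(struct net_device *ndev, unsigned long *supported,
			struct phylink_link_state *state)
{
	/* clear bits the MAC cannot support from 'supported'/advertising */
}

static int my_mac_link_state(struct net_device *ndev,
			     struct phylink_link_state *state)
{
	/* fill in the current PCS/in-band state from hardware */
	state->link = 1;
	return 0;
}

static void my_mac_config(struct net_device *ndev, unsigned int mode,
			  const struct phylink_link_state *state)
{
	/* program speed/duplex/pause as appropriate for the MLO_AN_xxx mode */
}

static void my_mac_an_restart(struct net_device *ndev)
{
	/* kick 802.3z autonegotiation in the PCS */
}

static void my_mac_link_down(struct net_device *ndev, unsigned int mode)
{
	/* stop the MAC transmit/receive paths */
}

static void my_mac_link_up(struct net_device *ndev, unsigned int mode,
			   struct phy_device *phydev)
{
	/* enable the MAC transmit/receive paths */
}

static const struct phylink_mac_ops my_phylink_ops = {
	.validate	= my_validate,
	.mac_link_state	= my_mac_link_state,
	.mac_config	= my_mac_config,
	.mac_an_restart	= my_mac_an_restart,
	.mac_link_down	= my_mac_link_down,
	.mac_link_up	= my_mac_link_up,
};

A driver would then call phylink_create(ndev, np, phy_mode, &my_phylink_ops) at probe time, phylink_of_phy_connect() plus phylink_start() from its ndo_open handler, and phylink_stop() plus phylink_disconnect_phy() on the way down.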
Signed-off-by: Russell King - remove sync status - rework supported and advertisement handling - add 1000base-x speed for fixed links - use functionality exported from phy-core, reworking __phylink_ethtool_ksettings_set for it Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 10 + drivers/net/phy/Makefile | 1 + drivers/net/phy/phy_device.c | 1 + drivers/net/phy/phylink.c | 1169 ++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 2 + include/linux/phylink.h | 145 ++++++ 6 files changed, 1328 insertions(+) create mode 100644 drivers/net/phy/phylink.c create mode 100644 include/linux/phylink.h (limited to 'include') diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index a1d6fdba8980..a0a9e03e2f80 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -169,6 +169,16 @@ menuconfig PHYLIB devices. This option provides infrastructure for managing PHY devices. +config PHYLINK + tristate + depends on NETDEVICES + select PHYLIB + select SWPHY + help + PHYlink models the link between the PHY and MAC, allowing fixed + configuration links, PHYs, and Serdes links with MAC level + autonegotiation modes. + if PHYLIB config SWPHY diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 113e8d525c5e..c43e5b99fda4 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -18,6 +18,7 @@ endif libphy-$(CONFIG_SWPHY) += swphy.o libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_led_triggers.o +obj-$(CONFIG_PHYLINK) += phylink.o obj-$(CONFIG_PHYLIB) += libphy.o obj-$(CONFIG_MDIO_BCM_IPROC) += mdio-bcm-iproc.o diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index d536a9a7cd2b..9493fb369682 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1084,6 +1084,7 @@ void phy_detach(struct phy_device *phydev) phydev->attached_dev->phydev = NULL; phydev->attached_dev = NULL; phy_suspend(phydev); + phydev->phylink = NULL; phy_led_triggers_unregister(phydev); diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c new file mode 100644 index 000000000000..af61d7d400af --- /dev/null +++ b/drivers/net/phy/phylink.c @@ -0,0 +1,1169 @@ +/* + * phylink models the MAC to optional PHY connection, supporting + * technologies such as SFP cages where the PHY is hot-pluggable. + * + * Copyright (C) 2015 Russell King + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "swphy.h" + +#define SUPPORTED_INTERFACES \ + (SUPPORTED_TP | SUPPORTED_MII | SUPPORTED_FIBRE | \ + SUPPORTED_BNC | SUPPORTED_AUI | SUPPORTED_Backplane) +#define ADVERTISED_INTERFACES \ + (ADVERTISED_TP | ADVERTISED_MII | ADVERTISED_FIBRE | \ + ADVERTISED_BNC | ADVERTISED_AUI | ADVERTISED_Backplane) + +enum { + PHYLINK_DISABLE_STOPPED, +}; + +struct phylink { + struct net_device *netdev; + const struct phylink_mac_ops *ops; + + unsigned long phylink_disable_state; /* bitmask of disables */ + struct phy_device *phydev; + phy_interface_t link_interface; /* PHY_INTERFACE_xxx */ + u8 link_an_mode; /* MLO_AN_xxx */ + u8 link_port; /* The current non-phy ethtool port */ + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + + /* The link configuration settings */ + struct phylink_link_state link_config; + struct gpio_desc *link_gpio; + + struct mutex state_mutex; + struct phylink_link_state phy_state; + struct work_struct resolve; + + bool mac_link_dropped; +}; + +static inline void linkmode_zero(unsigned long *dst) +{ + bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static inline void linkmode_copy(unsigned long *dst, const unsigned long *src) +{ + bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static inline void linkmode_and(unsigned long *dst, const unsigned long *a, + const unsigned long *b) +{ + bitmap_and(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static inline void linkmode_or(unsigned long *dst, const unsigned long *a, + const unsigned long *b) +{ + bitmap_or(dst, a, b, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +static inline bool linkmode_empty(const unsigned long *src) +{ + return bitmap_empty(src, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + +void phylink_set_port_modes(unsigned long *mask) +{ + phylink_set(mask, TP); + phylink_set(mask, AUI); + phylink_set(mask, MII); + phylink_set(mask, FIBRE); + phylink_set(mask, BNC); + phylink_set(mask, Backplane); +} +EXPORT_SYMBOL_GPL(phylink_set_port_modes); + +static int phylink_is_empty_linkmode(const unsigned long *linkmode) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp) = { 0, }; + + phylink_set_port_modes(tmp); + phylink_set(tmp, Autoneg); + phylink_set(tmp, Pause); + phylink_set(tmp, Asym_Pause); + + bitmap_andnot(tmp, linkmode, tmp, __ETHTOOL_LINK_MODE_MASK_NBITS); + + return linkmode_empty(tmp); +} + +static const char *phylink_an_mode_str(unsigned int mode) +{ + static const char *modestr[] = { + [MLO_AN_PHY] = "phy", + [MLO_AN_FIXED] = "fixed", + [MLO_AN_SGMII] = "SGMII", + [MLO_AN_8023Z] = "802.3z", + }; + + return mode < ARRAY_SIZE(modestr) ? modestr[mode] : "unknown"; +} + +static int phylink_validate(struct phylink *pl, unsigned long *supported, + struct phylink_link_state *state) +{ + pl->ops->validate(pl->netdev, supported, state); + + return phylink_is_empty_linkmode(supported) ? 
-EINVAL : 0; +} + +static int phylink_parse_fixedlink(struct phylink *pl, struct device_node *np) +{ + struct device_node *fixed_node; + const struct phy_setting *s; + struct gpio_desc *desc; + const __be32 *fixed_prop; + u32 speed; + int ret, len; + + fixed_node = of_get_child_by_name(np, "fixed-link"); + if (fixed_node) { + ret = of_property_read_u32(fixed_node, "speed", &speed); + + pl->link_config.speed = speed; + pl->link_config.duplex = DUPLEX_HALF; + + if (of_property_read_bool(fixed_node, "full-duplex")) + pl->link_config.duplex = DUPLEX_FULL; + + /* We treat the "pause" and "asym-pause" terminology as + * defining the link partner's ability. */ + if (of_property_read_bool(fixed_node, "pause")) + pl->link_config.pause |= MLO_PAUSE_SYM; + if (of_property_read_bool(fixed_node, "asym-pause")) + pl->link_config.pause |= MLO_PAUSE_ASYM; + + if (ret == 0) { + desc = fwnode_get_named_gpiod(&fixed_node->fwnode, + "link-gpios", 0, + GPIOD_IN, "?"); + + if (!IS_ERR(desc)) + pl->link_gpio = desc; + else if (desc == ERR_PTR(-EPROBE_DEFER)) + ret = -EPROBE_DEFER; + } + of_node_put(fixed_node); + + if (ret) + return ret; + } else { + fixed_prop = of_get_property(np, "fixed-link", &len); + if (!fixed_prop) { + netdev_err(pl->netdev, "broken fixed-link?\n"); + return -EINVAL; + } + if (len == 5 * sizeof(*fixed_prop)) { + pl->link_config.duplex = be32_to_cpu(fixed_prop[1]) ? + DUPLEX_FULL : DUPLEX_HALF; + pl->link_config.speed = be32_to_cpu(fixed_prop[2]); + if (be32_to_cpu(fixed_prop[3])) + pl->link_config.pause |= MLO_PAUSE_SYM; + if (be32_to_cpu(fixed_prop[4])) + pl->link_config.pause |= MLO_PAUSE_ASYM; + } + } + + if (pl->link_config.speed > SPEED_1000 && + pl->link_config.duplex != DUPLEX_FULL) + netdev_warn(pl->netdev, "fixed link specifies half duplex for %dMbps link?\n", + pl->link_config.speed); + + bitmap_fill(pl->supported, __ETHTOOL_LINK_MODE_MASK_NBITS); + linkmode_copy(pl->link_config.advertising, pl->supported); + phylink_validate(pl, pl->supported, &pl->link_config); + + s = phy_lookup_setting(pl->link_config.speed, pl->link_config.duplex, + pl->supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, true); + linkmode_zero(pl->supported); + phylink_set(pl->supported, MII); + if (s) { + __set_bit(s->bit, pl->supported); + } else { + netdev_warn(pl->netdev, "fixed link %s duplex %dMbps not recognised\n", + pl->link_config.duplex == DUPLEX_FULL ? 
"full" : "half", + pl->link_config.speed); + } + + linkmode_and(pl->link_config.advertising, pl->link_config.advertising, + pl->supported); + + pl->link_config.link = 1; + pl->link_config.an_complete = 1; + + return 0; +} + +static int phylink_parse_mode(struct phylink *pl, struct device_node *np) +{ + struct device_node *dn; + const char *managed; + + dn = of_get_child_by_name(np, "fixed-link"); + if (dn || of_find_property(np, "fixed-link", NULL)) + pl->link_an_mode = MLO_AN_FIXED; + of_node_put(dn); + + if (of_property_read_string(np, "managed", &managed) == 0 && + strcmp(managed, "in-band-status") == 0) { + if (pl->link_an_mode == MLO_AN_FIXED) { + netdev_err(pl->netdev, + "can't use both fixed-link and in-band-status\n"); + return -EINVAL; + } + + linkmode_zero(pl->supported); + phylink_set(pl->supported, MII); + phylink_set(pl->supported, Autoneg); + phylink_set(pl->supported, Asym_Pause); + phylink_set(pl->supported, Pause); + pl->link_config.an_enabled = true; + + switch (pl->link_config.interface) { + case PHY_INTERFACE_MODE_SGMII: + phylink_set(pl->supported, 10baseT_Half); + phylink_set(pl->supported, 10baseT_Full); + phylink_set(pl->supported, 100baseT_Half); + phylink_set(pl->supported, 100baseT_Full); + phylink_set(pl->supported, 1000baseT_Half); + phylink_set(pl->supported, 1000baseT_Full); + pl->link_an_mode = MLO_AN_SGMII; + break; + + case PHY_INTERFACE_MODE_1000BASEX: + phylink_set(pl->supported, 1000baseX_Full); + pl->link_an_mode = MLO_AN_8023Z; + break; + + case PHY_INTERFACE_MODE_2500BASEX: + phylink_set(pl->supported, 2500baseX_Full); + pl->link_an_mode = MLO_AN_8023Z; + break; + + default: + netdev_err(pl->netdev, + "incorrect link mode %s for in-band status\n", + phy_modes(pl->link_config.interface)); + return -EINVAL; + } + + linkmode_copy(pl->link_config.advertising, pl->supported); + + if (phylink_validate(pl, pl->supported, &pl->link_config)) { + netdev_err(pl->netdev, + "failed to validate link configuration for in-band status\n"); + return -EINVAL; + } + } + + return 0; +} + +static void phylink_mac_config(struct phylink *pl, + const struct phylink_link_state *state) +{ + netdev_dbg(pl->netdev, + "%s: mode=%s/%s/%s/%s adv=%*pb pause=%02x link=%u an=%u\n", + __func__, phylink_an_mode_str(pl->link_an_mode), + phy_modes(state->interface), + phy_speed_to_str(state->speed), + phy_duplex_to_str(state->duplex), + __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising, + state->pause, state->link, state->an_enabled); + + pl->ops->mac_config(pl->netdev, pl->link_an_mode, state); +} + +static void phylink_mac_an_restart(struct phylink *pl) +{ + if (pl->link_config.an_enabled && + (pl->link_config.interface == PHY_INTERFACE_MODE_1000BASEX || + pl->link_config.interface == PHY_INTERFACE_MODE_2500BASEX)) + pl->ops->mac_an_restart(pl->netdev); +} + +static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state *state) +{ + struct net_device *ndev = pl->netdev; + + linkmode_copy(state->advertising, pl->link_config.advertising); + linkmode_zero(state->lp_advertising); + state->interface = pl->link_config.interface; + state->an_enabled = pl->link_config.an_enabled; + state->link = 1; + + return pl->ops->mac_link_state(ndev, state); +} + +/* The fixed state is... fixed except for the link state, + * which may be determined by a GPIO. 
+ */ +static void phylink_get_fixed_state(struct phylink *pl, struct phylink_link_state *state) +{ + *state = pl->link_config; + if (pl->link_gpio) + state->link = !!gpiod_get_value(pl->link_gpio); +} + +/* Flow control is resolved according to our and the link partners + * advertisments using the following drawn from the 802.3 specs: + * Local device Link partner + * Pause AsymDir Pause AsymDir Result + * 1 X 1 X TX+RX + * 0 1 1 1 RX + * 1 1 0 1 TX + */ +static void phylink_resolve_flow(struct phylink *pl, + struct phylink_link_state *state) +{ + int new_pause = 0; + + if (pl->link_config.pause & MLO_PAUSE_AN) { + int pause = 0; + + if (phylink_test(pl->link_config.advertising, Pause)) + pause |= MLO_PAUSE_SYM; + if (phylink_test(pl->link_config.advertising, Asym_Pause)) + pause |= MLO_PAUSE_ASYM; + + pause &= state->pause; + + if (pause & MLO_PAUSE_SYM) + new_pause = MLO_PAUSE_TX | MLO_PAUSE_RX; + else if (pause & MLO_PAUSE_ASYM) + new_pause = state->pause & MLO_PAUSE_SYM ? + MLO_PAUSE_RX : MLO_PAUSE_TX; + } else { + new_pause = pl->link_config.pause & MLO_PAUSE_TXRX_MASK; + } + + state->pause &= ~MLO_PAUSE_TXRX_MASK; + state->pause |= new_pause; +} + +static const char *phylink_pause_to_str(int pause) +{ + switch (pause & MLO_PAUSE_TXRX_MASK) { + case MLO_PAUSE_TX | MLO_PAUSE_RX: + return "rx/tx"; + case MLO_PAUSE_TX: + return "tx"; + case MLO_PAUSE_RX: + return "rx"; + default: + return "off"; + } +} + +static void phylink_resolve(struct work_struct *w) +{ + struct phylink *pl = container_of(w, struct phylink, resolve); + struct phylink_link_state link_state; + struct net_device *ndev = pl->netdev; + + mutex_lock(&pl->state_mutex); + if (pl->phylink_disable_state) { + pl->mac_link_dropped = false; + link_state.link = false; + } else if (pl->mac_link_dropped) { + link_state.link = false; + } else { + switch (pl->link_an_mode) { + case MLO_AN_PHY: + link_state = pl->phy_state; + phylink_resolve_flow(pl, &link_state); + phylink_mac_config(pl, &link_state); + break; + + case MLO_AN_FIXED: + phylink_get_fixed_state(pl, &link_state); + phylink_mac_config(pl, &link_state); + break; + + case MLO_AN_SGMII: + phylink_get_mac_state(pl, &link_state); + if (pl->phydev) { + bool changed = false; + + link_state.link = link_state.link && + pl->phy_state.link; + + if (pl->phy_state.interface != + link_state.interface) { + link_state.interface = pl->phy_state.interface; + changed = true; + } + + /* Propagate the flow control from the PHY + * to the MAC. Also propagate the interface + * if changed. 
+ */ + if (pl->phy_state.link || changed) { + link_state.pause |= pl->phy_state.pause; + phylink_resolve_flow(pl, &link_state); + + phylink_mac_config(pl, &link_state); + } + } + break; + + case MLO_AN_8023Z: + phylink_get_mac_state(pl, &link_state); + break; + } + } + + if (link_state.link != netif_carrier_ok(ndev)) { + if (!link_state.link) { + netif_carrier_off(ndev); + pl->ops->mac_link_down(ndev, pl->link_an_mode); + netdev_info(ndev, "Link is Down\n"); + } else { + pl->ops->mac_link_up(ndev, pl->link_an_mode, + pl->phydev); + + netif_carrier_on(ndev); + + netdev_info(ndev, + "Link is Up - %s/%s - flow control %s\n", + phy_speed_to_str(link_state.speed), + phy_duplex_to_str(link_state.duplex), + phylink_pause_to_str(link_state.pause)); + } + } + if (!link_state.link && pl->mac_link_dropped) { + pl->mac_link_dropped = false; + queue_work(system_power_efficient_wq, &pl->resolve); + } + mutex_unlock(&pl->state_mutex); +} + +static void phylink_run_resolve(struct phylink *pl) +{ + if (!pl->phylink_disable_state) + queue_work(system_power_efficient_wq, &pl->resolve); +} + +struct phylink *phylink_create(struct net_device *ndev, struct device_node *np, + phy_interface_t iface, const struct phylink_mac_ops *ops) +{ + struct phylink *pl; + int ret; + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return ERR_PTR(-ENOMEM); + + mutex_init(&pl->state_mutex); + INIT_WORK(&pl->resolve, phylink_resolve); + pl->netdev = ndev; + pl->phy_state.interface = iface; + pl->link_interface = iface; + pl->link_port = PORT_MII; + pl->link_config.interface = iface; + pl->link_config.pause = MLO_PAUSE_AN; + pl->link_config.speed = SPEED_UNKNOWN; + pl->link_config.duplex = DUPLEX_UNKNOWN; + pl->ops = ops; + __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); + + bitmap_fill(pl->supported, __ETHTOOL_LINK_MODE_MASK_NBITS); + linkmode_copy(pl->link_config.advertising, pl->supported); + phylink_validate(pl, pl->supported, &pl->link_config); + + ret = phylink_parse_mode(pl, np); + if (ret < 0) { + kfree(pl); + return ERR_PTR(ret); + } + + if (pl->link_an_mode == MLO_AN_FIXED) { + ret = phylink_parse_fixedlink(pl, np); + if (ret < 0) { + kfree(pl); + return ERR_PTR(ret); + } + } + + return pl; +} +EXPORT_SYMBOL_GPL(phylink_create); + +void phylink_destroy(struct phylink *pl) +{ + cancel_work_sync(&pl->resolve); + kfree(pl); +} +EXPORT_SYMBOL_GPL(phylink_destroy); + +void phylink_phy_change(struct phy_device *phydev, bool up, bool do_carrier) +{ + struct phylink *pl = phydev->phylink; + + mutex_lock(&pl->state_mutex); + pl->phy_state.speed = phydev->speed; + pl->phy_state.duplex = phydev->duplex; + pl->phy_state.pause = MLO_PAUSE_NONE; + if (phydev->pause) + pl->phy_state.pause |= MLO_PAUSE_SYM; + if (phydev->asym_pause) + pl->phy_state.pause |= MLO_PAUSE_ASYM; + pl->phy_state.interface = phydev->interface; + pl->phy_state.link = up; + mutex_unlock(&pl->state_mutex); + + phylink_run_resolve(pl); + + netdev_dbg(pl->netdev, "phy link %s %s/%s/%s\n", up ? 
"up" : "down", + phy_modes(phydev->interface), + phy_speed_to_str(phydev->speed), + phy_duplex_to_str(phydev->duplex)); +} + +static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy) +{ + struct phylink_link_state config; + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + u32 advertising; + int ret; + + memset(&config, 0, sizeof(config)); + ethtool_convert_legacy_u32_to_link_mode(supported, phy->supported); + ethtool_convert_legacy_u32_to_link_mode(config.advertising, + phy->advertising); + config.interface = pl->link_config.interface; + + /* + * This is the new way of dealing with flow control for PHYs, + * as described by Timur Tabi in commit 529ed1275263 ("net: phy: + * phy drivers should not set SUPPORTED_[Asym_]Pause") except + * using our validate call to the MAC, we rely upon the MAC + * clearing the bits from both supported and advertising fields. + */ + if (phylink_test(supported, Pause)) + phylink_set(config.advertising, Pause); + if (phylink_test(supported, Asym_Pause)) + phylink_set(config.advertising, Asym_Pause); + + ret = phylink_validate(pl, supported, &config); + if (ret) + return ret; + + phy->phylink = pl; + phy->phy_link_change = phylink_phy_change; + + netdev_info(pl->netdev, + "PHY [%s] driver [%s]\n", dev_name(&phy->mdio.dev), + phy->drv->name); + + mutex_lock(&phy->lock); + mutex_lock(&pl->state_mutex); + pl->netdev->phydev = phy; + pl->phydev = phy; + linkmode_copy(pl->supported, supported); + linkmode_copy(pl->link_config.advertising, config.advertising); + + /* Restrict the phy advertisment according to the MAC support. */ + ethtool_convert_link_mode_to_legacy_u32(&advertising, config.advertising); + phy->advertising = advertising; + mutex_unlock(&pl->state_mutex); + mutex_unlock(&phy->lock); + + netdev_dbg(pl->netdev, + "phy: setting supported %*pb advertising 0x%08x\n", + __ETHTOOL_LINK_MODE_MASK_NBITS, pl->supported, + phy->advertising); + + phy_start_machine(phy); + if (phy->irq > 0) + phy_start_interrupts(phy); + + return 0; +} + +int phylink_connect_phy(struct phylink *pl, struct phy_device *phy) +{ + int ret; + + ret = phy_attach_direct(pl->netdev, phy, 0, pl->link_interface); + if (ret) + return ret; + + ret = phylink_bringup_phy(pl, phy); + if (ret) + phy_detach(phy); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_connect_phy); + +int phylink_of_phy_connect(struct phylink *pl, struct device_node *dn) +{ + struct device_node *phy_node; + struct phy_device *phy_dev; + int ret; + + /* Fixed links are handled without needing a PHY */ + if (pl->link_an_mode == MLO_AN_FIXED) + return 0; + + phy_node = of_parse_phandle(dn, "phy-handle", 0); + if (!phy_node) + phy_node = of_parse_phandle(dn, "phy", 0); + if (!phy_node) + phy_node = of_parse_phandle(dn, "phy-device", 0); + + if (!phy_node) { + if (pl->link_an_mode == MLO_AN_PHY) { + netdev_err(pl->netdev, "unable to find PHY node\n"); + return -ENODEV; + } + return 0; + } + + phy_dev = of_phy_attach(pl->netdev, phy_node, 0, pl->link_interface); + /* We're done with the phy_node handle */ + of_node_put(phy_node); + + if (!phy_dev) + return -ENODEV; + + ret = phylink_bringup_phy(pl, phy_dev); + if (ret) + phy_detach(phy_dev); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_of_phy_connect); + +void phylink_disconnect_phy(struct phylink *pl) +{ + struct phy_device *phy; + + WARN_ON(!lockdep_rtnl_is_held()); + + phy = pl->phydev; + if (phy) { + mutex_lock(&phy->lock); + mutex_lock(&pl->state_mutex); + pl->netdev->phydev = NULL; + pl->phydev = NULL; + mutex_unlock(&pl->state_mutex); + 
mutex_unlock(&phy->lock); + flush_work(&pl->resolve); + + phy_disconnect(phy); + } +} +EXPORT_SYMBOL_GPL(phylink_disconnect_phy); + +void phylink_mac_change(struct phylink *pl, bool up) +{ + if (!up) + pl->mac_link_dropped = true; + phylink_run_resolve(pl); + netdev_dbg(pl->netdev, "mac link %s\n", up ? "up" : "down"); +} +EXPORT_SYMBOL_GPL(phylink_mac_change); + +void phylink_start(struct phylink *pl) +{ + WARN_ON(!lockdep_rtnl_is_held()); + + netdev_info(pl->netdev, "configuring for %s/%s link mode\n", + phylink_an_mode_str(pl->link_an_mode), + phy_modes(pl->link_config.interface)); + + /* Apply the link configuration to the MAC when starting. This allows + * a fixed-link to start with the correct parameters, and also + * ensures that we set the appropriate advertisment for Serdes links. + */ + phylink_resolve_flow(pl, &pl->link_config); + phylink_mac_config(pl, &pl->link_config); + + clear_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); + phylink_run_resolve(pl); + + if (pl->phydev) + phy_start(pl->phydev); +} +EXPORT_SYMBOL_GPL(phylink_start); + +void phylink_stop(struct phylink *pl) +{ + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + phy_stop(pl->phydev); + + set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); + flush_work(&pl->resolve); +} +EXPORT_SYMBOL_GPL(phylink_stop); + +void phylink_ethtool_get_wol(struct phylink *pl, struct ethtool_wolinfo *wol) +{ + WARN_ON(!lockdep_rtnl_is_held()); + + wol->supported = 0; + wol->wolopts = 0; + + if (pl->phydev) + phy_ethtool_get_wol(pl->phydev, wol); +} +EXPORT_SYMBOL_GPL(phylink_ethtool_get_wol); + +int phylink_ethtool_set_wol(struct phylink *pl, struct ethtool_wolinfo *wol) +{ + int ret = -EOPNOTSUPP; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + ret = phy_ethtool_set_wol(pl->phydev, wol); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_set_wol); + +static void phylink_merge_link_mode(unsigned long *dst, const unsigned long *b) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(mask); + + linkmode_zero(mask); + phylink_set_port_modes(mask); + + linkmode_and(dst, dst, mask); + linkmode_or(dst, dst, b); +} + +static void phylink_get_ksettings(const struct phylink_link_state *state, + struct ethtool_link_ksettings *kset) +{ + phylink_merge_link_mode(kset->link_modes.advertising, state->advertising); + linkmode_copy(kset->link_modes.lp_advertising, state->lp_advertising); + kset->base.speed = state->speed; + kset->base.duplex = state->duplex; + kset->base.autoneg = state->an_enabled ? AUTONEG_ENABLE : + AUTONEG_DISABLE; +} + +int phylink_ethtool_ksettings_get(struct phylink *pl, + struct ethtool_link_ksettings *kset) +{ + struct phylink_link_state link_state; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) { + phy_ethtool_ksettings_get(pl->phydev, kset); + } else { + kset->base.port = pl->link_port; + } + + linkmode_copy(kset->link_modes.supported, pl->supported); + + switch (pl->link_an_mode) { + case MLO_AN_FIXED: + /* We are using fixed settings. Report these as the + * current link settings - and note that these also + * represent the supported speeds/duplex/pause modes. + */ + phylink_get_fixed_state(pl, &link_state); + phylink_get_ksettings(&link_state, kset); + break; + + case MLO_AN_SGMII: + /* If there is a phy attached, then use the reported + * settings from the phy with no modification. + */ + if (pl->phydev) + break; + + case MLO_AN_8023Z: + phylink_get_mac_state(pl, &link_state); + + /* The MAC is reporting the link results from its own PCS + * layer via in-band status. 
Report these as the current + * link settings. + */ + phylink_get_ksettings(&link_state, kset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_ksettings_get); + +int phylink_ethtool_ksettings_set(struct phylink *pl, + const struct ethtool_link_ksettings *kset) +{ + struct ethtool_link_ksettings our_kset; + struct phylink_link_state config; + int ret; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (kset->base.autoneg != AUTONEG_DISABLE && + kset->base.autoneg != AUTONEG_ENABLE) + return -EINVAL; + + config = pl->link_config; + + /* Mask out unsupported advertisments */ + linkmode_and(config.advertising, kset->link_modes.advertising, + pl->supported); + + /* FIXME: should we reject autoneg if phy/mac does not support it? */ + if (kset->base.autoneg == AUTONEG_DISABLE) { + const struct phy_setting *s; + + /* Autonegotiation disabled, select a suitable speed and + * duplex. + */ + s = phy_lookup_setting(kset->base.speed, kset->base.duplex, + pl->supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, false); + if (!s) + return -EINVAL; + + /* If we have a fixed link (as specified by firmware), refuse + * to change link parameters. + */ + if (pl->link_an_mode == MLO_AN_FIXED && + (s->speed != pl->link_config.speed || + s->duplex != pl->link_config.duplex)) + return -EINVAL; + + config.speed = s->speed; + config.duplex = s->duplex; + config.an_enabled = false; + + __clear_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, config.advertising); + } else { + /* If we have a fixed link, refuse to enable autonegotiation */ + if (pl->link_an_mode == MLO_AN_FIXED) + return -EINVAL; + + config.speed = SPEED_UNKNOWN; + config.duplex = DUPLEX_UNKNOWN; + config.an_enabled = true; + + __set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, config.advertising); + } + + if (phylink_validate(pl, pl->supported, &config)) + return -EINVAL; + + /* If autonegotiation is enabled, we must have an advertisment */ + if (config.an_enabled && phylink_is_empty_linkmode(config.advertising)) + return -EINVAL; + + our_kset = *kset; + linkmode_copy(our_kset.link_modes.advertising, config.advertising); + our_kset.base.speed = config.speed; + our_kset.base.duplex = config.duplex; + + /* If we have a PHY, configure the phy */ + if (pl->phydev) { + ret = phy_ethtool_ksettings_set(pl->phydev, &our_kset); + if (ret) + return ret; + } + + mutex_lock(&pl->state_mutex); + /* Configure the MAC to match the new settings */ + linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising); + pl->link_config.speed = our_kset.base.speed; + pl->link_config.duplex = our_kset.base.duplex; + pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE; + + if (!test_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state)) { + phylink_mac_config(pl, &pl->link_config); + phylink_mac_an_restart(pl); + } + mutex_unlock(&pl->state_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_ksettings_set); + +int phylink_ethtool_nway_reset(struct phylink *pl) +{ + int ret = 0; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + ret = phy_restart_aneg(pl->phydev); + phylink_mac_an_restart(pl); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_nway_reset); + +void phylink_ethtool_get_pauseparam(struct phylink *pl, + struct ethtool_pauseparam *pause) +{ + WARN_ON(!lockdep_rtnl_is_held()); + + pause->autoneg = !!(pl->link_config.pause & MLO_PAUSE_AN); + pause->rx_pause = !!(pl->link_config.pause & MLO_PAUSE_RX); + pause->tx_pause = !!(pl->link_config.pause & MLO_PAUSE_TX); +} +EXPORT_SYMBOL_GPL(phylink_ethtool_get_pauseparam); + +int 
phylink_ethtool_set_pauseparam(struct phylink *pl, + struct ethtool_pauseparam *pause) +{ + struct phylink_link_state *config = &pl->link_config; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (!phylink_test(pl->supported, Pause) && + !phylink_test(pl->supported, Asym_Pause)) + return -EOPNOTSUPP; + + if (!phylink_test(pl->supported, Asym_Pause) && + !pause->autoneg && pause->rx_pause != pause->tx_pause) + return -EINVAL; + + config->pause &= ~(MLO_PAUSE_AN | MLO_PAUSE_TXRX_MASK); + + if (pause->autoneg) + config->pause |= MLO_PAUSE_AN; + if (pause->rx_pause) + config->pause |= MLO_PAUSE_RX; + if (pause->tx_pause) + config->pause |= MLO_PAUSE_TX; + + if (!test_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state)) { + switch (pl->link_an_mode) { + case MLO_AN_PHY: + /* Silently mark the carrier down, and then trigger a resolve */ + netif_carrier_off(pl->netdev); + phylink_run_resolve(pl); + break; + + case MLO_AN_FIXED: + /* Should we allow fixed links to change against the config? */ + phylink_resolve_flow(pl, config); + phylink_mac_config(pl, config); + break; + + case MLO_AN_SGMII: + case MLO_AN_8023Z: + phylink_mac_config(pl, config); + phylink_mac_an_restart(pl); + break; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_set_pauseparam); + +int phylink_init_eee(struct phylink *pl, bool clk_stop_enable) +{ + int ret = -EPROTONOSUPPORT; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + ret = phy_init_eee(pl->phydev, clk_stop_enable); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_init_eee); + +int phylink_get_eee_err(struct phylink *pl) +{ + int ret = 0; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + ret = phy_get_eee_err(pl->phydev); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_get_eee_err); + +int phylink_ethtool_get_eee(struct phylink *pl, struct ethtool_eee *eee) +{ + int ret = -EOPNOTSUPP; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + ret = phy_ethtool_get_eee(pl->phydev, eee); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_get_eee); + +int phylink_ethtool_set_eee(struct phylink *pl, struct ethtool_eee *eee) +{ + int ret = -EOPNOTSUPP; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->phydev) + ret = phy_ethtool_set_eee(pl->phydev, eee); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_set_eee); + +/* This emulates MII registers for a fixed-mode phy operating as per the + * passed in state. "aneg" defines if we report negotiation is possible. + * + * FIXME: should deal with negotiation state too. 
+ */ +static int phylink_mii_emul_read(struct net_device *ndev, unsigned int reg, + struct phylink_link_state *state, bool aneg) +{ + struct fixed_phy_status fs; + int val; + + fs.link = state->link; + fs.speed = state->speed; + fs.duplex = state->duplex; + fs.pause = state->pause & MLO_PAUSE_SYM; + fs.asym_pause = state->pause & MLO_PAUSE_ASYM; + + val = swphy_read_reg(reg, &fs); + if (reg == MII_BMSR) { + if (!state->an_complete) + val &= ~BMSR_ANEGCOMPLETE; + if (!aneg) + val &= ~BMSR_ANEGCAPABLE; + } + return val; +} + +static int phylink_mii_read(struct phylink *pl, unsigned int phy_id, + unsigned int reg) +{ + struct phylink_link_state state; + int val = 0xffff; + + /* PHYs only exist for MLO_AN_PHY and MLO_AN_SGMII */ + if (pl->phydev) + return mdiobus_read(pl->phydev->mdio.bus, phy_id, reg); + + switch (pl->link_an_mode) { + case MLO_AN_FIXED: + if (phy_id == 0) { + phylink_get_fixed_state(pl, &state); + val = phylink_mii_emul_read(pl->netdev, reg, &state, + true); + } + break; + + case MLO_AN_PHY: + return -EOPNOTSUPP; + + case MLO_AN_SGMII: + /* No phy, fall through to 8023z method */ + case MLO_AN_8023Z: + if (phy_id == 0) { + val = phylink_get_mac_state(pl, &state); + if (val < 0) + return val; + + val = phylink_mii_emul_read(pl->netdev, reg, &state, + true); + } + break; + } + + return val & 0xffff; +} + +static int phylink_mii_write(struct phylink *pl, unsigned int phy_id, + unsigned int reg, unsigned int val) +{ + /* PHYs only exist for MLO_AN_PHY and MLO_AN_SGMII */ + if (pl->phydev) { + mdiobus_write(pl->phydev->mdio.bus, phy_id, reg, val); + return 0; + } + + switch (pl->link_an_mode) { + case MLO_AN_FIXED: + break; + + case MLO_AN_PHY: + return -EOPNOTSUPP; + + case MLO_AN_SGMII: + /* No phy, fall through to 8023z method */ + case MLO_AN_8023Z: + break; + } + + return 0; +} + +int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd) +{ + struct mii_ioctl_data *mii_data = if_mii(ifr); + int val, ret; + + WARN_ON(!lockdep_rtnl_is_held()); + + switch (cmd) { + case SIOCGMIIPHY: + mii_data->phy_id = pl->phydev ? 
pl->phydev->mdio.addr : 0; + /* fallthrough */ + + case SIOCGMIIREG: + val = phylink_mii_read(pl, mii_data->phy_id, mii_data->reg_num); + if (val < 0) { + ret = val; + } else { + mii_data->val_out = val; + ret = 0; + } + break; + + case SIOCSMIIREG: + ret = phylink_mii_write(pl, mii_data->phy_id, mii_data->reg_num, + mii_data->val_in); + break; + + default: + ret = -EOPNOTSUPP; + if (pl->phydev) + ret = phy_mii_ioctl(pl->phydev, ifr, cmd); + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_mii_ioctl); + +MODULE_LICENSE("GPL"); diff --git a/include/linux/phy.h b/include/linux/phy.h index 0a5e8e62c9e0..d78cd01ea513 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -182,6 +182,7 @@ static inline const char *phy_modes(phy_interface_t interface) #define MII_ADDR_C45 (1<<30) struct device; +struct phylink; struct sk_buff; /* @@ -469,6 +470,7 @@ struct phy_device { struct mutex lock; + struct phylink *phylink; struct net_device *attached_dev; u8 mdix; diff --git a/include/linux/phylink.h b/include/linux/phylink.h new file mode 100644 index 000000000000..76f054f39684 --- /dev/null +++ b/include/linux/phylink.h @@ -0,0 +1,145 @@ +#ifndef NETDEV_PCS_H +#define NETDEV_PCS_H + +#include +#include +#include + +struct device_node; +struct ethtool_cmd; +struct net_device; + +enum { + MLO_PAUSE_NONE, + MLO_PAUSE_ASYM = BIT(0), + MLO_PAUSE_SYM = BIT(1), + MLO_PAUSE_RX = BIT(2), + MLO_PAUSE_TX = BIT(3), + MLO_PAUSE_TXRX_MASK = MLO_PAUSE_TX | MLO_PAUSE_RX, + MLO_PAUSE_AN = BIT(4), + + MLO_AN_PHY = 0, /* Conventional PHY */ + MLO_AN_FIXED, /* Fixed-link mode */ + MLO_AN_SGMII, /* Cisco SGMII protocol */ + MLO_AN_8023Z, /* 1000base-X protocol */ +}; + +static inline bool phylink_autoneg_inband(unsigned int mode) +{ + return mode == MLO_AN_SGMII || mode == MLO_AN_8023Z; +} + +struct phylink_link_state { + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); + __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); + phy_interface_t interface; /* PHY_INTERFACE_xxx */ + int speed; + int duplex; + int pause; + unsigned int link:1; + unsigned int an_enabled:1; + unsigned int an_complete:1; +}; + +struct phylink_mac_ops { + /** + * validate: validate and update the link configuration + * @ndev: net_device structure associated with MAC + * @config: configuration to validate + * + * Update the %config->supported and %config->advertised masks + * clearing bits that can not be supported. + * + * Note: the PHY may be able to transform from one connection + * technology to another, so, eg, don't clear 1000BaseX just + * because the MAC is unable to support it. This is more about + * clearing unsupported speeds and duplex settings. + * + * If the %config->interface mode is %PHY_INTERFACE_MODE_1000BASEX + * or %PHY_INTERFACE_MODE_2500BASEX, select the appropriate mode + * based on %config->advertised and/or %config->speed. + */ + void (*validate)(struct net_device *ndev, unsigned long *supported, + struct phylink_link_state *state); + + /* Read the current link state from the hardware */ + int (*mac_link_state)(struct net_device *, struct phylink_link_state *); + + /* Configure the MAC */ + /** + * mac_config: configure the MAC for the selected mode and state + * @ndev: net_device structure for the MAC + * @mode: one of MLO_AN_FIXED, MLO_AN_PHY, MLO_AN_8023Z, MLO_AN_SGMII + * @state: state structure + * + * The action performed depends on the currently selected mode: + * + * %MLO_AN_FIXED, %MLO_AN_PHY: + * set the specified speed, duplex, pause mode, and phy interface + * mode in the provided @state. 
+ * %MLO_AN_8023Z: + * place the link in 1000base-X mode, advertising the parameters + * given in advertising in @state. + * %MLO_AN_SGMII: + * place the link in Cisco SGMII mode - there is no advertisement + * to make as the PHY communicates the speed and duplex to the + * MAC over the in-band control word. Configuration of the pause + * mode is as per MLO_AN_PHY since this is not included. + */ + void (*mac_config)(struct net_device *ndev, unsigned int mode, + const struct phylink_link_state *state); + + /** + * mac_an_restart: restart 802.3z BaseX autonegotiation + * @ndev: net_device structure for the MAC + */ + void (*mac_an_restart)(struct net_device *ndev); + + void (*mac_link_down)(struct net_device *, unsigned int mode); + void (*mac_link_up)(struct net_device *, unsigned int mode, + struct phy_device *); +}; + +struct phylink *phylink_create(struct net_device *, struct device_node *, + phy_interface_t iface, const struct phylink_mac_ops *ops); +void phylink_destroy(struct phylink *); + +int phylink_connect_phy(struct phylink *, struct phy_device *); +int phylink_of_phy_connect(struct phylink *, struct device_node *); +void phylink_disconnect_phy(struct phylink *); + +void phylink_mac_change(struct phylink *, bool up); + +void phylink_start(struct phylink *); +void phylink_stop(struct phylink *); + +void phylink_ethtool_get_wol(struct phylink *, struct ethtool_wolinfo *); +int phylink_ethtool_set_wol(struct phylink *, struct ethtool_wolinfo *); + +int phylink_ethtool_ksettings_get(struct phylink *, + struct ethtool_link_ksettings *); +int phylink_ethtool_ksettings_set(struct phylink *, + const struct ethtool_link_ksettings *); +int phylink_ethtool_nway_reset(struct phylink *); +void phylink_ethtool_get_pauseparam(struct phylink *, + struct ethtool_pauseparam *); +int phylink_ethtool_set_pauseparam(struct phylink *, + struct ethtool_pauseparam *); +int phylink_init_eee(struct phylink *, bool); +int phylink_get_eee_err(struct phylink *); +int phylink_ethtool_get_eee(struct phylink *, struct ethtool_eee *); +int phylink_ethtool_set_eee(struct phylink *, struct ethtool_eee *); +int phylink_mii_ioctl(struct phylink *, struct ifreq *, int); + +#define phylink_zero(bm) \ + bitmap_zero(bm, __ETHTOOL_LINK_MODE_MASK_NBITS) +#define __phylink_do_bit(op, bm, mode) \ + op(ETHTOOL_LINK_MODE_ ## mode ## _BIT, bm) + +#define phylink_set(bm, mode) __phylink_do_bit(__set_bit, bm, mode) +#define phylink_clear(bm, mode) __phylink_do_bit(__clear_bit, bm, mode) +#define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) + +void phylink_set_port_modes(unsigned long *bits); + +#endif -- cgit v1.2.3 From ce0aa27ff3f68ed4ea1631d33797e573b3508bfa Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 25 Jul 2017 15:03:18 +0100 Subject: sfp: add sfp-bus to bridge between network devices and sfp cages Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- drivers/net/phy/Makefile | 3 + drivers/net/phy/phylink.c | 157 +++++++++++++++ drivers/net/phy/sfp-bus.c | 475 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/sfp.h | 28 +++ include/linux/sfp.h | 434 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1097 insertions(+) create mode 100644 drivers/net/phy/sfp-bus.c create mode 100644 drivers/net/phy/sfp.h create mode 100644 include/linux/sfp.h (limited to 'include') diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index c43e5b99fda4..4c16a10f420e 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -38,6 +38,9 @@ obj-$(CONFIG_MDIO_SUN4I) += mdio-sun4i.o obj-$(CONFIG_MDIO_THUNDER) += mdio-thunder.o obj-$(CONFIG_MDIO_XGENE) += mdio-xgene.o +sfp-obj-$(CONFIG_SFP) += sfp-bus.o +obj-y += $(sfp-obj-y) $(sfp-obj-m) + obj-$(CONFIG_AMD_PHY) += amd.o obj-$(CONFIG_AQUANTIA_PHY) += aquantia.o obj-$(CONFIG_AT803X_PHY) += at803x.o diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index af61d7d400af..02082f4a8a95 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -21,6 +21,7 @@ #include #include +#include "sfp.h" #include "swphy.h" #define SUPPORTED_INTERFACES \ @@ -32,6 +33,7 @@ enum { PHYLINK_DISABLE_STOPPED, + PHYLINK_DISABLE_LINK, }; struct phylink { @@ -54,6 +56,8 @@ struct phylink { struct work_struct resolve; bool mac_link_dropped; + + struct sfp_bus *sfp_bus; }; static inline void linkmode_zero(unsigned long *dst) @@ -466,6 +470,24 @@ static void phylink_run_resolve(struct phylink *pl) queue_work(system_power_efficient_wq, &pl->resolve); } +static const struct sfp_upstream_ops sfp_phylink_ops; + +static int phylink_register_sfp(struct phylink *pl, struct device_node *np) +{ + struct device_node *sfp_np; + + sfp_np = of_parse_phandle(np, "sfp", 0); + if (!sfp_np) + return 0; + + pl->sfp_bus = sfp_register_upstream(sfp_np, pl->netdev, pl, + &sfp_phylink_ops); + if (!pl->sfp_bus) + return -ENOMEM; + + return 0; +} + struct phylink *phylink_create(struct net_device *ndev, struct device_node *np, phy_interface_t iface, const struct phylink_mac_ops *ops) { @@ -507,12 +529,21 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np, } } + ret = phylink_register_sfp(pl, np); + if (ret < 0) { + kfree(pl); + return ERR_PTR(ret); + } + return pl; } EXPORT_SYMBOL_GPL(phylink_create); void phylink_destroy(struct phylink *pl) { + if (pl->sfp_bus) + sfp_unregister_upstream(pl->sfp_bus); + cancel_work_sync(&pl->resolve); kfree(pl); } @@ -706,6 +737,8 @@ void phylink_start(struct phylink *pl) clear_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); phylink_run_resolve(pl); + if (pl->sfp_bus) + sfp_upstream_start(pl->sfp_bus); if (pl->phydev) phy_start(pl->phydev); } @@ -717,6 +750,8 @@ void phylink_stop(struct phylink *pl) if (pl->phydev) phy_stop(pl->phydev); + if (pl->sfp_bus) + sfp_upstream_stop(pl->sfp_bus); set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state); flush_work(&pl->resolve); @@ -1166,4 +1201,126 @@ int phylink_mii_ioctl(struct phylink *pl, struct ifreq *ifr, int cmd) } EXPORT_SYMBOL_GPL(phylink_mii_ioctl); + + +static int phylink_sfp_module_insert(void *upstream, + const struct sfp_eeprom_id *id) +{ + struct phylink *pl = upstream; + __ETHTOOL_DECLARE_LINK_MODE_MASK(support) = { 0, }; + struct phylink_link_state config; + phy_interface_t iface; + int mode, ret = 0; + bool changed; + u8 port; + + sfp_parse_support(pl->sfp_bus, id, support); + port = sfp_parse_port(pl->sfp_bus, id, support); + iface = 
sfp_parse_interface(pl->sfp_bus, id); + + WARN_ON(!lockdep_rtnl_is_held()); + + switch (iface) { + case PHY_INTERFACE_MODE_SGMII: + mode = MLO_AN_SGMII; + break; + case PHY_INTERFACE_MODE_1000BASEX: + mode = MLO_AN_8023Z; + break; + default: + return -EINVAL; + } + + memset(&config, 0, sizeof(config)); + linkmode_copy(config.advertising, support); + config.interface = iface; + config.speed = SPEED_UNKNOWN; + config.duplex = DUPLEX_UNKNOWN; + config.pause = MLO_PAUSE_AN; + config.an_enabled = pl->link_config.an_enabled; + + /* Ignore errors if we're expecting a PHY to attach later */ + ret = phylink_validate(pl, support, &config); + if (ret) { + netdev_err(pl->netdev, "validation of %s/%s with support %*pb failed: %d\n", + phylink_an_mode_str(mode), phy_modes(config.interface), + __ETHTOOL_LINK_MODE_MASK_NBITS, support, ret); + return ret; + } + + netdev_dbg(pl->netdev, "requesting link mode %s/%s with support %*pb\n", + phylink_an_mode_str(mode), phy_modes(config.interface), + __ETHTOOL_LINK_MODE_MASK_NBITS, support); + + if (mode == MLO_AN_8023Z && pl->phydev) + return -EINVAL; + + changed = !bitmap_equal(pl->supported, support, + __ETHTOOL_LINK_MODE_MASK_NBITS); + if (changed) { + linkmode_copy(pl->supported, support); + linkmode_copy(pl->link_config.advertising, config.advertising); + } + + if (pl->link_an_mode != mode || + pl->link_config.interface != config.interface) { + pl->link_config.interface = config.interface; + pl->link_an_mode = mode; + + changed = true; + + netdev_info(pl->netdev, "switched to %s/%s link mode\n", + phylink_an_mode_str(mode), + phy_modes(config.interface)); + } + + pl->link_port = port; + + if (changed && !test_bit(PHYLINK_DISABLE_STOPPED, + &pl->phylink_disable_state)) + phylink_mac_config(pl, &pl->link_config); + + return ret; +} + +static void phylink_sfp_link_down(void *upstream) +{ + struct phylink *pl = upstream; + + WARN_ON(!lockdep_rtnl_is_held()); + + set_bit(PHYLINK_DISABLE_LINK, &pl->phylink_disable_state); + flush_work(&pl->resolve); + + netif_carrier_off(pl->netdev); +} + +static void phylink_sfp_link_up(void *upstream) +{ + struct phylink *pl = upstream; + + WARN_ON(!lockdep_rtnl_is_held()); + + clear_bit(PHYLINK_DISABLE_LINK, &pl->phylink_disable_state); + phylink_run_resolve(pl); +} + +static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy) +{ + return phylink_connect_phy(upstream, phy); +} + +static void phylink_sfp_disconnect_phy(void *upstream) +{ + phylink_disconnect_phy(upstream); +} + +static const struct sfp_upstream_ops sfp_phylink_ops = { + .module_insert = phylink_sfp_module_insert, + .link_up = phylink_sfp_link_up, + .link_down = phylink_sfp_link_down, + .connect_phy = phylink_sfp_connect_phy, + .disconnect_phy = phylink_sfp_disconnect_phy, +}; + MODULE_LICENSE("GPL"); diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c new file mode 100644 index 000000000000..5cb5384697ea --- /dev/null +++ b/drivers/net/phy/sfp-bus.c @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "sfp.h" + +struct sfp_bus { + struct kref kref; + struct list_head node; + struct device_node *device_node; + + const struct sfp_socket_ops *socket_ops; + struct device *sfp_dev; + struct sfp *sfp; + + const struct sfp_upstream_ops *upstream_ops; + void *upstream; + struct net_device *netdev; + struct phy_device *phydev; + + bool registered; + bool started; +}; + + +int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, + unsigned long *support) +{ + int port; + + /* 
port is the physical connector, set this from the connector field. */ + switch (id->base.connector) { + case SFP_CONNECTOR_SC: + case SFP_CONNECTOR_FIBERJACK: + case SFP_CONNECTOR_LC: + case SFP_CONNECTOR_MT_RJ: + case SFP_CONNECTOR_MU: + case SFP_CONNECTOR_OPTICAL_PIGTAIL: + if (support) + phylink_set(support, FIBRE); + port = PORT_FIBRE; + break; + + case SFP_CONNECTOR_RJ45: + if (support) + phylink_set(support, TP); + port = PORT_TP; + break; + + case SFP_CONNECTOR_UNSPEC: + if (id->base.e1000_base_t) { + if (support) + phylink_set(support, TP); + port = PORT_TP; + break; + } + /* fallthrough */ + case SFP_CONNECTOR_SG: /* guess */ + case SFP_CONNECTOR_MPO_1X12: + case SFP_CONNECTOR_MPO_2X16: + case SFP_CONNECTOR_HSSDC_II: + case SFP_CONNECTOR_COPPER_PIGTAIL: + case SFP_CONNECTOR_NOSEPARATE: + case SFP_CONNECTOR_MXC_2X16: + port = PORT_OTHER; + break; + default: + dev_warn(bus->sfp_dev, "SFP: unknown connector id 0x%02x\n", + id->base.connector); + port = PORT_OTHER; + break; + } + + return port; +} +EXPORT_SYMBOL_GPL(sfp_parse_port); + +phy_interface_t sfp_parse_interface(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) +{ + phy_interface_t iface; + + /* Setting the serdes link mode is guesswork: there's no field in + * the EEPROM which indicates what mode should be used. + * + * If the module wants 64b66b, then it must be >= 10G. + * + * If it's a gigabit-only fiber module, it probably does not have + * a PHY, so switch to 802.3z negotiation mode. Otherwise, switch + * to SGMII mode (which is required to support non-gigabit speeds). + */ + switch (id->base.encoding) { + case SFP_ENCODING_8472_64B66B: + iface = PHY_INTERFACE_MODE_10GKR; + break; + + case SFP_ENCODING_8B10B: + if (!id->base.e1000_base_t && + !id->base.e100_base_lx && + !id->base.e100_base_fx) + iface = PHY_INTERFACE_MODE_1000BASEX; + else + iface = PHY_INTERFACE_MODE_SGMII; + break; + + default: + iface = PHY_INTERFACE_MODE_NA; + dev_err(bus->sfp_dev, + "SFP module encoding does not support 8b10b nor 64b66b\n"); + break; + } + + return iface; +} +EXPORT_SYMBOL_GPL(sfp_parse_interface); + +void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, + unsigned long *support) +{ + phylink_set(support, Autoneg); + phylink_set(support, Pause); + phylink_set(support, Asym_Pause); + + /* Set ethtool support from the compliance fields. 
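 * These are single bits in the SFF-8472 base ID. The BR,nominal byte
 * consulted for the fibre channel heuristic further down is in units of
 * 100 MBd, which is why 12 approximates the 1.25 GBd line rate of
 * 1000base-X and 31 approximates the 3.125 GBd line rate of 2500base-X.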
*/ + if (id->base.e10g_base_sr) + phylink_set(support, 10000baseSR_Full); + if (id->base.e10g_base_lr) + phylink_set(support, 10000baseLR_Full); + if (id->base.e10g_base_lrm) + phylink_set(support, 10000baseLRM_Full); + if (id->base.e10g_base_er) + phylink_set(support, 10000baseER_Full); + if (id->base.e1000_base_sx || + id->base.e1000_base_lx || + id->base.e1000_base_cx) + phylink_set(support, 1000baseX_Full); + if (id->base.e1000_base_t) { + phylink_set(support, 1000baseT_Half); + phylink_set(support, 1000baseT_Full); + } + + switch (id->base.extended_cc) { + case 0x00: /* Unspecified */ + break; + case 0x02: /* 100Gbase-SR4 or 25Gbase-SR */ + phylink_set(support, 100000baseSR4_Full); + phylink_set(support, 25000baseSR_Full); + break; + case 0x03: /* 100Gbase-LR4 or 25Gbase-LR */ + case 0x04: /* 100Gbase-ER4 or 25Gbase-ER */ + phylink_set(support, 100000baseLR4_ER4_Full); + break; + case 0x0b: /* 100Gbase-CR4 or 25Gbase-CR CA-L */ + case 0x0c: /* 25Gbase-CR CA-S */ + case 0x0d: /* 25Gbase-CR CA-N */ + phylink_set(support, 100000baseCR4_Full); + phylink_set(support, 25000baseCR_Full); + break; + default: + dev_warn(bus->sfp_dev, + "Unknown/unsupported extended compliance code: 0x%02x\n", + id->base.extended_cc); + break; + } + + /* For fibre channel SFP, derive possible BaseX modes */ + if (id->base.fc_speed_100 || + id->base.fc_speed_200 || + id->base.fc_speed_400) { + if (id->base.br_nominal >= 31) + phylink_set(support, 2500baseX_Full); + if (id->base.br_nominal >= 12) + phylink_set(support, 1000baseX_Full); + } + + switch (id->base.connector) { + case SFP_CONNECTOR_SC: + case SFP_CONNECTOR_FIBERJACK: + case SFP_CONNECTOR_LC: + case SFP_CONNECTOR_MT_RJ: + case SFP_CONNECTOR_MU: + case SFP_CONNECTOR_OPTICAL_PIGTAIL: + break; + + case SFP_CONNECTOR_UNSPEC: + if (id->base.e1000_base_t) + break; + + case SFP_CONNECTOR_SG: /* guess */ + case SFP_CONNECTOR_MPO_1X12: + case SFP_CONNECTOR_MPO_2X16: + case SFP_CONNECTOR_HSSDC_II: + case SFP_CONNECTOR_COPPER_PIGTAIL: + case SFP_CONNECTOR_NOSEPARATE: + case SFP_CONNECTOR_MXC_2X16: + default: + /* a guess at the supported link modes */ + dev_warn(bus->sfp_dev, + "Guessing link modes, please report...\n"); + phylink_set(support, 1000baseT_Half); + phylink_set(support, 1000baseT_Full); + break; + } +} +EXPORT_SYMBOL_GPL(sfp_parse_support); + + +static LIST_HEAD(sfp_buses); +static DEFINE_MUTEX(sfp_mutex); + +static const struct sfp_upstream_ops *sfp_get_upstream_ops(struct sfp_bus *bus) +{ + return bus->registered ? 
bus->upstream_ops : NULL; +} + +static struct sfp_bus *sfp_bus_get(struct device_node *np) +{ + struct sfp_bus *sfp, *new, *found = NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + + mutex_lock(&sfp_mutex); + + list_for_each_entry(sfp, &sfp_buses, node) { + if (sfp->device_node == np) { + kref_get(&sfp->kref); + found = sfp; + break; + } + } + + if (!found && new) { + kref_init(&new->kref); + new->device_node = np; + list_add(&new->node, &sfp_buses); + found = new; + new = NULL; + } + + mutex_unlock(&sfp_mutex); + + kfree(new); + + return found; +} + +static void sfp_bus_release(struct kref *kref) __releases(sfp_mutex) +{ + struct sfp_bus *bus = container_of(kref, struct sfp_bus, kref); + + list_del(&bus->node); + mutex_unlock(&sfp_mutex); + kfree(bus); +} + +static void sfp_bus_put(struct sfp_bus *bus) +{ + kref_put_mutex(&bus->kref, sfp_bus_release, &sfp_mutex); +} + +static int sfp_register_bus(struct sfp_bus *bus) +{ + const struct sfp_upstream_ops *ops = bus->upstream_ops; + int ret; + + if (ops) { + if (ops->link_down) + ops->link_down(bus->upstream); + if (ops->connect_phy && bus->phydev) { + ret = ops->connect_phy(bus->upstream, bus->phydev); + if (ret) + return ret; + } + } + if (bus->started) + bus->socket_ops->start(bus->sfp); + bus->registered = true; + return 0; +} + +static void sfp_unregister_bus(struct sfp_bus *bus) +{ + const struct sfp_upstream_ops *ops = bus->upstream_ops; + + if (bus->registered) { + if (bus->started) + bus->socket_ops->stop(bus->sfp); + if (bus->phydev && ops && ops->disconnect_phy) + ops->disconnect_phy(bus->upstream); + } + bus->registered = false; +} + + +int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo) +{ + if (!bus->registered) + return -ENOIOCTLCMD; + return bus->socket_ops->module_info(bus->sfp, modinfo); +} +EXPORT_SYMBOL_GPL(sfp_get_module_info); + +int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, + u8 *data) +{ + if (!bus->registered) + return -ENOIOCTLCMD; + return bus->socket_ops->module_eeprom(bus->sfp, ee, data); +} +EXPORT_SYMBOL_GPL(sfp_get_module_eeprom); + +void sfp_upstream_start(struct sfp_bus *bus) +{ + if (bus->registered) + bus->socket_ops->start(bus->sfp); + bus->started = true; +} +EXPORT_SYMBOL_GPL(sfp_upstream_start); + +void sfp_upstream_stop(struct sfp_bus *bus) +{ + if (bus->registered) + bus->socket_ops->stop(bus->sfp); + bus->started = false; +} +EXPORT_SYMBOL_GPL(sfp_upstream_stop); + +struct sfp_bus *sfp_register_upstream(struct device_node *np, + struct net_device *ndev, void *upstream, + const struct sfp_upstream_ops *ops) +{ + struct sfp_bus *bus = sfp_bus_get(np); + int ret = 0; + + if (bus) { + rtnl_lock(); + bus->upstream_ops = ops; + bus->upstream = upstream; + bus->netdev = ndev; + + if (bus->sfp) + ret = sfp_register_bus(bus); + rtnl_unlock(); + } + + if (ret) { + sfp_bus_put(bus); + bus = NULL; + } + + return bus; +} +EXPORT_SYMBOL_GPL(sfp_register_upstream); + +void sfp_unregister_upstream(struct sfp_bus *bus) +{ + rtnl_lock(); + sfp_unregister_bus(bus); + bus->upstream = NULL; + bus->netdev = NULL; + rtnl_unlock(); + + sfp_bus_put(bus); +} +EXPORT_SYMBOL_GPL(sfp_unregister_upstream); + + +/* Socket driver entry points */ +int sfp_add_phy(struct sfp_bus *bus, struct phy_device *phydev) +{ + const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); + int ret = 0; + + if (ops && ops->connect_phy) + ret = ops->connect_phy(bus->upstream, phydev); + + if (ret == 0) + bus->phydev = phydev; + + return ret; +} +EXPORT_SYMBOL_GPL(sfp_add_phy); + +void 
sfp_remove_phy(struct sfp_bus *bus) +{ + const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); + + if (ops && ops->disconnect_phy) + ops->disconnect_phy(bus->upstream); + bus->phydev = NULL; +} +EXPORT_SYMBOL_GPL(sfp_remove_phy); + + +void sfp_link_up(struct sfp_bus *bus) +{ + const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); + + if (ops && ops->link_up) + ops->link_up(bus->upstream); +} +EXPORT_SYMBOL_GPL(sfp_link_up); + +void sfp_link_down(struct sfp_bus *bus) +{ + const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); + + if (ops && ops->link_down) + ops->link_down(bus->upstream); +} +EXPORT_SYMBOL_GPL(sfp_link_down); + +int sfp_module_insert(struct sfp_bus *bus, const struct sfp_eeprom_id *id) +{ + const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); + int ret = 0; + + if (ops && ops->module_insert) + ret = ops->module_insert(bus->upstream, id); + + return ret; +} +EXPORT_SYMBOL_GPL(sfp_module_insert); + +void sfp_module_remove(struct sfp_bus *bus) +{ + const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); + + if (ops && ops->module_remove) + ops->module_remove(bus->upstream); +} +EXPORT_SYMBOL_GPL(sfp_module_remove); + +struct sfp_bus *sfp_register_socket(struct device *dev, struct sfp *sfp, + const struct sfp_socket_ops *ops) +{ + struct sfp_bus *bus = sfp_bus_get(dev->of_node); + int ret = 0; + + if (bus) { + rtnl_lock(); + bus->sfp_dev = dev; + bus->sfp = sfp; + bus->socket_ops = ops; + + if (bus->netdev) + ret = sfp_register_bus(bus); + rtnl_unlock(); + } + + if (ret) { + sfp_bus_put(bus); + bus = NULL; + } + + return bus; +} +EXPORT_SYMBOL_GPL(sfp_register_socket); + +void sfp_unregister_socket(struct sfp_bus *bus) +{ + rtnl_lock(); + sfp_unregister_bus(bus); + bus->sfp_dev = NULL; + bus->sfp = NULL; + bus->socket_ops = NULL; + rtnl_unlock(); + + sfp_bus_put(bus); +} +EXPORT_SYMBOL_GPL(sfp_unregister_socket); diff --git a/drivers/net/phy/sfp.h b/drivers/net/phy/sfp.h new file mode 100644 index 000000000000..31b0acf337e2 --- /dev/null +++ b/drivers/net/phy/sfp.h @@ -0,0 +1,28 @@ +#ifndef SFP_H +#define SFP_H + +#include +#include + +struct sfp; + +struct sfp_socket_ops { + void (*start)(struct sfp *sfp); + void (*stop)(struct sfp *sfp); + int (*module_info)(struct sfp *sfp, struct ethtool_modinfo *modinfo); + int (*module_eeprom)(struct sfp *sfp, struct ethtool_eeprom *ee, + u8 *data); +}; + +int sfp_add_phy(struct sfp_bus *bus, struct phy_device *phydev); +void sfp_remove_phy(struct sfp_bus *bus); +void sfp_link_up(struct sfp_bus *bus); +void sfp_link_down(struct sfp_bus *bus); +int sfp_module_insert(struct sfp_bus *bus, const struct sfp_eeprom_id *id); +void sfp_module_remove(struct sfp_bus *bus); +int sfp_link_configure(struct sfp_bus *bus, const struct sfp_eeprom_id *id); +struct sfp_bus *sfp_register_socket(struct device *dev, struct sfp *sfp, + const struct sfp_socket_ops *ops); +void sfp_unregister_socket(struct sfp_bus *bus); + +#endif diff --git a/include/linux/sfp.h b/include/linux/sfp.h new file mode 100644 index 000000000000..4a906f560817 --- /dev/null +++ b/include/linux/sfp.h @@ -0,0 +1,434 @@ +#ifndef LINUX_SFP_H +#define LINUX_SFP_H + +#include + +struct __packed sfp_eeprom_base { + u8 phys_id; + u8 phys_ext_id; + u8 connector; +#if defined __BIG_ENDIAN_BITFIELD + u8 e10g_base_er:1; + u8 e10g_base_lrm:1; + u8 e10g_base_lr:1; + u8 e10g_base_sr:1; + u8 if_1x_sx:1; + u8 if_1x_lx:1; + u8 if_1x_copper_active:1; + u8 if_1x_copper_passive:1; + + u8 escon_mmf_1310_led:1; + u8 escon_smf_1310_laser:1; + u8 
sonet_oc192_short_reach:1; + u8 sonet_reach_bit1:1; + u8 sonet_reach_bit2:1; + u8 sonet_oc48_long_reach:1; + u8 sonet_oc48_intermediate_reach:1; + u8 sonet_oc48_short_reach:1; + + u8 unallocated_5_7:1; + u8 sonet_oc12_smf_long_reach:1; + u8 sonet_oc12_smf_intermediate_reach:1; + u8 sonet_oc12_short_reach:1; + u8 unallocated_5_3:1; + u8 sonet_oc3_smf_long_reach:1; + u8 sonet_oc3_smf_intermediate_reach:1; + u8 sonet_oc3_short_reach:1; + + u8 e_base_px:1; + u8 e_base_bx10:1; + u8 e100_base_fx:1; + u8 e100_base_lx:1; + u8 e1000_base_t:1; + u8 e1000_base_cx:1; + u8 e1000_base_lx:1; + u8 e1000_base_sx:1; + + u8 fc_ll_v:1; + u8 fc_ll_s:1; + u8 fc_ll_i:1; + u8 fc_ll_l:1; + u8 fc_ll_m:1; + u8 fc_tech_sa:1; + u8 fc_tech_lc:1; + u8 fc_tech_electrical_inter_enclosure:1; + + u8 fc_tech_electrical_intra_enclosure:1; + u8 fc_tech_sn:1; + u8 fc_tech_sl:1; + u8 fc_tech_ll:1; + u8 sfp_ct_active:1; + u8 sfp_ct_passive:1; + u8 unallocated_8_1:1; + u8 unallocated_8_0:1; + + u8 fc_media_tw:1; + u8 fc_media_tp:1; + u8 fc_media_mi:1; + u8 fc_media_tv:1; + u8 fc_media_m6:1; + u8 fc_media_m5:1; + u8 unallocated_9_1:1; + u8 fc_media_sm:1; + + u8 fc_speed_1200:1; + u8 fc_speed_800:1; + u8 fc_speed_1600:1; + u8 fc_speed_400:1; + u8 fc_speed_3200:1; + u8 fc_speed_200:1; + u8 unallocated_10_1:1; + u8 fc_speed_100:1; +#elif defined __LITTLE_ENDIAN_BITFIELD + u8 if_1x_copper_passive:1; + u8 if_1x_copper_active:1; + u8 if_1x_lx:1; + u8 if_1x_sx:1; + u8 e10g_base_sr:1; + u8 e10g_base_lr:1; + u8 e10g_base_lrm:1; + u8 e10g_base_er:1; + + u8 sonet_oc3_short_reach:1; + u8 sonet_oc3_smf_intermediate_reach:1; + u8 sonet_oc3_smf_long_reach:1; + u8 unallocated_5_3:1; + u8 sonet_oc12_short_reach:1; + u8 sonet_oc12_smf_intermediate_reach:1; + u8 sonet_oc12_smf_long_reach:1; + u8 unallocated_5_7:1; + + u8 sonet_oc48_short_reach:1; + u8 sonet_oc48_intermediate_reach:1; + u8 sonet_oc48_long_reach:1; + u8 sonet_reach_bit2:1; + u8 sonet_reach_bit1:1; + u8 sonet_oc192_short_reach:1; + u8 escon_smf_1310_laser:1; + u8 escon_mmf_1310_led:1; + + u8 e1000_base_sx:1; + u8 e1000_base_lx:1; + u8 e1000_base_cx:1; + u8 e1000_base_t:1; + u8 e100_base_lx:1; + u8 e100_base_fx:1; + u8 e_base_bx10:1; + u8 e_base_px:1; + + u8 fc_tech_electrical_inter_enclosure:1; + u8 fc_tech_lc:1; + u8 fc_tech_sa:1; + u8 fc_ll_m:1; + u8 fc_ll_l:1; + u8 fc_ll_i:1; + u8 fc_ll_s:1; + u8 fc_ll_v:1; + + u8 unallocated_8_0:1; + u8 unallocated_8_1:1; + u8 sfp_ct_passive:1; + u8 sfp_ct_active:1; + u8 fc_tech_ll:1; + u8 fc_tech_sl:1; + u8 fc_tech_sn:1; + u8 fc_tech_electrical_intra_enclosure:1; + + u8 fc_media_sm:1; + u8 unallocated_9_1:1; + u8 fc_media_m5:1; + u8 fc_media_m6:1; + u8 fc_media_tv:1; + u8 fc_media_mi:1; + u8 fc_media_tp:1; + u8 fc_media_tw:1; + + u8 fc_speed_100:1; + u8 unallocated_10_1:1; + u8 fc_speed_200:1; + u8 fc_speed_3200:1; + u8 fc_speed_400:1; + u8 fc_speed_1600:1; + u8 fc_speed_800:1; + u8 fc_speed_1200:1; +#else +#error Unknown Endian +#endif + u8 encoding; + u8 br_nominal; + u8 rate_id; + u8 link_len[6]; + char vendor_name[16]; + u8 extended_cc; + char vendor_oui[3]; + char vendor_pn[16]; + char vendor_rev[4]; + union { + __be16 optical_wavelength; + u8 cable_spec; + }; + u8 reserved62; + u8 cc_base; +}; + +struct __packed sfp_eeprom_ext { + __be16 options; + u8 br_max; + u8 br_min; + char vendor_sn[16]; + char datecode[8]; + u8 diagmon; + u8 enhopts; + u8 sff8472_compliance; + u8 cc_ext; +}; + +struct __packed sfp_eeprom_id { + struct sfp_eeprom_base base; + struct sfp_eeprom_ext ext; +}; + +/* SFP EEPROM registers */ +enum { + SFP_PHYS_ID = 0x00, + 
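	/* A worked example, assuming a raw dump of the A0h ID page as laid
	 * out by SFF-8472 (these enumerators are its byte offsets):
	 *
	 *	u8 a0[256];			(filled, e.g., via sfp_get_module_eeprom())
	 *	u8 conn = a0[SFP_CONNECTOR];	(0x07 would mean an LC connector)
	 */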
SFP_PHYS_EXT_ID = 0x01, + SFP_CONNECTOR = 0x02, + SFP_COMPLIANCE = 0x03, + SFP_ENCODING = 0x0b, + SFP_BR_NOMINAL = 0x0c, + SFP_RATE_ID = 0x0d, + SFP_LINK_LEN_SM_KM = 0x0e, + SFP_LINK_LEN_SM_100M = 0x0f, + SFP_LINK_LEN_50UM_OM2_10M = 0x10, + SFP_LINK_LEN_62_5UM_OM1_10M = 0x11, + SFP_LINK_LEN_COPPER_1M = 0x12, + SFP_LINK_LEN_50UM_OM4_10M = 0x12, + SFP_LINK_LEN_50UM_OM3_10M = 0x13, + SFP_VENDOR_NAME = 0x14, + SFP_VENDOR_OUI = 0x25, + SFP_VENDOR_PN = 0x28, + SFP_VENDOR_REV = 0x38, + SFP_OPTICAL_WAVELENGTH_MSB = 0x3c, + SFP_OPTICAL_WAVELENGTH_LSB = 0x3d, + SFP_CABLE_SPEC = 0x3c, + SFP_CC_BASE = 0x3f, + SFP_OPTIONS = 0x40, /* 2 bytes, MSB, LSB */ + SFP_BR_MAX = 0x42, + SFP_BR_MIN = 0x43, + SFP_VENDOR_SN = 0x44, + SFP_DATECODE = 0x54, + SFP_DIAGMON = 0x5c, + SFP_ENHOPTS = 0x5d, + SFP_SFF8472_COMPLIANCE = 0x5e, + SFP_CC_EXT = 0x5f, + + SFP_PHYS_ID_SFP = 0x03, + SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_CONNECTOR_UNSPEC = 0x00, + /* codes 01-05 not supportable on SFP, but some modules have single SC */ + SFP_CONNECTOR_SC = 0x01, + SFP_CONNECTOR_FIBERJACK = 0x06, + SFP_CONNECTOR_LC = 0x07, + SFP_CONNECTOR_MT_RJ = 0x08, + SFP_CONNECTOR_MU = 0x09, + SFP_CONNECTOR_SG = 0x0a, + SFP_CONNECTOR_OPTICAL_PIGTAIL = 0x0b, + SFP_CONNECTOR_MPO_1X12 = 0x0c, + SFP_CONNECTOR_MPO_2X16 = 0x0d, + SFP_CONNECTOR_HSSDC_II = 0x20, + SFP_CONNECTOR_COPPER_PIGTAIL = 0x21, + SFP_CONNECTOR_RJ45 = 0x22, + SFP_CONNECTOR_NOSEPARATE = 0x23, + SFP_CONNECTOR_MXC_2X16 = 0x24, + SFP_ENCODING_UNSPEC = 0x00, + SFP_ENCODING_8B10B = 0x01, + SFP_ENCODING_4B5B = 0x02, + SFP_ENCODING_NRZ = 0x03, + SFP_ENCODING_8472_MANCHESTER = 0x04, + SFP_ENCODING_8472_SONET = 0x05, + SFP_ENCODING_8472_64B66B = 0x06, + SFP_ENCODING_256B257B = 0x07, + SFP_ENCODING_PAM4 = 0x08, + SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), + SFP_OPTIONS_PAGING_A2 = BIT(12), + SFP_OPTIONS_RETIMER = BIT(11), + SFP_OPTIONS_COOLED_XCVR = BIT(10), + SFP_OPTIONS_POWER_DECL = BIT(9), + SFP_OPTIONS_RX_LINEAR_OUT = BIT(8), + SFP_OPTIONS_RX_DECISION_THRESH = BIT(7), + SFP_OPTIONS_TUNABLE_TX = BIT(6), + SFP_OPTIONS_RATE_SELECT = BIT(5), + SFP_OPTIONS_TX_DISABLE = BIT(4), + SFP_OPTIONS_TX_FAULT = BIT(3), + SFP_OPTIONS_LOS_INVERTED = BIT(2), + SFP_OPTIONS_LOS_NORMAL = BIT(1), + SFP_DIAGMON_DDM = BIT(6), + SFP_DIAGMON_INT_CAL = BIT(5), + SFP_DIAGMON_EXT_CAL = BIT(4), + SFP_DIAGMON_RXPWR_AVG = BIT(3), + SFP_DIAGMON_ADDRMODE = BIT(2), + SFP_ENHOPTS_ALARMWARN = BIT(7), + SFP_ENHOPTS_SOFT_TX_DISABLE = BIT(6), + SFP_ENHOPTS_SOFT_TX_FAULT = BIT(5), + SFP_ENHOPTS_SOFT_RX_LOS = BIT(4), + SFP_ENHOPTS_SOFT_RATE_SELECT = BIT(3), + SFP_ENHOPTS_APP_SELECT_SFF8079 = BIT(2), + SFP_ENHOPTS_SOFT_RATE_SFF8431 = BIT(1), + SFP_SFF8472_COMPLIANCE_NONE = 0x00, + SFP_SFF8472_COMPLIANCE_REV9_3 = 0x01, + SFP_SFF8472_COMPLIANCE_REV9_5 = 0x02, + SFP_SFF8472_COMPLIANCE_REV10_2 = 0x03, + SFP_SFF8472_COMPLIANCE_REV10_4 = 0x04, + SFP_SFF8472_COMPLIANCE_REV11_0 = 0x05, + SFP_SFF8472_COMPLIANCE_REV11_3 = 0x06, + SFP_SFF8472_COMPLIANCE_REV11_4 = 0x07, + SFP_SFF8472_COMPLIANCE_REV12_0 = 0x08, +}; + +/* SFP Diagnostics */ +enum { + /* Alarm and warnings stored MSB at lower address then LSB */ + SFP_TEMP_HIGH_ALARM = 0x00, + SFP_TEMP_LOW_ALARM = 0x02, + SFP_TEMP_HIGH_WARN = 0x04, + SFP_TEMP_LOW_WARN = 0x06, + SFP_VOLT_HIGH_ALARM = 0x08, + SFP_VOLT_LOW_ALARM = 0x0a, + SFP_VOLT_HIGH_WARN = 0x0c, + SFP_VOLT_LOW_WARN = 0x0e, + SFP_BIAS_HIGH_ALARM = 0x10, + SFP_BIAS_LOW_ALARM = 0x12, + SFP_BIAS_HIGH_WARN = 0x14, + SFP_BIAS_LOW_WARN = 0x16, + SFP_TXPWR_HIGH_ALARM = 0x18, + SFP_TXPWR_LOW_ALARM = 0x1a, + SFP_TXPWR_HIGH_WARN = 0x1c, + 
SFP_TXPWR_LOW_WARN = 0x1e, + SFP_RXPWR_HIGH_ALARM = 0x20, + SFP_RXPWR_LOW_ALARM = 0x22, + SFP_RXPWR_HIGH_WARN = 0x24, + SFP_RXPWR_LOW_WARN = 0x26, + SFP_LASER_TEMP_HIGH_ALARM = 0x28, + SFP_LASER_TEMP_LOW_ALARM = 0x2a, + SFP_LASER_TEMP_HIGH_WARN = 0x2c, + SFP_LASER_TEMP_LOW_WARN = 0x2e, + SFP_TEC_CUR_HIGH_ALARM = 0x30, + SFP_TEC_CUR_LOW_ALARM = 0x32, + SFP_TEC_CUR_HIGH_WARN = 0x34, + SFP_TEC_CUR_LOW_WARN = 0x36, + SFP_CAL_RXPWR4 = 0x38, + SFP_CAL_RXPWR3 = 0x3c, + SFP_CAL_RXPWR2 = 0x40, + SFP_CAL_RXPWR1 = 0x44, + SFP_CAL_RXPWR0 = 0x48, + SFP_CAL_TXI_SLOPE = 0x4c, + SFP_CAL_TXI_OFFSET = 0x4e, + SFP_CAL_TXPWR_SLOPE = 0x50, + SFP_CAL_TXPWR_OFFSET = 0x52, + SFP_CAL_T_SLOPE = 0x54, + SFP_CAL_T_OFFSET = 0x56, + SFP_CAL_V_SLOPE = 0x58, + SFP_CAL_V_OFFSET = 0x5a, + SFP_CHKSUM = 0x5f, + + SFP_TEMP = 0x60, + SFP_VCC = 0x62, + SFP_TX_BIAS = 0x64, + SFP_TX_POWER = 0x66, + SFP_RX_POWER = 0x68, + SFP_LASER_TEMP = 0x6a, + SFP_TEC_CUR = 0x6c, + + SFP_STATUS = 0x6e, + SFP_ALARM = 0x70, + + SFP_EXT_STATUS = 0x76, + SFP_VSL = 0x78, + SFP_PAGE = 0x7f, +}; + +struct device_node; +struct ethtool_eeprom; +struct ethtool_modinfo; +struct net_device; +struct sfp_bus; + +struct sfp_upstream_ops { + int (*module_insert)(void *, const struct sfp_eeprom_id *id); + void (*module_remove)(void *); + void (*link_down)(void *); + void (*link_up)(void *); + int (*connect_phy)(void *, struct phy_device *); + void (*disconnect_phy)(void *); +}; + +#if IS_ENABLED(CONFIG_SFP) +int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, + unsigned long *support); +phy_interface_t sfp_parse_interface(struct sfp_bus *bus, + const struct sfp_eeprom_id *id); +void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, + unsigned long *support); + +int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo); +int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, + u8 *data); +void sfp_upstream_start(struct sfp_bus *bus); +void sfp_upstream_stop(struct sfp_bus *bus); +struct sfp_bus *sfp_register_upstream(struct device_node *np, + struct net_device *ndev, void *upstream, + const struct sfp_upstream_ops *ops); +void sfp_unregister_upstream(struct sfp_bus *bus); +#else +static inline int sfp_parse_port(struct sfp_bus *bus, + const struct sfp_eeprom_id *id, + unsigned long *support) +{ + return PORT_OTHER; +} + +static inline phy_interface_t sfp_parse_interface(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) +{ + return PHY_INTERFACE_MODE_NA; +} + +static inline void sfp_parse_support(struct sfp_bus *bus, + const struct sfp_eeprom_id *id, + unsigned long *support) +{ +} + +static inline int sfp_get_module_info(struct sfp_bus *bus, + struct ethtool_modinfo *modinfo) +{ + return -EOPNOTSUPP; +} + +static inline int sfp_get_module_eeprom(struct sfp_bus *bus, + struct ethtool_eeprom *ee, u8 *data) +{ + return -EOPNOTSUPP; +} + +static inline void sfp_upstream_start(struct sfp_bus *bus) +{ +} + +static inline void sfp_upstream_stop(struct sfp_bus *bus) +{ +} + +static inline struct sfp_bus *sfp_register_upstream(struct device_node *np, + struct net_device *ndev, void *upstream, + const struct sfp_upstream_ops *ops) +{ + return (struct sfp_bus *)-1; +} + +static inline void sfp_unregister_upstream(struct sfp_bus *bus) +{ +} +#endif + +#endif -- cgit v1.2.3 From 770a1ad55763a8a783cb71078e0b33a6b91ad92b Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 25 Jul 2017 15:03:23 +0100 Subject: phylink: add module EEPROM support Add support for reading module EEPROMs through 
phylink. Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 28 ++++++++++++++++++++++++++++ include/linux/phylink.h | 3 +++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 02082f4a8a95..026060c95b82 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1020,6 +1020,34 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, } EXPORT_SYMBOL_GPL(phylink_ethtool_set_pauseparam); +int phylink_ethtool_get_module_info(struct phylink *pl, + struct ethtool_modinfo *modinfo) +{ + int ret = -EOPNOTSUPP; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->sfp_bus) + ret = sfp_get_module_info(pl->sfp_bus, modinfo); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_get_module_info); + +int phylink_ethtool_get_module_eeprom(struct phylink *pl, + struct ethtool_eeprom *ee, u8 *buf) +{ + int ret = -EOPNOTSUPP; + + WARN_ON(!lockdep_rtnl_is_held()); + + if (pl->sfp_bus) + ret = sfp_get_module_eeprom(pl->sfp_bus, ee, buf); + + return ret; +} +EXPORT_SYMBOL_GPL(phylink_ethtool_get_module_eeprom); + int phylink_init_eee(struct phylink *pl, bool clk_stop_enable) { int ret = -EPROTONOSUPPORT; diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 76f054f39684..af67edd4ae38 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -125,6 +125,9 @@ void phylink_ethtool_get_pauseparam(struct phylink *, struct ethtool_pauseparam *); int phylink_ethtool_set_pauseparam(struct phylink *, struct ethtool_pauseparam *); +int phylink_ethtool_get_module_info(struct phylink *, struct ethtool_modinfo *); +int phylink_ethtool_get_module_eeprom(struct phylink *, + struct ethtool_eeprom *, u8 *); int phylink_init_eee(struct phylink *, bool); int phylink_get_eee_err(struct phylink *); int phylink_ethtool_get_eee(struct phylink *, struct ethtool_eee *); -- cgit v1.2.3 From 4faf783998b8cb88294e9df89032f473f8771b78 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 3 Aug 2017 20:38:51 -0700 Subject: tcp: fix cwnd undo in Reno and HTCP congestion controls Using ssthresh to revert cwnd is less reliable when ssthresh is bounded to 2 packets. This patch uses an existing variable in TCP "prior_cwnd" that snapshots the cwnd right before entering fast recovery and RTO recovery in Reno. This fixes the issue discussed in netdev thread: "A buggy behavior for Linux TCP Reno and HTCP" https://www.spinics.net/lists/netdev/msg444955.html Suggested-by: Neal Cardwell Reported-by: Wei Sun Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_htcp.c | 3 +-- net/ipv4/tcp_input.c | 1 + 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d7389ea36e10..267164a1d559 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -258,7 +258,7 @@ struct tcp_sock { u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; - u32 prior_cwnd; /* Congestion window at start of Recovery. */ + u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. 
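 * A worked example of the undo change below: if cwnd was 20 when loss
 * recovery began and repeated losses drove ssthresh down to its floor of
 * 2, the old max(snd_cwnd, snd_ssthresh << 1) could restore at most 4,
 * whereas max(snd_cwnd, prior_cwnd) restores the full 20 once the loss
 * proves spurious.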
*/ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..c2b174469645 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -456,7 +456,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd, tp->snd_ssthresh << 1); + return max(tp->snd_cwnd, tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 3eb78cde6ff0..082d479462fa 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -66,7 +66,6 @@ static inline void htcp_reset(struct htcp *ca) static u32 htcp_cwnd_undo(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); if (ca->undo_last_cong) { @@ -76,7 +75,7 @@ static u32 htcp_cwnd_undo(struct sock *sk) ca->undo_last_cong = 0; } - return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta); + return tcp_reno_undo_cwnd(sk); } static inline void measure_rtt(struct sock *sk, u32 srtt) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 99cdf4ccabb8..842ed75ccb25 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1950,6 +1950,7 @@ void tcp_enter_loss(struct sock *sk) !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->prior_cwnd = tp->snd_cwnd; tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); -- cgit v1.2.3 From 233e7936c84b7d7dd90c10e2ba27abb5ab42956f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:51 +0800 Subject: sctp: remove the typedef sctp_lower_cwnd_t This patch is to remove the typedef sctp_lower_cwnd_t, and replace with enum sctp_lower_cwnd in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 4 ++-- include/net/sctp/structs.h | 3 ++- net/sctp/transport.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 9b18044c551e..761064ee3ac5 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -378,12 +378,12 @@ typedef enum { } sctp_retransmit_reason_t; /* Reasons to lower cwnd. */ -typedef enum { +enum sctp_lower_cwnd { SCTP_LOWER_CWND_T3_RTX, SCTP_LOWER_CWND_FAST_RTX, SCTP_LOWER_CWND_ECNE, SCTP_LOWER_CWND_INACTIVE, -} sctp_lower_cwnd_t; +}; /* SCTP-AUTH Necessary constants */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 66cd7639b912..53802d8872d7 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -950,7 +950,8 @@ int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); void sctp_transport_raise_cwnd(struct sctp_transport *, __u32, __u32); -void sctp_transport_lower_cwnd(struct sctp_transport *, sctp_lower_cwnd_t); +void sctp_transport_lower_cwnd(struct sctp_transport *t, + enum sctp_lower_cwnd reason); void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 80a97c8501a7..2d9bd3776bc8 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -490,7 +490,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, * detected. 
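 * (caller-side sketch: on a T3-rtx timeout the state machine invokes
 * sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX), and the
 * reason selects the matching cwnd reduction rule)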
*/ void sctp_transport_lower_cwnd(struct sctp_transport *transport, - sctp_lower_cwnd_t reason) + enum sctp_lower_cwnd reason) { struct sctp_association *asoc = transport->asoc; -- cgit v1.2.3 From 125c29820252bfa5bf8081e75618e4ee7e9487da Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:52 +0800 Subject: sctp: remove the typedef sctp_retransmit_reason_t This patch is to remove the typedef sctp_retransmit_reason_t, and replace with enum sctp_retransmit_reason in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 4 ++-- include/net/sctp/structs.h | 4 ++-- net/sctp/outqueue.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 761064ee3ac5..922fba5880d6 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -370,12 +370,12 @@ typedef enum { peer */ /* Reasons to retransmit. */ -typedef enum { +enum sctp_retransmit_reason { SCTP_RTXR_T3_RTX, SCTP_RTXR_FAST_RTX, SCTP_RTXR_PMTUD, SCTP_RTXR_T1_RTX, -} sctp_retransmit_reason_t; +}; /* Reasons to lower cwnd. */ enum sctp_lower_cwnd { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 53802d8872d7..5e872acee6e3 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1054,8 +1054,8 @@ int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); -void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, - sctp_retransmit_reason_t); +void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, + enum sctp_retransmit_reason reason); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index d2a8adfd4a6f..08ee0ed9a0c6 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -534,7 +534,7 @@ void sctp_retransmit_mark(struct sctp_outq *q, * one packet out. */ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, - sctp_retransmit_reason_t reason) + enum sctp_retransmit_reason reason) { struct net *net = sock_net(q->asoc->base.sk); -- cgit v1.2.3 From 701ef3e6c74be771a76be39817941e68e7228644 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:53 +0800 Subject: sctp: remove the typedef sctp_scope_policy_t This patch is to remove the typedef sctp_scope_policy_t and keep it's members as an anonymous enum. It is also to define SCTP_SCOPE_POLICY_MAX to replace the num 3 in sysctl.c to make codes clear. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/net/sctp/constants.h | 6 ++++-- net/sctp/sysctl.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 922fba5880d6..acb03eb64842 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -341,12 +341,14 @@ typedef enum { SCTP_SCOPE_UNUSABLE, /* IPv4 unusable addresses */ } sctp_scope_t; -typedef enum { +enum { SCTP_SCOPE_POLICY_DISABLE, /* Disable IPv4 address scoping */ SCTP_SCOPE_POLICY_ENABLE, /* Enable IPv4 address scoping */ SCTP_SCOPE_POLICY_PRIVATE, /* Follow draft but allow IPv4 private addresses */ SCTP_SCOPE_POLICY_LINK, /* Follow draft but allow IPv4 link local addresses */ -} sctp_scope_policy_t; +}; + +#define SCTP_SCOPE_POLICY_MAX SCTP_SCOPE_POLICY_LINK /* Based on IPv4 scoping , * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 0e732f68c2bf..ef7ca44d6e6a 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -46,7 +46,7 @@ static int timer_max = 86400000; /* ms in one day */ static int int_max = INT_MAX; static int sack_timer_min = 1; static int sack_timer_max = 500; -static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */ +static int addr_scope_max = SCTP_SCOPE_POLICY_MAX; static int rwnd_scale_max = 16; static int rto_alpha_min = 0; static int rto_beta_min = 0; -- cgit v1.2.3 From 1c662018d2d41ecc5550cbd74d29d2d32c164ed3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:54 +0800 Subject: sctp: remove the typedef sctp_scope_t This patch is to remove the typedef sctp_scope_t, and replace with enum sctp_scope in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 4 ++-- include/net/sctp/sctp.h | 4 ++-- include/net/sctp/structs.h | 17 +++++++++-------- net/sctp/associola.c | 17 ++++++++--------- net/sctp/bind_addr.c | 20 ++++++++++---------- net/sctp/ipv6.c | 6 +++--- net/sctp/protocol.c | 6 +++--- net/sctp/sm_make_chunk.c | 6 +++--- net/sctp/socket.c | 4 ++-- 9 files changed, 42 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index acb03eb64842..0503bb70e5d9 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -333,13 +333,13 @@ typedef enum { * At this point, the IPv6 scopes will be mapped to these internal scopes * as much as possible. 
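 * For example, an fe80::/10 link-local source is classified as
 * SCTP_SCOPE_LINK and a routable global address as SCTP_SCOPE_GLOBAL;
 * see the sctp_v6_scope() conversion later in this patch.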
*/ -typedef enum { +enum sctp_scope { SCTP_SCOPE_GLOBAL, /* IPv4 global addresses */ SCTP_SCOPE_PRIVATE, /* IPv4 private addresses */ SCTP_SCOPE_LINK, /* IPv4 link local address */ SCTP_SCOPE_LOOPBACK, /* IPv4 loopback address */ SCTP_SCOPE_UNUSABLE, /* IPv4 unusable addresses */ -} sctp_scope_t; +}; enum { SCTP_SCOPE_POLICY_DISABLE, /* Disable IPv4 address scoping */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 84650fed1e6a..ca66b033ec38 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -94,8 +94,8 @@ /* * sctp/protocol.c */ -int sctp_copy_local_addr_list(struct net *, struct sctp_bind_addr *, - sctp_scope_t, gfp_t gfp, int flags); +int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *addr, + enum sctp_scope, gfp_t gfp, int flags); struct sctp_pf *sctp_get_pf_specific(sa_family_t family); int sctp_register_pf(struct sctp_pf *, sa_family_t); void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 5e872acee6e3..d771d418481f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -449,7 +449,7 @@ struct sctp_af { int (*addr_valid) (union sctp_addr *, struct sctp_sock *, const struct sk_buff *); - sctp_scope_t (*scope) (union sctp_addr *); + enum sctp_scope (*scope)(union sctp_addr *); void (*inaddr_any) (union sctp_addr *, __be16); int (*is_any) (const union sctp_addr *); int (*available) (union sctp_addr *, @@ -1111,7 +1111,7 @@ void sctp_bind_addr_init(struct sctp_bind_addr *, __u16 port); void sctp_bind_addr_free(struct sctp_bind_addr *); int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, - sctp_scope_t scope, gfp_t gfp, + enum sctp_scope scope, gfp_t gfp, int flags); int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, @@ -1135,8 +1135,9 @@ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw, int len, __u16 port, gfp_t gfp); -sctp_scope_t sctp_scope(const union sctp_addr *); -int sctp_in_scope(struct net *net, const union sctp_addr *addr, const sctp_scope_t scope); +enum sctp_scope sctp_scope(const union sctp_addr *addr); +int sctp_in_scope(struct net *net, const union sctp_addr *addr, + const enum sctp_scope scope); int sctp_is_any(struct sock *sk, const union sctp_addr *addr); int sctp_is_ep_boundall(struct sock *sk); @@ -1925,8 +1926,8 @@ static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base) struct sctp_association * -sctp_association_new(const struct sctp_endpoint *, const struct sock *, - sctp_scope_t scope, gfp_t gfp); +sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, + enum sctp_scope scope, gfp_t gfp); void sctp_association_free(struct sctp_association *); void sctp_association_put(struct sctp_association *); void sctp_association_hold(struct sctp_association *); @@ -1967,8 +1968,8 @@ void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, struct sctp_transport *); -int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *, - sctp_scope_t, gfp_t); +int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, + enum sctp_scope scope, gfp_t gfp); int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *, struct sctp_cookie*, gfp_t gfp); diff --git a/net/sctp/associola.c 
b/net/sctp/associola.c index 40ec83679d6e..4c1f1bb2aaad 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -63,11 +63,11 @@ static void sctp_assoc_free_asconf_queue(struct sctp_association *asoc); /* 1st Level Abstractions. */ /* Initialize a new association from provided memory. */ -static struct sctp_association *sctp_association_init(struct sctp_association *asoc, - const struct sctp_endpoint *ep, - const struct sock *sk, - sctp_scope_t scope, - gfp_t gfp) +static struct sctp_association *sctp_association_init( + struct sctp_association *asoc, + const struct sctp_endpoint *ep, + const struct sock *sk, + enum sctp_scope scope, gfp_t gfp) { struct net *net = sock_net(sk); struct sctp_sock *sp; @@ -301,9 +301,8 @@ fail_init: /* Allocate and initialize a new association */ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, - const struct sock *sk, - sctp_scope_t scope, - gfp_t gfp) + const struct sock *sk, + enum sctp_scope scope, gfp_t gfp) { struct sctp_association *asoc; @@ -1564,7 +1563,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned int len) * local endpoint and the remote peer. */ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, - sctp_scope_t scope, gfp_t gfp) + enum sctp_scope scope, gfp_t gfp) { int flags; diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 1ebc184a0e23..7df3704982f5 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -45,9 +45,9 @@ #include /* Forward declarations for internal helpers. */ -static int sctp_copy_one_addr(struct net *, struct sctp_bind_addr *, - union sctp_addr *, sctp_scope_t scope, gfp_t gfp, - int flags); +static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, + union sctp_addr *addr, enum sctp_scope scope, + gfp_t gfp, int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. */ @@ -57,7 +57,7 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *); */ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, - sctp_scope_t scope, gfp_t gfp, + enum sctp_scope scope, gfp_t gfp, int flags) { struct sctp_sockaddr_entry *addr; @@ -440,9 +440,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, - union sctp_addr *addr, - sctp_scope_t scope, gfp_t gfp, - int flags) + union sctp_addr *addr, enum sctp_scope scope, + gfp_t gfp, int flags) { int error = 0; @@ -485,9 +484,10 @@ int sctp_is_any(struct sock *sk, const union sctp_addr *addr) } /* Is 'addr' valid for 'scope'? */ -int sctp_in_scope(struct net *net, const union sctp_addr *addr, sctp_scope_t scope) +int sctp_in_scope(struct net *net, const union sctp_addr *addr, + enum sctp_scope scope) { - sctp_scope_t addr_scope = sctp_scope(addr); + enum sctp_scope addr_scope = sctp_scope(addr); /* The unusable SCTP addresses will not be considered with * any defined scopes. @@ -545,7 +545,7 @@ int sctp_is_ep_boundall(struct sock *sk) ********************************************************************/ /* What is the scope of 'addr'? 
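 * (the answer drives bind-address selection: sctp_copy_local_addr_list()
 * admits a local address only if sctp_in_scope() accepts it for the
 * peer's scope)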
*/ -sctp_scope_t sctp_scope(const union sctp_addr *addr) +enum sctp_scope sctp_scope(const union sctp_addr *addr) { struct sctp_af *af; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 107d7c912922..a2a1c1d08d51 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -243,8 +243,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; + enum sctp_scope scope; __u8 matchlen = 0; - sctp_scope_t scope; memset(fl6, 0, sizeof(struct flowi6)); fl6->daddr = daddr->v6.sin6_addr; @@ -624,10 +624,10 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, } /* What is the scope of 'addr'? */ -static sctp_scope_t sctp_v6_scope(union sctp_addr *addr) +static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) { + enum sctp_scope retval; int v6scope; - sctp_scope_t retval; /* The IPv6 scope is really a set of bit fields. * See IFA_* in . Map to a generic SCTP scope. diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 852556d67ae3..fcd80feb293f 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -196,7 +196,7 @@ static void sctp_free_local_addr_list(struct net *net) /* Copy the local addresses which are valid for 'scope' into 'bp'. */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, - sctp_scope_t scope, gfp_t gfp, int copy_flags) + enum sctp_scope scope, gfp_t gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; union sctp_addr laddr; @@ -400,9 +400,9 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) * IPv4 scoping can be controlled through sysctl option * net.sctp.addr_scope_policy */ -static sctp_scope_t sctp_v4_scope(union sctp_addr *addr) +static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) { - sctp_scope_t retval; + enum sctp_scope retval; /* Check for unusable SCTP addresses. */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) { diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d17e8d1f2ed9..a034d842e335 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1578,8 +1578,8 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, gfp_t gfp) { struct sctp_association *asoc; + enum sctp_scope scope; struct sk_buff *skb; - sctp_scope_t scope; /* Create the bare association. 
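 * (the scope is taken from the INIT chunk's source address, so the
 * temporary association binds only local addresses usable toward that
 * peer)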
*/ scope = sctp_scope(sctp_source(chunk)); @@ -1701,7 +1701,7 @@ struct sctp_association *sctp_unpack_cookie( int headersize, bodysize, fixed_size; __u8 *digest = ep->digest; unsigned int len; - sctp_scope_t scope; + enum sctp_scope scope; struct sk_buff *skb = chunk->skb; ktime_t kt; @@ -2502,7 +2502,7 @@ static int sctp_process_param(struct sctp_association *asoc, int i; __u16 sat; int retval = 1; - sctp_scope_t scope; + enum sctp_scope scope; u32 stale; struct sctp_af *af; union sctp_addr_param *addr_param; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1db478e34520..a1e2113806dd 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1055,7 +1055,7 @@ static int __sctp_connect(struct sock *sk, struct sctp_association *asoc2; struct sctp_transport *transport; union sctp_addr to; - sctp_scope_t scope; + enum sctp_scope scope; long timeo; int err = 0; int addrcnt = 0; @@ -1610,7 +1610,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) struct sctp_initmsg *sinit; sctp_assoc_t associd = 0; sctp_cmsgs_t cmsgs = { NULL }; - sctp_scope_t scope; + enum sctp_scope scope; bool fill_sinfo_ttl = false, wait_connect = false; struct sctp_datamsg *datamsg; int msg_flags = msg->msg_flags; -- cgit v1.2.3 From 0ceaeebe28d46e49c865f88a3e3ca75f6cf13e1f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:55 +0800 Subject: sctp: remove the typedef sctp_transport_cmd_t This patch removes the typedef sctp_transport_cmd_t and replaces it with enum sctp_transport_cmd everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 4 ++-- include/net/sctp/structs.h | 7 ++++--- net/sctp/associola.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 0503bb70e5d9..8ce6d3263e41 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -320,11 +320,11 @@ typedef enum { } sctp_xmit_t; /* These are the commands for manipulating transports. */ -typedef enum { +enum sctp_transport_cmd { SCTP_TRANSPORT_UP, SCTP_TRANSPORT_DOWN, SCTP_TRANSPORT_PF, -} sctp_transport_cmd_t; +}; /* These are the address scopes defined mainly for IPv4 addresses * based on draft of SCTP IPv4 scoping .
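A minimal sketch of what this conversion means at a call site: the opaque typedef gives way to the spelled-out enum keyword, and nothing else changes for callers. The wrapper function below is hypothetical, and SCTP_HEARTBEAT_SUCCESS is assumed from sctp_sn_error_t rather than shown in these hunks:

	/* Hypothetical caller: mark a peer path usable again. Assumes
	 * <net/sctp/structs.h> for the types and prototype used here.
	 */
	static void example_transport_up(struct sctp_association *asoc,
					 struct sctp_transport *t)
	{
		/* Before this patch: sctp_transport_cmd_t cmd = SCTP_TRANSPORT_UP; */
		enum sctp_transport_cmd cmd = SCTP_TRANSPORT_UP;

		/* Prototype as changed in the structs.h hunk below. */
		sctp_assoc_control_transport(asoc, t, cmd, SCTP_HEARTBEAT_SUCCESS);
	}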
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d771d418481f..d098d1c4ed74 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1947,9 +1947,10 @@ void sctp_assoc_del_peer(struct sctp_association *asoc, const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); -void sctp_assoc_control_transport(struct sctp_association *, - struct sctp_transport *, - sctp_transport_cmd_t, sctp_sn_error_t); +void sctp_assoc_control_transport(struct sctp_association *asoc, + struct sctp_transport *transport, + enum sctp_transport_cmd command, + sctp_sn_error_t error); struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32); struct sctp_transport *sctp_assoc_is_match(struct sctp_association *, struct net *, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 4c1f1bb2aaad..b53efed8ff71 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -796,7 +796,7 @@ void sctp_assoc_del_nonprimary_peers(struct sctp_association *asoc, */ void sctp_assoc_control_transport(struct sctp_association *asoc, struct sctp_transport *transport, - sctp_transport_cmd_t command, + enum sctp_transport_cmd command, sctp_sn_error_t error) { struct sctp_ulpevent *event; -- cgit v1.2.3 From 8496561430df6e2c21b4ed37bd93604d3acaf5d6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:56 +0800 Subject: sctp: remove the typedef sctp_sock_state_t This patch removes the typedef sctp_sock_state_t and replaces it with enum sctp_sock_state everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 6 +++--- include/net/sctp/sctp.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 8ce6d3263e41..049868ea7ae9 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -42,7 +42,7 @@ #include #include /* For ipv6hdr. */ -#include /* For TCP states used in sctp_sock_state_t */ +#include /* For TCP states used in enum sctp_sock_state */ /* Value used for stream negotiation. */ enum { SCTP_MAX_STREAM = 0xffff }; @@ -214,13 +214,13 @@ typedef enum { * - A socket in SCTP_SS_ESTABLISHED state indicates that it has a single * association. */ -typedef enum { +enum sctp_sock_state { SCTP_SS_CLOSED = TCP_CLOSE, SCTP_SS_LISTENING = TCP_LISTEN, SCTP_SS_ESTABLISHING = TCP_SYN_SENT, SCTP_SS_ESTABLISHED = TCP_ESTABLISHED, SCTP_SS_CLOSING = TCP_CLOSE_WAIT, -} sctp_sock_state_t; +}; /* These functions map various type to printable names. */ const char *sctp_cname(const sctp_subtype_t); /* chunk types */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index ca66b033ec38..0022bc713434 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -565,7 +565,8 @@ static inline int __sctp_state(const struct sctp_association *asoc, /* Is the socket in this state?
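* (For example, sctp_sstate(sk, LISTENING) below expands to __sctp_sstate(sk, SCTP_SS_LISTENING), i.e. a check that sk->sk_state == SCTP_SS_LISTENING.)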
*/ #define sctp_sstate(sk, state) __sctp_sstate((sk), (SCTP_SS_##state)) -static inline int __sctp_sstate(const struct sock *sk, sctp_sock_state_t state) +static inline int __sctp_sstate(const struct sock *sk, + enum sctp_sock_state state) { return sk->sk_state == state; } -- cgit v1.2.3 From 86b36f2a9b9ea58dd2100b0e6f1f45a1f67ee95e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:57 +0800 Subject: sctp: remove the typedef sctp_xmit_t This patch removes the typedef sctp_xmit_t and replaces it with enum sctp_xmit everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 4 +-- include/net/sctp/structs.h | 9 ++++--- net/sctp/output.c | 60 ++++++++++++++++++++++---------------------- net/sctp/outqueue.c | 8 +++--- 4 files changed, 41 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 049868ea7ae9..311b2e3773e8 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -312,12 +312,12 @@ enum { SCTP_MAX_GABS = 16 }; /* These return values describe the success or failure of a number of * routines which form the lower interface to SCTP_outqueue. */ -typedef enum { +enum sctp_xmit { SCTP_XMIT_OK, SCTP_XMIT_PMTU_FULL, SCTP_XMIT_RWND_FULL, SCTP_XMIT_DELAY, -} sctp_xmit_t; +}; /* These are the commands for manipulating transports. */ enum sctp_transport_cmd { diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d098d1c4ed74..6fab67e7f1e5 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -697,10 +697,11 @@ struct sctp_packet { void sctp_packet_init(struct sctp_packet *, struct sctp_transport *, __u16 sport, __u16 dport); void sctp_packet_config(struct sctp_packet *, __u32 vtag, int); -sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *, - struct sctp_chunk *, int, gfp_t); -sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *, - struct sctp_chunk *); +enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk, + int one_packet, gfp_t gfp); +enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk); int sctp_packet_transmit(struct sctp_packet *, gfp_t); void sctp_packet_free(struct sctp_packet *); diff --git a/net/sctp/output.c b/net/sctp/output.c index 9d8504985744..4a865cd06d76 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -57,15 +57,15 @@ #include /* Forward declarations for private helpers.
*/ -static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, - struct sctp_chunk *chunk); -static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, - struct sctp_chunk *chunk); +static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk); +static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, - struct sctp_chunk *chunk); -static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, - struct sctp_chunk *chunk, - u16 chunk_len); + struct sctp_chunk *chunk); +static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, + struct sctp_chunk *chunk, + u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { @@ -181,11 +181,11 @@ void sctp_packet_free(struct sctp_packet *packet) * as it can fit in the packet, but any more data that does not fit in this * packet can be sent only after receiving the COOKIE_ACK. */ -sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, - struct sctp_chunk *chunk, - int one_packet, gfp_t gfp) +enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk, + int one_packet, gfp_t gfp) { - sctp_xmit_t retval; + enum sctp_xmit retval; pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); @@ -218,12 +218,12 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, } /* Try to bundle an auth chunk into the packet. */ -static sctp_xmit_t sctp_packet_bundle_auth(struct sctp_packet *pkt, - struct sctp_chunk *chunk) +static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, + struct sctp_chunk *chunk) { struct sctp_association *asoc = pkt->transport->asoc; + enum sctp_xmit retval = SCTP_XMIT_OK; struct sctp_chunk *auth; - sctp_xmit_t retval = SCTP_XMIT_OK; /* if we don't have an association, we can't do authentication */ if (!asoc) @@ -254,10 +254,10 @@ static sctp_xmit_t sctp_packet_bundle_auth(struct sctp_packet *pkt, } /* Try to bundle a SACK with the packet. */ -static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt, - struct sctp_chunk *chunk) +static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, + struct sctp_chunk *chunk) { - sctp_xmit_t retval = SCTP_XMIT_OK; + enum sctp_xmit retval = SCTP_XMIT_OK; /* If sending DATA and haven't aleady bundled a SACK, try to * bundle one in to the packet. @@ -299,11 +299,11 @@ out: /* Append a chunk to the offered packet reporting back any inability to do * so. */ -static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, - struct sctp_chunk *chunk) +static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk) { - sctp_xmit_t retval = SCTP_XMIT_OK; __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); + enum sctp_xmit retval = SCTP_XMIT_OK; /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); @@ -353,10 +353,10 @@ finish: /* Append a chunk to the offered packet reporting back any inability to do * so. 
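* (Illustrative: callers branch on the enum sctp_xmit result; SCTP_XMIT_OK means the chunk was appended, SCTP_XMIT_PMTU_FULL means it will not fit in the current packet, and SCTP_XMIT_RWND_FULL or SCTP_XMIT_DELAY mean transmission must wait.)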
*/ -sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, - struct sctp_chunk *chunk) +enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk) { - sctp_xmit_t retval = SCTP_XMIT_OK; + enum sctp_xmit retval = SCTP_XMIT_OK; pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); @@ -653,8 +653,8 @@ out: ********************************************************************/ /* This private function check to see if a chunk can be added */ -static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, - struct sctp_chunk *chunk) +static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk) { size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; @@ -762,12 +762,12 @@ static void sctp_packet_append_data(struct sctp_packet *packet, sctp_chunk_assign_ssn(chunk); } -static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, - struct sctp_chunk *chunk, - u16 chunk_len) +static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, + struct sctp_chunk *chunk, + u16 chunk_len) { + enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; - sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; if (packet->transport->asoc) diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 08ee0ed9a0c6..2966ff400755 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -594,14 +594,14 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int rtx_timeout, int *start_timer) { - struct list_head *lqueue; struct sctp_transport *transport = pkt->transport; - sctp_xmit_t status; struct sctp_chunk *chunk, *chunk1; - int fast_rtx; + struct list_head *lqueue; + enum sctp_xmit status; int error = 0; int timer = 0; int done = 0; + int fast_rtx; lqueue = &q->retransmit; fast_rtx = q->fast_rtx; @@ -781,7 +781,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) struct sctp_transport *transport = NULL; struct sctp_transport *new_transport; struct sctp_chunk *chunk, *tmp; - sctp_xmit_t status; + enum sctp_xmit status; int error = 0; int start_timer = 0; int one_packet = 0; -- cgit v1.2.3 From 4785c7ae1848244da3435ba5269f6288c15975c7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:58 +0800 Subject: sctp: remove the typedef sctp_ierror_t This patch removes the typedef sctp_ierror_t and replaces it with enum sctp_ierror everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/net/sctp/constants.h | 5 ++--- net/sctp/sm_make_chunk.c | 23 ++++++++++++----------- net/sctp/sm_statefuns.c | 28 +++++++++++++++------------- 3 files changed, 29 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 311b2e3773e8..9a694653b708 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -155,8 +155,7 @@ SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE, sctp_event_primitive_t, primitive) - sizeof(struct sctp_data_chunk))) /* Internal error codes */ -typedef enum { - +enum sctp_ierror { SCTP_IERROR_NO_ERROR = 0, SCTP_IERROR_BASE = 1000, SCTP_IERROR_NO_COOKIE, @@ -177,7 +176,7 @@ typedef enum { SCTP_IERROR_PROTO_VIOLATION, SCTP_IERROR_ERROR, SCTP_IERROR_ABORT, -} sctp_ierror_t; +}; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index a034d842e335..3a8fb1dffbc1 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2065,10 +2065,11 @@ static void sctp_process_ext_param(struct sctp_association *asoc, * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. */ -static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, - union sctp_params param, - struct sctp_chunk *chunk, - struct sctp_chunk **errp) +static enum sctp_ierror sctp_process_unk_param( + const struct sctp_association *asoc, + union sctp_params param, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; @@ -2117,13 +2118,13 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ -static sctp_ierror_t sctp_verify_param(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - union sctp_params param, - enum sctp_cid cid, - struct sctp_chunk *chunk, - struct sctp_chunk **err_chunk) +static enum sctp_ierror sctp_verify_param(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + union sctp_params param, + enum sctp_cid cid, + struct sctp_chunk *chunk, + struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8af90a5f23cd..5381697333df 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -144,11 +144,12 @@ static sctp_disposition_t sctp_sf_violation_chunk( void *arg, sctp_cmd_seq_t *commands); -static sctp_ierror_t sctp_sf_authenticate(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const sctp_subtype_t type, - struct sctp_chunk *chunk); +static enum sctp_ierror sctp_sf_authenticate( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + struct sctp_chunk *chunk); static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, const struct sctp_endpoint *ep, @@ -756,7 +757,7 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, */ if (chunk->auth_chunk) { struct sctp_chunk auth; - sctp_ierror_t ret; + enum sctp_ierror ret; /* Make sure that we and the peer are AUTH capable */ if (!net->sctp.auth_enable || !new_asoc->peer.auth_capable) { @@ -4077,11 +4078,12 @@ gen_shutdown: * * The return value is the disposition of the chunk. 
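* (Assumed mapping, for illustration only: SCTP_IERROR_NO_ERROR when the HMAC verifies, SCTP_IERROR_BAD_SIG when it does not, and SCTP_IERROR_NOMEM on allocation failure; only part of the enum is visible in these hunks.)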
*/ -static sctp_ierror_t sctp_sf_authenticate(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const sctp_subtype_t type, - struct sctp_chunk *chunk) +static enum sctp_ierror sctp_sf_authenticate( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + struct sctp_chunk *chunk) { struct sctp_authhdr *auth_hdr; struct sctp_hmac *hmac; @@ -4159,10 +4161,10 @@ sctp_disposition_t sctp_sf_eat_auth(struct net *net, void *arg, sctp_cmd_seq_t *commands) { - struct sctp_authhdr *auth_hdr; struct sctp_chunk *chunk = arg; + struct sctp_authhdr *auth_hdr; struct sctp_chunk *err_chunk; - sctp_ierror_t error; + enum sctp_ierror error; /* Make sure that the peer has AUTH capable */ if (!asoc->peer.auth_capable) -- cgit v1.2.3 From 5210601945f5aedaf2d7f13a88436e27a39c6a8a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 19:59:59 +0800 Subject: sctp: remove the typedef sctp_state_t This patch removes the typedef sctp_state_t and replaces it with enum sctp_state everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/command.h | 4 ++-- include/net/sctp/constants.h | 4 ++-- include/net/sctp/sctp.h | 2 +- include/net/sctp/sm.h | 10 +++++----- include/net/sctp/structs.h | 2 +- net/sctp/endpointola.c | 2 +- net/sctp/primitive.c | 2 +- net/sctp/sm_sideeffect.c | 12 ++++++------ net/sctp/sm_statetable.c | 16 +++++++++------- 9 files changed, 28 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 1d5f6ff3f440..be12ec946628 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -126,7 +126,7 @@ typedef union { __u8 u8; int error; __be16 err; - sctp_state_t state; + enum sctp_state state; sctp_event_timeout_t to; struct sctp_chunk *chunk; struct sctp_association *asoc; @@ -167,7 +167,7 @@ SCTP_ARG_CONSTRUCTOR(U16, __u16, u16) SCTP_ARG_CONSTRUCTOR(U8, __u8, u8) SCTP_ARG_CONSTRUCTOR(ERROR, int, error) SCTP_ARG_CONSTRUCTOR(PERR, __be16, err) /* protocol error */ -SCTP_ARG_CONSTRUCTOR(STATE, sctp_state_t, state) +SCTP_ARG_CONSTRUCTOR(STATE, enum sctp_state, state) SCTP_ARG_CONSTRUCTOR(TO, sctp_event_timeout_t, to) SCTP_ARG_CONSTRUCTOR(CHUNK, struct sctp_chunk *, chunk) SCTP_ARG_CONSTRUCTOR(ASOC, struct sctp_association *, asoc) diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 9a694653b708..db9f40b657ba 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -181,7 +181,7 @@ enum sctp_ierror { /* SCTP state defines for internal state machine */ -typedef enum { +enum sctp_state { SCTP_STATE_CLOSED = 0, SCTP_STATE_COOKIE_WAIT = 1, @@ -192,7 +192,7 @@ typedef enum { SCTP_STATE_SHUTDOWN_RECEIVED = 6, SCTP_STATE_SHUTDOWN_ACK_SENT = 7, -} sctp_state_t; +}; #define SCTP_STATE_MAX SCTP_STATE_SHUTDOWN_ACK_SENT #define SCTP_STATE_NUM_STATES (SCTP_STATE_MAX + 1) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 0022bc713434..24ff7931d38c 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -558,7 +558,7 @@ static inline int __sctp_style(const struct sock *sk, sctp_socket_type_t style) /* Is the association in this state?
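* (For example, sctp_state(asoc, ESTABLISHED) below expands to __sctp_state(asoc, SCTP_STATE_ESTABLISHED).)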
*/ #define sctp_state(asoc, state) __sctp_state((asoc), (SCTP_STATE_##state)) static inline int __sctp_state(const struct sctp_association *asoc, - sctp_state_t state) + enum sctp_state state) { return asoc->state == state; } diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 860f378333b5..281e8d1e0752 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -175,10 +175,10 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ __u8 sctp_get_chunk_type(struct sctp_chunk *chunk); -const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *, - sctp_event_t, - sctp_state_t, - sctp_subtype_t); +const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net, + sctp_event_t event_type, + enum sctp_state state, + sctp_subtype_t event_subtype); int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, @@ -313,7 +313,7 @@ struct sctp_chunk *sctp_process_strreset_resp( /* Prototypes for statetable processing. */ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, - sctp_state_t state, + enum sctp_state state, struct sctp_endpoint *, struct sctp_association *asoc, void *event_arg, diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6fab67e7f1e5..fbe6e81b889b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1577,7 +1577,7 @@ struct sctp_association { * * State takes values from SCTP_STATE_*. */ - sctp_state_t state; + enum sctp_state state; /* Overall : The overall association error count. * Error Count : [Clear this any time I get something.] diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 3d506b2f6193..4111c00a9d9d 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -383,7 +383,7 @@ static void sctp_endpoint_bh_rcv(struct work_struct *work) struct sctp_chunk *chunk; struct sctp_inq *inqueue; sctp_subtype_t subtype; - sctp_state_t state; + enum sctp_state state; int error = 0; int first_time = 1; /* is this the first time through the loop */ diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index f0553a022859..1fb5b9bb3c6d 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -54,7 +54,7 @@ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ sctp_event_t event_type; sctp_subtype_t subtype; \ - sctp_state_t state; \ + enum sctp_state state; \ struct sctp_endpoint *ep; \ \ event_type = SCTP_EVENT_T_PRIMITIVE; \ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 5dda8c42b5f6..b77a81aa907b 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -53,7 +53,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_subtype_t subtype, - sctp_state_t state, + enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, @@ -61,7 +61,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_cmd_seq_t *commands, gfp_t gfp); static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, - sctp_state_t state, + enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, @@ -843,7 +843,7 @@ static void sctp_cmd_assoc_update(sctp_cmd_seq_t *cmds, /* Helper function to change the state of an association. 
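* (For example, moving an association into SCTP_STATE_ESTABLISHED once the cookie exchange completes; the actual call sites lie outside this hunk.)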
*/ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, struct sctp_association *asoc, - sctp_state_t state) + enum sctp_state state) { struct sock *sk = asoc->base.sk; @@ -1140,7 +1140,7 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc) * good place to start. */ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, - sctp_state_t state, + enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, @@ -1179,7 +1179,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, * This the master state function side effect processing function. *****************************************************************/ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, - sctp_state_t state, + enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, @@ -1265,7 +1265,7 @@ bail: /* This is the side-effect interpreter. */ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_subtype_t subtype, - sctp_state_t state, + enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 3e958c1c4b95..f46c11bffea7 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -52,9 +52,10 @@ other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; -static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net, - enum sctp_cid cid, - sctp_state_t state); +static const sctp_sm_table_entry_t *sctp_chunk_event_lookup( + struct net *net, + enum sctp_cid cid, + enum sctp_state state); static const sctp_sm_table_entry_t bug = { @@ -78,7 +79,7 @@ static const sctp_sm_table_entry_t bug = { const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net, sctp_event_t event_type, - sctp_state_t state, + enum sctp_state state, sctp_subtype_t event_subtype) { switch (event_type) { @@ -967,9 +968,10 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; -static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(struct net *net, - enum sctp_cid cid, - sctp_state_t state) +static const sctp_sm_table_entry_t *sctp_chunk_event_lookup( + struct net *net, + enum sctp_cid cid, + enum sctp_state state) { if (state > SCTP_STATE_MAX) return &bug; -- cgit v1.2.3 From dc1e0e6eb8b2e2c6b5c6cdfa4f0cc789e9516de5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 20:00:00 +0800 Subject: sctp: remove the typedef sctp_event_primitive_t This patch removes the typedef sctp_event_primitive_t and replaces it with enum sctp_event_primitive everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index db9f40b657ba..162ee954a6af 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -110,7 +110,7 @@ typedef enum { #define SCTP_NUM_OTHER_TYPES (SCTP_EVENT_OTHER_MAX + 1) /* These are primitive requests from the ULP.
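* (Illustrative: a connect() from user space ultimately issues SCTP_PRIMITIVE_ASSOCIATE and close() issues SCTP_PRIMITIVE_SHUTDOWN, via the sctp_primitive_*() wrappers generated in net/sctp/primitive.c.)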
*/ -typedef enum { +enum sctp_event_primitive { SCTP_PRIMITIVE_ASSOCIATE = 0, SCTP_PRIMITIVE_SHUTDOWN, SCTP_PRIMITIVE_ABORT, @@ -118,7 +118,7 @@ typedef enum { SCTP_PRIMITIVE_REQUESTHEARTBEAT, SCTP_PRIMITIVE_ASCONF, SCTP_PRIMITIVE_RECONF, -} sctp_event_primitive_t; +}; #define SCTP_EVENT_PRIMITIVE_MAX SCTP_PRIMITIVE_RECONF #define SCTP_NUM_PRIMITIVE_TYPES (SCTP_EVENT_PRIMITIVE_MAX + 1) @@ -133,7 +133,7 @@ typedef union { enum sctp_cid chunk; sctp_event_timeout_t timeout; sctp_event_other_t other; - sctp_event_primitive_t primitive; + enum sctp_event_primitive primitive; } sctp_subtype_t; #define SCTP_SUBTYPE_CONSTRUCTOR(_name, _type, _elt) \ @@ -144,7 +144,7 @@ SCTP_ST_## _name (_type _arg) \ SCTP_SUBTYPE_CONSTRUCTOR(CHUNK, enum sctp_cid, chunk) SCTP_SUBTYPE_CONSTRUCTOR(TIMEOUT, sctp_event_timeout_t, timeout) SCTP_SUBTYPE_CONSTRUCTOR(OTHER, sctp_event_other_t, other) -SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE, sctp_event_primitive_t, primitive) +SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE, enum sctp_event_primitive, primitive) #define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA) -- cgit v1.2.3 From a0f098d0385a40f9c4c8a2ce48d075f77ea7edd8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 20:00:01 +0800 Subject: sctp: remove the typedef sctp_event_other_t This patch removes the typedef sctp_event_other_t and replaces it with enum sctp_event_other everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 162ee954a6af..fd8a80ec573f 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -101,10 +101,10 @@ typedef enum { #define SCTP_EVENT_TIMEOUT_MAX SCTP_EVENT_TIMEOUT_AUTOCLOSE #define SCTP_NUM_TIMEOUT_TYPES (SCTP_EVENT_TIMEOUT_MAX + 1) -typedef enum { +enum sctp_event_other { SCTP_EVENT_NO_PENDING_TSN = 0, SCTP_EVENT_ICMP_PROTO_UNREACH, -} sctp_event_other_t; +}; #define SCTP_EVENT_OTHER_MAX SCTP_EVENT_ICMP_PROTO_UNREACH #define SCTP_NUM_OTHER_TYPES (SCTP_EVENT_OTHER_MAX + 1) @@ -132,7 +132,7 @@ enum sctp_event_primitive { typedef union { enum sctp_cid chunk; sctp_event_timeout_t timeout; - sctp_event_other_t other; + enum sctp_event_other other; enum sctp_event_primitive primitive; } sctp_subtype_t; @@ -143,7 +143,7 @@ SCTP_ST_## _name (_type _arg) \ SCTP_SUBTYPE_CONSTRUCTOR(CHUNK, enum sctp_cid, chunk) SCTP_SUBTYPE_CONSTRUCTOR(TIMEOUT, sctp_event_timeout_t, timeout) -SCTP_SUBTYPE_CONSTRUCTOR(OTHER, sctp_event_other_t, other) +SCTP_SUBTYPE_CONSTRUCTOR(OTHER, enum sctp_event_other, other) SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE, enum sctp_event_primitive, primitive) -- cgit v1.2.3 From 19cd1592a24754e16d48398812d5f69b63f674dd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 20:00:02 +0800 Subject: sctp: remove the typedef sctp_event_timeout_t This patch removes the typedef sctp_event_timeout_t and replaces it with enum sctp_event_timeout everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/net/sctp/command.h | 4 ++-- include/net/sctp/constants.h | 8 ++++---- net/sctp/sm_sideeffect.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index be12ec946628..376cb78b6247 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -127,7 +127,7 @@ typedef union { int error; __be16 err; enum sctp_state state; - sctp_event_timeout_t to; + enum sctp_event_timeout to; struct sctp_chunk *chunk; struct sctp_association *asoc; struct sctp_transport *transport; @@ -168,7 +168,7 @@ SCTP_ARG_CONSTRUCTOR(U8, __u8, u8) SCTP_ARG_CONSTRUCTOR(ERROR, int, error) SCTP_ARG_CONSTRUCTOR(PERR, __be16, err) /* protocol error */ SCTP_ARG_CONSTRUCTOR(STATE, enum sctp_state, state) -SCTP_ARG_CONSTRUCTOR(TO, sctp_event_timeout_t, to) +SCTP_ARG_CONSTRUCTOR(TO, enum sctp_event_timeout, to) SCTP_ARG_CONSTRUCTOR(CHUNK, struct sctp_chunk *, chunk) SCTP_ARG_CONSTRUCTOR(ASOC, struct sctp_association *, asoc) SCTP_ARG_CONSTRUCTOR(TRANSPORT, struct sctp_transport *, transport) diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index fd8a80ec573f..fb931f07e83a 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -84,7 +84,7 @@ typedef enum { * SCTP_ULP_* to the list of possible chunks. */ -typedef enum { +enum sctp_event_timeout { SCTP_EVENT_TIMEOUT_NONE = 0, SCTP_EVENT_TIMEOUT_T1_COOKIE, SCTP_EVENT_TIMEOUT_T1_INIT, @@ -96,7 +96,7 @@ typedef enum { SCTP_EVENT_TIMEOUT_RECONF, SCTP_EVENT_TIMEOUT_SACK, SCTP_EVENT_TIMEOUT_AUTOCLOSE, -} sctp_event_timeout_t; +}; #define SCTP_EVENT_TIMEOUT_MAX SCTP_EVENT_TIMEOUT_AUTOCLOSE #define SCTP_NUM_TIMEOUT_TYPES (SCTP_EVENT_TIMEOUT_MAX + 1) @@ -131,7 +131,7 @@ enum sctp_event_primitive { typedef union { enum sctp_cid chunk; - sctp_event_timeout_t timeout; + enum sctp_event_timeout timeout; enum sctp_event_other other; enum sctp_event_primitive primitive; } sctp_subtype_t; @@ -142,7 +142,7 @@ SCTP_ST_## _name (_type _arg) \ { sctp_subtype_t _retval; _retval._elt = _arg; return _retval; } SCTP_SUBTYPE_CONSTRUCTOR(CHUNK, enum sctp_cid, chunk) -SCTP_SUBTYPE_CONSTRUCTOR(TIMEOUT, sctp_event_timeout_t, timeout) +SCTP_SUBTYPE_CONSTRUCTOR(TIMEOUT, enum sctp_event_timeout, timeout) SCTP_SUBTYPE_CONSTRUCTOR(OTHER, enum sctp_event_other, other) SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE, enum sctp_event_primitive, primitive) diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b77a81aa907b..11a344896b71 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -280,7 +280,7 @@ out_unlock: * for timeouts which use the association as their parameter. */ static void sctp_generate_timeout_event(struct sctp_association *asoc, - sctp_event_timeout_t timeout_type) + enum sctp_event_timeout timeout_type) { struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -1052,8 +1052,8 @@ static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands, static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, - sctp_event_timeout_t timer, - char *name) + enum sctp_event_timeout timer, + char *name) { struct sctp_transport *t; -- cgit v1.2.3 From 61f0eb072294a148f707335d4d7f858b2af73770 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 20:00:03 +0800 Subject: sctp: remove the typedef sctp_event_t This patch removes the typedef sctp_event_t and replaces it with enum sctp_event everywhere the typedef was used. Signed-off-by: Xin Long Signed-off-by: David S.
Miller --- include/net/sctp/constants.h | 6 ++---- include/net/sctp/sm.h | 12 +++++------- net/sctp/primitive.c | 2 +- net/sctp/sm_sideeffect.c | 20 +++++++++----------- net/sctp/sm_statetable.c | 2 +- 5 files changed, 18 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index fb931f07e83a..3181e0f95b60 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -71,14 +71,12 @@ enum { SCTP_DEFAULT_INSTREAMS = SCTP_MAX_STREAM }; SCTP_NUM_AUTH_CHUNK_TYPES) /* These are the different flavours of event. */ -typedef enum { - +enum sctp_event { SCTP_EVENT_T_CHUNK = 1, SCTP_EVENT_T_TIMEOUT, SCTP_EVENT_T_OTHER, SCTP_EVENT_T_PRIMITIVE - -} sctp_event_t; +}; /* As a convenience for the state machine, we append SCTP_EVENT_* and * SCTP_ULP_* to the list of possible chunks. diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 281e8d1e0752..96f54cff7964 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -176,7 +176,7 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ __u8 sctp_get_chunk_type(struct sctp_chunk *chunk); const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net, - sctp_event_t event_type, + enum sctp_event event_type, enum sctp_state state, sctp_subtype_t event_subtype); int sctp_chunk_iif(const struct sctp_chunk *); @@ -312,12 +312,10 @@ struct sctp_chunk *sctp_process_strreset_resp( /* Prototypes for statetable processing. */ -int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, - enum sctp_state state, - struct sctp_endpoint *, - struct sctp_association *asoc, - void *event_arg, - gfp_t gfp); +int sctp_do_sm(struct net *net, enum sctp_event event_type, + sctp_subtype_t subtype, enum sctp_state state, + struct sctp_endpoint *ep, struct sctp_association *asoc, + void *event_arg, gfp_t gfp); /* 2nd level prototypes */ void sctp_generate_t3_rtx_event(unsigned long peer); diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index 1fb5b9bb3c6d..c914166984b3 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -53,7 +53,7 @@ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ - sctp_event_t event_type; sctp_subtype_t subtype; \ + enum sctp_event event_type; sctp_subtype_t subtype; \ enum sctp_state state; \ struct sctp_endpoint *ep; \ \ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 11a344896b71..b545c768cb9e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -51,7 +51,7 @@ #include #include -static int sctp_cmd_interpreter(sctp_event_t event_type, +static int sctp_cmd_interpreter(enum sctp_event event_type, sctp_subtype_t subtype, enum sctp_state state, struct sctp_endpoint *ep, @@ -60,7 +60,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_disposition_t status, sctp_cmd_seq_t *commands, gfp_t gfp); -static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, +static int sctp_side_effects(enum sctp_event event_type, sctp_subtype_t subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, @@ -602,7 +602,7 @@ static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. 
*/ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, struct sctp_association *asoc, - sctp_event_t event_type, + enum sctp_event event_type, sctp_subtype_t subtype, struct sctp_chunk *chunk, unsigned int error) @@ -1139,12 +1139,10 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc) * If you want to understand all of lksctp, this is a * good place to start. */ -int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, - enum sctp_state state, - struct sctp_endpoint *ep, - struct sctp_association *asoc, - void *event_arg, - gfp_t gfp) +int sctp_do_sm(struct net *net, enum sctp_event event_type, + sctp_subtype_t subtype, enum sctp_state state, + struct sctp_endpoint *ep, struct sctp_association *asoc, + void *event_arg, gfp_t gfp) { sctp_cmd_seq_t commands; const sctp_sm_table_entry_t *state_fn; @@ -1178,7 +1176,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype, /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ -static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, +static int sctp_side_effects(enum sctp_event event_type, sctp_subtype_t subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, @@ -1263,7 +1261,7 @@ bail: ********************************************************************/ /* This is the side-effect interpreter. */ -static int sctp_cmd_interpreter(sctp_event_t event_type, +static int sctp_cmd_interpreter(enum sctp_event event_type, sctp_subtype_t subtype, enum sctp_state state, struct sctp_endpoint *ep, diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index f46c11bffea7..f7cdb7014244 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -78,7 +78,7 @@ static const sctp_sm_table_entry_t bug = { }) const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net, - sctp_event_t event_type, + enum sctp_event event_type, enum sctp_state state, sctp_subtype_t event_subtype) { -- cgit v1.2.3 From bfc6f8270fefb323662d1d7713f940149f27b7f1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 5 Aug 2017 20:00:04 +0800 Subject: sctp: remove the typedef sctp_subtype_t This patch removes the typedef sctp_subtype_t and replaces it with union sctp_subtype everywhere the typedef was used. Note that it deliberately leaves many mis-indented lines alone: removing sctp_disposition_t would disturb that indentation again, so those lines are better fixed in the later patch that removes sctp_disposition_t. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 18 ++-- include/net/sctp/sm.h | 13 +-- net/sctp/associola.c | 2 +- net/sctp/debug.c | 8 +- net/sctp/endpointola.c | 2 +- net/sctp/primitive.c | 2 +- net/sctp/probe.c | 2 +- net/sctp/sm_sideeffect.c | 16 ++-- net/sctp/sm_statefuns.c | 191 ++++++++++++++++++++++--------------------- net/sctp/sm_statetable.c | 9 +- 10 files changed, 134 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 3181e0f95b60..deaafa9b09cb 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -124,20 +124,20 @@ enum sctp_event_primitive { /* We define here a utility type for manipulating subtypes.
* The subtype constructors all work like this: * - * sctp_subtype_t foo = SCTP_ST_CHUNK(SCTP_CID_INIT); + * union sctp_subtype foo = SCTP_ST_CHUNK(SCTP_CID_INIT); */ -typedef union { +union sctp_subtype { enum sctp_cid chunk; enum sctp_event_timeout timeout; enum sctp_event_other other; enum sctp_event_primitive primitive; -} sctp_subtype_t; +}; #define SCTP_SUBTYPE_CONSTRUCTOR(_name, _type, _elt) \ -static inline sctp_subtype_t \ +static inline union sctp_subtype \ SCTP_ST_## _name (_type _arg) \ -{ sctp_subtype_t _retval; _retval._elt = _arg; return _retval; } +{ union sctp_subtype _retval; _retval._elt = _arg; return _retval; } SCTP_SUBTYPE_CONSTRUCTOR(CHUNK, enum sctp_cid, chunk) SCTP_SUBTYPE_CONSTRUCTOR(TIMEOUT, enum sctp_event_timeout, timeout) @@ -220,10 +220,10 @@ enum sctp_sock_state { }; /* These functions map various type to printable names. */ -const char *sctp_cname(const sctp_subtype_t); /* chunk types */ -const char *sctp_oname(const sctp_subtype_t); /* other events */ -const char *sctp_tname(const sctp_subtype_t); /* timeouts */ -const char *sctp_pname(const sctp_subtype_t); /* primitives */ +const char *sctp_cname(const union sctp_subtype id); /* chunk types */ +const char *sctp_oname(const union sctp_subtype id); /* other events */ +const char *sctp_tname(const union sctp_subtype id); /* timeouts */ +const char *sctp_pname(const union sctp_subtype id); /* primitives */ /* This is a table of printable names of sctp_state_t's. */ extern const char *const sctp_state_tbl[]; diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 96f54cff7964..1e7651c3b158 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -73,7 +73,7 @@ typedef struct { typedef sctp_disposition_t (sctp_state_fn_t) (struct net *, const struct sctp_endpoint *, const struct sctp_association *, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *); typedef void (sctp_timer_event_t) (unsigned long); @@ -175,10 +175,11 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. */ __u8 sctp_get_chunk_type(struct sctp_chunk *chunk); -const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net, - enum sctp_event event_type, - enum sctp_state state, - sctp_subtype_t event_subtype); +const sctp_sm_table_entry_t *sctp_sm_lookup_event( + struct net *net, + enum sctp_event event_type, + enum sctp_state state, + union sctp_subtype event_subtype); int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, @@ -313,7 +314,7 @@ struct sctp_chunk *sctp_process_strreset_resp( /* Prototypes for statetable processing. */ int sctp_do_sm(struct net *net, enum sctp_event event_type, - sctp_subtype_t subtype, enum sctp_state state, + union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index b53efed8ff71..dfb9651e818b 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1021,11 +1021,11 @@ static void sctp_assoc_bh_rcv(struct work_struct *work) container_of(work, struct sctp_association, base.inqueue.immediate); struct net *net = sock_net(asoc->base.sk); + union sctp_subtype subtype; struct sctp_endpoint *ep; struct sctp_chunk *chunk; struct sctp_inq *inqueue; int state; - sctp_subtype_t subtype; int error = 0; /* The association should be held so we should be safe. 
*/ diff --git a/net/sctp/debug.c b/net/sctp/debug.c index 2e47eb2f05cb..3f619fdcbf0a 100644 --- a/net/sctp/debug.c +++ b/net/sctp/debug.c @@ -60,7 +60,7 @@ static const char *const sctp_cid_tbl[SCTP_NUM_BASE_CHUNK_TYPES] = { }; /* Lookup "chunk type" debug name. */ -const char *sctp_cname(const sctp_subtype_t cid) +const char *sctp_cname(const union sctp_subtype cid) { if (cid.chunk <= SCTP_CID_BASE_MAX) return sctp_cid_tbl[cid.chunk]; @@ -130,7 +130,7 @@ static const char *const sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = { }; /* Lookup primitive debug name. */ -const char *sctp_pname(const sctp_subtype_t id) +const char *sctp_pname(const union sctp_subtype id) { if (id.primitive <= SCTP_EVENT_PRIMITIVE_MAX) return sctp_primitive_tbl[id.primitive]; @@ -143,7 +143,7 @@ static const char *const sctp_other_tbl[] = { }; /* Lookup "other" debug name. */ -const char *sctp_oname(const sctp_subtype_t id) +const char *sctp_oname(const union sctp_subtype id) { if (id.other <= SCTP_EVENT_OTHER_MAX) return sctp_other_tbl[id.other]; @@ -165,7 +165,7 @@ static const char *const sctp_timer_tbl[] = { }; /* Lookup timer debug name. */ -const char *sctp_tname(const sctp_subtype_t id) +const char *sctp_tname(const union sctp_subtype id) { BUILD_BUG_ON(SCTP_EVENT_TIMEOUT_MAX + 1 != ARRAY_SIZE(sctp_timer_tbl)); diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 4111c00a9d9d..ee1e601a0b11 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -382,7 +382,7 @@ static void sctp_endpoint_bh_rcv(struct work_struct *work) struct sctp_transport *transport; struct sctp_chunk *chunk; struct sctp_inq *inqueue; - sctp_subtype_t subtype; + union sctp_subtype subtype; enum sctp_state state; int error = 0; int first_time = 1; /* is this the first time through the loop */ diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c index c914166984b3..c0817f7a8964 100644 --- a/net/sctp/primitive.c +++ b/net/sctp/primitive.c @@ -53,7 +53,7 @@ int sctp_primitive_ ## name(struct net *net, struct sctp_association *asoc, \ void *arg) { \ int error = 0; \ - enum sctp_event event_type; sctp_subtype_t subtype; \ + enum sctp_event event_type; union sctp_subtype subtype; \ enum sctp_state state; \ struct sctp_endpoint *ep; \ \ diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 6cc2152e0740..43837dfc86a7 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -130,7 +130,7 @@ static const struct file_operations sctpprobe_fops = { static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b545c768cb9e..4a12d29d9aa1 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -52,7 +52,7 @@ #include static int sctp_cmd_interpreter(enum sctp_event event_type, - sctp_subtype_t subtype, + union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, @@ -60,7 +60,8 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, sctp_disposition_t status, sctp_cmd_seq_t *commands, gfp_t gfp); -static int sctp_side_effects(enum sctp_event event_type, sctp_subtype_t subtype, +static int sctp_side_effects(enum sctp_event event_type, + union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, @@ -603,7 +604,7 @@ static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, 
static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, struct sctp_association *asoc, enum sctp_event event_type, - sctp_subtype_t subtype, + union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) { @@ -1140,7 +1141,7 @@ static void sctp_cmd_send_asconf(struct sctp_association *asoc) * good place to start. */ int sctp_do_sm(struct net *net, enum sctp_event event_type, - sctp_subtype_t subtype, enum sctp_state state, + union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) { @@ -1148,7 +1149,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, const sctp_sm_table_entry_t *state_fn; sctp_disposition_t status; int error = 0; - typedef const char *(printfn_t)(sctp_subtype_t); + typedef const char *(printfn_t)(union sctp_subtype); static printfn_t *table[] = { NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, }; @@ -1176,7 +1177,8 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ -static int sctp_side_effects(enum sctp_event event_type, sctp_subtype_t subtype, +static int sctp_side_effects(enum sctp_event event_type, + union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, @@ -1262,7 +1264,7 @@ bail: /* This is the side-effect interpreter. */ static int sctp_cmd_interpreter(enum sctp_event event_type, - sctp_subtype_t subtype, + union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5381697333df..ac6aaa046529 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -80,19 +80,19 @@ static void sctp_send_stale_cookie_err(struct net *net, static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); @@ -116,7 +116,7 @@ static sctp_disposition_t sctp_sf_violation_chunklen( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); @@ -124,7 +124,7 @@ static sctp_disposition_t sctp_sf_violation_paramlen( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, void *ext, sctp_cmd_seq_t *commands); @@ -132,7 +132,7 @@ static sctp_disposition_t sctp_sf_violation_ctsn( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); @@ -140,7 +140,7 
@@ static sctp_disposition_t sctp_sf_violation_chunk( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); @@ -148,13 +148,13 @@ static enum sctp_ierror sctp_sf_authenticate( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, struct sctp_chunk *chunk); static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands); @@ -217,7 +217,7 @@ sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length) sctp_disposition_t sctp_sf_do_4_C(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -303,7 +303,7 @@ sctp_disposition_t sctp_sf_do_4_C(struct net *net, sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -497,7 +497,7 @@ nomem: sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -647,7 +647,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, void *arg, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; @@ -874,7 +874,7 @@ nomem: sctp_disposition_t sctp_sf_do_5_1E_ca(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, void *arg, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; @@ -951,7 +951,7 @@ nomem: /* Generate and sendout a heartbeat packet. 
*/ static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -977,7 +977,7 @@ static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -1025,7 +1025,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net, sctp_disposition_t sctp_sf_send_reconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, void *arg, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_transport *transport = arg; @@ -1076,7 +1076,7 @@ sctp_disposition_t sctp_sf_send_reconf(struct net *net, sctp_disposition_t sctp_sf_beat_8_3(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -1151,7 +1151,7 @@ nomem: sctp_disposition_t sctp_sf_backbeat_8_3(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -1416,7 +1416,7 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg, *repl, *err_chunk; @@ -1627,7 +1627,7 @@ cleanup: sctp_disposition_t sctp_sf_do_5_2_1_siminit(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -1681,7 +1681,7 @@ sctp_disposition_t sctp_sf_do_5_2_1_siminit(struct net *net, sctp_disposition_t sctp_sf_do_5_2_2_dupinit(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -1704,7 +1704,7 @@ sctp_disposition_t sctp_sf_do_5_2_2_dupinit(struct net *net, sctp_disposition_t sctp_sf_do_5_2_3_initack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { /* Per the above section, we'll discard the chunk if we have an @@ -2027,7 +2027,7 @@ nomem: sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2146,7 +2146,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2188,7 +2188,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ 
-2239,7 +2239,7 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2266,7 +2266,7 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2330,7 +2330,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2452,7 +2452,7 @@ nomem: sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2489,7 +2489,7 @@ sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2527,7 +2527,7 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2566,7 +2566,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2581,7 +2581,7 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(struct net *net, sctp_disposition_t sctp_sf_cookie_echoed_abort(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2653,7 +2653,7 @@ static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net, sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2742,7 +2742,7 @@ out: sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2795,7 +2795,7 @@ sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net, sctp_disposition_t sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2859,7 +2859,7 @@ nomem: sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t 
*commands) { @@ -2915,7 +2915,7 @@ sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, sctp_disposition_t sctp_sf_do_ecne(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -2972,7 +2972,7 @@ sctp_disposition_t sctp_sf_do_ecne(struct net *net, sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3092,7 +3092,7 @@ discard_noforce: sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3183,7 +3183,7 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3257,7 +3257,7 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3307,7 +3307,7 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, sctp_disposition_t sctp_sf_operr_notify(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3345,7 +3345,7 @@ sctp_disposition_t sctp_sf_operr_notify(struct net *net, sctp_disposition_t sctp_sf_do_9_2_final(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3428,7 +3428,7 @@ nomem: sctp_disposition_t sctp_sf_ootb(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3521,7 +3521,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3583,7 +3583,7 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, sctp_disposition_t sctp_sf_do_8_5_1_E_sa(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3608,7 +3608,7 @@ sctp_disposition_t sctp_sf_do_8_5_1_E_sa(struct net *net, sctp_disposition_t sctp_sf_do_asconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, void *arg, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; @@ -3725,7 +3725,8 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const 
sctp_subtype_t type, void *arg, + const union sctp_subtype type, + void *arg, sctp_cmd_seq_t *commands) { struct sctp_chunk *asconf_ack = arg; @@ -3843,7 +3844,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, sctp_disposition_t sctp_sf_do_reconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, void *arg, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_paramhdr *err_param = NULL; @@ -3919,7 +3920,7 @@ sctp_disposition_t sctp_sf_do_reconf(struct net *net, sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -3990,7 +3991,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4082,7 +4083,7 @@ static enum sctp_ierror sctp_sf_authenticate( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, struct sctp_chunk *chunk) { struct sctp_authhdr *auth_hdr; @@ -4157,7 +4158,7 @@ nomem: sctp_disposition_t sctp_sf_eat_auth(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4254,7 +4255,7 @@ sctp_disposition_t sctp_sf_eat_auth(struct net *net, sctp_disposition_t sctp_sf_unk_chunk(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4334,7 +4335,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, sctp_disposition_t sctp_sf_discard_chunk(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4374,7 +4375,7 @@ sctp_disposition_t sctp_sf_discard_chunk(struct net *net, sctp_disposition_t sctp_sf_pdiscard(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4402,7 +4403,7 @@ sctp_disposition_t sctp_sf_pdiscard(struct net *net, sctp_disposition_t sctp_sf_violation(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4540,7 +4541,7 @@ static sctp_disposition_t sctp_sf_violation_chunklen( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4560,7 +4561,7 @@ static sctp_disposition_t sctp_sf_violation_paramlen( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, void *ext, sctp_cmd_seq_t *commands) { @@ -4603,7 +4604,7 @@ static sctp_disposition_t sctp_sf_violation_ctsn( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, 
sctp_cmd_seq_t *commands) { @@ -4623,7 +4624,7 @@ static sctp_disposition_t sctp_sf_violation_chunk( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4698,7 +4699,7 @@ static sctp_disposition_t sctp_sf_violation_chunk( sctp_disposition_t sctp_sf_do_prm_asoc(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4810,7 +4811,7 @@ nomem: sctp_disposition_t sctp_sf_do_prm_send(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4850,7 +4851,7 @@ sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4906,7 +4907,7 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4943,7 +4944,7 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( sctp_disposition_t sctp_sf_error_closed(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4957,7 +4958,7 @@ sctp_disposition_t sctp_sf_error_closed(struct net *net, sctp_disposition_t sctp_sf_error_shutdown(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -4984,7 +4985,7 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5019,7 +5020,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { /* There is a single T1 timer, so we should be able to use @@ -5046,7 +5047,7 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5095,7 +5096,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_prm_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5121,7 +5122,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_prm_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5148,7 +5149,7 @@ sctp_disposition_t sctp_sf_shutdown_sent_prm_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void 
*arg, sctp_cmd_seq_t *commands) { @@ -5179,7 +5180,7 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5215,7 +5216,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5247,7 +5248,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat( sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5264,7 +5265,7 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net, sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { struct sctp_chunk *chunk = arg; @@ -5282,7 +5283,7 @@ sctp_disposition_t sctp_sf_ignore_primitive( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5306,7 +5307,7 @@ sctp_disposition_t sctp_sf_do_no_pending_tsn( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5338,7 +5339,7 @@ sctp_disposition_t sctp_sf_do_9_2_start_shutdown( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5408,7 +5409,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5481,7 +5482,7 @@ nomem: sctp_disposition_t sctp_sf_ignore_other(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5509,7 +5510,7 @@ sctp_disposition_t sctp_sf_ignore_other(struct net *net, sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5597,7 +5598,7 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, sctp_disposition_t sctp_sf_do_6_2_sack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5628,7 +5629,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(struct net *net, sctp_disposition_t sctp_sf_t1_init_timer_expire(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) { @@ -5692,7 +5693,7 @@ sctp_disposition_t sctp_sf_t1_init_timer_expire(struct net *net, sctp_disposition_t sctp_sf_t1_cookie_timer_expire(struct net 
*net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -5742,7 +5743,7 @@ sctp_disposition_t sctp_sf_t1_cookie_timer_expire(struct net *net, sctp_disposition_t sctp_sf_t2_timer_expire(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -5813,7 +5814,7 @@ sctp_disposition_t sctp_sf_t4_timer_expire( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -5884,7 +5885,7 @@ sctp_disposition_t sctp_sf_t4_timer_expire( sctp_disposition_t sctp_sf_t5_timer_expire(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -5921,7 +5922,7 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -5963,7 +5964,7 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire( sctp_disposition_t sctp_sf_not_impl(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -5981,7 +5982,7 @@ sctp_disposition_t sctp_sf_not_impl(struct net *net, sctp_disposition_t sctp_sf_bug(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
@@ -6002,7 +6003,7 @@ sctp_disposition_t sctp_sf_bug(struct net *net, sctp_disposition_t sctp_sf_timer_ignore(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, - const sctp_subtype_t type, + const union sctp_subtype type, void *arg, sctp_cmd_seq_t *commands) {
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index f7cdb7014244..d437f3801399 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c
@@ -77,10 +77,11 @@ static const sctp_sm_table_entry_t bug = { rtn; \ }) -const sctp_sm_table_entry_t *sctp_sm_lookup_event(struct net *net, - enum sctp_event event_type, - enum sctp_state state, - sctp_subtype_t event_subtype) +const sctp_sm_table_entry_t *sctp_sm_lookup_event( + struct net *net, + enum sctp_event event_type, + enum sctp_state state, + union sctp_subtype event_subtype) { switch (event_type) { case SCTP_EVENT_T_CHUNK:
-- cgit v1.2.3

From eeb66cdb682678bfd1f02a4547e3649b38ffea7e Mon Sep 17 00:00:00 2001
From: Saeed Mahameed
Date: Sun, 4 Jun 2017 23:11:55 +0300
Subject: net/mlx5: Separate between E-Switch and MPFS

The Multi-Physical Function Switch (MPFS) is required when a multi-PF configuration is enabled, to allow passing user-configured unicast MAC addresses to the requesting PF.

Before this patch, eswitch.c managed the HW MPFS L2 table: E-Switch always (regardless of SRIOV) enabled vport(0) (the NIC PF) vport-context updates on unicast MAC address list changes, to populate the PF's MPFS L2 table accordingly.
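As a minimal sketch of the direct path this patch introduces (mlx5_mpfs_add_mac()/mlx5_mpfs_del_mac() are the new API from lib/mpfs.h in the diff below; the wrapper function here is purely illustrative and not part of the patch):

	/* Sketch: the PF netdev pushes unicast MACs straight into the HW
	 * L2 table (MPFS) instead of relying on E-Switch vport events.
	 * MPFS holds unicast entries only, so multicast addresses are
	 * filtered out, just as mlx5e_execute_l2_action() does below.
	 */
	static int example_pf_sync_uc_mac(struct mlx5_core_dev *mdev,
					  u8 *mac, bool add)
	{
		if (is_multicast_ether_addr(mac))
			return 0;
		return add ? mlx5_mpfs_add_mac(mdev, mac) :
			     mlx5_mpfs_del_mac(mdev, mac);
	}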
In a downstream patch we would like to allow compiling the driver without E-Switch functionality; to that end, move the MPFS L2 table logic out of eswitch.c into its own file and provide a Kconfig flag (MLX5_MPFS) to allow compiling out MPFS for those who don't want multi-PF support.

The NIC PF netdevice will now update the MPFS L2 table directly via the new MPFS API. A VF netdevice has no access to the MPFS L2 table, so E-Switch remains responsible for updating the MPFS L2 table on behalf of its VFs.

Due to this change we also no longer need to enable vport(0) (PF vport) unicast MAC change events when SRIOV is not enabled, which means E-Switch is now activated only on SRIOV activation and is not required otherwise.

Signed-off-by: Saeed Mahameed
Cc: Jes Sorensen
Cc: kernel-team@fb.com
--- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 10 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 + drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 17 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 190 ++++--------------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 55 +----- drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c | 201 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h | 95 ++++++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 26 ++- include/linux/mlx5/driver.h | 2 + 9 files changed, 377 insertions(+), 221 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h (limited to 'include')
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 5aee05992f27..d7174295b6ef 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -34,6 +34,16 @@ config MLX5_CORE_EN ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. +config MLX5_MPFS + bool "Mellanox Technologies MLX5 MPFS support" + depends on MLX5_CORE_EN + default y + ---help--- + Mellanox Technologies Ethernet Multi-Physical Function Switch (MPFS) + support in ConnectX NIC. MPFS is required when a multi-PF configuration + is enabled to allow passing user-configured unicast MAC addresses to the + requesting PF.
+ config MLX5_CORE_EN_DCB bool "Data Center Bridging (DCB) Support" default y diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 9d17e4e76d3a..c867e48f8a4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,6 +16,8 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += eswitch.o eswitch_offloads.o \ en_rx.o en_rx_am.o en_txrx.o en_clock.o vxlan.o \ en_tc.o en_arfs.o en_rep.o en_fs_ethtool.o en_selftest.o +mlx5_core-$(CONFIG_MLX5_MPFS) += lib/mpfs.o + mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o mlx5_core-$(CONFIG_MLX5_CORE_IPOIB) += ipoib/ipoib.o ipoib/ethtool.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index dfccb5305e9c..eecbc6d4f51f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -36,6 +36,7 @@ #include #include #include "en.h" +#include "lib/mpfs.h" static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv, struct mlx5e_l2_rule *ai, int type); @@ -65,6 +66,7 @@ struct mlx5e_l2_hash_node { struct hlist_node hlist; u8 action; struct mlx5e_l2_rule ai; + bool mpfs; }; static inline int mlx5e_hash_l2(u8 *addr) @@ -362,17 +364,30 @@ static void mlx5e_del_vlan_rules(struct mlx5e_priv *priv) static void mlx5e_execute_l2_action(struct mlx5e_priv *priv, struct mlx5e_l2_hash_node *hn) { - switch (hn->action) { + u8 action = hn->action; + int l2_err = 0; + + switch (action) { case MLX5E_ACTION_ADD: mlx5e_add_l2_flow_rule(priv, &hn->ai, MLX5E_FULLMATCH); + if (!is_multicast_ether_addr(hn->ai.addr)) { + l2_err = mlx5_mpfs_add_mac(priv->mdev, hn->ai.addr); + hn->mpfs = !l2_err; + } hn->action = MLX5E_ACTION_NONE; break; case MLX5E_ACTION_DEL: + if (!is_multicast_ether_addr(hn->ai.addr) && hn->mpfs) + l2_err = mlx5_mpfs_del_mac(priv->mdev, hn->ai.addr); mlx5e_del_l2_flow_rule(priv, &hn->ai); mlx5e_del_l2_from_hash(hn); break; } + + if (l2_err) + netdev_warn(priv->netdev, "MPFS, failed to %s mac %pM, err(%d)\n", + action == MLX5E_ACTION_ADD ? 
"add" : "del", hn->ai.addr, l2_err); } static void mlx5e_sync_netdev_addr(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 5c001b61d04a..fd51f0ea8df9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -46,19 +46,13 @@ enum { MLX5_ACTION_DEL = 2, }; -/* E-Switch UC L2 table hash node */ -struct esw_uc_addr { - struct l2addr_node node; - u32 table_index; - u32 vport; -}; - /* Vport UC/MC hash node */ struct vport_addr { struct l2addr_node node; u8 action; u32 vport; - struct mlx5_flow_handle *flow_rule; /* SRIOV only */ + struct mlx5_flow_handle *flow_rule; + bool mpfs; /* UC MAC was added to MPFs */ /* A flag indicating that mac was added due to mc promiscuous vport */ bool mc_promisc; }; @@ -154,81 +148,6 @@ static int modify_esw_vport_cvlan(struct mlx5_core_dev *dev, u32 vport, return modify_esw_vport_context_cmd(dev, vport, in, sizeof(in)); } -/* HW L2 Table (MPFS) management */ -static int set_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index, - u8 *mac, u8 vlan_valid, u16 vlan) -{ - u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)] = {0}; - u8 *in_mac_addr; - - MLX5_SET(set_l2_table_entry_in, in, opcode, - MLX5_CMD_OP_SET_L2_TABLE_ENTRY); - MLX5_SET(set_l2_table_entry_in, in, table_index, index); - MLX5_SET(set_l2_table_entry_in, in, vlan_valid, vlan_valid); - MLX5_SET(set_l2_table_entry_in, in, vlan, vlan); - - in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); - ether_addr_copy(&in_mac_addr[2], mac); - - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -static int del_l2_table_entry_cmd(struct mlx5_core_dev *dev, u32 index) -{ - u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)] = {0}; - - MLX5_SET(delete_l2_table_entry_in, in, opcode, - MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); - MLX5_SET(delete_l2_table_entry_in, in, table_index, index); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); -} - -static int alloc_l2_table_index(struct mlx5_l2_table *l2_table, u32 *ix) -{ - int err = 0; - - *ix = find_first_zero_bit(l2_table->bitmap, l2_table->size); - if (*ix >= l2_table->size) - err = -ENOSPC; - else - __set_bit(*ix, l2_table->bitmap); - - return err; -} - -static void free_l2_table_index(struct mlx5_l2_table *l2_table, u32 ix) -{ - __clear_bit(ix, l2_table->bitmap); -} - -static int set_l2_table_entry(struct mlx5_core_dev *dev, u8 *mac, - u8 vlan_valid, u16 vlan, - u32 *index) -{ - struct mlx5_l2_table *l2_table = &dev->priv.eswitch->l2_table; - int err; - - err = alloc_l2_table_index(l2_table, index); - if (err) - return err; - - err = set_l2_table_entry_cmd(dev, *index, mac, vlan_valid, vlan); - if (err) - free_l2_table_index(l2_table, *index); - - return err; -} - -static void del_l2_table_entry(struct mlx5_core_dev *dev, u32 index) -{ - struct mlx5_l2_table *l2_table = &dev->priv.eswitch->l2_table; - - del_l2_table_entry_cmd(dev, index); - free_l2_table_index(l2_table, index); -} - /* E-Switch FDB */ static struct mlx5_flow_handle * __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule, @@ -455,65 +374,60 @@ typedef int (*vport_addr_action)(struct mlx5_eswitch *esw, static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) { - struct hlist_head *hash = esw->l2_table.l2_hash; - struct esw_uc_addr *esw_uc; u8 *mac = 
vaddr->node.addr; u32 vport = vaddr->vport; int err; - esw_uc = l2addr_hash_find(hash, mac, struct esw_uc_addr); - if (esw_uc) { + /* Skip mlx5_mpfs_add_mac for PFs, + * it is already done by the PF netdev in mlx5e_execute_l2_action + */ + if (!vport) + goto fdb_add; + + err = mlx5_mpfs_add_mac(esw->dev, mac); + if (err) { esw_warn(esw->dev, - "Failed to set L2 mac(%pM) for vport(%d), mac is already in use by vport(%d)\n", - mac, vport, esw_uc->vport); - return -EEXIST; + "Failed to add L2 table mac(%pM) for vport(%d), err(%d)\n", + mac, vport, err); + return err; } + vaddr->mpfs = true; - esw_uc = l2addr_hash_add(hash, mac, struct esw_uc_addr, GFP_KERNEL); - if (!esw_uc) - return -ENOMEM; - esw_uc->vport = vport; - - err = set_l2_table_entry(esw->dev, mac, 0, 0, &esw_uc->table_index); - if (err) - goto abort; - +fdb_add: /* SRIOV is enabled: Forward UC MAC to vport */ if (esw->fdb_table.fdb && esw->mode == SRIOV_LEGACY) vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport); - esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM index:%d fr(%p)\n", - vport, mac, esw_uc->table_index, vaddr->flow_rule); - return err; -abort: - l2addr_hash_del(esw_uc); + esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM fr(%p)\n", + vport, mac, vaddr->flow_rule); + return err; } static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr) { - struct hlist_head *hash = esw->l2_table.l2_hash; - struct esw_uc_addr *esw_uc; u8 *mac = vaddr->node.addr; u32 vport = vaddr->vport; + int err = 0; - esw_uc = l2addr_hash_find(hash, mac, struct esw_uc_addr); - if (!esw_uc || esw_uc->vport != vport) { - esw_debug(esw->dev, - "MAC(%pM) doesn't belong to vport (%d)\n", - mac, vport); - return -EINVAL; - } - esw_debug(esw->dev, "\tDELETE UC MAC: vport[%d] %pM index:%d fr(%p)\n", - vport, mac, esw_uc->table_index, vaddr->flow_rule); + /* Skip mlx5_mpfs_del_mac for PFs, + * it is already done by the PF netdev in mlx5e_execute_l2_action + */ + if (!vport || !vaddr->mpfs) + goto fdb_del; - del_l2_table_entry(esw->dev, esw_uc->table_index); + err = mlx5_mpfs_del_mac(esw->dev, mac); + if (err) + esw_warn(esw->dev, + "Failed to del L2 table mac(%pM) for vport(%d), err(%d)\n", + mac, vport, err); + vaddr->mpfs = false; +fdb_del: if (vaddr->flow_rule) mlx5_del_flow_rules(vaddr->flow_rule); vaddr->flow_rule = NULL; - l2addr_hash_del(esw_uc); return 0; } @@ -1635,7 +1549,6 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode); esw->mode = mode; - esw_disable_vport(esw, 0); if (mode == SRIOV_LEGACY) err = esw_create_legacy_fdb_table(esw, nvfs + 1); @@ -1648,7 +1561,11 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) if (err) esw_warn(esw->dev, "Failed to create eswitch TSAR"); - enabled_events = (mode == SRIOV_LEGACY) ? SRIOV_VPORT_EVENTS : UC_ADDR_CHANGE; + /* Don't enable vport events when in SRIOV_OFFLOADS mode, since: + * 1. L2 table (MPFS) is programmed by PF/VF representors netdevs set_rx_mode + * 2. FDB/Eswitch is programmed by user space tools + */ + enabled_events = (mode == SRIOV_LEGACY) ? 
SRIOV_VPORT_EVENTS : 0; for (i = 0; i <= nvfs; i++) esw_enable_vport(esw, i, enabled_events); @@ -1657,7 +1574,6 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) return 0; abort: - esw_enable_vport(esw, 0, UC_ADDR_CHANGE); esw->mode = SRIOV_NONE; return err; } @@ -1691,30 +1607,10 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) esw_offloads_cleanup(esw, nvports); esw->mode = SRIOV_NONE; - /* VPORT 0 (PF) must be enabled back with non-sriov configuration */ - esw_enable_vport(esw, 0, UC_ADDR_CHANGE); -} - -void mlx5_eswitch_attach(struct mlx5_eswitch *esw) -{ - if (!ESW_ALLOWED(esw)) - return; - - esw_enable_vport(esw, 0, UC_ADDR_CHANGE); - /* VF Vports will be enabled when SRIOV is enabled */ -} - -void mlx5_eswitch_detach(struct mlx5_eswitch *esw) -{ - if (!ESW_ALLOWED(esw)) - return; - - esw_disable_vport(esw, 0); } int mlx5_eswitch_init(struct mlx5_core_dev *dev) { - int l2_table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table); int total_vports = MLX5_TOTAL_VPORTS(dev); struct mlx5_eswitch *esw; int vport_num; @@ -1724,8 +1620,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) return 0; esw_info(dev, - "Total vports %d, l2 table size(%d), per vport: max uc(%d) max mc(%d)\n", - total_vports, l2_table_size, + "Total vports %d, per vport: max uc(%d) max mc(%d)\n", + total_vports, MLX5_MAX_UC_PER_VPORT(dev), MLX5_MAX_MC_PER_VPORT(dev)); @@ -1735,14 +1631,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) esw->dev = dev; - esw->l2_table.bitmap = kcalloc(BITS_TO_LONGS(l2_table_size), - sizeof(uintptr_t), GFP_KERNEL); - if (!esw->l2_table.bitmap) { - err = -ENOMEM; - goto abort; - } - esw->l2_table.size = l2_table_size; - esw->work_queue = create_singlethread_workqueue("mlx5_esw_wq"); if (!esw->work_queue) { err = -ENOMEM; @@ -1793,7 +1681,6 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) abort: if (esw->work_queue) destroy_workqueue(esw->work_queue); - kfree(esw->l2_table.bitmap); kfree(esw->vports); kfree(esw->offloads.vport_reps); kfree(esw); @@ -1809,7 +1696,6 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) esw->dev->priv.eswitch = NULL; destroy_workqueue(esw->work_queue); - kfree(esw->l2_table.bitmap); kfree(esw->offloads.vport_reps); kfree(esw->vports); kfree(esw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 834a33050969..701d228de4ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -37,6 +37,7 @@ #include #include #include +#include "lib/mpfs.h" #define MLX5_MAX_UC_PER_VPORT(dev) \ (1 << MLX5_CAP_GEN(dev, log_max_current_uc_list)) @@ -44,9 +45,6 @@ #define MLX5_MAX_MC_PER_VPORT(dev) \ (1 << MLX5_CAP_GEN(dev, log_max_current_mc_list)) -#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE)) -#define MLX5_L2_ADDR_HASH(addr) (addr[5]) - #define FDB_UPLINK_VPORT 0xffff #define MLX5_MIN_BW_SHARE 1 @@ -54,48 +52,6 @@ #define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \ min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), limit) -/* L2 -mac address based- hash helpers */ -struct l2addr_node { - struct hlist_node hlist; - u8 addr[ETH_ALEN]; -}; - -#define for_each_l2hash_node(hn, tmp, hash, i) \ - for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ - hlist_for_each_entry_safe(hn, tmp, &hash[i], hlist) - -#define l2addr_hash_find(hash, mac, type) ({ \ - int ix = MLX5_L2_ADDR_HASH(mac); \ - bool found = false; \ - type *ptr = NULL; \ - \ - hlist_for_each_entry(ptr, &hash[ix], node.hlist) \ 
- if (ether_addr_equal(ptr->node.addr, mac)) {\ - found = true; \ - break; \ - } \ - if (!found) \ - ptr = NULL; \ - ptr; \ -}) - -#define l2addr_hash_add(hash, mac, type, gfp) ({ \ - int ix = MLX5_L2_ADDR_HASH(mac); \ - type *ptr = NULL; \ - \ - ptr = kzalloc(sizeof(type), gfp); \ - if (ptr) { \ - ether_addr_copy(ptr->node.addr, mac); \ - hlist_add_head(&ptr->node.hlist, &hash[ix]);\ - } \ - ptr; \ -}) - -#define l2addr_hash_del(ptr) ({ \ - hlist_del(&ptr->node.hlist); \ - kfree(ptr); \ -}) - struct vport_ingress { struct mlx5_flow_table *acl; struct mlx5_flow_group *allow_untagged_spoofchk_grp; @@ -150,12 +106,6 @@ struct mlx5_vport { u16 enabled_events; }; -struct mlx5_l2_table { - struct hlist_head l2_hash[MLX5_L2_ADDR_HASH_SIZE]; - u32 size; - unsigned long *bitmap; -}; - struct mlx5_eswitch_fdb { void *fdb; union { @@ -222,7 +172,6 @@ struct esw_mc_addr { /* SRIOV only */ struct mlx5_eswitch { struct mlx5_core_dev *dev; - struct mlx5_l2_table l2_table; struct mlx5_eswitch_fdb fdb_table; struct hlist_head mc_table[MLX5_L2_ADDR_HASH_SIZE]; struct workqueue_struct *work_queue; @@ -250,8 +199,6 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int nvports); /* E-Switch API */ int mlx5_eswitch_init(struct mlx5_core_dev *dev); void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw); -void mlx5_eswitch_attach(struct mlx5_eswitch *esw); -void mlx5_eswitch_detach(struct mlx5_eswitch *esw); void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe); int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode); void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c new file mode 100644 index 000000000000..7cb67122e8b5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx5_core.h" +#include "lib/mpfs.h" + +/* HW L2 Table (MPFS) management */ +static int set_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index, u8 *mac) +{ + u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_l2_table_entry_out)] = {0}; + u8 *in_mac_addr; + + MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); + MLX5_SET(set_l2_table_entry_in, in, table_index, index); + + in_mac_addr = MLX5_ADDR_OF(set_l2_table_entry_in, in, mac_address); + ether_addr_copy(&in_mac_addr[2], mac); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int del_l2table_entry_cmd(struct mlx5_core_dev *dev, u32 index) +{ + u32 in[MLX5_ST_SZ_DW(delete_l2_table_entry_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(delete_l2_table_entry_out)] = {0}; + + MLX5_SET(delete_l2_table_entry_in, in, opcode, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + MLX5_SET(delete_l2_table_entry_in, in, table_index, index); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +/* UC L2 table hash node */ +struct l2table_node { + struct l2addr_node node; + u32 index; /* index in HW l2 table */ +}; + +struct mlx5_mpfs { + struct hlist_head hash[MLX5_L2_ADDR_HASH_SIZE]; + struct mutex lock; /* Synchronize l2 table access */ + u32 size; + unsigned long *bitmap; +}; + +static int alloc_l2table_index(struct mlx5_mpfs *l2table, u32 *ix) +{ + int err = 0; + + *ix = find_first_zero_bit(l2table->bitmap, l2table->size); + if (*ix >= l2table->size) + err = -ENOSPC; + else + __set_bit(*ix, l2table->bitmap); + + return err; +} + +static void free_l2table_index(struct mlx5_mpfs *l2table, u32 ix) +{ + __clear_bit(ix, l2table->bitmap); +} + +int mlx5_mpfs_init(struct mlx5_core_dev *dev) +{ + int l2table_size = 1 << MLX5_CAP_GEN(dev, log_max_l2_table); + struct mlx5_mpfs *mpfs; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + mpfs = kzalloc(sizeof(*mpfs), GFP_KERNEL); + if (!mpfs) + return -ENOMEM; + + mutex_init(&mpfs->lock); + mpfs->size = l2table_size; + mpfs->bitmap = kcalloc(BITS_TO_LONGS(l2table_size), + sizeof(uintptr_t), GFP_KERNEL); + if (!mpfs->bitmap) { + kfree(mpfs); + return -ENOMEM; + } + + dev->priv.mpfs = mpfs; + return 0; +} + +void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + + if (!MLX5_VPORT_MANAGER(dev)) + return; + + WARN_ON(!hlist_empty(mpfs->hash)); + kfree(mpfs->bitmap); + kfree(mpfs); +} + +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + u32 index; + int err; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + mutex_lock(&mpfs->lock); + + l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node); + if (l2addr) { + err = -EEXIST; + goto abort; + } + + err = alloc_l2table_index(mpfs, &index); + if (err) + goto abort; + + l2addr = l2addr_hash_add(mpfs->hash, mac, struct l2table_node, GFP_KERNEL); + if (!l2addr) { + free_l2table_index(mpfs, index); + err = -ENOMEM; + goto abort; + } + + l2addr->index = index; + err = set_l2table_entry_cmd(dev, index, mac); + if (err) { + l2addr_hash_del(l2addr); + free_l2table_index(mpfs, index); + } + + mlx5_core_dbg(dev, "MPFS mac added %pM, index (%d)\n", mac, index); +abort: + mutex_unlock(&mpfs->lock); + return err; +} + +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) +{ + struct mlx5_mpfs *mpfs = dev->priv.mpfs; + struct l2table_node *l2addr; + int err = 0; + u32 index; + + if (!MLX5_VPORT_MANAGER(dev)) + return 0; + + 
mutex_lock(&mpfs->lock); + + l2addr = l2addr_hash_find(mpfs->hash, mac, struct l2table_node); + if (!l2addr) { + err = -ENOENT; + goto unlock; + } + + index = l2addr->index; + del_l2table_entry_cmd(dev, index); + l2addr_hash_del(l2addr); + free_l2table_index(mpfs, index); + mlx5_core_dbg(dev, "MPFS mac deleted %pM, index (%d)\n", mac, index); +unlock: + mutex_unlock(&mpfs->lock); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h new file mode 100644 index 000000000000..4a7b2c3203a7 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mpfs.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017, Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __MLX5_MPFS_H__ +#define __MLX5_MPFS_H__ + +#include +#include + +/* L2 -mac address based- hash helpers */ +#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE)) +#define MLX5_L2_ADDR_HASH(addr) (addr[5]) + +struct l2addr_node { + struct hlist_node hlist; + u8 addr[ETH_ALEN]; +}; + +#define for_each_l2hash_node(hn, tmp, hash, i) \ + for (i = 0; i < MLX5_L2_ADDR_HASH_SIZE; i++) \ + hlist_for_each_entry_safe(hn, tmp, &(hash)[i], hlist) + +#define l2addr_hash_find(hash, mac, type) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + bool found = false; \ + type *ptr = NULL; \ + \ + hlist_for_each_entry(ptr, &(hash)[ix], node.hlist) \ + if (ether_addr_equal(ptr->node.addr, mac)) {\ + found = true; \ + break; \ + } \ + if (!found) \ + ptr = NULL; \ + ptr; \ +}) + +#define l2addr_hash_add(hash, mac, type, gfp) ({ \ + int ix = MLX5_L2_ADDR_HASH(mac); \ + type *ptr = NULL; \ + \ + ptr = kzalloc(sizeof(type), gfp); \ + if (ptr) { \ + ether_addr_copy(ptr->node.addr, mac); \ + hlist_add_head(&ptr->node.hlist, &(hash)[ix]);\ + } \ + ptr; \ +}) + +#define l2addr_hash_del(ptr) ({ \ + hlist_del(&(ptr)->node.hlist); \ + kfree(ptr); \ +}) + +#ifdef CONFIG_MLX5_MPFS +int mlx5_mpfs_init(struct mlx5_core_dev *dev); +void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac); +int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac); +#else /* #ifndef CONFIG_MLX5_MPFS */ +static inline int mlx5_mpfs_init(struct mlx5_core_dev *dev) { return 0; } +static inline void mlx5_mpfs_cleanup(struct mlx5_core_dev *dev) {} +static inline int mlx5_mpfs_add_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +static inline int mlx5_mpfs_del_mac(struct mlx5_core_dev *dev, u8 *mac) { return 0; } +#endif +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c065132b956d..d4a9c9b7b6a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -53,6 +53,7 @@ #include #include "mlx5_core.h" #include "fs_core.h" +#include "lib/mpfs.h" #ifdef CONFIG_MLX5_CORE_EN #include "eswitch.h" #endif @@ -946,11 +947,17 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto err_tables_cleanup; } + err = mlx5_mpfs_init(dev); + if (err) { + dev_err(&pdev->dev, "Failed to init l2 table %d\n", err); + goto err_rl_cleanup; + } + #ifdef CONFIG_MLX5_CORE_EN err = mlx5_eswitch_init(dev); if (err) { dev_err(&pdev->dev, "Failed to init eswitch %d\n", err); - goto err_rl_cleanup; + goto err_mpfs_cleanup; } #endif @@ -973,11 +980,11 @@ err_sriov_cleanup: err_eswitch_cleanup: #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); - -err_rl_cleanup: +err_mpfs_cleanup: #endif + mlx5_mpfs_cleanup(dev); +err_rl_cleanup: mlx5_cleanup_rl_table(dev); - err_tables_cleanup: mlx5_cleanup_mkey_table(dev); mlx5_cleanup_srq_table(dev); @@ -998,6 +1005,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) #ifdef CONFIG_MLX5_CORE_EN mlx5_eswitch_cleanup(dev->priv.eswitch); #endif + mlx5_mpfs_cleanup(dev); mlx5_cleanup_rl_table(dev); mlx5_cleanup_reserved_gids(dev); mlx5_cleanup_mkey_table(dev); @@ -1155,10 +1163,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_fs; } -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_attach(dev->priv.eswitch); -#endif - err = mlx5_sriov_attach(dev); if (err) { dev_err(&pdev->dev, "sriov init failed %d\n", err); @@ -1202,9 +1206,6 @@ err_fpga_start: mlx5_sriov_detach(dev); 
err_sriov: -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_detach(dev->priv.eswitch); -#endif mlx5_cleanup_fs(dev); err_fs:
@@ -1279,9 +1280,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_fpga_device_stop(dev); mlx5_sriov_detach(dev); -#ifdef CONFIG_MLX5_CORE_EN - mlx5_eswitch_detach(dev->priv.eswitch); -#endif mlx5_cleanup_fs(dev); mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..88d6eb5b3a76 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h
@@ -550,6 +550,7 @@ struct mlx5_fc_stats { unsigned long sampling_interval; /* jiffies */ }; +struct mlx5_mpfs; struct mlx5_eswitch; struct mlx5_lag; struct mlx5_pagefault;
@@ -647,6 +648,7 @@ struct mlx5_priv { spinlock_t ctx_lock; struct mlx5_flow_steering *steering; + struct mlx5_mpfs *mpfs; struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; struct mlx5_lag *lag;
-- cgit v1.2.3

From 97834eba7c194659a72c5bb0f8c19c7055bb69ea Mon Sep 17 00:00:00 2001
From: Erez Shitrit
Date: Wed, 7 Jun 2017 12:14:24 +0300
Subject: net/mlx5: Delay events till ib registration ends

When mlx5_ib registers itself with mlx5_core as an interface, it calls mlx5_add_device, which invokes the mlx5_ib interface add callback; only if that callback returns successfully does mlx5_core add the interface to its list and start forwarding async events to mlx5_ib.

Between the mlx5_ib interface add callback and mlx5_core adding the mlx5_ib interface to its device list, arriving mlx5_core events can be missed by the newly registering mlx5_ib interface. In other words:

thread 1: mlx5_ib: mlx5_register_interface(dev)
thread 1: mlx5_core: mlx5_add_device(dev)
thread 1: mlx5_core: ctx = dev->add => (mlx5_ib)->mlx5_ib_add
thread 2: mlx5_core_event: **new event arrives, forward to dev_list
thread 1: mlx5_core: add_ctx_to_dev_list(ctx)
/* previous event was missed by the new interface. */

It is OK to miss events before dev->add => (mlx5_ib)->mlx5_ib_add completes, but not after.

We fix this race by accumulating the events that arrive between ib_register_device (inside mlx5_add_device->(dev->add)) and the completion of the list addition, and firing them to the newly registered interface after that.
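Condensed into a minimal sketch (distilled from the mlx5_add_device() and mlx5_core_event() hunks below; the names and locking are the patch's own, error handling omitted):

	priv->is_accum_events = true;       /* events start queuing up */
	dev_ctx->context = intf->add(dev);  /* may call ib_register_device() */

	spin_lock_irq(&priv->ctx_lock);
	list_add_tail(&dev_ctx->list, &priv->ctx_list);
	/* replay queued events to the new interface, then stop queuing */
	fire_delayed_event_locked(dev_ctx, dev, priv);
	spin_unlock_irq(&priv->ctx_lock);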
Fixes: f1ee87fe55c8 ("net/mlx5: Organize device list API in one place") Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 73 ++++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 ++ include/linux/mlx5/driver.h | 3 ++ 3 files changed, 79 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index a62f4b6a21a5..ff60cf7342ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -45,11 +45,70 @@ struct mlx5_device_context { unsigned long state; }; +struct mlx5_delayed_event { + struct list_head list; + struct mlx5_core_dev *dev; + enum mlx5_dev_event event; + unsigned long param; +}; + enum { MLX5_INTERFACE_ADDED, MLX5_INTERFACE_ATTACHED, }; +static void add_delayed_event(struct mlx5_priv *priv, + struct mlx5_core_dev *dev, + enum mlx5_dev_event event, + unsigned long param) +{ + struct mlx5_delayed_event *delayed_event; + + delayed_event = kzalloc(sizeof(*delayed_event), GFP_ATOMIC); + if (!delayed_event) { + mlx5_core_err(dev, "event %d is missed\n", event); + return; + } + + mlx5_core_dbg(dev, "Accumulating event %d\n", event); + delayed_event->dev = dev; + delayed_event->event = event; + delayed_event->param = param; + list_add_tail(&delayed_event->list, &priv->waiting_events_list); +} + +static void fire_delayed_event_locked(struct mlx5_device_context *dev_ctx, + struct mlx5_core_dev *dev, + struct mlx5_priv *priv) +{ + struct mlx5_delayed_event *de; + struct mlx5_delayed_event *n; + + /* stop delaying events */ + priv->is_accum_events = false; + + /* fire all accumulated events before new event comes */ + list_for_each_entry_safe(de, n, &priv->waiting_events_list, list) { + dev_ctx->intf->event(dev, dev_ctx->context, de->event, de->param); + list_del(&de->list); + kfree(de); + } +} + +static void cleanup_delayed_evets(struct mlx5_priv *priv) +{ + struct mlx5_delayed_event *de; + struct mlx5_delayed_event *n; + + spin_lock_irq(&priv->ctx_lock); + priv->is_accum_events = false; + list_for_each_entry_safe(de, n, &priv->waiting_events_list, list) { + list_del(&de->list); + kfree(de); + } + spin_unlock_irq(&priv->ctx_lock); +} + void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) { struct mlx5_device_context *dev_ctx; @@ -63,6 +122,12 @@ void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) return; dev_ctx->intf = intf; + /* accumulating events that can come after mlx5_ib calls to + * ib_register_device, till adding that interface to the events list. 
+ priv->is_accum_events = true; + dev_ctx->context = intf->add(dev); set_bit(MLX5_INTERFACE_ADDED, &dev_ctx->state); if (intf->attach)
@@ -71,6 +136,9 @@ void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) if (dev_ctx->context) { spin_lock_irq(&priv->ctx_lock); list_add_tail(&dev_ctx->list, &priv->ctx_list); + + fire_delayed_event_locked(dev_ctx, dev, priv); + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (dev_ctx->intf->pfault) { if (priv->pfault) {
@@ -84,6 +152,8 @@ void mlx5_add_device(struct mlx5_interface *intf, struct mlx5_priv *priv) spin_unlock_irq(&priv->ctx_lock); } else { kfree(dev_ctx); + /* delete all accumulated events */ + cleanup_delayed_evets(priv); } }
@@ -341,6 +411,9 @@ void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, spin_lock_irqsave(&priv->ctx_lock, flags); + if (priv->is_accum_events) + add_delayed_event(priv, dev, event, param); + list_for_each_entry(dev_ctx, &priv->ctx_list, list) if (dev_ctx->intf->event) dev_ctx->intf->event(dev, dev_ctx->context, event, param);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 124c7c3c3a00..6dbd637b4e66 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1343,6 +1343,9 @@ static int init_one(struct pci_dev *pdev, mutex_init(&dev->pci_status_mutex); mutex_init(&dev->intf_state_mutex); + INIT_LIST_HEAD(&priv->waiting_events_list); + priv->is_accum_events = false; + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING err = init_srcu_struct(&priv->pfault_srcu); if (err) {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 88d6eb5b3a76..d26f18b39c4a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h
@@ -647,6 +647,9 @@ struct mlx5_priv { struct list_head ctx_list; spinlock_t ctx_lock; + struct list_head waiting_events_list; + bool is_accum_events; + struct mlx5_flow_steering *steering; struct mlx5_mpfs *mpfs; struct mlx5_eswitch *eswitch;
-- cgit v1.2.3

From 61690e09c3b4a40401bdaa89ee11522cc0dc4b11 Mon Sep 17 00:00:00 2001
From: Rabie Loulou
Date: Mon, 10 Jul 2017 14:35:10 +0300
Subject: net/mlx5: Fix counter list hardware structure

The counter list hardware structure doesn't contain clear and num_of_counters fields; remove them. These wrong fields were never used by the driver, hence no other driver changes.

Fixes: a351a1b03bf1 ("net/mlx5: Introduce bulk reading of flow counters")
Signed-off-by: Rabie Loulou
Reviewed-by: Or Gerlitz
Signed-off-by: Saeed Mahameed
--- include/linux/mlx5/mlx5_ifc.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include')
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3030121b4746..f847a3a57913 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1071,8 +1071,7 @@ struct mlx5_ifc_dest_format_struct_bits { }; struct mlx5_ifc_flow_counter_list_bits { - u8 clear[0x1]; - u8 num_of_counters[0xf]; + u8 reserved_at_0[0x10]; u8 flow_counter_id[0x10]; u8 reserved_at_20[0x20];
-- cgit v1.2.3

From a8ffcc741acb3c7f3dcf4c7d001209aa0995a5f1 Mon Sep 17 00:00:00 2001
From: Rabie Loulou
Date: Sun, 9 Jul 2017 13:39:30 +0300
Subject: net/mlx5: Increase the maximum flow counters supported

Read the new NIC capability field which represents the 16 MSBs of the maximum number of flow counters supported (max_flow_counter_31_16).
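As a quick worked example of how the split capability composes (this mirrors the eswitch_offloads.c hunk below; on older firmware the high half reads as 0, so the result degrades to the legacy 64K limit):

	/* Assemble the 32-bit counter limit from the two 16-bit halves. */
	u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
			       MLX5_CAP_GEN(dev, max_flow_counter_15_0);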
Backward compatibility with older firmware is preserved: the modified driver reads max_flow_counter_31_16 as 0 from the older firmware and uses up to 64K counters.

Changed the flow counter id from 16 bits to 32 bits. Backward compatibility with older firmware is preserved, as we kept the 16 LSBs of the counter id in place and added the 16 MSBs from a reserved field.

Changed the background bulk reading of flow counters to work in chunks of at most 32K counters, to make sure we don't attempt to allocate very large buffers.

Signed-off-by: Rabie Loulou
Reviewed-by: Or Gerlitz
Signed-off-by: Saeed Mahameed
--- .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 12 ++++++------ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h | 10 +++++----- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 13 ++++++++++--- include/linux/mlx5/mlx5_ifc.h | 16 ++++++---------- 6 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include')
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 95b64025ce36..e7c186b58579 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -433,6 +433,8 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw) struct mlx5_flow_table *fdb = NULL; int esw_size, err = 0; u32 flags = 0; + u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) | + MLX5_CAP_GEN(dev, max_flow_counter_15_0); root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); if (!root_ns) {
@@ -443,9 +445,9 @@ static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw) esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n", MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size), - MLX5_CAP_GEN(dev, max_flow_counter), ESW_OFFLOADS_NUM_GROUPS); + max_flow_counter, ESW_OFFLOADS_NUM_GROUPS); - esw_size = min_t(int, MLX5_CAP_GEN(dev, max_flow_counter) * ESW_OFFLOADS_NUM_GROUPS, + esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS, 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index e750f07793b8..16b32f31d691 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -359,7 +359,7 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } -int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id) +int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id) { u32 in[MLX5_ST_SZ_DW(alloc_flow_counter_in)] = {0}; u32 out[MLX5_ST_SZ_DW(alloc_flow_counter_out)] = {0};
@@ -374,7 +374,7 @@ int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id) return err; } -int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id) +int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id) { u32 in[MLX5_ST_SZ_DW(dealloc_flow_counter_in)] = {0}; u32 out[MLX5_ST_SZ_DW(dealloc_flow_counter_out)] = {0};
@@ -385,7 +385,7 @@ int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id) return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } -int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id, +int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, u64
*packets, u64 *bytes) { u32 out[MLX5_ST_SZ_BYTES(query_flow_counter_out) + @@ -409,14 +409,14 @@ int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id, } struct mlx5_cmd_fc_bulk { - u16 id; + u32 id; int num; int outlen; u32 out[0]; }; struct mlx5_cmd_fc_bulk * -mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u16 id, int num) +mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u32 id, int num) { struct mlx5_cmd_fc_bulk *b; int outlen = @@ -453,7 +453,7 @@ mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b) } void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev, - struct mlx5_cmd_fc_bulk *b, u16 id, + struct mlx5_cmd_fc_bulk *b, u32 id, u64 *packets, u64 *bytes) { int index = id - b->id; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h index 0f98a7cf4877..c6d7bdf255b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -74,20 +74,20 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft, u32 underlay_qpn); -int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id); -int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id); -int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u16 id, +int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u32 *id); +int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u32 id); +int mlx5_cmd_fc_query(struct mlx5_core_dev *dev, u32 id, u64 *packets, u64 *bytes); struct mlx5_cmd_fc_bulk; struct mlx5_cmd_fc_bulk * -mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u16 id, int num); +mlx5_cmd_fc_bulk_alloc(struct mlx5_core_dev *dev, u32 id, int num); void mlx5_cmd_fc_bulk_free(struct mlx5_cmd_fc_bulk *b); int mlx5_cmd_fc_bulk_query(struct mlx5_core_dev *dev, struct mlx5_cmd_fc_bulk *b); void mlx5_cmd_fc_bulk_get(struct mlx5_core_dev *dev, - struct mlx5_cmd_fc_bulk *b, u16 id, + struct mlx5_cmd_fc_bulk *b, u32 id, u64 *packets, u64 *bytes); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 990acee6fb09..9fb5a333df52 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -136,7 +136,7 @@ struct mlx5_fc { u64 lastpackets; u64 lastbytes; - u16 id; + u32 id; bool deleted; bool aging; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 6507d8acc54d..89d1f8650033 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -38,6 +38,8 @@ #include "fs_cmd.h" #define MLX5_FC_STATS_PERIOD msecs_to_jiffies(1000) +/* Max number of counters to query in bulk read is 32K */ +#define MLX5_SW_MAX_COUNTERS_BULK BIT(15) /* locking scheme: * @@ -90,16 +92,21 @@ static void mlx5_fc_stats_insert(struct rb_root *root, struct mlx5_fc *counter) rb_insert_color(&counter->node, root); } +/* The function returns the last node that was queried so the caller + * function can continue calling it till all counters are queried. 
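+ *
+ * (Illustration, not part of the patch: if firmware reported
+ * log_max_flow_counter_bulk = 16 (a made-up value), one bulk command
+ * could span 65536 counters, but the MLX5_SW_MAX_COUNTERS_BULK cap
+ * defined above limits each chunk to min(32768, 65536) = 32768.)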
+ */ static struct rb_node *mlx5_fc_stats_query(struct mlx5_core_dev *dev, struct mlx5_fc *first, - u16 last_id) + u32 last_id) { struct mlx5_cmd_fc_bulk *b; struct rb_node *node = NULL; - u16 afirst_id; + u32 afirst_id; int num; int err; - int max_bulk = 1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk); + + int max_bulk = min_t(int, MLX5_SW_MAX_COUNTERS_BULK, + (1 << MLX5_CAP_GEN(dev, log_max_flow_counter_bulk))); /* first id must be aligned to 4 when using bulk query */ afirst_id = first->id & ~0x3; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f847a3a57913..c99daffc3c3c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -963,7 +963,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_2a0[0x10]; u8 max_wqe_sz_rq[0x10]; - u8 reserved_at_2c0[0x10]; + u8 max_flow_counter_31_16[0x10]; u8 max_wqe_sz_sq_dc[0x10]; u8 reserved_at_2e0[0x7]; @@ -981,7 +981,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_340[0x8]; u8 log_max_flow_counter_bulk[0x8]; - u8 max_flow_counter[0x10]; + u8 max_flow_counter_15_0[0x10]; u8 reserved_at_360[0x3]; @@ -1071,8 +1071,7 @@ struct mlx5_ifc_dest_format_struct_bits { }; struct mlx5_ifc_flow_counter_list_bits { - u8 reserved_at_0[0x10]; - u8 flow_counter_id[0x10]; + u8 flow_counter_id[0x20]; u8 reserved_at_20[0x20]; }; @@ -4402,8 +4401,7 @@ struct mlx5_ifc_query_flow_counter_in_bits { u8 reserved_at_c1[0xf]; u8 num_of_counters[0x10]; - u8 reserved_at_e0[0x10]; - u8 flow_counter_id[0x10]; + u8 flow_counter_id[0x20]; }; struct mlx5_ifc_query_esw_vport_context_out_bits { @@ -6271,8 +6269,7 @@ struct mlx5_ifc_dealloc_flow_counter_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x10]; - u8 flow_counter_id[0x10]; + u8 flow_counter_id[0x20]; u8 reserved_at_60[0x20]; }; @@ -7097,8 +7094,7 @@ struct mlx5_ifc_alloc_flow_counter_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x10]; - u8 flow_counter_id[0x10]; + u8 flow_counter_id[0x20]; u8 reserved_at_60[0x20]; }; -- cgit v1.2.3 From c04ac679c6b86e4e36fbb675c6c061b4091f5810 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 7 Aug 2017 11:29:48 +0100 Subject: ACPI: Introduce DMA ranges parsing Some devices have limited addressing capabilities and cannot reference the whole memory address space while carrying out DMA operations (e.g. devices whose bus address range is narrower than that of the system bus, which prevents them from using bus addresses that are otherwise valid for the system). The ACPI _DMA object allows bus devices to define the DMA window that is actually addressable by devices that sit upstream of the bus, therefore providing a means to parse and initialize the devices' DMA masks and addressable DMA range size. By relying on the generic ACPI kernel layer to retrieve and parse resources, introduce ACPI core code to parse the _DMA object. Signed-off-by: Lorenzo Pieralisi Tested-by: Nate Watterson Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/resource.c | 35 ++++++++++++++++++++++ drivers/acpi/scan.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ include/acpi/acpi_bus.h | 2 ++ include/linux/acpi.h | 8 +++++ 4 files changed, 124 insertions(+) (limited to 'include') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 93f1b5ce89b9..d85e010ee2cc 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -635,6 +635,41 @@ int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, } EXPORT_SYMBOL_GPL(acpi_dev_get_resources); +static int is_memory(struct acpi_resource *ares, void *not_used) +{ + struct resource_win win; + struct resource *res = &win.res; + + memset(&win, 0, sizeof(win)); + + return !(acpi_dev_resource_memory(ares, res) + || acpi_dev_resource_address_space(ares, &win) + || acpi_dev_resource_ext_address_space(ares, &win)); +} + +/** + * acpi_dev_get_dma_resources - Get current DMA resources of a device. + * @adev: ACPI device node to get the resources for. + * @list: Head of the resultant list of resources (must be empty). + * + * Evaluate the _DMA method for the given device node and process its + * output. + * + * The resultant struct resource objects are put on the list pointed to + * by @list, that must be empty initially, as members of struct + * resource_entry objects. Callers of this routine should use + * %acpi_dev_free_resource_list() to free that list. + * + * The number of resources in the output list is returned on success, + * an error code reflecting the error condition is returned otherwise. + */ +int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list) +{ + return __acpi_dev_get_resources(adev, list, is_memory, NULL, + METHOD_NAME__DMA); +} +EXPORT_SYMBOL_GPL(acpi_dev_get_dma_resources); + /** * acpi_dev_filter_resource_type - Filter ACPI resource according to resource * types diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 33897298f03e..94500d99f2d6 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1359,6 +1359,85 @@ enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NON_COHERENT; } +/** + * acpi_dma_get_range() - Get device DMA parameters. + * + * @dev: device to configure + * @dma_addr: pointer device DMA address result + * @offset: pointer to the DMA offset result + * @size: pointer to DMA range size result + * + * Evaluate DMA regions and return respectively DMA region start, offset + * and size in dma_addr, offset and size on parsing success; it does not + * update the passed in values on failure. + * + * Return 0 on success, < 0 on failure. + */ +int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, + u64 *size) +{ + struct acpi_device *adev; + LIST_HEAD(list); + struct resource_entry *rentry; + int ret; + struct device *dma_dev = dev; + u64 len, dma_start = U64_MAX, dma_end = 0, dma_offset = 0; + + /* + * Walk the device tree chasing an ACPI companion with a _DMA + * object while we go. Stop if we find a device with an ACPI + * companion containing a _DMA method. 
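+ *
+ * (Worked example with made-up firmware values: a single _DMA window
+ * with res->start = 0x80000000, res->end = 0xffffffff and an offset
+ * of 0x80000000 yields *dma_addr = 0x0, *offset = 0x80000000 and
+ * *size = 0x80000000 in the code below.)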
+ */ + do { + adev = ACPI_COMPANION(dma_dev); + if (adev && acpi_has_method(adev->handle, METHOD_NAME__DMA)) + break; + + dma_dev = dma_dev->parent; + } while (dma_dev); + + if (!dma_dev) + return -ENODEV; + + if (!acpi_has_method(adev->handle, METHOD_NAME__CRS)) { + acpi_handle_warn(adev->handle, "_DMA is valid only if _CRS is present\n"); + return -EINVAL; + } + + ret = acpi_dev_get_dma_resources(adev, &list); + if (ret > 0) { + list_for_each_entry(rentry, &list, node) { + if (dma_offset && rentry->offset != dma_offset) { + ret = -EINVAL; + dev_warn(dma_dev, "Can't handle multiple windows with different offsets\n"); + goto out; + } + dma_offset = rentry->offset; + + /* Take lower and upper limits */ + if (rentry->res->start < dma_start) + dma_start = rentry->res->start; + if (rentry->res->end > dma_end) + dma_end = rentry->res->end; + } + + if (dma_start >= dma_end) { + ret = -EINVAL; + dev_dbg(dma_dev, "Invalid DMA regions configuration\n"); + goto out; + } + + *dma_addr = dma_start - dma_offset; + len = dma_end - dma_start; + *size = max(len, len + 1); + *offset = dma_offset; + } + out: + acpi_dev_free_resource_list(&list); + + return ret >= 0 ? 0 : ret; +} + /** * acpi_dma_configure - Set-up DMA configuration for the device. * @dev: The pointer to the device diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 68bc6be447fd..07eb963b5026 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -578,6 +578,8 @@ struct acpi_pci_root { bool acpi_dma_supported(struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); +int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, + u64 *size); int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); void acpi_dma_deconfigure(struct device *dev); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index c749eef1daa1..a5eaff9f2c6d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -427,6 +427,8 @@ void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); +int acpi_dev_get_dma_resources(struct acpi_device *adev, + struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); @@ -774,6 +776,12 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } +static inline int acpi_dma_get_range(struct device *dev, u64 *dma_addr, + u64 *offset, u64 *size) +{ + return -ENODEV; +} + static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { -- cgit v1.2.3 From 7ad4263980826e8b02e121af22f4f4c9103fe86d Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 7 Aug 2017 11:29:49 +0100 Subject: ACPI: Make acpi_dma_configure() DMA regions aware The current ACPI DMA configuration sets up device DMA capabilities through kernel defaults that do not take into account platform specific DMA configurations reported by firmware. By leveraging the ACPI acpi_dev_get_dma_resources() API, add code in acpi_dma_configure() to retrieve the DMA regions to correctly set up PCI device DMA parameters. Rework the ACPI IORT kernel API to make sure it can accommodate the DMA set-up required by firmware.
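A self-contained sketch of the mask derivation that the iort_dma_setup() hunk below performs with fls64(); the helper name here is ours and the window values are invented:

#include <stdint.h>
#include <stdio.h>

/* Round the last addressable byte of a DMA window up to a
 * power-of-two mask; assumes dmaaddr + size is non-zero. */
static uint64_t window_to_mask(uint64_t dmaaddr, uint64_t size)
{
	int msb = 64 - __builtin_clzll(dmaaddr + size - 1); /* fls64() */

	return msb == 64 ? UINT64_MAX : (1ULL << msb) - 1;
}

int main(void)
{
	/* A 2 GiB window at bus address 0 gives a 31-bit mask. */
	printf("mask: %#llx\n",
	       (unsigned long long)window_to_mask(0, 0x80000000ULL));
	return 0;
}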
By making PCI devices DMA set-up ACPI IORT specific, the kernel is shielded from unwanted regressions that could be triggered by parsing DMA resources on arches that were previously ignoring them (ie x86/ia64), leaving kernel behaviour unchanged on those arches. Signed-off-by: Lorenzo Pieralisi Acked-by: Will Deacon Tested-by: Nate Watterson Signed-off-by: Rafael J. Wysocki --- drivers/acpi/arm64/iort.c | 37 +++++++++++++++++++++++++++++++++++-- drivers/acpi/scan.c | 12 ++++-------- include/linux/acpi_iort.h | 5 +++-- 3 files changed, 42 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index a3215ee671c1..606af87c425f 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -681,12 +681,17 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev, } /** - * iort_set_dma_mask - Set-up dma mask for a device. + * iort_dma_setup() - Set-up device DMA parameters. * * @dev: device to configure + * @dma_addr: device DMA address result pointer + * @size: DMA range size result pointer */ -void iort_set_dma_mask(struct device *dev) +void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { + u64 mask, dmaaddr = 0, size = 0, offset = 0; + int ret, msb; + /* * Set default coherent_dma_mask to 32 bit. Drivers are expected to * setup the correct supported mask. @@ -700,6 +705,34 @@ void iort_set_dma_mask(struct device *dev) */ if (!dev->dma_mask) dev->dma_mask = &dev->coherent_dma_mask; + + size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); + + if (dev_is_pci(dev)) { + ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size); + if (!ret) { + msb = fls64(dmaaddr + size - 1); + /* + * Round-up to the power-of-two mask or set + * the mask to the whole 64-bit address space + * in case the DMA region covers the full + * memory window. + */ + mask = msb == 64 ? U64_MAX : (1ULL << msb) - 1; + /* + * Limit coherent and dma mask based on size + * retrieved from firmware. + */ + dev->coherent_dma_mask = mask; + *dev->dma_mask = mask; + } + } + + *dma_addr = dmaaddr; + *dma_size = size; + + dev->dma_pfn_offset = PFN_DOWN(offset); + dev_dbg(dev, "dma_pfn_offset(%#08llx)\n", offset); } /** diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 94500d99f2d6..0483d36433ac 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1446,20 +1446,16 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { const struct iommu_ops *iommu; - u64 size; + u64 dma_addr = 0, size = 0; - iort_set_dma_mask(dev); + iort_dma_setup(dev, &dma_addr, &size); iommu = iort_iommu_configure(dev); if (IS_ERR(iommu) && PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; - size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); - /* - * Assume dma valid range starts at 0 and covers the whole - * coherent_dma_mask. 
- */ - arch_setup_dma_ops(dev, 0, size, iommu, attr == DEV_DMA_COHERENT); + arch_setup_dma_ops(dev, dma_addr, size, + iommu, attr == DEV_DMA_COHERENT); return 0; } diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 8379d406ad2e..8d3f0bf80379 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -36,7 +36,7 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ -void iort_set_dma_mask(struct device *dev); +void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size); const struct iommu_ops *iort_iommu_configure(struct device *dev); #else static inline void acpi_iort_init(void) { } @@ -47,7 +47,8 @@ static inline struct irq_domain *iort_get_device_domain(struct device *dev, { return NULL; } static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ -static inline void iort_set_dma_mask(struct device *dev) { } +static inline void iort_dma_setup(struct device *dev, u64 *dma_addr, + u64 *size) { } static inline const struct iommu_ops *iort_iommu_configure(struct device *dev) { return NULL; } -- cgit v1.2.3 From 5e2f30b756a37bd80c5b0471d0e10d769ab2eb9a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2017 18:11:04 +0200 Subject: KVM: nVMX: get rid of nested_get_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nested_get_page() just sounds confusing. All we want is a page from G1. This is even unrelated to nested. Let's introduce kvm_vcpu_gpa_to_page() so we don't get too lengthy lines. Reviewed-by: Paolo Bonzini Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář [Squash pasto fix from Wanpeng Li. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 57 ++++++++++++++++++++++-------------------------- include/linux/kvm_host.h | 6 +++++ 2 files changed, 32 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a8d74e7f9d5..fa6accca1204 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -891,14 +891,6 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } -static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) -{ - struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); - if (is_error_page(page)) - return NULL; - - return page; -} static void nested_release_page(struct page *page) { @@ -7156,8 +7148,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) { nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -7625,8 +7617,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) { nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -9632,6 +9624,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct page *page; u64 hpa; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { @@ -9641,17 +9634,19 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * physical address remains valid. We keep a reference * to it so we can release it later. */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ + if (vmx->nested.apic_access_page) { /* shouldn't happen */ nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); + vmx->nested.apic_access_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); /* * If translation failed, no matter: This feature asks * to exit when accessing the given address, and if it * can never be accessed, this feature won't do * anything anyway. */ - if (vmx->nested.apic_access_page) { + if (!is_error_page(page)) { + vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { @@ -9666,10 +9661,11 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ + if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = - nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + vmx->nested.virtual_apic_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); /* * If translation failed, VM entry will fail because @@ -9684,7 +9680,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * control. But such a configuration is useless, so * let's keep the code simple. 
*/ - if (vmx->nested.virtual_apic_page) { + if (!is_error_page(page)) { + vmx->nested.virtual_apic_page = page; hpa = page_to_phys(vmx->nested.virtual_apic_page); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); } @@ -9694,15 +9691,13 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, if (vmx->nested.pi_desc_page) { /* shouldn't happen */ kunmap(vmx->nested.pi_desc_page); nested_release_page(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; } - vmx->nested.pi_desc_page = - nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - vmx->nested.pi_desc = - (struct pi_desc *)kmap(vmx->nested.pi_desc_page); - if (!vmx->nested.pi_desc) { - nested_release_page_clean(vmx->nested.pi_desc_page); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); + if (is_error_page(page)) return; - } + vmx->nested.pi_desc_page = page; + vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); vmx->nested.pi_desc = (struct pi_desc *)((void *)vmx->nested.pi_desc + (unsigned long)(vmcs12->posted_intr_desc_addr & @@ -9784,8 +9779,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; - page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); + if (is_error_page(page)) return false; msr_bitmap_l1 = (unsigned long *)kmap(page); @@ -11392,8 +11387,8 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; - page = nested_get_page(vcpu, vmcs12->pml_address); - if (!page) + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); + if (is_error_page(page)) return 0; pml_address = kmap(page); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 21a6fd6c44af..28112d7917c1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -985,6 +985,12 @@ static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) return (hpa_t)pfn << PAGE_SHIFT; } +static inline struct page *kvm_vcpu_gpa_to_page(struct kvm_vcpu *vcpu, + gpa_t gpa) +{ + return kvm_vcpu_gfn_to_page(vcpu, gpa_to_gfn(gpa)); +} + static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa) { unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); -- cgit v1.2.3 From dfc8e9165cfe0b50438f51df5a82ab1dcc08baf4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Aug 2017 17:22:14 +0200 Subject: fbcon: mark dummy functions 'inline' The newly introduced fb_console_init/exit declarations have a dummy alternative that is marked 'static' in the header but not inline, leading to a warning whenever the header gets included: In file included from drivers/video/fbdev/core/fbmem.c:35:0: include/linux/fbcon.h:9:13: error: 'fb_console_exit' defined but not used [-Werror=unused-function] This adds the intended 'inline'. 
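A stand-alone illustration of the pattern (hypothetical names, not the fbcon code): built with -Wall, a plain 'static' stub defined in a header warns in every C file that includes the header without calling it, while 'static inline' marks the definition as intentionally discardable:

/* feature.h -- hypothetical header */
#ifdef CONFIG_FEATURE
void feature_init(void);
#else
/* 'static void feature_init(void) {}' here would trigger
 * -Wunused-function in any translation unit that includes this
 * header but never calls feature_init(); 'static inline' does not. */
static inline void feature_init(void) {}
#endif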
Fixes: 6104c37094e7 ("fbcon: Make fbcon a built-time depency for fbdev") Signed-off-by: Arnd Bergmann Reviewed-by: Daniel Vetter Cc: Sean Paul Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/fbcon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index 0fac6305d51c..f68a7db14165 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -5,8 +5,8 @@ void __init fb_console_init(void); void __exit fb_console_exit(void); #else -static void fb_console_init(void) {} -static void fb_console_exit(void) {} +static inline void fb_console_init(void) {} +static inline void fb_console_exit(void) {} #endif #endif /* _LINUX_FBCON_H */ -- cgit v1.2.3 From 353c64dd19ebc99786c3578ea0444e1b5b1bb172 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 7 Aug 2017 09:08:52 -0300 Subject: ASoC: soc-core: Remove unneeded dentry member from snd_soc_codec There is no need to have the *debugfs_reg dentry member as part of the snd_soc_codec structure as its only usage is inside soc_init_codec_debugfs(). Use a local dentry variable instead. Signed-off-by: Fabio Estevam Signed-off-by: Mark Brown --- include/sound/soc.h | 4 ---- sound/soc/soc-core.c | 9 +++++---- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 10b71beb9f13..aad3f14966f1 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -903,10 +903,6 @@ struct snd_soc_codec { /* component */ struct snd_soc_component component; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_reg; -#endif }; /* codec driver */ diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 921622a01944..558efdfd717d 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -339,11 +339,12 @@ static void soc_cleanup_component_debugfs(struct snd_soc_component *component) static void soc_init_codec_debugfs(struct snd_soc_component *component) { struct snd_soc_codec *codec = snd_soc_component_to_codec(component); + struct dentry *debugfs_reg; - codec->debugfs_reg = debugfs_create_file("codec_reg", 0644, - codec->component.debugfs_root, - codec, &codec_reg_fops); - if (!codec->debugfs_reg) + debugfs_reg = debugfs_create_file("codec_reg", 0644, + codec->component.debugfs_root, + codec, &codec_reg_fops); + if (!debugfs_reg) dev_warn(codec->dev, "ASoC: Failed to create codec register debugfs file\n"); } -- cgit v1.2.3 From cf9e829eaf0195e722bc3a3929002a3fbe74dafa Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 7 Aug 2017 02:06:23 +0000 Subject: ASoC: soc-core: rename "cmpnt" to "component" To unify notation and improve readability.
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 4 ++-- sound/soc/soc-core.c | 36 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 9c94b97c17f8..de57a8ad057b 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -469,10 +469,10 @@ int snd_soc_register_codec(struct device *dev, struct snd_soc_dai_driver *dai_drv, int num_dai); void snd_soc_unregister_codec(struct device *dev); int snd_soc_register_component(struct device *dev, - const struct snd_soc_component_driver *cmpnt_drv, + const struct snd_soc_component_driver *component_driver, struct snd_soc_dai_driver *dai_drv, int num_dai); int devm_snd_soc_register_component(struct device *dev, - const struct snd_soc_component_driver *cmpnt_drv, + const struct snd_soc_component_driver *component_driver, struct snd_soc_dai_driver *dai_drv, int num_dai); void snd_soc_unregister_component(struct device *dev); int snd_soc_cache_init(struct snd_soc_codec *codec); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 921622a01944..b21488972198 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -3282,40 +3282,40 @@ static void snd_soc_component_del_unlocked(struct snd_soc_component *component) } int snd_soc_register_component(struct device *dev, - const struct snd_soc_component_driver *cmpnt_drv, + const struct snd_soc_component_driver *component_driver, struct snd_soc_dai_driver *dai_drv, int num_dai) { - struct snd_soc_component *cmpnt; + struct snd_soc_component *component; int ret; - cmpnt = kzalloc(sizeof(*cmpnt), GFP_KERNEL); - if (!cmpnt) { + component = kzalloc(sizeof(*component), GFP_KERNEL); + if (!component) { dev_err(dev, "ASoC: Failed to allocate memory\n"); return -ENOMEM; } - ret = snd_soc_component_initialize(cmpnt, cmpnt_drv, dev); + ret = snd_soc_component_initialize(component, component_driver, dev); if (ret) goto err_free; - cmpnt->ignore_pmdown_time = true; - cmpnt->registered_as_component = true; + component->ignore_pmdown_time = true; + component->registered_as_component = true; - ret = snd_soc_register_dais(cmpnt, dai_drv, num_dai, true); + ret = snd_soc_register_dais(component, dai_drv, num_dai, true); if (ret < 0) { dev_err(dev, "ASoC: Failed to register DAIs: %d\n", ret); goto err_cleanup; } - snd_soc_component_add(cmpnt); + snd_soc_component_add(component); return 0; err_cleanup: - snd_soc_component_cleanup(cmpnt); + snd_soc_component_cleanup(component); err_free: - kfree(cmpnt); + kfree(component); return ret; } EXPORT_SYMBOL_GPL(snd_soc_register_component); @@ -3327,22 +3327,22 @@ EXPORT_SYMBOL_GPL(snd_soc_register_component); */ void snd_soc_unregister_component(struct device *dev) { - struct snd_soc_component *cmpnt; + struct snd_soc_component *component; mutex_lock(&client_mutex); - list_for_each_entry(cmpnt, &component_list, list) { - if (dev == cmpnt->dev && cmpnt->registered_as_component) + list_for_each_entry(component, &component_list, list) { + if (dev == component->dev && component->registered_as_component) goto found; } mutex_unlock(&client_mutex); return; found: - snd_soc_tplg_component_remove(cmpnt, SND_SOC_TPLG_INDEX_ALL); - snd_soc_component_del_unlocked(cmpnt); + snd_soc_tplg_component_remove(component, SND_SOC_TPLG_INDEX_ALL); + snd_soc_component_del_unlocked(component); mutex_unlock(&client_mutex); - snd_soc_component_cleanup(cmpnt); - kfree(cmpnt); + snd_soc_component_cleanup(component); + kfree(component); } 
EXPORT_SYMBOL_GPL(snd_soc_unregister_component); -- cgit v1.2.3 From 2606706e4d7b89ebf13f35895a7dfe00e394e782 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 30 May 2017 13:48:22 +0200 Subject: backlight: gpio_backlight: Delete pdata inversion The option to invert the output of the GPIO (active low) is not used by the only platform still using platform data to set up a GPIO backlight (one SH board). Delete the option as we do not expect to expand the use of board files for this driver, and GPIO descriptors intrinsically keep track of any signal inversion. Signed-off-by: Linus Walleij Acked-by: Daniel Thompson Reviewed-by: Laurent Pinchart Signed-off-by: Lee Jones --- drivers/video/backlight/gpio_backlight.c | 15 ++------------- include/linux/platform_data/gpio_backlight.h | 1 - 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/video/backlight/gpio_backlight.c b/drivers/video/backlight/gpio_backlight.c index 5ffaff1e4142..e470da95d806 100644 --- a/drivers/video/backlight/gpio_backlight.c +++ b/drivers/video/backlight/gpio_backlight.c @@ -25,7 +25,6 @@ struct gpio_backlight { struct device *fbdev; struct gpio_desc *gpiod; - int active; int def_value; }; @@ -39,8 +38,7 @@ static int gpio_backlight_update_status(struct backlight_device *bl) bl->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK)) brightness = 0; - gpiod_set_value_cansleep(gbl->gpiod, - brightness ? gbl->active : !gbl->active); + gpiod_set_value_cansleep(gbl->gpiod, brightness); return 0; } @@ -69,8 +67,6 @@ static int gpio_backlight_probe_dt(struct platform_device *pdev, gbl->def_value = of_property_read_bool(np, "default-on"); flags = gbl->def_value ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; - /* GPIO descriptors keep track of inversion */ - gbl->active = 1; gbl->gpiod = devm_gpiod_get(dev, NULL, flags); if (IS_ERR(gbl->gpiod)) { @@ -121,15 +117,8 @@ static int gpio_backlight_probe(struct platform_device *pdev) unsigned long flags = GPIOF_DIR_OUT; gbl->fbdev = pdata->fbdev; - gbl->active = pdata->active_low ? 0 : 1; gbl->def_value = pdata->def_value; - - if (gbl->active) - flags |= gbl->def_value ? - GPIOF_INIT_HIGH : GPIOF_INIT_LOW; - else - flags |= gbl->def_value ? - GPIOF_INIT_LOW : GPIOF_INIT_HIGH; + flags |= gbl->def_value ? GPIOF_INIT_HIGH : GPIOF_INIT_LOW; ret = devm_gpio_request_one(gbl->dev, pdata->gpio, flags, pdata ? 
pdata->name : "backlight"); diff --git a/include/linux/platform_data/gpio_backlight.h b/include/linux/platform_data/gpio_backlight.h index 5ae0d9c80d4d..683d90453c41 100644 --- a/include/linux/platform_data/gpio_backlight.h +++ b/include/linux/platform_data/gpio_backlight.h @@ -14,7 +14,6 @@ struct gpio_backlight_platform_data { struct device *fbdev; int gpio; int def_value; - bool active_low; const char *name; }; -- cgit v1.2.3 From f02a60924c221985fb8b634734d6610706fa779a Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 6 Aug 2017 18:44:07 +0200 Subject: uapi linux/dlm_netlink.h: include linux/dlmconstants.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compilation error: error: ‘DLM_RESNAME_MAXLEN’ undeclared here (not in a function) char resource_name[DLM_RESNAME_MAXLEN]; Signed-off-by: Mikko Rapeli Signed-off-by: David Teigland --- include/uapi/linux/dlm_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/dlm_netlink.h b/include/uapi/linux/dlm_netlink.h index 647c8ef27227..ef1e2e08769a 100644 --- a/include/uapi/linux/dlm_netlink.h +++ b/include/uapi/linux/dlm_netlink.h @@ -10,6 +10,7 @@ #define _DLM_NETLINK_H #include +#include enum { DLM_STATUS_WAITING = 1, -- cgit v1.2.3 From 2572ac53c46f58e500b9d8d0f99785666038c590 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Aug 2017 10:15:17 +0200 Subject: net: sched: make type an argument for ndo_setup_tc Since the type is always present, push it to be a separate argument to ndo_setup_tc. On the way, name the type enum and use it for arg type. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 6 +++--- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 7 ++++--- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 5 +++-- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 ++++--- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 7 ++++--- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 7 ++++--- .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 6 +++--- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 7 ++++--- drivers/net/ethernet/intel/i40e/i40e_main.c | 6 +++--- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 9 +++++---- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 6 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 9 +++++---- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +++++----- drivers/net/ethernet/netronome/nfp/bpf/main.c | 5 +++-- drivers/net/ethernet/netronome/nfp/flower/main.h | 3 ++- drivers/net/ethernet/netronome/nfp/flower/offload.c | 5 +++-- drivers/net/ethernet/netronome/nfp/nfp_app.h | 6 ++++-- drivers/net/ethernet/netronome/nfp/nfp_port.c | 7 ++++--- drivers/net/ethernet/netronome/nfp/nfp_port.h | 5 +++-- drivers/net/ethernet/sfc/efx.h | 5 +++-- drivers/net/ethernet/sfc/falcon/efx.h | 5 +++-- drivers/net/ethernet/sfc/falcon/tx.c | 7 ++++--- drivers/net/ethernet/sfc/tx.c | 7 ++++--- drivers/net/ethernet/ti/netcp_core.c | 7 ++++--- include/linux/netdevice.h | 9 +++++---- net/dsa/slave.c | 6 +++--- net/sched/cls_bpf.c | 4 ++-- net/sched/cls_flower.c | 14 ++++++-------- net/sched/cls_matchall.c | 10 ++++------ net/sched/cls_u32.c | 20 ++++++++------------ net/sched/sch_mqprio.c | 13 ++++++------- 32 files changed, 125 insertions(+), 113 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index ecef3ee87b17..6a6ea3bdd056 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1918,14 +1918,14 @@ static void xgbe_poll_controller(struct net_device *netdev) } #endif /* End CONFIG_NET_POLL_CONTROLLER */ -static int xgbe_setup_tc(struct net_device *netdev, u32 handle, u32 chain_index, - __be16 proto, +static int xgbe_setup_tc(struct net_device *netdev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc_to_netdev) { struct xgbe_prv_data *pdata = netdev_priv(netdev); u8 tc; - if (tc_to_netdev->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc_to_netdev->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 67fe3d826566..4395d1cac86f 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4284,10 +4284,11 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc) return 0; } -int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) +int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index c26688d2f326..1ac4eb0d3413 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -486,8 +486,9 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev); /* setup_tc callback */ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc); -int __bnx2x_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc); +int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc); int bnx2x_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 156fb374522b..b98d9f33d9af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7237,10 +7237,11 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) return 0; } -static int bnxt_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *ntc) +static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *ntc) { - if (ntc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index fdf220aa08d6..89d2b0cd9869 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2889,8 +2889,9 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) return err; } -static int cxgb_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) +static int cxgb_setup_tc(struct net_device *dev, enum 
tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { struct port_info *pi = netdev2pinfo(dev); struct adapter *adap = netdev2adap(dev); @@ -2906,7 +2907,7 @@ static int cxgb_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, } if (TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS) && - tc->type == TC_SETUP_CLSU32) { + type == TC_SETUP_CLSU32) { switch (tc->cls_u32->command) { case TC_CLSU32_NEW_KNODE: case TC_CLSU32_REPLACE_KNODE: diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 550ea1ec7b6c..d86d766777c8 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -342,14 +342,15 @@ static void dpaa_get_stats64(struct net_device *net_dev, } } -static int dpaa_setup_tc(struct net_device *net_dev, u32 handle, - u32 chain_index, __be16 proto, struct tc_to_netdev *tc) +static int dpaa_setup_tc(struct net_device *net_dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { struct dpaa_priv *priv = netdev_priv(net_dev); u8 num_tc; int i; - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index ad9481c7ceae..6bb1e35336cc 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -1219,11 +1219,11 @@ static int hns3_setup_tc(struct net_device *netdev, u8 tc) return 0; } -static int hns3_nic_setup_tc(struct net_device *dev, u32 handle, - u32 chain_index, __be16 protocol, +static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 protocol, struct tc_to_netdev *tc) { - if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO) + if (handle != TC_H_ROOT || type != TC_SETUP_MQPRIO) return -EINVAL; return hns3_setup_tc(dev, tc->mqprio->num_tc); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 5e37387c7082..b30190639e78 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1265,10 +1265,11 @@ err_queueing_scheme: return err; } -static int __fm10k_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) +static int __fm10k_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 4104944ea367..7d47a718f922 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5656,11 +5656,11 @@ exit: return ret; } -static int __i40e_setup_tc(struct net_device *netdev, u32 handle, - u32 chain_index, __be16 proto, +static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 091fcc7e6e43..d39db9711df6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9226,8 +9226,9 @@ free_jump: return err; } -static int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) +static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { struct ixgbe_adapter *adapter = netdev_priv(dev); @@ -9235,7 +9236,7 @@ static int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, return -EOPNOTSUPP; if (TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS) && - tc->type == TC_SETUP_CLSU32) { + type == TC_SETUP_CLSU32) { switch (tc->cls_u32->command) { case TC_CLSU32_NEW_KNODE: case TC_CLSU32_REPLACE_KNODE: @@ -9255,7 +9256,7 @@ static int __ixgbe_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, } } - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 3a291fc1780a..5c33550765ed 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -130,11 +130,11 @@ out: return err; } -static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, - u32 chain_index, __be16 proto, +static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; if (tc->mqprio->num_tc && tc->mqprio->num_tc != MLX4_EN_NUM_UP_HIGH) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 57f31fa478ce..4052e225f1dc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3027,8 +3027,8 @@ out: return err; } -static int mlx5e_ndo_setup_tc(struct net_device *dev, u32 handle, - u32 chain_index, __be16 proto, +static int mlx5e_ndo_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { struct mlx5e_priv *priv = netdev_priv(dev); @@ -3039,7 +3039,7 @@ static int mlx5e_ndo_setup_tc(struct net_device *dev, u32 handle, if (chain_index) return -EOPNOTSUPP; - switch (tc->type) { + switch (type) { case TC_SETUP_CLSFLOWER: switch (tc->cls_flower->command) { case TC_CLSFLOWER_REPLACE: @@ -3054,7 +3054,7 @@ static int mlx5e_ndo_setup_tc(struct net_device *dev, u32 handle, } mqprio: - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 45e60be9c277..d44049ed5371 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -651,7 +651,8 @@ static int mlx5e_rep_get_phys_port_name(struct net_device *dev, return 0; } -static int mlx5e_rep_ndo_setup_tc(struct net_device *dev, u32 handle, +static int mlx5e_rep_ndo_setup_tc(struct net_device *dev, + enum tc_setup_type type, u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { @@ -664,15 +665,15 @@ static int mlx5e_rep_ndo_setup_tc(struct 
net_device *dev, u32 handle, struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct net_device *uplink_dev = mlx5_eswitch_get_uplink_netdev(esw); - return uplink_dev->netdev_ops->ndo_setup_tc(uplink_dev, handle, - chain_index, + return uplink_dev->netdev_ops->ndo_setup_tc(uplink_dev, type, + handle, chain_index, proto, tc); } if (chain_index) return -EOPNOTSUPP; - switch (tc->type) { + switch (type) { case TC_SETUP_CLSFLOWER: switch (tc->cls_flower->command) { case TC_CLSFLOWER_REPLACE: diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 66d511d45c25..155424266cbf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1693,8 +1693,8 @@ static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, kfree(mall_tc_entry); } -static int mlxsw_sp_setup_tc(struct net_device *dev, u32 handle, - u32 chain_index, __be16 proto, +static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); @@ -1703,7 +1703,7 @@ static int mlxsw_sp_setup_tc(struct net_device *dev, u32 handle, if (chain_index) return -EOPNOTSUPP; - switch (tc->type) { + switch (type) { case TC_SETUP_MATCHALL: switch (tc->cls_mall->command) { case TC_CLSMATCHALL_REPLACE: @@ -1733,9 +1733,9 @@ static int mlxsw_sp_setup_tc(struct net_device *dev, u32 handle, default: return -EOPNOTSUPP; } + default: + return -EOPNOTSUPP; } - - return -EOPNOTSUPP; } static const struct net_device_ops mlxsw_sp_port_netdev_ops = { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index afbdf5fd4e4f..788880808a6e 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -121,7 +121,8 @@ static void nfp_bpf_vnic_clean(struct nfp_app *app, struct nfp_net *nn) } static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev, - u32 handle, __be16 proto, struct tc_to_netdev *tc) + enum tc_setup_type type, u32 handle, __be16 proto, + struct tc_to_netdev *tc) { struct nfp_net *nn = netdev_priv(netdev); @@ -130,7 +131,7 @@ static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev, if (proto != htons(ETH_P_ALL)) return -EOPNOTSUPP; - if (tc->type == TC_SETUP_CLSBPF && nfp_net_ebpf_capable(nn)) { + if (type == TC_SETUP_CLSBPF && nfp_net_ebpf_capable(nn)) { if (!nn->dp.bpf_offload_xdp) return nfp_net_bpf_offload(nn, tc->cls_bpf); else diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 9e64c048e83f..314e6e8ba649 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -135,7 +135,8 @@ int nfp_flower_metadata_init(struct nfp_app *app); void nfp_flower_metadata_cleanup(struct nfp_app *app); int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, - u32 handle, __be16 proto, struct tc_to_netdev *tc); + enum tc_setup_type type, u32 handle, __be16 proto, + struct tc_to_netdev *tc); int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct nfp_fl_key_ls *key_ls, struct net_device *netdev, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 4ad10bd5e139..d045cf8c140a 100644 --- 
a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -385,7 +385,8 @@ nfp_flower_repr_offload(struct nfp_app *app, struct net_device *netdev, } int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, - u32 handle, __be16 proto, struct tc_to_netdev *tc) + enum tc_setup_type type, u32 handle, __be16 proto, + struct tc_to_netdev *tc) { if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS)) return -EOPNOTSUPP; @@ -393,7 +394,7 @@ int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, if (!eth_proto_is_802_3(proto)) return -EOPNOTSUPP; - if (tc->type != TC_SETUP_CLSFLOWER) + if (type != TC_SETUP_CLSFLOWER) return -EINVAL; return nfp_flower_repr_offload(app, netdev, tc->cls_flower); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index 5d714e10d9a9..b3b03bb9d907 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -109,7 +109,8 @@ struct nfp_app_type { void (*ctrl_msg_rx)(struct nfp_app *app, struct sk_buff *skb); int (*setup_tc)(struct nfp_app *app, struct net_device *netdev, - u32 handle, __be16 proto, struct tc_to_netdev *tc); + enum tc_setup_type type, u32 handle, __be16 proto, + struct tc_to_netdev *tc); bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn); int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); @@ -238,12 +239,13 @@ static inline bool nfp_app_tc_busy(struct nfp_app *app, struct nfp_net *nn) static inline int nfp_app_setup_tc(struct nfp_app *app, struct net_device *netdev, + enum tc_setup_type type, u32 handle, __be16 proto, struct tc_to_netdev *tc) { if (!app || !app->type->setup_tc) return -EOPNOTSUPP; - return app->type->setup_tc(app, netdev, handle, proto, tc); + return app->type->setup_tc(app, netdev, type, handle, proto, tc); } static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c index d16a7b78ba9b..9d776f982352 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c @@ -88,8 +88,9 @@ const struct switchdev_ops nfp_port_switchdev_ops = { .switchdev_port_attr_get = nfp_port_attr_get, }; -int nfp_port_setup_tc(struct net_device *netdev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) +int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { struct nfp_port *port; @@ -100,7 +101,7 @@ int nfp_port_setup_tc(struct net_device *netdev, u32 handle, u32 chain_index, if (!port) return -EOPNOTSUPP; - return nfp_app_setup_tc(port->app, netdev, handle, proto, tc); + return nfp_app_setup_tc(port->app, netdev, type, handle, proto, tc); } struct nfp_port * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index 56c76926c82a..239c5401000c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -109,8 +109,9 @@ struct nfp_port { extern const struct switchdev_ops nfp_port_switchdev_ops; -int nfp_port_setup_tc(struct net_device *netdev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc); +int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev 
*tc); struct nfp_port *nfp_port_from_netdev(struct net_device *netdev); struct nfp_port * diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index fcea9371ab7f..e41a7179bc05 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -32,8 +32,9 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, struct net_device *net_dev); netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); -int efx_setup_tc(struct net_device *net_dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc); +int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc); unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); extern unsigned int efx_piobuf_size; extern bool efx_separate_tx_channels; diff --git a/drivers/net/ethernet/sfc/falcon/efx.h b/drivers/net/ethernet/sfc/falcon/efx.h index e5a7a40cc8b6..f3bc67ec1f30 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.h +++ b/drivers/net/ethernet/sfc/falcon/efx.h @@ -32,8 +32,9 @@ netdev_tx_t ef4_hard_start_xmit(struct sk_buff *skb, struct net_device *net_dev); netdev_tx_t ef4_enqueue_skb(struct ef4_tx_queue *tx_queue, struct sk_buff *skb); void ef4_xmit_done(struct ef4_tx_queue *tx_queue, unsigned int index); -int ef4_setup_tc(struct net_device *net_dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc); +int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc); unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx); extern bool ef4_separate_tx_channels; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index f1520a404ac6..6c4752694c1f 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -425,8 +425,9 @@ void ef4_init_tx_queue_core_txq(struct ef4_tx_queue *tx_queue) efx->n_tx_channels : 0)); } -int ef4_setup_tc(struct net_device *net_dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *ntc) +int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *ntc) { struct ef4_nic *efx = netdev_priv(net_dev); struct ef4_channel *channel; @@ -434,7 +435,7 @@ int ef4_setup_tc(struct net_device *net_dev, u32 handle, u32 chain_index, unsigned tc, num_tc; int rc; - if (ntc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; num_tc = ntc->mqprio->num_tc; diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 02d41eb4a8e9..0c08c10d751c 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -653,8 +653,9 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue) efx->n_tx_channels : 0)); } -int efx_setup_tc(struct net_device *net_dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *ntc) +int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *ntc) { struct efx_nic *efx = netdev_priv(net_dev); struct efx_channel *channel; @@ -662,7 +663,7 @@ int efx_setup_tc(struct net_device *net_dev, u32 handle, u32 chain_index, unsigned tc, num_tc; int rc; - if (ntc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; num_tc = ntc->mqprio->num_tc; diff --git 
a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 9d52c3a78621..cb21742f6177 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1877,8 +1877,9 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb, return 0; } -static int netcp_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) +static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 proto, + struct tc_to_netdev *tc) { u8 num_tc; int i; @@ -1886,7 +1887,7 @@ static int netcp_setup_tc(struct net_device *dev, u32 handle, u32 chain_index, /* setup tc must be called under rtnl lock */ ASSERT_RTNL(); - if (tc->type != TC_SETUP_MQPRIO) + if (type != TC_SETUP_MQPRIO) return -EINVAL; tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3a3cdc1b1f31..e4238e540544 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -774,7 +774,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, /* These structures hold the attributes of qdisc and classifiers * that are being passed to the netdevice through the setup_tc op. */ -enum { +enum tc_setup_type { TC_SETUP_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, @@ -785,7 +785,6 @@ enum { struct tc_cls_u32_offload; struct tc_to_netdev { - unsigned int type; union { struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; @@ -978,8 +977,9 @@ struct xfrmdev_ops { * with PF and querying it may introduce a theoretical security risk. * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); - * int (*ndo_setup_tc)(struct net_device *dev, u32 handle, u32 chain_index, - * __be16 protocol, struct tc_to_netdev *tc); + * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, + * u32 handle, u32 chain_index, __be16 protocol, + * struct tc_to_netdev *tc); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif * tx queues stopped. 
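 * (A sketch, not part of the patch: with the explicit type argument, a
 * driver's dispatch typically becomes
 *
 *	switch (type) {
 *	case TC_SETUP_MQPRIO:
 *		return foo_setup_mqprio(dev, tc->mqprio);
 *	default:
 *		return -EOPNOTSUPP;
 *	}
 *
 * where foo_setup_mqprio() is a hypothetical driver helper.)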
This allows the netdevice to perform queue @@ -1227,6 +1227,7 @@ struct net_device_ops { struct net_device *dev, int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, + enum tc_setup_type type, u32 handle, u32 chain_index, __be16 protocol, struct tc_to_netdev *tc); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 75c5c5808220..b4b63c20ec80 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -863,8 +863,8 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, kfree(mall_tc_entry); } -static int dsa_slave_setup_tc(struct net_device *dev, u32 handle, - u32 chain_index, __be16 protocol, +static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, + u32 handle, u32 chain_index, __be16 protocol, struct tc_to_netdev *tc) { bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS); @@ -872,7 +872,7 @@ static int dsa_slave_setup_tc(struct net_device *dev, u32 handle, if (chain_index) return -EOPNOTSUPP; - switch (tc->type) { + switch (type) { case TC_SETUP_MATCHALL: switch (tc->cls_mall->command) { case TC_CLSMATCHALL_REPLACE: diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index cf248c3137ad..e2bf2753173d 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -151,7 +151,6 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct tc_to_netdev offload; int err; - offload.type = TC_SETUP_CLSBPF; offload.cls_bpf = &bpf_offload; bpf_offload.command = cmd; @@ -161,7 +160,8 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.exts_integrated = prog->exts_integrated; bpf_offload.gen_flags = prog->gen_flags; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, + tp->q->handle, tp->chain->index, tp->protocol, &offload); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7ab524fc43f9..ddeed17d2024 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -236,11 +236,10 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) offload.prio = tp->prio; offload.cookie = (unsigned long)f; - tc->type = TC_SETUP_CLSFLOWER; tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->chain->index, - tp->protocol, tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, tp->q->handle, + tp->chain->index, tp->protocol, tc); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -273,11 +272,11 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, offload.key = &f->mkey; offload.exts = &f->exts; - tc->type = TC_SETUP_CLSFLOWER; tc->cls_flower = &offload; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->chain->index, tp->protocol, tc); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + tp->q->handle, tp->chain->index, + tp->protocol, tc); if (!err) f->flags |= TCA_CLS_FLAGS_IN_HW; @@ -300,10 +299,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) offload.cookie = (unsigned long)f; offload.exts = &f->exts; - tc->type = TC_SETUP_CLSFLOWER; tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + dev->netdev_ops->ndo_setup_tc(dev, TC_CLSFLOWER_STATS, tp->q->handle, tp->chain->index, tp->protocol, tc); } diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index f35177b48373..6ffe0b82ab83 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -58,14 +58,13 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, struct 
tc_cls_matchall_offload mall_offload = {0}; int err; - offload.type = TC_SETUP_MATCHALL; offload.cls_mall = &mall_offload; offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; offload.cls_mall->exts = &head->exts; offload.cls_mall->cookie = cookie; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->chain->index, + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MATCHALL, + tp->q->handle, tp->chain->index, tp->protocol, &offload); if (!err) head->flags |= TCA_CLS_FLAGS_IN_HW; @@ -81,14 +80,13 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, struct tc_to_netdev offload; struct tc_cls_matchall_offload mall_offload = {0}; - offload.type = TC_SETUP_MATCHALL; offload.cls_mall = &mall_offload; offload.cls_mall->command = TC_CLSMATCHALL_DESTROY; offload.cls_mall->exts = NULL; offload.cls_mall->cookie = cookie; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->chain->index, - tp->protocol, &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MATCHALL, tp->q->handle, + tp->chain->index, tp->protocol, &offload); } static void mall_destroy(struct tcf_proto *tp) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 9fd243799fe7..d1bae4cc749f 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -434,15 +434,14 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) struct tc_cls_u32_offload u32_offload = {0}; struct tc_to_netdev offload; - offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; if (tc_should_offload(dev, tp, 0)) { offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; offload.cls_u32->knode.handle = handle; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->chain->index, tp->protocol, - &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, + tp->q->handle, tp->chain->index, + tp->protocol, &offload); } } @@ -457,7 +456,6 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, if (!tc_should_offload(dev, tp, flags)) return tc_skip_sw(flags) ? 
-EINVAL : 0; - offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; offload.cls_u32->command = TC_CLSU32_NEW_HNODE; @@ -465,7 +463,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, offload.cls_u32->hnode.handle = h->handle; offload.cls_u32->hnode.prio = h->prio; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, tp->q->handle, tp->chain->index, tp->protocol, &offload); if (tc_skip_sw(flags)) @@ -480,7 +478,6 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) struct tc_cls_u32_offload u32_offload = {0}; struct tc_to_netdev offload; - offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; if (tc_should_offload(dev, tp, 0)) { @@ -489,9 +486,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) offload.cls_u32->hnode.handle = h->handle; offload.cls_u32->hnode.prio = h->prio; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->chain->index, tp->protocol, - &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, + tp->q->handle, tp->chain->index, + tp->protocol, &offload); } } @@ -503,7 +500,6 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, struct tc_to_netdev offload; int err; - offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; if (!tc_should_offload(dev, tp, flags)) @@ -524,7 +520,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (n->ht_down) offload.cls_u32->knode.link_handle = n->ht_down->handle; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, tp->q->handle, tp->chain->index, tp->protocol, &offload); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index e0c02725cd48..329610ce4dfe 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -40,10 +40,10 @@ static void mqprio_destroy(struct Qdisc *sch) if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { struct tc_mqprio_qopt offload = { 0 }; - struct tc_to_netdev tc = { .type = TC_SETUP_MQPRIO, - { .mqprio = &offload } }; + struct tc_to_netdev tc = { { .mqprio = &offload } }; - dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, 0, &tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + sch->handle, 0, 0, &tc); } else { netdev_set_num_tc(dev, 0); } @@ -149,11 +149,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) */ if (qopt->hw) { struct tc_mqprio_qopt offload = *qopt; - struct tc_to_netdev tc = { .type = TC_SETUP_MQPRIO, - { .mqprio = &offload } }; + struct tc_to_netdev tc = { { .mqprio = &offload } }; - err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, - 0, 0, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + sch->handle, 0, 0, &tc); if (err) return err; -- cgit v1.2.3 From ade9b6588420b335851951702ab975c975b0c1b2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Aug 2017 10:15:18 +0200 Subject: net: sched: rename TC_SETUP_MATCHALL to TC_SETUP_CLSMATCHALL In order to be aligned with the rest of the types, rename TC_SETUP_MATCHALL to TC_SETUP_CLSMATCHALL. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- include/linux/netdevice.h | 2 +- net/dsa/slave.c | 2 +- net/sched/cls_matchall.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 155424266cbf..6438c38e7a68 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1704,7 +1704,7 @@ static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, return -EOPNOTSUPP; switch (type) { - case TC_SETUP_MATCHALL: + case TC_SETUP_CLSMATCHALL: switch (tc->cls_mall->command) { case TC_CLSMATCHALL_REPLACE: return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e4238e540544..f8051a36f900 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -778,7 +778,7 @@ enum tc_setup_type { TC_SETUP_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, - TC_SETUP_MATCHALL, + TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, }; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b4b63c20ec80..453f6ddcd023 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -873,7 +873,7 @@ static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, return -EOPNOTSUPP; switch (type) { - case TC_SETUP_MATCHALL: + case TC_SETUP_CLSMATCHALL: switch (tc->cls_mall->command) { case TC_CLSMATCHALL_REPLACE: return dsa_slave_add_cls_matchall(dev, protocol, diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 6ffe0b82ab83..a8853ada22f6 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -63,7 +63,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, offload.cls_mall->exts = &head->exts; offload.cls_mall->cookie = cookie; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MATCHALL, + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, tp->q->handle, tp->chain->index, tp->protocol, &offload); if (!err) @@ -85,7 +85,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, offload.cls_mall->exts = NULL; offload.cls_mall->cookie = cookie; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MATCHALL, tp->q->handle, + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, tp->q->handle, tp->chain->index, tp->protocol, &offload); } -- cgit v1.2.3 From 3e0e82664322290a59189f7c2bcb39b0de932505 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Aug 2017 10:15:19 +0200 Subject: net: sched: make egress_dev flag part of flower offload struct Since this is specific to flower now, make it part of the flower offload struct. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- include/linux/netdevice.h | 1 - include/net/pkt_cls.h | 1 + net/sched/cls_flower.c | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index d44049ed5371..0e6bab182071 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -661,7 +661,7 @@ static int mlx5e_rep_ndo_setup_tc(struct net_device *dev, if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS)) return -EOPNOTSUPP; - if (tc->egress_dev) { + if (type == TC_SETUP_CLSFLOWER && tc->cls_flower->egress_dev) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct net_device *uplink_dev = mlx5_eswitch_get_uplink_netdev(esw); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f8051a36f900..bd49dbaee84e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -792,7 +792,6 @@ struct tc_to_netdev { struct tc_cls_bpf_offload *cls_bpf; struct tc_mqprio_qopt *mqprio; }; - bool egress_dev; }; /* These structures hold the attributes of xdp state that are being passed diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e0c54f111467..8213acdfdf5a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -504,6 +504,7 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; + bool egress_dev; }; enum tc_matchall_command { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ddeed17d2024..52deeed2b7f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -259,7 +259,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, return tc_skip_sw(f->flags) ? -EINVAL : 0; } dev = f->hw_dev; - tc->egress_dev = true; + offload.egress_dev = true; } else { f->hw_dev = dev; } -- cgit v1.2.3 From 5fd9fc4e207dba0c05cafe78417952b4c4ca02dc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Aug 2017 10:15:29 +0200 Subject: net: sched: push cls related args into cls_common structure As ndo_setup_tc is a generic offload op for the whole tc subsystem, it does not really make sense to have cls-specific args. So move them under the cls_common structure, which is embedded in all cls structs. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 1 - drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 1 - drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 1 - drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 - drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 14 +++++------- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c | 7 +++--- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h | 6 ++---- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 1 - .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 1 - drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 17 ++++++--------- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 ++++------ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 15 +++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 25 ++++++++-------------- drivers/net/ethernet/mellanox/mlxsw/spectrum.h | 2 +- .../net/ethernet/mellanox/mlxsw/spectrum_flower.c | 2 +- drivers/net/ethernet/netronome/nfp/bpf/main.c | 10 +++++---- drivers/net/ethernet/netronome/nfp/flower/main.h | 3 +-- .../net/ethernet/netronome/nfp/flower/offload.c | 12 ++++++----- drivers/net/ethernet/netronome/nfp/nfp_app.h | 6 ++---- drivers/net/ethernet/netronome/nfp/nfp_port.c | 6 +----- drivers/net/ethernet/netronome/nfp/nfp_port.h | 1 - drivers/net/ethernet/sfc/efx.h | 1 - drivers/net/ethernet/sfc/falcon/efx.h | 1 - drivers/net/ethernet/sfc/falcon/tx.c | 1 - drivers/net/ethernet/sfc/tx.c | 1 - drivers/net/ethernet/ti/netcp_core.c | 1 - include/linux/netdevice.h | 3 --- include/net/pkt_cls.h | 19 ++++++++++++++++ net/dsa/slave.c | 14 +++++------- net/sched/cls_bpf.c | 7 ++---- net/sched/cls_flower.c | 13 ++++++----- net/sched/cls_matchall.c | 8 +++---- net/sched/cls_u32.c | 20 +++++++---------- net/sched/sch_mqprio.c | 6 ++---- 39 files changed, 101 insertions(+), 144 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 6a6ea3bdd056..bbb7bfe0be7f 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1919,7 +1919,6 @@ static void xgbe_poll_controller(struct net_device *netdev) #endif /* End CONFIG_NET_POLL_CONTROLLER */ static int xgbe_setup_tc(struct net_device *netdev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc_to_netdev) { struct xgbe_prv_data *pdata = netdev_priv(netdev); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 4395d1cac86f..257cf4be0162 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4285,7 +4285,6 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc) } int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { if (type != TC_SETUP_MQPRIO) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 1ac4eb0d3413..04eb95043023 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -487,7 +487,6 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev); /* setup_tc callback */ int 
bnx2x_setup_tc(struct net_device *dev, u8 num_tc); int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc); int bnx2x_get_vf_config(struct net_device *dev, int vf, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b98d9f33d9af..1545b88c545d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7238,7 +7238,6 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) } static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *ntc) { if (type != TC_SETUP_MQPRIO) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 651229070113..13199317c8e0 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2890,27 +2890,24 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, int index, u32 rate) } static int cxgb_setup_tc_cls_u32(struct net_device *dev, - enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_cls_u32_offload *cls_u32) { - if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || - chain_index) + if (TC_H_MAJ(cls_u32->common.handle) != TC_H_MAJ(TC_H_INGRESS) || + cls_u32->common.chain_index) return -EOPNOTSUPP; switch (cls_u32->command) { case TC_CLSU32_NEW_KNODE: case TC_CLSU32_REPLACE_KNODE: - return cxgb4_config_knode(dev, proto, cls_u32); + return cxgb4_config_knode(dev, cls_u32); case TC_CLSU32_DELETE_KNODE: - return cxgb4_delete_knode(dev, proto, cls_u32); + return cxgb4_delete_knode(dev, cls_u32); default: return -EOPNOTSUPP; } } static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { struct port_info *pi = netdev2pinfo(dev); @@ -2925,8 +2922,7 @@ static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_CLSU32: - return cxgb_setup_tc_cls_u32(dev, type, handle, chain_index, - proto, tc->cls_u32); + return cxgb_setup_tc_cls_u32(dev, tc->cls_u32); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c index 6f734c52ef25..48970ba08bdc 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c @@ -146,11 +146,11 @@ static int fill_action_fields(struct adapter *adap, return 0; } -int cxgb4_config_knode(struct net_device *dev, __be16 protocol, - struct tc_cls_u32_offload *cls) +int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) { const struct cxgb4_match_field *start, *link_start = NULL; struct adapter *adapter = netdev2adap(dev); + __be16 protocol = cls->common.protocol; struct ch_filter_specification fs; struct cxgb4_tc_u32_table *t; struct cxgb4_link *link; @@ -338,8 +338,7 @@ out: return ret; } -int cxgb4_delete_knode(struct net_device *dev, __be16 protocol, - struct tc_cls_u32_offload *cls) +int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls) { struct adapter *adapter = netdev2adap(dev); unsigned int filter_id, max_tids, i, j; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h index 021261a41c13..70a07b7cca56 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h 
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h @@ -44,10 +44,8 @@ static inline bool can_tc_u32_offload(struct net_device *dev) return (dev->features & NETIF_F_HW_TC) && adap->tc_u32 ? true : false; } -int cxgb4_config_knode(struct net_device *dev, __be16 protocol, - struct tc_cls_u32_offload *cls); -int cxgb4_delete_knode(struct net_device *dev, __be16 protocol, - struct tc_cls_u32_offload *cls); +int cxgb4_config_knode(struct net_device *dev, struct tc_cls_u32_offload *cls); +int cxgb4_delete_knode(struct net_device *dev, struct tc_cls_u32_offload *cls); void cxgb4_cleanup_tc_u32(struct adapter *adapter); struct cxgb4_tc_u32_table *cxgb4_init_tc_u32(struct adapter *adap); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index d86d766777c8..3827608cec29 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -343,7 +343,6 @@ static void dpaa_get_stats64(struct net_device *net_dev, } static int dpaa_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { struct dpaa_priv *priv = netdev_priv(net_dev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index ef5795923b0c..dc64d751db24 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -1220,7 +1220,6 @@ static int hns3_setup_tc(struct net_device *netdev, u8 tc) } static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 protocol, struct tc_to_netdev *tc) { if (type != TC_SETUP_MQPRIO) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index b30190639e78..71004b8eff95 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1266,7 +1266,6 @@ err_queueing_scheme: } static int __fm10k_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { if (type != TC_SETUP_MQPRIO) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 7d47a718f922..97d8bb2e8320 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5657,7 +5657,6 @@ exit: } static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { if (type != TC_SETUP_MQPRIO) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 35db198199b0..0a350314d76b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8851,7 +8851,6 @@ static int ixgbe_delete_clsu32(struct ixgbe_adapter *adapter, } static int ixgbe_configure_clsu32_add_hnode(struct ixgbe_adapter *adapter, - __be16 protocol, struct tc_cls_u32_offload *cls) { u32 uhtid = TC_U32_USERHTID(cls->hnode.handle); @@ -9037,9 +9036,9 @@ static int ixgbe_clsu32_build_input(struct ixgbe_fdir_filter *input, } static int ixgbe_configure_clsu32(struct ixgbe_adapter *adapter, - __be16 protocol, struct tc_cls_u32_offload *cls) { + __be16 protocol = cls->common.protocol; u32 loc = cls->knode.handle & 0xfffff; struct ixgbe_hw *hw = &adapter->hw; 
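The recurring transformation in this commit, condensed into a standalone sketch (the foo_* names are invented): a per-type driver helper drops its handle/chain_index/protocol parameters and instead reads them from the common header embedded at the start of every cls offload struct:

static int foo_setup_tc_cls_u32(struct net_device *dev,
				struct tc_cls_u32_offload *cls)
{
	__be16 protocol = cls->common.protocol;	/* was a function argument */

	if (TC_H_MAJ(cls->common.handle) != TC_H_MAJ(TC_H_INGRESS) ||
	    cls->common.chain_index)
		return -EOPNOTSUPP;

	return foo_config_knode(dev, protocol, cls);
}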
struct ixgbe_mat_field *field_ptr; @@ -9227,25 +9226,23 @@ free_jump: } static int ixgbe_setup_tc_cls_u32(struct net_device *dev, - u32 handle, u32 chain_index, __be16 proto, struct tc_cls_u32_offload *cls_u32) { struct ixgbe_adapter *adapter = netdev_priv(dev); - if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || - chain_index) + if (TC_H_MAJ(cls_u32->common.handle) != TC_H_MAJ(TC_H_INGRESS) || + cls_u32->common.chain_index) return -EOPNOTSUPP; switch (cls_u32->command) { case TC_CLSU32_NEW_KNODE: case TC_CLSU32_REPLACE_KNODE: - return ixgbe_configure_clsu32(adapter, proto, cls_u32); + return ixgbe_configure_clsu32(adapter, cls_u32); case TC_CLSU32_DELETE_KNODE: return ixgbe_delete_clsu32(adapter, cls_u32); case TC_CLSU32_NEW_HNODE: case TC_CLSU32_REPLACE_HNODE: - return ixgbe_configure_clsu32_add_hnode(adapter, proto, - cls_u32); + return ixgbe_configure_clsu32_add_hnode(adapter, cls_u32); case TC_CLSU32_DELETE_HNODE: return ixgbe_configure_clsu32_del_hnode(adapter, cls_u32); default: @@ -9261,13 +9258,11 @@ static int ixgbe_setup_tc_mqprio(struct net_device *dev, } static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { switch (type) { case TC_SETUP_CLSU32: - return ixgbe_setup_tc_cls_u32(dev, handle, chain_index, proto, - tc->cls_u32); + return ixgbe_setup_tc_cls_u32(dev, tc->cls_u32); case TC_SETUP_MQPRIO: return ixgbe_setup_tc_mqprio(dev, tc->mqprio); default: diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 5c33550765ed..e81083e25ba6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -131,7 +131,6 @@ out: } static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { if (type != TC_SETUP_MQPRIO) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index adf35da74a85..15f2a942962a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3032,18 +3032,17 @@ out: } static int mlx5e_setup_tc_cls_flower(struct net_device *dev, - u32 handle, u32 chain_index, __be16 proto, struct tc_cls_flower_offload *cls_flower) { struct mlx5e_priv *priv = netdev_priv(dev); - if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || - chain_index) + if (TC_H_MAJ(cls_flower->common.handle) != TC_H_MAJ(TC_H_INGRESS) || + cls_flower->common.chain_index) return -EOPNOTSUPP; switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: - return mlx5e_configure_flower(priv, proto, cls_flower); + return mlx5e_configure_flower(priv, cls_flower); case TC_CLSFLOWER_DESTROY: return mlx5e_delete_flower(priv, cls_flower); case TC_CLSFLOWER_STATS: @@ -3054,13 +3053,11 @@ static int mlx5e_setup_tc_cls_flower(struct net_device *dev, } static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { switch (type) { case TC_SETUP_CLSFLOWER: - return mlx5e_setup_tc_cls_flower(dev, handle, chain_index, - proto, tc->cls_flower); + return mlx5e_setup_tc_cls_flower(dev, tc->cls_flower); case TC_SETUP_MQPRIO: return mlx5e_setup_tc_mqprio(dev, tc->mqprio); default: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index e6cc642f6d8c..e5cf2e7ae052 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -652,15 +652,13 @@ static int mlx5e_rep_get_phys_port_name(struct net_device *dev, } static int mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, - u32 handle, u32 chain_index, - __be16 proto, struct tc_to_netdev *tc) { struct tc_cls_flower_offload *cls_flower = tc->cls_flower; struct mlx5e_priv *priv = netdev_priv(dev); - if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || - chain_index) + if (TC_H_MAJ(cls_flower->common.handle) != TC_H_MAJ(TC_H_INGRESS) || + cls_flower->common.chain_index) return -EOPNOTSUPP; if (cls_flower->egress_dev) { @@ -668,13 +666,12 @@ static int mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, dev = mlx5_eswitch_get_uplink_netdev(esw); return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - handle, chain_index, - proto, tc); + tc); } switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: - return mlx5e_configure_flower(priv, proto, cls_flower); + return mlx5e_configure_flower(priv, cls_flower); case TC_CLSFLOWER_DESTROY: return mlx5e_delete_flower(priv, cls_flower); case TC_CLSFLOWER_STATS: @@ -685,13 +682,11 @@ static int mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, } static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { switch (type) { case TC_SETUP_CLSFLOWER: - return mlx5e_rep_setup_tc_cls_flower(dev, handle, chain_index, - proto, tc); + return mlx5e_rep_setup_tc_cls_flower(dev, tc); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 78f50d9f621d..3b10d3df7627 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1939,7 +1939,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts, return err; } -int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, +int mlx5e_configure_flower(struct mlx5e_priv *priv, struct tc_cls_flower_offload *f) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index ecbe30d808ae..5a0f4a487855 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -38,7 +38,7 @@ int mlx5e_tc_init(struct mlx5e_priv *priv); void mlx5e_tc_cleanup(struct mlx5e_priv *priv); -int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol, +int mlx5e_configure_flower(struct mlx5e_priv *priv, struct tc_cls_flower_offload *f); int mlx5e_delete_flower(struct mlx5e_priv *priv, struct tc_cls_flower_offload *f); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index f333d086932d..1ca3204f5543 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1617,11 +1617,11 @@ mlxsw_sp_port_del_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port) } static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - __be16 protocol, struct tc_cls_matchall_offload *f, bool ingress) { struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; + __be16 protocol = f->common.protocol; const struct tc_action *a; LIST_HEAD(actions); int err; @@ -1694,18 +1694,16 @@ static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, } 
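A detail worth spelling out, since nearly every handler in this commit repeats it: the ingress test compares only the major part of the qdisc handle now carried in the common header. TC_H_INGRESS is 0xFFFFFFF1U and TC_H_MAJ() masks with 0xFFFF0000U, so any filter attached under the ingress qdisc matches:

/* from include/uapi/linux/pkt_sched.h */
#define TC_H_MAJ_MASK (0xFFFF0000U)
#define TC_H_MAJ(h) ((h) & TC_H_MAJ_MASK)

/* TC_H_MAJ(TC_H_INGRESS) == 0xFFFF0000, whatever the minor bits are */
bool ingress = TC_H_MAJ(f->common.handle) == TC_H_MAJ(TC_H_INGRESS);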
static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - u32 handle, u32 chain_index, - __be16 proto, struct tc_cls_matchall_offload *f) { - bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS); + bool ingress = TC_H_MAJ(f->common.handle) == TC_H_MAJ(TC_H_INGRESS); - if (chain_index) + if (f->common.chain_index) return -EOPNOTSUPP; switch (f->command) { case TC_CLSMATCHALL_REPLACE: - return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, proto, f, + return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f, ingress); case TC_CLSMATCHALL_DESTROY: mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f); @@ -1717,18 +1715,16 @@ static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_port *mlxsw_sp_port, - u32 handle, u32 chain_index, __be16 proto, struct tc_cls_flower_offload *f) { - bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS); + bool ingress = TC_H_MAJ(f->common.handle) == TC_H_MAJ(TC_H_INGRESS); - if (chain_index) + if (f->common.chain_index) return -EOPNOTSUPP; switch (f->command) { case TC_CLSFLOWER_REPLACE: - return mlxsw_sp_flower_replace(mlxsw_sp_port, ingress, - proto, f); + return mlxsw_sp_flower_replace(mlxsw_sp_port, ingress, f); case TC_CLSFLOWER_DESTROY: mlxsw_sp_flower_destroy(mlxsw_sp_port, ingress, f); return 0; @@ -1740,19 +1736,16 @@ mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); switch (type) { case TC_SETUP_CLSMATCHALL: - return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, handle, - chain_index, proto, + return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, tc->cls_mall); case TC_SETUP_CLSFLOWER: - return mlxsw_sp_setup_tc_cls_flower(mlxsw_sp_port, handle, - chain_index, proto, + return mlxsw_sp_setup_tc_cls_flower(mlxsw_sp_port, tc->cls_flower); default: return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index e848f06e34e6..8452d1db2f3f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -508,7 +508,7 @@ extern const struct mlxsw_sp_acl_ops mlxsw_sp_acl_tcam_ops; /* spectrum_flower.c */ int mlxsw_sp_flower_replace(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress, - __be16 protocol, struct tc_cls_flower_offload *f); + struct tc_cls_flower_offload *f); void mlxsw_sp_flower_destroy(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress, struct tc_cls_flower_offload *f); int mlxsw_sp_flower_stats(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 9be48d2e43ca..021b6c0076c0 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -368,7 +368,7 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, } int mlxsw_sp_flower_replace(struct mlxsw_sp_port *mlxsw_sp_port, bool ingress, - __be16 protocol, struct tc_cls_flower_offload *f) + struct tc_cls_flower_offload *f) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct net_device *dev = mlxsw_sp_port->dev; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index d7975dcecb40..152a7abb58ed 100644 
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -121,20 +121,22 @@ static void nfp_bpf_vnic_clean(struct nfp_app *app, struct nfp_net *nn) } static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, u32 handle, __be16 proto, + enum tc_setup_type type, struct tc_to_netdev *tc) { + struct tc_cls_bpf_offload *cls_bpf = tc->cls_bpf; struct nfp_net *nn = netdev_priv(netdev); if (type != TC_SETUP_CLSBPF || !nfp_net_ebpf_capable(nn) || - TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || - proto != htons(ETH_P_ALL)) + TC_H_MAJ(cls_bpf->common.handle) != TC_H_MAJ(TC_H_INGRESS) || + cls_bpf->common.protocol != htons(ETH_P_ALL) || + cls_bpf->common.chain_index) return -EOPNOTSUPP; if (nn->dp.bpf_offload_xdp) return -EBUSY; - return nfp_net_bpf_offload(nn, tc->cls_bpf); + return nfp_net_bpf_offload(nn, cls_bpf); } static bool nfp_bpf_tc_busy(struct nfp_app *app, struct nfp_net *nn) diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index 314e6e8ba649..eb94d08e35cf 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -135,8 +135,7 @@ int nfp_flower_metadata_init(struct nfp_app *app); void nfp_flower_metadata_cleanup(struct nfp_app *app); int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, u32 handle, __be16 proto, - struct tc_to_netdev *tc); + enum tc_setup_type type, struct tc_to_netdev *tc); int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct nfp_fl_key_ls *key_ls, struct net_device *netdev, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 58af438a95c1..8197836c650d 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -385,13 +385,15 @@ nfp_flower_repr_offload(struct nfp_app *app, struct net_device *netdev, } int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, u32 handle, __be16 proto, - struct tc_to_netdev *tc) + enum tc_setup_type type, struct tc_to_netdev *tc) { + struct tc_cls_flower_offload *cls_flower = tc->cls_flower; + if (type != TC_SETUP_CLSFLOWER || - TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS) || - !eth_proto_is_802_3(proto)) + TC_H_MAJ(cls_flower->common.handle) != TC_H_MAJ(TC_H_INGRESS) || + !eth_proto_is_802_3(cls_flower->common.protocol) || + cls_flower->common.chain_index) return -EOPNOTSUPP; - return nfp_flower_repr_offload(app, netdev, tc->cls_flower); + return nfp_flower_repr_offload(app, netdev, cls_flower); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index b3b03bb9d907..7a2f950b149c 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -109,8 +109,7 @@ struct nfp_app_type { void (*ctrl_msg_rx)(struct nfp_app *app, struct sk_buff *skb); int (*setup_tc)(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, u32 handle, __be16 proto, - struct tc_to_netdev *tc); + enum tc_setup_type type, struct tc_to_netdev *tc); bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn); int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); @@ -240,12 +239,11 @@ static inline bool nfp_app_tc_busy(struct nfp_app *app, struct nfp_net *nn) static 
inline int nfp_app_setup_tc(struct nfp_app *app, struct net_device *netdev, enum tc_setup_type type, - u32 handle, __be16 proto, struct tc_to_netdev *tc) { if (!app || !app->type->setup_tc) return -EOPNOTSUPP; - return app->type->setup_tc(app, netdev, type, handle, proto, tc); + return app->type->setup_tc(app, netdev, type, tc); } static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c index 9d776f982352..e8abab2b912e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c @@ -89,19 +89,15 @@ const struct switchdev_ops nfp_port_switchdev_ops = { }; int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { struct nfp_port *port; - if (chain_index) - return -EOPNOTSUPP; - port = nfp_port_from_netdev(netdev); if (!port) return -EOPNOTSUPP; - return nfp_app_setup_tc(port->app, netdev, type, handle, proto, tc); + return nfp_app_setup_tc(port->app, netdev, type, tc); } struct nfp_port * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index 239c5401000c..252f06d4307f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -110,7 +110,6 @@ struct nfp_port { extern const struct switchdev_ops nfp_port_switchdev_ops; int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc); struct nfp_port *nfp_port_from_netdev(struct net_device *netdev); diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index e41a7179bc05..b0c6004db138 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -33,7 +33,6 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc); unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); extern unsigned int efx_piobuf_size; diff --git a/drivers/net/ethernet/sfc/falcon/efx.h b/drivers/net/ethernet/sfc/falcon/efx.h index f3bc67ec1f30..4497511fc914 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.h +++ b/drivers/net/ethernet/sfc/falcon/efx.h @@ -33,7 +33,6 @@ netdev_tx_t ef4_hard_start_xmit(struct sk_buff *skb, netdev_tx_t ef4_enqueue_skb(struct ef4_tx_queue *tx_queue, struct sk_buff *skb); void ef4_xmit_done(struct ef4_tx_queue *tx_queue, unsigned int index); int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc); unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx); extern bool ef4_separate_tx_channels; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index 6c4752694c1f..447519ac3fa4 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -426,7 +426,6 @@ void ef4_init_tx_queue_core_txq(struct ef4_tx_queue *tx_queue) } int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *ntc) { struct ef4_nic *efx = netdev_priv(net_dev); diff --git 
a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 0c08c10d751c..d17af918ac50 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -654,7 +654,6 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue) } int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *ntc) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index cb21742f6177..14f91b285f00 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1878,7 +1878,6 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb, } static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 proto, struct tc_to_netdev *tc) { u8 num_tc; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bd49dbaee84e..6e2f7e38cf8e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -977,7 +977,6 @@ struct xfrmdev_ops { * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, - * u32 handle, u32 chain_index, __be16 protocol, * struct tc_to_netdev *tc); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif @@ -1227,8 +1226,6 @@ struct net_device_ops { int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, - __be16 protocol, struct tc_to_netdev *tc); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 8213acdfdf5a..ffaddf72108e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -405,6 +405,21 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +struct tc_cls_common_offload { + u32 handle; + u32 chain_index; + __be16 protocol; +}; + +static inline void +tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, + const struct tcf_proto *tp) +{ + cls_common->handle = tp->q->handle; + cls_common->chain_index = tp->chain->index; + cls_common->protocol = tp->protocol; +} + struct tc_cls_u32_knode { struct tcf_exts *exts; struct tc_u32_sel *sel; @@ -431,6 +446,7 @@ enum tc_clsu32_command { }; struct tc_cls_u32_offload { + struct tc_cls_common_offload common; /* knode values */ enum tc_clsu32_command command; union { @@ -497,6 +513,7 @@ enum tc_fl_command { }; struct tc_cls_flower_offload { + struct tc_cls_common_offload common; enum tc_fl_command command; u32 prio; unsigned long cookie; @@ -513,6 +530,7 @@ enum tc_matchall_command { }; struct tc_cls_matchall_offload { + struct tc_cls_common_offload common; enum tc_matchall_command command; struct tcf_exts *exts; unsigned long cookie; @@ -526,6 +544,7 @@ enum tc_clsbpf_command { }; struct tc_cls_bpf_offload { + struct tc_cls_common_offload common; enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e76d576b941d..5e01e9271619 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -774,12 +774,12 @@ dsa_slave_mall_tc_entry_find(struct dsa_slave_priv *p, } static int dsa_slave_add_cls_matchall(struct net_device 
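The classifier-side counterpart, condensed from the cls_matchall changes nearby (all identifiers are the real ones): each classifier seeds the shared header from its tcf_proto exactly once, so handle, chain index and protocol stop travelling as separate ndo_setup_tc arguments:

struct tc_to_netdev offload;
struct tc_cls_matchall_offload mall_offload = {0};

tc_cls_common_offload_init(&mall_offload.common, tp); /* handle, chain, proto */
mall_offload.command = TC_CLSMATCHALL_REPLACE;
offload.cls_mall = &mall_offload;
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, &offload);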
*dev, - __be16 protocol, struct tc_cls_matchall_offload *cls, bool ingress) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; + __be16 protocol = cls->common.protocol; struct dsa_switch *ds = p->dp->ds; struct net *net = dev_net(dev); struct dsa_slave_priv *to_p; @@ -864,18 +864,16 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, } static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev, - u32 handle, u32 chain_index, - __be16 protocol, struct tc_cls_matchall_offload *cls) { - bool ingress = TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS); + bool ingress = TC_H_MAJ(cls->common.handle) == TC_H_MAJ(TC_H_INGRESS); - if (chain_index) + if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: - return dsa_slave_add_cls_matchall(dev, protocol, cls, ingress); + return dsa_slave_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_slave_del_cls_matchall(dev, cls); return 0; @@ -885,13 +883,11 @@ static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev, } static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, - u32 handle, u32 chain_index, __be16 protocol, struct tc_to_netdev *tc) { switch (type) { case TC_SETUP_CLSMATCHALL: - return dsa_slave_setup_tc_cls_matchall(dev, handle, chain_index, - protocol, tc->cls_mall); + return dsa_slave_setup_tc_cls_matchall(dev, tc->cls_mall); default: return -EOPNOTSUPP; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index e2bf2753173d..dde8efdcee3b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -153,6 +153,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, offload.cls_bpf = &bpf_offload; + tc_cls_common_offload_init(&bpf_offload.common, tp); bpf_offload.command = cmd; bpf_offload.exts = &prog->exts; bpf_offload.prog = prog->filter; @@ -160,11 +161,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, bpf_offload.exts_integrated = prog->exts_integrated; bpf_offload.gen_flags = prog->gen_flags; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, - tp->q->handle, - tp->chain->index, - tp->protocol, &offload); - + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, &offload); if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE)) prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 52deeed2b7f5..1fdf2889ba9f 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -232,14 +232,14 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) if (!tc_can_offload(dev, tp)) return; + tc_cls_common_offload_init(&offload.common, tp); offload.command = TC_CLSFLOWER_DESTROY; offload.prio = tp->prio; offload.cookie = (unsigned long)f; tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, tp->q->handle, - tp->chain->index, tp->protocol, tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, tc); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -264,6 +264,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, f->hw_dev = dev; } + tc_cls_common_offload_init(&offload.common, tp); offload.command = TC_CLSFLOWER_REPLACE; offload.prio = tp->prio; offload.cookie = (unsigned long)f; @@ -274,9 +275,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, tc->cls_flower = &offload; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - tp->q->handle, tp->chain->index, - 
tp->protocol, tc); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, tc); if (!err) f->flags |= TCA_CLS_FLAGS_IN_HW; @@ -294,6 +293,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) if (!tc_can_offload(dev, tp)) return; + tc_cls_common_offload_init(&offload.common, tp); offload.command = TC_CLSFLOWER_STATS; offload.prio = tp->prio; offload.cookie = (unsigned long)f; @@ -301,8 +301,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) tc->cls_flower = &offload; - dev->netdev_ops->ndo_setup_tc(dev, TC_CLSFLOWER_STATS, tp->q->handle, - tp->chain->index, tp->protocol, tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_CLSFLOWER_STATS, tc); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index a8853ada22f6..174c700160ca 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -58,14 +58,14 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, struct tc_cls_matchall_offload mall_offload = {0}; int err; + tc_cls_common_offload_init(&mall_offload.common, tp); offload.cls_mall = &mall_offload; offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; offload.cls_mall->exts = &head->exts; offload.cls_mall->cookie = cookie; err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, - tp->q->handle, tp->chain->index, - tp->protocol, &offload); + &offload); if (!err) head->flags |= TCA_CLS_FLAGS_IN_HW; @@ -80,13 +80,13 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, struct tc_to_netdev offload; struct tc_cls_matchall_offload mall_offload = {0}; + tc_cls_common_offload_init(&mall_offload.common, tp); offload.cls_mall = &mall_offload; offload.cls_mall->command = TC_CLSMATCHALL_DESTROY; offload.cls_mall->exts = NULL; offload.cls_mall->cookie = cookie; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, tp->q->handle, - tp->chain->index, tp->protocol, &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, &offload); } static void mall_destroy(struct tcf_proto *tp) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d1bae4cc749f..c0f59c471523 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -437,11 +437,10 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) offload.cls_u32 = &u32_offload; if (tc_should_offload(dev, tp, 0)) { + tc_cls_common_offload_init(&u32_offload.common, tp); offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; offload.cls_u32->knode.handle = handle; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, - tp->q->handle, tp->chain->index, - tp->protocol, &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); } } @@ -458,14 +457,13 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, offload.cls_u32 = &u32_offload; + tc_cls_common_offload_init(&u32_offload.common, tp); offload.cls_u32->command = TC_CLSU32_NEW_HNODE; offload.cls_u32->hnode.divisor = h->divisor; offload.cls_u32->hnode.handle = h->handle; offload.cls_u32->hnode.prio = h->prio; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, tp->q->handle, - tp->chain->index, tp->protocol, - &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); if (tc_skip_sw(flags)) return err; @@ -481,14 +479,13 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) offload.cls_u32 = &u32_offload; if (tc_should_offload(dev, tp, 0)) { + tc_cls_common_offload_init(&u32_offload.common, tp); 
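For completeness, a sketch of the command space a TC_SETUP_CLSU32 handler dispatches on once it receives the offload struct, in the style of the cxgb4/ixgbe changes in this commit (struct foo_adapter and the foo_* helpers are invented):

static int foo_cls_u32_cmd(struct foo_adapter *adap,
			   struct tc_cls_u32_offload *cls)
{
	switch (cls->command) {
	case TC_CLSU32_NEW_KNODE:
	case TC_CLSU32_REPLACE_KNODE:
		return foo_config_knode(adap, cls);
	case TC_CLSU32_DELETE_KNODE:
		return foo_delete_knode(adap, cls);
	case TC_CLSU32_NEW_HNODE:
	case TC_CLSU32_REPLACE_HNODE:
		return foo_config_hnode(adap, cls);
	case TC_CLSU32_DELETE_HNODE:
		return foo_delete_hnode(adap, cls);
	default:
		return -EOPNOTSUPP;
	}
}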
offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; offload.cls_u32->hnode.divisor = h->divisor; offload.cls_u32->hnode.handle = h->handle; offload.cls_u32->hnode.prio = h->prio; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, - tp->q->handle, tp->chain->index, - tp->protocol, &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); } } @@ -505,6 +502,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (!tc_should_offload(dev, tp, flags)) return tc_skip_sw(flags) ? -EINVAL : 0; + tc_cls_common_offload_init(&u32_offload.common, tp); offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; offload.cls_u32->knode.handle = n->handle; offload.cls_u32->knode.fshift = n->fshift; @@ -520,9 +518,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, if (n->ht_down) offload.cls_u32->knode.link_handle = n->ht_down->handle; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, tp->q->handle, - tp->chain->index, tp->protocol, - &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); if (!err) n->flags |= TCA_CLS_FLAGS_IN_HW; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 329610ce4dfe..09b577dde49c 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -42,8 +42,7 @@ static void mqprio_destroy(struct Qdisc *sch) struct tc_mqprio_qopt offload = { 0 }; struct tc_to_netdev tc = { { .mqprio = &offload } }; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, - sch->handle, 0, 0, &tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &tc); } else { netdev_set_num_tc(dev, 0); } @@ -151,8 +150,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct tc_mqprio_qopt offload = *qopt; struct tc_to_netdev tc = { { .mqprio = &offload } }; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, - sch->handle, 0, 0, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &tc); if (err) return err; -- cgit v1.2.3 From d7c1c8d2e53be974b5c72e31d7d35f6d9737fe84 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Aug 2017 10:15:30 +0200 Subject: net: sched: move prio into cls_common prio is not cls_flower specific, but it is meaningful for all classifiers. Seems that only mlxsw cares about the value. Obviously, cls offload in other drivers is broken. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 2 +- include/net/pkt_cls.h | 3 ++- net/sched/cls_flower.c | 3 --- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 021b6c0076c0..95428b41c50f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -270,7 +270,7 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } - mlxsw_sp_acl_rulei_priority(rulei, f->prio); + mlxsw_sp_acl_rulei_priority(rulei, f->common.prio); if (dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_CONTROL)) { struct flow_dissector_key_control *key = diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index ffaddf72108e..572083af02ac 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -409,6 +409,7 @@ struct tc_cls_common_offload { u32 handle; u32 chain_index; __be16 protocol; + u32 prio; }; static inline void @@ -418,6 +419,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->handle = tp->q->handle; cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; + cls_common->prio = tp->prio; } struct tc_cls_u32_knode { @@ -515,7 +517,6 @@ enum tc_fl_command { struct tc_cls_flower_offload { struct tc_cls_common_offload common; enum tc_fl_command command; - u32 prio; unsigned long cookie; struct flow_dissector *dissector; struct fl_flow_key *mask; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1fdf2889ba9f..ccdf2f5014ca 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -234,7 +234,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) tc_cls_common_offload_init(&offload.common, tp); offload.command = TC_CLSFLOWER_DESTROY; - offload.prio = tp->prio; offload.cookie = (unsigned long)f; tc->cls_flower = &offload; @@ -266,7 +265,6 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, tc_cls_common_offload_init(&offload.common, tp); offload.command = TC_CLSFLOWER_REPLACE; - offload.prio = tp->prio; offload.cookie = (unsigned long)f; offload.dissector = dissector; offload.mask = mask; @@ -295,7 +293,6 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) tc_cls_common_offload_init(&offload.common, tp); offload.command = TC_CLSFLOWER_STATS; - offload.prio = tp->prio; offload.cookie = (unsigned long)f; offload.exts = &f->exts; -- cgit v1.2.3 From de4784ca030fed17d527dbb2bb4e21328b12de94 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 7 Aug 2017 10:15:32 +0200 Subject: net: sched: get rid of struct tc_to_netdev Get rid of struct tc_to_netdev which is now just unnecessary container and rather pass per-type structures down to drivers directly. Along with that, consolidate the naming of per-type structure variables in cls_*. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 7 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 8 ++- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 4 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 7 +- .../net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c | 6 +- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 8 ++- drivers/net/ethernet/intel/i40e/i40e_main.c | 8 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 10 +-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 12 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 2 + drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 8 +-- drivers/net/ethernet/netronome/nfp/bpf/main.c | 5 +- drivers/net/ethernet/netronome/nfp/flower/main.h | 4 +- .../net/ethernet/netronome/nfp/flower/offload.c | 4 +- drivers/net/ethernet/netronome/nfp/nfp_app.h | 8 +-- drivers/net/ethernet/netronome/nfp/nfp_port.c | 4 +- drivers/net/ethernet/netronome/nfp/nfp_port.h | 3 +- drivers/net/ethernet/sfc/efx.h | 2 +- drivers/net/ethernet/sfc/falcon/efx.h | 2 +- drivers/net/ethernet/sfc/falcon/tx.c | 7 +- drivers/net/ethernet/sfc/tx.c | 7 +- drivers/net/ethernet/ti/netcp_core.c | 7 +- include/linux/netdevice.h | 19 +---- net/dsa/slave.c | 4 +- net/sched/cls_bpf.c | 21 +++--- net/sched/cls_flower.c | 54 ++++++-------- net/sched/cls_matchall.c | 27 +++---- net/sched/cls_u32.c | 83 ++++++++++------------ net/sched/sch_mqprio.c | 13 ++-- 33 files changed, 174 insertions(+), 202 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 37d3e5b65d94..2fd9b80b39b0 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1919,16 +1919,17 @@ static void xgbe_poll_controller(struct net_device *netdev) #endif /* End CONFIG_NET_POLL_CONTROLLER */ static int xgbe_setup_tc(struct net_device *netdev, enum tc_setup_type type, - struct tc_to_netdev *tc_to_netdev) + void *type_data) { struct xgbe_prv_data *pdata = netdev_priv(netdev); + struct tc_mqprio_qopt *mqprio = type_data; u8 tc; if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - tc_to_netdev->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - tc = tc_to_netdev->mqprio->num_tc; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + tc = mqprio->num_tc; if (tc > pdata->hw_feat.tc_cnt) return -EINVAL; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 8687afc24698..1216c1f1e052 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -4285,14 +4285,16 @@ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc) } int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; + if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - return bnx2x_setup_tc(dev, tc->mqprio->num_tc); + return bnx2x_setup_tc(dev, mqprio->num_tc); } /* called with rtnl_lock */ diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 04eb95043023..a5265e1344f1 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -487,7 +487,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev); /* setup_tc callback */ int bnx2x_setup_tc(struct net_device *dev, u8 num_tc); int __bnx2x_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc); + void *type_data); int bnx2x_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a78f72a53042..6e14fc4fe2c8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7238,14 +7238,16 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) } static int bnxt_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *ntc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; + if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - return bnxt_setup_mq_tc(dev, ntc->mqprio->num_tc); + return bnxt_setup_mq_tc(dev, mqprio->num_tc); } #ifdef CONFIG_RFS_ACCEL diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 13199317c8e0..d80b20d695e0 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2908,7 +2908,7 @@ static int cxgb_setup_tc_cls_u32(struct net_device *dev, } static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { struct port_info *pi = netdev2pinfo(dev); struct adapter *adap = netdev2adap(dev); @@ -2922,7 +2922,7 @@ static int cxgb_setup_tc(struct net_device *dev, enum tc_setup_type type, switch (type) { case TC_SETUP_CLSU32: - return cxgb_setup_tc_cls_u32(dev, tc->cls_u32); + return cxgb_setup_tc_cls_u32(dev, type_data); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index bfb44c95a7ec..733d54caabb6 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -343,17 +343,18 @@ static void dpaa_get_stats64(struct net_device *net_dev, } static int dpaa_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { struct dpaa_priv *priv = netdev_priv(net_dev); + struct tc_mqprio_qopt *mqprio = type_data; u8 num_tc; int i; if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - num_tc = tc->mqprio->num_tc; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + num_tc = mqprio->num_tc; if (num_tc == priv->num_tc) return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index aa43ebda9a00..069ae426aa24 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -1220,12 +1220,14 @@ static int hns3_setup_tc(struct net_device *netdev, u8 tc) } static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; + if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - return hns3_setup_tc(dev, tc->mqprio->num_tc); + return hns3_setup_tc(dev, mqprio->num_tc); } static int hns3_vlan_rx_add_vid(struct net_device *netdev, diff --git 
a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 70888129200b..e69d49d91d67 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1266,14 +1266,16 @@ err_queueing_scheme: } static int __fm10k_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; + if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - return fm10k_setup_tc(dev, tc->mqprio->num_tc); + return fm10k_setup_tc(dev, mqprio->num_tc); } static void fm10k_assign_l2_accel(struct fm10k_intfc *interface, diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1f4633830c79..a7e5a76703e7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -5657,14 +5657,16 @@ exit: } static int __i40e_setup_tc(struct net_device *netdev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; + if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - return i40e_setup_tc(netdev, tc->mqprio->num_tc); + return i40e_setup_tc(netdev, mqprio->num_tc); } /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 0a350314d76b..c6b132476de4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9258,13 +9258,13 @@ static int ixgbe_setup_tc_mqprio(struct net_device *dev, } static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { switch (type) { case TC_SETUP_CLSU32: - return ixgbe_setup_tc_cls_u32(dev, tc->cls_u32); + return ixgbe_setup_tc_cls_u32(dev, type_data); case TC_SETUP_MQPRIO: - return ixgbe_setup_tc_mqprio(dev, tc->mqprio); + return ixgbe_setup_tc_mqprio(dev, type_data); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 1667e86ac05d..6e67ca7aa7f5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -131,17 +131,19 @@ out: } static int __mlx4_en_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; + if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - if (tc->mqprio->num_tc && tc->mqprio->num_tc != MLX4_EN_NUM_UP_HIGH) + if (mqprio->num_tc && mqprio->num_tc != MLX4_EN_NUM_UP_HIGH) return -EINVAL; - tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - return mlx4_en_alloc_tx_queue_per_tc(dev, tc->mqprio->num_tc); + return mlx4_en_alloc_tx_queue_per_tc(dev, mqprio->num_tc); } #ifdef CONFIG_RFS_ACCEL diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 15f2a942962a..ae0916238b7b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3053,13 +3053,13 @@ static int mlx5e_setup_tc_cls_flower(struct net_device *dev, } static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + 
void *type_data) { switch (type) { case TC_SETUP_CLSFLOWER: - return mlx5e_setup_tc_cls_flower(dev, tc->cls_flower); + return mlx5e_setup_tc_cls_flower(dev, type_data); case TC_SETUP_MQPRIO: - return mlx5e_setup_tc_mqprio(dev, tc->mqprio); + return mlx5e_setup_tc_mqprio(dev, type_data); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index e5cf2e7ae052..3df994d1e173 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -651,10 +651,10 @@ static int mlx5e_rep_get_phys_port_name(struct net_device *dev, return 0; } -static int mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, - struct tc_to_netdev *tc) +static int +mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, + struct tc_cls_flower_offload *cls_flower) { - struct tc_cls_flower_offload *cls_flower = tc->cls_flower; struct mlx5e_priv *priv = netdev_priv(dev); if (TC_H_MAJ(cls_flower->common.handle) != TC_H_MAJ(TC_H_INGRESS) || @@ -666,7 +666,7 @@ static int mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, dev = mlx5_eswitch_get_uplink_netdev(esw); return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - tc); + cls_flower); } switch (cls_flower->command) { @@ -682,11 +682,11 @@ static int mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, } static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { switch (type) { case TC_SETUP_CLSFLOWER: - return mlx5e_rep_setup_tc_cls_flower(dev, tc); + return mlx5e_rep_setup_tc_cls_flower(dev, type_data); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 5a0f4a487855..2917d964ffc3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -33,6 +33,8 @@ #ifndef __MLX5_EN_TC_H__ #define __MLX5_EN_TC_H__ +#include + #define MLX5E_TC_FLOW_ID_MASK 0x0000ffff int mlx5e_tc_init(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 1ca3204f5543..eb7c4549f464 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1736,17 +1736,15 @@ mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_port *mlxsw_sp_port, } static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); switch (type) { case TC_SETUP_CLSMATCHALL: - return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, - tc->cls_mall); + return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data); case TC_SETUP_CLSFLOWER: - return mlxsw_sp_setup_tc_cls_flower(mlxsw_sp_port, - tc->cls_flower); + return mlxsw_sp_setup_tc_cls_flower(mlxsw_sp_port, type_data); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index 152a7abb58ed..f981f60ec306 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -121,10 +121,9 @@ static void nfp_bpf_vnic_clean(struct nfp_app *app, struct nfp_net *nn) } static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, - struct tc_to_netdev *tc) + enum tc_setup_type type, void *type_data) { - 
struct tc_cls_bpf_offload *cls_bpf = tc->cls_bpf; + struct tc_cls_bpf_offload *cls_bpf = type_data; struct nfp_net *nn = netdev_priv(netdev); if (type != TC_SETUP_CLSBPF || !nfp_net_ebpf_capable(nn) || diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h index eb94d08e35cf..71e4f4f4e9ba 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/main.h +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h @@ -38,8 +38,8 @@ #include #include #include +#include -struct tc_to_netdev; struct net_device; struct nfp_app; @@ -135,7 +135,7 @@ int nfp_flower_metadata_init(struct nfp_app *app); void nfp_flower_metadata_cleanup(struct nfp_app *app); int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, struct tc_to_netdev *tc); + enum tc_setup_type type, void *type_data); int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, struct nfp_fl_key_ls *key_ls, struct net_device *netdev, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 8197836c650d..01767c7376d5 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -385,9 +385,9 @@ nfp_flower_repr_offload(struct nfp_app *app, struct net_device *netdev, } int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, struct tc_to_netdev *tc) + enum tc_setup_type type, void *type_data) { - struct tc_cls_flower_offload *cls_flower = tc->cls_flower; + struct tc_cls_flower_offload *cls_flower = type_data; if (type != TC_SETUP_CLSFLOWER || TC_H_MAJ(cls_flower->common.handle) != TC_H_MAJ(TC_H_INGRESS) || diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index 7a2f950b149c..f34e8778fae2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -42,7 +42,6 @@ struct bpf_prog; struct net_device; struct pci_dev; struct sk_buff; -struct tc_to_netdev; struct sk_buff; struct nfp_app; struct nfp_cpp; @@ -109,7 +108,7 @@ struct nfp_app_type { void (*ctrl_msg_rx)(struct nfp_app *app, struct sk_buff *skb); int (*setup_tc)(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, struct tc_to_netdev *tc); + enum tc_setup_type type, void *type_data); bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn); int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn, struct bpf_prog *prog); @@ -238,12 +237,11 @@ static inline bool nfp_app_tc_busy(struct nfp_app *app, struct nfp_net *nn) static inline int nfp_app_setup_tc(struct nfp_app *app, struct net_device *netdev, - enum tc_setup_type type, - struct tc_to_netdev *tc) + enum tc_setup_type type, void *type_data) { if (!app || !app->type->setup_tc) return -EOPNOTSUPP; - return app->type->setup_tc(app, netdev, type, tc); + return app->type->setup_tc(app, netdev, type, type_data); } static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c index e8abab2b912e..0cf65e57addb 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c @@ -89,7 +89,7 @@ const struct switchdev_ops nfp_port_switchdev_ops = { }; int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { struct 
nfp_port *port; @@ -97,7 +97,7 @@ int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, if (!port) return -EOPNOTSUPP; - return nfp_app_setup_tc(port->app, netdev, type, tc); + return nfp_app_setup_tc(port->app, netdev, type, type_data); } struct nfp_port * diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index 252f06d4307f..c88e376dcf0f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -36,7 +36,6 @@ #include -struct tc_to_netdev; struct net_device; struct nfp_app; struct nfp_pf; @@ -110,7 +109,7 @@ struct nfp_port { extern const struct switchdev_ops nfp_port_switchdev_ops; int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type, - struct tc_to_netdev *tc); + void *type_data); struct nfp_port *nfp_port_from_netdev(struct net_device *netdev); struct nfp_port * diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index b0c6004db138..d407adf59610 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -33,7 +33,7 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb, netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb); void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index); int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - struct tc_to_netdev *tc); + void *type_data); unsigned int efx_tx_max_skb_descs(struct efx_nic *efx); extern unsigned int efx_piobuf_size; extern bool efx_separate_tx_channels; diff --git a/drivers/net/ethernet/sfc/falcon/efx.h b/drivers/net/ethernet/sfc/falcon/efx.h index 4497511fc914..4f3bb30661ea 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.h +++ b/drivers/net/ethernet/sfc/falcon/efx.h @@ -33,7 +33,7 @@ netdev_tx_t ef4_hard_start_xmit(struct sk_buff *skb, netdev_tx_t ef4_enqueue_skb(struct ef4_tx_queue *tx_queue, struct sk_buff *skb); void ef4_xmit_done(struct ef4_tx_queue *tx_queue, unsigned int index); int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - struct tc_to_netdev *tc); + void *type_data); unsigned int ef4_tx_max_skb_descs(struct ef4_nic *efx); extern bool ef4_separate_tx_channels; diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c index 0f125e15143a..6a75f4140a4b 100644 --- a/drivers/net/ethernet/sfc/falcon/tx.c +++ b/drivers/net/ethernet/sfc/falcon/tx.c @@ -426,9 +426,10 @@ void ef4_init_tx_queue_core_txq(struct ef4_tx_queue *tx_queue) } int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, - struct tc_to_netdev *ntc) + void *type_data) { struct ef4_nic *efx = netdev_priv(net_dev); + struct tc_mqprio_qopt *mqprio = type_data; struct ef4_channel *channel; struct ef4_tx_queue *tx_queue; unsigned tc, num_tc; @@ -437,12 +438,12 @@ int ef4_setup_tc(struct net_device *net_dev, enum tc_setup_type type, if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - num_tc = ntc->mqprio->num_tc; + num_tc = mqprio->num_tc; if (ef4_nic_rev(efx) < EF4_REV_FALCON_B0 || num_tc > EF4_MAX_TX_TC) return -EINVAL; - ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; if (num_tc == net_dev->num_tc) return 0; diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c index 53ba30c3eb7b..32bf1fecf864 100644 --- a/drivers/net/ethernet/sfc/tx.c +++ b/drivers/net/ethernet/sfc/tx.c @@ -654,9 +654,10 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue) } int efx_setup_tc(struct 
net_device *net_dev, enum tc_setup_type type, - struct tc_to_netdev *ntc) + void *type_data) { struct efx_nic *efx = netdev_priv(net_dev); + struct tc_mqprio_qopt *mqprio = type_data; struct efx_channel *channel; struct efx_tx_queue *tx_queue; unsigned tc, num_tc; @@ -665,12 +666,12 @@ int efx_setup_tc(struct net_device *net_dev, enum tc_setup_type type, if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - num_tc = ntc->mqprio->num_tc; + num_tc = mqprio->num_tc; if (num_tc > EFX_MAX_TX_TC) return -EINVAL; - ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; if (num_tc == net_dev->num_tc) return 0; diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index caba0abc0158..eb96a6913235 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1878,8 +1878,9 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb, } static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { + struct tc_mqprio_qopt *mqprio = type_data; u8 num_tc; int i; @@ -1889,8 +1890,8 @@ static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type, if (type != TC_SETUP_MQPRIO) return -EOPNOTSUPP; - tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; - num_tc = tc->mqprio->num_tc; + mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS; + num_tc = mqprio->num_tc; /* Sanity-check the number of traffic classes requested */ if ((dev->real_num_tx_queues <= 1) || diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6e2f7e38cf8e..1d238d54c484 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -771,9 +771,6 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); -/* These structures hold the attributes of qdisc and classifiers - * that are being passed to the netdevice through the setup_tc op. - */ enum tc_setup_type { TC_SETUP_MQPRIO, TC_SETUP_CLSU32, @@ -782,18 +779,6 @@ enum tc_setup_type { TC_SETUP_CLSBPF, }; -struct tc_cls_u32_offload; - -struct tc_to_netdev { - union { - struct tc_cls_u32_offload *cls_u32; - struct tc_cls_flower_offload *cls_flower; - struct tc_cls_matchall_offload *cls_mall; - struct tc_cls_bpf_offload *cls_bpf; - struct tc_mqprio_qopt *mqprio; - }; -}; - /* These structures hold the attributes of xdp state that are being passed * to the netdevice through the xdp op. */ @@ -977,7 +962,7 @@ struct xfrmdev_ops { * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, - * struct tc_to_netdev *tc); + * void *type_data); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif * tx queues stopped. 
This allows the netdevice to perform queue @@ -1226,7 +1211,7 @@ struct net_device_ops { int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc); + void *type_data); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5e01e9271619..c6b5de2fe413 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -883,11 +883,11 @@ static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev, } static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, - struct tc_to_netdev *tc) + void *type_data) { switch (type) { case TC_SETUP_CLSMATCHALL: - return dsa_slave_setup_tc_cls_matchall(dev, tc->cls_mall); + return dsa_slave_setup_tc_cls_matchall(dev, type_data); default: return -EOPNOTSUPP; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index dde8efdcee3b..2d4d06e41cd9 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -147,21 +147,18 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, enum tc_clsbpf_command cmd) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_bpf_offload bpf_offload = {}; - struct tc_to_netdev offload; + struct tc_cls_bpf_offload cls_bpf = {}; int err; - offload.cls_bpf = &bpf_offload; + tc_cls_common_offload_init(&cls_bpf.common, tp); + cls_bpf.command = cmd; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + cls_bpf.gen_flags = prog->gen_flags; - tc_cls_common_offload_init(&bpf_offload.common, tp); - bpf_offload.command = cmd; - bpf_offload.exts = &prog->exts; - bpf_offload.prog = prog->filter; - bpf_offload.name = prog->bpf_name; - bpf_offload.exts_integrated = prog->exts_integrated; - bpf_offload.gen_flags = prog->gen_flags; - - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSBPF, &cls_bpf); if (!err && (cmd == TC_CLSBPF_ADD || cmd == TC_CLSBPF_REPLACE)) prog->gen_flags |= TCA_CLS_FLAGS_IN_HW; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ccdf2f5014ca..1474bacf4df4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -88,7 +88,6 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; - struct tc_to_netdev tc; struct net_device *hw_dev; }; @@ -225,20 +224,17 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { - struct tc_cls_flower_offload offload = {0}; + struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = f->hw_dev; - struct tc_to_netdev *tc = &f->tc; if (!tc_can_offload(dev, tp)) return; - tc_cls_common_offload_init(&offload.common, tp); - offload.command = TC_CLSFLOWER_DESTROY; - offload.cookie = (unsigned long)f; + tc_cls_common_offload_init(&cls_flower.common, tp); + cls_flower.command = TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long) f; - tc->cls_flower = &offload; - - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -247,8 +243,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_flower_offload offload = {0}; - struct tc_to_netdev *tc = 
&f->tc; + struct tc_cls_flower_offload cls_flower = {}; int err; if (!tc_can_offload(dev, tp)) { @@ -258,22 +253,21 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, return tc_skip_sw(f->flags) ? -EINVAL : 0; } dev = f->hw_dev; - offload.egress_dev = true; + cls_flower.egress_dev = true; } else { f->hw_dev = dev; } - tc_cls_common_offload_init(&offload.common, tp); - offload.command = TC_CLSFLOWER_REPLACE; - offload.cookie = (unsigned long)f; - offload.dissector = dissector; - offload.mask = mask; - offload.key = &f->mkey; - offload.exts = &f->exts; - - tc->cls_flower = &offload; + tc_cls_common_offload_init(&cls_flower.common, tp); + cls_flower.command = TC_CLSFLOWER_REPLACE; + cls_flower.cookie = (unsigned long) f; + cls_flower.dissector = dissector; + cls_flower.mask = mask; + cls_flower.key = &f->mkey; + cls_flower.exts = &f->exts; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, tc); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); if (!err) f->flags |= TCA_CLS_FLAGS_IN_HW; @@ -284,21 +278,19 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { - struct tc_cls_flower_offload offload = {0}; + struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = f->hw_dev; - struct tc_to_netdev *tc = &f->tc; if (!tc_can_offload(dev, tp)) return; - tc_cls_common_offload_init(&offload.common, tp); - offload.command = TC_CLSFLOWER_STATS; - offload.cookie = (unsigned long)f; - offload.exts = &f->exts; - - tc->cls_flower = &offload; + tc_cls_common_offload_init(&cls_flower.common, tp); + cls_flower.command = TC_CLSFLOWER_STATS; + cls_flower.cookie = (unsigned long) f; + cls_flower.exts = &f->exts; - dev->netdev_ops->ndo_setup_tc(dev, TC_CLSFLOWER_STATS, tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_CLSFLOWER_STATS, + &cls_flower); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 174c700160ca..c9f6500b8080 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -54,18 +54,16 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, unsigned long cookie) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_to_netdev offload; - struct tc_cls_matchall_offload mall_offload = {0}; + struct tc_cls_matchall_offload cls_mall = {}; int err; - tc_cls_common_offload_init(&mall_offload.common, tp); - offload.cls_mall = &mall_offload; - offload.cls_mall->command = TC_CLSMATCHALL_REPLACE; - offload.cls_mall->exts = &head->exts; - offload.cls_mall->cookie = cookie; + tc_cls_common_offload_init(&cls_mall.common, tp); + cls_mall.command = TC_CLSMATCHALL_REPLACE; + cls_mall.exts = &head->exts; + cls_mall.cookie = cookie; err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, - &offload); + &cls_mall); if (!err) head->flags |= TCA_CLS_FLAGS_IN_HW; @@ -77,16 +75,13 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, unsigned long cookie) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_to_netdev offload; - struct tc_cls_matchall_offload mall_offload = {0}; + struct tc_cls_matchall_offload cls_mall = {}; - tc_cls_common_offload_init(&mall_offload.common, tp); - offload.cls_mall = &mall_offload; - offload.cls_mall->command = TC_CLSMATCHALL_DESTROY; - offload.cls_mall->exts = NULL; - offload.cls_mall->cookie = cookie; + tc_cls_common_offload_init(&cls_mall.common, tp); + cls_mall.command = TC_CLSMATCHALL_DESTROY; + cls_mall.cookie = 
cookie; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, &offload); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSMATCHALL, &cls_mall); } static void mall_destroy(struct tcf_proto *tp) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index c0f59c471523..4ed51d347d0a 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -431,39 +431,35 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_u32_offload u32_offload = {0}; - struct tc_to_netdev offload; + struct tc_cls_u32_offload cls_u32 = {}; - offload.cls_u32 = &u32_offload; + if (!tc_should_offload(dev, tp, 0)) + return; - if (tc_should_offload(dev, tp, 0)) { - tc_cls_common_offload_init(&u32_offload.common, tp); - offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; - offload.cls_u32->knode.handle = handle; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); - } + tc_cls_common_offload_init(&cls_u32.common, tp); + cls_u32.command = TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = handle; + + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); } static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_u32_offload u32_offload = {0}; - struct tc_to_netdev offload; + struct tc_cls_u32_offload cls_u32 = {}; int err; if (!tc_should_offload(dev, tp, flags)) return tc_skip_sw(flags) ? -EINVAL : 0; - offload.cls_u32 = &u32_offload; - - tc_cls_common_offload_init(&u32_offload.common, tp); - offload.cls_u32->command = TC_CLSU32_NEW_HNODE; - offload.cls_u32->hnode.divisor = h->divisor; - offload.cls_u32->hnode.handle = h->handle; - offload.cls_u32->hnode.prio = h->prio; + tc_cls_common_offload_init(&cls_u32.common, tp); + cls_u32.command = TC_CLSU32_NEW_HNODE; + cls_u32.hnode.divisor = h->divisor; + cls_u32.hnode.handle = h->handle; + cls_u32.hnode.prio = h->prio; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); if (tc_skip_sw(flags)) return err; @@ -473,52 +469,47 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_u32_offload u32_offload = {0}; - struct tc_to_netdev offload; + struct tc_cls_u32_offload cls_u32 = {}; - offload.cls_u32 = &u32_offload; + if (!tc_should_offload(dev, tp, 0)) + return; - if (tc_should_offload(dev, tp, 0)) { - tc_cls_common_offload_init(&u32_offload.common, tp); - offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; - offload.cls_u32->hnode.divisor = h->divisor; - offload.cls_u32->hnode.handle = h->handle; - offload.cls_u32->hnode.prio = h->prio; + tc_cls_common_offload_init(&cls_u32.common, tp); + cls_u32.command = TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = h->divisor; + cls_u32.hnode.handle = h->handle; + cls_u32.hnode.prio = h->prio; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); - } + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); } static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; - struct tc_cls_u32_offload u32_offload = {0}; - struct tc_to_netdev offload; + struct tc_cls_u32_offload cls_u32 = {}; int err; - offload.cls_u32 = &u32_offload; - if 
(!tc_should_offload(dev, tp, flags)) return tc_skip_sw(flags) ? -EINVAL : 0; - tc_cls_common_offload_init(&u32_offload.common, tp); - offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; - offload.cls_u32->knode.handle = n->handle; - offload.cls_u32->knode.fshift = n->fshift; + tc_cls_common_offload_init(&cls_u32.common, tp); + cls_u32.command = TC_CLSU32_REPLACE_KNODE; + cls_u32.knode.handle = n->handle; + cls_u32.knode.fshift = n->fshift; #ifdef CONFIG_CLS_U32_MARK - offload.cls_u32->knode.val = n->val; - offload.cls_u32->knode.mask = n->mask; + cls_u32.knode.val = n->val; + cls_u32.knode.mask = n->mask; #else - offload.cls_u32->knode.val = 0; - offload.cls_u32->knode.mask = 0; + cls_u32.knode.val = 0; + cls_u32.knode.mask = 0; #endif - offload.cls_u32->knode.sel = &n->sel; - offload.cls_u32->knode.exts = &n->exts; + cls_u32.knode.sel = &n->sel; + cls_u32.knode.exts = &n->exts; if (n->ht_down) - offload.cls_u32->knode.link_handle = n->ht_down->handle; + cls_u32.knode.link_handle = n->ht_down->handle; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &offload); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSU32, &cls_u32); if (!err) n->flags |= TCA_CLS_FLAGS_IN_HW; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 09b577dde49c..2165a05994b7 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -39,10 +39,9 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt offload = { 0 }; - struct tc_to_netdev tc = { { .mqprio = &offload } }; + struct tc_mqprio_qopt mqprio = {}; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &tc); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &mqprio); } else { netdev_set_num_tc(dev, 0); } @@ -147,14 +146,14 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt offload = *qopt; - struct tc_to_netdev tc = { { .mqprio = &offload } }; + struct tc_mqprio_qopt mqprio = *qopt; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + &mqprio); if (err) return err; - priv->hw_offload = offload.hw; + priv->hw_offload = mqprio.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) -- cgit v1.2.3 From 67b465250e045446ad6fc59ab3e02beb40435878 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Tue, 25 Jul 2017 16:14:24 +0200 Subject: scsi: fc: start decoupling fc_block_scsi_eh from scsi_cmnd Scsi_cmnd is an unsuitable argument for eh_device_reset_handler(), eh_target_reset_handler(), and eh_host_reset_handler() which do not have the scope of one single SCSI command. These callbacks tend to use fc_block_scsi_eh() requiring scsi_cmnd. In order to start decoupling above eh callbacks from scsi_cmnd, introduce a new variant of the function called fc_block_rport() taking an fc_rport as argument. Refactor the old fc_block_scsi_eh() to simply delegate to fc_block_rport(). Signed-off-by: Steffen Maier Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. 
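A usage sketch of the new helper, with a hypothetical LLD "foo" (foo_issue_device_reset() is a stand-in, not a real function): an eh callback whose scope is really the remote port can now block on the rport directly rather than deriving it from a scsi_cmnd at each call site.

static int foo_eh_device_reset_handler(struct scsi_cmnd *cmnd)
{
	struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
	int ret;

	ret = fc_block_rport(rport);	/* 0, or FAST_IO_FAIL for scsi_eh */
	if (ret)
		return ret;

	return foo_issue_device_reset(rport);
}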
Petersen --- drivers/scsi/scsi_transport_fc.c | 31 ++++++++++++++++++++++++++----- include/scsi/scsi_transport_fc.h | 1 + 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 892fbd9800d9..1118aa5f88cd 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3276,8 +3276,8 @@ fc_scsi_scan_rport(struct work_struct *work) } /** - * fc_block_scsi_eh - Block SCSI eh thread for blocked fc_rport - * @cmnd: SCSI command that scsi_eh is trying to recover + * fc_block_rport() - Block SCSI eh thread for blocked fc_rport. + * @rport: Remote port that scsi_eh is trying to recover. * * This routine can be called from a FC LLD scsi_eh callback. It * blocks the scsi_eh thread until the fc_rport leaves the @@ -3289,10 +3289,9 @@ fc_scsi_scan_rport(struct work_struct *work) * FAST_IO_FAIL if the fast_io_fail_tmo fired, this should be * passed back to scsi_eh. */ -int fc_block_scsi_eh(struct scsi_cmnd *cmnd) +int fc_block_rport(struct fc_rport *rport) { - struct Scsi_Host *shost = cmnd->device->host; - struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device)); + struct Scsi_Host *shost = rport_to_shost(rport); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); @@ -3309,6 +3308,28 @@ int fc_block_scsi_eh(struct scsi_cmnd *cmnd) return 0; } +EXPORT_SYMBOL(fc_block_rport); + +/** + * fc_block_scsi_eh - Block SCSI eh thread for blocked fc_rport + * @cmnd: SCSI command that scsi_eh is trying to recover + * + * This routine can be called from a FC LLD scsi_eh callback. It + * blocks the scsi_eh thread until the fc_rport leaves the + * FC_PORTSTATE_BLOCKED, or the fast_io_fail_tmo fires. This is + * necessary to avoid the scsi_eh failing recovery actions for blocked + * rports which would lead to offlined SCSI devices. + * + * Returns: 0 if the fc_rport left the state FC_PORTSTATE_BLOCKED. + * FAST_IO_FAIL if the fast_io_fail_tmo fired, this should be + * passed back to scsi_eh. + */ +int fc_block_scsi_eh(struct scsi_cmnd *cmnd) +{ + struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device)); + + return fc_block_rport(rport); +} EXPORT_SYMBOL(fc_block_scsi_eh); /** diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index e308cd59e556..e8644eea9fe5 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -804,6 +804,7 @@ void fc_host_post_vendor_event(struct Scsi_Host *shost, u32 event_number, struct fc_vport *fc_vport_create(struct Scsi_Host *shost, int channel, struct fc_vport_identifiers *); int fc_vport_terminate(struct fc_vport *vport); +int fc_block_rport(struct fc_rport *rport); int fc_block_scsi_eh(struct scsi_cmnd *cmnd); enum blk_eh_timer_return fc_eh_timed_out(struct scsi_cmnd *scmd); -- cgit v1.2.3 From fb74c27735f0a34e76dbf1972084e984ad2ea145 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:16 -0700 Subject: net: ipv4: add second dif to udp socket lookups Add a second device index, sdif, to udp socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. Early demux lookups are handled in the next patch as part of INET_MATCH changes. Signed-off-by: David Ahern Signed-off-by: David S. 
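The heart of the change is the two-index device test in compute_score(). The following is a self-contained userspace mock-up of just that test; the field and parameter names follow the diff below, while the scaffolding and the returned score delta are ours for the demo:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the bound-device branch this patch puts in compute_score():
 * a socket bound to an interface now matches either the plain ingress
 * index (dif) or the l3mdev master index (sdif).
 */
static int score_bound_dev(int sk_bound_dev_if, bool exact_dif,
			   int dif, int sdif)
{
	if (sk_bound_dev_if || exact_dif) {
		bool dev_match = (sk_bound_dev_if == dif ||
				  sk_bound_dev_if == sdif);

		if (exact_dif && !dev_match)
			return -1;	/* lookup rejects this socket */
		if (sk_bound_dev_if && dev_match)
			return 4;	/* score bonus for a device match */
	}
	return 0;
}

int main(void)
{
	/* Socket bound to VRF device 10; packet arrives on slave 7 (dif)
	 * whose l3mdev is 10 (sdif): the bound socket now matches.
	 */
	printf("%d\n", score_bound_dev(10, false, 7, 10));	/* 4 */
	/* A socket bound to an unrelated device gets no bonus. */
	printf("%d\n", score_bound_dev(3, false, 7, 10));	/* 0 */
	return 0;
}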
Miller --- include/net/ip.h | 10 +++++++++ include/net/udp.h | 2 +- net/ipv4/udp.c | 58 +++++++++++++++++++++++++++++++---------------------- net/ipv4/udp_diag.c | 6 +++--- 4 files changed, 48 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 9e59dcf1787a..39db596eb89f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -78,6 +78,16 @@ struct ipcm_cookie { #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) #define PKTINFO_SKB_CB(skb) ((struct in_pktinfo *)((skb)->cb)) +/* return enslaved device index if relevant */ +static inline int inet_sdif(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) + return IPCB(skb)->iif; +#endif + return 0; +} + struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; diff --git a/include/net/udp.h b/include/net/udp.h index cc8036987dcb..826c713d5a48 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -287,7 +287,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif, + __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 38bca2c4897d..fe14429e4a6c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -380,8 +380,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum, int dif, - bool exact_dif) + __be32 daddr, unsigned short hnum, + int dif, int sdif, bool exact_dif) { int score; struct inet_sock *inet; @@ -413,10 +413,15 @@ static int compute_score(struct sock *sk, struct net *net, } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if && dev_match) + score += 4; } + if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; return score; @@ -436,10 +441,11 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned int hnum, int dif, bool exact_dif, - struct udp_hslot *hslot2, - struct sk_buff *skb) + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, bool exact_dif, + struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness, matches = 0, reuseport = 0; @@ -449,7 +455,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -477,8 +483,8 @@ static struct sock *udp4_lib_lookup2(struct net *net, * harder than this. 
-DaveM */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, - __be16 sport, __be32 daddr, __be16 dport, - int dif, struct udp_table *udptable, struct sk_buff *skb) + __be16 sport, __be32 daddr, __be16 dport, int dif, + int sdif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; unsigned short hnum = ntohs(dport); @@ -496,7 +502,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -511,7 +517,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, goto begin; result = udp4_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } return result; @@ -521,7 +527,7 @@ begin: badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -554,7 +560,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - udptable, skb); + inet_sdif(skb), udptable, skb); } struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, @@ -576,7 +582,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, - dif, &udp_table, NULL); + dif, 0, &udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -587,7 +593,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup); static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -597,7 +603,8 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) return false; @@ -628,8 +635,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, udptable, - NULL); + iph->saddr, uh->source, skb->dev->ifindex, 0, + udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -1953,6 +1960,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); unsigned int offset = offsetof(typeof(*sk), sk_node); int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -1967,7 +1975,7 @@ start_lookup: sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, hnum)) continue; if (!first) { @@ -2157,7 +2165,7 @@ drop: static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 
loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif) + int dif, int sdif) { struct sock *sk, *result; unsigned short hnum = ntohs(loc_port); @@ -2171,7 +2179,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, - rmt_port, rmt_addr, dif, hnum)) { + rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; @@ -2216,6 +2224,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); int ours; /* validate the packet */ @@ -2241,7 +2250,8 @@ void udp_v4_early_demux(struct sk_buff *skb) } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); + uh->source, iph->saddr, + dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 4515836d2a3a..1f07fe109535 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -45,7 +45,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, @@ -182,7 +182,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && @@ -190,7 +190,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, -- cgit v1.2.3 From 3fa6f616a7a4d0bdf4d877d530456d8a5c3b109b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:17 -0700 Subject: net: ipv4: add second dif to inet socket lookups Add a second device index, sdif, to inet socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. TCP moves the data in the cb. Prior to tcp_v4_rcv (e.g., early demux) the ingress index is obtained from IPCB using inet_sdif and after the cb move in tcp_v4_rcv the tcp_v4_sdif helper is used. Signed-off-by: David Ahern Signed-off-by: David S. 
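The device portion of the reworked INET_MATCH() below reads more easily as a function. This restatement is an editor's sketch, not kernel code, but it is logically equivalent to the macro: an unbound socket matches any ingress device, while a bound socket must match either dif or sdif. Note also the ordering constraint from the message above: inet_sdif() reads IPCB(skb), so tcp_v4_rcv() samples sdif once on entry, and code running after the control-block move must use tcp_v4_sdif(), which reads TCP_SKB_CB(skb).

static inline bool inet_match_bound_dev(int sk_bound_dev_if,
					int dif, int sdif)
{
	return !sk_bound_dev_if ||
	       sk_bound_dev_if == dif ||
	       sk_bound_dev_if == sdif;
}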
Miller --- include/net/inet_hashtables.h | 31 +++++++++++++++++-------------- include/net/tcp.h | 10 ++++++++++ net/dccp/ipv4.c | 4 ++-- net/ipv4/inet_hashtables.c | 27 +++++++++++++++++---------- net/ipv4/tcp_ipv4.c | 13 ++++++++----- net/ipv4/udp.c | 6 +++--- net/netfilter/xt_TPROXY.c | 2 +- 7 files changed, 58 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5026b1f08bb8..2dbbbff5e1e3 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -221,16 +221,16 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, - const int dif); + const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif) + __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, - daddr, ntohs(dport), dif); + daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ @@ -262,22 +262,24 @@ static inline struct sock *inet_lookup_listener(struct net *net, (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ -#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif) \ +#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_addrpair == (__cookie)) && \ (!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif))) && \ + ((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif))) && \ net_eq(sock_net(__sk), (__net))) #else /* 32-bit arch */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const int __name __deprecated __attribute__((unused)) -#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif) \ +#define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_daddr == (__saddr)) && \ ((__sk)->sk_rcv_saddr == (__daddr)) && \ (!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif))) && \ + ((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif))) && \ net_eq(sock_net(__sk), (__net))) #endif /* 64-bit arch */ @@ -288,7 +290,7 @@ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, - const int dif); + const int dif, const int sdif); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, @@ -297,7 +299,7 @@ static inline struct sock * const int dif) { return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, - ntohs(dport), dif); + ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, @@ -305,20 +307,20 @@ static inline struct sock *__inet_lookup(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, - const int dif, + const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, hashinfo, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, 
hashinfo, skb, doff, saddr, - sport, daddr, hnum, dif); + sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, @@ -332,7 +334,7 @@ static inline struct sock *inet_lookup(struct net *net, bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - dport, dif, &refcounted); + dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -344,6 +346,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, int doff, const __be16 sport, const __be16 dport, + const int sdif, bool *refcounted) { struct sock *sk = skb_steal_sock(skb); @@ -355,7 +358,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, doff, iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), + iph->daddr, dport, inet_iif(skb), sdif, refcounted); } diff --git a/include/net/tcp.h b/include/net/tcp.h index 5173fecde495..2b89f1ab8552 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -840,6 +840,16 @@ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) return false; } +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v4_sdif(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + return TCP_SKB_CB(skb)->header.h4.iif; +#endif + return 0; +} + /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 1b202f16531f..001c08696334 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -256,7 +256,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) sk = __inet_lookup_established(net, &dccp_hashinfo, iph->daddr, dh->dccph_dport, iph->saddr, ntohs(dh->dccph_sport), - inet_iif(skb)); + inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; @@ -804,7 +804,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, &refcounted); + dh->dccph_sport, dh->dccph_dport, 0, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2e3389d614d1..597bb4cfe805 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif, bool exact_dif) + const int dif, const int sdif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -185,9 +185,13 @@ static inline int compute_score(struct sock *sk, struct net *net, score += 4; } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if && dev_match) + score += 4; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -208,7 +212,7 @@ struct sock *__inet_lookup_listener(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, - const int dif) + const int dif, const int sdif) { unsigned int hash = 
inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -218,7 +222,8 @@ struct sock *__inet_lookup_listener(struct net *net, u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, exact_dif); + score = compute_score(sk, net, hnum, daddr, + dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -268,7 +273,7 @@ struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, - const int dif) + const int dif, const int sdif) { INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); @@ -286,11 +291,12 @@ begin: if (sk->sk_hash != hash) continue; if (likely(INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, + dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -321,9 +327,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __be32 daddr = inet->inet_rcv_saddr; __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; + struct net *net = sock_net(sk); + int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - struct net *net = sock_net(sk); unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); @@ -339,7 +346,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, continue; if (likely(INET_MATCH(sk2, net, acookie, - saddr, daddr, ports, dif))) { + saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5f708c85110e..c8784ab37852 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -383,7 +383,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb)); + inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; @@ -659,7 +659,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb)); + ntohs(th->source), inet_iif(skb), + tcp_v4_sdif(skb)); /* don't send rst if it can't find key */ if (!sk1) goto out; @@ -1523,7 +1524,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), - skb->skb_iif); + skb->skb_iif, inet_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1588,6 +1589,7 @@ EXPORT_SYMBOL(tcp_filter); int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1638,7 +1640,7 @@ int tcp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest, &refcounted); + th->dest, sdif, &refcounted); if (!sk) goto 
no_tcp_socket; @@ -1766,7 +1768,8 @@ do_time_wait: __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, - inet_iif(skb)); + inet_iif(skb), + sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fe14429e4a6c..99f25bfec606 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2196,7 +2196,7 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, - int dif) + int dif, int sdif) { unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); @@ -2208,7 +2208,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (INET_MATCH(sk, net, acookie, rmt_addr, - loc_addr, ports, dif)) + loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -2254,7 +2254,7 @@ void udp_v4_early_demux(struct sk_buff *skb) dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, - uh->source, iph->saddr, dif); + uh->source, iph->saddr, dif, sdif); } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index d767e35fff6b..94fb0fd0c667 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -125,7 +125,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, __tcp_hdrlen(tcph), saddr, sport, daddr, dport, - in->ifindex); + in->ifindex, 0); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; -- cgit v1.2.3 From 67359930e185c491b47cb958d5f1d6c1af4598a2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:18 -0700 Subject: net: ipv4: add second dif to raw socket lookups Add a second device index, sdif, to raw socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/raw.h | 2 +- net/ipv4/raw.c | 16 +++++++++++----- net/ipv4/raw_diag.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/raw.h b/include/net/raw.h index 57c33dd22ec4..99d26d0c4a19 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -26,7 +26,7 @@ extern struct proto raw_prot; extern struct raw_hashinfo raw_v4_hashinfo; struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, unsigned short num, __be32 raddr, - __be32 laddr, int dif); + __be32 laddr, int dif, int sdif); int raw_abort(struct sock *sk, int err); void raw_icmp_error(struct sk_buff *, int, u32); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index b0bb5d0a30bd..2726aecf224b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -122,7 +122,8 @@ void raw_unhash_sk(struct sock *sk) EXPORT_SYMBOL_GPL(raw_unhash_sk); struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, - unsigned short num, __be32 raddr, __be32 laddr, int dif) + unsigned short num, __be32 raddr, __be32 laddr, + int dif, int sdif) { sk_for_each_from(sk) { struct inet_sock *inet = inet_sk(sk); @@ -130,7 +131,8 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif)) goto found; /* gotcha */ } sk = NULL; @@ -171,6 +173,7 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) */ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) { + int sdif = inet_sdif(skb); struct sock *sk; struct hlist_head *head; int delivered = 0; @@ -184,7 +187,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, sdif); while (sk) { delivered = 1; @@ -199,7 +202,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) } sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, sdif); } out: read_unlock(&raw_v4_hashinfo.lock); @@ -297,12 +300,15 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) read_lock(&raw_v4_hashinfo.lock); raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]); if (raw_sk) { + int dif = skb->dev->ifindex; + int sdif = inet_sdif(skb); + iph = (const struct iphdr *)skb->data; net = dev_net(skb->dev); while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol, iph->daddr, iph->saddr, - skb->dev->ifindex)) != NULL) { + dif, sdif)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); iph = (const struct iphdr *)skb->data; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index e1a51ca68d23..c600d3c71d4d 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -46,7 +46,7 @@ static struct sock *raw_lookup(struct net *net, struct sock *from, sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], - r->id.idiag_if); + r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, -- cgit v1.2.3 From 60d9b03141243589dacd3136f3fcb4e6976df954 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:19 -0700 Subject: net: ipv4: add second 
dif to multicast source filter Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/igmp.h | 3 ++- net/ipv4/igmp.c | 6 ++++-- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 97caf1821de8..f8231854b5d6 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -118,7 +118,8 @@ extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_msfilter __user *optval, int __user *optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, struct group_filter __user *optval, int __user *optlen); -extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif); +extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, + int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); extern void ip_mc_destroy_dev(struct in_device *); extern void ip_mc_up(struct in_device *); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 28f14afd0dd3..5bc8570c2ec3 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2549,7 +2549,8 @@ done: /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ -int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) +int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, + int dif, int sdif) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; @@ -2564,7 +2565,8 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && - pmc->multi.imr_ifindex == dif) + (pmc->multi.imr_ifindex == dif || + (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet->mc_all; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2726aecf224b..33b70bfd1122 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -193,7 +193,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) delivered = 1; if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) && ip_mc_sf_allow(sk, iph->daddr, iph->saddr, - skb->dev->ifindex)) { + skb->dev->ifindex, sdif)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 99f25bfec606..cac59d7420cd 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -606,7 +606,7 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && sk->sk_bound_dev_if != sdif)) return false; - if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif)) + if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } -- cgit v1.2.3 From 1801b570dd2ae50b90231f283e79a9a94fbe7875 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:20 -0700 Subject: net: ipv6: add second dif to udp socket lookups Add a second device index, sdif, to udp socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. Early demux lookups are handled in the next patch as part of INET_MATCH changes. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 10 ++++++++++ include/net/udp.h | 2 +- net/ipv4/udp_diag.c | 4 ++-- net/ipv6/udp.c | 40 ++++++++++++++++++++++------------------ 4 files changed, 35 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 474d6bbc158c..ac2da4e11d5e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -158,6 +158,16 @@ static inline bool inet6_is_jumbogram(const struct sk_buff *skb) return !!(IP6CB(skb)->flags & IP6SKB_JUMBOGRAM); } +/* can not be used in TCP layer after tcp_v6_fill_cb */ +static inline int inet6_sdif(const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) + return IP6CB(skb)->iif; +#endif + return 0; +} + /* can not be used in TCP layer after tcp_v6_fill_cb */ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) { diff --git a/include/net/udp.h b/include/net/udp.h index 826c713d5a48..20dcdca4e85c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -298,7 +298,7 @@ struct sock *udp6_lib_lookup(struct net *net, struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *tbl, + int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 1f07fe109535..d0390d844ac8 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -53,7 +53,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -198,7 +198,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, - req->id.idiag_if, tbl, NULL); + req->id.idiag_if, 0, tbl, NULL); } #endif else { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 578142b7ca3e..d96a877798a7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -129,7 +129,7 @@ static void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, bool exact_dif) + int dif, int sdif, bool exact_dif) { int score; struct inet_sock *inet; @@ -161,9 +161,13 @@ static int compute_score(struct sock *sk, struct net *net, } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score++; + if (sk->sk_bound_dev_if && dev_match) + score++; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) @@ -175,9 +179,9 @@ static int compute_score(struct sock *sk, struct net *net, /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, unsigned int hnum, int dif, - bool exact_dif, struct udp_hslot *hslot2, - struct sk_buff *skb) + const struct in6_addr *daddr, unsigned int hnum, + int dif, int sdif, bool exact_dif, + struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness, matches = 0, reuseport = 0; @@ -187,7 +191,7 @@ static 
struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, exact_dif); + daddr, hnum, dif, sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -214,10 +218,10 @@ static struct sock *udp6_lib_lookup2(struct net *net, /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif, struct udp_table *udptable, - struct sk_buff *skb) + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *udptable, + struct sk_buff *skb) { struct sock *sk, *result; unsigned short hnum = ntohs(dport); @@ -235,7 +239,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, exact_dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -250,7 +254,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, sdif, exact_dif, hslot2, skb); } @@ -261,7 +265,7 @@ begin: badness = -1; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, - exact_dif); + sdif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -294,7 +298,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - udptable, skb); + inet6_sdif(skb), udptable, skb); } struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, @@ -304,7 +308,7 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - &udp_table, skb); + inet6_sdif(skb), &udp_table, skb); } EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); @@ -320,7 +324,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be struct sock *sk; sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, - dif, &udp_table, NULL); + dif, 0, &udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -501,7 +505,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), udptable, skb); + inet6_iif(skb), 0, udptable, skb); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); -- cgit v1.2.3 From 4297a0ef085729af98adab9131d128c576ed3044 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:21 -0700 Subject: net: ipv6: add second dif to inet6 socket lookups Add a second device index, sdif, to inet6 socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. TCP moves the data in the cb. Prior to tcp_v4_rcv (e.g., early demux) the ingress index is obtained from IPCB using inet_sdif and after tcp_v4_rcv tcp_v4_sdif is used. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/inet6_hashtables.h | 22 +++++++++++++--------- include/net/tcp.h | 10 ++++++++++ net/dccp/ipv6.c | 4 ++-- net/ipv6/inet6_hashtables.c | 28 +++++++++++++++++----------- net/ipv6/tcp_ipv6.c | 13 ++++++++----- net/ipv6/udp.c | 7 ++++--- net/netfilter/xt_TPROXY.c | 4 ++-- 7 files changed, 56 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index b87becacd9d3..6e91e38a31da 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -49,7 +49,8 @@ struct sock *__inet6_lookup_established(struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const u16 hnum, const int dif); + const u16 hnum, const int dif, + const int sdif); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, @@ -57,7 +58,8 @@ struct sock *inet6_lookup_listener(struct net *net, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif); + const unsigned short hnum, + const int dif, const int sdif); static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, @@ -66,24 +68,25 @@ static inline struct sock *__inet6_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, - const int dif, + const int dif, const int sdif, bool *refcounted) { struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, dif); + sport, daddr, hnum, + dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, sdif); } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, - int iif, + int iif, int sdif, bool *refcounted) { struct sock *sk = skb_steal_sock(skb); @@ -95,7 +98,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, doff, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), - iif, refcounted); + iif, sdif, refcounted); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, @@ -107,13 +110,14 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, int inet6_hash(struct sock *sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ -#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ +#define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_family == AF_INET6) && \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ (!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif))) && \ + ((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif))) && \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 2b89f1ab8552..999f3efe572b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -827,6 +827,16 @@ static inline int tcp_v6_iif(const struct sk_buff *skb) return l3_slave ? 
skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } + +/* TCP_SKB_CB reference means this can not be used from early demux */ +static inline int tcp_v6_sdif(const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) + return TCP_SKB_CB(skb)->header.h6.iif; +#endif + return 0; +} #endif /* TCP_SKB_CB reference means this can not be used from early demux */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1b58eac8aad3..47a7b59b355e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -89,7 +89,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __inet6_lookup_established(net, &dccp_hashinfo, &hdr->daddr, dh->dccph_dport, &hdr->saddr, ntohs(dh->dccph_sport), - inet6_iif(skb)); + inet6_iif(skb), 0); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), @@ -687,7 +687,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb), &refcounted); + inet6_iif(skb), 0, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b13b8f93079d..b01858f5deb1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -56,7 +56,7 @@ struct sock *__inet6_lookup_established(struct net *net, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, - const int dif) + const int dif, const int sdif) { struct sock *sk; const struct hlist_nulls_node *node; @@ -73,12 +73,12 @@ begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; - if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif)) + if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; - if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) { + if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif, bool exact_dif) + const int dif, const int sdif, bool exact_dif) { int score = -1; @@ -110,9 +110,13 @@ static inline int compute_score(struct sock *sk, struct net *net, score++; } if (sk->sk_bound_dev_if || exact_dif) { - if (sk->sk_bound_dev_if != dif) + bool dev_match = (sk->sk_bound_dev_if == dif || + sk->sk_bound_dev_if == sdif); + + if (exact_dif && !dev_match) return -1; - score++; + if (sk->sk_bound_dev_if && dev_match) + score++; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; @@ -126,7 +130,7 @@ struct sock *inet6_lookup_listener(struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif) + const unsigned short hnum, const int dif, const int sdif) { unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; @@ -136,7 +140,7 @@ struct sock *inet6_lookup_listener(struct net *net, u32 phash = 0; sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif, exact_dif); + score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -171,7 +175,7 
@@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, bool refcounted; sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - ntohs(dport), dif, &refcounted); + ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -187,8 +191,9 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; - const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); + const int sdif = l3mdev_master_ifindex_by_index(net, dif); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); @@ -203,7 +208,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, if (sk2->sk_hash != hash) continue; - if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) { + if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, + dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ced5dcf37465..f776ec4ecf6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -350,7 +350,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __inet6_lookup_established(net, &tcp_hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), - skb->dev->ifindex); + skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), @@ -918,7 +918,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, - ntohs(th->source), tcp_v6_iif(skb)); + ntohs(th->source), tcp_v6_iif(skb), + tcp_v6_sdif(skb)); if (!sk1) goto out; @@ -1397,6 +1398,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, static int tcp_v6_rcv(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; @@ -1430,7 +1432,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), - th->source, th->dest, inet6_iif(skb), + th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -1563,7 +1565,8 @@ do_time_wait: skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, - ntohs(th->dest), tcp_v6_iif(skb)); + ntohs(th->dest), tcp_v6_iif(skb), + sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); @@ -1610,7 +1613,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb) sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), - inet6_iif(skb)); + inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d96a877798a7..19afcaf4a22e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -897,7 +897,7 @@ discard: static struct sock *__udp6_lib_demux_lookup(struct net *net, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif) + int dif, int sdif) { unsigned short hnum = ntohs(loc_port); unsigned 
int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); @@ -908,7 +908,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && - INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) + INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -923,6 +923,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) struct sock *sk; struct dst_entry *dst; int dif = skb->dev->ifindex; + int sdif = inet6_sdif(skb); if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) @@ -934,7 +935,7 @@ static void udp_v6_early_demux(struct sk_buff *skb) sk = __udp6_lib_demux_lookup(net, uh->dest, &ipv6_hdr(skb)->daddr, uh->source, &ipv6_hdr(skb)->saddr, - dif); + dif, sdif); else return; diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 94fb0fd0c667..ade4c10c28c6 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -195,7 +195,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, thoff + __tcp_hdrlen(tcph), saddr, sport, daddr, ntohs(dport), - in->ifindex); + in->ifindex, 0); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -208,7 +208,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, case NFT_LOOKUP_ESTABLISHED: sk = __inet6_lookup_established(net, &tcp_hashinfo, saddr, sport, daddr, ntohs(dport), - in->ifindex); + in->ifindex, 0); break; default: BUG(); -- cgit v1.2.3 From 5108ab4bf446fa9ad2c71f5fc1d839067b72636f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 7 Aug 2017 08:44:22 -0700 Subject: net: ipv6: add second dif to raw socket lookups Add a second device index, sdif, to raw socket lookups. sdif is the index for ingress devices enslaved to an l3mdev. It allows the lookups to consider the enslaved device as well as the L3 domain when searching for a socket. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/rawv6.h | 2 +- net/ipv4/raw_diag.c | 2 +- net/ipv6/raw.c | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/rawv6.h b/include/net/rawv6.h index cbe4e9de1894..4addc5c988e0 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -6,7 +6,7 @@ extern struct raw_hashinfo raw_v6_hashinfo; struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, - const struct in6_addr *rmt_addr, int dif); + const struct in6_addr *rmt_addr, int dif, int sdif); int raw_abort(struct sock *sk, int err); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index c600d3c71d4d..c200065ef9a5 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -52,7 +52,7 @@ static struct sock *raw_lookup(struct net *net, struct sock *from, sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, - r->id.idiag_if); + r->id.idiag_if, 0); #endif return sk; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 60be012fe708..e4462b0ff801 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(raw_v6_hashinfo); struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, - const struct in6_addr *rmt_addr, int dif) + const struct in6_addr *rmt_addr, int dif, int sdif) { bool is_multicast = ipv6_addr_is_multicast(loc_addr); @@ -86,7 +86,9 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != dif && + sk->sk_bound_dev_if != sdif) continue; if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { @@ -178,7 +180,8 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) goto out; net = dev_net(skb->dev); - sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, inet6_iif(skb)); + sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, + inet6_iif(skb), inet6_sdif(skb)); while (sk) { int filtered; @@ -222,7 +225,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) } } sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr, - inet6_iif(skb)); + inet6_iif(skb), inet6_sdif(skb)); } out: read_unlock(&raw_v6_hashinfo.lock); @@ -378,7 +381,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, net = dev_net(skb->dev); while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr, - inet6_iif(skb)))) { + inet6_iif(skb), inet6_iif(skb)))) { rawv6_err(sk, skb, NULL, type, code, inner_offset, info); sk = sk_next(sk); -- cgit v1.2.3 From cf5f5cea270655dd49370760576c64b228583b79 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 4 Aug 2017 16:00:09 -0700 Subject: bpf: add support for sys_enter_* and sys_exit_* tracepoints Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_* style tracepoints. The iovisor/bcc issue #748 (https://github.com/iovisor/bcc/issues/748) documents this issue. For example, if you try to attach a bpf program to tracepoints syscalls/sys_enter_newfstat, you will get the following error: # ./tools/trace.py t:syscalls:sys_enter_newfstat Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument Failed to attach BPF to tracepoint The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_* tracepoints are treated differently from other tracepoints and there is no bpf hook to it. 
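For orientation, below is a minimal sketch of the kind of program this support is meant to enable, in BPF-style C. It is a hedged illustration, not code from the patch: the SEC() macro stands in for the usual BPF section annotation, the struct and function names are invented for the example, and attachment is assumed to go through the PERF_EVENT_IOC_SET_BPF ioctl mentioned above. The context layout mirrors the syscall_tp_t assembled in perf_call_bpf_enter() later in this patch.

#define SEC(name) __attribute__((section(name), used))

/* Flat context seen by a syscalls:sys_enter_* program: a pt_regs
 * pointer, the syscall number, then the syscall arguments; the same
 * layout perf_call_bpf_enter() builds before calling trace_call_bpf(). */
struct sys_enter_ctx {
	unsigned long long regs;   /* struct pt_regs *, not dereferenced here */
	unsigned long syscall_nr;
	unsigned long args[6];     /* only this syscall's args may be read */
};

SEC("tracepoint/syscalls/sys_enter_newfstat")
int trace_enter_newfstat(struct sys_enter_ctx *ctx)
{
	/* newfstat(2)'s first argument is the fd; reading only args[0]
	 * keeps the program's max ctx offset within the bound that
	 * trace_event_get_offsets() reports for this tracepoint. */
	return ctx->args[0] <= 2;  /* sample only stdin/stdout/stderr */
}

char _license[] SEC("license") = "GPL";
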
This patch adds bpf support for these syscalls tracepoints by . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF . calling bpf programs in perf_syscall_enter and perf_syscall_exit The legality of bpf program ctx access is also checked. Function trace_event_get_offsets returns correct max offset for each specific syscall tracepoint, which is compared against the maximum offset access in bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/syscalls.h | 12 ++++++++++ kernel/events/core.c | 10 ++++---- kernel/trace/trace_syscalls.c | 53 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 69 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3cb15ea48aee..c9170218e9e6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -172,8 +172,20 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __attribute__((section("__syscalls_metadata"))) \ *__p_syscall_meta_##sname = &__syscall_meta_##sname; + +static inline int is_syscall_trace_event(struct trace_event_call *tp_event) +{ + return tp_event->class == &event_class_syscall_enter || + tp_event->class == &event_class_syscall_exit; +} + #else #define SYSCALL_METADATA(sname, nb, ...) + +static inline int is_syscall_trace_event(struct trace_event_call *tp_event) +{ + return 0; +} #endif #define SYSCALL_DEFINE0(sname) \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a7a6c1d19a49 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { - bool is_kprobe, is_tracepoint; + bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; - if (!is_kprobe && !is_tracepoint) + is_syscall_tp = is_syscall_trace_event(event->tp_event); + if (!is_kprobe && !is_tracepoint && !is_syscall_tp) /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; @@ -8070,13 +8071,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return PTR_ERR(prog); if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || - (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) || + (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } - if (is_tracepoint) { + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); if (prog->aux->max_ctx_offset > off) { diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..7a1a92036563 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; +static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) { + struct syscall_tp_t { + 
unsigned long long regs; + unsigned long syscall_nr; + unsigned long args[sys_data->nb_args]; + } param; + int i; + + *(struct pt_regs **)&param = regs; + param.syscall_nr = rec->nr; + for (i = 0; i < sys_data->nb_args; i++) + param.args[i] = rec->args[i]; + return trace_call_bpf(prog, &param); + } + static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; + prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); + + if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } +static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, + struct syscall_trace_exit *rec) { + struct syscall_tp_t { + unsigned long long regs; + unsigned long syscall_nr; + unsigned long ret; + } param; + + *(struct pt_regs **)&param = regs; + param.syscall_nr = rec->nr; + param.ret = rec->ret; + return trace_call_bpf(prog, &param); + } + static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; + struct bpf_prog *prog; int syscall_nr; int rctx; int size; @@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; + prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (hlist_empty(head)) + if (!prog && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); + + if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } -- cgit v1.2.3 From 8113c095672f6504b23eba6edf4a57b5f7f744af Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 4 Aug 2017 21:31:43 -0700 Subject: net_sched: use void pointer for filter handle Now we use 'unsigned long fh' as a pointer in every place, it is safe to convert it to a void pointer now. This gets rid of many casts to pointer. Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 2 +- include/net/sch_generic.h | 8 ++++---- net/sched/cls_api.c | 17 ++++++++--------- net/sched/cls_basic.c | 22 ++++++++++------------ net/sched/cls_bpf.c | 27 ++++++++++++--------------- net/sched/cls_cgroup.c | 12 ++++++------ net/sched/cls_flow.c | 24 ++++++++++++------------ net/sched/cls_flower.c | 22 +++++++++++----------- net/sched/cls_fw.c | 26 +++++++++++++------------- net/sched/cls_matchall.c | 16 ++++++++-------- net/sched/cls_route.c | 26 +++++++++++++------------- net/sched/cls_rsvp.h | 24 ++++++++++++------------ net/sched/cls_tcindex.c | 36 ++++++++++++++++-------------------- net/sched/cls_u32.c | 30 +++++++++++++++--------------- 14 files changed, 141 insertions(+), 151 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 572083af02ac..0f78e6560b2d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -11,7 +11,7 @@ struct tcf_walker { int stop; int skip; int count; - int (*fn)(struct tcf_proto *, unsigned long node, struct tcf_walker *); + int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; int register_tcf_proto_ops(struct tcf_proto_ops *ops); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1c123e2b2415..e79f5ad1c5f3 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -213,16 +213,16 @@ struct tcf_proto_ops { int (*init)(struct tcf_proto*); void (*destroy)(struct tcf_proto*); - unsigned long (*get)(struct tcf_proto*, u32 handle); + void* (*get)(struct tcf_proto*, u32 handle); int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, - unsigned long *, bool); - int (*delete)(struct tcf_proto*, unsigned long, bool*); + void **, bool); + int (*delete)(struct tcf_proto*, void *, bool*); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); /* rtnetlink specific */ - int (*dump)(struct net*, struct tcf_proto*, unsigned long, + int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*); struct module *owner; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index afd099727aea..668afb6e9885 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -102,11 +102,11 @@ EXPORT_SYMBOL(unregister_tcf_proto_ops); static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - unsigned long fh, int event, bool unicast); + void *fh, int event, bool unicast); static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - unsigned long fh, bool unicast, bool *last); + void *fh, bool unicast, bool *last); static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, @@ -432,7 +432,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct tcf_proto *tp; const struct Qdisc_class_ops *cops; unsigned long cl; - unsigned long fh; + void *fh; int err; int tp_created; @@ -571,7 +571,7 @@ replay: fh = tp->ops->get(tp, t->tcm_handle); - if (fh == 0) { + if (!fh) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { tcf_chain_tp_remove(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, fh, @@ -641,7 +641,7 @@ errout: } static int tcf_fill_node(struct net *net, struct sk_buff *skb, - struct tcf_proto *tp, unsigned long fh, u32 portid, + struct tcf_proto *tp, void *fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; @@ -679,7 +679,7 @@ nla_put_failure: static int 
tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - unsigned long fh, int event, bool unicast) + void *fh, int event, bool unicast) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -703,7 +703,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, - unsigned long fh, bool unicast, bool *last) + void *fh, bool unicast, bool *last) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -738,8 +738,7 @@ struct tcf_dump_args { struct netlink_callback *cb; }; -static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, - struct tcf_walker *arg) +static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) { struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 7c7a82138f76..73cc7f167a38 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -56,20 +56,18 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } -static unsigned long basic_get(struct tcf_proto *tp, u32 handle) +static void *basic_get(struct tcf_proto *tp, u32 handle) { - unsigned long l = 0UL; struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { if (f->handle == handle) { - l = (unsigned long) f; - break; + return f; } } - return l; + return NULL; } static int basic_init(struct tcf_proto *tp) @@ -106,10 +104,10 @@ static void basic_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int basic_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) { struct basic_head *head = rtnl_dereference(tp->root); - struct basic_filter *f = (struct basic_filter *) arg; + struct basic_filter *f = arg; list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); @@ -149,7 +147,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -202,7 +200,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - *arg = (unsigned long)fnew; + *arg = fnew; if (fold) { list_replace_rcu(&fold->link, &fnew->link); @@ -228,7 +226,7 @@ static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long) f, arg) < 0) { + if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; break; } @@ -237,10 +235,10 @@ skip: } } -static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct basic_filter *f = (struct basic_filter *) fh; + struct basic_filter *f = fh; struct nlattr *nest; if (f == NULL) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 2d4d06e41cd9..db17b68df94e 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -270,11 +270,11 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); } -static int cls_bpf_delete(struct 
tcf_proto *tp, unsigned long arg, bool *last) +static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last) { struct cls_bpf_head *head = rtnl_dereference(tp->root); - __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg); + __cls_bpf_delete(tp, arg); *last = list_empty(&head->plist); return 0; } @@ -290,20 +290,17 @@ static void cls_bpf_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) +static void *cls_bpf_get(struct tcf_proto *tp, u32 handle) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; - unsigned long ret = 0UL; list_for_each_entry(prog, &head->plist, link) { - if (prog->handle == handle) { - ret = (unsigned long) prog; - break; - } + if (prog->handle == handle) + return prog; } - return ret; + return NULL; } static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) @@ -448,10 +445,10 @@ static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg, bool ovr) + void **arg, bool ovr) { struct cls_bpf_head *head = rtnl_dereference(tp->root); - struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg; + struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; struct cls_bpf_prog *prog; int ret; @@ -509,7 +506,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, list_add_rcu(&prog->link, &head->plist); } - *arg = (unsigned long) prog; + *arg = prog; return 0; errout: @@ -557,10 +554,10 @@ static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, return 0; } -static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *tm) { - struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; + struct cls_bpf_prog *prog = fh; struct nlattr *nest; u32 bpf_flags = 0; int ret; @@ -618,7 +615,7 @@ static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) list_for_each_entry(prog, &head->plist, link) { if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long) prog, arg) < 0) { + if (arg->fn(tp, prog, arg) < 0) { arg->stop = 1; break; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index df7a582775df..d48452f87975 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -43,9 +43,9 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, return tcf_exts_exec(skb, &head->exts, res); } -static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle) +static void *cls_cgroup_get(struct tcf_proto *tp, u32 handle) { - return 0UL; + return NULL; } static int cls_cgroup_init(struct tcf_proto *tp) @@ -71,7 +71,7 @@ static void cls_cgroup_destroy_rcu(struct rcu_head *root) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg, bool ovr) + void **arg, bool ovr) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = rtnl_dereference(tp->root); @@ -128,7 +128,7 @@ static void cls_cgroup_destroy(struct tcf_proto *tp) call_rcu(&head->rcu, cls_cgroup_destroy_rcu); } -static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last) { return -EOPNOTSUPP; } 
@@ -140,7 +140,7 @@ static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg) if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long) head, arg) < 0) { + if (arg->fn(tp, head, arg) < 0) { arg->stop = 1; return; } @@ -148,7 +148,7 @@ skip: arg->count++; } -static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 55e281b20140..2a3a60ec5b86 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -382,7 +382,7 @@ static void flow_destroy_filter(struct rcu_head *head) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg, bool ovr) + void **arg, bool ovr) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *fold, *fnew; @@ -439,7 +439,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err2; - fold = (struct flow_filter *)*arg; + fold = *arg; if (fold) { err = -EINVAL; if (fold->handle != handle && handle) @@ -532,12 +532,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (perturb_period) mod_timer(&fnew->perturb_timer, jiffies + perturb_period); - if (*arg == 0) + if (!*arg) list_add_tail_rcu(&fnew->list, &head->filters); else list_replace_rcu(&fold->list, &fnew->list); - *arg = (unsigned long)fnew; + *arg = fnew; if (fold) call_rcu(&fold->rcu, flow_destroy_filter); @@ -551,10 +551,10 @@ err1: return err; } -static int flow_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int flow_delete(struct tcf_proto *tp, void *arg, bool *last) { struct flow_head *head = rtnl_dereference(tp->root); - struct flow_filter *f = (struct flow_filter *)arg; + struct flow_filter *f = arg; list_del_rcu(&f->list); call_rcu(&f->rcu, flow_destroy_filter); @@ -586,21 +586,21 @@ static void flow_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static unsigned long flow_get(struct tcf_proto *tp, u32 handle) +static void *flow_get(struct tcf_proto *tp, u32 handle) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f; list_for_each_entry(f, &head->filters, list) if (f->handle == handle) - return (unsigned long)f; - return 0; + return f; + return NULL; } -static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int flow_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct flow_filter *f = (struct flow_filter *)fh; + struct flow_filter *f = fh; struct nlattr *nest; if (f == NULL) @@ -666,7 +666,7 @@ static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg) list_for_each_entry(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long)f, arg) < 0) { + if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; break; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1474bacf4df4..d2551a03c542 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -332,15 +332,15 @@ static void fl_destroy(struct tcf_proto *tp) call_rcu(&head->rcu, fl_destroy_rcu); } -static unsigned long fl_get(struct tcf_proto *tp, u32 handle) +static void *fl_get(struct tcf_proto *tp, u32 handle) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; 
list_for_each_entry(f, &head->filters, list) if (f->handle == handle) - return (unsigned long) f; - return 0; + return f; + return NULL; } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { @@ -883,10 +883,10 @@ static u32 fl_grab_new_handle(struct tcf_proto *tp, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg, bool ovr) + void **arg, bool ovr) { struct cls_fl_head *head = rtnl_dereference(tp->root); - struct cls_fl_filter *fold = (struct cls_fl_filter *) *arg; + struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; struct nlattr **tb; struct fl_flow_mask mask = {}; @@ -977,7 +977,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, fl_hw_destroy_filter(tp, fold); } - *arg = (unsigned long) fnew; + *arg = fnew; if (fold) { list_replace_rcu(&fold->list, &fnew->list); @@ -998,10 +998,10 @@ errout_tb: return err; } -static int fl_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int fl_delete(struct tcf_proto *tp, void *arg, bool *last) { struct cls_fl_head *head = rtnl_dereference(tp->root); - struct cls_fl_filter *f = (struct cls_fl_filter *) arg; + struct cls_fl_filter *f = arg; if (!tc_skip_sw(f->flags)) rhashtable_remove_fast(&head->ht, &f->ht_node, @@ -1019,7 +1019,7 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) list_for_each_entry_rcu(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long) f, arg) < 0) { + if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; break; } @@ -1154,11 +1154,11 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } -static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { struct cls_fl_head *head = rtnl_dereference(tp->root); - struct cls_fl_filter *f = (struct cls_fl_filter *) fh; + struct cls_fl_filter *f = fh; struct nlattr *nest; struct fl_flow_key *key, *mask; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 11f178f1b2be..192255ec50bd 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -95,20 +95,20 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } -static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +static void *fw_get(struct tcf_proto *tp, u32 handle) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; if (head == NULL) - return 0; + return NULL; f = rtnl_dereference(head->ht[fw_hash(handle)]); for (; f; f = rtnl_dereference(f->next)) { if (f->id == handle) - return (unsigned long)f; + return f; } - return 0; + return NULL; } static int fw_init(struct tcf_proto *tp) @@ -147,10 +147,10 @@ static void fw_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int fw_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int fw_delete(struct tcf_proto *tp, void *arg, bool *last) { struct fw_head *head = rtnl_dereference(tp->root); - struct fw_filter *f = (struct fw_filter *)arg; + struct fw_filter *f = arg; struct fw_filter __rcu **fp; struct fw_filter *pfp; int ret = -EINVAL; @@ -230,11 +230,11 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, struct nlattr **tca, unsigned long 
*arg, + u32 handle, struct nlattr **tca, void **arg, bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); - struct fw_filter *f = (struct fw_filter *) *arg; + struct fw_filter *f = *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; int err; @@ -288,7 +288,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fw_delete_filter); - *arg = (unsigned long)fnew; + *arg = fnew; return err; } @@ -325,7 +325,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]); rcu_assign_pointer(head->ht[fw_hash(handle)], f); - *arg = (unsigned long)f; + *arg = f; return 0; errout: @@ -354,7 +354,7 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) arg->count++; continue; } - if (arg->fn(tp, (unsigned long)f, arg) < 0) { + if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; return; } @@ -363,11 +363,11 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int fw_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { struct fw_head *head = rtnl_dereference(tp->root); - struct fw_filter *f = (struct fw_filter *)fh; + struct fw_filter *f = fh; struct nlattr *nest; if (f == NULL) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index c9f6500b8080..d44e26fdae84 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -98,9 +98,9 @@ static void mall_destroy(struct tcf_proto *tp) call_rcu(&head->rcu, mall_destroy_rcu); } -static unsigned long mall_get(struct tcf_proto *tp, u32 handle) +static void *mall_get(struct tcf_proto *tp, u32 handle) { - return 0UL; + return NULL; } static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { @@ -129,7 +129,7 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg, bool ovr) + void **arg, bool ovr) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct net_device *dev = tp->q->dev_queue->dev; @@ -185,7 +185,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; - *arg = (unsigned long) head; + *arg = head; rcu_assign_pointer(tp->root, new); return 0; @@ -197,7 +197,7 @@ err_exts_init: return err; } -static int mall_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int mall_delete(struct tcf_proto *tp, void *arg, bool *last) { return -EOPNOTSUPP; } @@ -208,16 +208,16 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg) if (arg->count < arg->skip) goto skip; - if (arg->fn(tp, (unsigned long) head, arg) < 0) + if (arg->fn(tp, head, arg) < 0) arg->stop = 1; skip: arg->count++; } -static int mall_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct cls_mall_head *head = (struct cls_mall_head *) fh; + struct cls_mall_head *head = fh; struct nlattr *nest; if (!head) diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index f1e7d7850b44..3b70982394ce 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -216,7 +216,7 @@ static inline u32 from_hash(u32 id) return 16 + (id & 0xF); } -static 
unsigned long route4_get(struct tcf_proto *tp, u32 handle) +static void *route4_get(struct tcf_proto *tp, u32 handle) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_bucket *b; @@ -225,11 +225,11 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) h1 = to_hash(handle); if (h1 > 256) - return 0; + return NULL; h2 = from_hash(handle >> 16); if (h2 > 32) - return 0; + return NULL; b = rtnl_dereference(head->table[h1]); if (b) { @@ -237,9 +237,9 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) f; f = rtnl_dereference(f->next)) if (f->handle == handle) - return (unsigned long)f; + return f; } - return 0; + return NULL; } static int route4_init(struct tcf_proto *tp) @@ -294,10 +294,10 @@ static void route4_destroy(struct tcf_proto *tp) kfree_rcu(head, rcu); } -static int route4_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int route4_delete(struct tcf_proto *tp, void *arg, bool *last) { struct route4_head *head = rtnl_dereference(tp->root); - struct route4_filter *f = (struct route4_filter *)arg; + struct route4_filter *f = arg; struct route4_filter __rcu **fp; struct route4_filter *nf; struct route4_bucket *b; @@ -448,7 +448,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -467,7 +467,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - fold = (struct route4_filter *)*arg; + fold = *arg; if (fold && handle && fold->handle != handle) return -EINVAL; @@ -525,7 +525,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, } route4_reset_fastmap(head); - *arg = (unsigned long)f; + *arg = f; if (fold) { tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, route4_delete_filter); @@ -564,7 +564,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) arg->count++; continue; } - if (arg->fn(tp, (unsigned long)f, arg) < 0) { + if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; return; } @@ -575,10 +575,10 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct route4_filter *f = (struct route4_filter *)fh; + struct route4_filter *f = fh; struct nlattr *nest; u32 id; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 4adb67a73491..26203ff817f3 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -248,7 +248,7 @@ static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) BUG_ON(1); } -static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +static void *rsvp_get(struct tcf_proto *tp, u32 handle) { struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_session *s; @@ -257,17 +257,17 @@ static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) unsigned int h2 = (handle >> 8) & 0xFF; if (h2 > 16) - return 0; + return NULL; for (s = rtnl_dereference(head->ht[h1]); s; s = rtnl_dereference(s->next)) { for (f = rtnl_dereference(s->ht[h2]); f; f = rtnl_dereference(f->next)) { if (f->handle == handle) - return (unsigned long)f; + return f; } } - 
return 0; + return NULL; } static int rsvp_init(struct tcf_proto *tp) @@ -328,10 +328,10 @@ static void rsvp_destroy(struct tcf_proto *tp) kfree_rcu(data, rcu); } -static int rsvp_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last) { struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg; + struct rsvp_filter *nfp, *f = arg; struct rsvp_filter __rcu **fp; unsigned int h = f->handle; struct rsvp_session __rcu **sp; @@ -464,7 +464,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - unsigned long *arg, bool ovr) + void **arg, bool ovr) { struct rsvp_head *data = rtnl_dereference(tp->root); struct rsvp_filter *f, *nfp; @@ -493,7 +493,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout2; - f = (struct rsvp_filter *)*arg; + f = *arg; if (f) { /* Node exists: adjust only classid */ struct rsvp_filter *n; @@ -604,7 +604,7 @@ insert: RCU_INIT_POINTER(f->next, nfp); rcu_assign_pointer(*fp, f); - *arg = (unsigned long)f; + *arg = f; return 0; } } @@ -663,7 +663,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) arg->count++; continue; } - if (arg->fn(tp, (unsigned long)f, arg) < 0) { + if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; return; } @@ -674,10 +674,10 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct rsvp_filter *f = (struct rsvp_filter *)fh; + struct rsvp_filter *f = fh; struct rsvp_session *s; struct nlattr *nest; struct tc_rsvp_pinfo pinfo; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index d69f828f3fed..fb281b9b2c52 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -104,16 +104,16 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, } -static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) +static void *tcindex_get(struct tcf_proto *tp, u32 handle) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r; pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); if (p->perfect && handle >= p->alloc_hash) - return 0; + return NULL; r = tcindex_lookup(p, handle); - return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL; + return r && tcindex_filter_is_set(r) ? 
r : NULL; } static int tcindex_init(struct tcf_proto *tp) @@ -150,14 +150,14 @@ static void tcindex_destroy_fexts(struct rcu_head *head) kfree(f); } -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last) { struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter_result *r = arg; struct tcindex_filter __rcu **walk; struct tcindex_filter *f = NULL; - pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p); + pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p); if (p->perfect) { if (!r->res.class) return -ENOENT; @@ -192,8 +192,7 @@ found: } static int tcindex_destroy_element(struct tcf_proto *tp, - unsigned long arg, - struct tcf_walker *walker) + void *arg, struct tcf_walker *walker) { bool last; @@ -471,17 +470,17 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + struct tcindex_filter_result *r = *arg; int err; pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," - "p %p,r %p,*arg 0x%lx\n", - tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L); + "p %p,r %p,*arg %p\n", + tp, handle, tca, arg, opt, p, r, arg ? *arg : NULL); if (!opt) return 0; @@ -506,9 +505,7 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) if (!p->perfect[i].res.class) continue; if (walker->count >= walker->skip) { - if (walker->fn(tp, - (unsigned long) (p->perfect+i), walker) - < 0) { + if (walker->fn(tp, p->perfect + i, walker) < 0) { walker->stop = 1; return; } @@ -522,8 +519,7 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) for (f = rtnl_dereference(p->h[i]); f; f = next) { next = rtnl_dereference(f->next); if (walker->count >= walker->skip) { - if (walker->fn(tp, (unsigned long) &f->result, - walker) < 0) { + if (walker->fn(tp, &f->result, walker) < 0) { walker->stop = 1; return; } @@ -548,14 +544,14 @@ static void tcindex_destroy(struct tcf_proto *tp) } -static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; + struct tcindex_filter_result *r = fh; struct nlattr *nest; - pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p\n", + pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n", tp, fh, skb, t, p, r); pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4ed51d347d0a..5a3f78181526 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -289,7 +289,7 @@ out: } -static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +static void *u32_get(struct tcf_proto *tp, u32 handle) { struct tc_u_hnode *ht; struct tc_u_common *tp_c = tp->data; @@ -300,12 +300,12 @@ static unsigned long u32_get(struct tcf_proto *tp, u32 handle) ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); if (!ht) - return 0; + return NULL; if 
(TC_U32_KEY(handle) == 0) - return (unsigned long)ht; + return ht; - return (unsigned long)u32_lookup_key(ht, handle); + return u32_lookup_key(ht, handle); } static u32 gen_new_htid(struct tc_u_common *tp_c) @@ -605,9 +605,9 @@ static void u32_destroy(struct tcf_proto *tp) tp->data = NULL; } -static int u32_delete(struct tcf_proto *tp, unsigned long arg, bool *last) +static int u32_delete(struct tcf_proto *tp, void *arg, bool *last) { - struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; + struct tc_u_hnode *ht = arg; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); struct tc_u_common *tp_c = tp->data; int ret = 0; @@ -831,7 +831,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, unsigned long *arg, bool ovr) + struct nlattr **tca, void **arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -858,7 +858,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - n = (struct tc_u_knode *)*arg; + n = *arg; if (n) { struct tc_u_knode *new; @@ -925,7 +925,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); - *arg = (unsigned long)ht; + *arg = ht; return 0; } @@ -1020,7 +1020,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); - *arg = (unsigned long)n; + *arg = n; return 0; } @@ -1054,7 +1054,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) if (ht->prio != tp->prio) continue; if (arg->count >= arg->skip) { - if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + if (arg->fn(tp, ht, arg) < 0) { arg->stop = 1; return; } @@ -1068,7 +1068,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) arg->count++; continue; } - if (arg->fn(tp, (unsigned long)n, arg) < 0) { + if (arg->fn(tp, n, arg) < 0) { arg->stop = 1; return; } @@ -1078,10 +1078,10 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } -static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, +static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct tc_u_knode *n = (struct tc_u_knode *)fh; + struct tc_u_knode *n = fh; struct tc_u_hnode *ht_up, *ht_down; struct nlattr *nest; @@ -1095,7 +1095,7 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { - struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; + struct tc_u_hnode *ht = fh; u32 divisor = ht->divisor + 1; if (nla_put_u32(skb, TCA_U32_DIVISOR, divisor)) -- cgit v1.2.3 From b04c80d3a7e228cfb832cdb1c9ce8151f174669c Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Sat, 5 Aug 2017 12:38:25 +0200 Subject: ipv6: sr: export SRH insertion functions This patch exports the seg6_do_srh_encap() and seg6_do_srh_inline() functions. It also removes the CONFIG_IPV6_SEG6_INLINE knob that enabled the compilation of seg6_do_srh_inline(). This function is now built-in. Signed-off-by: David Lebrun Signed-off-by: David S. 
Miller --- include/net/seg6.h | 2 ++ net/ipv6/Kconfig | 12 ------------ net/ipv6/seg6_iptunnel.c | 12 ++++-------- 3 files changed, 6 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/seg6.h b/include/net/seg6.h index 4e0357517d79..a32abb040e1d 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -58,5 +58,7 @@ extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); +extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); +extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); #endif diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 48c452959d2c..50181a96923e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -315,18 +315,6 @@ config IPV6_SEG6_LWTUNNEL If unsure, say N. -config IPV6_SEG6_INLINE - bool "IPv6: direct Segment Routing Header insertion " - depends on IPV6_SEG6_LWTUNNEL - ---help--- - Support for direct insertion of the Segment Routing Header, - also known as inline mode. Be aware that direct insertion of - extension headers (as opposed to encapsulation) may break - multiple mechanisms such as PMTUD or IPSec AH. Use this feature - only if you know exactly what you are doing. - - If unsure, say N. - config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 264d772d3c7d..501233040570 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -91,7 +91,7 @@ static void set_tun_src(struct net *net, struct net_device *dev, } /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { struct net *net = dev_net(skb_dst(skb)->dev); struct ipv6hdr *hdr, *inner_hdr; @@ -141,10 +141,10 @@ static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } +EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* insert an SRH within an IPv6 packet, just after the IPv6 header */ -#ifdef CONFIG_IPV6_SEG6_INLINE -static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -193,7 +193,7 @@ static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -#endif +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); static int seg6_do_srh(struct sk_buff *skb) { @@ -209,12 +209,10 @@ static int seg6_do_srh(struct sk_buff *skb) } switch (tinfo->mode) { -#ifdef CONFIG_IPV6_SEG6_INLINE case SEG6_IPTUN_MODE_INLINE: err = seg6_do_srh_inline(skb, tinfo->srh); skb_reset_inner_headers(skb); break; -#endif case SEG6_IPTUN_MODE_ENCAP: err = seg6_do_srh_encap(skb, tinfo->srh); break; @@ -357,10 +355,8 @@ static int seg6_build_state(struct nlattr *nla, return -EINVAL; switch (tuninfo->mode) { -#ifdef CONFIG_IPV6_SEG6_INLINE case SEG6_IPTUN_MODE_INLINE: break; -#endif case SEG6_IPTUN_MODE_ENCAP: break; default: -- cgit v1.2.3 From d1df6fd8a1d22d37cffa0075ab8ad423ce656777 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Sat, 5 Aug 2017 12:38:26 +0200 Subject: ipv6: sr: define core operations for seg6local lightweight tunnel This patch implements a new type of lightweight tunnel named seg6local. A seg6local lwt is defined by a type of action and a set of parameters. 
The action represents the operation to perform on the packets matching the lwt's route, and is not necessarily an encapsulation. The parameters are arguments for the processing function. Each action is defined in a struct seg6_action_desc within seg6_action_table[]. This structure contains the action, mandatory attributes, the processing function, and a static headroom size required by the action. The mandatory attributes are encoded as a bitmask field. The static headroom is set to a non-zero value when the processing function always adds a constant number of bytes to the skb (e.g. the header size for encapsulations). To facilitate rtnetlink-related operations such as parsing, fill_encap, and cmp_encap, each type of action parameter is associated with three function pointers in seg6_action_params[]. All actions defined in seg6_local.h are detailed in [1]. [1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01 Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/linux/seg6_local.h | 6 + include/net/seg6.h | 2 + include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/seg6_local.h | 68 +++++++++ net/core/lwtunnel.c | 2 + net/ipv6/Kconfig | 3 +- net/ipv6/Makefile | 2 +- net/ipv6/seg6.c | 5 + net/ipv6/seg6_local.c | 320 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 include/linux/seg6_local.h create mode 100644 include/uapi/linux/seg6_local.h create mode 100644 net/ipv6/seg6_local.c (limited to 'include') diff --git a/include/linux/seg6_local.h b/include/linux/seg6_local.h new file mode 100644 index 000000000000..ee63e76fe0c7 --- /dev/null +++ b/include/linux/seg6_local.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SEG6_LOCAL_H +#define _LINUX_SEG6_LOCAL_H + +#include + +#endif diff --git a/include/net/seg6.h b/include/net/seg6.h index a32abb040e1d..5379f550f521 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -56,6 +56,8 @@ extern int seg6_init(void); extern void seg6_exit(void); extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); +extern int seg6_local_init(void); +extern void seg6_local_exit(void); extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 92724cba1eba..7fdd19ca7511 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -11,6 +11,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, LWTUNNEL_ENCAP_BPF, + LWTUNNEL_ENCAP_SEG6_LOCAL, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h new file mode 100644 index 000000000000..ef2d8c3e76c1 --- /dev/null +++ b/include/uapi/linux/seg6_local.h @@ -0,0 +1,68 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#ifndef _UAPI_LINUX_SEG6_LOCAL_H +#define _UAPI_LINUX_SEG6_LOCAL_H + +#include + +enum { + SEG6_LOCAL_UNSPEC, + SEG6_LOCAL_ACTION, + SEG6_LOCAL_SRH, + SEG6_LOCAL_TABLE, + SEG6_LOCAL_NH4, + SEG6_LOCAL_NH6, + SEG6_LOCAL_IIF, + SEG6_LOCAL_OIF, + __SEG6_LOCAL_MAX, +}; +#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) + +enum { + SEG6_LOCAL_ACTION_UNSPEC = 0, + /* node segment */ + SEG6_LOCAL_ACTION_END = 1, + /* adjacency segment (IPv6 cross-connect) */ + SEG6_LOCAL_ACTION_END_X = 2, + /* lookup of next seg NH in table */ + SEG6_LOCAL_ACTION_END_T = 3, + /* decap and L2 cross-connect */ + SEG6_LOCAL_ACTION_END_DX2 = 4, + /* decap and IPv6 cross-connect */ + SEG6_LOCAL_ACTION_END_DX6 = 5, + /* decap and IPv4 cross-connect */ + SEG6_LOCAL_ACTION_END_DX4 = 6, + /* decap and lookup of DA in v6 table */ + SEG6_LOCAL_ACTION_END_DT6 = 7, + /* decap and lookup of DA in v4 table */ + SEG6_LOCAL_ACTION_END_DT4 = 8, + /* binding segment with insertion */ + SEG6_LOCAL_ACTION_END_B6 = 9, + /* binding segment with encapsulation */ + SEG6_LOCAL_ACTION_END_B6_ENCAP = 10, + /* binding segment with MPLS encap */ + SEG6_LOCAL_ACTION_END_BM = 11, + /* lookup last seg in table */ + SEG6_LOCAL_ACTION_END_S = 12, + /* forward to SR-unaware VNF with static proxy */ + SEG6_LOCAL_ACTION_END_AS = 13, + /* forward to SR-unaware VNF with masquerading */ + SEG6_LOCAL_ACTION_END_AM = 14, + + __SEG6_LOCAL_ACTION_MAX, +}; + +#define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) + +#endif diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 435f35f9a61c..0b171756453c 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -44,6 +44,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; + case LWTUNNEL_ENCAP_SEG6_LOCAL: + return "SEG6LOCAL"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 50181a96923e..0d722396dce6 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -311,7 +311,8 @@ config IPV6_SEG6_LWTUNNEL ---help--- Support for encapsulation of packets within an outer IPv6 header and a Segment Routing Header using the lightweight - tunnels mechanism. + tunnels mechanism. Also enable support for advanced local + processing of SRv6 packets based on their active segment. If unsure, say N. 
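As a minimal sketch of the table-driven dispatch described in the commit message (hypothetical, not part of this patch: the End.X handler below is invented for illustration, and real per-action handlers arrive in follow-up patches), a new action with one mandatory attribute would be wired into seg6_action_table[] like this:

	/* Hypothetical stub: a real implementation would forward the
	 * packet to the IPv6 next hop held in slwt->nh6, i.e. the
	 * parsed SEG6_LOCAL_NH6 attribute; the stub simply drops.
	 */
	static int input_action_end_x(struct sk_buff *skb,
				      struct seg6_local_lwt *slwt)
	{
		kfree_skb(skb);
		return -EINVAL;
	}

	static struct seg6_action_desc seg6_action_table[] = {
		{
			.action	= SEG6_LOCAL_ACTION_END,
			.attrs	= 0,
		},
		{
			.action	= SEG6_LOCAL_ACTION_END_X,
			.attrs	= (1 << SEG6_LOCAL_NH6),
			.input	= input_action_end_x,
		},
	};

With this shape, parse_nla_action() refuses to build the lwt unless the SEG6_LOCAL_NH6 attribute was supplied and an input handler is set, so a misconfigured route fails at netlink configuration time rather than in the packet fast path.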
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index f8b24c2e0d77..10e342363793 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -23,7 +23,7 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o ipv6-$(CONFIG_NETLABEL) += calipso.o -ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o +ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 81c2339b3285..c81407770956 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -456,6 +456,10 @@ int __init seg6_init(void) err = seg6_iptunnel_init(); if (err) goto out_unregister_pernet; + + err = seg6_local_init(); + if (err) + goto out_unregister_pernet; #endif #ifdef CONFIG_IPV6_SEG6_HMAC @@ -471,6 +475,7 @@ out: #ifdef CONFIG_IPV6_SEG6_HMAC out_unregister_iptun: #ifdef CONFIG_IPV6_SEG6_LWTUNNEL + seg6_local_exit(); seg6_iptunnel_exit(); #endif #endif diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c new file mode 100644 index 000000000000..53615d7e0723 --- /dev/null +++ b/net/ipv6/seg6_local.c @@ -0,0 +1,320 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IPV6_SEG6_HMAC +#include +#endif + +struct seg6_local_lwt; + +struct seg6_action_desc { + int action; + unsigned long attrs; + int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt); + int static_headroom; +}; + +struct seg6_local_lwt { + int action; + struct ipv6_sr_hdr *srh; + int table; + struct in_addr nh4; + struct in6_addr nh6; + int iif; + int oif; + + int headroom; + struct seg6_action_desc *desc; +}; + +static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct seg6_local_lwt *)lwt->data; +} + +static struct seg6_action_desc seg6_action_table[] = { + { + .action = SEG6_LOCAL_ACTION_END, + .attrs = 0, + }, +}; + +static struct seg6_action_desc *__get_action_desc(int action) +{ + struct seg6_action_desc *desc; + int i, count; + + count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc); + for (i = 0; i < count; i++) { + desc = &seg6_action_table[i]; + if (desc->action == action) + return desc; + } + + return NULL; +} + +static int seg6_local_input(struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct seg6_action_desc *desc; + struct seg6_local_lwt *slwt; + + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); + desc = slwt->desc; + + return desc->input(skb, slwt); +} + +static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { + [SEG6_LOCAL_ACTION] = { .type = NLA_U32 }, + [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, + [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, + [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, +}; + +struct seg6_action_param { + int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); + int (*put)(struct 
sk_buff *skb, struct seg6_local_lwt *slwt); + int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b); +}; + +static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { + [SEG6_LOCAL_SRH] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_TABLE] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_NH4] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_NH6] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_IIF] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_OIF] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, +}; + +static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct seg6_action_param *param; + struct seg6_action_desc *desc; + int i, err; + + desc = __get_action_desc(slwt->action); + if (!desc) + return -EINVAL; + + if (!desc->input) + return -EOPNOTSUPP; + + slwt->desc = desc; + slwt->headroom += desc->static_headroom; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (desc->attrs & (1 << i)) { + if (!attrs[i]) + return -EINVAL; + + param = &seg6_action_params[i]; + + err = param->parse(attrs, slwt); + if (err < 0) + return err; + } + } + + return 0; +} + +static int seg6_local_build_state(struct nlattr *nla, unsigned int family, + const void *cfg, struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[SEG6_LOCAL_MAX + 1]; + struct lwtunnel_state *newts; + struct seg6_local_lwt *slwt; + int err; + + err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy, + extack); + + if (err < 0) + return err; + + if (!tb[SEG6_LOCAL_ACTION]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*slwt)); + if (!newts) + return -ENOMEM; + + slwt = seg6_local_lwtunnel(newts); + slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]); + + err = parse_nla_action(tb, slwt); + if (err < 0) + goto out_free; + + newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL; + newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT; + newts->headroom = slwt->headroom; + + *ts = newts; + + return 0; + +out_free: + kfree(slwt->srh); + kfree(newts); + return err; +} + +static void seg6_local_destroy_state(struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + + kfree(slwt->srh); +} + +static int seg6_local_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + struct seg6_action_param *param; + int i, err; + + if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action)) + return -EMSGSIZE; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (slwt->desc->attrs & (1 << i)) { + param = &seg6_action_params[i]; + err = param->put(skb, slwt); + if (err < 0) + return err; + } + } + + return 0; +} + +static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + unsigned long attrs; + int nlsize; + + nlsize = nla_total_size(4); /* action */ + + attrs = slwt->desc->attrs; + + if (attrs & (1 << SEG6_LOCAL_SRH)) + nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3); + + if (attrs & (1 << SEG6_LOCAL_TABLE)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_NH4)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_NH6)) + nlsize += nla_total_size(16); + + if (attrs & (1 << SEG6_LOCAL_IIF)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_OIF)) + nlsize += nla_total_size(4); + + return nlsize; +} + +static int seg6_local_cmp_encap(struct lwtunnel_state *a, + struct 
lwtunnel_state *b) +{ + struct seg6_local_lwt *slwt_a, *slwt_b; + struct seg6_action_param *param; + int i; + + slwt_a = seg6_local_lwtunnel(a); + slwt_b = seg6_local_lwtunnel(b); + + if (slwt_a->action != slwt_b->action) + return 1; + + if (slwt_a->desc->attrs != slwt_b->desc->attrs) + return 1; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (slwt_a->desc->attrs & (1 << i)) { + param = &seg6_action_params[i]; + if (param->cmp(slwt_a, slwt_b)) + return 1; + } + } + + return 0; +} + +static const struct lwtunnel_encap_ops seg6_local_ops = { + .build_state = seg6_local_build_state, + .destroy_state = seg6_local_destroy_state, + .input = seg6_local_input, + .fill_encap = seg6_local_fill_encap, + .get_encap_size = seg6_local_get_encap_size, + .cmp_encap = seg6_local_cmp_encap, + .owner = THIS_MODULE, +}; + +int __init seg6_local_init(void) +{ + return lwtunnel_encap_add_ops(&seg6_local_ops, + LWTUNNEL_ENCAP_SEG6_LOCAL); +} + +void seg6_local_exit(void) +{ + lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); +} -- cgit v1.2.3 From 6c2c1dcb185f1e44e1c895781dbaba40195234f9 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Sun, 6 Aug 2017 16:15:39 +0300 Subject: net: dsa: Change DSA slave FDB API to be switchdev independent In order to support FDB add/del operations on a notifier chain, the slave API needs to be changed to be switchdev independent. Signed-off-by: Arkadi Sharshevsky Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 12 +++++------- drivers/net/dsa/b53/b53_priv.h | 8 +++----- drivers/net/dsa/microchip/ksz_common.c | 34 ++++++++++++++++------------------ drivers/net/dsa/mt7530.c | 14 ++++++-------- drivers/net/dsa/mv88e6xxx/chip.c | 12 +++++------- drivers/net/dsa/qca8k.c | 15 ++++++--------- include/net/dsa.h | 8 +++----- net/dsa/switch.c | 8 +++++--- 8 files changed, 49 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 7f36d3e3c98b..53361796607a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1214,8 +1214,7 @@ static int b53_arl_op(struct b53_device *dev, int op, int port, } int b53_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct b53_device *priv = ds->priv; @@ -1230,22 +1229,21 @@ int b53_fdb_prepare(struct dsa_switch *ds, int port, EXPORT_SYMBOL(b53_fdb_prepare); void b53_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct b53_device *priv = ds->priv; - if (b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, true)) + if (b53_arl_op(priv, 0, port, addr, vid, true)) pr_err("%s: failed to add MAC address\n", __func__); } EXPORT_SYMBOL(b53_fdb_add); int b53_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb) + const unsigned char *addr, u16 vid) { struct b53_device *priv = ds->priv; - return b53_arl_op(priv, 0, port, fdb->addr, fdb->vid, false); + return b53_arl_op(priv, 0, port, addr, vid, false); } EXPORT_SYMBOL(b53_fdb_del); diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 155a9c48c317..d417bcaec71d 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -397,13 +397,11 @@ int b53_vlan_dump(struct dsa_switch *ds, int port, struct
switchdev_obj_port_vlan *vlan, switchdev_obj_dump_cb_t *cb); int b53_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans); + const unsigned char *addr, u16 vid); void b53_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans); + const unsigned char *addr, u16 vid); int b53_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb); + const unsigned char *addr, u16 vid); int b53_fdb_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_fdb *fdb, switchdev_obj_dump_cb_t *cb); diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index b313ecdf2919..db828080ee93 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -679,8 +679,7 @@ static int ksz_port_vlan_dump(struct dsa_switch *ds, int port, } static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { /* nothing needed */ @@ -707,8 +706,7 @@ struct alu_struct { }; static void ksz_port_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct ksz_device *dev = ds->priv; u32 alu_table[4]; @@ -717,12 +715,12 @@ static void ksz_port_fdb_add(struct dsa_switch *ds, int port, mutex_lock(&dev->alu_mutex); /* find any entry with mac & vid */ - data = fdb->vid << ALU_FID_INDEX_S; - data |= ((fdb->addr[0] << 8) | fdb->addr[1]); + data = vid << ALU_FID_INDEX_S; + data |= ((addr[0] << 8) | addr[1]); ksz_write32(dev, REG_SW_ALU_INDEX_0, data); - data = ((fdb->addr[2] << 24) | (fdb->addr[3] << 16)); - data |= ((fdb->addr[4] << 8) | fdb->addr[5]); + data = ((addr[2] << 24) | (addr[3] << 16)); + data |= ((addr[4] << 8) | addr[5]); ksz_write32(dev, REG_SW_ALU_INDEX_1, data); /* start read operation */ @@ -740,12 +738,12 @@ static void ksz_port_fdb_add(struct dsa_switch *ds, int port, /* update ALU entry */ alu_table[0] = ALU_V_STATIC_VALID; alu_table[1] |= BIT(port); - if (fdb->vid) + if (vid) alu_table[1] |= ALU_V_USE_FID; - alu_table[2] = (fdb->vid << ALU_V_FID_S); - alu_table[2] |= ((fdb->addr[0] << 8) | fdb->addr[1]); - alu_table[3] = ((fdb->addr[2] << 24) | (fdb->addr[3] << 16)); - alu_table[3] |= ((fdb->addr[4] << 8) | fdb->addr[5]); + alu_table[2] = (vid << ALU_V_FID_S); + alu_table[2] |= ((addr[0] << 8) | addr[1]); + alu_table[3] = ((addr[2] << 24) | (addr[3] << 16)); + alu_table[3] |= ((addr[4] << 8) | addr[5]); write_table(ds, alu_table); @@ -760,7 +758,7 @@ exit: } static int ksz_port_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb) + const unsigned char *addr, u16 vid) { struct ksz_device *dev = ds->priv; u32 alu_table[4]; @@ -770,12 +768,12 @@ static int ksz_port_fdb_del(struct dsa_switch *ds, int port, mutex_lock(&dev->alu_mutex); /* read any entry with mac & vid */ - data = fdb->vid << ALU_FID_INDEX_S; - data |= ((fdb->addr[0] << 8) | fdb->addr[1]); + data = vid << ALU_FID_INDEX_S; + data |= ((addr[0] << 8) | addr[1]); ksz_write32(dev, REG_SW_ALU_INDEX_0, data); - data = ((fdb->addr[2] << 24) | (fdb->addr[3] << 16)); - data |= ((fdb->addr[4] << 8) | fdb->addr[5]); + data = ((addr[2] << 24) | (addr[3] << 16)); + data |= ((addr[4] << 8) | addr[5]); ksz_write32(dev, REG_SW_ALU_INDEX_1, data); /* start read operation */ diff --git 
a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 1e46418a3b74..430e3ab65a49 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -802,8 +802,7 @@ mt7530_port_bridge_leave(struct dsa_switch *ds, int port, static int mt7530_port_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct mt7530_priv *priv = ds->priv; int ret; @@ -813,7 +812,7 @@ mt7530_port_fdb_prepare(struct dsa_switch *ds, int port, * is called while the entry is still available. */ mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, fdb->vid, 0, fdb->addr, -1, STATIC_ENT); + mt7530_fdb_write(priv, vid, 0, addr, -1, STATIC_ENT); ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); mutex_unlock(&priv->reg_mutex); @@ -822,28 +821,27 @@ mt7530_port_fdb_prepare(struct dsa_switch *ds, int port, static void mt7530_port_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct mt7530_priv *priv = ds->priv; u8 port_mask = BIT(port); mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, fdb->vid, port_mask, fdb->addr, -1, STATIC_ENT); + mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_ENT); mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); mutex_unlock(&priv->reg_mutex); } static int mt7530_port_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb) + const unsigned char *addr, u16 vid) { struct mt7530_priv *priv = ds->priv; int ret; u8 port_mask = BIT(port); mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, fdb->vid, port_mask, fdb->addr, -1, STATIC_EMP); + mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_EMP); ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); mutex_unlock(&priv->reg_mutex); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 521738c4cd17..823697526db4 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1407,8 +1407,7 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, } static int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { /* We don't need any dynamic resource from the kernel (yet), * so skip the prepare phase. 
@@ -1417,13 +1416,12 @@ static int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port, } static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct mv88e6xxx_chip *chip = ds->priv; mutex_lock(&chip->reg_lock); - if (mv88e6xxx_port_db_load_purge(chip, port, fdb->addr, fdb->vid, + if (mv88e6xxx_port_db_load_purge(chip, port, addr, vid, MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC)) dev_err(ds->dev, "p%d: failed to load unicast MAC address\n", port); @@ -1431,13 +1429,13 @@ static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, } static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb) + const unsigned char *addr, u16 vid) { struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); - err = mv88e6xxx_port_db_load_purge(chip, port, fdb->addr, fdb->vid, + err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, MV88E6XXX_G1_ATU_DATA_STATE_UNUSED); mutex_unlock(&chip->reg_lock); diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 36c169b0c705..2fb5df9dbd64 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -778,8 +778,7 @@ qca8k_port_fdb_insert(struct qca8k_priv *priv, const u8 *addr, static int qca8k_port_fdb_prepare(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; @@ -788,33 +787,31 @@ qca8k_port_fdb_prepare(struct dsa_switch *ds, int port, * when port_fdb_add is called an entry is still available. Otherwise * the last free entry might have been used up by auto learning */ - return qca8k_port_fdb_insert(priv, fdb->addr, 0, fdb->vid); + return qca8k_port_fdb_insert(priv, addr, 0, vid); } static void qca8k_port_fdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const unsigned char *addr, u16 vid) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; u16 port_mask = BIT(port); /* Update the FDB entry adding the port_mask */ - qca8k_port_fdb_insert(priv, fdb->addr, port_mask, fdb->vid); + qca8k_port_fdb_insert(priv, addr, port_mask, vid); } static int qca8k_port_fdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb) + const unsigned char *addr, u16 vid) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; u16 port_mask = BIT(port); - u16 vid = fdb->vid; if (!vid) vid = 1; - return qca8k_fdb_del(priv, fdb->addr, port_mask, vid); + return qca8k_fdb_del(priv, addr, port_mask, vid); } static int diff --git a/include/net/dsa.h b/include/net/dsa.h index 0b1a0622b33c..ba11005dac8c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -386,13 +386,11 @@ struct dsa_switch_ops { * Forwarding database */ int (*port_fdb_prepare)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans); + const unsigned char *addr, u16 vid); void (*port_fdb_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans); + const unsigned char *addr, u16 vid); int (*port_fdb_del)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_fdb *fdb); + const unsigned char *addr, u16 vid); int (*port_fdb_dump)(struct dsa_switch *ds, int port, struct switchdev_obj_port_fdb *fdb, switchdev_obj_dump_cb_t 
*cb); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 97e2e9c8cf3f..a9edfbad3889 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -94,10 +94,11 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->ops->port_fdb_prepare(ds, info->port, fdb, trans); + return ds->ops->port_fdb_prepare(ds, info->port, fdb->addr, + fdb->vid); } - ds->ops->port_fdb_add(ds, info->port, fdb, trans); + ds->ops->port_fdb_add(ds, info->port, fdb->addr, fdb->vid); return 0; } @@ -114,7 +115,8 @@ static int dsa_switch_fdb_del(struct dsa_switch *ds, if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return ds->ops->port_fdb_del(ds, info->port, fdb); + return ds->ops->port_fdb_del(ds, info->port, fdb->addr, + fdb->vid); } static int dsa_switch_mdb_add(struct dsa_switch *ds, -- cgit v1.2.3 From 1b6dd556c3045ca5fa31cc1e98a4a43afa680e1e Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Sun, 6 Aug 2017 16:15:40 +0300 Subject: net: dsa: Remove prepare phase for FDB The prepare phase for FDB add is unneeded because most DSA devices can fail during bus transactions (SPI, I2C, etc.); thus, the prepare phase cannot guarantee success of the commit stage. Support for learning FDBs through a notifier chain, which will be introduced in the following patches, will provide the ability to notify the bridge back about successful offload. Signed-off-by: Arkadi Sharshevsky Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 17 +++-------------- drivers/net/dsa/b53/b53_priv.h | 6 ++---- drivers/net/dsa/bcm_sf2.c | 1 - drivers/net/dsa/microchip/ksz_common.c | 24 ++++++++++-------------- drivers/net/dsa/mt7530.c | 25 ++++--------------------- drivers/net/dsa/mv88e6xxx/chip.c | 23 +++++++---------------- drivers/net/dsa/qca8k.c | 18 +----------------- include/net/dsa.h | 4 +--- net/dsa/dsa_priv.h | 4 +--- net/dsa/port.c | 4 +--- net/dsa/slave.c | 4 +++- net/dsa/switch.c | 14 +++----------- 12 files changed, 36 insertions(+), 108 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 53361796607a..3cf4f0a04453 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1213,8 +1213,8 @@ static int b53_arl_op(struct b53_device *dev, int op, int port, return b53_arl_rw_op(dev, 0); } -int b53_fdb_prepare(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +int b53_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) { struct b53_device *priv = ds->priv; @@ -1224,17 +1224,7 @@ int b53_fdb_prepare(struct dsa_switch *ds, int port, if (is5325(priv) || is5365(priv)) return -EOPNOTSUPP; - return 0; -} -EXPORT_SYMBOL(b53_fdb_prepare); - -void b53_fdb_add(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) -{ - struct b53_device *priv = ds->priv; - - if (b53_arl_op(priv, 0, port, addr, vid, true)) - pr_err("%s: failed to add MAC address\n", __func__); + return b53_arl_op(priv, 0, port, addr, vid, true); } EXPORT_SYMBOL(b53_fdb_add); @@ -1563,7 +1553,6 @@ static const struct dsa_switch_ops b53_switch_ops = { .port_vlan_add = b53_vlan_add, .port_vlan_del = b53_vlan_del, .port_vlan_dump = b53_vlan_dump, - .port_fdb_prepare = b53_fdb_prepare, .port_fdb_dump = b53_fdb_dump, .port_fdb_add = b53_fdb_add, .port_fdb_del = b53_fdb_del, diff --git a/drivers/net/dsa/b53/b53_priv.h
b/drivers/net/dsa/b53/b53_priv.h index d417bcaec71d..f29c892efa6b 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -396,10 +396,8 @@ int b53_vlan_del(struct dsa_switch *ds, int port, int b53_vlan_dump(struct dsa_switch *ds, int port, struct switchdev_obj_port_vlan *vlan, switchdev_obj_dump_cb_t *cb); -int b53_fdb_prepare(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); -void b53_fdb_add(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); +int b53_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); int b53_fdb_del(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int b53_fdb_dump(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 558667c814c9..1907b27297c3 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1022,7 +1022,6 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .port_vlan_add = b53_vlan_add, .port_vlan_del = b53_vlan_del, .port_vlan_dump = b53_vlan_dump, - .port_fdb_prepare = b53_fdb_prepare, .port_fdb_dump = b53_fdb_dump, .port_fdb_add = b53_fdb_add, .port_fdb_del = b53_fdb_del, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index db828080ee93..b55f3649ff93 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -678,14 +678,6 @@ static int ksz_port_vlan_dump(struct dsa_switch *ds, int port, return err; } -static int ksz_port_fdb_prepare(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) -{ - /* nothing needed */ - - return 0; -} - struct alu_struct { /* entry 1 */ u8 is_static:1; @@ -705,12 +697,13 @@ struct alu_struct { u8 mac[ETH_ALEN]; }; -static void ksz_port_fdb_add(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +static int ksz_port_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) { struct ksz_device *dev = ds->priv; u32 alu_table[4]; u32 data; + int ret = 0; mutex_lock(&dev->alu_mutex); @@ -727,7 +720,8 @@ static void ksz_port_fdb_add(struct dsa_switch *ds, int port, ksz_write32(dev, REG_SW_ALU_CTRL__4, ALU_READ | ALU_START); /* wait to be finished */ - if (wait_alu_ready(dev, ALU_START, 1000) < 0) { + ret = wait_alu_ready(dev, ALU_START, 1000); + if (ret < 0) { dev_dbg(dev->dev, "Failed to read ALU\n"); goto exit; } @@ -750,11 +744,14 @@ static void ksz_port_fdb_add(struct dsa_switch *ds, int port, ksz_write32(dev, REG_SW_ALU_CTRL__4, ALU_WRITE | ALU_START); /* wait to be finished */ - if (wait_alu_ready(dev, ALU_START, 1000) < 0) - dev_dbg(dev->dev, "Failed to read ALU\n"); + ret = wait_alu_ready(dev, ALU_START, 1000); + if (ret < 0) + dev_dbg(dev->dev, "Failed to write ALU\n"); exit: mutex_unlock(&dev->alu_mutex); + + return ret; } static int ksz_port_fdb_del(struct dsa_switch *ds, int port, @@ -1128,7 +1125,6 @@ static const struct dsa_switch_ops ksz_switch_ops = { .port_vlan_add = ksz_port_vlan_add, .port_vlan_del = ksz_port_vlan_del, .port_vlan_dump = ksz_port_vlan_dump, - .port_fdb_prepare = ksz_port_fdb_prepare, .port_fdb_dump = ksz_port_fdb_dump, .port_fdb_add = ksz_port_fdb_add, .port_fdb_del = ksz_port_fdb_del, diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 430e3ab65a49..f92aae8947e6 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -801,35 +801,19 @@ mt7530_port_bridge_leave(struct dsa_switch *ds, int port, } static int -mt7530_port_fdb_prepare(struct 
dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) -{ - struct mt7530_priv *priv = ds->priv; - int ret; - - /* Because auto-learned entrie shares the same FDB table. - * an entry is reserved with no port_mask to make sure fdb_add - * is called while the entry is still available. - */ - mutex_lock(&priv->reg_mutex); - mt7530_fdb_write(priv, vid, 0, addr, -1, STATIC_ENT); - ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); - mutex_unlock(&priv->reg_mutex); - - return ret; -} - -static void mt7530_port_fdb_add(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid) { struct mt7530_priv *priv = ds->priv; + int ret; u8 port_mask = BIT(port); mutex_lock(&priv->reg_mutex); mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_ENT); - mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); + ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, 0); mutex_unlock(&priv->reg_mutex); + + return ret; } static int @@ -1013,7 +997,6 @@ static struct dsa_switch_ops mt7530_switch_ops = { .port_stp_state_set = mt7530_stp_state_set, .port_bridge_join = mt7530_port_bridge_join, .port_bridge_leave = mt7530_port_bridge_leave, - .port_fdb_prepare = mt7530_port_fdb_prepare, .port_fdb_add = mt7530_port_fdb_add, .port_fdb_del = mt7530_port_fdb_del, .port_fdb_dump = mt7530_port_fdb_dump, diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 823697526db4..68cc1f6fb75e 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1406,26 +1406,18 @@ static int mv88e6xxx_port_db_load_purge(struct mv88e6xxx_chip *chip, int port, return mv88e6xxx_g1_atu_loadpurge(chip, vlan.fid, &entry); } -static int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) -{ - /* We don't need any dynamic resource from the kernel (yet), - * so skip the prepare phase. - */ - return 0; -} - -static void mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +static int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) { struct mv88e6xxx_chip *chip = ds->priv; + int err; mutex_lock(&chip->reg_lock); - if (mv88e6xxx_port_db_load_purge(chip, port, addr, vid, - MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC)) - dev_err(ds->dev, "p%d: failed to load unicast MAC address\n", - port); + err = mv88e6xxx_port_db_load_purge(chip, port, addr, vid, + MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC); mutex_unlock(&chip->reg_lock); + + return err; } static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, @@ -3905,7 +3897,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .port_vlan_add = mv88e6xxx_port_vlan_add, .port_vlan_del = mv88e6xxx_port_vlan_del, .port_vlan_dump = mv88e6xxx_port_vlan_dump, - .port_fdb_prepare = mv88e6xxx_port_fdb_prepare, .port_fdb_add = mv88e6xxx_port_fdb_add, .port_fdb_del = mv88e6xxx_port_fdb_del, .port_fdb_dump = mv88e6xxx_port_fdb_dump, diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 2fb5df9dbd64..f8ef823349bc 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -777,28 +777,13 @@ qca8k_port_fdb_insert(struct qca8k_priv *priv, const u8 *addr, } static int -qca8k_port_fdb_prepare(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) -{ - struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; - - /* The FDB table for static and auto learned entries is the same. 
We - * need to reserve an entry with no port_mask set to make sure that - * when port_fdb_add is called an entry is still available. Otherwise - * the last free entry might have been used up by auto learning - */ - return qca8k_port_fdb_insert(priv, addr, 0, vid); -} - -static void qca8k_port_fdb_add(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; u16 port_mask = BIT(port); - /* Update the FDB entry adding the port_mask */ - qca8k_port_fdb_insert(priv, addr, port_mask, vid); + return qca8k_port_fdb_insert(priv, addr, port_mask, vid); } static int @@ -866,7 +851,6 @@ static const struct dsa_switch_ops qca8k_switch_ops = { .port_stp_state_set = qca8k_port_stp_state_set, .port_bridge_join = qca8k_port_bridge_join, .port_bridge_leave = qca8k_port_bridge_leave, - .port_fdb_prepare = qca8k_port_fdb_prepare, .port_fdb_add = qca8k_port_fdb_add, .port_fdb_del = qca8k_port_fdb_del, .port_fdb_dump = qca8k_port_fdb_dump, diff --git a/include/net/dsa.h b/include/net/dsa.h index ba11005dac8c..446fc438cb80 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -385,9 +385,7 @@ struct dsa_switch_ops { /* * Forwarding database */ - int (*port_fdb_prepare)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - void (*port_fdb_add)(struct dsa_switch *ds, int port, + int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 945ded148c9c..04cd711ed2fd 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -44,7 +44,6 @@ struct dsa_notifier_bridge_info { /* DSA_NOTIFIER_FDB_* */ struct dsa_notifier_fdb_info { const struct switchdev_obj_port_fdb *fdb; - struct switchdev_trans *trans; int sw_index; int port; }; @@ -122,8 +121,7 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans); int dsa_port_fdb_add(struct dsa_port *dp, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans); + const struct switchdev_obj_port_fdb *fdb); int dsa_port_fdb_del(struct dsa_port *dp, const struct switchdev_obj_port_fdb *fdb); int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, diff --git a/net/dsa/port.c b/net/dsa/port.c index efc3bce3a89d..bd271b9cc1f2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -147,13 +147,11 @@ int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, } int dsa_port_fdb_add(struct dsa_port *dp, - const struct switchdev_obj_port_fdb *fdb, - struct switchdev_trans *trans) + const struct switchdev_obj_port_fdb *fdb) { struct dsa_notifier_fdb_info info = { .sw_index = dp->ds->index, .port = dp->index, - .trans = trans, .fdb = fdb, }; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3b36c47472c6..bb7ab26ef768 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -251,7 +251,9 @@ static int dsa_slave_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj), trans); + if (switchdev_trans_ph_prepare(trans)) + return 0; + err = dsa_port_fdb_add(dp, SWITCHDEV_OBJ_PORT_FDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index a9edfbad3889..eb20e0fee0e1 100644 
--- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -84,23 +84,15 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { const struct switchdev_obj_port_fdb *fdb = info->fdb; - struct switchdev_trans *trans = info->trans; /* Do not care yet about other switch chips of the fabric */ if (ds->index != info->sw_index) return 0; - if (switchdev_trans_ph_prepare(trans)) { - if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) - return -EOPNOTSUPP; - - return ds->ops->port_fdb_prepare(ds, info->port, fdb->addr, - fdb->vid); - } - - ds->ops->port_fdb_add(ds, info->port, fdb->addr, fdb->vid); + if (!ds->ops->port_fdb_add) + return -EOPNOTSUPP; - return 0; + return ds->ops->port_fdb_add(ds, info->port, fdb->addr, fdb->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, -- cgit v1.2.3 From c069fcd82c571953b8aaf68769afe9ccb1aa7a9f Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Sun, 6 Aug 2017 16:15:46 +0300 Subject: net: dsa: Remove support for bypass bridge port attributes/vlan set The bridge port attributes/vlan for DSA devices should be set only from bridge code. Furthermore, the VLANs are fully synced with the bridge, so there is no need for special dump support. Signed-off-by: Arkadi Sharshevsky Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 4 ---- net/dsa/dsa_priv.h | 4 ---- net/dsa/port.c | 12 ------------ net/dsa/slave.c | 6 ------ 4 files changed, 26 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 446fc438cb80..d7b9bdd0e768 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -378,10 +378,6 @@ struct dsa_switch_ops { struct switchdev_trans *trans); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); - int (*port_vlan_dump)(struct dsa_switch *ds, int port, - struct switchdev_obj_port_vlan *vlan, - switchdev_obj_dump_cb_t *cb); - /* * Forwarding database */ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 5d4e668b9471..7f297bcefa0c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -147,10 +147,6 @@ int dsa_port_vlan_add(struct dsa_port *dp, struct switchdev_trans *trans); int dsa_port_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan); -int dsa_port_vlan_dump(struct dsa_port *dp, - struct switchdev_obj_port_vlan *vlan, - switchdev_obj_dump_cb_t *cb); - /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); diff --git a/net/dsa/port.c b/net/dsa/port.c index 86e0585215bf..ce1921663cdd 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -246,15 +246,3 @@ int dsa_port_vlan_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info); } - -int dsa_port_vlan_dump(struct dsa_port *dp, - struct switchdev_obj_port_vlan *vlan, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_switch *ds = dp->ds; - - if (ds->ops->port_vlan_dump) - return ds->ops->port_vlan_dump(ds, dp->index, vlan, cb); - - return -EOPNOTSUPP; -} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index e9c1d8c5de0f..ccf670679343 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -302,9 +302,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_MDB: err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb); break; - case SWITCHDEV_OBJ_ID_PORT_VLAN: - err = dsa_port_vlan_dump(dp, SWITCHDEV_OBJ_PORT_VLAN(obj), cb); - break; default: err = -EOPNOTSUPP; break; @@ -988,9 +985,6 @@ static
const struct net_device_ops dsa_slave_netdev_ops = { .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup, .ndo_poll_controller = dsa_slave_poll_controller, #endif - .ndo_bridge_getlink = switchdev_port_bridge_getlink, - .ndo_bridge_setlink = switchdev_port_bridge_setlink, - .ndo_bridge_dellink = switchdev_port_bridge_dellink, .ndo_get_phys_port_name = dsa_slave_get_phys_port_name, .ndo_setup_tc = dsa_slave_setup_tc, .ndo_get_stats64 = dsa_slave_get_stats64, -- cgit v1.2.3 From dc0cbff3ff9fe331160c2be2b3f47564e247137d Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Sun, 6 Aug 2017 16:15:48 +0300 Subject: net: dsa: Remove redundant MDB dump support Currently the MDB HW database is synced with the bridge's, so there is no need to support special dump functionality. Signed-off-by: Arkadi Sharshevsky Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 4 ---- net/dsa/dsa_priv.h | 2 -- net/dsa/port.c | 11 ----------- net/dsa/slave.c | 3 --- 4 files changed, 20 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d7b9bdd0e768..4ef185997b6f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -400,10 +400,6 @@ struct dsa_switch_ops { struct switchdev_trans *trans); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); - int (*port_mdb_dump)(struct dsa_switch *ds, int port, - struct switchdev_obj_port_mdb *mdb, - switchdev_obj_dump_cb_t *cb); - /* * RXNFC */ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7f297bcefa0c..9c890de6e4dd 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -140,8 +140,6 @@ int dsa_port_mdb_add(struct dsa_port *dp, struct switchdev_trans *trans); int dsa_port_mdb_del(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb); -int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, - switchdev_obj_dump_cb_t *cb); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans); diff --git a/net/dsa/port.c b/net/dsa/port.c index ce1921663cdd..73787828953a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -210,17 +210,6 @@ int dsa_port_mdb_del(struct dsa_port *dp, return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info); } -int dsa_port_mdb_dump(struct dsa_port *dp, struct switchdev_obj_port_mdb *mdb, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_switch *ds = dp->ds; - - if (ds->ops->port_mdb_dump) - return ds->ops->port_mdb_dump(ds, dp->index, mdb, cb); - - return -EOPNOTSUPP; -} - int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ccf670679343..5807c905fd1d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -299,9 +299,6 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, case SWITCHDEV_OBJ_ID_PORT_FDB: err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb); break; - case SWITCHDEV_OBJ_ID_PORT_MDB: - err = dsa_port_mdb_dump(dp, SWITCHDEV_OBJ_PORT_MDB(obj), cb); - break; default: err = -EOPNOTSUPP; break; -- cgit v1.2.3 From 2bedde1abbef5eec211308f0293dd7681b0513ec Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Sun, 6 Aug 2017 16:15:49 +0300 Subject: net: dsa: Move FDB dump implementation inside DSA Of all switchdev devices, only DSA requires a special FDB dump. This is due to its inability to sync the hardware-learned FDBs with the bridge. The FDB dump support is therefore removed from switchdev and moved inside DSA; the sketch below illustrates the new driver-side interface.
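A hypothetical driver-side dump hook under the new callback interface might look as follows (an illustrative sketch only, not part of this patch; the foo_* structures and iterator are invented for this note):

	static int foo_port_fdb_dump(struct dsa_switch *ds, int port,
				     dsa_fdb_dump_cb_t *cb, void *data)
	{
		struct foo_priv *priv = ds->priv;
		struct foo_fdb_entry ent;
		int err = 0;

		/* Walk the hardware FDB and report each entry seen on
		 * this port; DSA supplies cb() and performs the netlink
		 * formatting in dsa_slave_port_fdb_do_dump().
		 */
		foo_for_each_fdb_entry(priv, &ent) {
			if (!(ent.port_mask & BIT(port)))
				continue;
			err = cb(ent.mac, ent.vid, ent.is_static, data);
			if (err)
				break;
		}

		return err;
	}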
Signed-off-by: Arkadi Sharshevsky Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 16 ++---- drivers/net/dsa/b53/b53_priv.h | 3 +- drivers/net/dsa/microchip/ksz_common.c | 20 ++----- drivers/net/dsa/mt7530.c | 10 +--- drivers/net/dsa/mv88e6xxx/chip.c | 38 ++++--------- drivers/net/dsa/qca8k.c | 15 ++--- include/net/dsa.h | 5 +- include/net/switchdev.h | 12 ---- net/dsa/dsa_priv.h | 2 - net/dsa/port.c | 11 ---- net/dsa/slave.c | 100 +++++++++++++++++++++++++-------- net/switchdev/switchdev.c | 84 --------------------------- 12 files changed, 112 insertions(+), 204 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 0176d8087344..274f3679f33d 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1227,8 +1227,7 @@ static void b53_arl_search_rd(struct b53_device *dev, u8 idx, } static int b53_fdb_copy(int port, const struct b53_arl_entry *ent, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { if (!ent->is_valid) return 0; @@ -1236,16 +1235,11 @@ static int b53_fdb_copy(int port, const struct b53_arl_entry *ent, if (port != ent->port) return 0; - ether_addr_copy(fdb->addr, ent->mac); - fdb->vid = ent->vid; - fdb->ndm_state = ent->is_static ? NUD_NOARP : NUD_REACHABLE; - - return cb(&fdb->obj); + return cb(ent->mac, ent->vid, ent->is_static, data); } int b53_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct b53_device *priv = ds->priv; struct b53_arl_entry results[2]; @@ -1263,13 +1257,13 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, return ret; b53_arl_search_rd(priv, 0, &results[0]); - ret = b53_fdb_copy(port, &results[0], fdb, cb); + ret = b53_fdb_copy(port, &results[0], cb, data); if (ret) return ret; if (priv->num_arl_entries > 2) { b53_arl_search_rd(priv, 1, &results[1]); - ret = b53_fdb_copy(port, &results[1], fdb, cb); + ret = b53_fdb_copy(port, &results[1], cb, data); if (ret) return ret; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index af5d6c166bff..01bd8cbe9a3f 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -398,8 +398,7 @@ int b53_fdb_add(struct dsa_switch *ds, int port, int b53_fdb_del(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int b53_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb); + dsa_fdb_dump_cb_t *cb, void *data); int b53_mirror_add(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void b53_mirror_del(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 4de9d90a4bb3..56cd6d365352 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -805,12 +805,11 @@ static void convert_alu(struct alu_struct *alu, u32 *alu_table) } static int ksz_port_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct ksz_device *dev = ds->priv; int ret = 0; - u32 data; + u32 ksz_data; u32 alu_table[4]; struct alu_struct alu; int timeout; @@ -823,8 +822,8 @@ static int ksz_port_fdb_dump(struct dsa_switch *ds, int port, do {
timeout = 1000; do { - ksz_read32(dev, REG_SW_ALU_CTRL__4, &data); - if ((data & ALU_VALID) || !(data & ALU_START)) + ksz_read32(dev, REG_SW_ALU_CTRL__4, &ksz_data); + if ((ksz_data & ALU_VALID) || !(ksz_data & ALU_START)) break; usleep_range(1, 10); } while (timeout-- > 0); @@ -841,18 +840,11 @@ static int ksz_port_fdb_dump(struct dsa_switch *ds, int port, convert_alu(&alu, alu_table); if (alu.port_forward & BIT(port)) { - fdb->vid = alu.fid; - if (alu.is_static) - fdb->ndm_state = NUD_NOARP; - else - fdb->ndm_state = NUD_REACHABLE; - ether_addr_copy(fdb->addr, alu.mac); - - ret = cb(&fdb->obj); + ret = cb(alu.mac, alu.fid, alu.is_static, data); if (ret) goto exit; } - } while (data & ALU_START); + } while (ksz_data & ALU_START); exit: diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index f92aae8947e6..12700710f26d 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -834,8 +834,7 @@ mt7530_port_fdb_del(struct dsa_switch *ds, int port, static int mt7530_port_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct mt7530_priv *priv = ds->priv; struct mt7530_fdb _fdb = { 0 }; @@ -853,11 +852,8 @@ mt7530_port_fdb_dump(struct dsa_switch *ds, int port, if (rsp & ATC_SRCH_HIT) { mt7530_fdb_read(priv, &_fdb); if (_fdb.port_mask & BIT(port)) { - ether_addr_copy(fdb->addr, _fdb.mac); - fdb->vid = _fdb.vid; - fdb->ndm_state = _fdb.noarp ? - NUD_NOARP : NUD_REACHABLE; - ret = cb(&fdb->obj); + ret = cb(_fdb.mac, _fdb.vid, _fdb.noarp, + data); if (ret < 0) break; } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 1f5c202b974d..918d8f0fe091 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1381,10 +1381,10 @@ static int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, static int mv88e6xxx_port_db_dump_fid(struct mv88e6xxx_chip *chip, u16 fid, u16 vid, int port, - struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct mv88e6xxx_atu_entry addr; + bool is_static; int err; addr.state = MV88E6XXX_G1_ATU_DATA_STATE_UNUSED; @@ -1401,24 +1401,12 @@ static int mv88e6xxx_port_db_dump_fid(struct mv88e6xxx_chip *chip, if (addr.trunk || (addr.portvec & BIT(port)) == 0) continue; - if (obj->id == SWITCHDEV_OBJ_ID_PORT_FDB) { - struct switchdev_obj_port_fdb *fdb; - - if (!is_unicast_ether_addr(addr.mac)) - continue; - - fdb = SWITCHDEV_OBJ_PORT_FDB(obj); - fdb->vid = vid; - ether_addr_copy(fdb->addr, addr.mac); - if (addr.state == MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC) - fdb->ndm_state = NUD_NOARP; - else - fdb->ndm_state = NUD_REACHABLE; - } else { - return -EOPNOTSUPP; - } + if (!is_unicast_ether_addr(addr.mac)) + continue; - err = cb(obj); + is_static = (addr.state == + MV88E6XXX_G1_ATU_DATA_STATE_UC_STATIC); + err = cb(addr.mac, vid, is_static, data); if (err) return err; } while (!is_broadcast_ether_addr(addr.mac)); @@ -1427,8 +1415,7 @@ static int mv88e6xxx_port_db_dump_fid(struct mv88e6xxx_chip *chip, } static int mv88e6xxx_port_db_dump(struct mv88e6xxx_chip *chip, int port, - struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct mv88e6xxx_vtu_entry vlan = { .vid = chip->info->max_vid, @@ -1441,7 +1428,7 @@ static int mv88e6xxx_port_db_dump(struct mv88e6xxx_chip *chip, int port, if (err) return err; - err = mv88e6xxx_port_db_dump_fid(chip, fid, 0, port, obj, cb); + err = 
mv88e6xxx_port_db_dump_fid(chip, fid, 0, port, cb, data); if (err) return err; @@ -1455,7 +1442,7 @@ static int mv88e6xxx_port_db_dump(struct mv88e6xxx_chip *chip, int port, break; err = mv88e6xxx_port_db_dump_fid(chip, vlan.fid, vlan.vid, port, - obj, cb); + cb, data); if (err) return err; } while (vlan.vid < chip->info->max_vid); @@ -1464,14 +1451,13 @@ static int mv88e6xxx_port_db_dump(struct mv88e6xxx_chip *chip, int port, } static int mv88e6xxx_port_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct mv88e6xxx_chip *chip = ds->priv; int err; mutex_lock(&chip->reg_lock); - err = mv88e6xxx_port_db_dump(chip, port, &fdb->obj, cb); + err = mv88e6xxx_port_db_dump(chip, port, cb, data); mutex_unlock(&chip->reg_lock); return err; diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index f8ef823349bc..17977f06cb98 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -801,27 +801,20 @@ qca8k_port_fdb_del(struct dsa_switch *ds, int port, static int qca8k_port_fdb_dump(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) + dsa_fdb_dump_cb_t *cb, void *data) { struct qca8k_priv *priv = (struct qca8k_priv *)ds->priv; struct qca8k_fdb _fdb = { 0 }; int cnt = QCA8K_NUM_FDB_RECORDS; + bool is_static; int ret = 0; mutex_lock(&priv->reg_mutex); while (cnt-- && !qca8k_fdb_next(priv, &_fdb, port)) { if (!_fdb.aging) break; - - ether_addr_copy(fdb->addr, _fdb.mac); - fdb->vid = _fdb.vid; - if (_fdb.aging == QCA8K_ATU_STATUS_STATIC) - fdb->ndm_state = NUD_NOARP; - else - fdb->ndm_state = NUD_REACHABLE; - - ret = cb(&fdb->obj); + is_static = (_fdb.aging == QCA8K_ATU_STATUS_STATIC); + ret = cb(_fdb.mac, _fdb.vid, is_static, data); if (ret) break; } diff --git a/include/net/dsa.h b/include/net/dsa.h index 4ef185997b6f..a4f66dbb4b7c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -272,6 +272,8 @@ static inline u8 dsa_upstream_port(struct dsa_switch *ds) return ds->rtable[dst->cpu_dp->ds->index]; } +typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, + bool is_static, void *data); struct dsa_switch_ops { /* * Legacy probing. 
@@ -386,8 +388,7 @@ struct dsa_switch_ops { int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int (*port_fdb_dump)(struct dsa_switch *ds, int port, - struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb); + dsa_fdb_dump_cb_t *cb, void *data); /* * Multicast database diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 8ae9e3b6392e..d2637a685ca6 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -208,9 +208,6 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); -int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, - struct net_device *filter_dev, int *idx); void switchdev_port_fwd_mark_set(struct net_device *dev, struct net_device *group_dev, bool joining); @@ -309,15 +306,6 @@ static inline int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], return -EOPNOTSUPP; } -static inline int switchdev_port_fdb_dump(struct sk_buff *skb, - struct netlink_callback *cb, - struct net_device *dev, - struct net_device *filter_dev, - int *idx) -{ - return *idx; -} - static inline bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9c890de6e4dd..1debf9c42fc4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -133,8 +133,6 @@ int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid); -int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb); int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); diff --git a/net/dsa/port.c b/net/dsa/port.c index 73787828953a..659676ba3f8b 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -173,17 +173,6 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info); } -int dsa_port_fdb_dump(struct dsa_port *dp, struct switchdev_obj_port_fdb *fdb, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_switch *ds = dp->ds; - - if (ds->ops->port_fdb_dump) - return ds->ops->port_fdb_dump(ds, dp->index, fdb, cb); - - return -EOPNOTSUPP; -} - int dsa_port_mdb_add(struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5807c905fd1d..8c79011c5a83 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -199,6 +199,83 @@ out: return 0; } +struct dsa_slave_dump_ctx { + struct net_device *dev; + struct sk_buff *skb; + struct netlink_callback *cb; + int idx; +}; + +static int +dsa_slave_port_fdb_do_dump(const unsigned char *addr, u16 vid, + bool is_static, void *data) +{ + struct dsa_slave_dump_ctx *dump = data; + u32 portid = NETLINK_CB(dump->cb->skb).portid; + u32 seq = dump->cb->nlh->nlmsg_seq; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + if (dump->idx < dump->cb->args[2]) + goto skip; + + nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, + sizeof(*ndm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = NTF_SELF; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dump->dev->ifindex; + ndm->ndm_state = is_static ? 
NUD_NOARP : NUD_REACHABLE; + + if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) + goto nla_put_failure; + + if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) + goto nla_put_failure; + + nlmsg_end(dump->skb, nlh); + +skip: + dump->idx++; + return 0; + +nla_put_failure: + nlmsg_cancel(dump->skb, nlh); + return -EMSGSIZE; +} + +static int +dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, struct net_device *filter_dev, + int *idx) +{ + struct dsa_slave_dump_ctx dump = { + .dev = dev, + .skb = skb, + .cb = cb, + .idx = *idx, + }; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = p->dp; + struct dsa_switch *ds = dp->ds; + int err; + + if (!ds->ops->port_fdb_dump) + return -EOPNOTSUPP; + + err = ds->ops->port_fdb_dump(ds, dp->index, + dsa_slave_port_fdb_do_dump, + &dump); + *idx = dump.idx; + return err; +} + static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -287,26 +364,6 @@ static int dsa_slave_port_obj_del(struct net_device *dev, return err; } -static int dsa_slave_port_obj_dump(struct net_device *dev, - struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; - int err; - - switch (obj->id) { - case SWITCHDEV_OBJ_ID_PORT_FDB: - err = dsa_port_fdb_dump(dp, SWITCHDEV_OBJ_PORT_FDB(obj), cb); - break; - default: - err = -EOPNOTSUPP; - break; - } - - return err; -} - static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { @@ -974,7 +1031,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_fdb_add = dsa_legacy_fdb_add, .ndo_fdb_del = dsa_legacy_fdb_del, - .ndo_fdb_dump = switchdev_port_fdb_dump, + .ndo_fdb_dump = dsa_slave_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -992,7 +1049,6 @@ static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_set = dsa_slave_port_attr_set, .switchdev_port_obj_add = dsa_slave_port_obj_add, .switchdev_port_obj_del = dsa_slave_port_obj_del, - .switchdev_port_obj_dump = dsa_slave_port_obj_dump, }; static struct device_type dsa_type = { diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 25dc67ef9d37..3d32981b9aa1 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1009,90 +1009,6 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); -struct switchdev_fdb_dump { - struct switchdev_obj_port_fdb fdb; - struct net_device *dev; - struct sk_buff *skb; - struct netlink_callback *cb; - int idx; -}; - -static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj) -{ - struct switchdev_obj_port_fdb *fdb = SWITCHDEV_OBJ_PORT_FDB(obj); - struct switchdev_fdb_dump *dump = - container_of(fdb, struct switchdev_fdb_dump, fdb); - u32 portid = NETLINK_CB(dump->cb->skb).portid; - u32 seq = dump->cb->nlh->nlmsg_seq; - struct nlmsghdr *nlh; - struct ndmsg *ndm; - - if (dump->idx < dump->cb->args[2]) - goto skip; - - nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, - sizeof(*ndm), NLM_F_MULTI); - if (!nlh) - return -EMSGSIZE; - - ndm = nlmsg_data(nlh); - ndm->ndm_family = AF_BRIDGE; - ndm->ndm_pad1 = 0; - ndm->ndm_pad2 = 0; - ndm->ndm_flags = NTF_SELF; - ndm->ndm_type = 0; - ndm->ndm_ifindex = dump->dev->ifindex; - ndm->ndm_state = fdb->ndm_state; - - if 
(nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, fdb->addr)) - goto nla_put_failure; - - if (fdb->vid && nla_put_u16(dump->skb, NDA_VLAN, fdb->vid)) - goto nla_put_failure; - - nlmsg_end(dump->skb, nlh); - -skip: - dump->idx++; - return 0; - -nla_put_failure: - nlmsg_cancel(dump->skb, nlh); - return -EMSGSIZE; -} - -/** - * switchdev_port_fdb_dump - Dump port FDB (MAC/VLAN) entries - * - * @skb: netlink skb - * @cb: netlink callback - * @dev: port device - * @filter_dev: filter device - * @idx: - * - * Dump FDB entries from switch device. - */ -int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, - struct net_device *filter_dev, int *idx) -{ - struct switchdev_fdb_dump dump = { - .fdb.obj.orig_dev = dev, - .fdb.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, - .dev = dev, - .skb = skb, - .cb = cb, - .idx = *idx, - }; - int err; - - err = switchdev_port_obj_dump(dev, &dump.fdb.obj, - switchdev_port_fdb_dump_cb); - *idx = dump.idx; - return err; -} -EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); - bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { -- cgit v1.2.3 From 29ab586c3d83f81c435e269cace9a1619afb5bbd Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Sun, 6 Aug 2017 16:15:51 +0300 Subject: net: switchdev: Remove bridge bypass support from switchdev Currently the bridge port flags, vlans, FDBs and MDBs can be offloaded through the bridge code, making the switchdev's SELF bridge bypass implementation redundant. This implies several changes: - No need for dump infra in switchdev; DSA's special case is handled privately. - Remove obj_dump from switchdev_ops. - FDBs are removed from obj_add/del routines, because they are offloaded through the bridge notification chain. - The switchdev_port_bridge_xx() and switchdev_port_fdb_xx() functions can be removed. Signed-off-by: Arkadi Sharshevsky Reviewed-by: Vivien Didelot Acked-by: Jiri Pirko Reviewed-by: Ivan Vecera Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/switchdev.h | 75 -------- net/switchdev/switchdev.c | 435 ---------------------------------------- 2 files changed, 510 deletions(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d2637a685ca6..d767b7991887 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -74,7 +74,6 @@ struct switchdev_attr { enum switchdev_obj_id { SWITCHDEV_OBJ_ID_UNDEFINED, SWITCHDEV_OBJ_ID_PORT_VLAN, - SWITCHDEV_OBJ_ID_PORT_FDB, SWITCHDEV_OBJ_ID_PORT_MDB, }; @@ -97,17 +96,6 @@ struct switchdev_obj_port_vlan { #define SWITCHDEV_OBJ_PORT_VLAN(obj) \ container_of(obj, struct switchdev_obj_port_vlan, obj) -/* SWITCHDEV_OBJ_ID_PORT_FDB */ -struct switchdev_obj_port_fdb { - struct switchdev_obj obj; - unsigned char addr[ETH_ALEN]; - u16 vid; - u16 ndm_state; -}; - -#define SWITCHDEV_OBJ_PORT_FDB(obj) \ - container_of(obj, struct switchdev_obj_port_fdb, obj) - /* SWITCHDEV_OBJ_ID_PORT_MDB */ struct switchdev_obj_port_mdb { struct switchdev_obj obj; @@ -135,8 +123,6 @@ typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); * @switchdev_port_obj_add: Add an object to port (see switchdev_obj_*). * * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj_*). - * - * @switchdev_port_obj_dump: Dump port objects (see switchdev_obj_*).
*/ struct switchdev_ops { int (*switchdev_port_attr_get)(struct net_device *dev, @@ -149,9 +135,6 @@ struct switchdev_ops { struct switchdev_trans *trans); int (*switchdev_port_obj_del)(struct net_device *dev, const struct switchdev_obj *obj); - int (*switchdev_port_obj_dump)(struct net_device *dev, - struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb); }; enum switchdev_notifier_type { @@ -189,25 +172,10 @@ int switchdev_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj); int switchdev_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj); -int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb); int register_switchdev_notifier(struct notifier_block *nb); int unregister_switchdev_notifier(struct notifier_block *nb); int call_switchdev_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info); -int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u32 filter_mask, - int nlflags); -int switchdev_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int switchdev_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, - u16 vid, u16 nlm_flags); -int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, - u16 vid); void switchdev_port_fwd_mark_set(struct net_device *dev, struct net_device *group_dev, bool joining); @@ -246,13 +214,6 @@ static inline int switchdev_port_obj_del(struct net_device *dev, return -EOPNOTSUPP; } -static inline int switchdev_port_obj_dump(struct net_device *dev, - const struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb) -{ - return -EOPNOTSUPP; -} - static inline int register_switchdev_notifier(struct notifier_block *nb) { return 0; @@ -270,42 +231,6 @@ static inline int call_switchdev_notifiers(unsigned long val, return NOTIFY_DONE; } -static inline int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, - u32 seq, struct net_device *dev, - u32 filter_mask, int nlflags) -{ - return -EOPNOTSUPP; -} - -static inline int switchdev_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) -{ - return -EOPNOTSUPP; -} - -static inline int switchdev_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) -{ - return -EOPNOTSUPP; -} - -static inline int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, - u16 vid, u16 nlm_flags) -{ - return -EOPNOTSUPP; -} - -static inline int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return -EOPNOTSUPP; -} - static inline bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 3d32981b9aa1..0531b41d1f2d 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -343,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); - case SWITCHDEV_OBJ_ID_PORT_FDB: - return sizeof(struct switchdev_obj_port_fdb); case SWITCHDEV_OBJ_ID_PORT_MDB: return sizeof(struct switchdev_obj_port_mdb); default: @@ 
-534,43 +532,6 @@ int switchdev_port_obj_del(struct net_device *dev, } EXPORT_SYMBOL_GPL(switchdev_port_obj_del); -/** - * switchdev_port_obj_dump - Dump port objects - * - * @dev: port device - * @id: object ID - * @obj: object to dump - * @cb: function to call with a filled object - * - * rtnl_lock must be held. - */ -int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj, - switchdev_obj_dump_cb_t *cb) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct list_head *iter; - int err = -EOPNOTSUPP; - - ASSERT_RTNL(); - - if (ops && ops->switchdev_port_obj_dump) - return ops->switchdev_port_obj_dump(dev, obj, cb); - - /* Switch device port(s) may be stacked under - * bond/team/vlan dev, so recurse down to dump objects on - * first port at bottom of stack. - */ - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = switchdev_port_obj_dump(lower_dev, obj, cb); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(switchdev_port_obj_dump); - static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); /** @@ -613,402 +574,6 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); -struct switchdev_vlan_dump { - struct switchdev_obj_port_vlan vlan; - struct sk_buff *skb; - u32 filter_mask; - u16 flags; - u16 begin; - u16 end; -}; - -static int switchdev_port_vlan_dump_put(struct switchdev_vlan_dump *dump) -{ - struct bridge_vlan_info vinfo; - - vinfo.flags = dump->flags; - - if (dump->begin == 0 && dump->end == 0) { - return 0; - } else if (dump->begin == dump->end) { - vinfo.vid = dump->begin; - if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, - sizeof(vinfo), &vinfo)) - return -EMSGSIZE; - } else { - vinfo.vid = dump->begin; - vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; - if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, - sizeof(vinfo), &vinfo)) - return -EMSGSIZE; - vinfo.vid = dump->end; - vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; - vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END; - if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, - sizeof(vinfo), &vinfo)) - return -EMSGSIZE; - } - - return 0; -} - -static int switchdev_port_vlan_dump_cb(struct switchdev_obj *obj) -{ - struct switchdev_obj_port_vlan *vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); - struct switchdev_vlan_dump *dump = - container_of(vlan, struct switchdev_vlan_dump, vlan); - int err = 0; - - if (vlan->vid_begin > vlan->vid_end) - return -EINVAL; - - if (dump->filter_mask & RTEXT_FILTER_BRVLAN) { - dump->flags = vlan->flags; - for (dump->begin = dump->end = vlan->vid_begin; - dump->begin <= vlan->vid_end; - dump->begin++, dump->end++) { - err = switchdev_port_vlan_dump_put(dump); - if (err) - return err; - } - } else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) { - if (dump->begin > vlan->vid_begin && - dump->begin >= vlan->vid_end) { - if ((dump->begin - 1) == vlan->vid_end && - dump->flags == vlan->flags) { - /* prepend */ - dump->begin = vlan->vid_begin; - } else { - err = switchdev_port_vlan_dump_put(dump); - dump->flags = vlan->flags; - dump->begin = vlan->vid_begin; - dump->end = vlan->vid_end; - } - } else if (dump->end <= vlan->vid_begin && - dump->end < vlan->vid_end) { - if ((dump->end + 1) == vlan->vid_begin && - dump->flags == vlan->flags) { - /* append */ - dump->end = vlan->vid_end; - } else { - err = switchdev_port_vlan_dump_put(dump); - dump->flags = vlan->flags; - dump->begin = vlan->vid_begin; - dump->end = vlan->vid_end; - } - } else { - err = -EINVAL; - } - } - - return err; -} - -static 
int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, - u32 filter_mask) -{ - struct switchdev_vlan_dump dump = { - .vlan.obj.orig_dev = dev, - .vlan.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .skb = skb, - .filter_mask = filter_mask, - }; - int err = 0; - - if ((filter_mask & RTEXT_FILTER_BRVLAN) || - (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { - err = switchdev_port_obj_dump(dev, &dump.vlan.obj, - switchdev_port_vlan_dump_cb); - if (err) - goto err_out; - if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) - /* last one */ - err = switchdev_port_vlan_dump_put(&dump); - } - -err_out: - return err == -EOPNOTSUPP ? 0 : err; -} - -/** - * switchdev_port_bridge_getlink - Get bridge port attributes - * - * @dev: port device - * - * Called for SELF on rtnl_bridge_getlink to get bridge port - * attributes. - */ -int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, - struct net_device *dev, u32 filter_mask, - int nlflags) -{ - struct switchdev_attr attr = { - .orig_dev = dev, - .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, - }; - u16 mode = BRIDGE_MODE_UNDEF; - u32 mask = BR_LEARNING | BR_LEARNING_SYNC | BR_FLOOD; - int err; - - if (!netif_is_bridge_port(dev)) - return -EOPNOTSUPP; - - err = switchdev_port_attr_get(dev, &attr); - if (err && err != -EOPNOTSUPP) - return err; - - return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, - attr.u.brport_flags, mask, nlflags, - filter_mask, switchdev_port_vlan_fill); -} -EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); - -static int switchdev_port_br_setflag(struct net_device *dev, - struct nlattr *nlattr, - unsigned long brport_flag) -{ - struct switchdev_attr attr = { - .orig_dev = dev, - .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, - }; - u8 flag = nla_get_u8(nlattr); - int err; - - err = switchdev_port_attr_get(dev, &attr); - if (err) - return err; - - if (flag) - attr.u.brport_flags |= brport_flag; - else - attr.u.brport_flags &= ~brport_flag; - - return switchdev_port_attr_set(dev, &attr); -} - -static const struct nla_policy -switchdev_port_bridge_policy[IFLA_BRPORT_MAX + 1] = { - [IFLA_BRPORT_STATE] = { .type = NLA_U8 }, - [IFLA_BRPORT_COST] = { .type = NLA_U32 }, - [IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 }, - [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, - [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, - [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, - [IFLA_BRPORT_FAST_LEAVE] = { .type = NLA_U8 }, - [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, - [IFLA_BRPORT_LEARNING_SYNC] = { .type = NLA_U8 }, - [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, -}; - -static int switchdev_port_br_setlink_protinfo(struct net_device *dev, - struct nlattr *protinfo) -{ - struct nlattr *attr; - int rem; - int err; - - err = nla_validate_nested(protinfo, IFLA_BRPORT_MAX, - switchdev_port_bridge_policy, NULL); - if (err) - return err; - - nla_for_each_nested(attr, protinfo, rem) { - switch (nla_type(attr)) { - case IFLA_BRPORT_LEARNING: - err = switchdev_port_br_setflag(dev, attr, - BR_LEARNING); - break; - case IFLA_BRPORT_LEARNING_SYNC: - err = switchdev_port_br_setflag(dev, attr, - BR_LEARNING_SYNC); - break; - case IFLA_BRPORT_UNICAST_FLOOD: - err = switchdev_port_br_setflag(dev, attr, BR_FLOOD); - break; - default: - err = -EOPNOTSUPP; - break; - } - if (err) - return err; - } - - return 0; -} - -static int switchdev_port_br_afspec(struct net_device *dev, - struct nlattr *afspec, - int (*f)(struct net_device *dev, - const struct switchdev_obj *obj)) -{ - struct nlattr *attr; - struct bridge_vlan_info *vinfo; - struct 
switchdev_obj_port_vlan vlan = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - }; - int rem; - int err; - - nla_for_each_nested(attr, afspec, rem) { - if (nla_type(attr) != IFLA_BRIDGE_VLAN_INFO) - continue; - if (nla_len(attr) != sizeof(struct bridge_vlan_info)) - return -EINVAL; - vinfo = nla_data(attr); - if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK) - return -EINVAL; - vlan.flags = vinfo->flags; - if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (vlan.vid_begin) - return -EINVAL; - vlan.vid_begin = vinfo->vid; - /* don't allow range of pvids */ - if (vlan.flags & BRIDGE_VLAN_INFO_PVID) - return -EINVAL; - } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { - if (!vlan.vid_begin) - return -EINVAL; - vlan.vid_end = vinfo->vid; - if (vlan.vid_end <= vlan.vid_begin) - return -EINVAL; - err = f(dev, &vlan.obj); - if (err) - return err; - vlan.vid_begin = 0; - } else { - if (vlan.vid_begin) - return -EINVAL; - vlan.vid_begin = vinfo->vid; - vlan.vid_end = vinfo->vid; - err = f(dev, &vlan.obj); - if (err) - return err; - vlan.vid_begin = 0; - } - } - - return 0; -} - -/** - * switchdev_port_bridge_setlink - Set bridge port attributes - * - * @dev: port device - * @nlh: netlink header - * @flags: netlink flags - * - * Called for SELF on rtnl_bridge_setlink to set bridge port - * attributes. - */ -int switchdev_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) -{ - struct nlattr *protinfo; - struct nlattr *afspec; - int err = 0; - - if (!netif_is_bridge_port(dev)) - return -EOPNOTSUPP; - - protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), - IFLA_PROTINFO); - if (protinfo) { - err = switchdev_port_br_setlink_protinfo(dev, protinfo); - if (err) - return err; - } - - afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), - IFLA_AF_SPEC); - if (afspec) - err = switchdev_port_br_afspec(dev, afspec, - switchdev_port_obj_add); - - return err; -} -EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); - -/** - * switchdev_port_bridge_dellink - Set bridge port attributes - * - * @dev: port device - * @nlh: netlink header - * @flags: netlink flags - * - * Called for SELF on rtnl_bridge_dellink to set bridge port - * attributes. - */ -int switchdev_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) -{ - struct nlattr *afspec; - - if (!netif_is_bridge_port(dev)) - return -EOPNOTSUPP; - - afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), - IFLA_AF_SPEC); - if (afspec) - return switchdev_port_br_afspec(dev, afspec, - switchdev_port_obj_del); - - return 0; -} -EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); - -/** - * switchdev_port_fdb_add - Add FDB (MAC/VLAN) entry to port - * - * @ndmsg: netlink hdr - * @nlattr: netlink attributes - * @dev: port device - * @addr: MAC address to add - * @vid: VLAN to add - * - * Add FDB entry to switch device. - */ -int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, - u16 vid, u16 nlm_flags) -{ - struct switchdev_obj_port_fdb fdb = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, - .vid = vid, - }; - - ether_addr_copy(fdb.addr, addr); - return switchdev_port_obj_add(dev, &fdb.obj); -} -EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); - -/** - * switchdev_port_fdb_del - Delete FDB (MAC/VLAN) entry from port - * - * @ndmsg: netlink hdr - * @nlattr: netlink attributes - * @dev: port device - * @addr: MAC address to delete - * @vid: VLAN to delete - * - * Delete FDB entry from switch device. 
- */ -int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, - u16 vid) -{ - struct switchdev_obj_port_fdb fdb = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_FDB, - .vid = vid, - }; - - ether_addr_copy(fdb.addr, addr); - return switchdev_port_obj_del(dev, &fdb.obj); -} -EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); - bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { -- cgit v1.2.3 From 199b5763d329b43c88f6ad539db8a6c6b42f8edb Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Tue, 8 Aug 2017 12:05:32 +0800 Subject: KVM: add spinlock optimization framework If a vcpu exits because it requested a user-mode spinlock, then the spinlock-holder may be preempted in user mode or kernel mode. (Note that not all architectures trap spin loops in user mode; only AMD x86 and ARM/ARM64 currently do.) But if a vcpu exits in kernel mode, then the holder must be preempted in kernel mode, so we should choose a vcpu in kernel mode as a more likely candidate for the lock holder. This introduces kvm_arch_vcpu_in_kernel() to decide whether the vcpu is in kernel mode when it is preempted. kvm_vcpu_on_spin's new argument says the same of the spinning VCPU. Signed-off-by: Longpeng(Mike) Signed-off-by: Paolo Bonzini --- arch/arm/kvm/handle_exit.c | 2 +- arch/arm64/kvm/handle_exit.c | 2 +- arch/mips/kvm/mips.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 5 +++++ arch/s390/kvm/diag.c | 2 +- arch/s390/kvm/kvm-s390.c | 5 +++++ arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm_host.h | 3 ++- virt/kvm/arm/arm.c | 5 +++++ virt/kvm/kvm_main.c | 4 +++- 13 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 54442e375354..196122bb6968 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -67,7 +67,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { trace_kvm_wfx(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; - kvm_vcpu_on_spin(vcpu); + kvm_vcpu_on_spin(vcpu, false); } else { trace_kvm_wfx(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 17d8a1677a0b..da57622cacca 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -84,7 +84,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); vcpu->stat.wfe_exit_stat++; - kvm_vcpu_on_spin(vcpu); + kvm_vcpu_on_spin(vcpu, false); } else { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), false); vcpu->stat.wfi_exit_stat++; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index d4b2ad18eef2..bce2a6431430 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -98,6 +98,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return !!(vcpu->arch.pending_exceptions); } +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return 1; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1a75c0b5f4ca..3480faaf1ef8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -58,6 +58,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) return !!(v->arch.pending_exceptions) || kvm_request_pending(v); } +bool
kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return 1; diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index ce865bd4f81d..6182edebea3d 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -150,7 +150,7 @@ static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); vcpu->stat.diagnose_44++; - kvm_vcpu_on_spin(vcpu); + kvm_vcpu_on_spin(vcpu, false); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index af09d3437631..5e07322034b9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2447,6 +2447,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_s390_vcpu_has_irq(vcpu, 0); } +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu) { atomic_or(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bf9992300efa..5243d54f73ab 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1274,7 +1274,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); + kvm_vcpu_on_spin(vcpu, false); break; case HVCALL_POST_MESSAGE: case HVCALL_SIGNAL_EVENT: diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2432bb952a30..0cc486fd9871 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3749,7 +3749,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) static int pause_interception(struct vcpu_svm *svm) { - kvm_vcpu_on_spin(&(svm->vcpu)); + kvm_vcpu_on_spin(&svm->vcpu, false); return 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2c0f5287fb78..fef784c22190 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6781,7 +6781,7 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - kvm_vcpu_on_spin(vcpu); + kvm_vcpu_on_spin(vcpu, false); return kvm_skip_emulated_instruction(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33fd6b6419ef..6125e1743b69 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8432,6 +8432,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 28112d7917c1..6882538eda32 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -720,7 +720,7 @@ void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); -void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); @@ -800,6 +800,7 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); #ifndef __KVM_HAVE_ARCH_VM_ALLOC diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 
a39a1e161e63..862f820d06d4 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -416,6 +416,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) && !v->arch.power_off && !v->arch.pause); } +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return false; +} + /* Just ensure a guest exit from a particular CPU */ static void exit_vm_noop(void *info) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 15252d723b54..e17c40d986f3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2317,7 +2317,7 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #endif } -void kvm_vcpu_on_spin(struct kvm_vcpu *me) +void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; @@ -2348,6 +2348,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; + if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) + continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; -- cgit v1.2.3 From 4c7089ee66026f38275d43e26d9da6e2945af6f9 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 21 Jul 2017 06:48:33 -0400 Subject: media: media-device: set driver_version directly Don't use driver_version from struct media_device, just return LINUX_VERSION_CODE as the other media subsystems do. The driver_version field in struct media_device will be removed in the following patches. Signed-off-by: Hans Verkuil Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/media/media-device.c | 2 +- include/media/media-device.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c index fce91b543c14..7ff8e2d5bb07 100644 --- a/drivers/media/media-device.c +++ b/drivers/media/media-device.c @@ -71,7 +71,7 @@ static int media_device_get_info(struct media_device *dev, info->media_version = MEDIA_API_VERSION; info->hw_revision = dev->hw_revision; - info->driver_version = dev->driver_version; + info->driver_version = LINUX_VERSION_CODE; return 0; } diff --git a/include/media/media-device.h b/include/media/media-device.h index 6896266031b9..7ae200d89a9f 100644 --- a/include/media/media-device.h +++ b/include/media/media-device.h @@ -249,11 +249,6 @@ void media_device_cleanup(struct media_device *mdev); * driver-specific format. When possible the revision should be formatted * with the KERNEL_VERSION() macro. * - * - &media_entity.driver_version is formatted with the KERNEL_VERSION() - * macro. The version minor must be incremented when new features are added - * to the userspace API without breaking binary compatibility. The version - * major must be incremented when binary compatibility is broken. - * * .. note:: * * #) Upon successful registration a character device named media[0-9]+ is created. The device major and minor numbers are dynamic. The model name is exported as a sysfs attribute. -- cgit v1.2.3 From 2bd8682375f3a79e463d8840794d2872b02d755b Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 21 Jul 2017 04:49:44 -0400 Subject: media: media-device: remove driver_version Since the driver_version field in struct media_device is no longer used, just remove it. 
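Note that only the in-kernel field goes away; userspace still receives a driver_version through MEDIA_IOC_DEVICE_INFO, it just tracks the kernel version now. A minimal userspace sketch of reading it (an illustration only; the /dev/media0 node is an assumption):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/media.h>

	int main(void)
	{
		struct media_device_info info = { 0 };
		int fd = open("/dev/media0", O_RDONLY);	/* assumed node */

		if (fd < 0 || ioctl(fd, MEDIA_IOC_DEVICE_INFO, &info))
			return 1;

		/* driver_version is still reported; it is now derived
		 * from the kernel version, not a per-driver field.
		 */
		printf("%s: driver_version=0x%x media_version=0x%x\n",
		       info.driver, info.driver_version, info.media_version);
		return 0;
	}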
Signed-off-by: Hans Verkuil Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/media/media-device.c | 3 --- include/media/media-device.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c index 7ff8e2d5bb07..979e4307d248 100644 --- a/drivers/media/media-device.c +++ b/drivers/media/media-device.c @@ -833,8 +833,6 @@ void media_device_pci_init(struct media_device *mdev, mdev->hw_revision = (pci_dev->subsystem_vendor << 16) | pci_dev->subsystem_device; - mdev->driver_version = LINUX_VERSION_CODE; - media_device_init(mdev); } EXPORT_SYMBOL_GPL(media_device_pci_init); @@ -862,7 +860,6 @@ void __media_device_usb_init(struct media_device *mdev, strlcpy(mdev->serial, udev->serial, sizeof(mdev->serial)); usb_make_path(udev, mdev->bus_info, sizeof(mdev->bus_info)); mdev->hw_revision = le16_to_cpu(udev->descriptor.bcdDevice); - mdev->driver_version = LINUX_VERSION_CODE; media_device_init(mdev); } diff --git a/include/media/media-device.h b/include/media/media-device.h index 7ae200d89a9f..bcc6ec434f1f 100644 --- a/include/media/media-device.h +++ b/include/media/media-device.h @@ -68,7 +68,6 @@ struct media_device_ops { * @serial: Device serial number (optional) * @bus_info: Unique and stable device location identifier * @hw_revision: Hardware device revision - * @driver_version: Device driver version * @topology_version: Monotonic counter for storing the version of the graph * topology. Should be incremented each time the topology changes. * @id: Unique ID used on the last registered graph object @@ -134,7 +133,6 @@ struct media_device { char serial[40]; char bus_info[32]; u32 hw_revision; - u32 driver_version; u64 topology_version; -- cgit v1.2.3 From 6c2c188f35c61c8eee71ec6d07524ce122c06539 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 28 Jul 2017 03:25:06 -0400 Subject: media: drop use of MEDIA_API_VERSION Set media_version to LINUX_VERSION_CODE, just as we did for driver_version. Nobody ever remembers to update the version number, but LINUX_VERSION_CODE will always be updated. Move the MEDIA_API_VERSION define to the ifndef __KERNEL__ section of the media.h header. That way kernelspace can't accidentally start to use it again.
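With this change both version fields returned by MEDIA_IOC_DEVICE_INFO carry a LINUX_VERSION_CODE value, so userspace can unpack them with the usual KERNEL_VERSION() layout. Continuing the sketch from the previous note:

	/* KERNEL_VERSION(a, b, c) packs as (a << 16) + (b << 8) + c */
	unsigned int v = info.media_version;

	printf("media API follows kernel %u.%u.%u\n",
	       v >> 16, (v >> 8) & 0xff, v & 0xff);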
Signed-off-by: Hans Verkuil Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/media/media-device.c | 4 ++-- include/uapi/linux/media.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c index 979e4307d248..e79f72b8b858 100644 --- a/drivers/media/media-device.c +++ b/drivers/media/media-device.c @@ -69,9 +69,9 @@ static int media_device_get_info(struct media_device *dev, strlcpy(info->serial, dev->serial, sizeof(info->serial)); strlcpy(info->bus_info, dev->bus_info, sizeof(info->bus_info)); - info->media_version = MEDIA_API_VERSION; + info->media_version = LINUX_VERSION_CODE; + info->driver_version = info->media_version; info->hw_revision = dev->hw_revision; - info->driver_version = LINUX_VERSION_CODE; return 0; } diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index fac96c64fe51..4865f1e71339 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -30,8 +30,6 @@ #include #include -#define MEDIA_API_VERSION KERNEL_VERSION(0, 1, 0) - struct media_device_info { char driver[16]; char model[32]; @@ -187,6 +185,9 @@ struct media_device_info { #define MEDIA_ENT_T_V4L2_SUBDEV_LENS MEDIA_ENT_F_LENS #define MEDIA_ENT_T_V4L2_SUBDEV_DECODER MEDIA_ENT_F_ATV_DECODER #define MEDIA_ENT_T_V4L2_SUBDEV_TUNER MEDIA_ENT_F_TUNER + +/* Obsolete symbol for media_version, no longer used in the kernel */ +#define MEDIA_API_VERSION KERNEL_VERSION(0, 1, 0) #endif /* Entity flags */ -- cgit v1.2.3 From 0ec9eb90feec4933637fbde9d5bfbc3b62aea218 Mon Sep 17 00:00:00 2001 From: Chi-Hsien Lin Date: Thu, 3 Aug 2017 17:37:58 +0800 Subject: brcmfmac: Add support for CYW4373 SDIO/USB chipset Add support for CYW4373 SDIO/USB chipset. CYW4373 is a 1x1 dual-band 11ac chipset with 20/40/80 MHz channel support. It's a WiFi/BT combo device.
Signed-off-by: Chi-Hsien Lin Reviewed-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 1 + drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c | 2 ++ drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 4 +++- drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c | 9 ++++++++- drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h | 3 +++ include/linux/mmc/sdio_ids.h | 1 + 6 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 984c1d0560b1..cd587325e286 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -1105,6 +1105,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43455), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_CYPRESS_4373), { /* end: all zeroes */ } }; MODULE_DEVICE_TABLE(sdio, brcmf_sdmmc_ids); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c index 05f22ff81d60..c5d1a1cbf601 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c @@ -690,6 +690,8 @@ static u32 brcmf_chip_tcm_rambase(struct brcmf_chip_priv *ci) case BRCM_CC_4365_CHIP_ID: case BRCM_CC_4366_CHIP_ID: return 0x200000; + case CY_CC_4373_CHIP_ID: + return 0x160000; default: brcmf_err("unknown chip: %s\n", ci->pub.name); break; diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index f3556122c6ac..613caca7dc02 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -618,6 +618,7 @@ BRCMF_FW_NVRAM_DEF(43430A1, "brcmfmac43430-sdio.bin", "brcmfmac43430-sdio.txt"); BRCMF_FW_NVRAM_DEF(43455, "brcmfmac43455-sdio.bin", "brcmfmac43455-sdio.txt"); BRCMF_FW_NVRAM_DEF(4354, "brcmfmac4354-sdio.bin", "brcmfmac4354-sdio.txt"); BRCMF_FW_NVRAM_DEF(4356, "brcmfmac4356-sdio.bin", "brcmfmac4356-sdio.txt"); +BRCMF_FW_NVRAM_DEF(4373, "brcmfmac4373-sdio.bin", "brcmfmac4373-sdio.txt"); static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = { BRCMF_FW_NVRAM_ENTRY(BRCM_CC_43143_CHIP_ID, 0xFFFFFFFF, 43143), @@ -636,7 +637,8 @@ static struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = { BRCMF_FW_NVRAM_ENTRY(BRCM_CC_43430_CHIP_ID, 0xFFFFFFFE, 43430A1), BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4345_CHIP_ID, 0xFFFFFFC0, 43455), BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4354_CHIP_ID, 0xFFFFFFFF, 4354), - BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356) + BRCMF_FW_NVRAM_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356), + BRCMF_FW_NVRAM_ENTRY(CY_CC_4373_CHIP_ID, 0xFFFFFFFF, 4373) }; static void pkt_align(struct sk_buff *p, int len, int align) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c index 0eea48e73331..8f20a4bb40d9 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c @@ -50,6 +50,7 @@ BRCMF_FW_DEF(43143, "brcmfmac43143.bin"); BRCMF_FW_DEF(43236B, "brcmfmac43236b.bin"); BRCMF_FW_DEF(43242A, "brcmfmac43242a.bin"); BRCMF_FW_DEF(43569, 
"brcmfmac43569.bin"); +BRCMF_FW_DEF(4373, "brcmfmac4373.bin"); static struct brcmf_firmware_mapping brcmf_usb_fwnames[] = { BRCMF_FW_ENTRY(BRCM_CC_43143_CHIP_ID, 0xFFFFFFFF, 43143), @@ -58,7 +59,8 @@ static struct brcmf_firmware_mapping brcmf_usb_fwnames[] = { BRCMF_FW_ENTRY(BRCM_CC_43238_CHIP_ID, 0x00000008, 43236B), BRCMF_FW_ENTRY(BRCM_CC_43242_CHIP_ID, 0xFFFFFFFF, 43242A), BRCMF_FW_ENTRY(BRCM_CC_43566_CHIP_ID, 0xFFFFFFFF, 43569), - BRCMF_FW_ENTRY(BRCM_CC_43569_CHIP_ID, 0xFFFFFFFF, 43569) + BRCMF_FW_ENTRY(BRCM_CC_43569_CHIP_ID, 0xFFFFFFFF, 43569), + BRCMF_FW_ENTRY(CY_CC_4373_CHIP_ID, 0xFFFFFFFF, 4373) }; #define TRX_MAGIC 0x30524448 /* "HDR0" */ @@ -1463,15 +1465,20 @@ static int brcmf_usb_reset_resume(struct usb_interface *intf) #define LINKSYS_USB_DEVICE(dev_id) \ { USB_DEVICE(BRCM_USB_VENDOR_ID_LINKSYS, dev_id) } +#define CYPRESS_USB_DEVICE(dev_id) \ + { USB_DEVICE(CY_USB_VENDOR_ID_CYPRESS, dev_id) } + static struct usb_device_id brcmf_usb_devid_table[] = { BRCMF_USB_DEVICE(BRCM_USB_43143_DEVICE_ID), BRCMF_USB_DEVICE(BRCM_USB_43236_DEVICE_ID), BRCMF_USB_DEVICE(BRCM_USB_43242_DEVICE_ID), BRCMF_USB_DEVICE(BRCM_USB_43569_DEVICE_ID), LINKSYS_USB_DEVICE(BRCM_USB_43235_LINKSYS_DEVICE_ID), + CYPRESS_USB_DEVICE(CY_USB_4373_DEVICE_ID), { USB_DEVICE(BRCM_USB_VENDOR_ID_LG, BRCM_USB_43242_LG_DEVICE_ID) }, /* special entry for device with firmware loaded and running */ BRCMF_USB_DEVICE(BRCM_USB_BCMFW_DEVICE_ID), + CYPRESS_USB_DEVICE(BRCM_USB_BCMFW_DEVICE_ID), { /* end: all zeroes */ } }; diff --git a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h index f1fb8a3c7a32..57544a3a3ce4 100644 --- a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h +++ b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h @@ -23,6 +23,7 @@ #define BRCM_USB_VENDOR_ID_BROADCOM 0x0a5c #define BRCM_USB_VENDOR_ID_LG 0x043e #define BRCM_USB_VENDOR_ID_LINKSYS 0x13b1 +#define CY_USB_VENDOR_ID_CYPRESS 0x04b4 #define BRCM_PCIE_VENDOR_ID_BROADCOM PCI_VENDOR_ID_BROADCOM /* Chipcommon Core Chip IDs */ @@ -57,6 +58,7 @@ #define BRCM_CC_4365_CHIP_ID 0x4365 #define BRCM_CC_4366_CHIP_ID 0x4366 #define BRCM_CC_4371_CHIP_ID 0x4371 +#define CY_CC_4373_CHIP_ID 0x4373 /* USB Device IDs */ #define BRCM_USB_43143_DEVICE_ID 0xbd1e @@ -66,6 +68,7 @@ #define BRCM_USB_43242_LG_DEVICE_ID 0x3101 #define BRCM_USB_43569_DEVICE_ID 0xbd27 #define BRCM_USB_BCMFW_DEVICE_ID 0x0bdc +#define CY_USB_4373_DEVICE_ID 0xbd29 /* PCIE Device IDs */ #define BRCM_PCIE_4350_DEVICE_ID 0x43a3 diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index b733eb404ffc..abacd5484bc0 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -39,6 +39,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 #define SDIO_DEVICE_ID_BROADCOM_4356 0x4356 +#define SDIO_DEVICE_ID_CYPRESS_4373 0x4373 #define SDIO_VENDOR_ID_INTEL 0x0089 #define SDIO_DEVICE_ID_INTEL_IWMC3200WIMAX 0x1402 -- cgit v1.2.3 From 41c8f70f5a3db7e06179186b6525fd9ee1d7d314 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Aug 2017 14:30:11 -0400 Subject: xprtrdma: Harden backchannel call decoding Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 13 +++++++++ net/sunrpc/xprtrdma/backchannel.c | 40 ++++++--------------------- net/sunrpc/xprtrdma/rpc_rdma.c | 58 +++++++++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 52 deletions(-) (limited to 'include') diff --git 
a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 261b48a2701d..86b59e3525a5 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -239,6 +239,19 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +/** + * xdr_stream_remaining - Return the number of bytes remaining in the stream + * @xdr: pointer to struct xdr_stream + * + * Return value: + * Number of bytes remaining in @xdr before xdr->end + */ +static inline size_t +xdr_stream_remaining(const struct xdr_stream *xdr) +{ + return xdr->nwords << 2; +} + ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags); /** diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 03f6b5840764..183a103e08a8 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -271,9 +271,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) * @xprt: transport receiving the call * @rep: receive buffer containing the call * - * Called in the RPC reply handler, which runs in a tasklet. - * Be quick about it. - * * Operational assumptions: * o Backchannel credits are ignored, just as the NFS server * forechannel currently does @@ -284,7 +281,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; struct svc_serv *bc_serv; struct rpcrdma_req *req; struct rpc_rqst *rqst; @@ -292,24 +288,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, size_t size; __be32 *p; - headerp = rdmab_to_msg(rep->rr_rdmabuf); + p = xdr_inline_decode(&rep->rr_stream, 0); + size = xdr_stream_remaining(&rep->rr_stream); + #ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: callback XID %08x, length=%u\n", - __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); - pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); + __func__, be32_to_cpup(p), size); + pr_info("RPC: %s: %*ph\n", __func__, size, p); #endif - /* Sanity check: - * Need at least enough bytes for RPC/RDMA header, as code - * here references the header fields by array offset. Also, - * backward calls are always inline, so ensure there - * are some bytes beyond the RPC/RDMA header. - */ - if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) - goto out_short; - p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - size = rep->rr_len - RPCRDMA_HDRLEN_MIN; - /* Grab a free bc rqst */ spin_lock(&xprt->bc_pa_lock); if (list_empty(&xprt->bc_pa_list)) { @@ -325,7 +312,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; - rqst->rq_xid = headerp->rm_xid; + rqst->rq_xid = *p; rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -337,9 +324,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, buf->len = size; /* The receive buffer has to be hooked to the rpcrdma_req - * so that it can be reposted after the server is done - * parsing it but just before sending the backward - * direction reply. + * so that it is not released while the req is pointing + * to its buffer, and so that it can be reposted after + * the Upper Layer is done decoding it. 
*/ req = rpcr_to_rdmar(rqst); dprintk("RPC: %s: attaching rep %p to req %p\n", @@ -367,13 +354,4 @@ out_overflow: * when the connection is re-established. */ return; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) - xprt_disconnect_done(xprt); - else - pr_warn("RPC: %s: reposting rep %p\n", - __func__, rep); } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 24f58c7b3106..9b5ab598ab7b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -949,35 +949,59 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, } } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ static bool -rpcrdma_is_bcall(struct rpcrdma_msg *headerp) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + __be32 xid, __be32 proc) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) { - __be32 *p = (__be32 *)headerp; + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; - if (headerp->rm_type != rdma_msg) + if (proc != rdma_msg) return false; - if (headerp->rm_body.rm_chunks[0] != xdr_zero) + + /* Peek at stream contents without advancing. */ + p = xdr_inline_decode(xdr, 0); + + /* Chunk lists */ + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != headerp->rm_xid) + /* RPC header */ + if (*p++ != xid) return false; - /* call direction */ - if (p[8] != cpu_to_be32(RPC_CALL)) + if (*p != cpu_to_be32(RPC_CALL)) return false; + /* Now that we are sure this is a backchannel call, + * advance to the RPC header. + */ + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + + rpcrdma_bc_receive_call(r_xprt, rep); + return true; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) + xprt_disconnect_done(&r_xprt->rx_xprt); return true; } +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +{ + return false; +} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /* Process received RPC/RDMA messages. @@ -1020,10 +1044,8 @@ rpcrdma_reply_handler(struct work_struct *work) proc = *p++; headerp = rdmab_to_msg(rep->rr_rdmabuf); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (rpcrdma_is_bcall(headerp)) - goto out_bcall; -#endif + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. @@ -1159,12 +1181,6 @@ out_badstatus: } return; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -out_bcall: - rpcrdma_bc_receive_call(r_xprt, rep); - return; -#endif - /* If the incoming reply terminated a pending RPC, the next * RPC call will post a replacement receive buffer as it is * being marshaled. -- cgit v1.2.3 From d6344d4b5628052ccda104466e5e05b9c43d7b61 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 4 Aug 2017 14:57:39 +0200 Subject: cpufreq: Simplify cpufreq_can_do_remote_dvfs() The if () in cpufreq_can_do_remote_dvfs() is superfluous, so drop it and simply return the value of the expression under it. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpufreq.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1981a4a167ce..d69464139f67 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -578,11 +578,8 @@ static inline bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy) * - dvfs_possible_from_any_cpu flag is set * - the local and remote CPUs share cpufreq policy */ - if (policy->dvfs_possible_from_any_cpu || - cpumask_test_cpu(smp_processor_id(), policy->cpus)) - return true; - - return false; + return policy->dvfs_possible_from_any_cpu || + cpumask_test_cpu(smp_processor_id(), policy->cpus); } /********************************************************************* -- cgit v1.2.3 From ec52e462564b9c5bfdf1f79638c537c7103e1d2b Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Tue, 1 Aug 2017 18:21:22 +0200 Subject: clk: fractional-divider: allow overriding of approximation Fractional dividers may have special requirements concerning numerator and denominator selection that differ from just getting the best approximation. For example, on Rockchip SoCs the denominator must be at least 20 times larger than the numerator to generate precise clock frequencies. Therefore add the ability to provide custom approximation functions. Signed-off-by: Elaine Zhang Acked-by: Stephen Boyd Signed-off-by: Heiko Stuebner --- drivers/clk/clk-fractional-divider.c | 28 ++++++++++++++++++++-------- include/linux/clk-provider.h | 3 +++ 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/clk/clk-fractional-divider.c b/drivers/clk/clk-fractional-divider.c index aab904618eb6..fdf625fb10fa 100644 --- a/drivers/clk/clk-fractional-divider.c +++ b/drivers/clk/clk-fractional-divider.c @@ -49,16 +49,12 @@ static unsigned long clk_fd_recalc_rate(struct clk_hw *hw, return ret; } -static long clk_fd_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *parent_rate) +static void clk_fd_general_approximation(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate, + unsigned long *m, unsigned long *n) { struct clk_fractional_divider *fd = to_clk_fd(hw); unsigned long scale; - unsigned long m, n; - u64 ret; - - if (!rate || rate >= *parent_rate) - return *parent_rate; /* * Get rate closer to *parent_rate to guarantee there is no overflow @@ -71,7 +67,23 @@ static long clk_fd_round_rate(struct clk_hw *hw, unsigned long rate, rational_best_approximation(rate, *parent_rate, GENMASK(fd->mwidth - 1, 0), GENMASK(fd->nwidth - 1, 0), - &m, &n); + m, n); +} + +static long clk_fd_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate) +{ + struct clk_fractional_divider *fd = to_clk_fd(hw); + unsigned long m, n; + u64 ret; + + if (!rate || rate >= *parent_rate) + return *parent_rate; + + if (fd->approximation) + fd->approximation(hw, rate, parent_rate, &m, &n); + else + clk_fd_general_approximation(hw, rate, parent_rate, &m, &n); ret = (u64)*parent_rate * m; do_div(ret, n); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c59c62571e4f..1fc113fbf955 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -565,6 +565,9 @@ struct clk_fractional_divider { u8 nwidth; u32 nmask; u8 flags; + void (*approximation)(struct clk_hw *hw, + unsigned long rate, unsigned long *parent_rate, + unsigned long *m, unsigned long *n); spinlock_t *lock; }; -- cgit v1.2.3 From d541e45500bd269060c26387902e1bec9783c07c
Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:43 -0400 Subject: IB/core: Convert ah_attr from OPA to IB when copying to user OPA address handle attributes that have 32-bit LIDs have to be converted to IB address handle attributes, with the LID field programmed into the GID, before copying to user space. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/ucma.c | 10 ++++--- drivers/infiniband/core/uverbs_marshall.c | 48 +++++++++++++++++++++++++++---- include/rdma/ib_marshall.h | 6 ++-- 4 files changed, 54 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 112099c86a19..f2a7f62c2834 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -618,7 +618,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 276f0ef835bd..eb85b546e223 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -248,14 +248,15 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, dst->qp_num = src->qp_num; } -static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } @@ -335,7 +336,8 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); @@ -1157,7 +1159,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, if (ret) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 94fd989c9060..bd0acf376af0 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -33,10 +33,47 @@ #include #include -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct rdma_ah_attr *src) +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) { + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to
indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + dst->dlid = rdma_ah_get_dlid(src); dst->sl = rdma_ah_get_sl(src); dst->src_path_bits = rdma_ah_get_path_bits(src); @@ -57,7 +94,8 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; @@ -76,8 +114,8 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); - ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 68cef3bd50fb..8ebf84ae9ed1 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -38,10 +38,12 @@ #include #include -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src); -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, struct rdma_ah_attr *src); void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, -- cgit v1.2.3 From 582faf3150f57b8364ac9d2aa731d7368ada7a4b Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:47 -0400 Subject: IB/core: Change port_attr.lid size from 16 to 32 bits The lid field in struct ib_port_attr is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and returns 16-bit values when queried.
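As a sketch of what the ABI boundary does with the wider field (the helper name here is hypothetical; the patch itself uses the OPA_TO_IB_UCAST_LID() macro from opa_addr.h):

        /* Report a 32-bit core LID through the unchanged 16-bit user ABI.
         * On OPA ports, an extended LID (at or above the IB multicast
         * base) cannot be expressed in 16 bits, so 0 is returned instead.
         */
        static u16 lid_for_user_abi(u32 lid, bool opa_port)
        {
                if (opa_port && lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
                        return 0;
                return (u16)lid;
        }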
Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 5 ++++- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- include/rdma/ib_verbs.h | 2 +- include/rdma/opa_addr.h | 3 ++- 7 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..6b54280530c9 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,7 @@ #include #include +#include #include #include "mad_priv.h" diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533a0203..eef2623406cc 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,8 +275,11 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.lid = attr.lid; resp.sm_lid = attr.sm_lid; + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + else + resp.lid = (u16)attr.lid; resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index ea24230ea0d4..5a897b0106a9 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -528,7 +528,7 @@ static int set_guid_rec(struct ib_device *ibdev, memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); - guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.lid = cpu_to_be16((u16)attr.lid); guid_info_rec.block_num = index; memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 21d31cb1325f..00f057033cb9 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -860,7 +860,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = (u16)pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? 
MLX4_MAD_IFC_IGNORE_MKEY : 0) | diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 7df3db71777a..617531f1bfc6 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -256,7 +256,7 @@ int mthca_process_mad(struct ib_device *ibdev, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = (u16)pattr.lid; err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..4fa94e69b1fc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -549,8 +549,8 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 lid; u16 sm_lid; + u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index eace28f1555d..46d0567fffea 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -50,7 +50,8 @@ #define OPA_SPECIAL_OUI (0x00066AULL) #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) - +#define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ ? 0 : x) /** * ib_is_opa_gid: Returns true if the top 24 bits of the gid * contains the OPA_STL_OUI identifier. This identifies that -- cgit v1.2.3 From db58540b021a17e0ede64f761b740556d77f1679 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:48 -0400 Subject: IB/core: Change port_attr.sm_lid from 16 to 32 bits The sm_lid field in struct ib_port_attr is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and returns 16-bit values when queried. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 8 +++++--- include/rdma/ib_verbs.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index eef2623406cc..01e2ff023980 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,11 +275,13 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.sm_lid = attr.sm_lid; - if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); - else + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { resp.lid = (u16)attr.lid; + resp.sm_lid = (u16)attr.sm_lid; + } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4fa94e69b1fc..620535908118 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -549,7 +549,7 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 sm_lid; + u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; -- cgit v1.2.3 From 7db20ecd1d9700e2c240dee505162eb56ab55b5b Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Thu, 8 Jun 2017 13:37:49 -0400 Subject: IB/core: Change wc.slid from 16 to 32 bits The slid field in struct ib_wc is increased to 32 bits. This enables core components to use larger LIDs if needed.
The user ABI is unchanged and returns 16-bit values when queried. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 ++-- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 10 +++++++--- drivers/infiniband/hw/hfi1/mad.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 6 +++--- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_cmd.c | 4 ++-- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 2 +- include/rdma/ib_verbs.h | 14 +++++++++++++- 10 files changed, 32 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2b4d613a3474..b39ee16aa479 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1703,7 +1703,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = cpu_to_be16(wc->slid); + req_msg->primary_local_lid = ib_slid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1713,7 +1713,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = cpu_to_be16(wc->slid); + req_msg->alt_local_lid = ib_slid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 36a6f5c8914c..ff3c67a7aaad 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_slid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 01e2ff023980..eb0da3784bf4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1190,7 +1190,8 @@ out: return ret ?
ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) { struct ib_uverbs_wc tmp; @@ -1204,7 +1205,10 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_slid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; @@ -1248,7 +1252,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(data_ptr, &wc); + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); if (ret) goto out_put; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5977673a52d4..00ebc26cd187 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -3958,7 +3958,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, const struct ib_wc *in_wc) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid = in_wc->slid; + u16 slid = ib_slid_cpu16(in_wc->slid); u16 pkey; if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 00f057033cb9..04fb44e7699e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -169,7 +169,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, @@ -625,7 +625,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + tun_mad->hdr.slid_mac_47_32 = ib_slid_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, @@ -826,7 +826,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 95db929bdc34..cd2264ac88ae 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -78,7 +78,7 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 slid; int err; - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 9d83a53c0c67..e19ae0b9b439 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1921,7 +1921,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); - MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, ib_slid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) @@ -1929,7 +1929,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 617531f1bfc6..a9caadab22cf 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -205,7 +205,7 @@ int mthca_process_mad(struct ib_device *ibdev, u16 *out_mad_pkey_index) { int err; - u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + u16 slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); u16 prev_lid = 0; struct ib_port_attr pattr; const struct ib_mad *in_mad = (const struct ib_mad *)in; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0ae2ff8cf81e..0335a3df74d5 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -107,7 +107,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].slid = ib_slid_cpu16(entry->slid); wc->uqueue[head].sl = entry->sl; wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; wc->uqueue[head].port_num = entry->port_num; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 620535908118..7eaf7d2ab424 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -948,7 +948,7 @@ struct ib_wc { u32 src_qp; int wc_flags; u16 pkey_index; - u16 slid; + u32 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ @@ -3706,4 +3706,16 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/* Return slid in 16bit CPU encoding */ +static inline u16 ib_slid_cpu16(u32 slid) +{ + return (u16)slid; +} + +/* Return slid in 16bit BE encoding */ +static inline u16 ib_slid_be16(u32 slid) +{ + return cpu_to_be16((u16)slid); +} #endif /* IB_VERBS_H */ -- cgit v1.2.3 From e92aa00a518971fca6b79aa87a1a9c5e5aa51f3b Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Thu, 8 Jun 2017 13:38:02 -0400 Subject: IB/CM: Add OPA Path record support to CM Add OPA path record support to the Connection Manager. 
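Extended (32-bit) OPA LIDs do not fit the 16-bit LID fields of the CM wire messages, so they are carried in the GID's interface_id instead. A minimal sketch of the encoding this patch applies to the primary path (the alternate path is handled the same way):

        if (opa_is_extended_lid(pri_path->opa.dlid, pri_path->opa.slid)) {
                /* The 32-bit LID rides in the GID interface_id ... */
                req_msg->primary_local_gid.global.interface_id =
                        OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid));
                /* ... and the 16-bit wire LID field is zeroed. */
                req_msg->primary_local_lid = 0;
        }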
Signed-off-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 50 +++++++++++++++++++++++++++++++++++++------- include/rdma/opa_addr.h | 18 ++++++++++++++++ 2 files changed, 60 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index b39ee16aa479..885c429b4942 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1175,6 +1175,11 @@ static void cm_format_req(struct cm_req_msg *req_msg, { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,18 +1207,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_srq(req_msg, param->srq); } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = + req_msg->primary_local_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = + req_msg->primary_remote_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; @@ -1225,17 +1236,29 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->packet_life_time)); if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = + req_msg->alt_local_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = + req_msg->alt_remote_lid = alt_ext ? 
0 : htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); @@ -2843,6 +2866,11 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, const void *private_data, u8 private_data_len) { + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; @@ -2856,6 +2884,12 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 46d0567fffea..9b5e642cf550 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -77,4 +77,22 @@ static inline u32 opa_get_lid_from_gid(union ib_gid *gid) { return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; } + +/** + * opa_is_extended_lid: Returns true if dlid or slid are + * extended. + * + * @dlid: The DLID + * @slid: The SLID + */ +static inline bool opa_is_extended_lid(u32 dlid, u32 slid) +{ + if ((be32_to_cpu(dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) + return true; + else + return false; +} #endif /* OPA_ADDR_H */ -- cgit v1.2.3 From 78249c4215840edb95447ec6867b69a7ac1d7a0d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:38 +0300 Subject: mlx5: convert to generic pci_alloc_irq_vectors Now that we have generic code that allocates an array of irq vectors, correctly spreads their affinity, correctly handles cpu hotplug events, and more, we're much better off using it.
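For reference, a minimal sketch of the generic allocation pattern this patch converts mlx5 to; the function name, vector counts, and handler are illustrative, not part of the patch:

        #include <linux/interrupt.h>
        #include <linux/pci.h>

        static int example_setup_msix(struct pci_dev *pdev,
                                      irq_handler_t handler, void *ctx)
        {
                int nvec, i, err;

                /* The PCI core sizes, allocates and enables the vectors. */
                nvec = pci_alloc_irq_vectors(pdev, 2, 8, PCI_IRQ_MSIX);
                if (nvec < 0)
                        return nvec;

                for (i = 0; i < nvec; i++) {
                        /* pci_irq_vector() maps a vector index to its Linux
                         * IRQ number, replacing the driver-private msix_arr.
                         */
                        err = request_irq(pci_irq_vector(pdev, i), handler,
                                          0, "example", ctx);
                        if (err)
                                goto err_free;
                }
                return nvec;

        err_free:
                while (--i >= 0)
                        free_irq(pci_irq_vector(pdev, i), ctx);
                pci_free_irq_vectors(pdev);
                return err;
        }

Teardown is symmetric: free_irq() each vector, then pci_free_irq_vectors().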
Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 9 ++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 39 +++++++++------------- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 - include/linux/mlx5/driver.h | 1 - 7 files changed, 20 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1eac5003084f..d0e572df3a1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -397,7 +397,7 @@ static void mlx5e_enable_async_events(struct mlx5e_priv *priv) static void mlx5e_disable_async_events(struct mlx5e_priv *priv) { clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state); - synchronize_irq(mlx5_get_msix_vec(priv->mdev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC)); } static inline int mlx5e_get_wqe_mtt_sz(void) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d2b912..8a09d7197d70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -585,7 +585,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, name, pci_name(dev->pdev)); eq->eqn = MLX5_GET(create_eq_out, out, eq_number); - eq->irqn = priv->msix_arr[vecidx].vector; + eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; err = request_irq(eq->irqn, handler, 0, @@ -620,7 +620,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, return 0; err_irq: - free_irq(priv->msix_arr[vecidx].vector, eq); + free_irq(eq->irqn, eq); err_eq: mlx5_cmd_destroy_eq(dev, eq->eqn); @@ -661,11 +661,6 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) } EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx) -{ - return dev->priv.msix_arr[MLX5_EQ_VEC_ASYNC].vector; -} - int mlx5_eq_init(struct mlx5_core_dev *dev) { int err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 89bfda419efe..1ce2543e3889 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1585,7 +1585,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) /* Mark this vport as disabled to discard new events */ vport->enabled = false; - synchronize_irq(mlx5_get_msix_vec(esw->dev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(esw->dev->pdev, MLX5_EQ_VEC_ASYNC)); /* Wait for current already scheduled events to complete */ flush_workqueue(esw->work_queue); /* Disable events from this vport */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 4b6b03d6297f..8aea0a065e56 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -81,7 +81,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) u64 vector; /* wait for pending handlers to complete */ - 
synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); + synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); spin_lock_irqsave(&dev->cmd.alloc_lock, flags); vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); if (!vector) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c065132b956d..d2fd55e5c68b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -312,13 +312,12 @@ static void release_bar(struct pci_dev *pdev) pci_release_regions(pdev); } -static int mlx5_enable_msix(struct mlx5_core_dev *dev) +static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; - int i; nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; @@ -326,17 +325,13 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) if (nvec <= MLX5_EQ_VEC_COMP_BASE) return -ENOMEM; - priv->msix_arr = kcalloc(nvec, sizeof(*priv->msix_arr), GFP_KERNEL); - priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL); - if (!priv->msix_arr || !priv->irq_info) + if (!priv->irq_info) goto err_free_msix; - for (i = 0; i < nvec; i++) - priv->msix_arr[i].entry = i; - - nvec = pci_enable_msix_range(dev->pdev, priv->msix_arr, - MLX5_EQ_VEC_COMP_BASE + 1, nvec); + nvec = pci_alloc_irq_vectors(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + 1, nvec, + PCI_IRQ_MSIX); if (nvec < 0) return nvec; @@ -346,17 +341,15 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) err_free_msix: kfree(priv->irq_info); - kfree(priv->msix_arr); return -ENOMEM; } -static void mlx5_disable_msix(struct mlx5_core_dev *dev) +static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; - pci_disable_msix(dev->pdev); + pci_free_irq_vectors(dev->pdev); kfree(priv->irq_info); - kfree(priv->msix_arr); } struct mlx5_reg_host_endianness { @@ -615,8 +608,7 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) { struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); @@ -636,8 +628,7 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) { struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); irq_set_affinity_hint(irq, NULL); free_cpumask_var(priv->irq_info[i].mask); @@ -760,8 +751,8 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev) } #ifdef CONFIG_RFS_ACCEL - irq_cpu_rmap_add(dev->rmap, - dev->priv.msix_arr[i + MLX5_EQ_VEC_COMP_BASE].vector); + irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + i)); #endif snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(dev, eq, @@ -1119,9 +1110,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_poll; } - err = mlx5_enable_msix(dev); + err = mlx5_alloc_irq_vectors(dev); if (err) { - 
dev_err(&pdev->dev, "enable msix failed\n"); + dev_err(&pdev->dev, "alloc irq vectors failed\n"); goto err_cleanup_once; } @@ -1220,7 +1211,7 @@ err_put_uars: mlx5_put_uars_page(dev, priv->uar); err_disable_msix: - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); err_cleanup_once: if (boot) @@ -1287,7 +1278,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); if (cleanup) mlx5_cleanup_once(dev); mlx5_stop_health_poll(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 6a3d6bef7dd4..ba1d494b016d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -110,7 +110,6 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, u32 element_id); int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); void mlx5_cq_tasklet_cb(unsigned long data); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..5bac7f53b4f9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -597,7 +597,6 @@ struct mlx5_port_module_event_stats { struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; - struct msix_entry *msix_arr; struct mlx5_irq_info *irq_info; /* pages stuff */ -- cgit v1.2.3 From a435393acafbf0ecff4deb3e3cb554b34f0d0664 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:40 +0300 Subject: mlx5: move affinity hints assignments to generic code The generic API takes care of spreading affinity much as mlx5 open-coded it (and even handles asymmetric configurations better). Ask the generic API to spread affinity for us, and feed it pre_vectors that do not participate in affinity settings (an improvement over what we had before). The affinity assignments should match what mlx5 tried to do earlier, but now we do not set affinity on the dedicated async, cmd and pages vectors. Also, remove mlx5e_get_cpu and introduce mlx5e_get_node (used for allocation purposes) and mlx5_get_vector_affinity (for indirection table construction) as they provide the needed information. Luckily, we have generic helpers to get the cpumask and node given an irq vector. mlx5_get_vector_affinity will be used by mlx5_ib in a subsequent patch.
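A minimal sketch of the affinity-managed allocation used here; the vector counts are illustrative (in the patch, pre_vectors is MLX5_EQ_VEC_COMP_BASE, which keeps the async/cmd/pages vectors out of the spreading):

        #include <linux/interrupt.h>
        #include <linux/pci.h>

        static int example_alloc_spread_vectors(struct pci_dev *pdev)
        {
                /* Keep the first 3 (control) vectors out of the spread. */
                struct irq_affinity irqdesc = { .pre_vectors = 3 };
                int nvec;

                nvec = pci_alloc_irq_vectors_affinity(pdev, 3 + 1, 3 + 8,
                                PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &irqdesc);
                if (nvec < 0)
                        return nvec;

                /* The core now owns the placement; drivers just query it:
                 * pci_irq_get_affinity(pdev, 3 + i) for the cpumask and
                 * pci_irq_get_node(pdev, 3 + i) for the NUMA node.
                 */
                return nvec;
        }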
Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 45 +++++++------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 75 ++--------------------- include/linux/mlx5/driver.h | 7 ++- 4 files changed, 35 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e1b7ddfecd01..909123243a85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -587,7 +587,6 @@ struct mlx5e_channel { struct mlx5_core_dev *mdev; struct mlx5e_tstamp *tstamp; int ix; - int cpu; }; struct mlx5e_channels { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2c4e41833e55..fb647561c592 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -71,6 +71,11 @@ struct mlx5e_channel_param { struct mlx5e_cq_param icosq_cq; }; +static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) +{ + return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); +} + static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@ -444,16 +449,17 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; + int node = mlx5e_get_node(c->priv, c->ix); int i; rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, node); if (!rq->mpwqe.info) goto err_out; /* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, - cpu_to_node(c->cpu)); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, + GFP_KERNEL, node); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info; @@ -561,7 +567,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; - rqp->wq.db_numa_node = cpu_to_node(c->cpu); + rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@ -628,7 +634,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, + mlx5e_get_node(c->priv, c->ix)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@ -993,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1047,13 +1054,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = 
mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1119,13 +1126,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1497,8 +1504,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->priv->mdev; int err; - param->wq.buf_numa_node = cpu_to_node(c->cpu); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); param->eq_ix = c->ix; err = mlx5e_alloc_cq_common(mdev, param, cq); @@ -1597,11 +1604,6 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq) mlx5e_free_cq(cq); } -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) -{ - return cpumask_first(priv->mdev->priv.irq_info[ix].mask); -} - static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@ -1750,11 +1752,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; - int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; int err; - c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); if (!c) return -ENOMEM; @@ -1762,7 +1763,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; - c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@ -1848,7 +1848,8 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); + netif_set_xps_queue(c->netdev, + mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d2fd55e5c68b..e464e8179655 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -316,6 +316,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; + struct irq_affinity irqdesc = { + .pre_vectors = MLX5_EQ_VEC_COMP_BASE, + }; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; @@ -329,9 +332,10 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) if (!priv->irq_info) goto err_free_msix; - nvec = pci_alloc_irq_vectors(dev->pdev, + nvec = pci_alloc_irq_vectors_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1, nvec, - PCI_IRQ_MSIX); + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, + &irqdesc); if (nvec < 0) return nvec; 
@@ -605,63 +609,6 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) return (u64)timer_l | (u64)timer_h1 << 32; } -static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); - - if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { - mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); - return -ENOMEM; - } - - cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), - priv->irq_info[i].mask); - - if (IS_ENABLED(CONFIG_SMP) && - irq_set_affinity_hint(irq, priv->irq_info[i].mask)) - mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); - - return 0; -} - -static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); - - irq_set_affinity_hint(irq, NULL); - free_cpumask_var(priv->irq_info[i].mask); -} - -static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) -{ - int err; - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { - err = mlx5_irq_set_affinity_hint(mdev, i); - if (err) - goto err_out; - } - - return 0; - -err_out: - for (i--; i >= 0; i--) - mlx5_irq_clear_affinity_hint(mdev, i); - - return err; -} - -static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) -{ - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) - mlx5_irq_clear_affinity_hint(mdev, i); -} - int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn) { @@ -1134,12 +1081,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_eqs; } - err = mlx5_irq_set_affinity_hints(dev); - if (err) { - dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); - goto err_affinity_hints; - } - err = mlx5_init_fs(dev); if (err) { dev_err(&pdev->dev, "Failed to init flow steering\n"); @@ -1199,9 +1140,6 @@ err_sriov: mlx5_cleanup_fs(dev); err_fs: - mlx5_irq_clear_affinity_hints(dev); - -err_affinity_hints: free_comp_eqs(dev); err_stop_eqs: @@ -1274,7 +1212,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_eswitch_detach(dev->priv.eswitch); #endif mlx5_cleanup_fs(dev); - mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5bac7f53b4f9..579731842c94 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -534,7 +534,6 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { - cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; @@ -1184,4 +1183,10 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; +static inline const struct cpumask * +mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) +{ + return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector); +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From c66cd353bbe6869a059869a7a1518ec619afdc9d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:41 +0300 Subject: RDMA/core: expose affinity mappings per completion vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow ULPs to intelligently locate threads based on completion vector cpu affinity mappings. In case the driver does not expose a get_vector_affinity callout, return NULL so the caller can maintain a fallback logic. 
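To see what a caller gains from the new callout, consider a ULP that wants to run a queue's work near the CPUs servicing its completion vector. A minimal sketch, assuming the ULP knows its device's completion-vector count (example_pick_comp_vector and nr_vectors are illustrative names, not part of this patch):

/*
 * Sketch only: pick a completion vector whose IRQ affinity covers the
 * current CPU, falling back to vector 0 when no mapping is exposed.
 */
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <rdma/ib_verbs.h>

static int example_pick_comp_vector(struct ib_device *device, int nr_vectors)
{
	int cpu = raw_smp_processor_id();
	int vec;

	for (vec = 0; vec < nr_vectors; vec++) {
		const struct cpumask *mask;

		mask = ib_get_vector_affinity(device, vec);
		if (!mask)
			break;	/* driver exposed no mapping: fall back */
		if (cpumask_test_cpu(cpu, mask))
			return vec;
	}
	return 0;	/* naive fallback */
}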
Reviewed-by: Christoph Hellwig Reviewed-by: Håkon Bugge Acked-by: Doug Ledford Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..73ed2e4e802f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2296,6 +2296,8 @@ struct ib_device { */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); + const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, + int comp_vector); }; struct ib_client { @@ -3706,4 +3708,26 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/** + * ib_get_vector_affinity - Get the affinity mappings of a given completion + * vector + * @device: the rdma device + * @comp_vector: index of completion vector + * + * Returns NULL on failure, otherwise a corresponding cpu map of the + * completion vector (returns NULL if the device driver doesn't + * implement get_vector_affinity). + */ +static inline const struct cpumask * +ib_get_vector_affinity(struct ib_device *device, int comp_vector) +{ + if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || + !device->get_vector_affinity) + return NULL; + + return device->get_vector_affinity(device, comp_vector); +} + #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 24c5dc6610e8a3764fcb885cc3284c12ff1513de Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:43 +0300 Subject: block: Add rdma affinity based queue mapping helper Like pci and virtio, we add an rdma helper for affinity spreading. This achieves optimal mq affinity assignments according to the underlying rdma device affinity maps. Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- block/Kconfig | 5 +++++ block/Makefile | 1 + block/blk-mq-rdma.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq-rdma.h | 10 +++++++++ 4 files changed, 68 insertions(+) create mode 100644 block/blk-mq-rdma.c create mode 100644 include/linux/blk-mq-rdma.h (limited to 'include') diff --git a/block/Kconfig b/block/Kconfig index 89cd28f8d051..3ab42bbb06d5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -206,4 +206,9 @@ config BLK_MQ_VIRTIO depends on BLOCK && VIRTIO default y +config BLK_MQ_RDMA + bool + depends on BLOCK && INFINIBAND + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 2b281cf258a0..9396ebc85d24 100644 --- a/block/Makefile +++ b/block/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 000000000000..996167f1de18 --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 Sagi Grimberg.
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/blk-mq.h> +#include <linux/blk-mq-rdma.h> +#include <rdma/ib_verbs.h> + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @set: tagset to provide the mapping for + * @dev: rdma device associated with @set. + * @first_vec: first interrupt vector to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vectors as @set has queues. It will then query its affinity mask + * and build a queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with fewer vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fall back to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(set); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h new file mode 100644 index 000000000000..b4ade198007d --- /dev/null +++ b/include/linux/blk-mq-rdma.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_BLK_MQ_RDMA_H +#define _LINUX_BLK_MQ_RDMA_H + +struct blk_mq_tag_set; +struct ib_device; + +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec); + +#endif /* _LINUX_BLK_MQ_RDMA_H */ -- cgit v1.2.3 From f1174f77b50c94eecaa658fdc56fa69b421de4b8 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:26:19 +0100 Subject: bpf/verifier: rework value tracking Unifies adjusted and unadjusted register value types (e.g. FRAME_POINTER is now just a PTR_TO_STACK with zero offset). Tracks value alignment by means of tracking known & unknown bits. This also replaces the 'reg->imm' (leading zero bits) calculations for (what were) UNKNOWN_VALUEs. If pointer leaks are allowed, and adjust_ptr_min_max_vals returns -EACCES, treat the pointer as an unknown scalar and try again, because we might be able to conclude something about the result (e.g. pointer & 0x40 is either 0 or 0x40). Verifier hooks in the netronome/nfp driver were changed to match the new data structures. Signed-off-by: Edward Cree Signed-off-by: David S.
Miller --- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 24 +- include/linux/bpf.h | 34 +- include/linux/bpf_verifier.h | 34 +- include/linux/tnum.h | 79 + kernel/bpf/Makefile | 2 +- kernel/bpf/tnum.c | 164 ++ kernel/bpf/verifier.c | 1780 +++++++++++---------- 7 files changed, 1265 insertions(+), 852 deletions(-) create mode 100644 include/linux/tnum.h create mode 100644 kernel/bpf/tnum.c (limited to 'include') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index d696ba46f70a..5b783a91b115 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -79,28 +79,32 @@ nfp_bpf_check_exit(struct nfp_prog *nfp_prog, const struct bpf_verifier_env *env) { const struct bpf_reg_state *reg0 = &env->cur_state.regs[0]; + u64 imm; if (nfp_prog->act == NN_ACT_XDP) return 0; - if (reg0->type != CONST_IMM) { - pr_info("unsupported exit state: %d, imm: %llx\n", - reg0->type, reg0->imm); + if (!(reg0->type == SCALAR_VALUE && tnum_is_const(reg0->var_off))) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg0->var_off); + pr_info("unsupported exit state: %d, var_off: %s\n", + reg0->type, tn_buf); return -EINVAL; } - if (nfp_prog->act != NN_ACT_DIRECT && - reg0->imm != 0 && (reg0->imm & ~0U) != ~0U) { + imm = reg0->var_off.value; + if (nfp_prog->act != NN_ACT_DIRECT && imm != 0 && (imm & ~0U) != ~0U) { pr_info("unsupported exit state: %d, imm: %llx\n", - reg0->type, reg0->imm); + reg0->type, imm); return -EINVAL; } - if (nfp_prog->act == NN_ACT_DIRECT && reg0->imm <= TC_ACT_REDIRECT && - reg0->imm != TC_ACT_SHOT && reg0->imm != TC_ACT_STOLEN && - reg0->imm != TC_ACT_QUEUED) { + if (nfp_prog->act == NN_ACT_DIRECT && imm <= TC_ACT_REDIRECT && + imm != TC_ACT_SHOT && imm != TC_ACT_STOLEN && + imm != TC_ACT_QUEUED) { pr_info("unsupported exit state: %d, imm: %llx\n", - reg0->type, reg0->imm); + reg0->type, imm); return -EINVAL; } diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6353c7474dba..39229c455cba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -117,35 +117,25 @@ enum bpf_access_type { }; /* types of values stored in eBPF registers */ +/* Pointer types represent: + * pointer + * pointer + imm + * pointer + (u16) var + * pointer + (u16) var + imm + * if (range > 0) then [ptr, ptr + range - off) is safe to access + * if (id > 0) means that some 'var' was added + * if (off > 0) means that 'imm' was added + */ enum bpf_reg_type { NOT_INIT = 0, /* nothing was written into register */ - UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + SCALAR_VALUE, /* reg doesn't contain a valid pointer */ PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ - FRAME_PTR, /* reg == frame_pointer */ - PTR_TO_STACK, /* reg == frame_pointer + imm */ - CONST_IMM, /* constant integer value */ - - /* PTR_TO_PACKET represents: - * skb->data - * skb->data + imm - * skb->data + (u16) var - * skb->data + (u16) var + imm - * if (range > 0) then [ptr, ptr + range - off) is safe to access - * if (id > 0) means that some 'var' was added - * if (off > 0) menas that 'imm' was added - */ - PTR_TO_PACKET, + PTR_TO_STACK, /* reg == frame_pointer + offset */ + PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ - - /* PTR_TO_MAP_VALUE_ADJ is used for doing pointer math 
inside of a map - * elem value. We only allow this if we can statically verify that - * access from this register are going to fall within the size of the - * map element. - */ - PTR_TO_MAP_VALUE_ADJ, }; struct bpf_prog; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8e5d31f6faef..85936fa92d12 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -9,6 +9,7 @@ #include <linux/bpf.h> /* for enum bpf_reg_type */ #include <linux/filter.h> /* for MAX_BPF_STACK */ +#include <linux/tnum.h> /* Just some arbitrary values so we can safely do math without overflowing and * are obviously wrong for any sort of memory access. @@ -19,30 +20,37 @@ struct bpf_reg_state { enum bpf_reg_type type; union { - /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ - s64 imm; - - /* valid when type == PTR_TO_PACKET* */ - struct { - u16 off; - u16 range; - }; + /* valid when type == PTR_TO_PACKET */ + u16 range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct bpf_map *map_ptr; }; + /* Fixed part of pointer offset, pointer types only */ + s32 off; + /* For PTR_TO_PACKET, used to find other pointers with the same variable + * offset, so they can share range knowledge. + * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we + * came from, when one is tested for != NULL. + */ u32 id; + /* These three fields must be last. See states_equal() */ + /* For scalar types (SCALAR_VALUE), this represents our knowledge of + * the actual value. + * For pointer types, this represents the variable part of the offset + * from the pointed-to object, and is shared with all bpf_reg_states + * with the same id as us. + */ + struct tnum var_off; /* Used to determine if any memory access using this register will - * result in a bad access. These two fields must be last. - * See states_equal() + * result in a bad access. + * These refer to the same value as var_off, not necessarily the actual + * contents of the register. */ s64 min_value; u64 max_value; - u32 min_align; - u32 aux_off; - u32 aux_off_align; bool value_from_signed; }; diff --git a/include/linux/tnum.h b/include/linux/tnum.h new file mode 100644 index 000000000000..a0b07bf1842b --- /dev/null +++ b/include/linux/tnum.h @@ -0,0 +1,79 @@ +/* tnum: tracked (or tristate) numbers + * + * A tnum tracks knowledge about the bits of a value. Each bit can be either + * known (0 or 1), or unknown (x). Arithmetic operations on tnums will + * propagate the unknown bits such that the tnum result represents all the + * possible results for possible values of the operands. + */ +#include <linux/types.h> + +struct tnum { + u64 value; + u64 mask; +}; + +/* Constructors */ +/* Represent a known constant as a tnum.
*/ +struct tnum tnum_const(u64 value); +/* A completely unknown value */ +extern const struct tnum tnum_unknown; + +/* Arithmetic and logical ops */ +/* Shift a tnum left (by a fixed shift) */ +struct tnum tnum_lshift(struct tnum a, u8 shift); +/* Shift a tnum right (by a fixed shift) */ +struct tnum tnum_rshift(struct tnum a, u8 shift); +/* Add two tnums, return @a + @b */ +struct tnum tnum_add(struct tnum a, struct tnum b); +/* Subtract two tnums, return @a - @b */ +struct tnum tnum_sub(struct tnum a, struct tnum b); +/* Bitwise-AND, return @a & @b */ +struct tnum tnum_and(struct tnum a, struct tnum b); +/* Bitwise-OR, return @a | @b */ +struct tnum tnum_or(struct tnum a, struct tnum b); +/* Bitwise-XOR, return @a ^ @b */ +struct tnum tnum_xor(struct tnum a, struct tnum b); +/* Multiply two tnums, return @a * @b */ +struct tnum tnum_mul(struct tnum a, struct tnum b); + +/* Return a tnum representing numbers satisfying both @a and @b */ +struct tnum tnum_intersect(struct tnum a, struct tnum b); + +/* Return @a with all but the lowest @size bytes cleared */ +struct tnum tnum_cast(struct tnum a, u8 size); + +/* Returns true if @a is a known constant */ +static inline bool tnum_is_const(struct tnum a) +{ + return !a.mask; +} + +/* Returns true if @a == tnum_const(@b) */ +static inline bool tnum_equals_const(struct tnum a, u64 b) +{ + return tnum_is_const(a) && a.value == b; +} + +/* Returns true if @a is completely unknown */ +static inline bool tnum_is_unknown(struct tnum a) +{ + return !~a.mask; +} + +/* Returns true if @a is known to be a multiple of @size. + * @size must be a power of two. + */ +bool tnum_is_aligned(struct tnum a, u64 size); + +/* Returns true if @b represents a subset of @a. */ +bool tnum_in(struct tnum a, struct tnum b); + +/* Formatting functions. These have snprintf-like semantics: they will write + * up to @size bytes (including the terminating NUL byte), and return the number + * of bytes (excluding the terminating NUL) which would have been written had + * sufficient space been available. (Thus tnum_sbin always returns 64.) + */ +/* Format a tnum as a pair of hex numbers (value; mask) */ +int tnum_strn(char *str, size_t size, struct tnum a); +/* Format a tnum as tristate binary expansion */ +int tnum_sbin(char *str, size_t size, struct tnum a); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 48e92705be59..2f0bcda40e90 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,6 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c new file mode 100644 index 000000000000..92eeeb1974a2 --- /dev/null +++ b/kernel/bpf/tnum.c @@ -0,0 +1,164 @@ +/* tnum: tracked (or tristate) numbers + * + * A tnum tracks knowledge about the bits of a value. Each bit can be either + * known (0 or 1), or unknown (x). Arithmetic operations on tnums will + * propagate the unknown bits such that the tnum result represents all the + * possible results for possible values of the operands. 
+ */ +#include <linux/kernel.h> +#include <linux/tnum.h> + +#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} +/* A completely unknown value */ +const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; + +struct tnum tnum_const(u64 value) +{ + return TNUM(value, 0); +} + +struct tnum tnum_lshift(struct tnum a, u8 shift) +{ + return TNUM(a.value << shift, a.mask << shift); +} + +struct tnum tnum_rshift(struct tnum a, u8 shift) +{ + return TNUM(a.value >> shift, a.mask >> shift); +} + +struct tnum tnum_add(struct tnum a, struct tnum b) +{ + u64 sm, sv, sigma, chi, mu; + + sm = a.mask + b.mask; + sv = a.value + b.value; + sigma = sm + sv; + chi = sigma ^ sv; + mu = chi | a.mask | b.mask; + return TNUM(sv & ~mu, mu); +} + +struct tnum tnum_sub(struct tnum a, struct tnum b) +{ + u64 dv, alpha, beta, chi, mu; + + dv = a.value - b.value; + alpha = dv + a.mask; + beta = dv - b.mask; + chi = alpha ^ beta; + mu = chi | a.mask | b.mask; + return TNUM(dv & ~mu, mu); +} + +struct tnum tnum_and(struct tnum a, struct tnum b) +{ + u64 alpha, beta, v; + + alpha = a.value | a.mask; + beta = b.value | b.mask; + v = a.value & b.value; + return TNUM(v, alpha & beta & ~v); +} + +struct tnum tnum_or(struct tnum a, struct tnum b) +{ + u64 v, mu; + + v = a.value | b.value; + mu = a.mask | b.mask; + return TNUM(v, mu & ~v); +} + +struct tnum tnum_xor(struct tnum a, struct tnum b) +{ + u64 v, mu; + + v = a.value ^ b.value; + mu = a.mask | b.mask; + return TNUM(v & ~mu, mu); +} + +/* half-multiply add: acc += (unknown * mask * value). + * An intermediate step in the multiply algorithm. + */ +static struct tnum hma(struct tnum acc, u64 value, u64 mask) +{ + while (mask) { + if (mask & 1) + acc = tnum_add(acc, TNUM(0, value)); + mask >>= 1; + value <<= 1; + } + return acc; +} + +struct tnum tnum_mul(struct tnum a, struct tnum b) +{ + struct tnum acc; + u64 pi; + + pi = a.value * b.value; + acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value); + return hma(acc, b.mask, a.value); +} + +/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has + * a 'known 0' - this will return a 'known 1' for that bit. + */ +struct tnum tnum_intersect(struct tnum a, struct tnum b) +{ + u64 v, mu; + + v = a.value | b.value; + mu = a.mask & b.mask; + return TNUM(v & ~mu, mu); +} + +struct tnum tnum_cast(struct tnum a, u8 size) +{ + a.value &= (1ULL << (size * 8)) - 1; + a.mask &= (1ULL << (size * 8)) - 1; + return a; +} + +bool tnum_is_aligned(struct tnum a, u64 size) +{ + if (!size) + return true; + return !((a.value | a.mask) & (size - 1)); +} + +bool tnum_in(struct tnum a, struct tnum b) +{ + if (b.mask & ~a.mask) + return false; + b.value &= ~a.mask; + return a.value == b.value; +} + +int tnum_strn(char *str, size_t size, struct tnum a) +{ + return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); +} +EXPORT_SYMBOL_GPL(tnum_strn); + +int tnum_sbin(char *str, size_t size, struct tnum a) +{ + size_t n; + + for (n = 64; n; n--) { + if (n < size) { + if (a.mask & 1) + str[n - 1] = 'x'; + else if (a.value & 1) + str[n - 1] = '1'; + else + str[n - 1] = '0'; + } + a.mask >>= 1; + a.value >>= 1; + } + str[min(size - 1, (size_t)64)] = 0; + return 64; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6e8b3887eab..c3f88b466c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -61,12 +61,12 @@ * (and -20 constant is saved for further stack bounds checking). * Meaning that this reg is a pointer to stack plus known immediate constant.
* - * Most of the time the registers have UNKNOWN_VALUE type, which + * Most of the time the registers have SCALAR_VALUE type, which * means the register has some value, but it's not a valid pointer. - * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register - * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer * types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' @@ -180,15 +180,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...) /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", - [UNKNOWN_VALUE] = "inv", + [SCALAR_VALUE] = "inv", [PTR_TO_CTX] = "ctx", [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj", - [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", - [CONST_IMM] = "imm", [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -221,32 +218,36 @@ static void print_verifier_state(struct bpf_verifier_state *state) if (t == NOT_INIT) continue; verbose(" R%d=%s", i, reg_type_str[t]); - if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%lld", reg->imm); - else if (t == PTR_TO_PACKET) - verbose("(id=%d,off=%d,r=%d)", - reg->id, reg->off, reg->range); - else if (t == UNKNOWN_VALUE && reg->imm) - verbose("%lld", reg->imm); - else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL || - t == PTR_TO_MAP_VALUE_ADJ) - verbose("(ks=%d,vs=%d,id=%u)", - reg->map_ptr->key_size, - reg->map_ptr->value_size, - reg->id); - if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%lld", - (long long)reg->min_value); - if (reg->max_value != BPF_REGISTER_MAX_RANGE) - verbose(",max_value=%llu", - (unsigned long long)reg->max_value); - if (reg->min_align) - verbose(",min_align=%u", reg->min_align); - if (reg->aux_off) - verbose(",aux_off=%u", reg->aux_off); - if (reg->aux_off_align) - verbose(",aux_off_align=%u", reg->aux_off_align); + if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && + tnum_is_const(reg->var_off)) { + /* reg->off should be 0 for SCALAR_VALUE */ + verbose("%lld", reg->var_off.value + reg->off); + } else { + verbose("(id=%d", reg->id); + if (t != SCALAR_VALUE) + verbose(",off=%d", reg->off); + if (t == PTR_TO_PACKET) + verbose(",r=%d", reg->range); + else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose(",ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); + if (reg->min_value != BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%lld", + (long long)reg->min_value); + if (reg->max_value != BPF_REGISTER_MAX_RANGE) + verbose(",max_value=%llu", + (unsigned long long)reg->max_value); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(",var_off=%s", tn_buf); + } + verbose(")"); + } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -463,14 +464,69 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void __mark_reg_not_init(struct 
bpf_reg_state *reg); + +/* Mark the 'variable offset' part of a register as zero. This should be + * used only on registers holding a pointer type. + */ +static void __mark_reg_known_zero(struct bpf_reg_state *reg) { - BUG_ON(regno >= MAX_BPF_REG); + reg->var_off = tnum_const(0); + reg->min_value = 0; + reg->max_value = 0; +} - memset(®s[regno], 0, sizeof(regs[regno])); - regs[regno].type = NOT_INIT; - regs[regno].min_value = BPF_REGISTER_MIN_RANGE; - regs[regno].max_value = BPF_REGISTER_MAX_RANGE; +static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +{ + if (WARN_ON(regno >= MAX_BPF_REG)) { + verbose("mark_reg_known_zero(regs, %u)\n", regno); + /* Something bad happened, let's kill all regs */ + for (regno = 0; regno < MAX_BPF_REG; regno++) + __mark_reg_not_init(regs + regno); + return; + } + __mark_reg_known_zero(regs + regno); +} + +/* Mark a register as having a completely unknown (scalar) value. */ +static void __mark_reg_unknown(struct bpf_reg_state *reg) +{ + reg->type = SCALAR_VALUE; + reg->id = 0; + reg->off = 0; + reg->var_off = tnum_unknown; + reg->min_value = BPF_REGISTER_MIN_RANGE; + reg->max_value = BPF_REGISTER_MAX_RANGE; +} + +static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +{ + if (WARN_ON(regno >= MAX_BPF_REG)) { + verbose("mark_reg_unknown(regs, %u)\n", regno); + /* Something bad happened, let's kill all regs */ + for (regno = 0; regno < MAX_BPF_REG; regno++) + __mark_reg_not_init(regs + regno); + return; + } + __mark_reg_unknown(regs + regno); +} + +static void __mark_reg_not_init(struct bpf_reg_state *reg) +{ + __mark_reg_unknown(reg); + reg->type = NOT_INIT; +} + +static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +{ + if (WARN_ON(regno >= MAX_BPF_REG)) { + verbose("mark_reg_not_init(regs, %u)\n", regno); + /* Something bad happened, let's kill all regs */ + for (regno = 0; regno < MAX_BPF_REG; regno++) + __mark_reg_not_init(regs + regno); + return; + } + __mark_reg_not_init(regs + regno); } static void init_reg_state(struct bpf_reg_state *regs) @@ -481,23 +537,12 @@ static void init_reg_state(struct bpf_reg_state *regs) mark_reg_not_init(regs, i); /* frame pointer */ - regs[BPF_REG_FP].type = FRAME_PTR; + regs[BPF_REG_FP].type = PTR_TO_STACK; + mark_reg_known_zero(regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; -} - -static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) -{ - regs[regno].type = UNKNOWN_VALUE; - regs[regno].id = 0; - regs[regno].imm = 0; -} - -static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) -{ - BUG_ON(regno >= MAX_BPF_REG); - __mark_reg_unknown_value(regs, regno); + mark_reg_known_zero(regs, BPF_REG_1); } static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) @@ -505,14 +550,6 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; regs[regno].value_from_signed = false; - regs[regno].min_align = 0; -} - -static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, - u32 regno) -{ - mark_reg_unknown_value(regs, regno); - reset_reg_range_values(regs, regno); } enum reg_arg_type { @@ -542,7 +579,7 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, return -EACCES; } if (t == DST_OP) - mark_reg_unknown_value(regs, regno); + mark_reg_unknown(regs, regno); } return 0; } @@ -552,12 +589,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) switch (type) 
{ case PTR_TO_MAP_VALUE: case PTR_TO_MAP_VALUE_OR_NULL: - case PTR_TO_MAP_VALUE_ADJ: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: case PTR_TO_PACKET_END: - case FRAME_PTR: case CONST_PTR_TO_MAP: return true; default: @@ -637,14 +672,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown_value_and_range(state->regs, - value_regno); + mark_reg_unknown(state->regs, value_regno); return 0; } } /* check read/write into map element returned by bpf_map_lookup_elem() */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, +static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_map *map = env->cur_state.regs[regno].map_ptr; @@ -657,22 +691,25 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } -/* check read/write into an adjusted map element */ -static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, +/* check read/write into a map element with possible variable offset */ +static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { struct bpf_verifier_state *state = &env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; - /* We adjusted the register to this map value, so we - * need to change off and size to min_value and max_value - * respectively to make sure our theoretical access will be - * safe. + /* We may have adjusted the register to this map value, so we + * need to try adding each of min_value and max_value to off + * to make sure our theoretical access will be safe. */ if (log_level) print_verifier_state(state); - env->varlen_map_value_access = true; + /* If the offset is variable, we will need to be stricter in state + * pruning from now on. + */ + if (!tnum_is_const(reg->var_off)) + env->varlen_map_value_access = true; /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. 
If we are using signed variables for our @@ -684,10 +721,9 @@ static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = check_map_access(env, regno, reg->min_value + off, size); + err = __check_map_access(env, regno, reg->min_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", - regno); + verbose("R%d min value is outside of the array range\n", regno); return err; } @@ -699,7 +735,10 @@ static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - return check_map_access(env, regno, reg->max_value + off, size); + err = __check_map_access(env, regno, reg->max_value + off, size); + if (err) + verbose("R%d max value is outside of the array range\n", regno); + return err; } #define MAX_PACKET_OFF 0xffff @@ -729,14 +768,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) +static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, + int off, int size) { struct bpf_reg_state *regs = env->cur_state.regs; struct bpf_reg_state *reg = ®s[regno]; - off += reg->off; - if (off < 0 || size <= 0 || off + size > reg->range) { + if (off < 0 || size <= 0 || (u64)off + size > reg->range) { verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -744,7 +782,35 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return 0; } -/* check access to 'struct bpf_context' fields */ +static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *reg = ®s[regno]; + int err; + + /* We may have added a variable offset to the packet pointer; but any + * reg->range we have comes after that. We are only checking the fixed + * offset. + */ + + /* We don't allow negative numbers, because we aren't tracking enough + * detail to prove they're safe. + */ + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + err = __check_packet_access(env, regno, off, size); + if (err) { + verbose("R%d offset is outside of the packet\n", regno); + return err; + } + return err; +} + +/* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { @@ -784,13 +850,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks, if (allow_ptr_leaks) return false; - switch (reg->type) { - case UNKNOWN_VALUE: - case CONST_IMM: - return false; - default: - return true; - } + return reg->type != SCALAR_VALUE; } static bool is_pointer_value(struct bpf_verifier_env *env, int regno) @@ -801,23 +861,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { + struct tnum reg_off; int ip_align; - int reg_off; /* Byte size accesses are always allowed. 
*/ if (!strict || size == 1) return 0; - reg_off = reg->off; - if (reg->id) { - if (reg->aux_off_align % size) { - verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", - reg->aux_off_align, size); - return -EACCES; - } - reg_off += reg->aux_off; - } - /* For platforms that do not have a Kconfig enabling * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of * NET_IP_ALIGN is universally set to '2'. And on platforms @@ -827,20 +877,37 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, * unconditional IP align value of '2'. */ ip_align = 2; - if ((ip_align + reg_off + off) % size != 0) { - verbose("misaligned packet access off %d+%d+%d size %d\n", - ip_align, reg_off, off, size); + + reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); + if (!tnum_is_aligned(reg_off, size)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + ip_align, tn_buf, reg->off, off, size); return -EACCES; } return 0; } -static int check_val_ptr_alignment(const struct bpf_reg_state *reg, - int size, bool strict) +static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, + const char *pointer_desc, + int off, int size, bool strict) { - if (strict && size != 1) { - verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); + struct tnum reg_off; + + /* Byte size accesses are always allowed. */ + if (!strict || size == 1) + return 0; + + reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); + if (!tnum_is_aligned(reg_off, size)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("misaligned %saccess off %s+%d+%d size %d\n", + pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -852,21 +919,25 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, int off, int size) { bool strict = env->strict_alignment; + const char *pointer_desc = ""; switch (reg->type) { case PTR_TO_PACKET: + /* special case, because of NET_IP_ALIGN */ return check_pkt_ptr_alignment(reg, off, size, strict); - case PTR_TO_MAP_VALUE_ADJ: - return check_val_ptr_alignment(reg, size, strict); + case PTR_TO_MAP_VALUE: + pointer_desc = "value "; + break; + case PTR_TO_CTX: + pointer_desc = "context "; + break; + case PTR_TO_STACK: + pointer_desc = "stack "; + break; default: - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", - off, size); - return -EACCES; - } - - return 0; + break; } + return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -883,52 +954,79 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn struct bpf_reg_state *reg = &state->regs[regno]; int size, err = 0; - if (reg->type == PTR_TO_STACK) - off += reg->imm; - size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; + /* alignment checks will add in reg->off themselves */ err = check_ptr_alignment(env, reg, off, size); if (err) return err; - if (reg->type == PTR_TO_MAP_VALUE || - reg->type == PTR_TO_MAP_VALUE_ADJ) { + /* for access checks, reg->off is just part of off */ + off += reg->off; + + if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); return -EACCES; } - if (reg->type == PTR_TO_MAP_VALUE_ADJ) - err = check_map_access_adj(env, regno, off, size); - else - err = 
check_map_access(env, regno, off, size); + err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value_and_range(state->regs, - value_regno); + mark_reg_unknown(state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - enum bpf_reg_type reg_type = UNKNOWN_VALUE; + enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } + /* ctx accesses must be at a fixed offset, so that we can + * determine what type of data were returned. + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable ctx access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + off += reg->var_off.value; err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { - mark_reg_unknown_value_and_range(state->regs, - value_regno); - /* note that reg.[id|off|range] == 0 */ + /* ctx access returns either a scalar, or a + * PTR_TO_PACKET[_END]. In the latter case, we know + * the offset is zero. + */ + if (reg_type == SCALAR_VALUE) + mark_reg_unknown(state->regs, value_regno); + else + mark_reg_known_zero(state->regs, value_regno); + state->regs[value_regno].id = 0; + state->regs[value_regno].off = 0; + state->regs[value_regno].range = 0; state->regs[value_regno].type = reg_type; - state->regs[value_regno].aux_off = 0; - state->regs[value_regno].aux_off_align = 0; } - } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { + } else if (reg->type == PTR_TO_STACK) { + /* stack accesses must be at a fixed offset, so that we can + * determine what type of data were returned. + * See check_stack_read(). + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -948,7 +1046,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (state->regs[regno].type == PTR_TO_PACKET) { + } else if (reg->type == PTR_TO_PACKET) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -960,21 +1058,24 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown_value_and_range(state->regs, - value_regno); + mark_reg_unknown(state->regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); return -EACCES; } - if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks && - state->regs[value_regno].type == UNKNOWN_VALUE) { - /* 1 or 2 byte load zero-extends, determine the number of - * zero upper bits. Not doing it fo 4 byte load, since - * such values cannot be added to ptr_to_packet anyway. 
- */ - state->regs[value_regno].imm = 64 - size * 8; + if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && + state->regs[value_regno].type == SCALAR_VALUE) { + /* b/h/w load zero-extends, mark upper bits as known 0 */ + state->regs[value_regno].var_off = tnum_cast( + state->regs[value_regno].var_off, size); + /* sign bit is known zero, so we can bound the value */ + state->regs[value_regno].min_value = 0; + state->regs[value_regno].max_value = min_t(u64, + state->regs[value_regno].var_off.value | + state->regs[value_regno].var_off.mask, + BPF_REGISTER_MAX_RANGE); } return err; } @@ -1016,9 +1117,17 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1); } +/* Does this register contain a constant zero? */ +static bool register_is_null(struct bpf_reg_state reg) +{ + return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized + * and all elements of stack are initialized. + * Unlike most pointer bounds-checking functions, this one doesn't take an + * 'off' argument, so it has to add in reg->off itself. */ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, @@ -1029,9 +1138,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int off, i; if (regs[regno].type != PTR_TO_STACK) { + /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - regs[regno].type == CONST_IMM && - regs[regno].imm == 0) + register_is_null(regs[regno])) return 0; verbose("R%d type=%s expected=%s\n", regno, @@ -1040,7 +1149,15 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - off = regs[regno].imm; + /* Only allow fixed-offset stack reads */ + if (!tnum_is_const(regs[regno].var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + verbose("invalid variable stack read R%d var_off=%s\n", + regno, tn_buf); + } + off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { verbose("invalid stack type R%d off=%d access_size=%d\n", @@ -1071,16 +1188,14 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; - switch (regs[regno].type) { + switch (reg->type) { case PTR_TO_PACKET: - return check_packet_access(env, regno, 0, access_size); + return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: - return check_map_access(env, regno, 0, access_size); - case PTR_TO_MAP_VALUE_ADJ: - return check_map_access_adj(env, regno, 0, access_size); - default: /* const_imm|ptr_to_stack or invalid ptr */ + return check_map_access(env, regno, reg->off, access_size); + default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); } @@ -1123,11 +1238,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { - expected_type = CONST_IMM; - /* One 
exception. Allow UNKNOWN_VALUE registers when the - * boundaries are known and don't cause unsafe memory accesses - */ - if (type != UNKNOWN_VALUE && type != expected_type) + expected_type = SCALAR_VALUE; + if (type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; @@ -1141,13 +1253,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a CONST_IMM type. Final test + * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (type == CONST_IMM && reg->imm == 0) + if (register_is_null(*reg)) /* final test in check_stack_boundary() */; else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && - type != PTR_TO_MAP_VALUE_ADJ && type != expected_type) + type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { @@ -1173,7 +1285,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } if (type == PTR_TO_PACKET) - err = check_packet_access(env, regno, 0, + err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else err = check_stack_boundary(env, regno, @@ -1189,7 +1301,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } if (type == PTR_TO_PACKET) - err = check_packet_access(env, regno, 0, + err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else err = check_stack_boundary(env, regno, @@ -1209,10 +1321,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EACCES; } - /* If the register is UNKNOWN_VALUE, the access check happens - * using its boundaries. Otherwise, just use its imm + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. 
*/ - if (type == UNKNOWN_VALUE) { + + if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to * initialize all the memory that the helper could @@ -1220,35 +1333,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ meta = NULL; - if (reg->min_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } - - if (reg->min_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } + if (reg->min_value < 0) { + verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; - } - err = check_helper_mem_access(env, regno - 1, - reg->max_value, - zero_size_allowed, meta); + if (reg->min_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); if (err) return err; - } else { - /* register is CONST_IMM */ - err = check_helper_mem_access(env, regno - 1, reg->imm, - zero_size_allowed, meta); } + + if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->max_value, + zero_size_allowed, meta); } return err; @@ -1352,6 +1458,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } +/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, + * so turn them into unknown SCALAR_VALUE. 
+ */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { struct bpf_verifier_state *state = &env->cur_state; @@ -1361,7 +1470,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET || regs[i].type == PTR_TO_PACKET_END) - mark_reg_unknown_value(regs, i); + mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1370,8 +1479,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; - __mark_reg_unknown_value(state->spilled_regs, - i / BPF_REG_SIZE); + __mark_reg_unknown(reg); } } @@ -1451,14 +1559,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* update return register */ if (fn->ret_type == RET_INTEGER) { - regs[BPF_REG_0].type = UNKNOWN_VALUE; + /* sets type to SCALAR_VALUE */ + mark_reg_unknown(regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { struct bpf_insn_aux_data *insn_aux; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; + /* There is no offset yet applied, variable or fixed */ + mark_reg_known_zero(regs, BPF_REG_0); + regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() @@ -1489,456 +1600,337 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static int check_packet_ptr_add(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static void check_reg_overflow(struct bpf_reg_state *reg) { - struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - struct bpf_reg_state tmp_reg; - s32 imm; - - if (BPF_SRC(insn->code) == BPF_K) { - /* pkt_ptr += imm */ - imm = insn->imm; - -add_imm: - if (imm < 0) { - verbose("addition of negative constant to packet pointer is not allowed\n"); - return -EACCES; - } - if (imm >= MAX_PACKET_OFF || - imm + dst_reg->off >= MAX_PACKET_OFF) { - verbose("constant %d is too large to add to packet pointer\n", - imm); - return -EACCES; - } - /* a constant was added to pkt_ptr. - * Remember it while keeping the same 'id' - */ - dst_reg->off += imm; - } else { - bool had_id; - - if (src_reg->type == PTR_TO_PACKET) { - /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ - tmp_reg = *dst_reg; /* save r7 state */ - *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ - src_reg = &tmp_reg; /* pretend it's src_reg state */ - /* if the checks below reject it, the copy won't matter, - * since we're rejecting the whole program. 
If all ok, - * then imm22 state will be added to r7 - * and r7 will be pkt(id=0,off=22,r=62) while - * r6 will stay as pkt(id=0,off=0,r=62) - */ - } - - if (src_reg->type == CONST_IMM) { - /* pkt_ptr += reg where reg is known constant */ - imm = src_reg->imm; - goto add_imm; - } - /* disallow pkt_ptr += reg - * if reg is not uknown_value with guaranteed zero upper bits - * otherwise pkt_ptr may overflow and addition will become - * subtraction which is not allowed - */ - if (src_reg->type != UNKNOWN_VALUE) { - verbose("cannot add '%s' to ptr_to_packet\n", - reg_type_str[src_reg->type]); - return -EACCES; - } - if (src_reg->imm < 48) { - verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n", - src_reg->imm); - return -EACCES; - } - - had_id = (dst_reg->id != 0); + if (reg->max_value > BPF_REGISTER_MAX_RANGE) + reg->max_value = BPF_REGISTER_MAX_RANGE; + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) + reg->min_value = BPF_REGISTER_MIN_RANGE; +} - /* dst_reg stays as pkt_ptr type and since some positive - * integer value was added to the pointer, increment its 'id' - */ - dst_reg->id = ++env->id_gen; - - /* something was added to pkt_ptr, set range to zero */ - dst_reg->aux_off += dst_reg->off; - dst_reg->off = 0; - dst_reg->range = 0; - if (had_id) - dst_reg->aux_off_align = min(dst_reg->aux_off_align, - src_reg->min_align); - else - dst_reg->aux_off_align = src_reg->min_align; +static void coerce_reg_to_32(struct bpf_reg_state *reg) +{ + /* 32-bit values can't be negative as an s64 */ + if (reg->min_value < 0) + reg->min_value = 0; + /* clear high 32 bits */ + reg->var_off = tnum_cast(reg->var_off, 4); + /* Did value become known? Then update bounds */ + if (tnum_is_const(reg->var_off)) { + if ((s64)reg->var_off.value > BPF_REGISTER_MIN_RANGE) + reg->min_value = reg->var_off.value; + if (reg->var_off.value < BPF_REGISTER_MAX_RANGE) + reg->max_value = reg->var_off.value; } - return 0; } -static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) +/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. + * Caller must check_reg_overflow all argument regs beforehand. + * Caller should also handle BPF_MOV case separately. + * If we return -EACCES, caller may want to try again treating pointer as a + * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks. + */ +static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + bool known = tnum_is_const(off_reg->var_off); + s64 min_val = off_reg->min_value; + u64 max_val = off_reg->max_value; u8 opcode = BPF_OP(insn->code); - s64 imm_log2; + u32 dst = insn->dst_reg; - /* for type == UNKNOWN_VALUE: - * imm > 0 -> number of zero upper bits - * imm == 0 -> don't track which is the same as all bits can be non-zero - */ + dst_reg = ®s[dst]; - if (BPF_SRC(insn->code) == BPF_X) { - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - - if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && - dst_reg->imm && opcode == BPF_ADD) { - /* dreg += sreg - * where both have zero upper bits. Adding them - * can only result making one more bit non-zero - * in the larger value. - * Ex. 
0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) - * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) - */ - dst_reg->imm = min(dst_reg->imm, src_reg->imm); - dst_reg->imm--; - return 0; - } - if (src_reg->type == CONST_IMM && src_reg->imm > 0 && - dst_reg->imm && opcode == BPF_ADD) { - /* dreg += sreg - * where dreg has zero upper bits and sreg is const. - * Adding them can only result making one more bit - * non-zero in the larger value. - */ - imm_log2 = __ilog2_u64((long long)src_reg->imm); - dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); - dst_reg->imm--; - return 0; - } - /* all other cases non supported yet, just mark dst_reg */ - dst_reg->imm = 0; - return 0; + if (WARN_ON_ONCE(known && (min_val != max_val))) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error\n"); + return -EINVAL; + } + + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops on pointers produce (meaningless) scalars */ + if (!env->allow_ptr_leaks) + verbose("R%d 32-bit pointer arithmetic prohibited\n", + dst); + return -EACCES; } - /* sign extend 32-bit imm into 64-bit to make sure that - * negative values occupy bit 63. Note ilog2() would have - * been incorrect, since sizeof(insn->imm) == 4 + if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); + return -EACCES; + } + if (ptr_reg->type == CONST_PTR_TO_MAP) { + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); + return -EACCES; + } + if (ptr_reg->type == PTR_TO_PACKET_END) { + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); + return -EACCES; + } + + /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. + * The id may be overwritten later if we create a new variable offset. */ - imm_log2 = __ilog2_u64((long long)insn->imm); + dst_reg->type = ptr_reg->type; + dst_reg->id = ptr_reg->id; - if (dst_reg->imm && opcode == BPF_LSH) { - /* reg <<= imm - * if reg was a result of 2 byte load, then its imm == 48 - * which means that upper 48 bits are zero and shifting this reg - * left by 4 would mean that upper 44 bits are still zero + switch (opcode) { + case BPF_ADD: + /* We can take a fixed offset as long as it doesn't overflow + * the s32 'off' field */ - dst_reg->imm -= insn->imm; - } else if (dst_reg->imm && opcode == BPF_MUL) { - /* reg *= imm - * if multiplying by 14 subtract 4 - * This is conservative calculation of upper zero bits. - * It's not trying to special case insn->imm == 1 or 0 cases + if (known && (ptr_reg->off + min_val == + (s64)(s32)(ptr_reg->off + min_val))) { + /* pointer += K. Accumulate it into fixed offset */ + dst_reg->min_value = ptr_reg->min_value; + dst_reg->max_value = ptr_reg->max_value; + dst_reg->var_off = ptr_reg->var_off; + dst_reg->off = ptr_reg->off + min_val; + dst_reg->range = ptr_reg->range; + break; + } + if (max_val == BPF_REGISTER_MAX_RANGE) { + if (!env->allow_ptr_leaks) + verbose("R%d tried to add unbounded value to pointer\n", + dst); + return -EACCES; + } + /* A new variable offset is created. Note that off_reg->off + * == 0, since it's a scalar. + * dst_reg gets the pointer type and since some positive + * integer value was added to the pointer, give it a new 'id' + * if it's a PTR_TO_PACKET. + * this creates a new 'base' pointer, off_reg (variable) gets + * added into the variable offset, and we copy the fixed offset + * from ptr_reg. 
 		 */
-		dst_reg->imm -= imm_log2 + 1;
-	} else if (opcode == BPF_AND) {
-		/* reg &= imm */
-		dst_reg->imm = 63 - imm_log2;
-	} else if (dst_reg->imm && opcode == BPF_ADD) {
-		/* reg += imm */
-		dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
-		dst_reg->imm--;
-	} else if (opcode == BPF_RSH) {
-		/* reg >>= imm
-		 * which means that after right shift, upper bits will be zero
-		 * note that verifier already checked that
-		 * 0 <= imm < 64 for shift insn
+		if (min_val <= BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+		if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+			dst_reg->min_value += min_val;
+		if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+			dst_reg->max_value += max_val;
+		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
+		dst_reg->off = ptr_reg->off;
+		if (ptr_reg->type == PTR_TO_PACKET) {
+			dst_reg->id = ++env->id_gen;
+			/* something was added to pkt_ptr, set range to zero */
+			dst_reg->range = 0;
+		}
+		break;
+	case BPF_SUB:
+		if (dst_reg == off_reg) {
+			/* scalar -= pointer.  Creates an unknown scalar */
+			if (!env->allow_ptr_leaks)
+				verbose("R%d tried to subtract pointer from scalar\n",
+					dst);
+			return -EACCES;
+		}
+		/* We don't allow subtraction from FP, because (according to
+		 * test_verifier.c test "invalid fp arithmetic") JITs might not
+		 * be able to deal with it.
 		 */
-		dst_reg->imm += insn->imm;
-		if (unlikely(dst_reg->imm > 64))
-			/* some dumb code did:
-			 * r2 = *(u32 *)mem;
-			 * r2 >>= 32;
-			 * and all bits are zero now */
-			dst_reg->imm = 64;
-	} else {
-		/* all other alu ops, means that we don't know what will
-		 * happen to the value, mark it with unknown number of zero bits
+		if (ptr_reg->type == PTR_TO_STACK) {
+			if (!env->allow_ptr_leaks)
+				verbose("R%d subtraction from stack pointer prohibited\n",
+					dst);
+			return -EACCES;
+		}
+		if (known && (ptr_reg->off - min_val ==
+			      (s64)(s32)(ptr_reg->off - min_val))) {
+			/* pointer -= K.  Subtract it from fixed offset */
+			dst_reg->min_value = ptr_reg->min_value;
+			dst_reg->max_value = ptr_reg->max_value;
+			dst_reg->var_off = ptr_reg->var_off;
+			dst_reg->id = ptr_reg->id;
+			dst_reg->off = ptr_reg->off - min_val;
+			dst_reg->range = ptr_reg->range;
+			break;
+		}
+		/* Subtracting a negative value will just confuse everything.
+		 * This can happen if off_reg is an immediate.
 		 */
-		dst_reg->imm = 0;
-	}
-
-	if (dst_reg->imm < 0) {
-		/* all 64 bits of the register can contain non-zero bits
-		 * and such value cannot be added to ptr_to_packet, since it
-		 * may overflow, mark it as unknown to avoid further eval
+		if ((s64)max_val < 0) {
+			if (!env->allow_ptr_leaks)
+				verbose("R%d tried to subtract negative max_val %lld from pointer\n",
+					dst, (s64)max_val);
+			return -EACCES;
+		}
+		/* A new variable offset is created.  If the subtrahend is known
+		 * nonnegative, then any reg->range we had before is still good.
 		 */
-		dst_reg->imm = 0;
-	}
-	return 0;
-}
-
-static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
-					struct bpf_insn *insn)
-{
-	struct bpf_reg_state *regs = env->cur_state.regs;
-	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
-	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
-	u8 opcode = BPF_OP(insn->code);
-	s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
-
-	/* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
-	if (src_reg->imm > 0 && dst_reg->imm) {
-		switch (opcode) {
-		case BPF_ADD:
-			/* dreg += sreg
-			 * where both have zero upper bits. Adding them
-			 * can only result making one more bit non-zero
-			 * in the larger value.
-			 * Ex.
0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) - * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) - */ - dst_reg->imm = min(src_reg->imm, 63 - imm_log2); - dst_reg->imm--; - break; - case BPF_AND: - /* dreg &= sreg - * AND can not extend zero bits only shrink - * Ex. 0x00..00ffffff - * & 0x0f..ffffffff - * ---------------- - * 0x00..00ffffff - */ - dst_reg->imm = max(src_reg->imm, 63 - imm_log2); - break; - case BPF_OR: - /* dreg |= sreg - * OR can only extend zero bits - * Ex. 0x00..00ffffff - * | 0x0f..ffffffff - * ---------------- - * 0x0f..00ffffff - */ - dst_reg->imm = min(src_reg->imm, 63 - imm_log2); - break; - case BPF_SUB: - case BPF_MUL: - case BPF_RSH: - case BPF_LSH: - /* These may be flushed out later */ - default: - mark_reg_unknown_value(regs, insn->dst_reg); + if (max_val >= BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= max_val; + if (min_val <= BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= min_val; + dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); + dst_reg->off = ptr_reg->off; + if (ptr_reg->type == PTR_TO_PACKET) { + dst_reg->id = ++env->id_gen; + /* something was added to pkt_ptr, set range to zero */ + if (min_val < 0) + dst_reg->range = 0; } - } else { - mark_reg_unknown_value(regs, insn->dst_reg); + break; + case BPF_AND: + case BPF_OR: + case BPF_XOR: + /* bitwise ops on pointers are troublesome, prohibit for now. + * (However, in principle we could allow some cases, e.g. + * ptr &= ~3 which would reduce min_value by 3.) + */ + if (!env->allow_ptr_leaks) + verbose("R%d bitwise operator %s on pointer prohibited\n", + dst, bpf_alu_string[opcode >> 4]); + return -EACCES; + default: + /* other operators (e.g. MUL,LSH) produce non-pointer results */ + if (!env->allow_ptr_leaks) + verbose("R%d pointer arithmetic with %s operator prohibited\n", + dst, bpf_alu_string[opcode >> 4]); + return -EACCES; } - dst_reg->type = UNKNOWN_VALUE; + check_reg_overflow(dst_reg); return 0; } -static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, + struct bpf_reg_state src_reg) { struct bpf_reg_state *regs = env->cur_state.regs; - struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; - struct bpf_reg_state *src_reg = ®s[insn->src_reg]; - u8 opcode = BPF_OP(insn->code); - u64 dst_imm = dst_reg->imm; - - if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) - return evaluate_reg_imm_alu_unknown(env, insn); - - /* dst_reg->type == CONST_IMM here. Simulate execution of insns - * containing ALU ops. Don't care about overflow or negative - * values, just add/sub/... them; registers are in u64. 
- */ - if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) { - dst_imm += insn->imm; - } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm += src_reg->imm; - } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) { - dst_imm -= insn->imm; - } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm -= src_reg->imm; - } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) { - dst_imm *= insn->imm; - } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm *= src_reg->imm; - } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) { - dst_imm |= insn->imm; - } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm |= src_reg->imm; - } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) { - dst_imm &= insn->imm; - } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm &= src_reg->imm; - } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) { - dst_imm >>= insn->imm; - } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm >>= src_reg->imm; - } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) { - dst_imm <<= insn->imm; - } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X && - src_reg->type == CONST_IMM) { - dst_imm <<= src_reg->imm; - } else { - mark_reg_unknown_value(regs, insn->dst_reg); - goto out; - } - - dst_reg->imm = dst_imm; -out: - return 0; -} - -static void check_reg_overflow(struct bpf_reg_state *reg) -{ - if (reg->max_value > BPF_REGISTER_MAX_RANGE) - reg->max_value = BPF_REGISTER_MAX_RANGE; - if (reg->min_value < BPF_REGISTER_MIN_RANGE || - reg->min_value > BPF_REGISTER_MAX_RANGE) - reg->min_value = BPF_REGISTER_MIN_RANGE; -} - -static u32 calc_align(u32 imm) -{ - if (!imm) - return 1U << 31; - return imm - ((imm - 1) & imm); -} - -static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, - struct bpf_insn *insn) -{ - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); - u32 dst_align, src_align; + bool src_known, dst_known; - dst_reg = ®s[insn->dst_reg]; - src_align = 0; - if (BPF_SRC(insn->code) == BPF_X) { - check_reg_overflow(®s[insn->src_reg]); - min_val = regs[insn->src_reg].min_value; - max_val = regs[insn->src_reg].max_value; - - /* If the source register is a random pointer then the - * min_value/max_value values represent the range of the known - * accesses into that value, not the actual min/max value of the - * register itself. In this case we have to reset the reg range - * values so we know it is not safe to look at. - */ - if (regs[insn->src_reg].type != CONST_IMM && - regs[insn->src_reg].type != UNKNOWN_VALUE) { - min_val = BPF_REGISTER_MIN_RANGE; - max_val = BPF_REGISTER_MAX_RANGE; - src_align = 0; - } else { - src_align = regs[insn->src_reg].min_align; - } - } else if (insn->imm < BPF_REGISTER_MAX_RANGE && - (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { - min_val = max_val = insn->imm; - src_align = calc_align(insn->imm); - } - - dst_align = dst_reg->min_align; - - /* We don't know anything about what was done to this register, mark it - * as unknown. 
Also, if both derived bounds came from signed/unsigned - * mixed compares and one side is unbounded, we cannot really do anything - * with them as boundaries cannot be trusted. Thus, arithmetic of two - * regs of such kind will get invalidated bounds on the dst side. - */ - if ((min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) || - (BPF_SRC(insn->code) == BPF_X && - ((min_val != BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) || - (min_val == BPF_REGISTER_MIN_RANGE && - max_val != BPF_REGISTER_MAX_RANGE) || - (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && - dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || - (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && - dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && - regs[insn->dst_reg].value_from_signed != - regs[insn->src_reg].value_from_signed)) { - reset_reg_range_values(regs, insn->dst_reg); - return; - } - - /* If one of our values was at the end of our ranges then we can't just - * do our normal operations to the register, we need to set the values - * to the min/max since they are undefined. - */ - if (opcode != BPF_SUB) { - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->64 */ + coerce_reg_to_32(dst_reg); + coerce_reg_to_32(&src_reg); } + min_val = src_reg.min_value; + max_val = src_reg.max_value; + src_known = tnum_is_const(src_reg.var_off); + dst_known = tnum_is_const(dst_reg->var_off); switch (opcode) { case BPF_ADD: + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value += min_val; + /* if max_val is MAX_RANGE, this will saturate dst->max */ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value += max_val; - dst_reg->min_align = min(src_align, dst_align); + dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - /* If one of our values was at the end of our ranges, then the - * _opposite_ value in the dst_reg goes to the end of our range. - */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; if (max_val == BPF_REGISTER_MAX_RANGE) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value -= max_val; + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value -= min_val; - dst_reg->min_align = min(src_align, dst_align); + dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value *= min_val; + if (min_val < 0 || dst_reg->min_value < 0) { + /* Ain't nobody got time to multiply that sign */ + __mark_reg_unknown(dst_reg); + break; + } + dst_reg->min_value *= min_val; + /* if max_val is MAX_RANGE, this will saturate dst->max. + * We know MAX_RANGE ** 2 won't overflow a u64, because + * MAX_RANGE itself fits in a u32. + */ + BUILD_BUG_ON(BPF_REGISTER_MAX_RANGE > (u32)-1); if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value *= max_val; - dst_reg->min_align = max(src_align, dst_align); + dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); break; case BPF_AND: - /* Disallow AND'ing of negative numbers, ain't nobody got time - * for that. 
Otherwise the minimum is 0 and the max is the max - * value we could AND against. + if (src_known && dst_known) { + u64 value = dst_reg->var_off.value & src_reg.var_off.value; + + dst_reg->var_off = tnum_const(value); + dst_reg->min_value = dst_reg->max_value = min_t(u64, + value, BPF_REGISTER_MAX_RANGE); + break; + } + /* Lose min_value when AND'ing negative numbers, ain't nobody + * got time for that. Otherwise we get our minimum from the + * var_off, since that's inherently bitwise. + * Our maximum is the minimum of the operands' maxima. */ - if (min_val < 0) + dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); + if (min_val < 0 && dst_reg->min_value < 0) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; else - dst_reg->min_value = 0; - dst_reg->max_value = max_val; - dst_reg->min_align = max(src_align, dst_align); + dst_reg->min_value = dst_reg->var_off.value; + dst_reg->max_value = min(dst_reg->max_value, max_val); + break; + case BPF_OR: + if (src_known && dst_known) { + u64 value = dst_reg->var_off.value | src_reg.var_off.value; + + dst_reg->var_off = tnum_const(value); + dst_reg->min_value = dst_reg->max_value = min_t(u64, + value, BPF_REGISTER_MAX_RANGE); + break; + } + /* Lose ranges when OR'ing negative numbers, ain't nobody got + * time for that. Otherwise we get our maximum from the var_off, + * and our minimum is the maximum of the operands' minima. + */ + dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); + if (min_val < 0 || dst_reg->min_value < 0) { + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } else { + dst_reg->min_value = max(dst_reg->min_value, min_val); + dst_reg->max_value = dst_reg->var_off.value | dst_reg->var_off.mask; + } break; case BPF_LSH: + if (min_val < 0) { + /* LSH by a negative number is undefined */ + mark_reg_unknown(regs, insn->dst_reg); + break; + } /* Gotta have special overflow logic here, if we're shifting * more than MAX_RANGE then just assume we have an invalid * range. */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - dst_reg->min_align = 1; + dst_reg->var_off = tnum_unknown; } else { if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; - if (!dst_reg->min_align) - dst_reg->min_align = 1; - dst_reg->min_align <<= min_val; + if (src_known) + dst_reg->var_off = tnum_lshift(dst_reg->var_off, min_val); + else + dst_reg->var_off = tnum_lshift(tnum_unknown, min_val); } if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; @@ -1946,37 +1938,139 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->max_value <<= max_val; break; case BPF_RSH: - /* RSH by a negative number is undefined, and the BPF_RSH is an - * unsigned shift, so make the appropriate casts. 
- */ - if (min_val < 0 || dst_reg->min_value < 0) { - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (min_val < 0) { + /* RSH by a negative number is undefined */ + mark_reg_unknown(regs, insn->dst_reg); + break; + } + /* BPF_RSH is an unsigned shift, so make the appropriate casts */ + if (dst_reg->min_value < 0) { + if (min_val) + /* Sign bit will be cleared */ + dst_reg->min_value = 0; } else { dst_reg->min_value = (u64)(dst_reg->min_value) >> min_val; } - if (min_val < 0) { - dst_reg->min_align = 1; - } else { - dst_reg->min_align >>= (u64) min_val; - if (!dst_reg->min_align) - dst_reg->min_align = 1; - } - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value >>= max_val; + if (src_known) + dst_reg->var_off = tnum_rshift(dst_reg->var_off, min_val); + else + dst_reg->var_off = tnum_rshift(tnum_unknown, min_val); + if (dst_reg->max_value == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = ~0; + dst_reg->max_value >>= max_val; break; default: - reset_reg_range_values(regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); break; } check_reg_overflow(dst_reg); + return 0; +} + +/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max + * and var_off. + */ +static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; + u8 opcode = BPF_OP(insn->code); + int rc; + + dst_reg = ®s[insn->dst_reg]; + check_reg_overflow(dst_reg); + src_reg = NULL; + if (dst_reg->type != SCALAR_VALUE) + ptr_reg = dst_reg; + if (BPF_SRC(insn->code) == BPF_X) { + src_reg = ®s[insn->src_reg]; + check_reg_overflow(src_reg); + + if (src_reg->type != SCALAR_VALUE) { + if (dst_reg->type != SCALAR_VALUE) { + /* Combining two pointers by any ALU op yields + * an arbitrary scalar. + */ + if (!env->allow_ptr_leaks) { + verbose("R%d pointer %s pointer prohibited\n", + insn->dst_reg, + bpf_alu_string[opcode >> 4]); + return -EACCES; + } + mark_reg_unknown(regs, insn->dst_reg); + return 0; + } else { + /* scalar += pointer + * This is legal, but we have to reverse our + * src/dest handling in computing the range + */ + rc = adjust_ptr_min_max_vals(env, insn, + src_reg, dst_reg); + if (rc == -EACCES && env->allow_ptr_leaks) { + /* scalar += unknown scalar */ + __mark_reg_unknown(&off_reg); + return adjust_scalar_min_max_vals( + env, insn, + dst_reg, off_reg); + } + return rc; + } + } else if (ptr_reg) { + /* pointer += scalar */ + rc = adjust_ptr_min_max_vals(env, insn, + dst_reg, src_reg); + if (rc == -EACCES && env->allow_ptr_leaks) { + /* unknown scalar += scalar */ + __mark_reg_unknown(dst_reg); + return adjust_scalar_min_max_vals( + env, insn, dst_reg, *src_reg); + } + return rc; + } + } else { + /* Pretend the src is a reg with a known value, since we only + * need to be able to read from this state. 
+ */ + off_reg.type = SCALAR_VALUE; + off_reg.var_off = tnum_const(insn->imm); + off_reg.min_value = insn->imm; + off_reg.max_value = insn->imm; + src_reg = &off_reg; + check_reg_overflow(src_reg); + if (ptr_reg) { /* pointer += K */ + rc = adjust_ptr_min_max_vals(env, insn, + ptr_reg, src_reg); + if (rc == -EACCES && env->allow_ptr_leaks) { + /* unknown scalar += K */ + __mark_reg_unknown(dst_reg); + return adjust_scalar_min_max_vals( + env, insn, dst_reg, off_reg); + } + return rc; + } + } + + /* Got here implies adding two SCALAR_VALUEs */ + if (WARN_ON_ONCE(ptr_reg)) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error: unexpected ptr_reg\n"); + return -EINVAL; + } + if (WARN_ON(!src_reg)) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error: no src_reg\n"); + return -EINVAL; + } + return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); } /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = env->cur_state.regs; u8 opcode = BPF_OP(insn->code); int err; @@ -2036,11 +2130,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - /* we are setting our register to something new, we need to - * reset its range values. - */ - reset_reg_range_values(regs, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X) { if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 @@ -2048,24 +2137,29 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg] = regs[insn->src_reg]; } else { + /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { verbose("R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown_value(regs, insn->dst_reg); + mark_reg_unknown(regs, insn->dst_reg); + /* high 32 bits are known zero. But this is + * still out of range for max_value, so leave + * that. + */ + regs[insn->dst_reg].var_off = tnum_cast( + regs[insn->dst_reg].var_off, 4); } } else { /* case: R = imm * remember the value we stored into this reg */ - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = insn->imm; - regs[insn->dst_reg].id = 0; + regs[insn->dst_reg].type = SCALAR_VALUE; + regs[insn->dst_reg].var_off = tnum_const(insn->imm); regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; - regs[insn->dst_reg].min_align = calc_align(insn->imm); - regs[insn->dst_reg].value_from_signed = false; + regs[insn->dst_reg].id = 0; } } else if (opcode > BPF_END) { @@ -2116,68 +2210,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; - dst_reg = ®s[insn->dst_reg]; - - /* first we want to adjust our ranges. 
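The 32-bit move case above clears the upper bits with tnum_cast(). For reference, this helper (also from this series, in kernel/bpf/tnum.c) is essentially:

	struct tnum tnum_cast(struct tnum a, u8 size)
	{
		/* keep only the low 'size' bytes of both the known bits
		 * and the unknown mask
		 */
		a.value &= (1ULL << (size * 8)) - 1;
		a.mask &= (1ULL << (size * 8)) - 1;
		return a;
	}

so tnum_cast(var_off, 4) leaves the low 32 bits as they were and marks the high 32 bits as known zero.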
*/ - adjust_reg_min_max_vals(env, insn); - - /* pattern match 'bpf_add Rx, imm' instruction */ - if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { - dst_reg->type = PTR_TO_STACK; - dst_reg->imm = insn->imm; - return 0; - } else if (opcode == BPF_ADD && - BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == PTR_TO_STACK && - ((BPF_SRC(insn->code) == BPF_X && - regs[insn->src_reg].type == CONST_IMM) || - BPF_SRC(insn->code) == BPF_K)) { - if (BPF_SRC(insn->code) == BPF_X) - dst_reg->imm += regs[insn->src_reg].imm; - else - dst_reg->imm += insn->imm; - return 0; - } else if (opcode == BPF_ADD && - BPF_CLASS(insn->code) == BPF_ALU64 && - (dst_reg->type == PTR_TO_PACKET || - (BPF_SRC(insn->code) == BPF_X && - regs[insn->src_reg].type == PTR_TO_PACKET))) { - /* ptr_to_packet += K|X */ - return check_packet_ptr_add(env, insn); - } else if (BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == UNKNOWN_VALUE && - env->allow_ptr_leaks) { - /* unknown += K|X */ - return evaluate_reg_alu(env, insn); - } else if (BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == CONST_IMM && - env->allow_ptr_leaks) { - /* reg_imm += K|X */ - return evaluate_reg_imm_alu(env, insn); - } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", - insn->dst_reg); - return -EACCES; - } else if (BPF_SRC(insn->code) == BPF_X && - is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer arithmetic prohibited\n", - insn->src_reg); - return -EACCES; - } - - /* If we did pointer math on a map value then just set it to our - * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or - * loads to this register appropriately, otherwise just mark the - * register as unknown. - */ - if (env->allow_ptr_leaks && - BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD && - (dst_reg->type == PTR_TO_MAP_VALUE || - dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) - dst_reg->type = PTR_TO_MAP_VALUE_ADJ; - else - mark_reg_unknown_value(regs, insn->dst_reg); + return adjust_reg_min_max_vals(env, insn); } return 0; @@ -2189,6 +2222,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, struct bpf_reg_state *regs = state->regs, *reg; int i; + if (dst_reg->off < 0) + /* This doesn't give us any range */ + return; + + if (dst_reg->max_value > MAX_PACKET_OFF || + dst_reg->max_value + dst_reg->off > MAX_PACKET_OFF) + /* Risk of overflow. For instance, ptr + (1<<63) may be less + * than pkt_end, but that's because it's also less than pkt. + */ + return; + /* LLVM can generate two kind of checks: * * Type 1: @@ -2219,30 +2263,44 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * so that range of bytes [r3, r3 + 8) is safe to access. */ + /* If our ids match, then we must have the same max_value. And we + * don't care about the other reg's fixed offset, since if it's too big + * the range won't allow anything. + * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
+	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
-			regs[i].range = max(regs[i].range, dst_reg->off);
+			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
 		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
-			reg->range = max(reg->range, dst_reg->off);
+			reg->range = max_t(u16, reg->range, dst_reg->off);
 	}
 }
 
 /* Adjusts the register min/max values in the case that the dst_reg is the
  * variable register that we are working on, and src_reg is a constant or we're
  * simply doing a BPF_K check.
+ * In JEQ/JNE cases we also adjust the var_off values.
  */
 static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
 			    u8 opcode)
 {
 	bool value_from_signed = true;
-	bool is_range = true;
+
+	/* If the dst_reg is a pointer, we can't learn anything about its
+	 * variable offset from the compare (unless src_reg were a pointer into
+	 * the same object, but we don't bother with that).
+	 * Since false_reg and true_reg have the same type by construction, we
+	 * only need to check one of them for pointerness.
+	 */
+	if (__is_pointer_value(false, false_reg))
+		return;
 
 	switch (opcode) {
 	case BPF_JEQ:
@@ -2250,14 +2308,14 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 		 * true then we know for sure.
*/ true_reg->max_value = true_reg->min_value = val; - is_range = false; + true_reg->var_off = tnum_const(val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; - is_range = false; + false_reg->var_off = tnum_const(val); break; case BPF_JGT: value_from_signed = false; @@ -2385,27 +2439,60 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); - if (is_range) { - if (__is_pointer_value(false, false_reg)) - reset_reg_range_values(false_reg, 0); - if (__is_pointer_value(false, true_reg)) - reset_reg_range_values(true_reg, 0); +} + +/* Regs are known to be equal, so intersect their min/max/var_off */ +static void __reg_combine_min_max(struct bpf_reg_state *src_reg, + struct bpf_reg_state *dst_reg) +{ + src_reg->min_value = dst_reg->min_value = max(src_reg->min_value, + dst_reg->min_value); + src_reg->max_value = dst_reg->max_value = min(src_reg->max_value, + dst_reg->max_value); + src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, + dst_reg->var_off); + check_reg_overflow(src_reg); + check_reg_overflow(dst_reg); +} + +static void reg_combine_min_max(struct bpf_reg_state *true_src, + struct bpf_reg_state *true_dst, + struct bpf_reg_state *false_src, + struct bpf_reg_state *false_dst, + u8 opcode) +{ + switch (opcode) { + case BPF_JEQ: + __reg_combine_min_max(true_src, true_dst); + break; + case BPF_JNE: + __reg_combine_min_max(false_src, false_dst); } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, - enum bpf_reg_type type) + bool is_null) { struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - if (type == UNKNOWN_VALUE) { - __mark_reg_unknown_value(regs, regno); + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. + */ + if (WARN_ON_ONCE(reg->min_value || reg->max_value || + reg->var_off.value || reg->var_off.mask || + reg->off)) { + reg->min_value = reg->max_value = reg->off = 0; + reg->var_off = tnum_const(0); + } + if (is_null) { + reg->type = SCALAR_VALUE; } else if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; } else { - reg->type = type; + reg->type = PTR_TO_MAP_VALUE; } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -2419,19 +2506,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, * be folded together at some point. 
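The pattern mark_map_reg()/mark_map_regs() handle is, roughly, the following hypothetical program fragment ('my_map' and 'key' are illustrative):

	int *val, *copy;

	val = bpf_map_lookup_elem(&my_map, &key); /* PTR_TO_MAP_VALUE_OR_NULL, id = N */
	copy = val;                               /* the copy carries the same id N */
	if (!val)                                 /* NULL check on one register... */
		return 0;
	*copy = 1;                                /* ...retypes every register with
	                                           * id N to PTR_TO_MAP_VALUE, so the
	                                           * store through the copy is also
	                                           * accepted */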
*/ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, - enum bpf_reg_type type) + bool is_null) { struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, id, type); + mark_map_reg(regs, i, id, is_null); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); } } @@ -2481,7 +2568,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if R == 0 where R was initialized to zero earlier */ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) { + dst_reg->type == SCALAR_VALUE && + tnum_equals_const(dst_reg->var_off, insn->imm)) { if (opcode == BPF_JEQ) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through @@ -2503,17 +2591,30 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. + * this is only legit if both are scalars (or pointers to the same + * object, I suppose, but we don't support that right now), because + * otherwise the different base pointers mean the offsets aren't + * comparable. */ if (BPF_SRC(insn->code) == BPF_X) { - if (regs[insn->src_reg].type == CONST_IMM) - reg_set_min_max(&other_branch->regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].imm, - opcode); - else if (dst_reg->type == CONST_IMM) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], - ®s[insn->src_reg], dst_reg->imm, - opcode); - } else { + if (dst_reg->type == SCALAR_VALUE && + regs[insn->src_reg].type == SCALAR_VALUE) { + if (tnum_is_const(regs[insn->src_reg].var_off)) + reg_set_min_max(&other_branch->regs[insn->dst_reg], + dst_reg, regs[insn->src_reg].var_off.value, + opcode); + else if (tnum_is_const(dst_reg->var_off)) + reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + ®s[insn->src_reg], + dst_reg->var_off.value, opcode); + else if (opcode == BPF_JEQ || opcode == BPF_JNE) + /* Comparing for equality, we can combine knowledge */ + reg_combine_min_max(&other_branch->regs[insn->src_reg], + &other_branch->regs[insn->dst_reg], + ®s[insn->src_reg], + ®s[insn->dst_reg], opcode); + } + } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch->regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -2525,10 +2626,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* Mark all identical map registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_map_regs(this_branch, insn->dst_reg, - opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); - mark_map_regs(other_branch, insn->dst_reg, - opcode == BPF_JEQ ? 
UNKNOWN_VALUE : PTR_TO_MAP_VALUE); + mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); + mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { @@ -2576,8 +2675,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].type = SCALAR_VALUE; + regs[insn->dst_reg].min_value = imm; + regs[insn->dst_reg].max_value = imm; + check_reg_overflow(®s[insn->dst_reg]); + regs[insn->dst_reg].var_off = tnum_const(imm); regs[insn->dst_reg].id = 0; return 0; } @@ -2659,7 +2761,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* mark destination R0 register as readable, since it contains * the value fetched from the packet */ - regs[BPF_REG_0].type = UNKNOWN_VALUE; + mark_reg_unknown(regs, BPF_REG_0); return 0; } @@ -2862,57 +2964,145 @@ err_free: return ret; } -/* the following conditions reduce the number of explored insns - * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet +/* check %cur's range satisfies %old's */ +static bool range_within(struct bpf_reg_state *old, + struct bpf_reg_state *cur) +{ + return old->min_value <= cur->min_value && + old->max_value >= cur->max_value; +} + +/* Maximum number of register states that can exist at once */ +#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) +struct idpair { + u32 old; + u32 cur; +}; + +/* If in the old state two registers had the same id, then they need to have + * the same id in the new state as well. But that id could be different from + * the old state, so we need to track the mapping from old to new ids. + * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent + * regs with old id 5 must also have new id 9 for the new state to be safe. But + * regs with a different old id could still have new id 9, we don't care about + * that. + * So we look through our idmap to see if this old id has been seen before. If + * so, we require the new id to match; otherwise, we add the id pair to the map. */ -static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, - struct bpf_reg_state *old, - struct bpf_reg_state *cur) +static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) { - if (old->id != cur->id) - return false; + unsigned int i; + + for (i = 0; i < ID_MAP_SIZE; i++) { + if (!idmap[i].old) { + /* Reached an empty slot; haven't seen this id before */ + idmap[i].old = old_id; + idmap[i].cur = cur_id; + return true; + } + if (idmap[i].old == old_id) + return idmap[i].cur == cur_id; + } + /* We ran out of idmap slots, which should be impossible */ + WARN_ON_ONCE(1); + return false; +} - /* old ptr_to_packet is more conservative, since it allows smaller - * range. Ex: - * old(off=0,r=10) is equal to cur(off=0,r=20), because - * old(off=0,r=10) means that with range=10 the verifier proceeded - * further and found no issues with the program. Now we're in the same - * spot with cur(off=0,r=20), so we're safe too, since anything further - * will only be looking at most 10 bytes after this pointer. 
- */ - if (old->off == cur->off && old->range < cur->range) +/* Returns true if (rold safe implies rcur safe) */ +static bool regsafe(struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, + bool varlen_map_access, struct idpair *idmap) +{ + if (memcmp(rold, rcur, sizeof(*rold)) == 0) return true; - /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0) - * since both cannot be used for packet access and safe(old) - * pointer has smaller off that could be used for further - * 'if (ptr > data_end)' check - * Ex: - * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean - * that we cannot access the packet. - * The safe range is: - * [ptr, ptr + range - off) - * so whenever off >=range, it means no safe bytes from this pointer. - * When comparing old->off <= cur->off, it means that older code - * went with smaller offset and that offset was later - * used to figure out the safe range after 'if (ptr > data_end)' check - * Say, 'old' state was explored like: - * ... R3(off=0, r=0) - * R4 = R3 + 20 - * ... now R4(off=20,r=0) <-- here - * if (R4 > data_end) - * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access. - * ... the code further went all the way to bpf_exit. - * Now the 'cur' state at the mark 'here' has R4(off=30,r=0). - * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier - * goes further, such cur_R4 will give larger safe packet range after - * 'if (R4 > data_end)' and all further insn were already good with r=20, - * so they will be good with r=30 and we can prune the search. - */ - if (!env->strict_alignment && old->off <= cur->off && - old->off >= old->range && cur->off >= cur->range) + if (rold->type == NOT_INIT) + /* explored state can't have used this */ return true; + if (rcur->type == NOT_INIT) + return false; + switch (rold->type) { + case SCALAR_VALUE: + if (rcur->type == SCALAR_VALUE) { + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + } else { + /* if we knew anything about the old value, we're not + * equal, because we can't know anything about the + * scalar value of the pointer in the new value. + */ + return rold->min_value == BPF_REGISTER_MIN_RANGE && + rold->max_value == BPF_REGISTER_MAX_RANGE && + tnum_is_unknown(rold->var_off); + } + case PTR_TO_MAP_VALUE: + if (varlen_map_access) { + /* If the new min/max/var_off satisfy the old ones and + * everything else matches, we are OK. + * We don't care about the 'id' value, because nothing + * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + } else { + /* If the ranges/var_off were not the same, but + * everything else was and we didn't do a variable + * access into a map then we are a-ok. + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0; + } + case PTR_TO_MAP_VALUE_OR_NULL: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. 
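To make the id mapping concrete, a usage sketch of check_ids() as defined just above (the id values are made up):

	struct idpair idmap[ID_MAP_SIZE] = {};

	check_ids(5, 9, idmap);	/* first time old id 5 is seen: records 5 -> 9, true */
	check_ids(5, 9, idmap);	/* consistent with the recorded pair: true */
	check_ids(5, 7, idmap);	/* old id 5 is already bound to new id 9: false */
	check_ids(6, 9, idmap);	/* a different old id may also map to 9: true */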
+ */ + if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET: + if (rcur->type != PTR_TO_PACKET) + return false; + /* We must have at least as much range as the old ptr + * did, so that any accesses which were safe before are + * still safe. This is true even if old range < old off, + * since someone could have accessed through (ptr - k), or + * even done ptr -= k in a register, to get a safe access. + */ + if (rold->range > rcur->range) + return false; + /* If the offsets don't match, we can't trust our alignment; + * nor can we be sure that we won't fall out of range. + */ + if (rold->off != rcur->off) + return false; + /* id relations must be preserved */ + if (rold->id && !check_ids(rold->id, rcur->id, idmap)) + return false; + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + case PTR_TO_CTX: + case CONST_PTR_TO_MAP: + case PTR_TO_STACK: + case PTR_TO_PACKET_END: + /* Only valid matches are exact, which memcmp() above + * would have accepted + */ + default: + /* Don't know what's going on, just say it's not safe */ + return false; + } + /* Shouldn't get here; if we do, say it's not safe */ + WARN_ON_ONCE(1); return false; } @@ -2947,43 +3137,19 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { bool varlen_map_access = env->varlen_map_value_access; - struct bpf_reg_state *rold, *rcur; + struct idpair *idmap; + bool ret = false; int i; - for (i = 0; i < MAX_BPF_REG; i++) { - rold = &old->regs[i]; - rcur = &cur->regs[i]; - - if (memcmp(rold, rcur, sizeof(*rold)) == 0) - continue; - - /* If the ranges were not the same, but everything else was and - * we didn't do a variable access into a map then we are a-ok. - */ - if (!varlen_map_access && - memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) - continue; - - /* If we didn't map access then again we don't care about the - * mismatched range values and it's ok if our old type was - * UNKNOWN and we didn't go to a NOT_INIT'ed reg. - */ - if (rold->type == NOT_INIT || - (!varlen_map_access && rold->type == UNKNOWN_VALUE && - rcur->type != NOT_INIT)) - continue; - - /* Don't care about the reg->id in this case. 
*/ - if (rold->type == PTR_TO_MAP_VALUE_OR_NULL && - rcur->type == PTR_TO_MAP_VALUE_OR_NULL && - rold->map_ptr == rcur->map_ptr) - continue; - - if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && - compare_ptrs_to_packet(env, rold, rcur)) - continue; - + idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); + /* If we failed to allocate the idmap, just say it's not safe */ + if (!idmap) return false; + + for (i = 0; i < MAX_BPF_REG; i++) { + if (!regsafe(&old->regs[i], &cur->regs[i], varlen_map_access, + idmap)) + goto out_free; } for (i = 0; i < MAX_BPF_STACK; i++) { @@ -2995,29 +3161,32 @@ static bool states_equal(struct bpf_verifier_env *env, * this verifier states are not equivalent, * return false to continue verification of this path */ - return false; + goto out_free; if (i % BPF_REG_SIZE) continue; if (old->stack_slot_type[i] != STACK_SPILL) continue; - if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - sizeof(old->spilled_regs[0]))) - /* when explored and current stack slot types are - * the same, check that stored pointers types + if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + varlen_map_access, idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types * are the same as well. * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8} + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16} + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} * such verifier states are not equivalent. * return false to continue verification of this path */ - return false; + goto out_free; else continue; } - return true; + ret = true; +out_free: + kfree(idmap); + return ret; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -3331,7 +3500,6 @@ process_bpf_exit: verbose("invalid BPF_LD mode\n"); return -EINVAL; } - reset_reg_range_values(regs, insn->dst_reg); } else { verbose("unknown insn class %d\n", class); return -EINVAL; -- cgit v1.2.3 From b03c9f9fdc37dab81ea04d5dacdc5995d4c224c2 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Mon, 7 Aug 2017 15:26:36 +0100 Subject: bpf/verifier: track signed and unsigned min/max values Allows us to, sometimes, combine information from a signed check of one bound and an unsigned check of the other. We now track the full range of possible values, rather than restricting ourselves to [0, 1<<30) and considering anything beyond that as unknown. While this is probably not necessary, it makes the code more straightforward and symmetrical between signed and unsigned bounds. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 23 +- include/linux/tnum.h | 2 + kernel/bpf/tnum.c | 16 + kernel/bpf/verifier.c | 737 +++++++++++++++++++++++++------------------ 4 files changed, 461 insertions(+), 317 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 85936fa92d12..c61c3033522e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -11,11 +11,15 @@ #include /* for MAX_BPF_STACK */ #include - /* Just some arbitrary values so we can safely do math without overflowing and - * are obviously wrong for any sort of memory access. 
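As a worked example of what the commit message means by combining a signed check of one bound with an unsigned check of the other (pseudo-BPF, editor's sketch):

	if (r1 s< 0) goto out;		/* signed check:   smin_value becomes 0 */
	if (r1 u> 100) goto out;	/* unsigned check: umax_value becomes 100 */

Once smin_value >= 0 the value cannot cross the sign boundary, so the new __reg_deduce_bounds() below can conclude 0 <= r1 <= 100 in both the signed and the unsigned domains, whereas the old single min/max pair had to throw one of the two facts away.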
- */ -#define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024) -#define BPF_REGISTER_MIN_RANGE -1 +/* Maximum variable offset umax_value permitted when resolving memory accesses. + * In practice this is far bigger than any realistic pointer offset; this limit + * ensures that umax_value + (int)off + (int)size cannot overflow a u64. + */ +#define BPF_MAX_VAR_OFF (1ULL << 31) +/* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures + * that converting umax_value to int cannot overflow. + */ +#define BPF_MAX_VAR_SIZ INT_MAX struct bpf_reg_state { enum bpf_reg_type type; @@ -36,7 +40,7 @@ struct bpf_reg_state { * came from, when one is tested for != NULL. */ u32 id; - /* These three fields must be last. See states_equal() */ + /* These five fields must be last. See states_equal() */ /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset @@ -49,9 +53,10 @@ struct bpf_reg_state { * These refer to the same value as var_off, not necessarily the actual * contents of the register. */ - s64 min_value; - u64 max_value; - bool value_from_signed; + s64 smin_value; /* minimum possible (s64)value */ + s64 smax_value; /* maximum possible (s64)value */ + u64 umin_value; /* minimum possible (u64)value */ + u64 umax_value; /* maximum possible (u64)value */ }; enum bpf_stack_slot_type { diff --git a/include/linux/tnum.h b/include/linux/tnum.h index a0b07bf1842b..0d2d3da46139 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -17,6 +17,8 @@ struct tnum { struct tnum tnum_const(u64 value); /* A completely unknown value */ extern const struct tnum tnum_unknown; +/* A value that's unknown except that @min <= value <= @max */ +struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ /* Shift a tnum left (by a fixed shift) */ diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 92eeeb1974a2..1f4bf68c12db 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -17,6 +17,22 @@ struct tnum tnum_const(u64 value) return TNUM(value, 0); } +struct tnum tnum_range(u64 min, u64 max) +{ + u64 chi = min ^ max, delta; + u8 bits = fls64(chi); + + /* special case, needed because 1ULL << 64 is undefined */ + if (bits > 63) + return tnum_unknown; + /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. + * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return + * constant min (since min == max). 
+ */ + delta = (1ULL << bits) - 1; + return TNUM(min & ~delta, delta); +} + struct tnum tnum_lshift(struct tnum a, u8 shift) { return TNUM(a.value << shift, a.mask << shift); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c3f88b466c30..7557800bf7a7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,12 +234,20 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose(",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); - if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%lld", - (long long)reg->min_value); - if (reg->max_value != BPF_REGISTER_MAX_RANGE) - verbose(",max_value=%llu", - (unsigned long long)reg->max_value); + if (reg->smin_value != reg->umin_value && + reg->smin_value != S64_MIN) + verbose(",smin_value=%lld", + (long long)reg->smin_value); + if (reg->smax_value != reg->umax_value && + reg->smax_value != S64_MAX) + verbose(",smax_value=%lld", + (long long)reg->smax_value); + if (reg->umin_value != 0) + verbose(",umin_value=%llu", + (unsigned long long)reg->umin_value); + if (reg->umax_value != U64_MAX) + verbose(",umax_value=%llu", + (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; @@ -466,14 +474,25 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void __mark_reg_not_init(struct bpf_reg_state *reg); +/* Mark the unknown part of a register (variable offset or scalar value) as + * known to have the value @imm. + */ +static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +{ + reg->id = 0; + reg->var_off = tnum_const(imm); + reg->smin_value = (s64)imm; + reg->smax_value = (s64)imm; + reg->umin_value = imm; + reg->umax_value = imm; +} + /* Mark the 'variable offset' part of a register as zero. This should be * used only on registers holding a pointer type. */ static void __mark_reg_known_zero(struct bpf_reg_state *reg) { - reg->var_off = tnum_const(0); - reg->min_value = 0; - reg->max_value = 0; + __mark_reg_known(reg, 0); } static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) @@ -488,6 +507,72 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +/* Attempts to improve min/max values based on var_off information */ +static void __update_reg_bounds(struct bpf_reg_state *reg) +{ + /* min signed is max(sign bit) | min(other bits) */ + reg->smin_value = max_t(s64, reg->smin_value, + reg->var_off.value | (reg->var_off.mask & S64_MIN)); + /* max signed is min(sign bit) | max(other bits) */ + reg->smax_value = min_t(s64, reg->smax_value, + reg->var_off.value | (reg->var_off.mask & S64_MAX)); + reg->umin_value = max(reg->umin_value, reg->var_off.value); + reg->umax_value = min(reg->umax_value, + reg->var_off.value | reg->var_off.mask); +} + +/* Uses signed min/max values to inform unsigned, and vice-versa */ +static void __reg_deduce_bounds(struct bpf_reg_state *reg) +{ + /* Learn sign from signed bounds. + * If we cannot cross the sign boundary, then signed and unsigned bounds + * are the same, so combine. This works even in the negative case, e.g. + * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. + */ + if (reg->smin_value >= 0 || reg->smax_value < 0) { + reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, + reg->umin_value); + reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, + reg->umax_value); + return; + } + /* Learn sign from unsigned bounds. Signed bounds cross the sign + * boundary, so we must be careful. 
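	/* Example for the branch below: umax_value = 5 while the signed bounds
	 * still straddle zero (smin_value = S64_MIN, smax_value = S64_MAX).
	 * (s64)umax_value == 5 >= 0, so no value in [umin_value, 5] has the
	 * sign bit set; the register is provably non-negative and we can set
	 * smin_value = umin_value and smax_value = 5.
	 */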
+ */ + if ((s64)reg->umax_value >= 0) { + /* Positive. We can't learn anything from the smin, but smax + * is positive, hence safe. + */ + reg->smin_value = reg->umin_value; + reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, + reg->umax_value); + } else if ((s64)reg->umin_value < 0) { + /* Negative. We can't learn anything from the smax, but smin + * is negative, hence safe. + */ + reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, + reg->umin_value); + reg->smax_value = reg->umax_value; + } +} + +/* Attempts to improve var_off based on unsigned min/max information */ +static void __reg_bound_offset(struct bpf_reg_state *reg) +{ + reg->var_off = tnum_intersect(reg->var_off, + tnum_range(reg->umin_value, + reg->umax_value)); +} + +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) +{ + reg->smin_value = S64_MIN; + reg->smax_value = S64_MAX; + reg->umin_value = 0; + reg->umax_value = U64_MAX; +} + /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(struct bpf_reg_state *reg) { @@ -495,8 +580,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; - reg->min_value = BPF_REGISTER_MIN_RANGE; - reg->max_value = BPF_REGISTER_MAX_RANGE; + __mark_reg_unbounded(reg); } static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) @@ -545,13 +629,6 @@ static void init_reg_state(struct bpf_reg_state *regs) mark_reg_known_zero(regs, BPF_REG_1); } -static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) -{ - regs[regno].min_value = BPF_REGISTER_MIN_RANGE; - regs[regno].max_value = BPF_REGISTER_MAX_RANGE; - regs[regno].value_from_signed = false; -} - enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -716,26 +793,27 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->min_value < 0) { + if (reg->smin_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->min_value + off, size); + err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { verbose("R%d min value is outside of the array range\n", regno); return err; } - /* If we haven't set a max value then we need to bail - * since we can't be sure we won't do bad things. + /* If we haven't set a max value then we need to bail since we can't be + * sure we won't do bad things. + * If reg->umax_value + off could overflow, treat that as unbounded too. */ - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + if (reg->umax_value >= BPF_MAX_VAR_OFF) { verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } - err = __check_map_access(env, regno, reg->max_value + off, size); + err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) verbose("R%d max value is outside of the array range\n", regno); return err; @@ -797,7 +875,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* We don't allow negative numbers, because we aren't tracking enough * detail to prove they're safe. 
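A typical program shape the map-access checks above are aimed at (hypothetical fragment; the map and the constant are illustrative):

	char *p = bpf_map_lookup_elem(&my_map, &key);

	if (!p)
		return 0;
	if (idx > 60)		/* after this, idx has umax_value = 60 */
		return 0;
	return p[idx];		/* verified against both bounds: smin_value + off
				 * must be >= 0 and umax_value + off must stay
				 * within the map's value_size */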
*/ - if (reg->min_value < 0) { + if (reg->smin_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1070,12 +1148,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* b/h/w load zero-extends, mark upper bits as known 0 */ state->regs[value_regno].var_off = tnum_cast( state->regs[value_regno].var_off, size); - /* sign bit is known zero, so we can bound the value */ - state->regs[value_regno].min_value = 0; - state->regs[value_regno].max_value = min_t(u64, - state->regs[value_regno].var_off.value | - state->regs[value_regno].var_off.mask, - BPF_REGISTER_MAX_RANGE); + __update_reg_bounds(&state->regs[value_regno]); } return err; } @@ -1333,13 +1406,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ meta = NULL; - if (reg->min_value < 0) { + if (reg->smin_value < 0) { verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } - if (reg->min_value == 0) { + if (reg->umin_value == 0) { err = check_helper_mem_access(env, regno - 1, 0, zero_size_allowed, meta); @@ -1347,13 +1420,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; } - if (reg->max_value == BPF_REGISTER_MAX_RANGE) { + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } err = check_helper_mem_access(env, regno - 1, - reg->max_value, + reg->umax_value, zero_size_allowed, meta); } @@ -1600,33 +1673,35 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static void check_reg_overflow(struct bpf_reg_state *reg) -{ - if (reg->max_value > BPF_REGISTER_MAX_RANGE) - reg->max_value = BPF_REGISTER_MAX_RANGE; - if (reg->min_value < BPF_REGISTER_MIN_RANGE || - reg->min_value > BPF_REGISTER_MAX_RANGE) - reg->min_value = BPF_REGISTER_MIN_RANGE; -} - static void coerce_reg_to_32(struct bpf_reg_state *reg) { - /* 32-bit values can't be negative as an s64 */ - if (reg->min_value < 0) - reg->min_value = 0; /* clear high 32 bits */ reg->var_off = tnum_cast(reg->var_off, 4); - /* Did value become known? Then update bounds */ - if (tnum_is_const(reg->var_off)) { - if ((s64)reg->var_off.value > BPF_REGISTER_MIN_RANGE) - reg->min_value = reg->var_off.value; - if (reg->var_off.value < BPF_REGISTER_MAX_RANGE) - reg->max_value = reg->var_off.value; - } + /* Update bounds */ + __update_reg_bounds(reg); +} + +static bool signed_add_overflows(s64 a, s64 b) +{ + /* Do the add in u64, where overflow is well-defined */ + s64 res = (s64)((u64)a + (u64)b); + + if (b < 0) + return res > a; + return res < a; +} + +static bool signed_sub_overflows(s64 a, s64 b) +{ + /* Do the sub in u64, where overflow is well-defined */ + s64 res = (s64)((u64)a - (u64)b); + + if (b < 0) + return res < a; + return res > a; } /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. - * Caller must check_reg_overflow all argument regs beforehand. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks. 
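The two overflow helpers above act as two's-complement wraparound detectors; for instance (editor's examples):

	signed_add_overflows(S64_MAX, 1);	/* true:  the sum wraps to S64_MIN */
	signed_add_overflows(-1, 1);		/* false: the sum is 0 */
	signed_sub_overflows(S64_MIN, 1);	/* true:  the difference wraps to S64_MAX */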
@@ -1638,16 +1713,23 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 min_val = off_reg->min_value; - u64 max_val = off_reg->max_value; + s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, + smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; + u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, + umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; dst_reg = &regs[dst]; - if (WARN_ON_ONCE(known && (min_val != max_val))) { + if (WARN_ON_ONCE(known && (smin_val != smax_val))) { + print_verifier_state(&env->cur_state); + verbose("verifier internal error: known but bad sbounds\n"); + return -EINVAL; + } + if (WARN_ON_ONCE(known && (umin_val != umax_val))) { print_verifier_state(&env->cur_state); - verbose("verifier internal error\n"); + verbose("verifier internal error: known but bad ubounds\n"); return -EINVAL; } @@ -1689,22 +1771,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ - if (known && (ptr_reg->off + min_val == - (s64)(s32)(ptr_reg->off + min_val))) { + if (known && (ptr_reg->off + smin_val == + (s64)(s32)(ptr_reg->off + smin_val))) { /* pointer += K. Accumulate it into fixed offset */ - dst_reg->min_value = ptr_reg->min_value; - dst_reg->max_value = ptr_reg->max_value; + dst_reg->smin_value = smin_ptr; + dst_reg->smax_value = smax_ptr; + dst_reg->umin_value = umin_ptr; + dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; - dst_reg->off = ptr_reg->off + min_val; + dst_reg->off = ptr_reg->off + smin_val; dst_reg->range = ptr_reg->range; break; } - if (max_val == BPF_REGISTER_MAX_RANGE) { - if (!env->allow_ptr_leaks) - verbose("R%d tried to add unbounded value to pointer\n", - dst); - return -EACCES; - } /* A new variable offset is created. Note that off_reg->off * == 0, since it's a scalar. * dst_reg gets the pointer type and since some positive @@ -1714,12 +1792,22 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - if (min_val <= BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value += min_val; - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value += max_val; + if (signed_add_overflows(smin_ptr, smin_val) || + signed_add_overflows(smax_ptr, smax_val)) { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = smin_ptr + smin_val; + dst_reg->smax_value = smax_ptr + smax_val; + } + if (umin_ptr + umin_val < umin_ptr || + umax_ptr + umax_val < umax_ptr) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value = umin_ptr + umin_val; + dst_reg->umax_value = umax_ptr + umax_val; + } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; if (ptr_reg->type == PTR_TO_PACKET) { @@ -1746,43 +1834,46 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - if (known && (ptr_reg->off - min_val == - (s64)(s32)(ptr_reg->off - min_val))) { + if (known && (ptr_reg->off - smin_val == + (s64)(s32)(ptr_reg->off - smin_val))) { /* pointer -= K. 
Subtract it from fixed offset */ - dst_reg->min_value = ptr_reg->min_value; - dst_reg->max_value = ptr_reg->max_value; + dst_reg->smin_value = smin_ptr; + dst_reg->smax_value = smax_ptr; + dst_reg->umin_value = umin_ptr; + dst_reg->umax_value = umax_ptr; dst_reg->var_off = ptr_reg->var_off; dst_reg->id = ptr_reg->id; - dst_reg->off = ptr_reg->off - min_val; + dst_reg->off = ptr_reg->off - smin_val; dst_reg->range = ptr_reg->range; break; } - /* Subtracting a negative value will just confuse everything. - * This can happen if off_reg is an immediate. - */ - if ((s64)max_val < 0) { - if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract negative max_val %lld from pointer\n", - dst, (s64)max_val); - return -EACCES; - } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ - if (max_val >= BPF_REGISTER_MAX_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= max_val; - if (min_val <= BPF_REGISTER_MIN_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= min_val; + if (signed_sub_overflows(smin_ptr, smax_val) || + signed_sub_overflows(smax_ptr, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = smin_ptr - smax_val; + dst_reg->smax_value = smax_ptr - smin_val; + } + if (umin_ptr < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->umin_value = umin_ptr - umax_val; + dst_reg->umax_value = umax_ptr - umin_val; + } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; if (ptr_reg->type == PTR_TO_PACKET) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - if (min_val < 0) + if (smin_val < 0) dst_reg->range = 0; } break; @@ -1805,7 +1896,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - check_reg_overflow(dst_reg); + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); return 0; } @@ -1815,157 +1908,217 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { struct bpf_reg_state *regs = env->cur_state.regs; - s64 min_val = BPF_REGISTER_MIN_RANGE; - u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; + s64 smin_val, smax_val; + u64 umin_val, umax_val; if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops are (32,32)->64 */ coerce_reg_to_32(dst_reg); coerce_reg_to_32(&src_reg); } - min_val = src_reg.min_value; - max_val = src_reg.max_value; + smin_val = src_reg.smin_value; + smax_val = src_reg.smax_value; + umin_val = src_reg.umin_value; + umax_val = src_reg.umax_value; src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); switch (opcode) { case BPF_ADD: - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value += min_val; - /* if max_val is MAX_RANGE, this will saturate dst->max */ - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value += max_val; + if (signed_add_overflows(dst_reg->smin_value, smin_val) || + 
signed_add_overflows(dst_reg->smax_value, smax_val)) { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value += smin_val; + dst_reg->smax_value += smax_val; + } + if (dst_reg->umin_value + umin_val < umin_val || + dst_reg->umax_value + umax_val < umax_val) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + dst_reg->umin_value += umin_val; + dst_reg->umax_value += umax_val; + } dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= max_val; - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= min_val; + if (signed_sub_overflows(dst_reg->smin_value, smax_val) || + signed_sub_overflows(dst_reg->smax_value, smin_val)) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value -= smax_val; + dst_reg->smax_value -= smin_val; + } + if (dst_reg->umin_value < umax_val) { + /* Overflow possible, we know nothing */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + } else { + /* Cannot overflow (as long as bounds are consistent) */ + dst_reg->umin_value -= umax_val; + dst_reg->umax_value -= umin_val; + } dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; case BPF_MUL: - if (min_val < 0 || dst_reg->min_value < 0) { + dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); + if (smin_val < 0 || dst_reg->smin_value < 0) { /* Ain't nobody got time to multiply that sign */ - __mark_reg_unknown(dst_reg); + __mark_reg_unbounded(dst_reg); + __update_reg_bounds(dst_reg); break; } - dst_reg->min_value *= min_val; - /* if max_val is MAX_RANGE, this will saturate dst->max. - * We know MAX_RANGE ** 2 won't overflow a u64, because - * MAX_RANGE itself fits in a u32. + /* Both values are positive, so we can work with unsigned and + * copy the result to signed (unless it exceeds S64_MAX). */ - BUILD_BUG_ON(BPF_REGISTER_MAX_RANGE > (u32)-1); - if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value *= max_val; - dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); + if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { + /* Potential overflow, we know nothing */ + __mark_reg_unbounded(dst_reg); + /* (except what we can learn from the var_off) */ + __update_reg_bounds(dst_reg); + break; + } + dst_reg->umin_value *= umin_val; + dst_reg->umax_value *= umax_val; + if (dst_reg->umax_value > S64_MAX) { + /* Overflow possible, we know nothing */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } break; case BPF_AND: if (src_known && dst_known) { - u64 value = dst_reg->var_off.value & src_reg.var_off.value; - - dst_reg->var_off = tnum_const(value); - dst_reg->min_value = dst_reg->max_value = min_t(u64, - value, BPF_REGISTER_MAX_RANGE); + __mark_reg_known(dst_reg, dst_reg->var_off.value & + src_reg.var_off.value); break; } - /* Lose min_value when AND'ing negative numbers, ain't nobody - * got time for that. Otherwise we get our minimum from the - * var_off, since that's inherently bitwise. - * Our maximum is the minimum of the operands' maxima. 
+ /* We get our minimum from the var_off, since that's inherently + * bitwise. Our maximum is the minimum of the operands' maxima. */ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); - if (min_val < 0 && dst_reg->min_value < 0) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else - dst_reg->min_value = dst_reg->var_off.value; - dst_reg->max_value = min(dst_reg->max_value, max_val); + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ANDing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } else { + /* ANDing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; case BPF_OR: if (src_known && dst_known) { - u64 value = dst_reg->var_off.value | src_reg.var_off.value; - - dst_reg->var_off = tnum_const(value); - dst_reg->min_value = dst_reg->max_value = min_t(u64, - value, BPF_REGISTER_MAX_RANGE); + __mark_reg_known(dst_reg, dst_reg->var_off.value | + src_reg.var_off.value); break; } - /* Lose ranges when OR'ing negative numbers, ain't nobody got - * time for that. Otherwise we get our maximum from the var_off, - * and our minimum is the maximum of the operands' minima. + /* We get our maximum from the var_off, and our minimum is the + * maximum of the operands' minima */ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); - if (min_val < 0 || dst_reg->min_value < 0) { - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + dst_reg->umin_value = max(dst_reg->umin_value, umin_val); + dst_reg->umax_value = dst_reg->var_off.value | + dst_reg->var_off.mask; + if (dst_reg->smin_value < 0 || smin_val < 0) { + /* Lose signed bounds when ORing negative numbers, + * ain't nobody got time for that. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; } else { - dst_reg->min_value = max(dst_reg->min_value, min_val); - dst_reg->max_value = dst_reg->var_off.value | dst_reg->var_off.mask; + /* ORing two positives gives a positive, so safe to + * cast result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; } + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; case BPF_LSH: - if (min_val < 0) { - /* LSH by a negative number is undefined */ + if (umax_val > 63) { + /* Shifts greater than 63 are undefined. This includes + * shifts by a negative number. + */ mark_reg_unknown(regs, insn->dst_reg); break; } - /* Gotta have special overflow logic here, if we're shifting - * more than MAX_RANGE then just assume we have an invalid - * range. 
+ /* We lose all sign bit information (except what we can pick + * up from var_off) */ - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - dst_reg->var_off = tnum_unknown; + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + /* If we might shift our top bit out, then we know nothing */ + if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; } else { - if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value <<= min_val; - if (src_known) - dst_reg->var_off = tnum_lshift(dst_reg->var_off, min_val); - else - dst_reg->var_off = tnum_lshift(tnum_unknown, min_val); + dst_reg->umin_value <<= umin_val; + dst_reg->umax_value <<= umax_val; } - if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value <<= max_val; + if (src_known) + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); + else + dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val); + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; case BPF_RSH: - if (min_val < 0) { - /* RSH by a negative number is undefined */ + if (umax_val > 63) { + /* Shifts greater than 63 are undefined. This includes + * shifts by a negative number. + */ mark_reg_unknown(regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ - if (dst_reg->min_value < 0) { - if (min_val) + if (dst_reg->smin_value < 0) { + if (umin_val) { /* Sign bit will be cleared */ - dst_reg->min_value = 0; + dst_reg->smin_value = 0; + } else { + /* Lost sign bit information */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } } else { - dst_reg->min_value = - (u64)(dst_reg->min_value) >> min_val; + dst_reg->smin_value = + (u64)(dst_reg->smin_value) >> umax_val; } if (src_known) - dst_reg->var_off = tnum_rshift(dst_reg->var_off, min_val); + dst_reg->var_off = tnum_rshift(dst_reg->var_off, + umin_val); else - dst_reg->var_off = tnum_rshift(tnum_unknown, min_val); - if (dst_reg->max_value == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = ~0; - dst_reg->max_value >>= max_val; + dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val); + dst_reg->umin_value >>= umax_val; + dst_reg->umax_value >>= umin_val; + /* We may learn something more from the var_off */ + __update_reg_bounds(dst_reg); break; default: mark_reg_unknown(regs, insn->dst_reg); break; } - check_reg_overflow(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); return 0; } @@ -1981,14 +2134,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, int rc; dst_reg = &regs[insn->dst_reg]; - check_reg_overflow(dst_reg); src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; if (BPF_SRC(insn->code) == BPF_X) { src_reg = &regs[insn->src_reg]; - check_reg_overflow(src_reg); - if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields @@ -2035,11 +2185,8 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * need to be able to read from this state. 
*/ off_reg.type = SCALAR_VALUE; - off_reg.var_off = tnum_const(insn->imm); - off_reg.min_value = insn->imm; - off_reg.max_value = insn->imm; + __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; - check_reg_overflow(src_reg); if (ptr_reg) { /* pointer += K */ rc = adjust_ptr_min_max_vals(env, insn, ptr_reg, src_reg); @@ -2144,22 +2291,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } mark_reg_unknown(regs, insn->dst_reg); - /* high 32 bits are known zero. But this is - * still out of range for max_value, so leave - * that. - */ + /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); + __update_reg_bounds(&regs[insn->dst_reg]); } } else { /* case: R = imm * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - regs[insn->dst_reg].var_off = tnum_const(insn->imm); - regs[insn->dst_reg].max_value = insn->imm; - regs[insn->dst_reg].min_value = insn->imm; - regs[insn->dst_reg].id = 0; + __mark_reg_known(regs + insn->dst_reg, insn->imm); } } else if (opcode > BPF_END) { @@ -2226,8 +2368,8 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* This doesn't give us any range */ return; - if (dst_reg->max_value > MAX_PACKET_OFF || - dst_reg->max_value + dst_reg->off > MAX_PACKET_OFF) + if (dst_reg->umax_value > MAX_PACKET_OFF || + dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ @@ -2291,8 +2433,6 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { - bool value_from_signed = true; - /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. @@ -2307,62 +2447,45 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ - true_reg->max_value = true_reg->min_value = val; - true_reg->var_off = tnum_const(val); + __mark_reg_known(true_reg, val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ - false_reg->max_value = false_reg->min_value = val; - false_reg->var_off = tnum_const(val); + __mark_reg_known(false_reg, val); break; case BPF_JGT: - value_from_signed = false; - /* fallthrough */ + false_reg->umax_value = min(false_reg->umax_value, val); + true_reg->umin_value = max(true_reg->umin_value, val + 1); + break; case BPF_JSGT: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGT) { - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; - } - /* If this is false then we know the maximum val is val, - * otherwise we know the min val is val+1. 
- */ - false_reg->max_value = val; - false_reg->value_from_signed = value_from_signed; - true_reg->min_value = val + 1; - true_reg->value_from_signed = value_from_signed; + false_reg->smax_value = min_t(s64, false_reg->smax_value, val); + true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); break; case BPF_JGE: - value_from_signed = false; - /* fallthrough */ + false_reg->umax_value = min(false_reg->umax_value, val - 1); + true_reg->umin_value = max(true_reg->umin_value, val); + break; case BPF_JSGE: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGE) { - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; - } - /* If this is false then we know the maximum value is val - 1, - * otherwise we know the mimimum value is val. - */ - false_reg->max_value = val - 1; - false_reg->value_from_signed = value_from_signed; - true_reg->min_value = val; - true_reg->value_from_signed = value_from_signed; + false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + true_reg->smin_value = max_t(s64, true_reg->smin_value, val); break; default: break; } - check_reg_overflow(false_reg); - check_reg_overflow(true_reg); + __reg_deduce_bounds(false_reg); + __reg_deduce_bounds(true_reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(false_reg); + __reg_bound_offset(true_reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __update_reg_bounds(false_reg); + __update_reg_bounds(true_reg); } /* Same as above, but for the case that dst_reg holds a constant and src_reg is @@ -2372,8 +2495,6 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { - bool value_from_signed = true; - if (__is_pointer_value(false, false_reg)) return; @@ -2382,77 +2503,76 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ - true_reg->max_value = true_reg->min_value = val; - true_reg->var_off = tnum_const(val); + __mark_reg_known(true_reg, val); break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ - false_reg->max_value = false_reg->min_value = val; - false_reg->var_off = tnum_const(val); + __mark_reg_known(false_reg, val); break; case BPF_JGT: - value_from_signed = false; - /* fallthrough */ + true_reg->umax_value = min(true_reg->umax_value, val - 1); + false_reg->umin_value = max(false_reg->umin_value, val); + break; case BPF_JSGT: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGT) { - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; - } - /* - * If this is false, then the val is <= the register, if it is - * true the register <= to the val. 
- */ - false_reg->min_value = val; - false_reg->value_from_signed = value_from_signed; - true_reg->max_value = val - 1; - true_reg->value_from_signed = value_from_signed; + true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); + false_reg->smin_value = max_t(s64, false_reg->smin_value, val); break; case BPF_JGE: - value_from_signed = false; - /* fallthrough */ + true_reg->umax_value = min(true_reg->umax_value, val); + false_reg->umin_value = max(false_reg->umin_value, val + 1); + break; case BPF_JSGE: - if (true_reg->value_from_signed != value_from_signed) - reset_reg_range_values(true_reg, 0); - if (false_reg->value_from_signed != value_from_signed) - reset_reg_range_values(false_reg, 0); - if (opcode == BPF_JGE) { - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; - } - /* If this is false then constant < register, if it is true then - * the register < constant. - */ - false_reg->min_value = val + 1; - false_reg->value_from_signed = value_from_signed; - true_reg->max_value = val; - true_reg->value_from_signed = value_from_signed; + true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); break; default: break; } - check_reg_overflow(false_reg); - check_reg_overflow(true_reg); + __reg_deduce_bounds(false_reg); + __reg_deduce_bounds(true_reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(false_reg); + __reg_bound_offset(true_reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. + */ + __update_reg_bounds(false_reg); + __update_reg_bounds(true_reg); } /* Regs are known to be equal, so intersect their min/max/var_off */ static void __reg_combine_min_max(struct bpf_reg_state *src_reg, struct bpf_reg_state *dst_reg) { - src_reg->min_value = dst_reg->min_value = max(src_reg->min_value, - dst_reg->min_value); - src_reg->max_value = dst_reg->max_value = min(src_reg->max_value, - dst_reg->max_value); + src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value, + dst_reg->umin_value); + src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value, + dst_reg->umax_value); + src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value, + dst_reg->smin_value); + src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value, + dst_reg->smax_value); src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, dst_reg->var_off); - check_reg_overflow(src_reg); - check_reg_overflow(dst_reg); + /* We might have learned new bounds from the var_off. */ + __update_reg_bounds(src_reg); + __update_reg_bounds(dst_reg); + /* We might have learned something about the sign bit. */ + __reg_deduce_bounds(src_reg); + __reg_deduce_bounds(dst_reg); + /* We might have learned some bits from the bounds. */ + __reg_bound_offset(src_reg); + __reg_bound_offset(dst_reg); + /* Intersecting with the old var_off might have improved our bounds + * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), + * then new var_off is (0; 0x7f...fc) which improves our umax. 
+ */ + __update_reg_bounds(src_reg); + __update_reg_bounds(dst_reg); } static void reg_combine_min_max(struct bpf_reg_state *true_src, @@ -2467,6 +2587,7 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, break; case BPF_JNE: __reg_combine_min_max(false_src, false_dst); + break; } } @@ -2480,11 +2601,11 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. */ - if (WARN_ON_ONCE(reg->min_value || reg->max_value || - reg->var_off.value || reg->var_off.mask || + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || + !tnum_equals_const(reg->var_off, 0) || reg->off)) { - reg->min_value = reg->max_value = reg->off = 0; - reg->var_off = tnum_const(0); + __mark_reg_known_zero(reg); + reg->off = 0; } if (is_null) { reg->type = SCALAR_VALUE; @@ -2676,11 +2797,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; regs[insn->dst_reg].type = SCALAR_VALUE; - regs[insn->dst_reg].min_value = imm; - regs[insn->dst_reg].max_value = imm; - check_reg_overflow(&regs[insn->dst_reg]); - regs[insn->dst_reg].var_off = tnum_const(imm); - regs[insn->dst_reg].id = 0; + __mark_reg_known(&regs[insn->dst_reg], imm); return 0; } @@ -2968,8 +3085,10 @@ err_free: static bool range_within(struct bpf_reg_state *old, struct bpf_reg_state *cur) { - return old->min_value <= cur->min_value && - old->max_value >= cur->max_value; + return old->umin_value <= cur->umin_value && + old->umax_value >= cur->umax_value && + old->smin_value <= cur->smin_value && + old->smax_value >= cur->smax_value; } /* Maximum number of register states that can exist at once */ @@ -3032,8 +3151,10 @@ static bool regsafe(struct bpf_reg_state *rold, * equal, because we can't know anything about the * scalar value of the pointer in the new value. */ - return rold->min_value == BPF_REGISTER_MIN_RANGE && - rold->max_value == BPF_REGISTER_MAX_RANGE && + return rold->umin_value == 0 && + rold->umax_value == U64_MAX && + rold->smin_value == S64_MIN && + rold->smax_value == S64_MAX && tnum_is_unknown(rold->var_off); } case PTR_TO_MAP_VALUE: -- cgit v1.2.3 From feca7d8c135bc1527b244fe817b8b6498066ccec Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 8 Aug 2017 20:23:49 +0200 Subject: net: ipv6: avoid overhead when no custom FIB rules are installed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the user hasn't installed any custom rules, don't go through the whole FIB rules layer. This is pretty similar to f4530fa574df (ipv4: Avoid overhead when no custom FIB rules are installed). 
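In sketch form, the change makes fib6_rule_lookup() fall back to direct lookups in the local and main tables whenever no custom rule has ever been configured (a simplified extract of the diff below; the rules-layer branch and the l3mdev flow update are omitted):

	if (!net->ipv6.fib6_has_custom_rules) {
		struct rt6_info *rt;

		/* No custom rules: try the local table, then the main
		 * table, and only then give up and return the null entry.
		 */
		rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags);
		if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
			return &rt->dst;
		ip6_rt_put(rt);
		rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
		if (rt->dst.error != -EAGAIN)
			return &rt->dst;
		ip6_rt_put(rt);
	}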
Using a micro-benchmark module [1], timing ip6_route_output() with get_cycles(), with 40,000 routes in the main routing table, before this patch: min=606 max=12911 count=627 average=1959 95th=4903 90th=3747 50th=1602 mad=821 table=254 avgdepth=21.8 maxdepth=39 value │ ┊ count 600 │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ 199 880 │▒▒▒░░░░░░░░░░░░░░░░ 43 1160 │▒▒▒░░░░░░░░░░░░░░░░░░░░ 48 1440 │▒▒▒░░░░░░░░░░░░░░░░░░░░░░░ 43 1720 │▒▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░ 59 2000 │▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 50 2280 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 26 2560 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 31 2840 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 28 3120 │▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 17 3400 │▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 17 3680 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 8 3960 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 11 4240 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 6 4520 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 6 4800 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 9 After: min=544 max=11687 count=627 average=1776 95th=4546 90th=3585 50th=1227 mad=565 table=254 avgdepth=21.8 maxdepth=39 value │ ┊ count 540 │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ 201 800 │▒▒▒▒▒░░░░░░░░░░░░░░░░ 63 1060 │▒▒▒▒▒░░░░░░░░░░░░░░░░░░░░░ 68 1320 │▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░ 39 1580 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 32 1840 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 32 2100 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 34 2360 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 33 2620 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 26 2880 │▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 22 3140 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 9 3400 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 8 3660 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 9 3920 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 8 4180 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 8 4440 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 8 At the frequency of the host during the bench (~ 3.7 GHz), this is about a 100 ns difference on the median value. A next step would be to collapse local and main tables, as in 0ddcf43d5d4a (ipv4: FIB Local/MAIN table collapse). [1]: https://github.com/vincentbernat/network-lab/blob/master/lab-routes-ipv6/kbench_mod.c Signed-off-by: Vincent Bernat Reviewed-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/netns/ipv6.h | 1 + net/ipv6/fib6_rules.c | 40 +++++++++++++++++++++++++++------------- net/ipv6/route.c | 1 + 3 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index abdf3b40303b..0e50bf3ed097 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -65,6 +65,7 @@ struct netns_ipv6 { unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES + bool fib6_has_custom_rules; struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2f29e4e33bd3..b240f24a6e52 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net) struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { - struct fib_lookup_arg arg = { - .lookup_ptr = lookup, - .flags = FIB_LOOKUP_NOREF, - }; - - /* update flow if oif or iif point to device enslaved to l3mdev */ - l3mdev_update_flow(net, flowi6_to_flowi(fl6)); - - fib_rules_lookup(net->ipv6.fib6_rules_ops, - flowi6_to_flowi(fl6), flags, &arg); - - if (arg.result) - return arg.result; + if (net->ipv6.fib6_has_custom_rules) { + struct fib_lookup_arg arg = { + .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, + }; + + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + + fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + + if (arg.result) + return arg.result; + } else { + struct rt6_info *rt; + + rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags); + if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) + return &rt->dst; + ip6_rt_put(rt); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + if (rt->dst.error != -EAGAIN) + return &rt->dst; + ip6_rt_put(rt); + } dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; @@ -245,6 +258,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule6->dst.plen = frh->dst_len; rule6->tclass = frh->tos; + net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aba07fce67fb..7ecbe5eb19f8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3934,6 +3934,7 @@ static int __net_init ip6_route_net_init(struct net *net) ip6_template_metrics, true); #ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); -- cgit v1.2.3 From 04c2cf34362f133be09878bd752f8b014318b59a Mon Sep 17 00:00:00 2001 From: Naftali Goldstein Date: Tue, 11 Jul 2017 10:07:25 +0300 Subject: mac80211: add api to start ba session timer expired flow Some drivers handle rx buffer reordering internally (and by extension also handle the rx ba session timer internally), but do not offload the addba/delba negotiation. Add an api for these drivers to properly tear down the ba session, including sending a delba. 
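For a driver that runs its own reorder-buffer timer, the intended use is roughly the following (a minimal sketch against the API added below; struct drv_ba_session and its fields are hypothetical, only ieee80211_rx_ba_timer_expired() comes from this patch):

	/* Hypothetical per-session state a driver might keep */
	struct drv_ba_session {
		struct ieee80211_vif *vif;
		u8 sta_addr[ETH_ALEN];
		u8 tid;
		struct timer_list timer;
	};

	/* Old-style (pre-4.15) timer callback, matching this kernel */
	static void drv_rx_ba_session_timeout(unsigned long data)
	{
		struct drv_ba_session *ba = (struct drv_ba_session *)data;

		/* Session idled out: have mac80211 tear it down, which
		 * also sends the delba to the peer.
		 */
		ieee80211_rx_ba_timer_expired(ba->vif, ba->sta_addr, ba->tid);
	}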
Signed-off-by: Naftali Goldstein Signed-off-by: Luca Coelho --- include/net/mac80211.h | 15 +++++++++++++++ net/mac80211/agg-rx.c | 22 +++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b2b5419467cc..f8149ca192b4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5499,6 +5499,21 @@ static inline void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif, ieee80211_manage_rx_ba_offl(vif, addr, tid + IEEE80211_NUM_TIDS); } +/** + * ieee80211_rx_ba_timer_expired - stop a Rx BA session due to timeout + * + * Some device drivers do not offload AddBa/DelBa negotiation, but handle rx + * buffer reording internally, and therefore also handle the session timer. + * + * Trigger the timeout flow, which sends a DelBa. + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback + * @addr: station mac address + * @tid: the rx tid + */ +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid); + /* Rate control API */ /** diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 8708cbe8af5b..2b36eff5d97e 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -466,3 +466,23 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_manage_rx_ba_offl); + +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get_bss(sdata, addr); + if (!sta) + goto unlock; + + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_rx_ba_timer_expired); -- cgit v1.2.3 From 2d97591ef43d0587be22ad1b0d758d6df4999a0b Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Wed, 2 Aug 2017 07:56:19 +0200 Subject: crypto: af_alg - consolidation of duplicate code Consolidate following data structures: skcipher_async_req, aead_async_req -> af_alg_async_req skcipher_rsgl, aead_rsql -> af_alg_rsgl skcipher_tsgl, aead_tsql -> af_alg_tsgl skcipher_ctx, aead_ctx -> af_alg_ctx Consolidate following functions: skcipher_sndbuf, aead_sndbuf -> af_alg_sndbuf skcipher_writable, aead_writable -> af_alg_writable skcipher_rcvbuf, aead_rcvbuf -> af_alg_rcvbuf skcipher_readable, aead_readable -> af_alg_readable aead_alloc_tsgl, skcipher_alloc_tsgl -> af_alg_alloc_tsgl aead_count_tsgl, skcipher_count_tsgl -> af_alg_count_tsgl aead_pull_tsgl, skcipher_pull_tsgl -> af_alg_pull_tsgl aead_free_areq_sgls, skcipher_free_areq_sgls -> af_alg_free_areq_sgls aead_wait_for_wmem, skcipher_wait_for_wmem -> af_alg_wait_for_wmem aead_wmem_wakeup, skcipher_wmem_wakeup -> af_alg_wmem_wakeup aead_wait_for_data, skcipher_wait_for_data -> af_alg_wait_for_data aead_data_wakeup, skcipher_data_wakeup -> af_alg_data_wakeup aead_sendmsg, skcipher_sendmsg -> af_alg_sendmsg aead_sendpage, skcipher_sendpage -> af_alg_sendpage aead_async_cb, 
skcipher_async_cb -> af_alg_async_cb aead_poll, skcipher_poll -> af_alg_poll Split out the following common code from recvmsg: af_alg_alloc_areq: allocation of the request data structure for the cipher operation af_alg_get_rsgl: creation of the RX SGL anchored in the request data structure The following changes to the implementation without affecting the functionality have been applied to synchronize slightly different code bases in algif_skcipher and algif_aead: The wakeup in af_alg_wait_for_data is triggered when either more data is received or the indicator that more data is to be expected is released. The first is triggered by user space, the second is triggered by the kernel upon finishing the processing of data (i.e. the kernel is ready for more). af_alg_sendmsg uses size_t in min_t calculation for obtaining len. Return code determination is consistent with algif_skcipher. The scope of the variable i is reduced to match algif_aead. The type of the variable i is switched from int to unsigned int to match algif_aead. af_alg_sendpage does not contain the superfluous err = 0 from aead_sendpage. af_alg_async_cb requires to store the number of output bytes in areq->outlen before the AIO callback is triggered. The POLLIN / POLLRDNORM is now set when either not more data is given or the kernel is supplied with data. This is consistent to the wakeup from sleep when the kernel waits for data. The request data structure is extended by the field last_rsgl which points to the last RX SGL list entry. This shall help recvmsg implementation to chain the RX SGL to other SG(L)s if needed. It is currently used by algif_aead which chains the tag SGL to the RX SGL during decryption. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/af_alg.c | 693 +++++++++++++++++++++++++++++++++++++++++++++++ crypto/algif_aead.c | 701 +++--------------------------------------------- crypto/algif_skcipher.c | 638 +++---------------------------------------- include/crypto/if_alg.h | 170 ++++++++++++ 4 files changed, 940 insertions(+), 1262 deletions(-) (limited to 'include') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 92a3d540d920..d6936c0e08d9 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -21,6 +21,7 @@ #include #include #include +#include #include struct alg_type_list { @@ -507,6 +508,698 @@ void af_alg_complete(struct crypto_async_request *req, int err) } EXPORT_SYMBOL_GPL(af_alg_complete); +/** + * af_alg_alloc_tsgl - allocate the TX SGL + * + * @sk socket of connection to user space + * @return: 0 upon success, < 0 upon error + */ +int af_alg_alloc_tsgl(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct af_alg_tsgl *sgl; + struct scatterlist *sg = NULL; + + sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); + if (!list_empty(&ctx->tsgl_list)) + sg = sgl->sg; + + if (!sg || sgl->cur >= MAX_SGL_ENTS) { + sgl = sock_kmalloc(sk, sizeof(*sgl) + + sizeof(sgl->sg[0]) * (MAX_SGL_ENTS + 1), + GFP_KERNEL); + if (!sgl) + return -ENOMEM; + + sg_init_table(sgl->sg, MAX_SGL_ENTS + 1); + sgl->cur = 0; + + if (sg) + sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg); + + list_add_tail(&sgl->list, &ctx->tsgl_list); + } + + return 0; +} +EXPORT_SYMBOL_GPL(af_alg_alloc_tsgl); + +/** + * aead_count_tsgl - Count number of TX SG entries + * + * The counting starts from the beginning of the SGL to @bytes. If + * an offset is provided, the counting of the SG entries starts at the offset. 
+ * + * @sk socket of connection to user space + * @bytes Count the number of SG entries holding given number of bytes. + * @offset Start the counting of SG entries from the given offset. + * @return Number of TX SG entries found given the constraints + */ +unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes, size_t offset) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct af_alg_tsgl *sgl, *tmp; + unsigned int i; + unsigned int sgl_count = 0; + + if (!bytes) + return 0; + + list_for_each_entry_safe(sgl, tmp, &ctx->tsgl_list, list) { + struct scatterlist *sg = sgl->sg; + + for (i = 0; i < sgl->cur; i++) { + size_t bytes_count; + + /* Skip offset */ + if (offset >= sg[i].length) { + offset -= sg[i].length; + bytes -= sg[i].length; + continue; + } + + bytes_count = sg[i].length - offset; + + offset = 0; + sgl_count++; + + /* If we have seen requested number of bytes, stop */ + if (bytes_count >= bytes) + return sgl_count; + + bytes -= bytes_count; + } + } + + return sgl_count; +} +EXPORT_SYMBOL_GPL(af_alg_count_tsgl); + +/** + * aead_pull_tsgl - Release the specified buffers from TX SGL + * + * If @dst is non-null, reassign the pages to dst. The caller must release + * the pages. If @dst_offset is given only reassign the pages to @dst starting + * at the @dst_offset (byte). The caller must ensure that @dst is large + * enough (e.g. by using af_alg_count_tsgl with the same offset). + * + * @sk socket of connection to user space + * @used Number of bytes to pull from TX SGL + * @dst If non-NULL, buffer is reassigned to dst SGL instead of releasing. The + * caller must release the buffers in dst. + * @dst_offset Reassign the TX SGL from given offset. All buffers before + * reaching the offset is released. + */ +void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst, + size_t dst_offset) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct af_alg_tsgl *sgl; + struct scatterlist *sg; + unsigned int i, j; + + while (!list_empty(&ctx->tsgl_list)) { + sgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl, + list); + sg = sgl->sg; + + for (i = 0, j = 0; i < sgl->cur; i++) { + size_t plen = min_t(size_t, used, sg[i].length); + struct page *page = sg_page(sg + i); + + if (!page) + continue; + + /* + * Assumption: caller created af_alg_count_tsgl(len) + * SG entries in dst. 
+ */ + if (dst) { + if (dst_offset >= plen) { + /* discard page before offset */ + dst_offset -= plen; + put_page(page); + } else { + /* reassign page to dst after offset */ + sg_set_page(dst + j, page, + plen - dst_offset, + sg[i].offset + dst_offset); + dst_offset = 0; + j++; + } + } + + sg[i].length -= plen; + sg[i].offset += plen; + + used -= plen; + ctx->used -= plen; + + if (sg[i].length) + return; + + if (!dst) + put_page(page); + + sg_assign_page(sg + i, NULL); + } + + list_del(&sgl->list); + sock_kfree_s(sk, sgl, sizeof(*sgl) + sizeof(sgl->sg[0]) * + (MAX_SGL_ENTS + 1)); + } + + if (!ctx->used) + ctx->merge = 0; +} +EXPORT_SYMBOL_GPL(af_alg_pull_tsgl); + +/** + * af_alg_free_areq_sgls - Release TX and RX SGLs of the request + * + * @areq Request holding the TX and RX SGL + */ +void af_alg_free_areq_sgls(struct af_alg_async_req *areq) +{ + struct sock *sk = areq->sk; + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct af_alg_rsgl *rsgl, *tmp; + struct scatterlist *tsgl; + struct scatterlist *sg; + unsigned int i; + + list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) { + ctx->rcvused -= rsgl->sg_num_bytes; + af_alg_free_sg(&rsgl->sgl); + list_del(&rsgl->list); + if (rsgl != &areq->first_rsgl) + sock_kfree_s(sk, rsgl, sizeof(*rsgl)); + } + + tsgl = areq->tsgl; + for_each_sg(tsgl, sg, areq->tsgl_entries, i) { + if (!sg_page(sg)) + continue; + put_page(sg_page(sg)); + } + + if (areq->tsgl && areq->tsgl_entries) + sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl)); +} +EXPORT_SYMBOL_GPL(af_alg_free_areq_sgls); + +/** + * af_alg_wait_for_wmem - wait for availability of writable memory + * + * @sk socket of connection to user space + * @flags If MSG_DONTWAIT is set, then only report if function would sleep + * @return 0 when writable memory is available, < 0 upon error + */ +int af_alg_wait_for_wmem(struct sock *sk, unsigned int flags) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int err = -ERESTARTSYS; + long timeout; + + if (flags & MSG_DONTWAIT) + return -EAGAIN; + + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + add_wait_queue(sk_sleep(sk), &wait); + for (;;) { + if (signal_pending(current)) + break; + timeout = MAX_SCHEDULE_TIMEOUT; + if (sk_wait_event(sk, &timeout, af_alg_writable(sk), &wait)) { + err = 0; + break; + } + } + remove_wait_queue(sk_sleep(sk), &wait); + + return err; +} +EXPORT_SYMBOL_GPL(af_alg_wait_for_wmem); + +/** + * af_alg_wmem_wakeup - wakeup caller when writable memory is available + * + * @sk socket of connection to user space + */ +void af_alg_wmem_wakeup(struct sock *sk) +{ + struct socket_wq *wq; + + if (!af_alg_writable(sk)) + return; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | + POLLRDNORM | + POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); + +/** + * af_alg_wait_for_data - wait for availability of TX data + * + * @sk socket of connection to user space + * @flags If MSG_DONTWAIT is set, then only report if function would sleep + * @return 0 when writable memory is available, < 0 upon error + */ +int af_alg_wait_for_data(struct sock *sk, unsigned flags) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + long timeout; + int err = -ERESTARTSYS; + + if (flags & MSG_DONTWAIT) + return -EAGAIN; + + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + + 
add_wait_queue(sk_sleep(sk), &wait); + for (;;) { + if (signal_pending(current)) + break; + timeout = MAX_SCHEDULE_TIMEOUT; + if (sk_wait_event(sk, &timeout, (ctx->used || !ctx->more), + &wait)) { + err = 0; + break; + } + } + remove_wait_queue(sk_sleep(sk), &wait); + + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + + return err; +} +EXPORT_SYMBOL_GPL(af_alg_wait_for_data); + +/** + * af_alg_data_wakeup - wakeup caller when new data can be sent to kernel + * + * @sk socket of connection to user space + */ + +void af_alg_data_wakeup(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct socket_wq *wq; + + if (!ctx->used) + return; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | + POLLRDNORM | + POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(af_alg_data_wakeup); + +/** + * af_alg_sendmsg - implementation of sendmsg system call handler + * + * The sendmsg system call handler obtains the user data and stores it + * in ctx->tsgl_list. This implies allocation of the required numbers of + * struct af_alg_tsgl. + * + * In addition, the ctx is filled with the information sent via CMSG. + * + * @sock socket of connection to user space + * @msg message from user space + * @size size of message from user space + * @ivsize the size of the IV for the cipher operation to verify that the + * user-space-provided IV has the right size + * @return the number of copied data upon success, < 0 upon error + */ +int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, + unsigned int ivsize) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct af_alg_tsgl *sgl; + struct af_alg_control con = {}; + long copied = 0; + bool enc = 0; + bool init = 0; + int err = 0; + + if (msg->msg_controllen) { + err = af_alg_cmsg_send(msg, &con); + if (err) + return err; + + init = 1; + switch (con.op) { + case ALG_OP_ENCRYPT: + enc = 1; + break; + case ALG_OP_DECRYPT: + enc = 0; + break; + default: + return -EINVAL; + } + + if (con.iv && con.iv->ivlen != ivsize) + return -EINVAL; + } + + lock_sock(sk); + if (!ctx->more && ctx->used) { + err = -EINVAL; + goto unlock; + } + + if (init) { + ctx->enc = enc; + if (con.iv) + memcpy(ctx->iv, con.iv->iv, ivsize); + + ctx->aead_assoclen = con.aead_assoclen; + } + + while (size) { + struct scatterlist *sg; + size_t len = size; + size_t plen; + + /* use the existing memory in an allocated page */ + if (ctx->merge) { + sgl = list_entry(ctx->tsgl_list.prev, + struct af_alg_tsgl, list); + sg = sgl->sg + sgl->cur - 1; + len = min_t(size_t, len, + PAGE_SIZE - sg->offset - sg->length); + + err = memcpy_from_msg(page_address(sg_page(sg)) + + sg->offset + sg->length, + msg, len); + if (err) + goto unlock; + + sg->length += len; + ctx->merge = (sg->offset + sg->length) & + (PAGE_SIZE - 1); + + ctx->used += len; + copied += len; + size -= len; + continue; + } + + if (!af_alg_writable(sk)) { + err = af_alg_wait_for_wmem(sk, msg->msg_flags); + if (err) + goto unlock; + } + + /* allocate a new page */ + len = min_t(unsigned long, len, af_alg_sndbuf(sk)); + + err = af_alg_alloc_tsgl(sk); + if (err) + goto unlock; + + sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, + list); + sg = sgl->sg; + if (sgl->cur) + sg_unmark_end(sg + sgl->cur - 1); + + do { + unsigned int i = sgl->cur; + + plen = min_t(size_t, len, PAGE_SIZE); + + 
sg_assign_page(sg + i, alloc_page(GFP_KERNEL)); + if (!sg_page(sg + i)) { + err = -ENOMEM; + goto unlock; + } + + err = memcpy_from_msg(page_address(sg_page(sg + i)), + msg, plen); + if (err) { + __free_page(sg_page(sg + i)); + sg_assign_page(sg + i, NULL); + goto unlock; + } + + sg[i].length = plen; + len -= plen; + ctx->used += plen; + copied += plen; + size -= plen; + sgl->cur++; + } while (len && sgl->cur < MAX_SGL_ENTS); + + if (!size) + sg_mark_end(sg + sgl->cur - 1); + + ctx->merge = plen & (PAGE_SIZE - 1); + } + + err = 0; + + ctx->more = msg->msg_flags & MSG_MORE; + +unlock: + af_alg_data_wakeup(sk); + release_sock(sk); + + return copied ?: err; +} +EXPORT_SYMBOL_GPL(af_alg_sendmsg); + +/** + * af_alg_sendpage - sendpage system call handler + * + * This is a generic implementation of sendpage to fill ctx->tsgl_list. + */ +ssize_t af_alg_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + struct af_alg_tsgl *sgl; + int err = -EINVAL; + + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + + lock_sock(sk); + if (!ctx->more && ctx->used) + goto unlock; + + if (!size) + goto done; + + if (!af_alg_writable(sk)) { + err = af_alg_wait_for_wmem(sk, flags); + if (err) + goto unlock; + } + + err = af_alg_alloc_tsgl(sk); + if (err) + goto unlock; + + ctx->merge = 0; + sgl = list_entry(ctx->tsgl_list.prev, struct af_alg_tsgl, list); + + if (sgl->cur) + sg_unmark_end(sgl->sg + sgl->cur - 1); + + sg_mark_end(sgl->sg + sgl->cur); + + get_page(page); + sg_set_page(sgl->sg + sgl->cur, page, size, offset); + sgl->cur++; + ctx->used += size; + +done: + ctx->more = flags & MSG_MORE; + +unlock: + af_alg_data_wakeup(sk); + release_sock(sk); + + return err ?: size; +} +EXPORT_SYMBOL_GPL(af_alg_sendpage); + +/** + * af_alg_async_cb - AIO callback handler + * + * This handler cleans up the struct af_alg_async_req upon completion of the + * AIO operation. + * + * The number of bytes to be generated with the AIO operation must be set + * in areq->outlen before the AIO callback handler is invoked. + */ +void af_alg_async_cb(struct crypto_async_request *_req, int err) +{ + struct af_alg_async_req *areq = _req->data; + struct sock *sk = areq->sk; + struct kiocb *iocb = areq->iocb; + unsigned int resultlen; + + lock_sock(sk); + + /* Buffer size written by crypto operation. */ + resultlen = areq->outlen; + + af_alg_free_areq_sgls(areq); + sock_kfree_s(sk, areq, areq->areqlen); + __sock_put(sk); + + iocb->ki_complete(iocb, err ? 
err : resultlen, 0); + + release_sock(sk); +} +EXPORT_SYMBOL_GPL(af_alg_async_cb); + +/** + * af_alg_poll - poll system call handler + */ +unsigned int af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + unsigned int mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + if (!ctx->more || ctx->used) + mask |= POLLIN | POLLRDNORM; + + if (af_alg_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} +EXPORT_SYMBOL_GPL(af_alg_poll); + +/** + * af_alg_alloc_areq - allocate struct af_alg_async_req + * + * @sk socket of connection to user space + * @areqlen size of struct af_alg_async_req + crypto_*_reqsize + * @return allocated data structure or ERR_PTR upon error + */ +struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, + unsigned int areqlen) +{ + struct af_alg_async_req *areq = sock_kmalloc(sk, areqlen, GFP_KERNEL); + + if (unlikely(!areq)) + return ERR_PTR(-ENOMEM); + + areq->areqlen = areqlen; + areq->sk = sk; + areq->last_rsgl = NULL; + INIT_LIST_HEAD(&areq->rsgl_list); + areq->tsgl = NULL; + areq->tsgl_entries = 0; + + return areq; +} +EXPORT_SYMBOL_GPL(af_alg_alloc_areq); + +/** + * af_alg_get_rsgl - create the RX SGL for the output data from the crypto + * operation + * + * @sk socket of connection to user space + * @msg user space message + * @flags flags used to invoke recvmsg with + * @areq instance of the cryptographic request that will hold the RX SGL + * @maxsize maximum number of bytes to be pulled from user space + * @outlen number of bytes in the RX SGL + * @return 0 on success, < 0 upon error + */ +int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, + struct af_alg_async_req *areq, size_t maxsize, + size_t *outlen) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + size_t len = 0; + + while (maxsize > len && msg_data_left(msg)) { + struct af_alg_rsgl *rsgl; + size_t seglen; + int err; + + /* limit the amount of readable buffers */ + if (!af_alg_readable(sk)) + break; + + if (!ctx->used) { + err = af_alg_wait_for_data(sk, flags); + if (err) + return err; + } + + seglen = min_t(size_t, (maxsize - len), + msg_data_left(msg)); + + if (list_empty(&areq->rsgl_list)) { + rsgl = &areq->first_rsgl; + } else { + rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL); + if (unlikely(!rsgl)) + return -ENOMEM; + } + + rsgl->sgl.npages = 0; + list_add_tail(&rsgl->list, &areq->rsgl_list); + + /* make one iovec available as scatterlist */ + err = af_alg_make_sg(&rsgl->sgl, &msg->msg_iter, seglen); + if (err < 0) + return err; + + /* chain the new scatterlist with previous one */ + if (areq->last_rsgl) + af_alg_link_sg(&areq->last_rsgl->sgl, &rsgl->sgl); + + areq->last_rsgl = rsgl; + len += err; + ctx->rcvused += err; + rsgl->sg_num_bytes = err; + iov_iter_advance(&msg->msg_iter, err); + } + + *outlen = len; + return 0; +} +EXPORT_SYMBOL_GPL(af_alg_get_rsgl); + static int __init af_alg_init(void) { int err = proto_register(&alg_proto, 0); diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index 1f0696dd64f4..48d46e74ed0d 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -35,101 +35,23 @@ #include #include #include -#include #include #include #include #include -struct aead_tsgl { - struct list_head list; - unsigned int cur; /* Last processed SG entry */ - struct scatterlist sg[0]; /* Array of SGs forming the SGL */ -}; - -struct aead_rsgl { - struct 
af_alg_sgl sgl; - struct list_head list; - size_t sg_num_bytes; /* Bytes of data in that SGL */ -}; - -struct aead_async_req { - struct kiocb *iocb; - struct sock *sk; - - struct aead_rsgl first_rsgl; /* First RX SG */ - struct list_head rsgl_list; /* Track RX SGs */ - - struct scatterlist *tsgl; /* priv. TX SGL of buffers to process */ - unsigned int tsgl_entries; /* number of entries in priv. TX SGL */ - - unsigned int outlen; /* Filled output buf length */ - - unsigned int areqlen; /* Length of this data struct */ - struct aead_request aead_req; /* req ctx trails this struct */ -}; - struct aead_tfm { struct crypto_aead *aead; bool has_key; struct crypto_skcipher *null_tfm; }; -struct aead_ctx { - struct list_head tsgl_list; /* Link to TX SGL */ - - void *iv; - size_t aead_assoclen; - - struct af_alg_completion completion; /* sync work queue */ - - size_t used; /* TX bytes sent to kernel */ - size_t rcvused; /* total RX bytes to be processed by kernel */ - - bool more; /* More data to be expected? */ - bool merge; /* Merge new data into existing SG */ - bool enc; /* Crypto operation: enc, dec */ - - unsigned int len; /* Length of allocated memory for this struct */ -}; - -#define MAX_SGL_ENTS ((4096 - sizeof(struct aead_tsgl)) / \ - sizeof(struct scatterlist) - 1) - -static inline int aead_sndbuf(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - - return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) - - ctx->used, 0); -} - -static inline bool aead_writable(struct sock *sk) -{ - return PAGE_SIZE <= aead_sndbuf(sk); -} - -static inline int aead_rcvbuf(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - - return max_t(int, max_t(int, sk->sk_rcvbuf & PAGE_MASK, PAGE_SIZE) - - ctx->rcvused, 0); -} - -static inline bool aead_readable(struct sock *sk) -{ - return PAGE_SIZE <= aead_rcvbuf(sk); -} - static inline bool aead_sufficient_data(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); - struct aead_ctx *ctx = ask->private; + struct af_alg_ctx *ctx = ask->private; struct aead_tfm *aeadc = pask->private; struct crypto_aead *tfm = aeadc->aead; unsigned int as = crypto_aead_authsize(tfm); @@ -141,490 +63,17 @@ static inline bool aead_sufficient_data(struct sock *sk) return ctx->used >= ctx->aead_assoclen + (ctx->enc ? 0 : as); } -static int aead_alloc_tsgl(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - struct aead_tsgl *sgl; - struct scatterlist *sg = NULL; - - sgl = list_entry(ctx->tsgl_list.prev, struct aead_tsgl, list); - if (!list_empty(&ctx->tsgl_list)) - sg = sgl->sg; - - if (!sg || sgl->cur >= MAX_SGL_ENTS) { - sgl = sock_kmalloc(sk, sizeof(*sgl) + - sizeof(sgl->sg[0]) * (MAX_SGL_ENTS + 1), - GFP_KERNEL); - if (!sgl) - return -ENOMEM; - - sg_init_table(sgl->sg, MAX_SGL_ENTS + 1); - sgl->cur = 0; - - if (sg) - sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg); - - list_add_tail(&sgl->list, &ctx->tsgl_list); - } - - return 0; -} - -/** - * Count number of SG entries from the beginning of the SGL to @bytes. If - * an offset is provided, the counting of the SG entries starts at the offset. 
- */ -static unsigned int aead_count_tsgl(struct sock *sk, size_t bytes, - size_t offset) -{ - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - struct aead_tsgl *sgl, *tmp; - unsigned int i; - unsigned int sgl_count = 0; - - if (!bytes) - return 0; - - list_for_each_entry_safe(sgl, tmp, &ctx->tsgl_list, list) { - struct scatterlist *sg = sgl->sg; - - for (i = 0; i < sgl->cur; i++) { - size_t bytes_count; - - /* Skip offset */ - if (offset >= sg[i].length) { - offset -= sg[i].length; - bytes -= sg[i].length; - continue; - } - - bytes_count = sg[i].length - offset; - - offset = 0; - sgl_count++; - - /* If we have seen requested number of bytes, stop */ - if (bytes_count >= bytes) - return sgl_count; - - bytes -= bytes_count; - } - } - - return sgl_count; -} - -/** - * Release the specified buffers from TX SGL pointed to by ctx->tsgl_list for - * @used bytes. - * - * If @dst is non-null, reassign the pages to dst. The caller must release - * the pages. If @dst_offset is given only reassign the pages to @dst starting - * at the @dst_offset (byte). The caller must ensure that @dst is large - * enough (e.g. by using aead_count_tsgl with the same offset). - */ -static void aead_pull_tsgl(struct sock *sk, size_t used, - struct scatterlist *dst, size_t dst_offset) -{ - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - struct aead_tsgl *sgl; - struct scatterlist *sg; - unsigned int i, j; - - while (!list_empty(&ctx->tsgl_list)) { - sgl = list_first_entry(&ctx->tsgl_list, struct aead_tsgl, - list); - sg = sgl->sg; - - for (i = 0, j = 0; i < sgl->cur; i++) { - size_t plen = min_t(size_t, used, sg[i].length); - struct page *page = sg_page(sg + i); - - if (!page) - continue; - - /* - * Assumption: caller created aead_count_tsgl(len) - * SG entries in dst. 
- */ - if (dst) { - if (dst_offset >= plen) { - /* discard page before offset */ - dst_offset -= plen; - put_page(page); - } else { - /* reassign page to dst after offset */ - sg_set_page(dst + j, page, - plen - dst_offset, - sg[i].offset + dst_offset); - dst_offset = 0; - j++; - } - } - - sg[i].length -= plen; - sg[i].offset += plen; - - used -= plen; - ctx->used -= plen; - - if (sg[i].length) - return; - - if (!dst) - put_page(page); - - sg_assign_page(sg + i, NULL); - } - - list_del(&sgl->list); - sock_kfree_s(sk, sgl, sizeof(*sgl) + sizeof(sgl->sg[0]) * - (MAX_SGL_ENTS + 1)); - } - - if (!ctx->used) - ctx->merge = 0; -} - -static void aead_free_areq_sgls(struct aead_async_req *areq) -{ - struct sock *sk = areq->sk; - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - struct aead_rsgl *rsgl, *tmp; - struct scatterlist *tsgl; - struct scatterlist *sg; - unsigned int i; - - list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) { - ctx->rcvused -= rsgl->sg_num_bytes; - af_alg_free_sg(&rsgl->sgl); - list_del(&rsgl->list); - if (rsgl != &areq->first_rsgl) - sock_kfree_s(sk, rsgl, sizeof(*rsgl)); - } - - tsgl = areq->tsgl; - for_each_sg(tsgl, sg, areq->tsgl_entries, i) { - if (!sg_page(sg)) - continue; - put_page(sg_page(sg)); - } - - if (areq->tsgl && areq->tsgl_entries) - sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl)); -} - -static int aead_wait_for_wmem(struct sock *sk, unsigned int flags) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err = -ERESTARTSYS; - long timeout; - - if (flags & MSG_DONTWAIT) - return -EAGAIN; - - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - add_wait_queue(sk_sleep(sk), &wait); - for (;;) { - if (signal_pending(current)) - break; - timeout = MAX_SCHEDULE_TIMEOUT; - if (sk_wait_event(sk, &timeout, aead_writable(sk), &wait)) { - err = 0; - break; - } - } - remove_wait_queue(sk_sleep(sk), &wait); - - return err; -} - -static void aead_wmem_wakeup(struct sock *sk) -{ - struct socket_wq *wq; - - if (!aead_writable(sk)) - return; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLIN | - POLLRDNORM | - POLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); - rcu_read_unlock(); -} - -static int aead_wait_for_data(struct sock *sk, unsigned flags) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - long timeout; - int err = -ERESTARTSYS; - - if (flags & MSG_DONTWAIT) - return -EAGAIN; - - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - add_wait_queue(sk_sleep(sk), &wait); - for (;;) { - if (signal_pending(current)) - break; - timeout = MAX_SCHEDULE_TIMEOUT; - if (sk_wait_event(sk, &timeout, !ctx->more, &wait)) { - err = 0; - break; - } - } - remove_wait_queue(sk_sleep(sk), &wait); - - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - return err; -} - -static void aead_data_wakeup(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - struct socket_wq *wq; - - if (!ctx->used) - return; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | - POLLRDNORM | - POLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); - rcu_read_unlock(); -} - static int aead_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); - 
struct aead_ctx *ctx = ask->private; struct aead_tfm *aeadc = pask->private; struct crypto_aead *tfm = aeadc->aead; unsigned int ivsize = crypto_aead_ivsize(tfm); - struct aead_tsgl *sgl; - struct af_alg_control con = {}; - long copied = 0; - bool enc = 0; - bool init = 0; - int err = 0; - - if (msg->msg_controllen) { - err = af_alg_cmsg_send(msg, &con); - if (err) - return err; - - init = 1; - switch (con.op) { - case ALG_OP_ENCRYPT: - enc = 1; - break; - case ALG_OP_DECRYPT: - enc = 0; - break; - default: - return -EINVAL; - } - - if (con.iv && con.iv->ivlen != ivsize) - return -EINVAL; - } - - lock_sock(sk); - if (!ctx->more && ctx->used) { - err = -EINVAL; - goto unlock; - } - - if (init) { - ctx->enc = enc; - if (con.iv) - memcpy(ctx->iv, con.iv->iv, ivsize); - - ctx->aead_assoclen = con.aead_assoclen; - } - - while (size) { - struct scatterlist *sg; - size_t len = size; - size_t plen; - - /* use the existing memory in an allocated page */ - if (ctx->merge) { - sgl = list_entry(ctx->tsgl_list.prev, - struct aead_tsgl, list); - sg = sgl->sg + sgl->cur - 1; - len = min_t(unsigned long, len, - PAGE_SIZE - sg->offset - sg->length); - err = memcpy_from_msg(page_address(sg_page(sg)) + - sg->offset + sg->length, - msg, len); - if (err) - goto unlock; - - sg->length += len; - ctx->merge = (sg->offset + sg->length) & - (PAGE_SIZE - 1); - - ctx->used += len; - copied += len; - size -= len; - continue; - } - - if (!aead_writable(sk)) { - err = aead_wait_for_wmem(sk, msg->msg_flags); - if (err) - goto unlock; - } - - /* allocate a new page */ - len = min_t(unsigned long, size, aead_sndbuf(sk)); - - err = aead_alloc_tsgl(sk); - if (err) - goto unlock; - - sgl = list_entry(ctx->tsgl_list.prev, struct aead_tsgl, - list); - sg = sgl->sg; - if (sgl->cur) - sg_unmark_end(sg + sgl->cur - 1); - - do { - unsigned int i = sgl->cur; - - plen = min_t(size_t, len, PAGE_SIZE); - - sg_assign_page(sg + i, alloc_page(GFP_KERNEL)); - if (!sg_page(sg + i)) { - err = -ENOMEM; - goto unlock; - } - - err = memcpy_from_msg(page_address(sg_page(sg + i)), - msg, plen); - if (err) { - __free_page(sg_page(sg + i)); - sg_assign_page(sg + i, NULL); - goto unlock; - } - - sg[i].length = plen; - len -= plen; - ctx->used += plen; - copied += plen; - size -= plen; - sgl->cur++; - } while (len && sgl->cur < MAX_SGL_ENTS); - - if (!size) - sg_mark_end(sg + sgl->cur - 1); - - ctx->merge = plen & (PAGE_SIZE - 1); - } - - err = 0; - - ctx->more = msg->msg_flags & MSG_MORE; - -unlock: - aead_data_wakeup(sk); - release_sock(sk); - - return err ?: copied; -} - -static ssize_t aead_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - struct sock *sk = sock->sk; - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - struct aead_tsgl *sgl; - int err = -EINVAL; - - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; - - lock_sock(sk); - if (!ctx->more && ctx->used) - goto unlock; - - if (!size) - goto done; - - if (!aead_writable(sk)) { - err = aead_wait_for_wmem(sk, flags); - if (err) - goto unlock; - } - - err = aead_alloc_tsgl(sk); - if (err) - goto unlock; - - ctx->merge = 0; - sgl = list_entry(ctx->tsgl_list.prev, struct aead_tsgl, list); - - if (sgl->cur) - sg_unmark_end(sgl->sg + sgl->cur - 1); - - sg_mark_end(sgl->sg + sgl->cur); - - get_page(page); - sg_set_page(sgl->sg + sgl->cur, page, size, offset); - sgl->cur++; - ctx->used += size; - - err = 0; -done: - ctx->more = flags & MSG_MORE; -unlock: - aead_data_wakeup(sk); - release_sock(sk); - - return err ?: size; -} - 
-static void aead_async_cb(struct crypto_async_request *_req, int err) -{ - struct aead_async_req *areq = _req->data; - struct sock *sk = areq->sk; - struct kiocb *iocb = areq->iocb; - unsigned int resultlen; - - lock_sock(sk); - - /* Buffer size written by crypto operation. */ - resultlen = areq->outlen; - - aead_free_areq_sgls(areq); - sock_kfree_s(sk, areq, areq->areqlen); - __sock_put(sk); - - iocb->ki_complete(iocb, err ? err : resultlen, 0); - - release_sock(sk); + return af_alg_sendmsg(sock, msg, size, ivsize); } static int crypto_aead_copy_sgl(struct crypto_skcipher *null_tfm, @@ -648,16 +97,13 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, struct alg_sock *ask = alg_sk(sk); struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); - struct aead_ctx *ctx = ask->private; + struct af_alg_ctx *ctx = ask->private; struct aead_tfm *aeadc = pask->private; struct crypto_aead *tfm = aeadc->aead; struct crypto_skcipher *null_tfm = aeadc->null_tfm; unsigned int as = crypto_aead_authsize(tfm); - unsigned int areqlen = - sizeof(struct aead_async_req) + crypto_aead_reqsize(tfm); - struct aead_async_req *areq; - struct aead_rsgl *last_rsgl = NULL; - struct aead_tsgl *tsgl; + struct af_alg_async_req *areq; + struct af_alg_tsgl *tsgl; struct scatterlist *src; int err = 0; size_t used = 0; /* [in] TX bufs to be en/decrypted */ @@ -703,61 +149,15 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, used -= ctx->aead_assoclen; /* Allocate cipher request for current operation. */ - areq = sock_kmalloc(sk, areqlen, GFP_KERNEL); - if (unlikely(!areq)) - return -ENOMEM; - areq->areqlen = areqlen; - areq->sk = sk; - INIT_LIST_HEAD(&areq->rsgl_list); - areq->tsgl = NULL; - areq->tsgl_entries = 0; + areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) + + crypto_aead_reqsize(tfm)); + if (IS_ERR(areq)) + return PTR_ERR(areq); /* convert iovecs of output buffers into RX SGL */ - while (outlen > usedpages && msg_data_left(msg)) { - struct aead_rsgl *rsgl; - size_t seglen; - - /* limit the amount of readable buffers */ - if (!aead_readable(sk)) - break; - - if (!ctx->used) { - err = aead_wait_for_data(sk, flags); - if (err) - goto free; - } - - seglen = min_t(size_t, (outlen - usedpages), - msg_data_left(msg)); - - if (list_empty(&areq->rsgl_list)) { - rsgl = &areq->first_rsgl; - } else { - rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL); - if (unlikely(!rsgl)) { - err = -ENOMEM; - goto free; - } - } - - rsgl->sgl.npages = 0; - list_add_tail(&rsgl->list, &areq->rsgl_list); - - /* make one iovec available as scatterlist */ - err = af_alg_make_sg(&rsgl->sgl, &msg->msg_iter, seglen); - if (err < 0) - goto free; - - /* chain the new scatterlist with previous one */ - if (last_rsgl) - af_alg_link_sg(&last_rsgl->sgl, &rsgl->sgl); - - last_rsgl = rsgl; - usedpages += err; - ctx->rcvused += err; - rsgl->sg_num_bytes = err; - iov_iter_advance(&msg->msg_iter, err); - } + err = af_alg_get_rsgl(sk, msg, flags, areq, outlen, &usedpages); + if (err) + goto free; /* * Ensure output buffer is sufficiently large. 
If the caller provides @@ -778,7 +178,7 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, } processed = used + ctx->aead_assoclen; - tsgl = list_first_entry(&ctx->tsgl_list, struct aead_tsgl, list); + tsgl = list_first_entry(&ctx->tsgl_list, struct af_alg_tsgl, list); /* * Copy of AAD from source to destination @@ -811,7 +211,7 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, areq->first_rsgl.sgl.sg, processed); if (err) goto free; - aead_pull_tsgl(sk, processed, NULL, 0); + af_alg_pull_tsgl(sk, processed, NULL, 0); } else { /* * Decryption operation - To achieve an in-place cipher @@ -831,8 +231,8 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, goto free; /* Create TX SGL for tag and chain it to RX SGL. */ - areq->tsgl_entries = aead_count_tsgl(sk, processed, - processed - as); + areq->tsgl_entries = af_alg_count_tsgl(sk, processed, + processed - as); if (!areq->tsgl_entries) areq->tsgl_entries = 1; areq->tsgl = sock_kmalloc(sk, sizeof(*areq->tsgl) * @@ -845,12 +245,12 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, sg_init_table(areq->tsgl, areq->tsgl_entries); /* Release TX SGL, except for tag data and reassign tag data. */ - aead_pull_tsgl(sk, processed, areq->tsgl, processed - as); + af_alg_pull_tsgl(sk, processed, areq->tsgl, processed - as); /* chain the areq TX SGL holding the tag with RX SGL */ - if (last_rsgl) { + if (usedpages) { /* RX SGL present */ - struct af_alg_sgl *sgl_prev = &last_rsgl->sgl; + struct af_alg_sgl *sgl_prev = &areq->last_rsgl->sgl; sg_unmark_end(sgl_prev->sg + sgl_prev->npages - 1); sg_chain(sgl_prev->sg, sgl_prev->npages + 1, @@ -861,28 +261,28 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, } /* Initialize the crypto operation */ - aead_request_set_crypt(&areq->aead_req, src, + aead_request_set_crypt(&areq->cra_u.aead_req, src, areq->first_rsgl.sgl.sg, used, ctx->iv); - aead_request_set_ad(&areq->aead_req, ctx->aead_assoclen); - aead_request_set_tfm(&areq->aead_req, tfm); + aead_request_set_ad(&areq->cra_u.aead_req, ctx->aead_assoclen); + aead_request_set_tfm(&areq->cra_u.aead_req, tfm); if (msg->msg_iocb && !is_sync_kiocb(msg->msg_iocb)) { /* AIO operation */ areq->iocb = msg->msg_iocb; - aead_request_set_callback(&areq->aead_req, + aead_request_set_callback(&areq->cra_u.aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, - aead_async_cb, areq); - err = ctx->enc ? crypto_aead_encrypt(&areq->aead_req) : - crypto_aead_decrypt(&areq->aead_req); + af_alg_async_cb, areq); + err = ctx->enc ? crypto_aead_encrypt(&areq->cra_u.aead_req) : + crypto_aead_decrypt(&areq->cra_u.aead_req); } else { /* Synchronous operation */ - aead_request_set_callback(&areq->aead_req, + aead_request_set_callback(&areq->cra_u.aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, af_alg_complete, &ctx->completion); err = af_alg_wait_for_completion(ctx->enc ? - crypto_aead_encrypt(&areq->aead_req) : - crypto_aead_decrypt(&areq->aead_req), - &ctx->completion); + crypto_aead_encrypt(&areq->cra_u.aead_req) : + crypto_aead_decrypt(&areq->cra_u.aead_req), + &ctx->completion); } /* AIO operation in progress */ @@ -896,9 +296,8 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg, } free: - aead_free_areq_sgls(areq); - if (areq) - sock_kfree_s(sk, areq, areqlen); + af_alg_free_areq_sgls(areq); + sock_kfree_s(sk, areq, areq->areqlen); return err ? 
err : outlen; } @@ -931,31 +330,11 @@ static int aead_recvmsg(struct socket *sock, struct msghdr *msg, } out: - aead_wmem_wakeup(sk); + af_alg_wmem_wakeup(sk); release_sock(sk); return ret; } -static unsigned int aead_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; - unsigned int mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; - - if (!ctx->more) - mask |= POLLIN | POLLRDNORM; - - if (aead_writable(sk)) - mask |= POLLOUT | POLLWRNORM | POLLWRBAND; - - return mask; -} - static struct proto_ops algif_aead_ops = { .family = PF_ALG, @@ -973,9 +352,9 @@ static struct proto_ops algif_aead_ops = { .release = af_alg_release, .sendmsg = aead_sendmsg, - .sendpage = aead_sendpage, + .sendpage = af_alg_sendpage, .recvmsg = aead_recvmsg, - .poll = aead_poll, + .poll = af_alg_poll, }; static int aead_check_key(struct socket *sock) @@ -1037,7 +416,7 @@ static ssize_t aead_sendpage_nokey(struct socket *sock, struct page *page, if (err) return err; - return aead_sendpage(sock, page, offset, size, flags); + return af_alg_sendpage(sock, page, offset, size, flags); } static int aead_recvmsg_nokey(struct socket *sock, struct msghdr *msg, @@ -1071,7 +450,7 @@ static struct proto_ops algif_aead_ops_nokey = { .sendmsg = aead_sendmsg_nokey, .sendpage = aead_sendpage_nokey, .recvmsg = aead_recvmsg_nokey, - .poll = aead_poll, + .poll = af_alg_poll, }; static void *aead_bind(const char *name, u32 type, u32 mask) @@ -1132,14 +511,14 @@ static int aead_setkey(void *private, const u8 *key, unsigned int keylen) static void aead_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); - struct aead_ctx *ctx = ask->private; + struct af_alg_ctx *ctx = ask->private; struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); struct aead_tfm *aeadc = pask->private; struct crypto_aead *tfm = aeadc->aead; unsigned int ivlen = crypto_aead_ivsize(tfm); - aead_pull_tsgl(sk, ctx->used, NULL, 0); + af_alg_pull_tsgl(sk, ctx->used, NULL, 0); crypto_put_default_null_skcipher2(); sock_kzfree_s(sk, ctx->iv, ivlen); sock_kfree_s(sk, ctx, ctx->len); @@ -1148,7 +527,7 @@ static void aead_sock_destruct(struct sock *sk) static int aead_accept_parent_nokey(void *private, struct sock *sk) { - struct aead_ctx *ctx; + struct af_alg_ctx *ctx; struct alg_sock *ask = alg_sk(sk); struct aead_tfm *tfm = private; struct crypto_aead *aead = tfm->aead; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index ce3b5fba2279..8ae4170aaeb4 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -33,320 +33,16 @@ #include #include #include -#include #include #include #include #include -struct skcipher_tsgl { - struct list_head list; - int cur; - struct scatterlist sg[0]; -}; - -struct skcipher_rsgl { - struct af_alg_sgl sgl; - struct list_head list; - size_t sg_num_bytes; -}; - -struct skcipher_async_req { - struct kiocb *iocb; - struct sock *sk; - - struct skcipher_rsgl first_sgl; - struct list_head rsgl_list; - - struct scatterlist *tsgl; - unsigned int tsgl_entries; - - unsigned int areqlen; - struct skcipher_request req; -}; - struct skcipher_tfm { struct crypto_skcipher *skcipher; bool has_key; }; -struct skcipher_ctx { - struct list_head tsgl_list; - - void *iv; - - struct af_alg_completion completion; - - size_t used; - size_t rcvused; - - bool more; - bool merge; - bool enc; - - unsigned int len; -}; - -#define MAX_SGL_ENTS ((4096 - sizeof(struct skcipher_tsgl)) / \ - 
sizeof(struct scatterlist) - 1) - -static inline int skcipher_sndbuf(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - - return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) - - ctx->used, 0); -} - -static inline bool skcipher_writable(struct sock *sk) -{ - return PAGE_SIZE <= skcipher_sndbuf(sk); -} - -static inline int skcipher_rcvbuf(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - - return max_t(int, max_t(int, sk->sk_rcvbuf & PAGE_MASK, PAGE_SIZE) - - ctx->rcvused, 0); -} - -static inline bool skcipher_readable(struct sock *sk) -{ - return PAGE_SIZE <= skcipher_rcvbuf(sk); -} - -static int skcipher_alloc_tsgl(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - struct skcipher_tsgl *sgl; - struct scatterlist *sg = NULL; - - sgl = list_entry(ctx->tsgl_list.prev, struct skcipher_tsgl, list); - if (!list_empty(&ctx->tsgl_list)) - sg = sgl->sg; - - if (!sg || sgl->cur >= MAX_SGL_ENTS) { - sgl = sock_kmalloc(sk, sizeof(*sgl) + - sizeof(sgl->sg[0]) * (MAX_SGL_ENTS + 1), - GFP_KERNEL); - if (!sgl) - return -ENOMEM; - - sg_init_table(sgl->sg, MAX_SGL_ENTS + 1); - sgl->cur = 0; - - if (sg) - sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg); - - list_add_tail(&sgl->list, &ctx->tsgl_list); - } - - return 0; -} - -static unsigned int skcipher_count_tsgl(struct sock *sk, size_t bytes) -{ - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - struct skcipher_tsgl *sgl, *tmp; - unsigned int i; - unsigned int sgl_count = 0; - - if (!bytes) - return 0; - - list_for_each_entry_safe(sgl, tmp, &ctx->tsgl_list, list) { - struct scatterlist *sg = sgl->sg; - - for (i = 0; i < sgl->cur; i++) { - sgl_count++; - if (sg[i].length >= bytes) - return sgl_count; - - bytes -= sg[i].length; - } - } - - return sgl_count; -} - -static void skcipher_pull_tsgl(struct sock *sk, size_t used, - struct scatterlist *dst) -{ - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - struct skcipher_tsgl *sgl; - struct scatterlist *sg; - unsigned int i; - - while (!list_empty(&ctx->tsgl_list)) { - sgl = list_first_entry(&ctx->tsgl_list, struct skcipher_tsgl, - list); - sg = sgl->sg; - - for (i = 0; i < sgl->cur; i++) { - size_t plen = min_t(size_t, used, sg[i].length); - struct page *page = sg_page(sg + i); - - if (!page) - continue; - - /* - * Assumption: caller created skcipher_count_tsgl(len) - * SG entries in dst. 
- */ - if (dst) - sg_set_page(dst + i, page, plen, sg[i].offset); - - sg[i].length -= plen; - sg[i].offset += plen; - - used -= plen; - ctx->used -= plen; - - if (sg[i].length) - return; - - if (!dst) - put_page(page); - sg_assign_page(sg + i, NULL); - } - - list_del(&sgl->list); - sock_kfree_s(sk, sgl, sizeof(*sgl) + sizeof(sgl->sg[0]) * - (MAX_SGL_ENTS + 1)); - } - - if (!ctx->used) - ctx->merge = 0; -} - -static void skcipher_free_areq_sgls(struct skcipher_async_req *areq) -{ - struct sock *sk = areq->sk; - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - struct skcipher_rsgl *rsgl, *tmp; - struct scatterlist *tsgl; - struct scatterlist *sg; - unsigned int i; - - list_for_each_entry_safe(rsgl, tmp, &areq->rsgl_list, list) { - ctx->rcvused -= rsgl->sg_num_bytes; - af_alg_free_sg(&rsgl->sgl); - list_del(&rsgl->list); - if (rsgl != &areq->first_sgl) - sock_kfree_s(sk, rsgl, sizeof(*rsgl)); - } - - tsgl = areq->tsgl; - for_each_sg(tsgl, sg, areq->tsgl_entries, i) { - if (!sg_page(sg)) - continue; - put_page(sg_page(sg)); - } - - if (areq->tsgl && areq->tsgl_entries) - sock_kfree_s(sk, tsgl, areq->tsgl_entries * sizeof(*tsgl)); -} - -static int skcipher_wait_for_wmem(struct sock *sk, unsigned flags) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int err = -ERESTARTSYS; - long timeout; - - if (flags & MSG_DONTWAIT) - return -EAGAIN; - - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - add_wait_queue(sk_sleep(sk), &wait); - for (;;) { - if (signal_pending(current)) - break; - timeout = MAX_SCHEDULE_TIMEOUT; - if (sk_wait_event(sk, &timeout, skcipher_writable(sk), &wait)) { - err = 0; - break; - } - } - remove_wait_queue(sk_sleep(sk), &wait); - - return err; -} - -static void skcipher_wmem_wakeup(struct sock *sk) -{ - struct socket_wq *wq; - - if (!skcipher_writable(sk)) - return; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLIN | - POLLRDNORM | - POLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); - rcu_read_unlock(); -} - -static int skcipher_wait_for_data(struct sock *sk, unsigned flags) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - long timeout; - int err = -ERESTARTSYS; - - if (flags & MSG_DONTWAIT) { - return -EAGAIN; - } - - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - add_wait_queue(sk_sleep(sk), &wait); - for (;;) { - if (signal_pending(current)) - break; - timeout = MAX_SCHEDULE_TIMEOUT; - if (sk_wait_event(sk, &timeout, ctx->used, &wait)) { - err = 0; - break; - } - } - remove_wait_queue(sk_sleep(sk), &wait); - - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - return err; -} - -static void skcipher_data_wakeup(struct sock *sk) -{ - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - struct socket_wq *wq; - - if (!ctx->used) - return; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | - POLLRDNORM | - POLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); - rcu_read_unlock(); -} - static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { @@ -354,208 +50,11 @@ static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg, struct alg_sock *ask = alg_sk(sk); struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); - struct skcipher_ctx *ctx = ask->private; struct skcipher_tfm *skc = pask->private; struct crypto_skcipher 
*tfm = skc->skcipher; unsigned ivsize = crypto_skcipher_ivsize(tfm); - struct skcipher_tsgl *sgl; - struct af_alg_control con = {}; - long copied = 0; - bool enc = 0; - bool init = 0; - int err; - int i; - - if (msg->msg_controllen) { - err = af_alg_cmsg_send(msg, &con); - if (err) - return err; - - init = 1; - switch (con.op) { - case ALG_OP_ENCRYPT: - enc = 1; - break; - case ALG_OP_DECRYPT: - enc = 0; - break; - default: - return -EINVAL; - } - - if (con.iv && con.iv->ivlen != ivsize) - return -EINVAL; - } - - err = -EINVAL; - - lock_sock(sk); - if (!ctx->more && ctx->used) - goto unlock; - - if (init) { - ctx->enc = enc; - if (con.iv) - memcpy(ctx->iv, con.iv->iv, ivsize); - } - - while (size) { - struct scatterlist *sg; - unsigned long len = size; - size_t plen; - - if (ctx->merge) { - sgl = list_entry(ctx->tsgl_list.prev, - struct skcipher_tsgl, list); - sg = sgl->sg + sgl->cur - 1; - len = min_t(unsigned long, len, - PAGE_SIZE - sg->offset - sg->length); - - err = memcpy_from_msg(page_address(sg_page(sg)) + - sg->offset + sg->length, - msg, len); - if (err) - goto unlock; - - sg->length += len; - ctx->merge = (sg->offset + sg->length) & - (PAGE_SIZE - 1); - - ctx->used += len; - copied += len; - size -= len; - continue; - } - - if (!skcipher_writable(sk)) { - err = skcipher_wait_for_wmem(sk, msg->msg_flags); - if (err) - goto unlock; - } - - len = min_t(unsigned long, len, skcipher_sndbuf(sk)); - - err = skcipher_alloc_tsgl(sk); - if (err) - goto unlock; - - sgl = list_entry(ctx->tsgl_list.prev, struct skcipher_tsgl, - list); - sg = sgl->sg; - if (sgl->cur) - sg_unmark_end(sg + sgl->cur - 1); - do { - i = sgl->cur; - plen = min_t(size_t, len, PAGE_SIZE); - - sg_assign_page(sg + i, alloc_page(GFP_KERNEL)); - err = -ENOMEM; - if (!sg_page(sg + i)) - goto unlock; - - err = memcpy_from_msg(page_address(sg_page(sg + i)), - msg, plen); - if (err) { - __free_page(sg_page(sg + i)); - sg_assign_page(sg + i, NULL); - goto unlock; - } - - sg[i].length = plen; - len -= plen; - ctx->used += plen; - copied += plen; - size -= plen; - sgl->cur++; - } while (len && sgl->cur < MAX_SGL_ENTS); - - if (!size) - sg_mark_end(sg + sgl->cur - 1); - - ctx->merge = plen & (PAGE_SIZE - 1); - } - - err = 0; - - ctx->more = msg->msg_flags & MSG_MORE; - -unlock: - skcipher_data_wakeup(sk); - release_sock(sk); - return copied ?: err; -} - -static ssize_t skcipher_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - struct sock *sk = sock->sk; - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - struct skcipher_tsgl *sgl; - int err = -EINVAL; - - if (flags & MSG_SENDPAGE_NOTLAST) - flags |= MSG_MORE; - - lock_sock(sk); - if (!ctx->more && ctx->used) - goto unlock; - - if (!size) - goto done; - - if (!skcipher_writable(sk)) { - err = skcipher_wait_for_wmem(sk, flags); - if (err) - goto unlock; - } - - err = skcipher_alloc_tsgl(sk); - if (err) - goto unlock; - - ctx->merge = 0; - sgl = list_entry(ctx->tsgl_list.prev, struct skcipher_tsgl, list); - - if (sgl->cur) - sg_unmark_end(sgl->sg + sgl->cur - 1); - - sg_mark_end(sgl->sg + sgl->cur); - get_page(page); - sg_set_page(sgl->sg + sgl->cur, page, size, offset); - sgl->cur++; - ctx->used += size; - -done: - ctx->more = flags & MSG_MORE; - -unlock: - skcipher_data_wakeup(sk); - release_sock(sk); - - return err ?: size; -} - -static void skcipher_async_cb(struct crypto_async_request *req, int err) -{ - struct skcipher_async_req *areq = req->data; - struct sock *sk = areq->sk; - struct kiocb *iocb = 
areq->iocb; - unsigned int resultlen; - - lock_sock(sk); - - /* Buffer size written by crypto operation. */ - resultlen = areq->req.cryptlen; - - skcipher_free_areq_sgls(areq); - sock_kfree_s(sk, areq, areq->areqlen); - __sock_put(sk); - - iocb->ki_complete(iocb, err ? err : resultlen, 0); - - release_sock(sk); + return af_alg_sendmsg(sock, msg, size, ivsize); } static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, @@ -565,72 +64,24 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, struct alg_sock *ask = alg_sk(sk); struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); - struct skcipher_ctx *ctx = ask->private; + struct af_alg_ctx *ctx = ask->private; struct skcipher_tfm *skc = pask->private; struct crypto_skcipher *tfm = skc->skcipher; unsigned int bs = crypto_skcipher_blocksize(tfm); - unsigned int areqlen = sizeof(struct skcipher_async_req) + - crypto_skcipher_reqsize(tfm); - struct skcipher_async_req *areq; - struct skcipher_rsgl *last_rsgl = NULL; + struct af_alg_async_req *areq; int err = 0; size_t len = 0; /* Allocate cipher request for current operation. */ - areq = sock_kmalloc(sk, areqlen, GFP_KERNEL); - if (unlikely(!areq)) - return -ENOMEM; - areq->areqlen = areqlen; - areq->sk = sk; - INIT_LIST_HEAD(&areq->rsgl_list); - areq->tsgl = NULL; - areq->tsgl_entries = 0; + areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) + + crypto_skcipher_reqsize(tfm)); + if (IS_ERR(areq)) + return PTR_ERR(areq); /* convert iovecs of output buffers into RX SGL */ - while (msg_data_left(msg)) { - struct skcipher_rsgl *rsgl; - size_t seglen; - - /* limit the amount of readable buffers */ - if (!skcipher_readable(sk)) - break; - - if (!ctx->used) { - err = skcipher_wait_for_data(sk, flags); - if (err) - goto free; - } - - seglen = min_t(size_t, ctx->used, msg_data_left(msg)); - - if (list_empty(&areq->rsgl_list)) { - rsgl = &areq->first_sgl; - } else { - rsgl = sock_kmalloc(sk, sizeof(*rsgl), GFP_KERNEL); - if (!rsgl) { - err = -ENOMEM; - goto free; - } - } - - rsgl->sgl.npages = 0; - list_add_tail(&rsgl->list, &areq->rsgl_list); - - /* make one iovec available as scatterlist */ - err = af_alg_make_sg(&rsgl->sgl, &msg->msg_iter, seglen); - if (err < 0) - goto free; - - /* chain the new scatterlist with previous one */ - if (last_rsgl) - af_alg_link_sg(&last_rsgl->sgl, &rsgl->sgl); - - last_rsgl = rsgl; - len += err; - ctx->rcvused += err; - rsgl->sg_num_bytes = err; - iov_iter_advance(&msg->msg_iter, err); - } + err = af_alg_get_rsgl(sk, msg, flags, areq, -1, &len); + if (err) + goto free; /* Process only as much RX buffers for which we have TX data */ if (len > ctx->used) @@ -647,7 +98,7 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, * Create a per request TX SGL for this request which tracks the * SG entries from the global TX SGL. 
*/ - areq->tsgl_entries = skcipher_count_tsgl(sk, len); + areq->tsgl_entries = af_alg_count_tsgl(sk, len, 0); if (!areq->tsgl_entries) areq->tsgl_entries = 1; areq->tsgl = sock_kmalloc(sk, sizeof(*areq->tsgl) * areq->tsgl_entries, @@ -657,44 +108,48 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg, goto free; } sg_init_table(areq->tsgl, areq->tsgl_entries); - skcipher_pull_tsgl(sk, len, areq->tsgl); + af_alg_pull_tsgl(sk, len, areq->tsgl, 0); /* Initialize the crypto operation */ - skcipher_request_set_tfm(&areq->req, tfm); - skcipher_request_set_crypt(&areq->req, areq->tsgl, - areq->first_sgl.sgl.sg, len, ctx->iv); + skcipher_request_set_tfm(&areq->cra_u.skcipher_req, tfm); + skcipher_request_set_crypt(&areq->cra_u.skcipher_req, areq->tsgl, + areq->first_rsgl.sgl.sg, len, ctx->iv); if (msg->msg_iocb && !is_sync_kiocb(msg->msg_iocb)) { /* AIO operation */ areq->iocb = msg->msg_iocb; - skcipher_request_set_callback(&areq->req, + skcipher_request_set_callback(&areq->cra_u.skcipher_req, CRYPTO_TFM_REQ_MAY_SLEEP, - skcipher_async_cb, areq); - err = ctx->enc ? crypto_skcipher_encrypt(&areq->req) : - crypto_skcipher_decrypt(&areq->req); + af_alg_async_cb, areq); + err = ctx->enc ? + crypto_skcipher_encrypt(&areq->cra_u.skcipher_req) : + crypto_skcipher_decrypt(&areq->cra_u.skcipher_req); } else { /* Synchronous operation */ - skcipher_request_set_callback(&areq->req, + skcipher_request_set_callback(&areq->cra_u.skcipher_req, CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG, af_alg_complete, &ctx->completion); err = af_alg_wait_for_completion(ctx->enc ? - crypto_skcipher_encrypt(&areq->req) : - crypto_skcipher_decrypt(&areq->req), + crypto_skcipher_encrypt(&areq->cra_u.skcipher_req) : + crypto_skcipher_decrypt(&areq->cra_u.skcipher_req), &ctx->completion); } /* AIO operation in progress */ if (err == -EINPROGRESS) { sock_hold(sk); + + /* Remember output size that will be generated. */ + areq->outlen = len; + return -EIOCBQUEUED; } free: - skcipher_free_areq_sgls(areq); - if (areq) - sock_kfree_s(sk, areq, areqlen); + af_alg_free_areq_sgls(areq); + sock_kfree_s(sk, areq, areq->areqlen); return err ? 
err : len; } @@ -727,30 +182,11 @@ static int skcipher_recvmsg(struct socket *sock, struct msghdr *msg, } out: - skcipher_wmem_wakeup(sk); + af_alg_wmem_wakeup(sk); release_sock(sk); return ret; } -static unsigned int skcipher_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; - unsigned int mask; - - sock_poll_wait(file, sk_sleep(sk), wait); - mask = 0; - - if (ctx->used) - mask |= POLLIN | POLLRDNORM; - - if (skcipher_writable(sk)) - mask |= POLLOUT | POLLWRNORM | POLLWRBAND; - - return mask; -} static struct proto_ops algif_skcipher_ops = { .family = PF_ALG, @@ -769,9 +205,9 @@ static struct proto_ops algif_skcipher_ops = { .release = af_alg_release, .sendmsg = skcipher_sendmsg, - .sendpage = skcipher_sendpage, + .sendpage = af_alg_sendpage, .recvmsg = skcipher_recvmsg, - .poll = skcipher_poll, + .poll = af_alg_poll, }; static int skcipher_check_key(struct socket *sock) @@ -833,7 +269,7 @@ static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page, if (err) return err; - return skcipher_sendpage(sock, page, offset, size, flags); + return af_alg_sendpage(sock, page, offset, size, flags); } static int skcipher_recvmsg_nokey(struct socket *sock, struct msghdr *msg, @@ -867,7 +303,7 @@ static struct proto_ops algif_skcipher_ops_nokey = { .sendmsg = skcipher_sendmsg_nokey, .sendpage = skcipher_sendpage_nokey, .recvmsg = skcipher_recvmsg_nokey, - .poll = skcipher_poll, + .poll = af_alg_poll, }; static void *skcipher_bind(const char *name, u32 type, u32 mask) @@ -912,13 +348,13 @@ static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen) static void skcipher_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); - struct skcipher_ctx *ctx = ask->private; + struct af_alg_ctx *ctx = ask->private; struct sock *psk = ask->parent; struct alg_sock *pask = alg_sk(psk); struct skcipher_tfm *skc = pask->private; struct crypto_skcipher *tfm = skc->skcipher; - skcipher_pull_tsgl(sk, ctx->used, NULL); + af_alg_pull_tsgl(sk, ctx->used, NULL, 0); sock_kzfree_s(sk, ctx->iv, crypto_skcipher_ivsize(tfm)); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); @@ -926,7 +362,7 @@ static void skcipher_sock_destruct(struct sock *sk) static int skcipher_accept_parent_nokey(void *private, struct sock *sk) { - struct skcipher_ctx *ctx; + struct af_alg_ctx *ctx; struct alg_sock *ask = alg_sk(sk); struct skcipher_tfm *tfm = private; struct crypto_skcipher *skcipher = tfm->skcipher; diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index e2b9c6fe2714..75ec9c662268 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -20,6 +20,9 @@ #include #include +#include +#include + #define ALG_MAX_PAGES 16 struct crypto_async_request; @@ -68,6 +71,99 @@ struct af_alg_sgl { unsigned int npages; }; +/* TX SGL entry */ +struct af_alg_tsgl { + struct list_head list; + unsigned int cur; /* Last processed SG entry */ + struct scatterlist sg[0]; /* Array of SGs forming the SGL */ +}; + +#define MAX_SGL_ENTS ((4096 - sizeof(struct af_alg_tsgl)) / \ + sizeof(struct scatterlist) - 1) + +/* RX SGL entry */ +struct af_alg_rsgl { + struct af_alg_sgl sgl; + struct list_head list; + size_t sg_num_bytes; /* Bytes of data in that SGL */ +}; + +/** + * struct af_alg_async_req - definition of crypto request + * @iocb: IOCB for AIO operations + * @sk: Socket the request is associated with + * @first_rsgl: First RX SG + * @last_rsgl: Pointer 
to last RX SG + * @rsgl_list: Track RX SGs + * @tsgl: Private, per request TX SGL of buffers to process + * @tsgl_entries: Number of entries in priv. TX SGL + * @outlen: Number of output bytes generated by crypto op + * @areqlen: Length of this data structure + * @cra_u: Cipher request + */ +struct af_alg_async_req { + struct kiocb *iocb; + struct sock *sk; + + struct af_alg_rsgl first_rsgl; + struct af_alg_rsgl *last_rsgl; + struct list_head rsgl_list; + + struct scatterlist *tsgl; + unsigned int tsgl_entries; + + unsigned int outlen; + unsigned int areqlen; + + union { + struct aead_request aead_req; + struct skcipher_request skcipher_req; + } cra_u; + + /* req ctx trails this struct */ +}; + +/** + * struct af_alg_ctx - definition of the crypto context + * + * The crypto context tracks the input data during the lifetime of an AF_ALG + * socket. + * + * @tsgl_list: Link to TX SGL + * @iv: IV for cipher operation + * @aead_assoclen: Length of AAD for AEAD cipher operations + * @completion: Work queue for synchronous operation + * @used: TX bytes sent to kernel. This variable is used to + * ensure that user space cannot cause the kernel + * to allocate too much memory in sendmsg operation. + * @rcvused: Total RX bytes to be filled by kernel. This variable + * is used to ensure user space cannot cause the kernel + * to allocate too much memory in a recvmsg operation. + * @more: More data to be expected from user space? + * @merge: Shall new data from user space be merged into existing + * SG? + * @enc: Cryptographic operation to be performed when + * recvmsg is invoked. + * @len: Length of memory allocated for this data structure. + */ +struct af_alg_ctx { + struct list_head tsgl_list; + + void *iv; + size_t aead_assoclen; + + struct af_alg_completion completion; + + size_t used; + size_t rcvused; + + bool more; + bool merge; + bool enc; + + unsigned int len; +}; + int af_alg_register_type(const struct af_alg_type *type); int af_alg_unregister_type(const struct af_alg_type *type); @@ -94,4 +190,78 @@ static inline void af_alg_init_completion(struct af_alg_completion *completion) init_completion(&completion->completion); } +/** + * Size of available buffer for sending data from user space to kernel. + * + * @sk socket of connection to user space + * @return number of bytes still available + */ +static inline int af_alg_sndbuf(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + + return max_t(int, max_t(int, sk->sk_sndbuf & PAGE_MASK, PAGE_SIZE) - + ctx->used, 0); +} + +/** + * Can the send buffer still be written to? + * + * @sk socket of connection to user space + * @return true => writable, false => not writable + */ +static inline bool af_alg_writable(struct sock *sk) +{ + return PAGE_SIZE <= af_alg_sndbuf(sk); +} + +/** + * Size of available buffer used by kernel for the RX user space operation. + * + * @sk socket of connection to user space + * @return number of bytes still available + */ +static inline int af_alg_rcvbuf(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + struct af_alg_ctx *ctx = ask->private; + + return max_t(int, max_t(int, sk->sk_rcvbuf & PAGE_MASK, PAGE_SIZE) - + ctx->rcvused, 0); +} + +/** + * Can the RX buffer still be written to? 
+ * + * @sk socket of connection to user space + * @return true => writable, false => not writable + */ +static inline bool af_alg_readable(struct sock *sk) +{ + return PAGE_SIZE <= af_alg_rcvbuf(sk); +} + +int af_alg_alloc_tsgl(struct sock *sk); +unsigned int af_alg_count_tsgl(struct sock *sk, size_t bytes, size_t offset); +void af_alg_pull_tsgl(struct sock *sk, size_t used, struct scatterlist *dst, + size_t dst_offset); +void af_alg_free_areq_sgls(struct af_alg_async_req *areq); +int af_alg_wait_for_wmem(struct sock *sk, unsigned int flags); +void af_alg_wmem_wakeup(struct sock *sk); +int af_alg_wait_for_data(struct sock *sk, unsigned flags); +void af_alg_data_wakeup(struct sock *sk); +int af_alg_sendmsg(struct socket *sock, struct msghdr *msg, size_t size, + unsigned int ivsize); +ssize_t af_alg_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags); +void af_alg_async_cb(struct crypto_async_request *_req, int err); +unsigned int af_alg_poll(struct file *file, struct socket *sock, + poll_table *wait); +struct af_alg_async_req *af_alg_alloc_areq(struct sock *sk, + unsigned int areqlen); +int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags, + struct af_alg_async_req *areq, size_t maxsize, + size_t *outlen); + #endif /* _CRYPTO_IF_ALG_H */ -- cgit v1.2.3 From 9047811b776ce09ba06623dd2a846cc501f0065b Mon Sep 17 00:00:00 2001 From: "Ismail, Mustafa" Date: Wed, 28 Jun 2017 09:02:45 -0500 Subject: RDMA/core: Add wait/retry version of ibnl_unicast Add a wait/retry version of ibnl_unicast, ibnl_unicast_wait, and modify ibnl_unicast to not wait/retry. This eliminates the undesirable wait for future users of ibnl_unicast. Change Portmapper calls originating from kernel to user-space to use ibnl_unicast_wait and take advantage of the wait/retry logic in netlink_unicast. 
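For contrast, here is a minimal sketch of how a caller might choose between the two variants; the function and its arguments are hypothetical, but the flag semantics follow the netlink.c hunk below (MSG_DONTWAIT fails fast when the receiver's socket buffer is full, while 0 lets netlink_unicast wait with one retry):

/* Illustration only, not part of the patch.  In both variants the
 * netlink layer consumes the skb even on failure, as the iwpm
 * callers below note. */
static int notify_userspace(struct sk_buff *skb, struct nlmsghdr *nlh,
			    __u32 pid, bool may_sleep)
{
	if (may_sleep)
		/* may block until the receiver drains; one retry */
		return ibnl_unicast_wait(skb, nlh, pid);

	/* returns a negative error immediately if the receiver is
	 * congested */
	return ibnl_unicast(skb, nlh, pid);
}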
Signed-off-by: Mustafa Ismail Signed-off-by: Chien Tin Tung Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwpm_msg.c | 6 +++--- drivers/infiniband/core/netlink.c | 12 +++++++++++- include/rdma/rdma_netlink.h | 10 ++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index a0e7c16d8bd8..add99b92afdf 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -174,7 +174,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto add_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -251,7 +251,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto query_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -312,7 +312,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 94931c474d41..0fc50e15ae22 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -232,11 +232,21 @@ int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, { int err; - err = netlink_unicast(nls, skb, pid, 0); + err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(ibnl_unicast); +int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid) +{ + int err; + + err = netlink_unicast(nls, skb, pid, 0); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(ibnl_unicast_wait); + int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int group, gfp_t flags) { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 348c102cb5f6..5b1466770917 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -63,6 +63,16 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, __u32 pid); +/** + * Send, with wait/1 retry, the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @nlh: Header of the netlink message to send + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid); + /** * Send the supplied skb to a netlink group. * @skb: The netlink skb -- cgit v1.2.3 From 79bcd34ccfe009ad21d16100ae2aef9b378a512d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 1 Aug 2017 07:53:30 -0400 Subject: media: cec-funcs.h: cec_ops_report_features: set *dev_features to NULL gcc can get confused by this code and it thinks dev_features can be returned uninitialized. So initialize to NULL at the beginning to shut up the warning. 
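To make the caller-visible effect concrete, a small user-space sketch (cec-funcs.h is a UAPI header; print_features is hypothetical, the helper's signature is the real one):

#include <stdio.h>
#include <linux/cec-funcs.h>

static void print_features(const struct cec_msg *msg)
{
	__u8 cec_version, all_device_types;
	const __u8 *rc_profile, *dev_features;

	cec_ops_report_features(msg, &cec_version, &all_device_types,
				&rc_profile, &dev_features);

	/* With the fix, dev_features is reliably NULL whenever the
	 * message lacks a properly terminated feature list, instead of
	 * being left uninitialized on such input, which is what
	 * triggered the gcc warning. */
	if (dev_features)
		printf("device features: 0x%02x\n", *dev_features);
}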
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/cec-funcs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/cec-funcs.h b/include/uapi/linux/cec-funcs.h index c451eec42a83..270b251a3d9b 100644 --- a/include/uapi/linux/cec-funcs.h +++ b/include/uapi/linux/cec-funcs.h @@ -895,6 +895,7 @@ static inline void cec_ops_report_features(const struct cec_msg *msg, *cec_version = msg->msg[2]; *all_device_types = msg->msg[3]; *rc_profile = p; + *dev_features = NULL; while (p < &msg->msg[14] && (*p & CEC_OP_FEAT_EXT)) p++; if (!(*p & CEC_OP_FEAT_EXT)) { -- cgit v1.2.3 From ee0c503eacfb97e1d443baf5a03939201d0244f5 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 4 Aug 2017 06:41:51 -0400 Subject: media: media/cec.h: add CEC_CAP_DEFAULTS The CEC_CAP_LOG_ADDRS, CEC_CAP_TRANSMIT, CEC_CAP_PASSTHROUGH and CEC_CAP_RC capabilities are normally always present. Add a CEC_CAP_DEFAULTS define that ORs these four caps to simplify drivers. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/cec.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/media/cec.h b/include/media/cec.h index 224a6e225c52..1bec7bde4792 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -31,6 +31,9 @@ #include #include +#define CEC_CAP_DEFAULTS (CEC_CAP_LOG_ADDRS | CEC_CAP_TRANSMIT | \ + CEC_CAP_PASSTHROUGH | CEC_CAP_RC) + /** * struct cec_devnode - cec device node * @dev: cec device -- cgit v1.2.3 From af74030591c13331760fe66246338f64ded3f76f Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 2 Aug 2017 13:14:49 -0400 Subject: media: drv-intf: saa7146: constify pci_device_id pci_device_id are not supposed to change at runtime. All functions working with pci_device_id provided by work with const pci_device_id. So mark the non-const structs as const. So making 'pci_tbl' as const member of 'struct saa7146_extension'. Signed-off-by: Arvind Yadav Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/drv-intf/saa7146.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/drv-intf/saa7146.h b/include/media/drv-intf/saa7146.h index 96058a5a4acc..45294328614d 100644 --- a/include/media/drv-intf/saa7146.h +++ b/include/media/drv-intf/saa7146.h @@ -96,7 +96,7 @@ struct saa7146_extension supported devices, last entry 0xffff, 0xfff */ struct module *module; struct pci_driver driver; - struct pci_device_id *pci_tbl; + const struct pci_device_id *pci_tbl; /* extension functions */ int (*probe)(struct saa7146_dev *); -- cgit v1.2.3 From cb87481ee89dbd6609e227afbf64900fb4e5c930 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Jul 2017 22:46:27 +1000 Subject: kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured The .data and .bss sections were modified in the generic linker script to pull in sections named .data., which are generated by gcc with -ffunction-sections and -fdata-sections options. The problem with this pattern is it can also match section names that Linux defines explicitly, e.g., .data.unlikely. This can cause Linux sections to get moved into the wrong place. The way to avoid this is to use ".." separators for explicit section names (the dot character is valid in a section name but not a C identifier). However currently there are sections which don't follow this rule, so for now just disable the wild card by default. 
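The clash is easiest to see with a concrete example (hypothetical variables; the section names are the ones cited above and matched by the script):

/* With -fdata-sections, gcc derives the section name from the C
 * identifier; these are exactly what the .data wildcard must pull
 * in: */
static int my_counter;	/* placed in ".data.my_counter" */

/* An explicit kernel section using a single dot is matched by the
 * same ".data.[0-9a-zA-Z_]*" wildcard and can be moved to the wrong
 * place: */
static bool warned __attribute__((__section__(".data.unlikely")));

/* A ".." name begins with a character that cannot appear in a C
 * identifier, so the wildcard can never swallow it: */
static int hot __attribute__((__section__(".data..read_mostly")));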
Example: http://marc.info/?l=linux-arm-kernel&m=150106824024221&w=2 Cc: # 4.9 Fixes: b67067f1176df ("kbuild: allow archs to select link dead code/data elimination") Signed-off-by: Nicholas Piggin Signed-off-by: Masahiro Yamada --- include/asm-generic/vmlinux.lds.h | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index da0be9a8d1de..9623d78f8494 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -59,6 +59,22 @@ /* Align . to a 8 byte boundary equals to maximum function alignment. */ #define ALIGN_FUNCTION() . = ALIGN(8) +/* + * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which + * generates .data.identifier sections, which need to be pulled in with + * .data. We don't want to pull in .data..other sections, which Linux + * has defined. Same for text and bss. + */ +#ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION +#define TEXT_MAIN .text .text.[0-9a-zA-Z_]* +#define DATA_MAIN .data .data.[0-9a-zA-Z_]* +#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* +#else +#define TEXT_MAIN .text +#define DATA_MAIN .data +#define BSS_MAIN .bss +#endif + /* * Align to a 32 byte boundary equal to the * alignment gcc 4.5 uses for a struct @@ -198,12 +214,9 @@ /* * .data section - * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections generates - * .data.identifier which needs to be pulled in with .data, but don't want to - * pull in .data..stuff which has its own requirements. Same for bss. */ #define DATA_DATA \ - *(.data .data.[0-9a-zA-Z_]*) \ + *(DATA_MAIN) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ MEM_KEEP(init.data) \ @@ -434,16 +447,17 @@ VMLINUX_SYMBOL(__security_initcall_end) = .; \ } -/* .text section. Map to function alignment to avoid address changes +/* + * .text section. Map to function alignment to avoid address changes * during second ld run in second ld pass when generating System.map - * LD_DEAD_CODE_DATA_ELIMINATION option enables -ffunction-sections generates - * .text.identifier which needs to be pulled in with .text , but some - * architectures define .text.foo which is not intended to be pulled in here. - * Those enabling LD_DEAD_CODE_DATA_ELIMINATION must ensure they don't have - * conflicting section names, and must pull in .text.[0-9a-zA-Z_]* */ + * + * TEXT_MAIN here will match .text.fixup and .text.unlikely if dead + * code elimination is enabled, so these sections should be converted + * to use ".." first. + */ #define TEXT_TEXT \ ALIGN_FUNCTION(); \ - *(.text.hot .text .text.fixup .text.unlikely) \ + *(.text.hot TEXT_MAIN .text.fixup .text.unlikely) \ *(.ref.text) \ MEM_KEEP(init.text) \ MEM_KEEP(exit.text) \ @@ -613,7 +627,7 @@ BSS_FIRST_SECTIONS \ *(.bss..page_aligned) \ *(.dynbss) \ - *(.bss .bss.[0-9a-zA-Z_]*) \ + *(BSS_MAIN) \ *(COMMON) \ } -- cgit v1.2.3 From 6ec4e2514decd6fb4782a9364fa71d6244d05af4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 13 Jul 2017 18:16:01 +0100 Subject: md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. 
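For reference, the byte-by-byte recovery being superseded is the scalar loop quoted as a comment in __raid6_2data_recov_neon() below; pbmul and qmul are GF(2^8) multiplication lookup tables selected per failure pair, and the NEON version performs sixteen of these byte steps per loop iteration:

/* Scalar reference for the two-failed-disks case (mirrors the
 * comment kept in recov_neon_inner.c below); p/q are the parity
 * blocks, dp/dq the two blocks being rebuilt. */
while (bytes--) {
	uint8_t px, qx, db;

	px = *p ^ *dp;			/* delta P */
	qx = qmul[*q ^ *dq];		/* qmul * delta Q */
	*dq++ = db = pbmul[px] ^ qx;	/* second recovered block */
	*dp++ = db ^ px;		/* first recovered block */
	p++;
	q++;
}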
Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- include/linux/raid/pq.h | 1 + lib/raid6/Makefile | 4 +- lib/raid6/algos.c | 3 ++ lib/raid6/recov_neon.c | 110 ++++++++++++++++++++++++++++++++++++++++ lib/raid6/recov_neon_inner.c | 117 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 lib/raid6/recov_neon.c create mode 100644 lib/raid6/recov_neon_inner.c (limited to 'include') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 30f945329818..583cdd3d49ca 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -121,6 +121,7 @@ extern const struct raid6_recov_calls raid6_recov_ssse3; extern const struct raid6_recov_calls raid6_recov_avx2; extern const struct raid6_recov_calls raid6_recov_avx512; extern const struct raid6_recov_calls raid6_recov_s390xc; +extern const struct raid6_recov_calls raid6_recov_neon; extern const struct raid6_calls raid6_neonx1; extern const struct raid6_calls raid6_neonx2; diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 3057011f5599..a93adf6dcfb2 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -5,7 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o -raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o @@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding ifeq ($(ARCH),arm) NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon endif +CFLAGS_recov_neon_inner.o += $(NEON_FLAGS) ifeq ($(ARCH),arm64) +CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 7857049fd7d3..476994723258 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -112,6 +112,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #endif #ifdef CONFIG_S390 &raid6_recov_s390xc, +#endif +#if defined(CONFIG_KERNEL_MODE_NEON) + &raid6_recov_neon, #endif &raid6_recov_intx1, NULL diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c new file mode 100644 index 000000000000..eeb5c4065b92 --- /dev/null +++ b/lib/raid6/recov_neon.c @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include <linux/raid/pq.h> + +#ifdef __KERNEL__ +#include <asm/neon.h> +#else +#define kernel_neon_begin() +#define kernel_neon_end() +#define cpu_has_neon() (1) +#endif + +static int raid6_has_neon(void) +{ + return cpu_has_neon(); +} + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + +static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_neon_begin(); + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); + kernel_neon_end(); +} + +static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_neon_begin(); + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); + kernel_neon_end(); +} + +const struct raid6_recov_calls raid6_recov_neon = { + .data2 = raid6_2data_recov_neon, + .datap = raid6_datap_recov_neon, + .valid = raid6_has_neon, + .name = "neon", + .priority = 10, +}; diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c new file mode 100644 index 000000000000..8cd20c9f834a --- /dev/null +++ b/lib/raid6/recov_neon_inner.c @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <arm_neon.h> + +static const uint8x16_t x0f = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, +}; + +#ifdef CONFIG_ARM +/* + * AArch32 does not provide this intrinsic natively because it does not + * implement the underlying instruction. AArch32 only provides a 64-bit + * wide vtbl.8 instruction, so use that instead.
+ */ +static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), + vtbl2_u8(__a.pair, vget_high_u8(b))); +} +#endif + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul) +{ + uint8x16_t pm0 = vld1q_u8(pbmul); + uint8x16_t pm1 = vld1q_u8(pbmul + 16); + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy, px, qx, db; + + px = veorq_u8(vld1q_u8(p), vld1q_u8(dp)); + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); + qx = veorq_u8(vx, vy); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4); + vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f)); + vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f)); + vx = veorq_u8(vx, vy); + db = veorq_u8(vx, qx); + + vst1q_u8(dq, db); + vst1q_u8(dp, veorq_u8(db, px)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul) +{ + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy; + + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f)); + vx = veorq_u8(vx, vy); + vy = veorq_u8(vx, vld1q_u8(p)); + + vst1q_u8(dq, vx); + vst1q_u8(p, vy); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} -- cgit v1.2.3 From d62e26b3ffd28f16ddae85a1babd0303a1a6dfb6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Jun 2017 21:55:08 -0600 Subject: block: pass in queue to inflight accounting No functional change in this patch, just in preparation for basing the inflight mechanism on the queue in question. 
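With the queue passed through, a bio-based driver's call sites look roughly like this (a minimal sketch against the new signatures shown in this patch; mydev and its queuedata layout are illustrative, not part of the patch):

        static blk_qc_t mydev_make_request(struct request_queue *q, struct bio *bio)
        {
                struct mydev *mydev = q->queuedata;     /* illustrative driver state */
                unsigned long start_time = jiffies;

                generic_start_io_acct(q, bio_data_dir(bio), bio_sectors(bio),
                                      &mydev->disk->part0);
                /* ... submit and complete the transfer ... */
                generic_end_io_acct(q, bio_data_dir(bio), &mydev->disk->part0,
                                    start_time);
                return BLK_QC_T_NONE;
        }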
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/bio.c | 16 ++++++++-------- block/blk-core.c | 22 ++++++++++++---------- block/blk-merge.c | 4 ++-- block/genhd.c | 4 ++-- block/partition-generic.c | 5 +++-- drivers/block/drbd/drbd_req.c | 10 +++++++--- drivers/block/rsxx/dev.c | 6 +++--- drivers/block/zram/zram_drv.c | 5 +++-- drivers/md/bcache/request.c | 7 ++++--- drivers/md/dm.c | 6 +++--- drivers/nvdimm/nd.h | 5 +++-- include/linux/bio.h | 9 +++++---- include/linux/genhd.h | 14 +++++++++----- 13 files changed, 64 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/block/bio.c b/block/bio.c index e241bbc49f14..ecd1a9c7a301 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1736,29 +1736,29 @@ void bio_check_pages_dirty(struct bio *bio) } } -void generic_start_io_acct(int rw, unsigned long sectors, - struct hd_struct *part) +void generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part) { int cpu = part_stat_lock(); - part_round_stats(cpu, part); + part_round_stats(q, cpu, part); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, sectors[rw], sectors); - part_inc_in_flight(part, rw); + part_inc_in_flight(q, part, rw); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(int rw, struct hd_struct *part, - unsigned long start_time) +void generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; int cpu = part_stat_lock(); part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_round_stats(q, cpu, part); + part_dec_in_flight(q, part, rw); part_stat_unlock(); } diff --git a/block/blk-core.c b/block/blk-core.c index dbecbf4a64e0..8ee954c12e9d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1469,15 +1469,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, __elv_add_request(q, rq, where); } -static void part_round_stats_single(int cpu, struct hd_struct *part, - unsigned long now) +static void part_round_stats_single(struct request_queue *q, int cpu, + struct hd_struct *part, unsigned long now) { int inflight; if (now == part->stamp) return; - inflight = part_in_flight(part); + inflight = part_in_flight(q, part); if (inflight) { __part_stat_add(cpu, part, time_in_queue, inflight * (now - part->stamp)); @@ -1488,6 +1488,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, /** * part_round_stats() - Round off the performance stats on a struct disk_stats. + * @q: target block queue * @cpu: cpu number for stats access * @part: target partition * @@ -1502,13 +1503,14 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, * /proc/diskstats. This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. 
*/ -void part_round_stats(int cpu, struct hd_struct *part) +void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) { unsigned long now = jiffies; if (part->partno) - part_round_stats_single(cpu, &part_to_disk(part)->part0, now); - part_round_stats_single(cpu, part, now); + part_round_stats_single(q, cpu, &part_to_disk(part)->part0, + now); + part_round_stats_single(q, cpu, part, now); } EXPORT_SYMBOL_GPL(part_round_stats); @@ -2431,8 +2433,8 @@ void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part, rw); + part_round_stats(req->q, cpu, part); + part_dec_in_flight(req->q, part, rw); hd_struct_put(part); part_stat_unlock(); @@ -2489,8 +2491,8 @@ void blk_account_io_start(struct request *rq, bool new_io) part = &rq->rq_disk->part0; hd_struct_get(part); } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); + part_round_stats(rq->q, cpu, part); + part_inc_in_flight(rq->q, part, rw); rq->part = part; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 99038830fb42..05f116bfb99d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -633,8 +633,8 @@ static void blk_account_io_merge(struct request *req) cpu = part_stat_lock(); part = req->part; - part_round_stats(cpu, part); - part_dec_in_flight(part, rq_data_dir(req)); + part_round_stats(req->q, cpu, part); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); diff --git a/block/genhd.c b/block/genhd.c index 7f520fa25d16..f735af67a0c9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1217,7 +1217,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { cpu = part_stat_lock(); - part_round_stats(cpu, hd); + part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); seq_printf(seqf, "%4d %7d %s %lu %lu %lu " "%u %lu %lu %lu %u %u %u %u\n", @@ -1231,7 +1231,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[WRITE]), part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), - part_in_flight(hd), + part_in_flight(gp->queue, hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); diff --git a/block/partition-generic.c b/block/partition-generic.c index c5ec8246e25e..d1bdd61e1d71 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -112,10 +112,11 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = dev_to_disk(dev)->queue; int cpu; cpu = part_stat_lock(); - part_round_stats(cpu, p); + part_round_stats(q, cpu, p); part_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -130,7 +131,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(p), + part_in_flight(q, p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index f6e865b2d543..8d6b5d137b5e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -36,14 +36,18 @@ static bool drbd_may_do_local_read(struct drbd_device *device, 
sector_t sector, /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) { - generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, - &device->vdisk->part0); + struct request_queue *q = device->rq_queue; + + generic_start_io_acct(q, bio_data_dir(req->master_bio), + req->i.size >> 9, &device->vdisk->part0); } /* Update disk stats when completing request upwards */ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { - generic_end_io_acct(bio_data_dir(req->master_bio), + struct request_queue *q = device->rq_queue; + + generic_end_io_acct(q, bio_data_dir(req->master_bio), &device->vdisk->part0, req->start_jif); } diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 7f4acebf4657..e397d3ee7308 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,7 +112,7 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio), + generic_start_io_acct(card->queue, bio_data_dir(bio), bio_sectors(bio), &card->gendisk->part0); } @@ -120,8 +120,8 @@ static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0, - start_time); + generic_end_io_acct(card->queue, bio_data_dir(bio), + &card->gendisk->part0, start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 856d5dc02451..1c3383b4a0cf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -813,9 +813,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, { unsigned long start_time = jiffies; int rw_acct = is_write ? 
REQ_OP_WRITE : REQ_OP_READ; + struct request_queue *q = zram->disk->queue; int ret; - generic_start_io_acct(rw_acct, bvec->bv_len >> SECTOR_SHIFT, + generic_start_io_acct(q, rw_acct, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); if (!is_write) { @@ -827,7 +828,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } - generic_end_io_acct(rw_acct, &zram->disk->part0, start_time); + generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); if (unlikely(ret)) { if (!is_write) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 019b3df9f1c6..72eb97176403 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -607,7 +607,8 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - generic_end_io_acct(bio_data_dir(s->orig_bio), + struct request_queue *q = bdev_get_queue(s->orig_bio->bi_bdev); + generic_end_io_acct(q, bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -959,7 +960,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); bio->bi_bdev = dc->bdev; bio->bi_iter.bi_sector += dc->sb.data_offset; @@ -1074,7 +1075,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; int rw = bio_data_dir(bio); - generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); s = search_alloc(bio, d); cl = &s->cl; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2edbcc2d7d3f..8612a2d1ccd9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -520,7 +520,7 @@ static void start_io_acct(struct dm_io *io) io->start_time = jiffies; cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); + part_round_stats(md->queue, cpu, &dm_disk(md)->part0); part_stat_unlock(); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); @@ -539,7 +539,7 @@ static void end_io_acct(struct dm_io *io) int pending; int rw = bio_data_dir(bio); - generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); + generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -1542,7 +1542,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) map = dm_get_live_table(md, &srcu_idx); - generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); + generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0); /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e1b5715bd91f..73062da3177f 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -396,7 +396,7 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) return false; *start = jiffies; - generic_start_io_acct(bio_data_dir(bio), + generic_start_io_acct(disk->queue, bio_data_dir(bio), bio_sectors(bio), &disk->part0); return true; } @@ -404,7 +404,8 @@ static inline void nd_iostat_end(struct bio *bio, unsigned long start) { struct 
gendisk *disk = bio->bi_bdev->bd_disk; - generic_end_io_acct(bio_data_dir(bio), &disk->part0, start); + generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, + start); } static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len) diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b1cf4ba0902..9276788a9b24 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -463,10 +463,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(int rw, unsigned long sectors, - struct hd_struct *part); -void generic_end_io_acct(int rw, struct hd_struct *part, - unsigned long start_time); +void generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part); +void generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, + unsigned long start_time); #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e619fae2f037..7f7427e00f9c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -362,23 +362,27 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct hd_struct *part, int rw) +static inline void part_inc_in_flight(struct request_queue *q, + struct hd_struct *part, int rw) { atomic_inc(&part->in_flight[rw]); if (part->partno) atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); } -static inline void part_dec_in_flight(struct hd_struct *part, int rw) +static inline void part_dec_in_flight(struct request_queue *q, + struct hd_struct *part, int rw) { atomic_dec(&part->in_flight[rw]); if (part->partno) atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } -static inline int part_in_flight(struct hd_struct *part) +static inline int part_in_flight(struct request_queue *q, + struct hd_struct *part) { - return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); + return atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) @@ -395,7 +399,7 @@ static inline void free_part_info(struct hd_struct *part) } /* block/blk-core.c */ -extern void part_round_stats(int cpu, struct hd_struct *part); +extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk); -- cgit v1.2.3 From 0609e0efc5e15195ecf8c6d2f2e890d98760e337 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:49:47 -0600 Subject: block: make part_in_flight() take an array of two ints Instead of returning the count that matches the partition, pass in an array of two ints. Index 0 will be filled with the inflight count for the partition in question, and index 1 will be filled with the root inflight count, if the partition passed in is not the root. This is in preparation for being able to calculate both in one go.
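A caller that previously did 'count = part_in_flight(q, part)' now reads both counts in one call (a sketch of the new usage):

        unsigned int inflight[2];

        part_in_flight(q, part, inflight);
        /* inflight[0]: this partition; inflight[1]: root part0, or 0 if part is the root */
        pr_debug("inflight part=%u root=%u\n", inflight[0], inflight[1]);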
Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++---- block/genhd.c | 4 +++- block/partition-generic.c | 4 +++- include/linux/genhd.h | 13 ++++++++++--- 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 8ee954c12e9d..6ad2b8602c1d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1472,15 +1472,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq, static void part_round_stats_single(struct request_queue *q, int cpu, struct hd_struct *part, unsigned long now) { - int inflight; + int inflight[2]; if (now == part->stamp) return; - inflight = part_in_flight(q, part); - if (inflight) { + part_in_flight(q, part, inflight); + if (inflight[0]) { __part_stat_add(cpu, part, time_in_queue, - inflight * (now - part->stamp)); + inflight[0] * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; diff --git a/block/genhd.c b/block/genhd.c index f735af67a0c9..822f65f95e2a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1204,6 +1204,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; + unsigned int inflight[2]; int cpu; /* @@ -1219,6 +1220,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) cpu = part_stat_lock(); part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); + part_in_flight(gp->queue, hd, inflight); seq_printf(seqf, "%4d %7d %s %lu %lu %lu " "%u %lu %lu %lu %u %u %u %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), @@ -1231,7 +1233,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[WRITE]), part_stat_read(hd, sectors[WRITE]), jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), - part_in_flight(gp->queue, hd), + inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); diff --git a/block/partition-generic.c b/block/partition-generic.c index d1bdd61e1d71..fa5049a4d99b 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -113,11 +113,13 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct request_queue *q = dev_to_disk(dev)->queue; + unsigned int inflight[2]; int cpu; cpu = part_stat_lock(); part_round_stats(q, cpu, p); part_stat_unlock(); + part_in_flight(q, p, inflight); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -131,7 +133,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - part_in_flight(q, p), + inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7f7427e00f9c..f2a3a26cdda1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -378,11 +378,18 @@ static inline void part_dec_in_flight(struct request_queue *q, atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } -static inline int part_in_flight(struct request_queue *q, - struct hd_struct *part) +static inline void part_in_flight(struct request_queue *q, + struct hd_struct *part, + unsigned int inflight[2]) { - return atomic_read(&part->in_flight[0]) + + inflight[0] = atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); + if (part->partno) { + part = 
&part_to_disk(part)->part0; + inflight[1] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + } else + inflight[1] = 0; } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) -- cgit v1.2.3 From f299b7c7a9dee64425e5965bd4f56dc024c1befc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:51:45 -0600 Subject: blk-mq: provide internal in-flight variant We don't have to inc/dec some counter, since we can just iterate the tags. That makes inc/dec a noop, but means we have to iterate busy tags to get an in-flight count. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 31 +++++++++++++++++++++++++++++++ block/blk-mq.h | 3 +++ block/genhd.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 35 ++++++----------------------------- 4 files changed, 77 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index a5d369dc7622..0dfc7a9984b6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -83,6 +83,37 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } +struct mq_inflight { + struct hd_struct *part; + unsigned int *inflight; +}; + +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && + !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + /* + * Count as inflight if it either matches the partition we + * asked for, or if it's the root + */ + if (rq->part == mi->part || mi->part->partno) + mi->inflight[0]++; + } +} + +void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part, .inflight = inflight, }; + + inflight[0] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; diff --git a/block/blk-mq.h b/block/blk-mq.h index 60b01c0309bc..98252b79b80b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -133,4 +133,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } +void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); + #endif diff --git a/block/genhd.c b/block/genhd.c index 822f65f95e2a..3dc4d115480f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -45,6 +45,43 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +{ + if (q->mq_ops) + return; + + atomic_inc(&part->in_flight[rw]); + if (part->partno) + atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); +} + +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) +{ + if (q->mq_ops) + return; + + atomic_dec(&part->in_flight[rw]); + if (part->partno) + atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); +} + +void part_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + if (q->mq_ops) { + blk_mq_in_flight(q, part, inflight); + return; + } + + inflight[0] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + if (part->partno) { + part = &part_to_disk(part)->part0; + inflight[1] = atomic_read(&part->in_flight[0]) + 
+ atomic_read(&part->in_flight[1]); + } +} + /** * disk_get_part - get partition * @disk: disk to look partition from diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f2a3a26cdda1..ea652bfcd675 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -362,35 +362,12 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct request_queue *q, - struct hd_struct *part, int rw) -{ - atomic_inc(&part->in_flight[rw]); - if (part->partno) - atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); -} - -static inline void part_dec_in_flight(struct request_queue *q, - struct hd_struct *part, int rw) -{ - atomic_dec(&part->in_flight[rw]); - if (part->partno) - atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); -} - -static inline void part_in_flight(struct request_queue *q, - struct hd_struct *part, - unsigned int inflight[2]) -{ - inflight[0] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - if (part->partno) { - part = &part_to_disk(part)->part0; - inflight[1] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - } else - inflight[1] = 0; -} +void part_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) { -- cgit v1.2.3 From 0a4a060bb204c47825eb4f7c27f66fc7ee85d508 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 9 Aug 2017 19:09:44 -0400 Subject: sock: fix zerocopy_success regression with msg_zerocopy Do not use uarg->zerocopy outside msg_zerocopy. In other paths the field is not explicitly initialized and aliases another field. Those paths have only one reference so do not need this intermediate variable. Call uarg->callback directly. Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8c0708d2e5e6..7594e19bce62 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1273,8 +1273,13 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) struct ubuf_info *uarg = skb_zcopy(skb); if (uarg) { - uarg->zerocopy = uarg->zerocopy && zerocopy; - sock_zerocopy_put(uarg); + if (uarg->callback == sock_zerocopy_callback) { + uarg->zerocopy = uarg->zerocopy && zerocopy; + sock_zerocopy_put(uarg); + } else { + uarg->callback(uarg, zerocopy); + } + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; } } -- cgit v1.2.3 From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:39:55 +0200 Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=), BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that particularly *JLT/*JLE counterparts involving immediates need to be rewritten from e.g. X < [IMM] by swapping arguments into [IMM] > X, meaning the immediate first is required to be loaded into a register Y := [IMM], such that then we can compare with Y > X. Note that the destination operand is always required to be a register. 
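For instance, branching when R1 < 42 previously required materializing the immediate and inverting the test, whereas with the new encodings it is a single insn (a sketch using the kernel's insn macros; TARGET_OFF is an illustrative jump offset, not from this patch):

        /* before: a scratch register is burned just to hold the constant */
        BPF_MOV64_IMM(BPF_REG_2, 42),
        BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, TARGET_OFF),

        /* after: */
        BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 42, TARGET_OFF),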
This has the downside of having unnecessarily increased register pressure, meaning complex programs would need to spill other registers temporarily to stack in order to obtain an unused register for the [IMM]. Loading to registers will thus also affect state pruning since we need to account for that register use and potentially those registers that had to be spilled/filled again. As a consequence slightly more stack space might have been used due to spilling, and BPF programs are a bit longer due to extra code involving the register load and potentially required spill/fills. Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=) counterparts to the eBPF instruction set. Modifying LLVM to remove the NegateCC() workaround in a PoC patch at [1] and allowing it to also emit the new instructions resulted in cilium's BPF programs that are injected into the fast-path having a reduced program length in the range of 2-3% (e.g. accumulated main and tail call sections from one of the object files reduced from 4864 to 4729 insns), reduced complexity in the range of 10-30% (e.g. accumulated sections reduced in one of the cases from 116432 to 88428 insns), and reduced stack usage in the range of 1-5% (e.g. accumulated sections from one of the object files reduced from 824 to 784b). The modification for LLVM will be incorporated in a backwards compatible way. The plan is for LLVM to have i) a target specific option that lets the user explicitly enable the extension (as we have with -m target specific extensions today for various CPU insns), and ii) to check the kernel for presence of the extensions and enable them transparently when the user selects more aggressive options such as -march=native in a bpf target context. (Other frontends generating BPF byte code, e.g. ply, can probe the kernel directly for its code generation.) [1] https://github.com/borkmann/llvm/tree/bpf-insns Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/core.c | 60 ++++++ lib/test_bpf.c | 364 ++++++++++++++++++++++++++++++++++++ net/core/filter.c | 21 ++- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 455 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index d0fdba7d66e2..6a0df8df6c43 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -906,6 +906,10 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: BPF_JSGE 0x70 /* eBPF only: signed '>=' */ BPF_CALL 0x80 /* eBPF only: function call */ BPF_EXIT 0x90 /* eBPF only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF and eBPF. There are only two registers in classic BPF, so it means A += X.
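The unsigned/signed distinction matters once the top bit is set; conceptually the interpreter cases added below reduce to (a sketch in C):

        u64 dst, src;

        if (dst < src)                  /* BPF_JLT: unsigned '<' */
                /* take branch */;
        if ((s64)dst < (s64)src)        /* BPF_JSLT: signed '<' */
                /* take branch */;

so with dst == 1 and src == (u64)-1, JLT is taken while JSLT is not, exactly as the new test_bpf cases assert.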
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d06be1569b1..91da8371a2d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad5f55922a13..c69e7f5bfde7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; @@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, + [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, + [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, + [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, + [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, /* Program return */ @@ -1073,6 +1085,18 @@ out: CONT_JMP; } CONT; + JMP_JLT_X: + if (DST < SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLT_K: + if (DST < IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JGE_X: if (DST >= SRC) { insn += insn->off; @@ -1085,6 +1109,18 @@ out: CONT_JMP; } CONT; + JMP_JLE_X: + if (DST <= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLE_K: + if (DST <= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGT_X: if (((s64) DST) > ((s64) SRC)) { insn += insn->off; @@ -1097,6 +1133,18 @@ out: CONT_JMP; } CONT; + JMP_JSLT_X: + if (((s64) DST) < ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLT_K: + if (((s64) DST) < ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGE_X: if (((s64) DST) >= ((s64) SRC)) { insn += insn->off; @@ -1109,6 +1157,18 @@ out: CONT_JMP; } CONT; + JMP_JSLE_X: + if (((s64) DST) <= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLE_K: + if (((s64) DST) <= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSET_X: if (DST & SRC) { insn += insn->off; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index d9d5a410955c..aa8812ae6776 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ 
-951,6 +951,32 @@ static struct bpf_test tests[] = { { 4, 4, 4, 3, 3 }, { { 2, 0 }, { 3, 1 }, { 4, MAX_K } }, }, + { + "JGE (jt 0), test 1", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 4, 3, 3 }, + { { 2, 0 }, { 3, 1 }, { 4, 1 } }, + }, + { + "JGE (jt 0), test 2", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 5, 3, 3 }, + { { 4, 1 }, { 5, 1 }, { 6, MAX_K } }, + }, { "JGE", .u.insns = { @@ -4492,6 +4518,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_K */ + { + "JMP_JSLT_K: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_K: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGT | BPF_K */ { "JMP_JSGT_K: Signed jump: if (-1 > -2) return 1", @@ -4521,6 +4576,73 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_K */ + { + "JMP_JSLE_K: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 6), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_K */ { "JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1", @@ -4617,6 +4739,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K */ + { + "JMP_JLT_K: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + 
BPF_JMP_IMM(BPF_JLT, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JGT_K: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 1), + BPF_JMP_IMM(BPF_JLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_K */ { "JMP_JGE_K: if (3 >= 2) return 1", @@ -4632,6 +4783,21 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_K */ + { + "JMP_JLE_K: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_K jump backwards */ { "JMP_JGT_K: if (3 > 2) return 1 (jump backwards)", @@ -4662,6 +4828,36 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K jump backwards */ + { + "JMP_JGT_K: if (2 < 3) return 1 (jump backwards)", + .u.insns_int = { + BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* goto start */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* out: */ + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), /* start: */ + BPF_LD_IMM64(R1, 2), /* note: this takes 2 insns */ + BPF_JMP_IMM(BPF_JLT, R1, 3, -6), /* goto out */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_K: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_K */ { "JMP_JNE_K: if (3 != 2) return 1", @@ -4752,6 +4948,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_X */ + { + "JMP_JSLT_X: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_X: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLT, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_X */ { "JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1", @@ -4783,6 +5010,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_X */ + { + "JMP_JSLE_X: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_X: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_X */ { "JMP_JGT_X: if (3 > 2) return 1", @@ -4814,6 +5072,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* 
BPF_JMP | BPF_JLT | BPF_X */ + { + "JMP_JLT_X: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLT_X: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, 1), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_X */ { "JMP_JGE_X: if (3 >= 2) return 1", @@ -4845,6 +5134,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_X */ + { + "JMP_JLE_X: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_X: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 3), + BPF_JMP_REG(BPF_JLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, { /* Mainly testing JIT + imm64 here. */ "JMP_JGE_X: ldimm64 test 1", @@ -4890,6 +5210,50 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + "JMP_JLE_X: ldimm64 test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 2), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xeeeeeeeeU } }, + }, + { + "JMP_JLE_X: ldimm64 test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 0), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xffffffffU } }, + }, + { + "JMP_JLE_X: ldimm64 test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 4), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_X */ { "JMP_JNE_X: if (3 != 2) return 1", diff --git a/net/core/filter.c b/net/core/filter.c index 78d00933dbe7..5afe3ac191ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -514,14 +514,27 @@ do_pass: break; } - /* Convert JEQ into JNE when 'jump_true' is next insn. */ - if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { - insn->code = BPF_JMP | BPF_JNE | bpf_src; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + target = i + fp->jf + 1; BPF_EMIT_JMP; break; } - +jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. 
*/ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8d9bfcca3fe4..bf3b2e230455 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ -- cgit v1.2.3 From b97bac64a589d0158cf866e8995e831030f68f4f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Aug 2017 20:41:48 +0200 Subject: rtnetlink: make rtnl_register accept a flags parameter This change allows us to later indicate to rtnetlink core that certain doit functions should be called without acquiring rtnl_mutex. This change should have no effect, we simply replace the last (now unused) calcit argument with the new flag. Signed-off-by: Florian Westphal Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 5 ++--- net/bridge/br_mdb.c | 6 +++--- net/can/gw.c | 6 +++--- net/core/fib_rules.c | 6 +++--- net/core/neighbour.c | 10 +++++----- net/core/net_namespace.c | 4 ++-- net/core/rtnetlink.c | 36 ++++++++++++++++++------------------ net/dcb/dcbnl.c | 4 ++-- net/decnet/dn_dev.c | 6 +++--- net/decnet/dn_fib.c | 4 ++-- net/decnet/dn_route.c | 4 ++-- net/ipv4/devinet.c | 8 ++++---- net/ipv4/fib_frontend.c | 6 +++--- net/ipv4/ipmr.c | 8 ++++---- net/ipv4/route.c | 2 +- net/ipv6/addrconf.c | 14 +++++++------- net/ipv6/addrlabel.c | 6 +++--- net/ipv6/ip6_fib.c | 2 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/route.c | 6 +++--- net/mpls/af_mpls.c | 8 ++++---- net/phonet/pn_netlink.c | 12 ++++++------ net/qrtr/qrtr.c | 2 +- net/sched/act_api.c | 6 +++--- net/sched/cls_api.c | 6 +++--- net/sched/sch_api.c | 12 ++++++------ 26 files changed, 95 insertions(+), 96 deletions(-) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index abe6b733d473..ac32460a0adb 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -7,12 +7,11 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); -typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func); + rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); void rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func); + rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a0b11e7d67d9..ca01def49af0 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -713,9 +713,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void br_mdb_init(void) { - rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL); - rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL); + 
rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0); + rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0); } void br_mdb_uninit(void) diff --git a/net/can/gw.c b/net/can/gw.c index 29748d844c3f..73a02af4b5d7 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1031,15 +1031,15 @@ static __init int cgw_module_init(void) notifier.notifier_call = cgw_notifier; register_netdevice_notifier(¬ifier); - if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, NULL)) { + if (__rtnl_register(PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0)) { unregister_netdevice_notifier(¬ifier); kmem_cache_destroy(cgw_cache); return -ENOBUFS; } /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, NULL); - __rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, NULL); + __rtnl_register(PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0); + __rtnl_register(PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0); return 0; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index fc0b65093417..9a6d97c1d810 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1026,9 +1026,9 @@ static struct pernet_operations fib_rules_net_ops = { static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d0713627deb6..16a1a4c4eb57 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3261,13 +3261,13 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0); rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - NULL); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL); + 0); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); return 0; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 8726d051f31d..a7f06d706aa0 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -855,9 +855,9 @@ static int __init net_ns_init(void) register_pernet_subsys(&net_ns_ops); - rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - NULL); + 0); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8c9d34deea7d..67607c540c03 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -178,7 +178,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request 
(NLM_F_DUMP) message - * @calcit: Function pointer to calc size of dump message + * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the @@ -192,7 +192,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) */ int __rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, - rtnl_calcit_func calcit) + unsigned int flags) { struct rtnl_link *tab; int msgindex; @@ -230,9 +230,9 @@ EXPORT_SYMBOL_GPL(__rtnl_register); */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, - rtnl_calcit_func calcit) + unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0) + if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) panic("Unable to register rtnetlink message handler, " "protocol = %d, message type = %d\n", protocol, msgtype); @@ -4279,23 +4279,23 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, NULL); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL); + rtnl_dump_ifinfo, 0); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); - rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, NULL); + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); + rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); + rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); + rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); + rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); + rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, - NULL); + 0); } diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 733f523707ac..bae7d78aa068 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1938,8 +1938,8 @@ static int __init dcbnl_init(void) { INIT_LIST_HEAD(&dcb_app_list); - rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); return 0; } diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index fa0110b57ca1..4d339de56862 100644 --- a/net/decnet/dn_dev.c +++ 
b/net/decnet/dn_dev.c @@ -1419,9 +1419,9 @@ void __init dn_dev_init(void) dn_dev_devices_on(); - rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, NULL); - rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, NULL); - rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, NULL); + rtnl_register(PF_DECnet, RTM_NEWADDR, dn_nl_newaddr, NULL, 0); + rtnl_register(PF_DECnet, RTM_DELADDR, dn_nl_deladdr, NULL, 0); + rtnl_register(PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0); proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops); diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index f9f6fb3f3c5b..3d37464c8b4a 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -791,8 +791,8 @@ void __init dn_fib_init(void) register_dnaddr_notifier(&dn_fib_dnaddr_notifier); - rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, NULL); - rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, NULL); + rtnl_register(PF_DECnet, RTM_NEWROUTE, dn_fib_rtm_newroute, NULL, 0); + rtnl_register(PF_DECnet, RTM_DELROUTE, dn_fib_rtm_delroute, NULL, 0); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index bcbe548f8854..0bd3afd01dd2 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1922,10 +1922,10 @@ void __init dn_route_init(void) #ifdef CONFIG_DECNET_ROUTER rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, - dn_fib_dump, NULL); + dn_fib_dump, 0); #else rtnl_register(PF_DECnet, RTM_GETROUTE, dn_cache_getroute, - dn_cache_dump, NULL); + dn_cache_dump, 0); #endif } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 38d9af9b917c..d7adc0616599 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2491,9 +2491,9 @@ void __init devinet_init(void) rtnl_af_register(&inet_af_ops); - rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL); - rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); + rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); + rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); + rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0); rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, - inet_netconf_dump_devconf, NULL); + inet_netconf_dump_devconf, 0); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2cba559f14df..37819ab4cc74 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1348,7 +1348,7 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 06863ea3fc5b..c9b3e6e069ae 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3114,14 +3114,14 @@ int __init ip_mr_init(void) } #endif rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, - ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL); + ipmr_rtm_getroute, ipmr_rtm_dumproute, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, - ipmr_rtm_route, NULL, NULL); + ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, 
- ipmr_rtm_route, NULL, NULL); + ipmr_rtm_route, NULL, 0); rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, - NULL, ipmr_rtm_dumplink, NULL); + NULL, ipmr_rtm_dumplink, 0); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0383e66f59bc..2ef46294475f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3067,7 +3067,7 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 0); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 30ee23eef268..640792e1ecb7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6605,21 +6605,21 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, - NULL); + 0); if (err < 0) goto errout; /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL); - __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, NULL); + inet6_dump_ifaddr, 0); __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, - inet6_dump_ifmcaddr, NULL); + inet6_dump_ifmcaddr, 0); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, - inet6_dump_ifacaddr, NULL); + inet6_dump_ifacaddr, 0); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, NULL); + inet6_netconf_dump_devconf, 0); ipv6_addr_label_rtnl_register(); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 7a428f65c7ec..cea5eb488013 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -593,10 +593,10 @@ out: void __init ipv6_addr_label_rtnl_register(void) { __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, - NULL, NULL); + NULL, 0); __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, - NULL, NULL); + NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, - ip6addrlbl_dump, NULL); + ip6addrlbl_dump, 0); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 69ed0043d117..8c58c7558de0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2038,7 +2038,7 @@ int __init fib6_init(void) goto out_kmem_cache_create; ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, - NULL); + 0); if (ret) goto out_unregister_subsys; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7454850f2098..f5500f5444e9 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1427,7 +1427,7 @@ int __init ip6_mr_init(void) } #endif rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, - ip6mr_rtm_dumproute, NULL); + ip6mr_rtm_dumproute, 0); return 0; #ifdef CONFIG_IPV6_PIMSM_V2 add_proto_fail: diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c73e61750642..035762fed07d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4105,9 +4105,9 @@ int __init ip6_route_init(void) goto fib6_rules_init; ret = -ENOBUFS; - if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || - __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || - __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, 
NULL, 0) || + __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, 0)) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index ea4f481839dd..c5b9ce41d66f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2479,12 +2479,12 @@ static int __init mpls_init(void) rtnl_af_register(&mpls_af_ops); - rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, NULL); - rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0); + rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0); rtnl_register(PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, - NULL); + 0); rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, - mpls_netconf_dump_devconf, NULL); + mpls_netconf_dump_devconf, 0); err = 0; out: return err; diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 45b3af3080d8..da754fc926e7 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -300,15 +300,15 @@ out: int __init phonet_netlink_register(void) { int err = __rtnl_register(PF_PHONET, RTM_NEWADDR, addr_doit, - NULL, NULL); + NULL, 0); if (err) return err; /* Further __rtnl_register() cannot fail */ - __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, NULL); - __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, NULL); - __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, NULL); - __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, NULL); - __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, NULL); + __rtnl_register(PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0); + __rtnl_register(PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0); + __rtnl_register(PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0); + __rtnl_register(PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0); + __rtnl_register(PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, 0); return 0; } diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 5586609afa27..c2f5c13550c0 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1081,7 +1081,7 @@ static int __init qrtr_proto_init(void) return rc; } - rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, NULL); + rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0); return 0; } diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a2915d958279..02fcb0c78a28 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1255,10 +1255,10 @@ out_module_put: static int __init tc_action_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, - NULL); + 0); return 0; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8d1157aebaf7..ebeeb87e6d44 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1010,10 +1010,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev); static int __init tc_filter_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 
0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, - tc_dump_tfilter, NULL); + tc_dump_tfilter, 0); return 0; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bd24a550e0f9..816c8092e601 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1952,14 +1952,14 @@ static int __init pktsched_init(void) register_qdisc(&mq_qdisc_ops); register_qdisc(&noqueue_qdisc_ops); - rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, - NULL); - rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); + 0); + rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, - NULL); + 0); return 0; } -- cgit v1.2.3 From 62256f98f244fbb1c7a10465e1ee412f209d8978 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Aug 2017 20:41:52 +0200 Subject: rtnetlink: add RTNL_FLAG_DOIT_UNLOCKED Allow callers to tell rtnetlink core that its doit callback should be invoked without holding rtnl mutex. Signed-off-by: Florian Westphal Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 4 ++++ net/core/rtnetlink.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ac32460a0adb..21837ca68ecc 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -8,6 +8,10 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); +enum rtnl_link_flags { + RTNL_FLAG_DOIT_UNLOCKED = 1, +}; + int __rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); void rtnl_register(int protocol, int msgtype, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d45946177bc8..dd4e50dfa248 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,6 +62,7 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; + unsigned int flags; }; static DEFINE_MUTEX(rtnl_mutex); @@ -184,6 +185,7 @@ int __rtnl_register(int protocol, int msgtype, tab[msgindex].doit = doit; if (dumpit) tab[msgindex].dumpit = dumpit; + tab[msgindex].flags |= flags; return 0; } @@ -233,6 +235,7 @@ int rtnl_unregister(int protocol, int msgtype) handlers[msgindex].doit = NULL; handlers[msgindex].dumpit = NULL; + handlers[msgindex].flags = 0; rtnl_unlock(); return 0; @@ -4143,6 +4146,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct rtnl_link *handlers; int err = -EOPNOTSUPP; rtnl_doit_func doit; + unsigned int flags; int kind; int family; int type; @@ -4209,6 +4213,17 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } + flags = READ_ONCE(handlers[type].flags); + if (flags & RTNL_FLAG_DOIT_UNLOCKED) { + refcount_inc(&rtnl_msg_handlers_ref[family]); + doit = READ_ONCE(handlers[type].doit); + rcu_read_unlock(); + if (doit) + err = doit(skb, nlh, extack); + refcount_dec(&rtnl_msg_handlers_ref[family]); + return err; + } + rcu_read_unlock(); rtnl_lock(); -- 
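Taken together, the two rtnetlink patches above first turn the dead calcit slot into a flags word and then give that word its first consumer. A minimal sketch of how a subsystem combines them (RTM_GETFOO, RTM_SETFOO and the handlers are hypothetical, not from these patches):

	/* doit runs without the rtnl mutex; it must take any locks it needs */
	rtnl_register(PF_UNSPEC, RTM_GETFOO, foo_get_doit, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

	/* flags == 0 keeps the historical behaviour: doit under rtnl_lock() */
	rtnl_register(PF_UNSPEC, RTM_SETFOO, foo_set_doit, NULL, 0);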
cgit v1.2.3 From 68277a2c9d32fd9090247a5c08aaf1353049c0b1 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 9 Aug 2017 14:41:16 +0200 Subject: net-next: dsa: move struct dsa_device_ops to the global header file We need to access this struct from within the flow_dissector to fix dissection for packets coming in on DSA devices. Signed-off-by: Muciri Gatimu Signed-off-by: Shashidhar Lakkavalli Signed-off-by: John Crispin Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ net/dsa/dsa_priv.h | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index a4f66dbb4b7c..65d7804c6f69 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -101,6 +101,13 @@ struct dsa_platform_data { struct packet_type; +struct dsa_device_ops { + struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev); +}; + struct dsa_switch_tree { struct list_head list; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1debf9c42fc4..9c3eeb72462d 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -65,13 +65,6 @@ struct dsa_notifier_vlan_info { int port; }; -struct dsa_device_ops { - struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); - struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev); -}; - struct dsa_slave_priv { /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, -- cgit v1.2.3 From 598a968011ffc4d624934995fe46d06bd450cdf4 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 9 Aug 2017 14:41:17 +0200 Subject: net-next: dsa: add flow_dissect callback to struct dsa_device_ops When the flow dissector first sees packets coming in on a DSA device the 802.3 header won't be located where the code expects it to be as the tag is still present. Adding this new callback allows a DSA device to provide a new function that the flow_dissector can use to get the correct protocol and offset of the network header. Signed-off-by: Muciri Gatimu Signed-off-by: Shashidhar Lakkavalli Signed-off-by: John Crispin Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 65d7804c6f69..7f46b521313e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -106,6 +106,8 @@ struct dsa_device_ops { struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); + int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, + int *offset); }; struct dsa_switch_tree { -- cgit v1.2.3 From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. Fix this confusion by passing along vma->vm_mm instead of relying on current->mm.
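For a PMU driver implementing these hooks, the post-fix shape is sketched below (driver and symbol names are hypothetical; only the callback signature is taken from the struct pmu change in this patch):

	static void foo_pmu_event_mapped(struct perf_event *event,
					 struct mm_struct *mm)
	{
		/* @mm is vma->vm_mm of the mapping being created. During
		 * exec() it can differ from current->mm, so only @mm is
		 * safe to dereference here. */
	}

	static struct pmu foo_pmu = {
		/* ... */
		.event_mapped	= foo_pmu_event_mapped,
	};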
Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 16 +++++++--------- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 6 +++--- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e3db8f642a7..af12e294caed 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2114,7 +2114,7 @@ static void refresh_pce(void *ignored) load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } -static void x86_pmu_event_mapped(struct perf_event *event) +static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; @@ -2129,22 +2129,20 @@ static void x86_pmu_event_mapped(struct perf_event *event) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(¤t->mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_sem); - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } -static void x86_pmu_event_unmapped(struct perf_event *event) +static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!current->mm) - return; if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3b873fc59e4..b14095bcf4bb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -310,8 +310,8 @@ struct pmu { * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ - void (*event_mapped) (struct perf_event *event); /*optional*/ - void (*event_unmapped) (struct perf_event *event); /*optional*/ + void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ + void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). 
There are diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a654b8a3586f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5113,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5411,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } -- cgit v1.2.3 From c9901724a2f14128ef6a57986babcbfbcf61a257 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2017 10:20:11 +0300 Subject: RDMA/netlink: Remove netlink clients infrastructure RDMA netlink has a complicated infrastructure for dynamically registering and de-registering netlink clients to the NETLINK_RDMA group. The complicated portion of this code is not widely used because 2 of the 3 current clients are statically compiled together with netlink.c. The infrastructure, therefore, is deemed overkill. Refactor the code to eliminate the dynamically added clients. Now all clients are pre-registered in a client array at compile time, and at run time they merely check-in with the infrastructure to pass their callback table for inclusion in the pre-sized client array. This also allows for future cleanups and removal of unneeded code in the iwcm* netlink handler. 
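Under the new scheme a client's whole interaction with the core reduces to a check-in and a check-out, roughly as below (RDMA_NL_FOO and the handler are hypothetical; a real client must use one of the pre-sized RDMA_NL_* client slots known to netlink.c):

	static const struct ibnl_client_cbs foo_cb_table[] = {
		[RDMA_NL_FOO_SOME_OP] = { .dump = foo_dump_handler },
	};

	/* module init: hand the table to the pre-registered slot */
	rdma_nl_register(RDMA_NL_FOO, foo_cb_table);

	/* module exit */
	rdma_nl_unregister(RDMA_NL_FOO);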
Signed-off-by: Leon Romanovsky Reviewed-by: Chien Tin Tung --- drivers/infiniband/core/cma.c | 6 +- drivers/infiniband/core/core_priv.h | 4 +- drivers/infiniband/core/device.c | 45 +++------ drivers/infiniband/core/iwcm.c | 10 +- drivers/infiniband/core/netlink.c | 185 +++++++++++++++++------------------- include/rdma/rdma_netlink.h | 13 +-- 6 files changed, 112 insertions(+), 151 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ca4135c596ba..2a16a559bdda 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4512,9 +4512,7 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), - cma_cb_table)) - pr_warn("RDMA CMA: failed to add netlink callback\n"); + rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4531,7 +4529,7 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - ibnl_remove_client(RDMA_NL_RDMA_CM); + rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..e759c27113cd 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -179,8 +179,8 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int ibnl_init(void); -void ibnl_cleanup(void); +int rdma_nl_init(void); +void rdma_nl_exit(void); /** * Check if there are any listeners to the netlink group diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a5dfab6adf49..d0994cd30eae 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1086,29 +1086,15 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct ibnl_client_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, + .dump = ib_nl_handle_resolve_resp}, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, + .dump = ib_nl_handle_set_timeout}, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp, - .module = THIS_MODULE }, + .dump = ib_nl_handle_ip_res_resp}, }; -static int ib_add_ibnl_clients(void) -{ - return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), - ibnl_ls_cb_table); -} - -static void ib_remove_ibnl_clients(void) -{ - ibnl_remove_client(RDMA_NL_LS); -} - static int __init ib_core_init(void) { int ret; @@ -1130,9 +1116,9 @@ static int __init ib_core_init(void) goto err_comp; } - ret = ibnl_init(); + ret = rdma_nl_init(); if (ret) { - pr_warn("Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface: err %d\n", ret); goto err_sysfs; } @@ -1154,24 +1140,17 @@ static int __init ib_core_init(void) goto err_mad; } - ret = ib_add_ibnl_clients(); - if (ret) { - pr_warn("Couldn't register ibnl clients\n"); - goto err_sa; - } - ret = register_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. 
ret %d\n", ret); - goto err_ibnl_clients; + goto err_sa; } + rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ib_cache_setup(); return 0; -err_ibnl_clients: - ib_remove_ibnl_clients(); err_sa: ib_sa_cleanup(); err_mad: @@ -1179,7 +1158,7 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - ibnl_cleanup(); + rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -1191,13 +1170,13 @@ err: static void __exit ib_core_cleanup(void) { - unregister_lsm_notifier(&ibdev_lsm_nb); ib_cache_cleanup(); - ib_remove_ibnl_clients(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); - ibnl_cleanup(); + rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 31661b5c1743..8599271d8be6 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1175,12 +1175,8 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - - ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), - iwcm_nl_cb_table); - if (ret) - pr_err("iw_cm: couldn't register netlink callbacks\n"); - + else + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) return -ENOMEM; @@ -1200,7 +1196,7 @@ static void __exit iw_cm_cleanup(void) { unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - ibnl_remove_client(RDMA_NL_IWCM); + rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 0fc50e15ae22..06f7ba31fbdd 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -39,16 +39,13 @@ #include #include "core_priv.h" -struct ibnl_client { - struct list_head list; - int index; - int nops; - const struct ibnl_client_cbs *cb_table; -}; +#include "core_priv.h" -static DEFINE_MUTEX(ibnl_mutex); +static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; -static LIST_HEAD(client_list); +static struct { + const struct ibnl_client_cbs *cb_table; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; int ibnl_chk_listeners(unsigned int group) { @@ -57,58 +54,74 @@ int ibnl_chk_listeners(unsigned int group) return 0; } -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]) +static bool is_nl_msg_valid(unsigned int type, unsigned int op) { - struct ibnl_client *cur; - struct ibnl_client *nl_client; + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS - 1] = { + RDMA_NL_RDMA_CM_NUM_OPS, + RDMA_NL_IWPM_NUM_OPS, + 0, + RDMA_NL_LS_NUM_OPS, + 0 }; - nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL); - if (!nl_client) - return -ENOMEM; + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. + */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); - nl_client->index = index; - nl_client->nops = nops; - nl_client->cb_table = cb_table; + if (type > RDMA_NL_NUM_CLIENTS - 1) + return false; - mutex_lock(&ibnl_mutex); + return (op < max_num_ops[type - 1]) ? 
true : false; +} - list_for_each_entry(cur, &client_list, list) { - if (cur->index == index) { - pr_warn("Client for %d already exists\n", index); - mutex_unlock(&ibnl_mutex); - kfree(nl_client); - return -EINVAL; - } - } +static bool is_nl_valid(unsigned int type, unsigned int op) +{ + if (!is_nl_msg_valid(type, op) || + !rdma_nl_types[type].cb_table || + !rdma_nl_types[type].cb_table[op].dump) + return false; + return true; +} - list_add_tail(&nl_client->list, &client_list); +void rdma_nl_register(unsigned int index, + const struct ibnl_client_cbs cb_table[]) +{ + mutex_lock(&rdma_nl_mutex); + if (!is_nl_msg_valid(index, 0)) { + /* + * All clients are not interesting in success/failure of + * this call. They want to see the print to error log and + * continue their initialization. Print warning for them, + * because it is programmer's error to be here. + */ + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The not-valid %u index was supplied to RDMA netlink\n", + index); + return; + } - mutex_unlock(&ibnl_mutex); + if (rdma_nl_types[index].cb_table) { + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The %u index is already registered in RDMA netlink\n", + index); + return; + } - return 0; + rdma_nl_types[index].cb_table = cb_table; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_add_client); +EXPORT_SYMBOL(rdma_nl_register); -int ibnl_remove_client(int index) +void rdma_nl_unregister(unsigned int index) { - struct ibnl_client *cur, *next; - - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - if (cur->index == index) { - list_del(&(cur->list)); - mutex_unlock(&ibnl_mutex); - kfree(cur); - return 0; - } - } - pr_warn("Can't remove callback for client idx %d. Not found\n", index); - mutex_unlock(&ibnl_mutex); - - return -EINVAL; + mutex_lock(&rdma_nl_mutex); + rdma_nl_types[index].cb_table = NULL; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_remove_client); +EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) @@ -149,45 +162,31 @@ EXPORT_SYMBOL(ibnl_put_attr); static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct ibnl_client *client; int type = nlh->nlmsg_type; - int index = RDMA_NL_GET_CLIENT(type); + unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); + struct netlink_callback cb = {}; + struct netlink_dump_control c = {}; - list_for_each_entry(client, &client_list, list) { - if (client->index == index) { - if (op >= client->nops || !client->cb_table[op].dump) - return -EINVAL; - - /* - * For response or local service set_timeout request, - * there is no need to use netlink_dump_start. - */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - (index == RDMA_NL_LS && - op == RDMA_NL_LS_OP_SET_TIMEOUT)) { - struct netlink_callback cb = { - .skb = skb, - .nlh = nlh, - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - - return cb.dump(skb, &cb); - } - - { - struct netlink_dump_control c = { - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - return netlink_dump_start(nls, skb, nlh, &c); - } - } + if (!is_nl_valid(index, op)) + return -EINVAL; + + /* + * For response or local service set_timeout request, + * there is no need to use netlink_dump_start. 
+ */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + (index == RDMA_NL_LS && op == RDMA_NL_LS_OP_SET_TIMEOUT)) { + cb.skb = skb; + cb.nlh = nlh; + cb.dump = rdma_nl_types[index].cb_table[op].dump; + cb.module = rdma_nl_types[index].cb_table[op].module; + return cb.dump(skb, &cb); } - pr_info("Index %d wasn't found in client list\n", index); - return -EINVAL; + c.dump = rdma_nl_types[index].cb_table[op].dump; + c.module = rdma_nl_types[index].cb_table[op].module; + return netlink_dump_start(nls, skb, nlh, &c); } static void ibnl_rcv_reply_skb(struct sk_buff *skb) @@ -221,10 +220,10 @@ static void ibnl_rcv_reply_skb(struct sk_buff *skb) static void ibnl_rcv(struct sk_buff *skb) { - mutex_lock(&ibnl_mutex); + mutex_lock(&rdma_nl_mutex); ibnl_rcv_reply_skb(skb); netlink_rcv_skb(skb, &ibnl_rcv_msg); - mutex_unlock(&ibnl_mutex); + mutex_unlock(&rdma_nl_mutex); } int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -254,32 +253,26 @@ int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, } EXPORT_SYMBOL(ibnl_multicast); -int __init ibnl_init(void) +int __init rdma_nl_init(void) { struct netlink_kernel_cfg cfg = { .input = ibnl_rcv, }; nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); - if (!nls) { - pr_warn("Failed to create netlink socket\n"); + if (!nls) return -ENOMEM; - } nls->sk_sndtimeo = 10 * HZ; return 0; } -void ibnl_cleanup(void) +void rdma_nl_exit(void) { - struct ibnl_client *cur, *next; + int idx; - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - list_del(&(cur->list)); - kfree(cur); - } - mutex_unlock(&ibnl_mutex); + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + rdma_nl_unregister(idx); netlink_kernel_release(nls); } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 5b1466770917..aadf0ab963b2 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -11,23 +11,18 @@ struct ibnl_client_cbs { }; /** - * Add a a client to the list of IB netlink exporters. + * Register client in RDMA netlink. * @index: Index of the added client - * @nops: Number of supported ops by the added client. * @cb_table: A table for op->callback - * - * Returns 0 on success or a negative error code. */ -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]); +void rdma_nl_register(unsigned int index, + const struct ibnl_client_cbs cb_table[]); /** * Remove a client from IB netlink. * @index: Index of the removed IB client. - * - * Returns 0 on success or a negative error code. */ -int ibnl_remove_client(int index); +void rdma_nl_unregister(unsigned int index); /** * Put a new message in a supplied skb. -- cgit v1.2.3 From 64401b69b29164c5731018cc44fc9b144ac9c5ae Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 30 May 2017 11:29:56 +0300 Subject: RDMA/netlink: Remove redundant owner option for netlink callbacks Owner field is not needed to be set because netlink is part of ib_core which will be unloaded last after all other modules are unloaded. 
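The field existed so that the dump path could pin the handler's module while a multi-part dump was in flight, roughly the pattern visible in the receive path above (simplified sketch, not the exact code):

	struct netlink_dump_control c = {
		.dump	= cb_table[op].dump,
		.module	= cb_table[op].module,	/* netlink held a reference
						 * on this for the dump */
	};
	return netlink_dump_start(nls, skb, nlh, &c);

With every remaining handler built into ib_core, which is the last of these modules to go away, there is nothing left for that reference to protect.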
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 3 +-- drivers/infiniband/core/netlink.c | 2 -- include/rdma/rdma_netlink.h | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2a16a559bdda..0c85f140e616 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4459,8 +4459,7 @@ out: } static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, - .module = THIS_MODULE }, + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; static int cma_init_net(struct net *net) diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 06f7ba31fbdd..cd9b7e7b7d2c 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -180,12 +180,10 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, cb.skb = skb; cb.nlh = nlh; cb.dump = rdma_nl_types[index].cb_table[op].dump; - cb.module = rdma_nl_types[index].cb_table[op].module; return cb.dump(skb, &cb); } c.dump = rdma_nl_types[index].cb_table[op].dump; - c.module = rdma_nl_types[index].cb_table[op].module; return netlink_dump_start(nls, skb, nlh, &c); } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index aadf0ab963b2..c124d8e43fc8 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -7,7 +7,6 @@ struct ibnl_client_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); - struct module *module; }; /** -- cgit v1.2.3 From 18f08dae19990f5fffde92e3a63e0d90cda0f1a8 Mon Sep 17 00:00:00 2001 From: Cheng Jian Date: Fri, 4 Aug 2017 17:19:37 +0800 Subject: sched/core: Remove unnecessary initialization init_idle_bootup_task() init_idle_bootup_task( ) is called in rest_init( ) to switch the scheduling class of the boot thread to the idle class. 
the function only sets: idle->sched_class = &idle_sched_class; which has been set in init_idle() called by sched_init(): /* * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; We've already set the boot thread to idle class in start_kernel()->sched_init()->init_idle() so it's unnecessary to set it again in start_kernel()->rest_init()->init_idle_bootup_task() Signed-off-by: Cheng Jian Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1501838377-109720-1-git-send-email-cj.chengjian@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched/task.h | 1 - init/main.c | 1 - kernel/sched/core.c | 5 ----- 3 files changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c97e5f096927..79a2a744648d 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern void init_idle_bootup_task(struct task_struct *idle); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); diff --git a/init/main.c b/init/main.c index 052481fbe363..881d62438b1a 100644 --- a/init/main.c +++ b/init/main.c @@ -430,7 +430,6 @@ static noinline void __ref rest_init(void) * The boot idle thread must execute schedule() * at least once to get things moving: */ - init_idle_bootup_task(current); schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ cpu_startup_entry(CPUHP_ONLINE); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 835a234d35e8..6d91c10b1814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5177,11 +5177,6 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } -void init_idle_bootup_task(struct task_struct *idle) -{ - idle->sched_class = &idle_sched_class; -} - /** * init_idle - set up an idle thread for a given CPU * @idle: task in question -- cgit v1.2.3 From 74dc3384fc7983b78cc46ebb1824968a3db85eb1 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 6 Aug 2017 14:41:41 +1000 Subject: sched/debug: Use task_pid_nr_ns in /proc/$pid/sched It appears as though the addition of the PID namespace did not update the output code for /proc/*/sched, which resulted in it providing PIDs that were not self-consistent with the /proc mount. This additionally made it trivial to detect whether a process was inside &init_pid_ns from userspace, making container detection trivial: https://github.com/jessfraz/amicontained This leads to situations such as: % unshare -pmf % mount -t proc proc /proc % head -n1 /proc/1/sched head (10047, #threads: 1) Fix this by just using task_pid_nr_ns for the output of /proc/*/sched. All of the other uses of task_pid_nr in kernel/sched/debug.c are from a sysctl context and thus don't need to be namespaced. Signed-off-by: Aleksa Sarai Signed-off-by: Peter Zijlstra (Intel) Acked-by: Eric W. 
Biederman Cc: Jess Frazelle Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cyphar@cyphar.com Link: http://lkml.kernel.org/r/20170806044141.5093-1-asarai@suse.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 3 ++- include/linux/sched/debug.h | 4 +++- kernel/sched/debug.c | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/proc/base.c b/fs/proc/base.c index 719c2e943ea1..98fd8f6df851 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; + struct pid_namespace *ns = inode->i_sb->s_fs_info; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; - proc_sched_show_task(p, m); + proc_sched_show_task(p, ns, m); put_task_struct(p); diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index e0eaee54c5a4..5d58d49e9f87 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -6,6 +6,7 @@ */ struct task_struct; +struct pid_namespace; extern void dump_cpu_task(int cpu); @@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p); #ifdef CONFIG_SCHED_DEBUG struct seq_file; -extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); +extern void proc_sched_show_task(struct task_struct *p, + struct pid_namespace *ns, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4fa66de52bd6..ac345115877b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -872,11 +872,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) #endif } -void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" -- cgit v1.2.3 From 20435d84e5f2041c64c792399ab6f2948a2c2252 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 7 Aug 2017 16:44:23 +0800 Subject: sched/debug: Introduce task_state_to_char() helper function Now that we have more than one place to get the task state, introduce the task_state_to_char() helper function to save some code. No functionality changed. Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1502095463-160172-3-git-send-email-xiexiuqi@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++++++++++ kernel/sched/core.c | 15 ++++----------- kernel/sched/debug.c | 10 +++------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..c28b182c9833 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1229,6 +1229,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return task_pgrp_nr_ns(tsk, &init_pid_ns); } +static inline char task_state_to_char(struct task_struct *task) +{ + const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + unsigned long state = task->state; + + state = state ?
__ffs(state) + 1 : 0; + + /* Make sure the string lines up properly with the number of task states: */ + BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); + + return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'; +} + /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d91c10b1814..f9f9948e2470 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5103,24 +5103,17 @@ out_unlock: return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned long state = p->state; - - /* Make sure the string lines up properly with the number of task states: */ - BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1); if (!try_get_task_stack(p)) return; - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - if (state == TASK_RUNNING) + + printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); + + if (p->state == TASK_RUNNING) printk(KERN_CONT " running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d8d2ea215b85..cfd84f79e075 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -426,14 +426,10 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - unsigned long state; - - if (rq->curr == p) { + if (rq->curr == p) SEQ_printf(m, ">R"); - } else { - state = p->state ? __ffs(p->state) + 1 : 0; - SEQ_printf(m, " %c", state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); - } + else + SEQ_printf(m, " %c", task_state_to_char(p)); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), -- cgit v1.2.3 From e3a2b93dddad315f01a4b67faee738954c084072 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 12 Jun 2017 16:00:19 +0300 Subject: RDMA/netlink: Add flag to consolidate common handling Add the ability to provide flags to control RDMA netlink callbacks and convert addr.c and sa_query.c to be the first users of such infrastructure. This allows moving their CAP_NET_ADMIN checks into the netlink core.
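With the flag in place, marking an operation as privileged becomes declarative, as in this sketch (op and handler names are hypothetical; the real conversions are in the device.c hunk below):

	static const struct ibnl_client_cbs foo_cb_table[] = {
		[RDMA_NL_FOO_OP] = {
			.dump	= foo_op_handler,
			.flags	= RDMA_NL_ADMIN_PERM,	/* core enforces
							 * CAP_NET_ADMIN */
		},
	};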
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/addr.c | 3 +-- drivers/infiniband/core/device.c | 12 +++++++++--- drivers/infiniband/core/netlink.c | 4 ++++ drivers/infiniband/core/sa_query.c | 6 ++---- include/rdma/rdma_netlink.h | 6 ++++++ 5 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 01236cef7bfb..9f3339861ec5 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -134,8 +134,7 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb, const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; if (ib_nl_is_good_ip_resp(nlh)) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d0994cd30eae..7ae29cc49a5e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1088,11 +1088,17 @@ EXPORT_SYMBOL(ib_get_net_dev_by_params); static const struct ibnl_client_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp}, + .dump = ib_nl_handle_resolve_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout}, + .dump = ib_nl_handle_set_timeout, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp}, + .dump = ib_nl_handle_ip_res_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, }; static int __init ib_core_init(void) diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 826fbd612c7d..c5ee62a24960 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -171,6 +171,10 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (!is_nl_valid(index, op)) return -EINVAL; + if ((rdma_nl_types[index].cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + /* * For response or local service set_timeout request, * there is no need to use netlink_dump_start. 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 70fa4cabe48e..b499f4422f41 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1033,8 +1033,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, int ret; if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -1109,8 +1108,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c124d8e43fc8..6ea36ec45401 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -7,6 +7,12 @@ struct ibnl_client_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); + u8 flags; +}; + +enum rdma_nl_flags { + /* Require CAP_NET_ADMIN */ + RDMA_NL_ADMIN_PERM = 1 << 0, }; /** -- cgit v1.2.3 From f00e64637061876ec7b6383b0bd80197c51e7312 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 15:35:20 +0300 Subject: RDMA/netlink: Rename and remove redundant parameter from ibnl_unicast* Netlink message header is not needed for unicast reply, hence remove it. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/iwpm_msg.c | 6 +++--- drivers/infiniband/core/iwpm_util.c | 4 ++-- drivers/infiniband/core/netlink.c | 10 ++++------ include/rdma/rdma_netlink.h | 8 ++------ 4 files changed, 11 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 45de263305f5..ca3c160bb9da 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -172,7 +172,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto add_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -248,7 +248,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto query_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -308,7 +308,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index c46442ac71a2..c81c55942626 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -597,7 +597,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; - ret = ibnl_unicast(skb, nlh, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; err_str = 
"Unable to send a nlmsg"; @@ -626,7 +626,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index e2395a1d9f45..b95a70013f19 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -239,25 +239,23 @@ static void rdma_nl_rcv(struct sk_buff *skb) mutex_unlock(&rdma_nl_mutex); } -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast(struct sk_buff *skb, u32 pid) { int err; err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } -EXPORT_SYMBOL(ibnl_unicast); +EXPORT_SYMBOL(rdma_nl_unicast); -int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) { int err; err = netlink_unicast(nls, skb, pid, 0); return (err < 0) ? err : 0; } -EXPORT_SYMBOL(ibnl_unicast_wait); +EXPORT_SYMBOL(rdma_nl_unicast_wait); int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int group, gfp_t flags) diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 6ea36ec45401..e7b0779385e9 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -56,22 +56,18 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, /** * Send the supplied skb to a specific userspace PID. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. */ -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid); +int rdma_nl_unicast(struct sk_buff *skb, u32 pid); /** * Send, with wait/1 retry, the supplied skb to a specific userspace PID. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. */ -int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid); +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); /** * Send the supplied skb to a netlink group. -- cgit v1.2.3 From 4d7f693af0c9d0d6940ff36f5adca1adfa0e7e6e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 15:44:32 +0300 Subject: RDMA/netlink: Rename and remove redundant parameter from ibnl_multicast The pointer to netlink header was not used in the ibnl_multicast function, so let's remove it and simplify the function signature. 
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/iwpm_msg.c | 2 +- drivers/infiniband/core/netlink.c | 5 ++--- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 4 +--- 5 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9f3339861ec5..30cf764824ec 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -184,7 +184,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index ca3c160bb9da..30825bb9b8e9 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -103,7 +103,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index b95a70013f19..5c627d1fbaa9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -257,12 +257,11 @@ int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) } EXPORT_SYMBOL(rdma_nl_unicast_wait); -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) { return nlmsg_multicast(nls, skb, 0, group, flags); } -EXPORT_SYMBOL(ibnl_multicast); +EXPORT_SYMBOL(rdma_nl_multicast); int __init rdma_nl_init(void) { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index b499f4422f41..977f64d0e983 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -861,7 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); if (!ret) ret = len; else diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index e7b0779385e9..16a94c425938 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -72,12 +72,10 @@ int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); /** * Send the supplied skb to a netlink group. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @group: Netlink group ID * @flags: allocation flags * Returns 0 on success or a negative error code. 
*/ -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags); +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); #endif /* _RDMA_NETLINK_H */ -- cgit v1.2.3 From ff61c425c1c563f1d688d59caf3b18a395cbf9c4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 15:51:16 +0300 Subject: RDMA/netlink: Simplify and rename ibnl_chk_listeners Reduce the ibnl_chk_listeners function to a single line by removing an unneeded comparison, and rename it so that it is compliant with the naming of the other RDMA netlink functions. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/netlink.c | 7 +++---- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 6 ++++++ 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 30cf764824ec..7310ece99cd9 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -325,7 +325,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; /* We fill in what we can, the response will fill the rest */ diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 5c627d1fbaa9..514959ccaf2d 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,12 +47,11 @@ static struct { const struct ibnl_client_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int ibnl_chk_listeners(unsigned int group) +int rdma_nl_chk_listeners(unsigned int group) { - if (netlink_has_listeners(nls, group) == 0) - return -1; - return 0; + return (netlink_has_listeners(nls, group)) ? 0 : -1; } +EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 977f64d0e983..2cc85c2b74b7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1418,7 +1418,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 16a94c425938..348e0bbe0fc9 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -78,4 +78,10 @@ int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); */ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); +/** + * Check if there are any listeners to the netlink group + * @group: the netlink group ID + * Returns 0 if there are listeners or a negative value if there are none. + */ +int rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ -- cgit v1.2.3 From 3250b4dbd87aa08c21891cabfc6f6b48b36fd7e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 19 Jun 2017 18:23:45 +0300 Subject: RDMA/netlink: Rename netlink callback struct The RDMA netlink client infrastructure was removed and made obsolete. The old infrastructure defined struct ibnl_client_cbs.
Now that all uses of this have been updated to the new infrastructure, rename the struct to be compliant with the current stack naming standards: struct rdma_nl_cbs. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/cma.c | 2 +- drivers/infiniband/core/device.c | 2 +- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/core/netlink.c | 4 ++-- include/rdma/rdma_netlink.h | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0c85f140e616..d8edd8b11561 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4458,7 +4458,7 @@ out: return skb->len; } -static const struct ibnl_client_cbs cma_cb_table[] = { +static const struct rdma_nl_cbs cma_cb_table[] = { [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7ae29cc49a5e..33a39518848c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1086,7 +1086,7 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static const struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct rdma_nl_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { .dump = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 8599271d8be6..452a3115e3e6 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason) } EXPORT_SYMBOL(iwcm_reject_msg); -static struct ibnl_client_cbs iwcm_nl_cb_table[] = { +static struct rdma_nl_cbs iwcm_nl_cb_table[] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 514959ccaf2d..a7082adae16b 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -44,7 +44,7 @@ static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; static struct { - const struct ibnl_client_cbs *cb_table; + const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; int rdma_nl_chk_listeners(unsigned int group) @@ -84,7 +84,7 @@ static bool is_nl_valid(unsigned int type, unsigned int op) } void rdma_nl_register(unsigned int index, - const struct ibnl_client_cbs cb_table[]) + const struct rdma_nl_cbs cb_table[]) { mutex_lock(&rdma_nl_mutex); if (!is_nl_msg_valid(index, 0)) { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 348e0bbe0fc9..92f8832297ab 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -5,7 +5,7 @@ #include #include -struct ibnl_client_cbs { +struct rdma_nl_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); u8 flags; }; @@ -21,7 +21,7 @@ enum rdma_nl_flags { * @cb_table: A table for op->callback */ void rdma_nl_register(unsigned int index, - const struct ibnl_client_cbs cb_table[]); + const struct rdma_nl_cbs cb_table[]); /** * Remove a client from IB netlink. 
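A short sketch of how a client registers such a table under the new naming; the table mirrors the cma.c hunk above, while the wrapper function is invented for illustration:

/* Sketch: registering a rdma_nl_cbs table (cf. the cma.c hunk above). */
static const struct rdma_nl_cbs example_cb_table[] = {
	[RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats },
};

static void example_register(void)
{
	rdma_nl_register(RDMA_NL_RDMA_CM, example_cb_table);
}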
-- cgit v1.2.3 From ecc82c53f9a4ce08ba7df626a4262c86841ced8f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 14:39:59 +0300 Subject: RDMA/core: Add and expose static device index This patch adds a static device index, in a similar fashion to what is already available in the netdev world (struct net->ifindex). In downstream patches, RDMA netlink will use this idx-to-ib_device conversion, so as part of this commit we expose the translation function to IB/core users. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/core_priv.h | 5 +++++ drivers/infiniband/core/device.c | 37 ++++++++++++++++++++++++++++++++++++- include/rdma/ib_verbs.h | 2 ++ 3 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 0c175590cf92..cbdcc81e1df8 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -309,4 +309,9 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, return 0; } #endif + +struct ib_device *__ib_device_get_by_index(u32 ifindex); +/* RDMA device netlink */ +void nldev_init(void); +void nldev_exit(void); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8828f26250a8..deae8b940994 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,6 +134,17 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } +struct ib_device *__ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (device->index == index) + return device; + + return NULL; +} + static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; @@ -145,7 +156,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } - static int alloc_name(char *name) { unsigned long *inuse; @@ -394,6 +404,30 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } +/** + * __dev_new_index - allocate a device index + * + * Returns a suitable unique value for a new device interface + * number. It assumes that fewer than 2^32-1 ib devices + * will be present in the system. + */ +static u32 __dev_new_index(void) +{ + /* + * The device index to allow stable naming. + * Similar to struct net -> ifindex. + */ + static u32 index; + + for (;;) { + if (!(++index)) + index = 1; + + if (!__ib_device_get_by_index(index)) + return index; + } +} + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -492,6 +526,7 @@ int ib_register_device(struct ib_device *device, if (client->add && !add_client_context(device, client)) client->add(device); + device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1082b4c81b2c..3391df5fdc9c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2298,6 +2298,8 @@ struct ib_device { struct rdmacg_device cg_device; #endif + u32 index; + /** * The following mandatory functions are used only at device * registration.
Keep functions such as these at the end of this -- cgit v1.2.3 From 1830ba21b9a475cfc6159e6cfe532c75fe7682a4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 15 Jun 2017 12:46:33 +0300 Subject: RDMA/netlink: Add and implement doit netlink callback The .doit callback is used by the netlink core to differentiate between get and set operations. The common convention is to use that call for command operations (SET, ADD, etc.) and/or for access without the NLM_F_DUMP flag. This commit adds the proper declaration and implementation to RDMA netlink. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/netlink.c | 19 ++++++++++++++----- include/rdma/rdma_netlink.h | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index a7082adae16b..484d6a8a2811 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -76,9 +76,13 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) static bool is_nl_valid(unsigned int type, unsigned int op) { - if (!is_nl_msg_valid(type, op) || - !rdma_nl_types[type].cb_table || - !rdma_nl_types[type].cb_table[op].dump) + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_msg_valid(type, op)) + return false; + + cb_table = rdma_nl_types[type].cb_table; + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; return true; } @@ -151,6 +155,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int op = RDMA_NL_GET_OP(type); struct netlink_callback cb = {}; struct netlink_dump_control c = {}; + int ret; if (!is_nl_valid(index, op)) return -EINVAL; @@ -169,10 +174,14 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, cb.nlh = nlh; cb.dump = rdma_nl_types[index].cb_table[op].dump; return cb.dump(skb, &cb); + } else { + c.dump = rdma_nl_types[index].cb_table[op].dump; + return netlink_dump_start(nls, skb, nlh, &c); } + if (rdma_nl_types[index].cb_table[op].doit) + ret = rdma_nl_types[index].cb_table[op].doit(skb, nlh, extack); + return ret; - c.dump = rdma_nl_types[index].cb_table[op].dump; - return netlink_dump_start(nls, skb, nlh, &c); } /* diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 92f8832297ab..e25bf1988846 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -6,6 +6,8 @@ #include struct rdma_nl_cbs { + int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); u8 flags; }; -- cgit v1.2.3 From 1a6e7c31d71db34d1b9bc3acc87eaea6c2ecc997 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 20 Jun 2017 07:55:53 +0300 Subject: RDMA/netlink: Add netlink device definitions to UAPI Introduce new defines to rdma_netlink.h, so the RDMA configuration tool will be able to communicate with the RDMA subsystem by using the shared defines. The addition of the new client (NLDEV) revealed that the RDMA_NL_I40IW define had been exposed by mistake; it is not backed by any RDMA netlink client now and will not be in the future, so this patch reuses its value and deletes the old define. NLDEV operates on objects. The struct ib_device has two straightforward objects: the device itself and the ports of that device.
This brings us to propose the following commands to work on those objects: * RDMA_NLDEV_CMD_{GET,SET,NEW,DEL} - works on ib_device itself * RDMA_NLDEV_CMD_PORT_{GET,SET,NEW,DEL} - works on ports of specific ib_device Those commands receive/return the device index (RDMA_NLDEV_ATTR_DEV_INDEX) and port index (RDMA_NLDEV_ATTR_PORT_INDEX). For device object accesses, the RDMA_NLDEV_ATTR_PORT_INDEX will return the maximum number of ports for specific ib_device and for port access the actual port index. The port index starts from 1 to follow RDMA/core internal semantics and the sysfs exposed knobs. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/netlink.c | 2 +- include/uapi/rdma/rdma_netlink.h | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index cd692bd73793..27352a352770 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -61,7 +61,7 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) RDMA_NL_IWPM_NUM_OPS, 0, RDMA_NL_LS_NUM_OPS, - 0 }; + RDMA_NLDEV_NUM_OPS }; /* * This BUILD_BUG_ON is intended to catch addition of new diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 02fe8390c18f..a44229fa5eca 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -8,7 +8,7 @@ enum { RDMA_NL_IWCM, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ - RDMA_NL_I40IW, + RDMA_NL_NLDEV, /* RDMA device interface */ RDMA_NL_NUM_CLIENTS }; @@ -222,4 +222,41 @@ struct rdma_nla_ls_gid { __u8 gid[16]; }; +enum rdma_nldev_command { + RDMA_NLDEV_CMD_UNSPEC, + + RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, + RDMA_NLDEV_CMD_NEW, + RDMA_NLDEV_CMD_DEL, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ + RDMA_NLDEV_CMD_PORT_SET, + RDMA_NLDEV_CMD_PORT_NEW, + RDMA_NLDEV_CMD_PORT_DEL, + + RDMA_NLDEV_NUM_OPS +}; + +enum rdma_nldev_attr { + /* don't change the order or add anything between, this is ABI! */ + RDMA_NLDEV_ATTR_UNSPEC, + + /* Identifier for ib_device */ + RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_DEV_NAME, /* string */ + /* + * Device index together with port index are identifiers + * for port/link properties. + * + * For RDMA_NLDEV_CMD_GET commamnd, port index will return number + * of available ports in ib_device, while for port specific operations, + * it will be real port index as it appears in sysfs. Port index follows + * sysfs notation and starts from 1 for the first port. + */ + RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_MAX +}; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From ac50525374315b9b609747f83b07f8dccb06b722 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 20 Jun 2017 14:47:08 +0300 Subject: RDMA/netlink: Expose device and port capability masks The port capability mask is exposed to user space via sysfs interface, while device capabilities are available for verbs only. This patch provides those capabilities through netlink interface. 
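Since both masks travel as 64-bit netlink attributes, userspace decodes them with the standard attribute helpers. A minimal libnl-style sketch, assuming the message has already been received (the callback name and surrounding plumbing are illustrative, not part of any patch):

/* Sketch: decoding RDMA_NLDEV_ATTR_CAP_FLAGS in userspace.
 * Needs <stdio.h>, <netlink/netlink.h>, <netlink/msg.h>, <netlink/attr.h>. */
static int example_msg_cb(struct nl_msg *msg, void *arg)
{
	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
	struct nlmsghdr *nlh = nlmsg_hdr(msg);

	if (nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, NULL) < 0)
		return NL_SKIP;
	if (tb[RDMA_NLDEV_ATTR_CAP_FLAGS])
		printf("cap flags: 0x%llx\n", (unsigned long long)
		       nla_get_u64(tb[RDMA_NLDEV_ATTR_CAP_FLAGS]));
	return NL_OK;
}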
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/nldev.c | 19 +++++++++++++++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index db9d9ffc1415..94c1e49074f5 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -50,18 +50,37 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, 0)) + return -EMSGSIZE; + return 0; } static int fill_port_info(struct sk_buff *msg, struct ib_device *device, u32 port) { + struct ib_port_attr attr; + int ret; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, 0)) + return -EMSGSIZE; + return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index a44229fa5eca..90de11db6580 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -257,6 +257,11 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + /* + * Device and port capabilities + */ + RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 9abb0d1bbd9529c574eacd8586e2bf68d17966cd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 27 Jun 2017 16:49:53 +0300 Subject: RDMA: Simplify get firmware interface There is a need to forward the FW version to user-space applications through RDMA netlink. In order to make this safe, we need to declare an nla_policy and limit the size of the FW string. The new define IB_FW_VERSION_NAME_MAX limits the size of the FW version string. It was chosen to be equal to ETHTOOL_FWVERS_LEN, because many drivers are already indirectly limited by that value. The introduction of this define allows us to remove the string size from the get_fw_str function signature.
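After the change every provider writes into a buffer of the same fixed size, so a conforming callback reduces to one bounded snprintf(). A sketch of the new shape for a fictional driver (the real per-driver conversions follow in the diff below):

/* Sketch only: the post-change callback shape; the driver name is made up. */
static void exampledrv_get_dev_fw_str(struct ib_device *ibdev, char *str)
{
	/* The core guarantees str holds IB_FW_VERSION_NAME_MAX bytes. */
	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", 1, 0);
}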
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 4 ++-- drivers/infiniband/core/sysfs.c | 4 ++-- drivers/infiniband/hw/cxgb3/iwch_provider.c | 5 ++--- drivers/infiniband/hw/cxgb4/provider.c | 5 ++--- drivers/infiniband/hw/hfi1/verbs.c | 5 ++--- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 7 +++---- drivers/infiniband/hw/mlx4/main.c | 5 ++--- drivers/infiniband/hw/mlx5/main.c | 8 ++++---- drivers/infiniband/hw/mthca/mthca_provider.c | 5 ++--- drivers/infiniband/hw/nes/nes_verbs.c | 5 ++--- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 5 ++--- drivers/infiniband/hw/qedr/main.c | 5 ++--- drivers/infiniband/hw/usnic/usnic_ib_main.c | 6 ++---- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 5 ++--- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 3 +-- include/rdma/ib_verbs.h | 6 ++++-- 16 files changed, 36 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 66b109bc6753..fbc92c649be8 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -336,10 +336,10 @@ static int read_port_immutable(struct ib_device *device) return 0; } -void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->get_dev_fw_str) - dev->get_dev_fw_str(dev, str, str_len); + dev->get_dev_fw_str(dev, str); else str[0] = '\0'; } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7ebe1ef23652..abc5ab581f82 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1210,8 +1210,8 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ib_device *dev = container_of(device, struct ib_device, dev); - ib_get_device_fw_str(dev, buf, PAGE_SIZE); - strlcat(buf, "\n", PAGE_SIZE); + ib_get_device_fw_str(dev, buf); + strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 0cd0c1fa27d4..099e76f3758a 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1336,8 +1336,7 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) { struct iwch_dev *iwch_dev = to_iwch_dev(ibdev); struct ethtool_drvinfo info; @@ -1345,7 +1344,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, pr_debug("%s dev 0x%p\n", __func__, iwch_dev); lldev->ethtool_ops->get_drvinfo(lldev, &info); - snprintf(str, str_len, "%s", info.fw_version); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); } int iwch_register_device(struct iwch_dev *dev) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0771e9a4d061..346e8334279a 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -517,14 +517,13 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *dev, char *str) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev); pr_debug("%s dev 0x%p\n", __func__, dev); - snprintf(str, str_len, "%u.%u.%u.%u", + snprintf(str, 
IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u", FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index dc51bf247006..c88c03c11555 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1561,14 +1561,13 @@ static void init_ibport(struct hfi1_pportdata *ppd) RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); } -static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str) { struct rvt_dev_info *rdi = ib_to_rvt(ibdev); struct hfi1_ibdev *dev = dev_from_rdi(rdi); u32 ver = dd_from_dev(dev)->dc8051_ver; - snprintf(str, str_len, "%u.%u.%u", dc8051_ver_maj(ver), + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver), dc8051_ver_min(ver), dc8051_ver_patch(ver)); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 02d871db7ca5..1aa411034a27 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2584,13 +2584,12 @@ static const char * const i40iw_hw_stat_names[] = { "iwRdmaInv" }; -static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str) { u32 firmware_version = I40IW_FW_VERSION; - snprintf(str, str_len, "%u.%u", firmware_version, - (firmware_version & 0x000000ff)); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version, + (firmware_version & 0x000000ff)); } /** diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1f25a37eb056..c636842c5be0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2587,12 +2587,11 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_fw_ver_str(struct ib_device *device, char *str, - size_t str_len) +static void get_fw_ver_str(struct ib_device *device, char *str) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9279631d8da0..0a5a4e3fa66d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3285,13 +3285,13 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *ibdev, char *str) { struct mlx5_ib_dev *dev = container_of(ibdev, struct mlx5_ib_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), - fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d", + fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), + fw_rev_sub(dev->mdev)); } static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index c197cd9b193f..eae9bffd45d4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1178,12 +1178,11 @@ static 
int mthca_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *device, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *device, char *str) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", (int) (dev->fw_ver >> 32), (int) (dev->fw_ver >> 16) & 0xffff, (int) dev->fw_ver & 0xffff); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 25dcd7573df9..c2943e39d2f9 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3672,15 +3672,14 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *dev, char *str) { struct nes_ib_device *nesibdev = container_of(dev, struct nes_ib_device, ibdev); struct nes_vnic *nesvnic = nesibdev->nesvnic; nes_debug(NES_DBG_INIT, "\n"); - snprintf(str, str_len, "%u.%u", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", (nesvnic->nesdev->nesadapter->firmware_version >> 16), (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 757c65816295..fbfbd9e96147 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -107,12 +107,11 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *device, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *device, char *str) { struct ocrdma_dev *dev = get_ocrdma_dev(device); - snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]); } static int ocrdma_register_device(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 199b6edbef92..97d033f51dc9 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -68,13 +68,12 @@ static enum rdma_link_layer qedr_link_layer(struct ib_device *device, return IB_LINK_LAYER_ETHERNET; } -static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str) { struct qedr_dev *qedr = get_qedr_dev(ibdev); u32 fw_ver = (u32)qedr->attr.fw_ver; - snprintf(str, str_len, "%d. %d. %d. %d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. 
%d", (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF, (fw_ver >> 8) & 0xFF, fw_ver & 0xFF); } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index e69c8e476a2b..e86700f994cb 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -333,9 +333,7 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void usnic_get_dev_fw_str(struct ib_device *device, - char *str, - size_t str_len) +static void usnic_get_dev_fw_str(struct ib_device *device, char *str) { struct usnic_ib_dev *us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev); @@ -345,7 +343,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); mutex_unlock(&us_ibdev->usdev_lock); - snprintf(str, str_len, "%s", info.fw_version); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); } /* Start of PF discovery section */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index e76565280afa..7f29e4db28a1 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -102,12 +102,11 @@ static struct device_attribute *pvrdma_class_attributes[] = { &dev_attr_board_id }; -static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str, - size_t str_len) +static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str) { struct pvrdma_dev *dev = container_of(device, struct pvrdma_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d\n", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d\n", (int) (dev->dsr->caps.fw_ver >> 32), (int) (dev->dsr->caps.fw_ver >> 16) & 0xffff, (int) dev->dsr->caps.fw_ver & 0xffff); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7871379342f4..98e30b41e436 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -62,8 +62,7 @@ static void ipoib_get_drvinfo(struct net_device *netdev, { struct ipoib_dev_priv *priv = ipoib_priv(netdev); - ib_get_device_fw_str(priv->ca, drvinfo->fw_version, - sizeof(drvinfo->fw_version)); + ib_get_device_fw_str(priv->ca, drvinfo->fw_version); strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), sizeof(drvinfo->bus_info)); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3391df5fdc9c..e0e87a1f66fb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -64,6 +64,8 @@ #include #include +#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; @@ -2307,7 +2309,7 @@ struct ib_device { * in fast paths. 
*/ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); - void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); + void (*get_dev_fw_str)(struct ib_device *, char *str); }; struct ib_client { @@ -2343,7 +2345,7 @@ struct ib_client { struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); -void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); +void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, -- cgit v1.2.3 From 8621a7e3c1c22e18385c9ced1647363884ea2aa1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 27 Jun 2017 16:58:59 +0300 Subject: RDMA/netlink: Export FW version Add FW version to the device properties exported by RDMA netlink, to be used by RDMAtool. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 9 +++++++++ include/uapi/rdma/rdma_netlink.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 94c1e49074f5..cdc970ca5a1b 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -40,10 +40,14 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1}, [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { + char fw[IB_FW_VERSION_NAME_MAX]; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) @@ -56,6 +60,11 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) device->attrs.device_cap_flags, 0)) return -EMSGSIZE; + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) */ + if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 90de11db6580..5159858730b0 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -262,6 +262,10 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + /* + * FW version + */ + RDMA_NLDEV_ATTR_FW_VERSION, /* string */ RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 1aaff896ca6b968a639e3e1e72ba6146ba332501 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 14:01:37 +0300 Subject: RDMA/netlink: Export node_guid and sys_image_guid Add Node GUID and system image GUID to the device properties exported by RDMA netlink, to be used by RDMAtool. 
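Because the kernel converts both values with be64_to_cpu() before filling the message, the attributes arrive in host byte order and a consumer can use them directly. A hypothetical fragment in the style of the earlier cap-flags sketch:

/* Sketch: GUIDs are delivered in host byte order (see be64_to_cpu below). */
if (tb[RDMA_NLDEV_ATTR_NODE_GUID])
	printf("node_guid 0x%016llx\n", (unsigned long long)
	       nla_get_u64(tb[RDMA_NLDEV_ATTR_NODE_GUID]));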
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 8 ++++++++ include/uapi/rdma/rdma_netlink.h | 13 +++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index cdc970ca5a1b..f932c2c3fad0 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -42,6 +42,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -65,6 +67,12 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), 0)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), 0)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 5159858730b0..fe3a7429e7a1 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -266,6 +266,19 @@ enum rdma_nldev_attr { * FW version */ RDMA_NLDEV_ATTR_FW_VERSION, /* string */ + + /* + * Node GUID (in host byte order) associated with the RDMA device. + */ + RDMA_NLDEV_ATTR_NODE_GUID, /* u64 */ + + /* + * System image GUID (in host byte order) associated with + * this RDMA device and other devices which are part of a + * single system. + */ + RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 12026fbba6af2fc53c3c6cf88bdfc6561986ba82 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 15:05:14 +0300 Subject: RDMA/netlink: Advertise IB subnet prefix Add IB subnet prefix to the port properties exported by RDMA netlink. 
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 5 +++++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index f932c2c3fad0..7af71d5e52c8 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -44,6 +44,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = IB_FW_VERSION_NAME_MAX - 1}, [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -97,6 +98,10 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, (u64)attr.port_cap_flags, 0)) return -EMSGSIZE; + if (rdma_protocol_ib(device, port) && + nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, 0)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index fe3a7429e7a1..481003182a35 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -279,6 +279,11 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + /* + * Subnet prefix (in host byte order) + */ + RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 80a06dd36f79de7007f21f5cbe42181a4e5c7d6d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 15:38:36 +0300 Subject: RDMA/netlink: Export lids and sm_lids According to the IB specification, the LID and SM_LID are 16 bits wide, but to support OmniPath users, export them as 32-bit values from the beginning. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 9 ++++++++- include/uapi/rdma/rdma_netlink.h | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 7af71d5e52c8..16f1d28bea69 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -45,6 +45,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -102,7 +104,12 @@ static int fill_port_info(struct sk_buff *msg, nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, attr.subnet_prefix, 0)) return -EMSGSIZE; - + if (rdma_protocol_ib(device, port)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + } return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 481003182a35..7d5caaf54126 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -284,6 +284,14 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + /* + * Local Identifier (LID). + * According to the IB specification, it is a 16-bit address assigned + * by the Subnet Manager. Extended to be 32-bit for OmniPath users.
+ */ + RDMA_NLDEV_ATTR_LID, /* u32 */ + RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 34840fea112d36507c19dc6052b8c6d88bdd9c16 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 15:49:30 +0300 Subject: RDMA/netlink: Export LID mask control (LMC) Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 3 +++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 16f1d28bea69..11546f87c5dc 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -47,6 +47,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -109,6 +110,8 @@ static int fill_port_info(struct sk_buff *msg, return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; } return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 7d5caaf54126..035706e6b016 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -292,6 +292,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_LID, /* u32 */ RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + /* + * LID mask control (LMC) + */ + RDMA_NLDEV_ATTR_LMC, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 5654e49db0b2d87c12b6e120b6a830abe3d3921b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 29 Jun 2017 13:12:45 +0300 Subject: RDMA/netlink: Provide port state and physical link state Add port state and physical link state to the users of RDMA netlink. 
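The state attribute carries the raw enum ib_port_state value and phys_state the IB physical port state. A consumer might map the former back to a name, as in this sketch (the enum constants come from include/rdma/ib_verbs.h):

/* Sketch: naming the u8 RDMA_NLDEV_ATTR_PORT_STATE values. */
static const char * const example_port_state[] = {
	[IB_PORT_NOP]          = "NOP",
	[IB_PORT_DOWN]         = "DOWN",
	[IB_PORT_INIT]         = "INIT",
	[IB_PORT_ARMED]        = "ARMED",
	[IB_PORT_ACTIVE]       = "ACTIVE",
	[IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER",
};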
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 6 ++++++ include/uapi/rdma/rdma_netlink.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 11546f87c5dc..32ccb2b88933 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -48,6 +48,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -113,6 +115,10 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) return -EMSGSIZE; } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 035706e6b016..c488c3cf361b 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -297,6 +297,9 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_LMC, /* u8 */ + RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ + RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 1bb77b8c1d57149ed0aa6825255ead80ae584034 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 29 Jun 2017 16:01:29 +0300 Subject: RDMA/netlink: Export node_type Add the ability to get node_type for RDMA netlink users. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 3 +++ include/uapi/rdma/rdma_netlink.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 32ccb2b88933..474022274e09 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -50,6 +50,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -79,6 +80,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, be64_to_cpu(device->attrs.sys_image_guid), 0)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index c488c3cf361b..861440a87e7c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -300,6 +300,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + RDMA_NLDEV_ATTR_DEV_NODE_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 9d664c0aec3bfdb77fcf7de61cfe1febbecdd389 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2017 13:05:06 +0200 Subject: locking/atomic: Fix atomic_set_release() for 'funny' architectures Those architectures that have a special atomic_set implementation also need a special
atomic_set_release(), because for the very same reason WRITE_ONCE() is broken for them, smp_store_release() is too. The vast majority is architectures that have spinlock hash based atomic implementation except hexagon which seems to have a hardware 'feature'. The spinlock based atomics should be SC, that is, none of them appear to place extra barriers in atomic_cmpxchg() or any of the other SC atomic primitives and therefore seem to rely on their spinlock implementation being SC (I did not fully validate all that). Therefore, the normal atomic_set() is SC and can be used at atomic_set_release(). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Chris Metcalf [for tile] Cc: Boqun Feng Cc: Linus Torvalds Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: davem@davemloft.net Cc: james.hogan@imgtec.com Cc: jejb@parisc-linux.org Cc: rkuo@codeaurora.org Cc: vgupta@synopsys.com Link: http://lkml.kernel.org/r/20170609110506.yod47flaav3wgoj5@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arc/include/asm/atomic.h | 2 ++ arch/hexagon/include/asm/atomic.h | 2 ++ arch/metag/include/asm/atomic_lock1.h | 2 ++ arch/parisc/include/asm/atomic.h | 2 ++ arch/sparc/include/asm/atomic_32.h | 2 ++ arch/tile/include/asm/atomic_32.h | 2 ++ include/asm-generic/atomic64.h | 2 ++ 7 files changed, 14 insertions(+) (limited to 'include') diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 54b54da6384c..11859287c52a 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i) atomic_ops_unlock(flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #endif /* diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index a62ba368b27d..fb3dfb2a667e 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new) ); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /** * atomic_read - reads a word, atomically * @v: pointer to atomic value diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 6c1380a8a0d4..eee779f26cc4 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i) return i; } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define ATOMIC_OP(op, c_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 5394b9c5f914..17b98a87e5e2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i) _atomic_spin_unlock_irqrestore(v, flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + static __inline__ int atomic_read(const atomic_t *v) { return READ_ONCE((v)->counter); diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index ee3f11c43cda..7643e979e333 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int); int __atomic_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define atomic_read(v) ACCESS_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) 
diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index a93774255136..53a423e7cb92 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n) _atomic_xchg(&v->counter, n); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /* A 64bit atomic type */ typedef struct { diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index dad68bf46c77..8d28eb010d0d 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -21,6 +21,8 @@ typedef struct { extern long long atomic64_read(const atomic64_t *v); extern void atomic64_set(atomic64_t *v, long long i); +#define atomic64_set_release(v, i) atomic64_set((v), (i)) + #define ATOMIC64_OP(op) \ extern void atomic64_##op(long long a, atomic64_t *v); -- cgit v1.2.3 From 0aa1125fa8bc5e5f98317156728fa4d0293561a5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Jun 2017 21:02:12 +0300 Subject: locking/rwsem-spinlock: Add killable versions of __down_read() Rename __down_read() to __down_read_common() and teach it to abort waiting when signals are pending and a killable state argument was passed. Note that we shouldn't wake anybody up on the EINTR path: we check for signal_pending_state() after the (!waiter.task) test and under the spinlock, so the current task cannot have been woken up. There are two possible cases: a writer owns the sem, or a writer is the first waiter on the sem. If a writer owns the sem, no one else may work with it in parallel; it will wake somebody when it calls up_write() or downgrade_write(). If a writer is the first waiter, it will be woken up when the last active reader releases the sem and sem->count becomes 0. Also note that set_current_state() may be moved down to schedule() (after the !waiter.task check), as all assignments in this type of semaphore (including wake_up) occur under the spinlock, so we can't miss anything.
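__down_read_killable() is the low-level primitive; once a caller-facing wrapper such as down_read_killable() is wired on top of it (a follow-up step, assumed here for illustration), read-side users gain the familiar killable pattern:

/* Sketch: a killable read-side acquisition built on this primitive. */
static int example_read_op(struct rw_semaphore *sem)
{
	if (down_read_killable(sem))
		return -EINTR;	/* woken by a fatal signal */
	/* ... read-side critical section ... */
	up_read(sem);
	return 0;
}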
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/149789533283.9059.9829416940494747182.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- include/linux/rwsem-spinlock.h | 1 + kernel/locking/rwsem-spinlock.c | 37 ++++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index ae0528b834cd..e784761a4443 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -32,6 +32,7 @@ struct rw_semaphore { #define RWSEM_UNLOCKED_VALUE 0x00000000 extern void __down_read(struct rw_semaphore *sem); +extern int __must_check __down_read_killable(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); extern int __must_check __down_write_killable(struct rw_semaphore *sem); diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 20819df98125..0848634c5512 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +int __sched __down_read_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; unsigned long flags; @@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; @@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - /* wait to be given the lock */ for (;;) { if (!waiter.task) break; + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + raw_spin_lock_irqsave(&sem->wait_lock, flags); } - __set_current_state(TASK_RUNNING); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); out: - ; + return 0; + +out_nolock: + /* + * We didn't take the lock, so that there is a writer, which + * is owner or the first waiter of the sem. If it's a waiter, + * it will be woken by current owner. Not need to wake anybody. 
+ */ + list_del(&waiter.list); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + return -EINTR; +} + +void __sched __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); } /* -- cgit v1.2.3 From 83ced169d9a01f22eb39f1fcc1f89ad9d223238f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Jun 2017 21:02:26 +0300 Subject: locking/rwsem-xadd: Add killable versions of rwsem_down_read_failed() Rename rwsem_down_read_failed() in __rwsem_down_read_failed_common() and teach it to abort waiting in case of pending signals and killable state argument passed. Note, that we shouldn't wake anybody up in EINTR path, as: We check for (waiter.task) under spinlock before we go to out_nolock path. Current task wasn't able to be woken up, so there are a writer, owning the sem, or a writer, which is the first waiter. In the both cases we shouldn't wake anybody. If there is a writer, owning the sem, and we were the only waiter, remove RWSEM_WAITING_BIAS, as there are no waiters anymore. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arnd@arndb.de Cc: avagin@virtuozzo.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: gorcunov@virtuozzo.com Cc: heiko.carstens@de.ibm.com Cc: hpa@zytor.com Cc: ink@jurassic.park.msu.ru Cc: mattst88@gmail.com Cc: rth@twiddle.net Cc: schwidefsky@de.ibm.com Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/149789534632.9059.2901382369609922565.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 1 + kernel/locking/rwsem-xadd.c | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index dd1d14250340..0ad7318ff299 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -44,6 +44,7 @@ struct rw_semaphore { }; extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 34e727f18e49..02f660666ab8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Wait for the read lock to be granted */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; @@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); if (!waiter.task) break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } schedule(); } __set_current_state(TASK_RUNNING); return sem; +out_nolock: + list_del(&waiter.list); + if 
(list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_read_failed); +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the -- cgit v1.2.3 From 1dbb6704de91b169a58d0c8221624afd6a95cfc7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Aug 2017 17:24:04 +0200 Subject: jump_label: Fix concurrent static_key_enable/disable() static_key_enable/disable are trying to cap the static key count to 0/1. However, their use of key->enabled is outside jump_label_lock so they do not really ensure that. Rewrite them to do a quick check for an already enabled (respectively, already disabled) key, and then recheck under the jump label lock. Unlike static_key_slow_inc/dec, a failed check under the jump label lock does not modify key->enabled. Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Cc: Eric Dumazet Cc: Jason Baron Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1501601046-35683-2-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 22 +++++++++-------- kernel/jump_label.c | 59 +++++++++++++++++++++++++++++----------------- 2 files changed, 49 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2afd74b9d844..740a42ea7f7f 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -234,22 +234,24 @@ static inline int jump_label_apply_nops(struct module *mod) static inline void static_key_enable(struct static_key *key) { - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); + STATIC_KEY_CHECK_USE(); - if (!count) - static_key_slow_inc(key); + if (atomic_read(&key->enabled) != 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); + STATIC_KEY_CHECK_USE(); - if (count) - static_key_slow_dec(key); + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + atomic_set(&key->enabled, 0); } #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d11c506a6ac3..833eecae825e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,28 +79,6 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_enable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (!count) - static_key_slow_inc(key); -} -EXPORT_SYMBOL_GPL(static_key_enable); - -void static_key_disable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (count) - static_key_slow_dec(key); -} -EXPORT_SYMBOL_GPL(static_key_disable); - void
static_key_slow_inc(struct static_key *key) { int v, v1; @@ -139,6 +117,43 @@ void static_key_slow_inc(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_inc); +void static_key_enable(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) > 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + + cpus_read_lock(); + jump_label_lock(); + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); + jump_label_update(key); + atomic_set(&key->enabled, 1); + } + jump_label_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + + cpus_read_lock(); + jump_label_lock(); + if (atomic_cmpxchg(&key->enabled, 1, 0)) + jump_label_update(key); + jump_label_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable); + static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { -- cgit v1.2.3 From be040bea9085a9c2b1700c9e60888777baeb96d5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Aug 2017 17:24:06 +0200 Subject: cpuset: Make nr_cpusets private Any use of key->enabled (that is static_key_enabled and static_key_count) outside jump_label_lock should handle its own serialization. In the case of cpusets_enabled_key, the key is always incremented/decremented under cpuset_mutex, and hence the same rule applies to nr_cpusets. The rule *is* respected currently, but the mutex is static so nr_cpusets should be static too. Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Acked-by: Zefan Li Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1501601046-35683-4-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 6 ------ kernel/cgroup/cpuset.c | 7 +++++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 898cfe2eeb42..e74655d941b7 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -37,12 +37,6 @@ static inline bool cpusets_enabled(void) return static_branch_unlikely(&cpusets_enabled_key); } -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} - static inline void cpuset_inc(void) { static_branch_inc(&cpusets_pre_enable_key); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8d5151688504..9ed6a051a1b9 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. */ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * -- cgit v1.2.3 From 5a40527f8f0798553764fc8db4111d7d9c33ea51 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 1 Aug 2017 09:02:56 +0100 Subject: jump_label: Provide hotplug context variants As using the normal static key API under the hotplug lock is pretty much impossible, let's provide variants of some of them that require the hotplug lock to have already been taken. These functions are only meant to be used in CPU hotplug callbacks.
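As an illustration (not part of the patch), a minimal sketch of the intended use from a CPU hotplug callback; the key, the callback and the cpu_has_my_feature() helper are hypothetical:

	static DEFINE_STATIC_KEY_FALSE(my_feature_key);

	/*
	 * CPU hotplug callbacks run with the hotplug lock already held, so
	 * plain static_branch_enable() would deadlock trying to take it
	 * again; the _cpuslocked variant skips that acquisition.
	 */
	static int my_feature_cpu_online(unsigned int cpu)
	{
		if (cpu_has_my_feature(cpu))	/* hypothetical feature test */
			static_branch_enable_cpuslocked(&my_feature_key);
		return 0;
	}

Such a callback would typically be registered with cpuhp_setup_state().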
Signed-off-by: Marc Zyngier Signed-off-by: Peter Zijlstra (Intel) Cc: Leo Yan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/20170801080257.5056-4-marc.zyngier@arm.com Signed-off-by: Ingo Molnar --- Documentation/static-keys.txt | 15 +++++++++++++++ include/linux/jump_label.h | 11 +++++++++-- kernel/jump_label.c | 22 ++++++++++++++++++---- 3 files changed, 42 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt index 870b4be3cb11..ab16efe0c79d 100644 --- a/Documentation/static-keys.txt +++ b/Documentation/static-keys.txt @@ -154,6 +154,21 @@ and 'static_key_count()'. In general, if you use these functions, they should be protected with the same mutex used around the enable/disable or increment/decrement function. +Note that switching branches results in some locks being taken, +particularly the CPU hotplug lock (in order to avoid races against +CPUs being brought into the kernel whilst the kernel is getting +patched). Calling the static key API from within a hotplug notifier is +thus a sure deadlock recipe. In order to still allow use of the +functionality, the following functions are provided: + + static_key_enable_cpuslocked() + static_key_disable_cpuslocked() + static_branch_enable_cpuslocked() + static_branch_disable_cpuslocked() + +These functions are *not* general purpose, and must only be used when +you really know that you're in the above context, and no other. + Where an array of keys is required, it can be defined as:: DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 740a42ea7f7f..cd5861651b17 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -163,6 +163,8 @@ extern void jump_label_apply_nops(struct module *mod); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); +extern void static_key_enable_cpuslocked(struct static_key *key); +extern void static_key_disable_cpuslocked(struct static_key *key); /* * We should be using ATOMIC_INIT() for initializing .enabled, but @@ -254,6 +256,9 @@ static inline void static_key_disable(struct static_key *key) atomic_set(&key->enabled, 0); } +#define static_key_enable_cpuslocked(k) static_key_enable((k)) +#define static_key_disable_cpuslocked(k) static_key_disable((k)) + #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } @@ -415,8 +420,10 @@ extern bool ____wrong_branch_error(void); * Normal usage; boolean enable/disable.
*/ -#define static_branch_enable(x) static_key_enable(&(x)->key) -#define static_branch_disable(x) static_key_disable(&(x)->key) +#define static_branch_enable(x) static_key_enable(&(x)->key) +#define static_branch_disable(x) static_key_disable(&(x)->key) +#define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) +#define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index cc6d815c75ed..0bf2e8f5244a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -126,15 +126,15 @@ void static_key_slow_inc(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_inc); -void static_key_enable(struct static_key *key) +void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } - cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -145,23 +145,37 @@ void static_key_enable(struct static_key *key) atomic_set_release(&key->enabled, 1); } jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); + +void static_key_enable(struct static_key *key) +{ + cpus_read_lock(); + static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); -void static_key_disable(struct static_key *key) +void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(); + if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } - cpus_read_lock(); jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0)) jump_label_update(key); jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); + +void static_key_disable(struct static_key *key) +{ + cpus_read_lock(); + static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); -- cgit v1.2.3 From 8b1b436dd1ccc8a1770af6e56eec047ad4920659 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Jun 2017 18:05:07 +0200 Subject: mm, locking: Rework {set,clear,mm}_tlb_flush_pending() Commit: af2c1401e6f9 ("mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates") added smp_mb__before_spinlock() to set_tlb_flush_pending(). I think we can solve the same problem without this barrier. If instead we mandate that mm_tlb_flush_pending() is used while holding the PTL we're guaranteed to observe prior set_tlb_flush_pending() instances. For this to work we need to rework migrate_misplaced_transhuge_page() a little and move the test up into do_huge_pmd_numa_page(). NOTE: this relies on flush_tlb_range() to guarantee: (1) it ensures that prior page table updates are visible to the page table walker and (2) it ensures that subsequent memory accesses are only made visible after the invalidation has completed This is required for architectures that implement TRANSPARENT_HUGEPAGE (arc, arm, arm64, mips, powerpc, s390, sparc, x86) or otherwise use mm_tlb_flush_pending() in their page-table operations (arm, arm64, x86). 
This appears true for: - arm (DSB ISB before and after), - arm64 (DSB ISHST before, and DSB ISH after), - powerpc (PTESYNC before and after), - s390 and x86 TLB invalidate are serializing instructions But I failed to understand the situation for: - arc, mips, sparc Now SPARC64 is a wee bit special in that flush_tlb_range() is a no-op and it flushes the TLBs using arch_{enter,leave}_lazy_mmu_mode() inside the PTL. It still needs to guarantee the PTL unlock happens _after_ the invalidate completes. Vineet, Ralf and Dave could you guys please have a look? Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: David S. Miller Cc: Heiko Carstens Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rik van Riel Cc: Russell King Cc: Thomas Gleixner Cc: Vineet Gupta Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 33 +++++++++++++++++++++++++++------ mm/huge_memory.c | 20 ++++++++++++++++++++ mm/migrate.c | 6 ------ 3 files changed, 47 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f384bb62d8e..36ea3cf7d85e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -531,23 +531,44 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) */ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { - barrier(); + /* + * Must be called with PTL held; such that our PTL acquire will have + * observed the store from set_tlb_flush_pending(). + */ return mm->tlb_flush_pending; } static inline void set_tlb_flush_pending(struct mm_struct *mm) { mm->tlb_flush_pending = true; - /* - * Guarantee that the tlb_flush_pending store does not leak into the - * critical section updating the page tables + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * mm->tlb_flush_pending = true; + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * mm->tlb_flush_pending = false; + * + * So the =true store is constrained by the PTL unlock, and the =false + * store is constrained by the TLB invalidate. */ - smp_mb__before_spinlock(); } /* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void clear_tlb_flush_pending(struct mm_struct *mm) { - barrier(); + /* see set_tlb_flush_pending */ mm->tlb_flush_pending = false; } #else diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86975dec0ba1..c76a720b936b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1410,6 +1410,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; + bool need_flush = false; bool page_locked; bool migrated = false; bool was_writable; @@ -1495,11 +1496,30 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) goto clear_pmdnuma; } + /* + * Since we took the NUMA fault, we must have observed the !accessible + * bit. Make sure all other CPUs agree with that, to avoid them + * modifying the page we're about to migrate. + * + * Must be done under PTL such that we'll observe the relevant + * set_tlb_flush_pending(). 
+ */ + if (mm_tlb_flush_pending(vma->vm_mm)) + need_flush = true; + /* * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ spin_unlock(vmf->ptl); + + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ + if (need_flush) + flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { diff --git a/mm/migrate.c b/mm/migrate.c index 627671551873..d68a41da6abb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1937,12 +1937,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - /* - * We are not sure a pending tlb flush here is for a huge page - * mapping or not. Hence use the tlb range variant - */ - if (mm_tlb_flush_pending(mm)) - flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ __SetPageLocked(new_page); -- cgit v1.2.3 From d89e588ca4081615216cc25f2489b0281ac0bfe9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Sep 2016 11:37:53 +0200 Subject: locking: Introduce smp_mb__after_spinlock() Since its inception, our understanding of ACQUIRE, esp. as applied to spinlocks, has changed somewhat. Also, I wonder if, with a simple change, we cannot make it provide more. The problem with the comment is that the STORE done by spin_lock isn't itself ordered by the ACQUIRE, and therefore a later LOAD can pass over it and cross with any prior STORE, rendering the default WMB insufficient (pointed out by Alan). Now, this is only really a problem on PowerPC and ARM64, both of which already defined smp_mb__before_spinlock() as a smp_mb(). At the same time, we can get a much stronger construct if we place that same barrier _inside_ the spin_lock(). In that case we upgrade the RCpc spinlock to an RCsc. That would make all schedule() calls fully transitive against one another. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Alan Stern Cc: Benjamin Herrenschmidt Cc: Linus Torvalds Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/spinlock.h | 2 ++ arch/powerpc/include/asm/spinlock.h | 3 +++ include/linux/atomic.h | 3 +++ include/linux/spinlock.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 4 ++-- 5 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d553f8..b103888b694a 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -367,5 +367,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) * smp_mb__before_spinlock() can restore the required ordering. 
*/ #define smp_mb__before_spinlock() smp_mb() +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 8c1b913de6d7..c1b1ec94b06c 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -342,5 +342,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/include/linux/atomic.h b/include/linux/atomic.h index c56be7410130..40d6bfec0e0d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -38,6 +38,9 @@ * Besides, if an arch has a special barrier for acquire/release, it could * implement its own __atomic_op_* and use the same framework for building * variants + * + * If an architecture overrides __atomic_op_acquire() it will probably want + * to define smp_mb__after_spinlock(). */ #ifndef __atomic_op_acquire #define __atomic_op_acquire(op, args...) \ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d9510e8522d4..840281095933 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -130,6 +130,42 @@ do { \ #define smp_mb__before_spinlock() smp_wmb() #endif +/* + * This barrier must provide two things: + * + * - it must guarantee a STORE before the spin_lock() is ordered against a + * LOAD after it, see the comments at its two usage sites. + * + * - it must ensure the critical section is RCsc. + * + * The latter is important for cases where we observe values written by other + * CPUs in spin-loops, without barriers, while being subject to scheduling. + * + * CPU0 CPU1 CPU2 + * + * for (;;) { + * if (READ_ONCE(X)) + * break; + * } + * X=1 + * + * + * r = X; + * + * without transitivity it could be that CPU1 observes X!=0 breaks the loop, + * we get migrated and CPU2 sees X==0. + * + * Since most load-store architectures implement ACQUIRE with an smp_mb() after + * the LL/SC loop, they need no further barriers. Similarly all our TSO + * architectures imply an smp_mb() for each atomic instruction and equally don't + * need more. + * + * Architectures that can implement ACQUIRE better need to take care. + */ +#ifndef smp_mb__after_spinlock +#define smp_mb__after_spinlock() do { } while (0) +#endif + /** * raw_spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20fba81..9fece583a1f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1967,8 +1967,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with mb() in * set_current_state() the waiting thread does. */ - smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); if (!(p->state & state)) goto out; @@ -3281,8 +3281,8 @@ static void __sched notrace __schedule(bool preempt) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). 
*/ - smp_mb__before_spinlock(); rq_lock(rq, &rf); + smp_mb__after_spinlock(); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; -- cgit v1.2.3 From a9668cd6ee288c4838bc668880ac085be551cac2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Jun 2017 17:51:27 +0200 Subject: locking: Remove smp_mb__before_spinlock() Now that there are no users of smp_mb__before_spinlock() left, remove it entirely. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 5 +---- .../translations/ko_KR/memory-barriers.txt | 5 +---- arch/arm64/include/asm/spinlock.h | 9 -------- arch/powerpc/include/asm/barrier.h | 7 ------ fs/userfaultfd.c | 25 ++++++++++------------ include/linux/spinlock.h | 13 ----------- 6 files changed, 13 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 9f34364922c8..d1d1716f904b 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1981,10 +1981,7 @@ for each construct. These operations all imply certain barriers: ACQUIRE operation has completed. Memory operations issued before the ACQUIRE may be completed after - the ACQUIRE operation has completed. An smp_mb__before_spinlock(), - combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! - The smp_mb__before_spinlock() primitive is free on many architectures. + the ACQUIRE operation has completed. (2) RELEASE operation implication: diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 38310dcd6620..bc80fc0e210f 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어 뒤에 완료됩니다. ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에 - 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는 - 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서 - 맞춥니다. 이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서 - smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다. + 완료될 수 있습니다. (2) RELEASE 오퍼레이션의 영향: diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index b103888b694a..ae4241ab19a8 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -358,15 +358,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* - * Accesses appearing in program order before a spin_lock() operation - * can be reordered with accesses inside the critical section, by virtue - * of arch_spin_lock being constructed using acquire semantics. - * - * In cases where this is problematic (e.g. try_to_wake_up), an - * smp_mb__before_spinlock() can restore the required ordering. - */ -#define smp_mb__before_spinlock() smp_mb() /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 25d42bd3f114..9c601adfc500 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,13 +74,6 @@ do { \ ___p1; \ }) -/* - * This must resolve to hwsync on SMP for the context switch path. - * See _switch, and core scheduler context switch memory ordering - * comments. 
- */ -#define smp_mb__before_spinlock() smp_mb() - #include <asm-generic/barrier.h> #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 06ea26b8c996..44fcbefd84a2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, goto out; WRITE_ONCE(uwq->waken, true); /* - * The implicit smp_mb__before_spinlock in try_to_wake_up() - * renders uwq->waken visible to other CPUs before the task is - * waken. + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); - if (ret) + if (ret) { /* * Wake only once, autoremove behavior. * - * After the effect of list_del_init is visible to the - * other CPUs, the waitqueue may disappear from under - * us, see the !list_empty_careful() in - * handle_userfault(). try_to_wake_up() has an - * implicit smp_mb__before_spinlock, and the - * wq->private is read before calling the extern - * function "wake_up_state" (which in turns calls - * try_to_wake_up). While the spin_lock;spin_unlock; - * wouldn't be enough, the smp_mb__before_spinlock is - * enough to avoid an explicit smp_mb() here. + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turn calls try_to_wake_up). */ list_del_init(&wq->entry); + } out: return ret; } diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 840281095933..4e8cce19b507 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -117,19 +117,6 @@ do { \ #endif /*arch_spin_is_contended*/ #endif -/* - * Despite its name it doesn't necessarily has to be a full barrier. - * It should only guarantee that a STORE before the critical section - * can not be reordered with LOADs and STOREs inside this section. - * spin_lock() is the one-way barrier, this LOAD can not escape out - * of the region. So the default implementation simply ensures that - * a STORE can not move into the critical section, smp_wmb() should - * serialize it with another STORE done by spin_lock(). - */ -#ifndef smp_mb__before_spinlock -#define smp_mb__before_spinlock() smp_wmb() -#endif - /* * This barrier must provide two things: * -- cgit v1.2.3 From d92a8cfcb37ecd1315269dab741f073b63b3a8b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 3 Mar 2017 10:13:38 +0100 Subject: locking/lockdep: Rework FS_RECLAIM annotation A while ago someone, and I cannot find the email just now, asked if we could not implement the RECLAIM_FS inversion stuff with a 'fake' lock like we use for other things like workqueues etc. I think this should be possible, which allows reducing the 'irq' states and will reduce the number of __bfs() lookups we do. Removing the 1 IRQ state results in 4 fewer __bfs() walks per dependency, improving lockdep performance. And by moving this annotation out of the lockdep code it becomes easier for the mm people to extend.
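To make the resulting annotation pattern concrete, a sketch distilled from the hunks below (the function names are the ones this patch introduces; error handling and surrounding code are elided):

	/*
	 * Direct reclaim: hold the fake "fs_reclaim" lock across the whole
	 * reclaim, so every lock taken inside reclaim is recorded by lockdep
	 * as nesting inside fs_reclaim.
	 */
	fs_reclaim_acquire(gfp_mask);
	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
	fs_reclaim_release(gfp_mask);

	/*
	 * Allocation entry points: an empty acquire/release pair models the
	 * allocation as taking the fake lock, so a GFP_FS allocation made
	 * while holding a lock that is also taken under fs_reclaim is
	 * reported as an inversion.
	 */
	fs_reclaim_acquire(gfp_mask);
	fs_reclaim_release(gfp_mask);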
Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: iamjoonsoo.kim@lge.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Signed-off-by: Ingo Molnar --- drivers/gpu/drm/i915/i915_debugfs.c | 5 +- include/linux/lockdep.h | 11 +---- include/linux/sched.h | 1 - include/linux/sched/mm.h | 8 ++++ kernel/locking/lockdep.c | 95 +------------------------------------ kernel/locking/lockdep_states.h | 1 - mm/page_alloc.c | 49 +++++++++++++++++-- mm/slab.h | 6 ++- mm/slob.c | 6 ++- mm/vmscan.c | 13 +++-- 10 files changed, 75 insertions(+), 120 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 00d8967c8512..0511fce5c947 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -28,6 +28,7 @@ #include #include +#include #include "intel_drv.h" static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) @@ -4331,7 +4332,7 @@ i915_drop_caches_set(void *data, u64 val) mutex_unlock(&dev->struct_mutex); } - lockdep_set_current_reclaim_state(GFP_KERNEL); + fs_reclaim_acquire(GFP_KERNEL); if (val & DROP_BOUND) i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); @@ -4340,7 +4341,7 @@ i915_drop_caches_set(void *data, u64 val) if (val & DROP_SHRINK_ALL) i915_gem_shrink_all(dev_priv); - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(GFP_KERNEL); if (val & DROP_FREED) { synchronize_rcu(); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fffe49f188e6..0a4c02c2d7a2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -29,7 +29,7 @@ extern int lock_stat; * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( */ -#define XXX_LOCK_USAGE_STATES (1+3*4) +#define XXX_LOCK_USAGE_STATES (1+2*4) /* * NR_LOCKDEP_CACHING_CLASSES ... Number of classes @@ -363,10 +363,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock, extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); -extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); -extern void lockdep_clear_current_reclaim_state(void); -extern void lockdep_trace_alloc(gfp_t mask); - struct pin_cookie { unsigned int val; }; #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } @@ -375,7 +371,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); -# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, +# define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ? 
(tsk)->lockdep_depth : 0) @@ -416,9 +412,6 @@ static inline void lockdep_on(void) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) -# define lockdep_set_current_reclaim_state(g) do { } while (0) -# define lockdep_clear_current_reclaim_state() do { } while (0) -# define lockdep_trace_alloc(g) do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..57db70ec70d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -846,7 +846,6 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; - gfp_t lockdep_reclaim_gfp; #endif #ifdef CONFIG_UBSAN diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b24a6974847..2b0a281f9d26 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -167,6 +167,14 @@ static inline gfp_t current_gfp_context(gfp_t flags) return flags; } +#ifdef CONFIG_LOCKDEP +extern void fs_reclaim_acquire(gfp_t gfp_mask); +extern void fs_reclaim_release(gfp_t gfp_mask); +#else +static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } +static inline void fs_reclaim_release(gfp_t gfp_mask) { } +#endif + static inline unsigned int memalloc_noio_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOIO; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499bec5fe..986f2fa79dbb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -344,14 +344,12 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE /* * Quick filtering for interesting events: */ @@ -2567,14 +2565,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - #define STRICT_READ_CHECKS 1 static int (*state_verbose_f[])(struct lock_class *class) = { @@ -2870,57 +2860,6 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - gfp_mask = current_gfp_context(gfp_mask); - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. 
- */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - /* Disable lockdep if explicitly requested */ - if (gfp_mask & __GFP_NOLOCKDEP) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2966,22 +2905,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - return 1; } @@ -3040,10 +2963,6 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* @@ -3952,18 +3871,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) } EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); - #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..35ca09f2ed0b 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@ */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc32aa81f359..c20d89601802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -3490,6 +3491,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } #endif /* CONFIG_COMPACTION */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_fs_reclaim(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_acquire(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + if 
(__need_fs_reclaim(gfp_mask)) + lock_map_release(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -3504,7 +3546,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(gfp_mask); + fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3512,7 +3554,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -4041,7 +4083,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - lockdep_trace_alloc(gfp_mask); + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); diff --git a/mm/slab.h b/mm/slab.h index 6885e1192ec5..073362816acc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,6 +43,7 @@ struct kmem_cache { #include #include #include +#include /* * State of the slab allocator. @@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s, flags)) diff --git a/mm/slob.c b/mm/slob.c index 1bae78d71096..a8bd6fa11a66 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - lockdep_trace_alloc(gfp); + fs_reclaim_acquire(gfp); + fs_reclaim_release(gfp); if (size < PAGE_SIZE - align) { if (!size) @@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); diff --git a/mm/vmscan.c b/mm/vmscan.c index a1af041930a6..f957afe900ec 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3525,8 +3525,6 @@ static int kswapd(void *p) }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - lockdep_set_current_reclaim_state(GFP_KERNEL); - if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -3585,14 +3583,15 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); + fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); return 0; } @@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned int noreclaim_flag; noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + 
fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; @@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; + fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); - lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } -- cgit v1.2.3 From b09be676e0ff25bd6d2e7637e26d349f9109ad75 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:52 +0900 Subject: locking/lockdep: Implement the 'crossrelease' feature Lockdep is a runtime locking correctness validator that detects and reports a deadlock or its possibility by checking dependencies between locks. It's useful since it does not report just an actual deadlock but also the possibility of a deadlock that has not actually happened yet. That enables problems to be fixed before they affect real systems. However, this facility is only applicable to typical locks, such as spinlocks and mutexes, which are normally released within the context in which they were acquired. By contrast, synchronization primitives like page locks or completions, which are allowed to be released in any context, also create dependencies and can cause a deadlock. So lockdep should track these locks to do a better job. The 'crossrelease' implementation makes lockdep track these primitives as well. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 24 ++- include/linux/lockdep.h | 110 +++++++++- include/linux/sched.h | 8 + kernel/exit.c | 1 + kernel/fork.c | 4 + kernel/locking/lockdep.c | 508 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/workqueue.c | 2 + lib/Kconfig.debug | 12 ++ 8 files changed, 635 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5dd1272d1ab2..5fdd93bb9300 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -23,10 +23,26 @@ # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) -# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) -# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) +# define trace_hardirq_enter() \ +do { \ + current->hardirq_context++; \ + crossrelease_hist_start(XHLOCK_HARD); \ +} while (0) +# define trace_hardirq_exit() \ +do { \ + current->hardirq_context--; \ + crossrelease_hist_end(XHLOCK_HARD); \ +} while (0) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +
crossrelease_hist_start(XHLOCK_SOFT); \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ + crossrelease_hist_end(XHLOCK_SOFT); \ +} while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0a4c02c2d7a2..e1e0fcd99613 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -155,6 +155,12 @@ struct lockdep_map { int cpu; unsigned long ip; #endif +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + /* + * Whether it's a crosslock. + */ + int cross; +#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -258,8 +264,62 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + /* + * Generation id. + * + * A value of cross_gen_id will be stored when holding this, + * which is globally increased whenever each crosslock is held. + */ + unsigned int gen_id; +#endif +}; + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#define MAX_XHLOCK_TRACE_ENTRIES 5 + +/* + * This is for keeping locks waiting for commit so that true dependencies + * can be added at the commit step. + */ +struct hist_lock { + /* + * Separate stack_trace data. This will be used at the commit step. + */ + struct stack_trace trace; + unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; + + /* + * Separate hlock instance. This will be used at the commit step. + * + * TODO: Use a smaller data structure containing only necessary + * data. However, we should make lockdep code able to handle the + * smaller one first. + */ + struct held_lock hlock; +}; + +/* + * To initialize a lock as crosslock, lockdep_init_map_crosslock() should + * be called instead of lockdep_init_map(). + */ +struct cross_lock { + /* + * Separate hlock instance. This will be used at the commit step. + * + * TODO: Use a smaller data structure containing only necessary + * data. However, we should make lockdep code able to handle the + * smaller one first. + */ + struct held_lock hlock; }; +struct lockdep_map_cross { + struct lockdep_map map; + struct cross_lock xlock; +}; +#endif + /* * Initialization, self-test and debugging-output methods: */ @@ -281,13 +341,6 @@ extern void lockdep_on(void); extern void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass); -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), } - /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope @@ -460,6 +513,49 @@ struct pin_cookie { }; #endif /* !LOCKDEP */ +enum xhlock_context_t { + XHLOCK_HARD, + XHLOCK_SOFT, + XHLOCK_PROC, + XHLOCK_CTX_NR, +}; + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +extern void lockdep_init_map_crosslock(struct lockdep_map *lock, + const char *name, + struct lock_class_key *key, + int subclass); +extern void lock_commit_crosslock(struct lockdep_map *lock); + +#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ + { .map.name = (_name), .map.key = (void *)(_key), \ + .map.cross = 1, } + +/* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL.
+ */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), .cross = 0, } + +extern void crossrelease_hist_start(enum xhlock_context_t c); +extern void crossrelease_hist_end(enum xhlock_context_t c); +extern void lockdep_init_task(struct task_struct *task); +extern void lockdep_free_task(struct task_struct *task); +#else +/* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL. + */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), } + +static inline void crossrelease_hist_start(enum xhlock_context_t c) {} +static inline void crossrelease_hist_end(enum xhlock_context_t c) {} +static inline void lockdep_init_task(struct task_struct *task) {} +static inline void lockdep_free_task(struct task_struct *task) {} +#endif + #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); diff --git a/include/linux/sched.h b/include/linux/sched.h index 57db70ec70d0..5235fba537fc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -848,6 +848,14 @@ struct task_struct { struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#define MAX_XHLOCKS_NR 64UL + struct hist_lock *xhlocks; /* Crossrelease history locks */ + unsigned int xhlock_idx; + /* For restoring at history boundaries */ + unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; +#endif + #ifdef CONFIG_UBSAN unsigned int in_ubsan; #endif diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..fa72d57db747 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -920,6 +920,7 @@ void __noreturn do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..cbf2221ee81a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,6 +484,8 @@ void __init fork_init(void) cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif + + lockdep_init_task(&init_task); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -1691,6 +1693,7 @@ static __latent_entropy struct task_struct *copy_process( p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; + lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1949,6 +1952,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 841828ba35b9..56f69cc53ddc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -58,6 +58,10 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#include +#endif + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -724,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? 
NULL : ERR_PTR(-EINVAL); } +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +static void cross_init(struct lockdep_map *lock, int cross); +static int cross_lock(struct lockdep_map *lock); +static int lock_acquire_crosslock(struct held_lock *hlock); +static int lock_release_crosslock(struct lockdep_map *lock); +#else +static inline void cross_init(struct lockdep_map *lock, int cross) {} +static inline int cross_lock(struct lockdep_map *lock) { return 0; } +static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } +static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } +#endif + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1795,6 +1811,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; + if (cross_lock(prev->instance)) + continue; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1962,30 +1981,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* - * Only non-recursive-read entries get new dependencies - * added: + * Only non-crosslock entries get new dependencies added. + * Crosslock entries will be added by commit later: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save); - if (!ret) - return 0; - + if (!cross_lock(hlock->instance)) { /* - * Stop saving stack_trace if save_trace() was - * called at least once: + * Only non-recursive-read entries get new dependencies + * added: */ - if (save && ret == 2) - save = NULL; + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, + distance, &trace, save); + if (!ret) + return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; + /* + * Stop saving stack_trace if save_trace() was + * called at least once: + */ + if (save && ret == 2) + save = NULL; + + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } } depth--; /* @@ -3176,7 +3201,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, +static void __lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3234,8 +3259,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } + +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 0); + __lockdep_init_map(lock, name, key, subclass); +} EXPORT_SYMBOL_GPL(lockdep_init_map); +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 1); + __lockdep_init_map(lock, name, key, subclass); +} +EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); +#endif + struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3291,6 +3333,7 @@ static int 
__lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; + int ret; if (unlikely(!debug_locks)) return 0; @@ -3339,7 +3382,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - if (depth) { + /* TODO: nest_lock is not implemented for crosslock yet. */ + if (depth && !cross_lock(lock)) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3427,6 +3471,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; + ret = lock_acquire_crosslock(hlock); + /* + * 2 means normal acquire operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3664,11 +3716,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int i; + int ret, i; if (unlikely(!debug_locks)) return 0; + ret = lock_release_crosslock(lock); + /* + * 2 means normal release operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -4532,6 +4592,13 @@ asmlinkage __visible void lockdep_sys_exit(void) curr->comm, curr->pid); lockdep_print_held_locks(curr); } + + /* + * The lock history for each syscall should be independent. So wipe the + * slate clean on return to userspace. + */ + crossrelease_hist_end(XHLOCK_PROC); + crossrelease_hist_start(XHLOCK_PROC); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4580,3 +4647,398 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + +/* + * Crossrelease works by recording a lock history for each thread and + * connecting those historic locks that were taken after the + * wait_for_completion() in the complete() context. + * + * Task-A Task-B + * + * mutex_lock(&A); + * mutex_unlock(&A); + * + * wait_for_completion(&C); + * lock_acquire_crosslock(); + * atomic_inc_return(&cross_gen_id); + * | + * | mutex_lock(&B); + * | mutex_unlock(&B); + * | + * | complete(&C); + * `-- lock_commit_crosslock(); + * + * Which will then add a dependency between B and C. + */ + +#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) + +/* + * Whenever a crosslock is held, cross_gen_id will be increased. + */ +static atomic_t cross_gen_id; /* Can be wrapped */ + +/* + * Lock history stacks; we have 3 nested lock history stacks: + * + * Hard IRQ + * Soft IRQ + * History / Task + * + * The thing is that once we complete a (Hard/Soft) IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. + * + * So what we do is rewind the history buffer and erase all our knowledge of + * that temporal event. + */ + +/* + * We need this to annotate lock history boundaries. Take for instance + * workqueues; each work is independent of the last. The completion of a future + * work does not depend on the completion of a past work (in general). + * Therefore we must not carry that (lock) dependency across works. 
+ * + * This is true for many things; pretty much all kthreads fall into this + * pattern, where they have an 'idle' state and future completions do not + * depend on past completions. It's just that since they all have the 'same' + * form -- the kthread does the same over and over -- it doesn't typically + * matter. + * + * The same is true for system calls: once a system call is completed (we've + * returned to userspace) the next system call does not depend on the lock + * history of the previous system call. + */ +void crossrelease_hist_start(enum xhlock_context_t c) +{ + if (current->xhlocks) + current->xhlock_idx_hist[c] = current->xhlock_idx; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + if (current->xhlocks) + current->xhlock_idx = current->xhlock_idx_hist[c]; +} + +static int cross_lock(struct lockdep_map *lock) +{ + return lock ? lock->cross : 0; +} + +/* + * This is needed to decide the relationship between wrappable variables. + */ +static inline int before(unsigned int a, unsigned int b) +{ + return (int)(a - b) < 0; +} + +static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) +{ + return hlock_class(&xhlock->hlock); +} + +static inline struct lock_class *xlock_class(struct cross_lock *xlock) +{ + return hlock_class(&xlock->hlock); +} + +/* + * Should we check a dependency with the previous one? + */ +static inline int depend_before(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check && !hlock->trylock; +} + +/* + * Should we check a dependency with the next one? + */ +static inline int depend_after(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check; +} + +/* + * Check whether the xhlock is valid, which would be false if, + * + * 1. it has not been used since initialization. + * + * Remember that hist_lock is implemented as a ring buffer. + */ +static inline int xhlock_valid(struct hist_lock *xhlock) +{ + /* + * xhlock->hlock.instance must be !NULL. + */ + return !!xhlock->hlock.instance; +} + +/* + * Record a hist_lock entry. + * + * The only requirement is that IRQs are disabled. + */ +static void add_xhlock(struct held_lock *hlock) +{ + unsigned int idx = ++current->xhlock_idx; + struct hist_lock *xhlock = &xhlock(idx); + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * This can be done locklessly because they are all task-local + * state, we must however ensure IRQs are disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); +#endif + + /* Initialize hist_lock's members */ + xhlock->hlock = *hlock; + + xhlock->trace.nr_entries = 0; + xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; + xhlock->trace.entries = xhlock->trace_entries; + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); +} + +static inline int same_context_xhlock(struct hist_lock *xhlock) +{ + return xhlock->hlock.irq_context == task_irq_context(current); +} + +/* + * This should be as lockless as possible because it is called very + * frequently. + */ +static void check_add_xhlock(struct held_lock *hlock) +{ + /* + * Record a hist_lock only in case acquisitions ahead could + * depend on the held_lock. For example, if the held_lock + * is a trylock then acquisitions ahead never depend on it. + * In that case, we don't need to record it. Just return. + */ + if (!current->xhlocks || !depend_before(hlock)) + return; + + add_xhlock(hlock); +} + +/* + * For crosslock.
+ */ +static int add_xlock(struct held_lock *hlock) +{ + struct cross_lock *xlock; + unsigned int gen_id; + + if (!graph_lock()) + return 0; + + xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); + xlock->hlock = *hlock; + xlock->hlock.gen_id = gen_id; + graph_unlock(); + + return 1; +} + +/* + * Called for both normal and crosslock acquires. Normal locks will be + * pushed on the hist_lock queue. Cross locks will record state and + * stop regular lock_acquire() to avoid being placed on the held_lock + * stack. + * + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_acquire_crosslock(struct held_lock *hlock) +{ + /* + * CONTEXT 1 CONTEXT 2 + * --------- --------- + * lock A (cross) + * X = atomic_inc_return(&cross_gen_id) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Y = atomic_read_acquire(&cross_gen_id) + * lock B + * + * atomic_read_acquire() is for ordering between A and B, + * IOW, A happens before B, when CONTEXT 2 see Y >= X. + * + * Pairs with atomic_inc_return() in add_xlock(). + */ + hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); + + if (cross_lock(hlock->instance)) + return add_xlock(hlock); + + check_add_xhlock(hlock); + return 2; +} + +static int copy_trace(struct stack_trace *trace) +{ + unsigned long *buf = stack_trace + nr_stack_trace_entries; + unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + unsigned int nr = min(max_nr, trace->nr_entries); + + trace->nr_entries = nr; + memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); + trace->entries = buf; + nr_stack_trace_entries += nr; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); + dump_stack(); + + return 0; + } + + return 1; +} + +static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) +{ + unsigned int xid, pid; + u64 chain_key; + + xid = xlock_class(xlock) - lock_classes; + chain_key = iterate_chain_key((u64)0, xid); + pid = xhlock_class(xhlock) - lock_classes; + chain_key = iterate_chain_key(chain_key, pid); + + if (lookup_chain_cache(chain_key)) + return 1; + + if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, + chain_key)) + return 0; + + if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, + &xhlock->trace, copy_trace)) + return 0; + + return 1; +} + +static void commit_xhlocks(struct cross_lock *xlock) +{ + unsigned int cur = current->xhlock_idx; + unsigned int i; + + if (!graph_lock()) + return; + + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); + + if (!xhlock_valid(xhlock)) + break; + + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; + + if (!same_context_xhlock(xhlock)) + break; + + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } + + graph_unlock(); +} + +void lock_commit_crosslock(struct lockdep_map *lock) +{ + struct cross_lock *xlock; + unsigned long flags; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (!current->xhlocks) + return; + + /* + * Do commit hist_locks with the cross_lock, only in case that + * the cross_lock could depend on acquisitions after that. 
+ * + * For example, if the cross_lock does not have the 'check' flag + * then we don't need to check dependencies and commit for that. + * Just skip it. In that case, of course, the cross_lock does + * not depend on acquisitions ahead, either. + * + * WARNING: Don't do that in add_xlock() in advance. When an + * acquisition context is different from the commit context, + * an invalid (skipped) cross_lock might be accessed. + */ + if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + xlock = &((struct lockdep_map_cross *)lock)->xlock; + commit_xhlocks(xlock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_commit_crosslock); + +/* + * Return: 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_release_crosslock(struct lockdep_map *lock) +{ + return cross_lock(lock) ? 1 : 2; +} + +static void cross_init(struct lockdep_map *lock, int cross) +{ + lock->cross = cross; + + /* + * Crossrelease assumes that the ring buffer size of xhlocks + * is a power of 2. So enforce it at build time. + */ + BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); +} + +void lockdep_init_task(struct task_struct *task) +{ + int i; + + task->xhlock_idx = UINT_MAX; + + for (i = 0; i < XHLOCK_CTX_NR; i++) + task->xhlock_idx_hist[i] = UINT_MAX; + + task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, + GFP_KERNEL); +} + +void lockdep_free_task(struct task_struct *task) +{ + if (task->xhlocks) { + void *tmp = task->xhlocks; + /* Disable crossrelease for current */ + task->xhlocks = NULL; + kfree(tmp); + } +} +#endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca937b0c3a96..e86733a8b344 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,6 +2093,7 @@ __acquires(&pool->lock) lock_map_acquire_read(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); + crossrelease_hist_start(XHLOCK_PROC); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2100,6 +2101,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work); + crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..c6038f23bb1a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1150,6 +1150,18 @@ config LOCK_STAT CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. (CONFIG_LOCKDEP defines "acquire" and "release" events.) +config LOCKDEP_CROSSRELEASE + bool "Lock debugging: make lockdep work for crosslocks" + depends on PROVE_LOCKING + default n + help + This makes lockdep work for crosslocks, i.e. locks that are allowed + to be released in a different context from the acquisition context. + Normally a lock must be released in the context that acquired it. + However, relaxing this constraint lets synchronization primitives + such as page locks or completions use the lock correctness + detector, lockdep. + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP -- cgit v1.2.3 From 23f873d8f9526ed7e49a1a02a45f8afb9ae5fb84 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:53 +0900 Subject: locking/lockdep: Detect and handle hist_lock ring buffer overwrite The ring buffer can be overwritten by hardirq/softirq/work contexts.
Those cases must be considered on rollback or commit. For example, |<------ hist_lock ring buffer size ----->| ppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii wrapped > iiiiiiiiiiiiiiiiiiiiiii.................... where 'p' represents an acquisition in process context, 'i' represents an acquisition in irq context. On irq exit, crossrelease tries to roll back the idx to its original position, but it should not, because the entry has already been invalidated by the overwriting 'i' entries. Avoid rollback or commit for overwritten entries. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-7-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 20 +++++++++++++++++++ include/linux/sched.h | 3 +++ kernel/locking/lockdep.c | 52 +++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 70 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index e1e0fcd99613..c75eedd55af5 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -283,6 +283,26 @@ struct held_lock { * can be added at commit step. */ struct hist_lock { + /* + * Id for each entry in the ring buffer. This is used to + * decide whether the ring buffer was overwritten or not. + * + * For example, + * + * |<----------- hist_lock ring buffer size ------->| + * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii + * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... + * + * where 'p' represents an acquisition in process + * context, 'i' represents an acquisition in irq + * context. + * + * In this example, the ring buffer was overwritten by + * acquisitions in irq context, which should be detected on + * rollback or commit. + */ + unsigned int hist_id; + /* * Separate stack_trace data. This will be used at commit step. */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5235fba537fc..772c5f643764 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,6 +854,9 @@ struct task_struct { unsigned int xhlock_idx; /* For restoring at history boundaries */ unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; + unsigned int hist_id; + /* For overwrite check at each context exit */ + unsigned int hist_id_save[XHLOCK_CTX_NR]; #endif #ifdef CONFIG_UBSAN diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 56f69cc53ddc..eda8114ef793 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4680,6 +4680,17 @@ EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); */ static atomic_t cross_gen_id; /* Can be wrapped */ +/* + * Make an entry of the ring buffer invalid. + */ +static inline void invalidate_xhlock(struct hist_lock *xhlock) +{ + /* + * Normally, xhlock->hlock.instance must be !NULL.
+ */ + xhlock->hlock.instance = NULL; +} + /* * Lock history stacks; we have 3 nested lock history stacks: * @@ -4712,14 +4723,28 @@ static atomic_t cross_gen_id; /* Can be wrapped */ */ void crossrelease_hist_start(enum xhlock_context_t c) { - if (current->xhlocks) - current->xhlock_idx_hist[c] = current->xhlock_idx; + struct task_struct *cur = current; + + if (cur->xhlocks) { + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; + } } void crossrelease_hist_end(enum xhlock_context_t c) { - if (current->xhlocks) - current->xhlock_idx = current->xhlock_idx_hist[c]; + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } } static int cross_lock(struct lockdep_map *lock) @@ -4765,6 +4790,7 @@ static inline int depend_after(struct held_lock *hlock) * Check if the xhlock is valid, which would be false if, * * 1. Has not used after initializaion yet. + * 2. Got invalidated. * * Remind hist_lock is implemented as a ring buffer. */ @@ -4796,6 +4822,7 @@ static void add_xhlock(struct held_lock *hlock) /* Initialize hist_lock's members */ xhlock->hlock = *hlock; + xhlock->hist_id = current->hist_id++; xhlock->trace.nr_entries = 0; xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; @@ -4934,6 +4961,7 @@ static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) static void commit_xhlocks(struct cross_lock *xlock) { unsigned int cur = current->xhlock_idx; + unsigned int prev_hist_id = xhlock(cur).hist_id; unsigned int i; if (!graph_lock()) @@ -4951,6 +4979,17 @@ static void commit_xhlocks(struct cross_lock *xlock) if (!same_context_xhlock(xhlock)) break; + /* + * Filter out the cases that the ring buffer was + * overwritten and the previous entry has a bigger + * hist_id than the following one, which is impossible + * otherwise. + */ + if (unlikely(before(xhlock->hist_id, prev_hist_id))) + break; + + prev_hist_id = xhlock->hist_id; + /* * commit_xhlock() returns 0 with graph_lock already * released if fail. @@ -5024,9 +5063,12 @@ void lockdep_init_task(struct task_struct *task) int i; task->xhlock_idx = UINT_MAX; + task->hist_id = 0; - for (i = 0; i < XHLOCK_CTX_NR; i++) + for (i = 0; i < XHLOCK_CTX_NR; i++) { task->xhlock_idx_hist[i] = UINT_MAX; + task->hist_id_save[i] = 0; + } task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, GFP_KERNEL); -- cgit v1.2.3 From 28a903f63ec0811ead70ad0f8665e838d207a25e Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:54 +0900 Subject: locking/lockdep: Handle non(or multi)-acquisition of a crosslock No acquisition might be in progress on commit of a crosslock. Completion operations enabling crossrelease are the case like: CONTEXT X CONTEXT Y --------- --------- trigger completion context complete AX commit AX wait_for_complete AX acquire AX wait where AX is a crosslock. When no acquisition is in progress, we should not perform commit because the lock does not exist, which might cause incorrect memory access. So we have to track the number of acquisitions of a crosslock to handle it. 
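To make that failure mode concrete, here is a minimal sketch, constructed for this note rather than taken from the patch; it uses only stock completion/kthread APIs:

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(done);

static int completer_fn(void *unused)
{
	/*
	 * With crossrelease, complete() ends up in
	 * lock_commit_crosslock(). The commit may run before the
	 * wait_for_completion() below has acquired the crosslock,
	 * or, on an error path, no waiter may ever acquire it.
	 */
	complete(&done);
	return 0;
}

static void example(void)
{
	kthread_run(completer_fn, NULL, "completer");
	wait_for_completion(&done);	/* acquisition may trail the commit */
}

In that window xlock->hlock has never been initialized by add_xlock(), so committing against it would read garbage; counting acquisitions (the nr_acquire field introduced below) lets the commit side skip exactly this case.
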
Moreover, in case that more than one acquisition of a crosslock are overlapped like: CONTEXT W CONTEXT X CONTEXT Y CONTEXT Z --------- --------- --------- --------- acquire AX (gen_id: 1) acquire A acquire AX (gen_id: 10) acquire B commit AX acquire C commit AX where A, B and C are typical locks and AX is a crosslock. Current crossrelease code performs commits in Y and Z with gen_id = 10. However, we can use gen_id = 1 to do it, since not only 'acquire AX in X' but 'acquire AX in W' also depends on each acquisition in Y and Z until their commits. So make it use gen_id = 1 instead of 10 on their commits, which adds an additional dependency 'AX -> A' in the example above. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-8-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 22 ++++++++++++- kernel/locking/lockdep.c | 82 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 77 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c75eedd55af5..651cc61af041 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -324,6 +324,19 @@ struct hist_lock { * be called instead of lockdep_init_map(). */ struct cross_lock { + /* + * When more than one acquisition of crosslocks are overlapped, + * we have to perform commit for them based on cross_gen_id of + * the first acquisition, which allows us to add more true + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + */ + int nr_acquire; + /* * Seperate hlock instance. This will be used at commit step. * @@ -547,9 +560,16 @@ extern void lockdep_init_map_crosslock(struct lockdep_map *lock, int subclass); extern void lock_commit_crosslock(struct lockdep_map *lock); +/* + * What we essencially have to initialize is 'nr_acquire'. Other members + * will be initialized in add_xlock(). + */ +#define STATIC_CROSS_LOCK_INIT() \ + { .nr_acquire = 0,} + #define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ { .map.name = (_name), .map.key = (void *)(_key), \ - .map.cross = 1, } + .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } /* * To initialize a lockdep_map statically use this macro. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eda8114ef793..7f97871d48d5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4867,11 +4867,28 @@ static int add_xlock(struct held_lock *hlock) xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + /* + * When acquisitions for a crosslock are overlapped, we use + * nr_acquire to perform commit for them, based on cross_gen_id + * of the first acquisition, which allows to add additional + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. 
+ * + * depend_after() is necessary to initialize only the first + * valid xlock so that the xlock can be used on its commit. + */ + if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) + goto unlock; + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); xlock->hlock = *hlock; xlock->hlock.gen_id = gen_id; +unlock: graph_unlock(); - return 1; } @@ -4967,35 +4984,37 @@ static void commit_xhlocks(struct cross_lock *xlock) if (!graph_lock()) return; - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); + if (xlock->nr_acquire) { + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); - if (!xhlock_valid(xhlock)) - break; + if (!xhlock_valid(xhlock)) + break; - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; - if (!same_context_xhlock(xhlock)) - break; + if (!same_context_xhlock(xhlock)) + break; - /* - * Filter out the cases that the ring buffer was - * overwritten and the previous entry has a bigger - * hist_id than the following one, which is impossible - * otherwise. - */ - if (unlikely(before(xhlock->hist_id, prev_hist_id))) - break; + /* + * Filter out the cases that the ring buffer was + * overwritten and the previous entry has a bigger + * hist_id than the following one, which is impossible + * otherwise. + */ + if (unlikely(before(xhlock->hist_id, prev_hist_id))) + break; - prev_hist_id = xhlock->hist_id; + prev_hist_id = xhlock->hist_id; - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } } graph_unlock(); @@ -5039,16 +5058,27 @@ void lock_commit_crosslock(struct lockdep_map *lock) EXPORT_SYMBOL_GPL(lock_commit_crosslock); /* - * Return: 1 - crosslock, done; + * Return: 0 - failure; + * 1 - crosslock, done; * 2 - normal lock, continue to held_lock[] ops. */ static int lock_release_crosslock(struct lockdep_map *lock) { - return cross_lock(lock) ? 1 : 2; + if (cross_lock(lock)) { + if (!graph_lock()) + return 0; + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; + graph_unlock(); + return 1; + } + return 2; } static void cross_init(struct lockdep_map *lock, int cross) { + if (cross) + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; + lock->cross = cross; /* -- cgit v1.2.3 From cd8084f91c02c1afd256a39aa833bff737631304 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 7 Aug 2017 16:12:56 +0900 Subject: locking/lockdep: Apply crossrelease to completions Although wait_for_completion() and its family can cause deadlock, the lock correctness validator could not be applied to them until now, because things like complete() are usually called in a different context from the waiting context, which violates lockdep's assumption. Thanks to CONFIG_LOCKDEP_CROSSRELEASE, we can now apply the lockdep detector to those completion operations. Applied it. 
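As a hedged sketch of the dependencies this makes visible (the scenario is constructed for this note with stock APIs, not taken from the patch):

#include <linux/completion.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(m);
static DECLARE_COMPLETION(c);

/* Waiting side: acquires the crosslock 'c' while holding 'm'. */
static void waiter(void)
{
	mutex_lock(&m);
	wait_for_completion(&c);	/* lockdep records 'm' -> 'c' */
	mutex_unlock(&m);
}

/* Completing side: 'm' lands in this task's lock history. */
static void completer(void)
{
	mutex_lock(&m);
	complete(&c);	/* commit can record 'c' -> 'm' from the history */
	mutex_unlock(&m);
}

Each edge alone is harmless; once lockdep has observed both, in whatever runs or call paths they occur, the 'm' -> 'c' -> 'm' cycle is reported. A held_lock-only validator could never see it, because complete() runs outside the waiting context.
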
Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: boqun.feng@gmail.com Cc: kernel-team@lge.com Cc: kirill@shutemov.name Cc: npiggin@gmail.com Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/1502089981-21272-10-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 45 ++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/completion.c | 11 +++++++++++ lib/Kconfig.debug | 9 +++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 5d5aaae3af43..9bcebf509b4d 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -9,6 +9,9 @@ */ #include +#ifdef CONFIG_LOCKDEP_COMPLETE +#include +#endif /* * struct completion - structure used to maintain state for a "completion" @@ -25,10 +28,50 @@ struct completion { unsigned int done; wait_queue_head_t wait; +#ifdef CONFIG_LOCKDEP_COMPLETE + struct lockdep_map_cross map; +#endif }; +#ifdef CONFIG_LOCKDEP_COMPLETE +static inline void complete_acquire(struct completion *x) +{ + lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); +} + +static inline void complete_release(struct completion *x) +{ + lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); +} + +static inline void complete_release_commit(struct completion *x) +{ + lock_commit_crosslock((struct lockdep_map *)&x->map); +} + +#define init_completion(x) \ +do { \ + static struct lock_class_key __key; \ + lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ + "(complete)" #x, \ + &__key, 0); \ + __init_completion(x); \ +} while (0) +#else +#define init_completion(x) __init_completion(x) +static inline void complete_acquire(struct completion *x) {} +static inline void complete_release(struct completion *x) {} +static inline void complete_release_commit(struct completion *x) {} +#endif + +#ifdef CONFIG_LOCKDEP_COMPLETE +#define COMPLETION_INITIALIZER(work) \ + { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ + STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) } +#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } +#endif #define COMPLETION_INITIALIZER_ONSTACK(work) \ ({ init_completion(&work); work; }) @@ -70,7 +113,7 @@ struct completion { * This inline function will initialize a dynamically created completion * structure. */ -static inline void init_completion(struct completion *x) +static inline void __init_completion(struct completion *x) { x->done = 0; init_waitqueue_head(&x->wait); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..566b6ec7b6fe 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); + + /* + * Perform commit of crossrelease here. 
+ */ + complete_release_commit(x); + if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -92,9 +98,14 @@ __wait_for_common(struct completion *x, { might_sleep(); + complete_acquire(x); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); + + complete_release(x); + return timeout; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c6038f23bb1a..ebd40d3b0b28 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1162,6 +1162,15 @@ config LOCKDEP_CROSSRELEASE such as page locks or completions can use the lock correctness detector, lockdep. +config LOCKDEP_COMPLETE + bool "Lock debugging: allow completions to use deadlock detector" + depends on PROVE_LOCKING + select LOCKDEP_CROSSRELEASE + default n + help + A deadlock caused by wait_for_completion() and complete() can be + detected by lockdep using crossrelease feature. + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP -- cgit v1.2.3 From 90001d67be2fa2acbe3510d1f64fa6533efa30ef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Jul 2017 17:50:05 +0200 Subject: sched/fair: Fix wake_affine() for !NUMA_BALANCING In commit: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Rik changed wake_affine to consider NUMA information when balancing between LLC domains. There are a number of problems here which this patch tries to address: - LLC < NODE; in this case we'd use the wrong information to balance - !NUMA_BALANCING: in this case, the new code doesn't do any balancing at all - re-computes the NUMA data for every wakeup, this can mean iterating up to 64 CPUs for every wakeup. - default affine wakeups inside a cache We address these by saving the load/capacity values for each sched_domain during regular load-balance and using these values in wake_affine_llc(). The obvious down-side to using cached values is that they can be too old and poorly reflect reality. But this way we can use LLC wide information and thus not rely on assuming LLC matches NODE. We also don't rely on NUMA_BALANCING nor do we have to aggegate two nodes (or even cache domains) worth of CPUs for each wakeup. Signed-off-by: Peter Zijlstra (Intel) Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") [ Minor readability improvements. ] Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 8 ++ kernel/sched/fair.c | 190 ++++++++++++++++++++++++++--------------- 2 files changed, 130 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7d065abc7a47..d7b6dab956ec 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -71,6 +71,14 @@ struct sched_domain_shared { atomic_t ref; atomic_t nr_busy_cpus; int has_idle_cores; + + /* + * Some variables from the most recent sd_lb_stats for this domain, + * used by wake_affine(). 
+ */ + unsigned long nr_running; + unsigned long load; + unsigned long capacity; }; struct sched_domain { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7f1c3b797f8..8d5868771cb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2658,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } -/* - * Can a task be moved from prev_cpu to this_cpu without causing a load - * imbalance that would trigger the load balancer? - */ -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - struct numa_stats prev_load, this_load; - s64 this_eff_load, prev_eff_load; - - update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); - update_numa_stats(&this_load, cpu_to_node(this_cpu)); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - unsigned long current_load = task_h_load(current); - - if (this_load.load > current_load) - this_load.load -= current_load; - else - this_load.load = 0; - } - - /* - * In low-load situations, where this_cpu's node is idle due to the - * sync cause above having dropped this_load.load to 0, move the task. - * Moving to an idle socket will not create a bad imbalance. - * - * Otherwise check if the nodes are near enough in load to allow this - * task to be woken on this_cpu's node. - */ - if (this_load.load > 0) { - unsigned long task_load = task_h_load(p); - - this_eff_load = 100; - this_eff_load *= prev_load.compute_capacity; - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= this_load.compute_capacity; - - this_eff_load *= this_load.load + task_load; - prev_eff_load *= prev_load.load - task_load; - - return this_eff_load <= prev_eff_load; - } - - return true; -} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2724,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } -#ifdef CONFIG_SMP -static inline bool numa_wake_affine(struct sched_domain *sd, - struct task_struct *p, int this_cpu, - int prev_cpu, int sync) -{ - return true; -} -#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void @@ -5428,20 +5367,115 @@ static int wake_wide(struct task_struct *p) return 1; } +struct llc_stats { + unsigned long nr_running; + unsigned long load; + unsigned long capacity; + int has_capacity; +}; + +static bool get_llc_stats(struct llc_stats *stats, int cpu) +{ + struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + + if (!sds) + return false; + + stats->nr_running = READ_ONCE(sds->nr_running); + stats->load = READ_ONCE(sds->load); + stats->capacity = READ_ONCE(sds->capacity); + stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu); + + return true; +} + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + * + * Since we're running on 'stale' values, we might in fact create an imbalance + * but recomputing these values is expensive, as that'd mean iteration 2 cache + * domains worth of CPUs. 
+ */ +static bool +wake_affine_llc(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int prev_cpu, int sync) +{ + struct llc_stats prev_stats, this_stats; + s64 this_eff_load, prev_eff_load; + unsigned long task_load; + + if (!get_llc_stats(&prev_stats, prev_cpu) || + !get_llc_stats(&this_stats, this_cpu)) + return false; + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current LLC. + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + /* in this case load hits 0 and this LLC is considered 'idle' */ + if (current_load > this_stats.load) + return true; + + this_stats.load -= current_load; + } + + /* + * The has_capacity stuff is not SMT aware, but by trying to balance + * the nr_running on both ends we try and fill the domain at equal + * rates, thereby first consuming cores before siblings. + */ + + /* if the old cache has capacity, stay there */ + if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1) + return false; + + /* if this cache has capacity, come here */ + if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + return true; + + /* + * Check to see if we can move the load without causing too much + * imbalance. + */ + task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_stats.capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_stats.capacity; + + this_eff_load *= this_stats.load + task_load; + prev_eff_load *= prev_stats.load - task_load; + + return this_eff_load <= prev_eff_load; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + bool affine; /* - * Common case: CPUs are in the same socket, and select_idle_sibling() - * will do its thing regardless of what we return: + * Default to no affine wakeups; wake_affine() should not effect a task + * placement the load-balancer feels inclined to undo. The conservative + * option is therefore to not move tasks when they wake up. */ - if (cpus_share_cache(prev_cpu, this_cpu)) - affine = true; - else - affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); + affine = false; + + /* + * If the wakeup is across cache domains, try to evaluate if movement + * makes sense, otherwise rely on select_idle_siblings() to do + * placement inside the cache domain. 
+ */ + if (!cpus_share_cache(prev_cpu, this_cpu)) + affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); if (affine) { @@ -7121,6 +7155,7 @@ struct sg_lb_stats { struct sd_lb_stats { struct sched_group *busiest; /* Busiest group in this sd */ struct sched_group *local; /* Local group in this sd */ + unsigned long total_running; unsigned long total_load; /* Total load of all groups in sd */ unsigned long total_capacity; /* Total capacity of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ @@ -7140,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) *sds = (struct sd_lb_stats){ .busiest = NULL, .local = NULL, + .total_running = 0UL, .total_load = 0UL, .total_capacity = 0UL, .busiest_stat = { @@ -7575,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { + struct sched_domain_shared *shared = env->sd->shared; struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; @@ -7631,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd next_group: /* Now, start updating sd_lb_stats */ + sds->total_running += sgs->sum_nr_running; sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -7646,6 +7684,21 @@ next_group: env->dst_rq->rd->overload = overload; } + if (!shared) + return; + + /* + * Since these are sums over groups they can contain some CPUs + * multiple times for the NUMA domains. + * + * Currently only wake_affine_llc() and find_busiest_group() + * uses these numbers, only the last is affected by this problem. + * + * XXX fix that. + */ + WRITE_ONCE(shared->nr_running, sds->total_running); + WRITE_ONCE(shared->load, sds->total_load); + WRITE_ONCE(shared->capacity, sds->total_capacity); } /** @@ -7875,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + /* XXX broken for overlapping NUMA groups */ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; -- cgit v1.2.3 From e743eb1ecd5564b5ae0a4a76c1566f748a358839 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2017 08:25:38 -0600 Subject: block: remove unused syncfull/asyncfull queue flags We haven't used these in years, but somehow the definitions still remained. Kill them, and renumber the QUEUE_FLAG_ space. We had a hole in the beginning of the space, too. 
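Worth noting why pure renumbering is safe: the QUEUE_FLAG_* values are bit numbers used with bitops on q->queue_flags, never persisted or exported as masks. A minimal sketch for illustration (my_queue_is_dying() is a hypothetical name; it mirrors the long-standing blk_queue_dying() helper):

#include <linux/blkdev.h>

/* Queue flags are bit indices, so their absolute values are private. */
static inline bool my_queue_is_dying(struct request_queue *q)
{
	return test_bit(QUEUE_FLAG_DYING, &q->queue_flags);
}

Every user compiles against the header, so closing the hole and shifting the remaining values down cannot break anything at runtime.
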
Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 -- include/linux/blkdev.h | 60 ++++++++++++++++++++++++-------------------------- 2 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ebc2945f991..55940ddacd96 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -48,8 +48,6 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags, static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(QUEUED), QUEUE_FLAG_NAME(STOPPED), - QUEUE_FLAG_NAME(SYNCFULL), - QUEUE_FLAG_NAME(ASYNCFULL), QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(BYPASS), QUEUE_FLAG_NAME(BIDI), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..f45f157b2910 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -601,38 +601,36 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -#define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ -#define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ -#define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ -#define QUEUE_FLAG_DYING 5 /* queue being torn down */ -#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ -#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ -#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ -#define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */ +#define QUEUE_FLAG_STOPPED 1 /* queue is stopped */ +#define QUEUE_FLAG_DYING 2 /* queue being torn down */ +#define QUEUE_FLAG_BYPASS 3 /* act as dumb FIFO queue */ +#define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */ +#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ +#define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ +#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ +#define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 13 /* do IO stats */ -#define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ -#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ -#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */ -#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ -#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ -#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ -#define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ -#define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 23 /* Write back caching */ -#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ -#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ -#define QUEUE_FLAG_DAX 26 /* device supports DAX */ -#define QUEUE_FLAG_STATS 27 /* track rq completion times */ -#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ -#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ -#define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */ +#define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ 
+#define QUEUE_FLAG_DISCARD 11 /* supports DISCARD */ +#define QUEUE_FLAG_NOXMERGES 12 /* No extended merges */ +#define QUEUE_FLAG_ADD_RANDOM 13 /* Contributes to random pool */ +#define QUEUE_FLAG_SECERASE 14 /* supports secure erase */ +#define QUEUE_FLAG_SAME_FORCE 15 /* force complete on same CPU */ +#define QUEUE_FLAG_DEAD 16 /* queue tear-down finished */ +#define QUEUE_FLAG_INIT_DONE 17 /* queue is initialized */ +#define QUEUE_FLAG_NO_SG_MERGE 18 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_POLL 19 /* IO polling enabled if set */ +#define QUEUE_FLAG_WC 20 /* Write back caching */ +#define QUEUE_FLAG_FUA 21 /* device supports FUA writes */ +#define QUEUE_FLAG_FLUSH_NQ 22 /* flush not queueuable */ +#define QUEUE_FLAG_DAX 23 /* device supports DAX */ +#define QUEUE_FLAG_STATS 24 /* track rq completion times */ +#define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From 229a71860547ec856b156179a9c6bef2de426f66 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 3 Aug 2017 11:38:21 +0900 Subject: irq: Make the irqentry text section unconditional Generate irqentry and softirqentry text sections without any Kconfig dependencies. This will add extra sections, but there should be no performace impact. Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Chris Zankel Cc: David S . Miller Cc: Francis Deslauriers Cc: Jesper Nilsson Cc: Linus Torvalds Cc: Max Filippov Cc: Mikael Starvik Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yoshinori Sato Cc: linux-arch@vger.kernel.org Cc: linux-cris-kernel@axis.com Cc: mathieu.desnoyers@efficios.com Link: http://lkml.kernel.org/r/150172789110.27216.3955739126693102122.stgit@devbox Signed-off-by: Ingo Molnar --- arch/arm/include/asm/traps.h | 7 ------- arch/arm64/include/asm/traps.h | 7 ------- arch/x86/entry/entry_64.S | 9 ++------- arch/x86/kernel/unwind_frame.c | 2 -- include/asm-generic/sections.h | 4 ++++ include/asm-generic/vmlinux.lds.h | 8 -------- include/linux/interrupt.h | 14 +------------- 7 files changed, 7 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index f555bb3664dc..683d9230984a 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -18,7 +18,6 @@ struct undef_hook { void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { extern char __irqentry_text_start[]; @@ -27,12 +26,6 @@ static inline int __in_irqentry_text(unsigned long ptr) return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 02e9035b0685..47a9066f7c86 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -37,18 +37,11 @@ void unregister_undef_hook(struct undef_hook *hook); void arm64_notify_segfault(struct pt_regs 
*regs, unsigned long addr); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d271fb79248f..3e3da2928c44 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -675,13 +675,8 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) #endif /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..c29e5bc7e9c9 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -91,10 +91,8 @@ static bool in_entry_code(unsigned long ip) if (addr >= __entry_text_start && addr < __entry_text_end) return true; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) if (addr >= __irqentry_text_start && addr < __irqentry_text_end) return true; -#endif return false; } diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 532372c6cf15..e5da44eddd2f 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -27,6 +27,8 @@ * __kprobes_text_start, __kprobes_text_end * __entry_text_start, __entry_text_end * __ctors_start, __ctors_end + * __irqentry_text_start, __irqentry_text_end + * __softirqentry_text_start, __softirqentry_text_end */ extern char _text[], _stext[], _etext[]; extern char _data[], _sdata[], _edata[]; @@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; +extern char __irqentry_text_start[], __irqentry_text_end[]; +extern char __softirqentry_text_start[], __softirqentry_text_end[]; /* Start and end of .ctors section - used for constructor calls. 
*/ extern char __ctors_start[], __ctors_end[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index da0be9a8d1de..62e2395f0a82 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -483,25 +483,17 @@ *(.entry.text) \ VMLINUX_SYMBOL(__entry_text_end) = .; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) #define IRQENTRY_TEXT \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__irqentry_text_start) = .; \ *(.irqentry.text) \ VMLINUX_SYMBOL(__irqentry_text_end) = .; -#else -#define IRQENTRY_TEXT -#endif -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) #define SOFTIRQENTRY_TEXT \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__softirqentry_text_start) = .; \ *(.softirqentry.text) \ VMLINUX_SYMBOL(__softirqentry_text_end) = .; -#else -#define SOFTIRQENTRY_TEXT -#endif /* Section used for early init (in .S files) */ #define HEAD_TEXT *(.head.text) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a2fddddb0d60..59ba11661b6e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -18,6 +18,7 @@ #include #include #include +#include /* * These correspond to the IORESOURCE_IRQ_* defines in @@ -726,7 +727,6 @@ extern int early_irq_init(void); extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) /* * We want to know which function is an entrypoint of a hardirq or a softirq. */ @@ -734,16 +734,4 @@ extern int arch_early_irq_init(void); #define __softirq_entry \ __attribute__((__section__(".softirqentry.text"))) -/* Limits of hardirq entrypoints */ -extern char __irqentry_text_start[]; -extern char __irqentry_text_end[]; -/* Limits of softirq entrypoints */ -extern char __softirqentry_text_start[]; -extern char __softirqentry_text_end[]; - -#else -#define __irq_entry -#define __softirq_entry -#endif - #endif -- cgit v1.2.3 From fc53662f13b889a5a1c069e79ee1e3d4534df132 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 2 Aug 2017 18:09:14 +0200 Subject: x86/hyper-v: Make hv_do_hypercall() inline We have only three call sites for hv_do_hypercall() and we're going to change HVCALL_SIGNAL_EVENT to doing fast hypercall so we can inline this function for optimization. Hyper-V top level functional specification states that r9-r11 registers and flags may be clobbered by the hypervisor during hypercall and with inlining this is somewhat important, add the clobbers. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Andy Shevchenko Reviewed-by: Stephen Hemminger Cc: Andy Lutomirski Cc: Haiyang Zhang Cc: Jork Loeser Cc: K. Y. 
Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Simon Xiao Cc: Steven Rostedt Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20170802160921.21791-3-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 54 ++++------------------------------------- arch/x86/include/asm/mshyperv.h | 40 ++++++++++++++++++++++++++++++ drivers/hv/connection.c | 2 ++ include/linux/hyperv.h | 1 - 4 files changed, 47 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 5b882cc0c0e9..691603ee9179 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -75,7 +75,8 @@ static struct clocksource hyperv_cs_msr = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); @@ -102,15 +103,15 @@ void hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); return; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* @@ -170,51 +171,6 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? 
virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index baea2679a0aa..6fa5e342cc86 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -3,6 +3,7 @@ #include #include +#include #include /* @@ -168,6 +169,45 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = input ? virt_to_phys(input) : 0; + u64 output_address = output ? virt_to_phys(output) : 0; + u64 hv_status; + register void *__sp asm(_ASM_SP); + +#ifdef CONFIG_X86_64 + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("mov %4, %%r8\n" + "call *%5" + : "=a" (hv_status), "+r" (__sp), + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); +#else + u32 input_address_hi = upper_32_bits(input_address); + u32 input_address_lo = lower_32_bits(input_address); + u32 output_address_hi = upper_32_bits(output_address); + u32 output_address_lo = lower_32_bits(output_address); + + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("call *%7" + : "=A" (hv_status), + "+c" (input_address_lo), "+r" (__sp) + : "A" (control), + "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); +#endif /* !x86_64 */ + return hv_status; +} void hyperv_init(void); void hyperv_report_panic(struct pt_regs *regs); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 59c11ff90d12..45e806e3112f 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -32,6 +32,8 @@ #include #include #include +#include + #include "hyperv_vmbus.h" diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b7d7bbec74e0..6608a71e7d79 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1187,7 +1187,6 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, bool fb_overlap_ok); void vmbus_free_mmio(resource_size_t start, resource_size_t size); int vmbus_cpu_number_to_vp_number(int cpu_number); -u64 hv_do_hypercall(u64 control, void *input, void *output); /* * GUID definitions of various offer types - services offered to the guest. 
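For reference, a hedged sketch of how a caller consumes the now-inline helper (the wrapper name is invented for this note; masking the low 16 bits for the result code follows the convention used elsewhere in the VMBus code):

#include <asm/mshyperv.h>

/* Illustration: issue a memory-based hypercall and extract the status. */
static u64 my_hypercall(u64 code, void *in, void *out)
{
	u64 status = hv_do_hypercall(code, in, out);

	/* Hyper-V returns the result code in the low 16 bits. */
	return status & 0xffff;
}

Because the helper is now inline, every call site carries the asm itself, which is why the r9-r11 and flags clobbers mentioned above had to be spelled out explicitly.
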
-- cgit v1.2.3 From 057841713cfff62b4485cdd2b245f05b7ea3ba16 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 2 Aug 2017 18:09:16 +0200 Subject: hyper-v: Use fast hypercall for HVCALL_SIGNAL_EVENT We need to pass only 8 bytes of input for HvSignalEvent which makes it a perfect fit for fast hypercall. hv_input_signal_event_buffer is not needed any more and hv_input_signal_event is converted to union for convenience. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Andy Shevchenko Reviewed-by: Stephen Hemminger Cc: Andy Lutomirski Cc: Haiyang Zhang Cc: Jork Loeser Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Simon Xiao Cc: Steven Rostedt Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20170802160921.21791-5-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- drivers/hv/channel_mgmt.c | 13 ++----------- drivers/hv/connection.c | 2 +- include/linux/hyperv.h | 15 +-------------- 3 files changed, 4 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 4bbb8dea4727..fd2b6c67f781 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -805,21 +805,12 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) /* * Setup state for signalling the host. */ - newchannel->sig_event = (struct hv_input_signal_event *) - (ALIGN((unsigned long) - &newchannel->sig_buf, - HV_HYPERCALL_PARAM_ALIGN)); - - newchannel->sig_event->connectionid.asu32 = 0; - newchannel->sig_event->connectionid.u.id = VMBUS_EVENT_CONNECTION_ID; - newchannel->sig_event->flag_number = 0; - newchannel->sig_event->rsvdz = 0; + newchannel->sig_event = VMBUS_EVENT_CONNECTION_ID; if (vmbus_proto_version != VERSION_WS2008) { newchannel->is_dedicated_interrupt = (offer->is_dedicated_interrupt != 0); - newchannel->sig_event->connectionid.u.id = - offer->connection_id; + newchannel->sig_event = offer->connection_id; } memcpy(&newchannel->offermsg, offer, diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 45e806e3112f..37ecf514189e 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -408,6 +408,6 @@ void vmbus_set_event(struct vmbus_channel *channel) if (!channel->is_dedicated_interrupt) vmbus_send_interrupt(child_relid); - hv_do_hypercall(HVCALL_SIGNAL_EVENT, channel->sig_event, NULL); + hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 6608a71e7d79..c472bd43bdd7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -677,18 +677,6 @@ union hv_connection_id { } u; }; -/* Definition of the hv_signal_event hypercall input structure. 
*/ -struct hv_input_signal_event { - union hv_connection_id connectionid; - u16 flag_number; - u16 rsvdz; -}; - -struct hv_input_signal_event_buffer { - u64 align8; - struct hv_input_signal_event event; -}; - enum hv_numa_policy { HV_BALANCED = 0, HV_LOCALIZED, @@ -770,8 +758,7 @@ struct vmbus_channel { } callback_mode; bool is_dedicated_interrupt; - struct hv_input_signal_event_buffer sig_buf; - struct hv_input_signal_event *sig_event; + u64 sig_event; /* * Starting with win8, this field will be used to specify -- cgit v1.2.3 From 7415aea6072bab15969b6c3c5b2a193d88095326 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 2 Aug 2017 18:09:18 +0200 Subject: hyper-v: Globalize vp_index To support implementing remote TLB flushing on Hyper-V with a hypercall we need to make vp_index available outside of vmbus module. Rename and globalize. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Andy Shevchenko Reviewed-by: Stephen Hemminger Cc: Andy Lutomirski Cc: Haiyang Zhang Cc: Jork Loeser Cc: K. Y. Srinivasan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Simon Xiao Cc: Steven Rostedt Cc: Thomas Gleixner Cc: devel@linuxdriverproject.org Link: http://lkml.kernel.org/r/20170802160921.21791-7-vkuznets@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 34 +++++++++++++++++++++++++- arch/x86/include/asm/mshyperv.h | 24 ++++++++++++++++++ drivers/hv/channel_mgmt.c | 7 +++--- drivers/hv/connection.c | 3 ++- drivers/hv/hv.c | 9 ------- drivers/hv/hyperv_vmbus.h | 11 --------- drivers/hv/vmbus_drv.c | 17 ------------- drivers/pci/host/pci-hyperv.c | 54 +++-------------------------------------- include/linux/hyperv.h | 1 - 9 files changed, 65 insertions(+), 95 deletions(-) (limited to 'include') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 691603ee9179..e93b9a0b1b10 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #ifdef CONFIG_HYPERV_TSCPAGE @@ -80,6 +82,20 @@ EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -95,6 +111,16 @@ void hyperv_init(void) if (x86_hyper != &x86_hyper_ms_hyperv) return; + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. 
Register the guest ID @@ -106,7 +132,7 @@ void hyperv_init(void) hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); @@ -149,6 +175,12 @@ register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index efa1860276b5..efd2f80d3353 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -282,6 +282,30 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, return status; } +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 *hv_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. + * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + return hv_vp_index[cpu_number]; +} + void hyperv_init(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index fd2b6c67f781..dc590195a74e 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -599,7 +599,7 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) */ channel->numa_node = 0; channel->target_cpu = 0; - channel->target_vp = hv_context.vp_index[0]; + channel->target_vp = hv_cpu_number_to_vp_number(0); return; } @@ -683,7 +683,7 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) } channel->target_cpu = cur_cpu; - channel->target_vp = hv_context.vp_index[cur_cpu]; + channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); } static void vmbus_wait_for_unload(void) @@ -1219,8 +1219,7 @@ struct vmbus_channel *vmbus_get_outgoing_channel(struct vmbus_channel *primary) return outgoing_channel; } - cur_cpu = hv_context.vp_index[get_cpu()]; - put_cpu(); + cur_cpu = hv_cpu_number_to_vp_number(smp_processor_id()); list_for_each_safe(cur, tmp, &primary->sc_list) { cur_channel = list_entry(cur, struct vmbus_channel, sc_list); if (cur_channel->state != CHANNEL_OPENED_STATE) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 37ecf514189e..f41901f80b64 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -96,7 +96,8 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, * the CPU attempting to connect may not be CPU 0. 
*/ if (version >= VERSION_WIN8_1) { - msg->target_vcpu = hv_context.vp_index[smp_processor_id()]; + msg->target_vcpu = + hv_cpu_number_to_vp_number(smp_processor_id()); vmbus_connection.connect_cpu = smp_processor_id(); } else { msg->target_vcpu = 0; diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 2ea12207caa0..8267439dd1ee 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -234,7 +234,6 @@ int hv_synic_init(unsigned int cpu) union hv_synic_siefp siefp; union hv_synic_sint shared_sint; union hv_synic_scontrol sctrl; - u64 vp_index; /* Setup the Synic's message page */ hv_get_simp(simp.as_uint64); @@ -275,14 +274,6 @@ int hv_synic_init(unsigned int cpu) hv_context.synic_initialized = true; - /* - * Setup the mapping between Hyper-V's notion - * of cpuid and Linux' notion of cpuid. - * This array will be indexed using Linux cpuid. - */ - hv_get_vp_index(vp_index); - hv_context.vp_index[cpu] = (u32)vp_index; - /* * Register the per-cpu clockevent source. */ diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 1b6a5e0dfa75..49569f8fe038 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -228,17 +228,6 @@ struct hv_context { struct hv_per_cpu_context __percpu *cpu_context; - /* - * Hypervisor's notion of virtual processor ID is different from - * Linux' notion of CPU ID. This information can only be retrieved - * in the context of the calling CPU. Setup a map for easy access - * to this information: - * - * vp_index[a] is the Hyper-V's processor ID corresponding to - * Linux cpuid 'a'. - */ - u32 vp_index[NR_CPUS]; - /* * To manage allocations in a NUMA node. * Array indexed by numa node ID. diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ed84e96715a0..c7e7d6db2d21 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1451,23 +1451,6 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) } EXPORT_SYMBOL_GPL(vmbus_free_mmio); -/** - * vmbus_cpu_number_to_vp_number() - Map CPU to VP. - * @cpu_number: CPU number in Linux terms - * - * This function returns the mapping between the Linux processor - * number and the hypervisor's virtual processor number, useful - * in making hypercalls and such that talk about specific - * processors. - * - * Return: Virtual processor number in Hyper-V terms - */ -int vmbus_cpu_number_to_vp_number(int cpu_number) -{ - return hv_context.vp_index[cpu_number]; -} -EXPORT_SYMBOL_GPL(vmbus_cpu_number_to_vp_number); - static int vmbus_acpi_add(struct acpi_device *device) { acpi_status result; diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c index 415dcc69a502..aba041438566 100644 --- a/drivers/pci/host/pci-hyperv.c +++ b/drivers/pci/host/pci-hyperv.c @@ -562,52 +562,6 @@ static void put_pcichild(struct hv_pci_dev *hv_pcidev, static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus); static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus); - -/* - * Temporary CPU to vCPU mapping to address transitioning - * vmbus_cpu_number_to_vp_number() being migrated to - * hv_cpu_number_to_vp_number() in a separate patch. Once that patch - * has been picked up in the main line, remove this code here and use - * the official code. 
- */ -static struct hv_tmpcpumap -{ - bool initialized; - u32 vp_index[NR_CPUS]; -} hv_tmpcpumap; - -static void hv_tmpcpumap_init_cpu(void *_unused) -{ - int cpu = smp_processor_id(); - u64 vp_index; - - hv_get_vp_index(vp_index); - - hv_tmpcpumap.vp_index[cpu] = vp_index; -} - -static void hv_tmpcpumap_init(void) -{ - if (hv_tmpcpumap.initialized) - return; - - memset(hv_tmpcpumap.vp_index, -1, sizeof(hv_tmpcpumap.vp_index)); - on_each_cpu(hv_tmpcpumap_init_cpu, NULL, true); - hv_tmpcpumap.initialized = true; -} - -/** - * hv_tmp_cpu_nr_to_vp_nr() - Convert Linux CPU nr to Hyper-V vCPU nr - * - * Remove once vmbus_cpu_number_to_vp_number() has been converted to - * hv_cpu_number_to_vp_number() and replace callers appropriately. - */ -static u32 hv_tmp_cpu_nr_to_vp_nr(int cpu) -{ - return hv_tmpcpumap.vp_index[cpu]; -} - - /** * devfn_to_wslot() - Convert from Linux PCI slot to Windows * @devfn: The Linux representation of PCI slot @@ -971,7 +925,7 @@ static void hv_irq_unmask(struct irq_data *data) var_size = 1 + HV_VP_SET_BANK_COUNT_MAX; for_each_cpu_and(cpu, dest, cpu_online_mask) { - cpu_vmbus = hv_tmp_cpu_nr_to_vp_nr(cpu); + cpu_vmbus = hv_cpu_number_to_vp_number(cpu); if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) { dev_err(&hbus->hdev->device, @@ -986,7 +940,7 @@ static void hv_irq_unmask(struct irq_data *data) } else { for_each_cpu_and(cpu, dest, cpu_online_mask) { params->int_target.vp_mask |= - (1ULL << hv_tmp_cpu_nr_to_vp_nr(cpu)); + (1ULL << hv_cpu_number_to_vp_number(cpu)); } } @@ -1063,7 +1017,7 @@ static u32 hv_compose_msi_req_v2( */ cpu = cpumask_first_and(affinity, cpu_online_mask); int_pkt->int_desc.processor_array[0] = - hv_tmp_cpu_nr_to_vp_nr(cpu); + hv_cpu_number_to_vp_number(cpu); int_pkt->int_desc.processor_count = 1; return sizeof(*int_pkt); @@ -2490,8 +2444,6 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->state = hv_pcibus_init; - hv_tmpcpumap_init(); - /* * The PCI bus "domain" is what is called "segment" in ACPI and * other specs. Pull it from the instance ID, to get something diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index c472bd43bdd7..e2a4fa57f110 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1173,7 +1173,6 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, resource_size_t size, resource_size_t align, bool fb_overlap_ok); void vmbus_free_mmio(resource_size_t start, resource_size_t size); -int vmbus_cpu_number_to_vp_number(int cpu_number); /* * GUID definitions of various offer types - services offered to the guest. -- cgit v1.2.3 From a0ac441152238c9b474bafa46940511d9b2e9c7e Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 8 Aug 2017 06:17:47 +0000 Subject: ASoC: soc-core: add snd_soc_rtdcom_xxx() Current snd_soc_pcm_runtime has platform / codec pointers, and we could use these specific pointers. But these will soon be replaced by a more generic "component", so we will need a more generic way to get each connected component pointer from the rtd. This patch adds new snd_soc_rtdcom_xxx() functions to connect/disconnect a component to/from an rtd. This is equivalent to the previous "platform" / "codec" pointer style, but more generic. We can find the necessary component pointer from an rtd by using the component driver name with snd_soc_rtdcom_lookup(). The reason it uses the "driver name" is that the "component name" is created by fmt_single_name() and is difficult to use from a driver. A driver of course knows its own "driver name", so using that is easier.
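As an illustrative aside (not part of this patch): with these helpers in place, a driver that knows its component driver name could resolve the connected component from an rtd roughly as sketched below. The function name and the "my-component" driver name are made-up placeholders for demonstration.

/* Hedged usage sketch: resolve a component from an rtd by driver name. */
static struct snd_soc_component *example_get_component(struct snd_soc_pcm_runtime *rtd)
{
	/* returns NULL when no connected component matches the name */
	return snd_soc_rtdcom_lookup(rtd, "my-component");
}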
Signed-off-by: Kuninori Morimoto Tested-by: Arnaud Pouliquen Signed-off-by: Mark Brown --- include/sound/soc.h | 13 +++++++++++++ sound/soc/soc-core.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index aad3f14966f1..87dbe39797ca 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -886,6 +886,18 @@ struct snd_soc_component { #endif }; +struct snd_soc_rtdcom_list { + struct snd_soc_component *component; + struct list_head list; /* rtd::component_list */ +}; +struct snd_soc_component* +snd_soc_rtdcom_lookup(struct snd_soc_pcm_runtime *rtd, + const char *driver_name); +#define for_each_rtdcom(rtd, rtdcom) \ + list_for_each_entry(rtdcom, &(rtd)->component_list, list) +#define for_each_rtdcom_safe(rtd, rtdcom1, rtdcom2) \ + list_for_each_entry_safe(rtdcom1, rtdcom2, &(rtd)->component_list, list) + /* SoC Audio Codec device */ struct snd_soc_codec { struct device *dev; @@ -1240,6 +1252,7 @@ struct snd_soc_pcm_runtime { unsigned int num; /* 0-based and monotonic increasing */ struct list_head list; /* rtd list of the soc card */ + struct list_head component_list; /* list of connected components */ /* bit field */ unsigned int dev_registered:1; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 07d708498500..c2462e0f0f0d 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -551,6 +551,54 @@ static inline void snd_soc_debugfs_exit(void) #endif +static int snd_soc_rtdcom_add(struct snd_soc_pcm_runtime *rtd, + struct snd_soc_component *component) +{ + struct snd_soc_rtdcom_list *rtdcom; + struct snd_soc_rtdcom_list *new_rtdcom; + + for_each_rtdcom(rtd, rtdcom) { + /* already connected */ + if (rtdcom->component == component) + return 0; + } + + new_rtdcom = kmalloc(sizeof(*new_rtdcom), GFP_KERNEL); + if (!new_rtdcom) + return -ENOMEM; + + new_rtdcom->component = component; + INIT_LIST_HEAD(&new_rtdcom->list); + + list_add_tail(&new_rtdcom->list, &rtd->component_list); + + return 0; +} + +static void snd_soc_rtdcom_del_all(struct snd_soc_pcm_runtime *rtd) +{ + struct snd_soc_rtdcom_list *rtdcom1, *rtdcom2; + + for_each_rtdcom_safe(rtd, rtdcom1, rtdcom2) + kfree(rtdcom1); + + INIT_LIST_HEAD(&rtd->component_list); +} + +struct snd_soc_component *snd_soc_rtdcom_lookup(struct snd_soc_pcm_runtime *rtd, + const char *driver_name) +{ + struct snd_soc_rtdcom_list *rtdcom; + + for_each_rtdcom(rtd, rtdcom) { + if ((rtdcom->component->driver->name == driver_name) || + strcmp(rtdcom->component->driver->name, driver_name) == 0) + return rtdcom->component; + } + + return NULL; +} + struct snd_pcm_substream *snd_soc_get_dai_substream(struct snd_soc_card *card, const char *dai_link, int stream) { @@ -575,6 +623,7 @@ static struct snd_soc_pcm_runtime *soc_new_pcm_runtime( if (!rtd) return NULL; + INIT_LIST_HEAD(&rtd->component_list); rtd->card = card; rtd->dai_link = dai_link; rtd->codec_dais = kzalloc(sizeof(struct snd_soc_dai *) * @@ -592,6 +641,7 @@ static void soc_free_pcm_runtime(struct snd_soc_pcm_runtime *rtd) { if (rtd && rtd->codec_dais) kfree(rtd->codec_dais); + snd_soc_rtdcom_del_all(rtd); kfree(rtd); } -- cgit v1.2.3 From 90be711e23f7c7ee7b3d6a6e5aa7ee9bab321f2e Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 8 Aug 2017 06:18:10 +0000 Subject: ASoC: use snd_soc_rtdcom_add() and convert to consistent operation Basically, the current ALSA SoC framework is based on CPU/Codec/Platform, but its operations are not consistent.
As a result, the source code was unreadable and difficult to understand. This patch connects each component (CPU/Codec/Platform) to the rtd by using snd_soc_rtdcom_add(), and converts the uneven operations into consistent ones. Signed-off-by: Kuninori Morimoto Tested-by: Arnaud Pouliquen Signed-off-by: Mark Brown --- include/sound/soc.h | 2 +- sound/soc/soc-core.c | 61 +++++++++++++++++++++++----------------------------- sound/soc/soc-pcm.c | 38 ++++++++++++++++---------------- 3 files changed, 47 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 87dbe39797ca..82e078151787 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1237,7 +1237,7 @@ struct snd_soc_pcm_runtime { struct snd_pcm *pcm; struct snd_compr *compr; struct snd_soc_codec *codec; - struct snd_soc_platform *platform; + struct snd_soc_platform *platform; /* will be removed */ struct snd_soc_dai *codec_dai; struct snd_soc_dai *cpu_dai; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index c2462e0f0f0d..bbbe003fc0af 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1100,6 +1100,7 @@ static int soc_bind_dai_link(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd; struct snd_soc_dai_link_component *codecs = dai_link->codecs; struct snd_soc_dai_link_component cpu_dai_component; + struct snd_soc_component *component; struct snd_soc_dai **codec_dais; struct snd_soc_platform *platform; struct device_node *platform_of_node; @@ -1127,6 +1128,7 @@ static int soc_bind_dai_link(struct snd_soc_card *card, dai_link->cpu_dai_name); goto _err_defer; } + snd_soc_rtdcom_add(rtd, rtd->cpu_dai->component); rtd->num_codecs = dai_link->num_codecs; @@ -1139,6 +1141,7 @@ static int soc_bind_dai_link(struct snd_soc_card *card, codecs[i].dai_name); goto _err_defer; } + snd_soc_rtdcom_add(rtd, codec_dais[i]->component); } /* Single codec links expect codec and codec_dai in runtime data */ @@ -1150,6 +1153,23 @@ static int soc_bind_dai_link(struct snd_soc_card *card, if (!platform_name && !dai_link->platform_of_node) platform_name = "snd-soc-dummy"; + /* find one from the set of registered platforms */ + list_for_each_entry(component, &component_list, list) { + platform_of_node = component->dev->of_node; + if (!platform_of_node && component->dev->parent->of_node) + platform_of_node = component->dev->parent->of_node; + + if (dai_link->platform_of_node) { + if (platform_of_node != dai_link->platform_of_node) + continue; + } else { + if (strcmp(component->name, platform_name)) + continue; + } + + snd_soc_rtdcom_add(rtd, component); + } + /* find one from the set of registered platforms */ list_for_each_entry(platform, &platform_list, list) { platform_of_node = platform->dev->of_node; @@ -1235,27 +1255,15 @@ static void soc_remove_link_dais(struct snd_soc_card *card, static void soc_remove_link_components(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd, int order) { - struct snd_soc_dai *cpu_dai = rtd->cpu_dai; - struct snd_soc_platform *platform = rtd->platform; struct snd_soc_component *component; - int i; + struct snd_soc_rtdcom_list *rtdcom; - /* remove the platform */ - if (platform && platform->component.driver->remove_order == order) - soc_remove_component(&platform->component); + for_each_rtdcom(rtd, rtdcom) { + component = rtdcom->component; - /* remove the CODEC-side CODEC */ - for (i = 0; i < rtd->num_codecs; i++) { - component = rtd->codec_dais[i]->component; if (component->driver->remove_order == order) soc_remove_component(component); } - - /*
remove any CPU-side CODEC */ - if (cpu_dai) { - if (cpu_dai->component->driver->remove_order == order) - soc_remove_component(cpu_dai->component); - } } static void soc_remove_dai_links(struct snd_soc_card *card) @@ -1607,21 +1615,13 @@ static int soc_probe_link_components(struct snd_soc_card *card, struct snd_soc_pcm_runtime *rtd, int order) { - struct snd_soc_platform *platform = rtd->platform; struct snd_soc_component *component; - int i, ret; + struct snd_soc_rtdcom_list *rtdcom; + int ret; - /* probe the CPU-side component, if it is a CODEC */ - component = rtd->cpu_dai->component; - if (component->driver->probe_order == order) { - ret = soc_probe_component(card, component); - if (ret < 0) - return ret; - } + for_each_rtdcom(rtd, rtdcom) { + component = rtdcom->component; - /* probe the CODEC-side components */ - for (i = 0; i < rtd->num_codecs; i++) { - component = rtd->codec_dais[i]->component; if (component->driver->probe_order == order) { ret = soc_probe_component(card, component); if (ret < 0) @@ -1629,13 +1629,6 @@ static int soc_probe_link_components(struct snd_soc_card *card, } } - /* probe the platform */ - if (platform->component.driver->probe_order == order) { - ret = soc_probe_component(card, &platform->component); - if (ret < 0) - return ret; - } - return 0; } diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index dcc5ece08668..aa4c02de9c61 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -450,6 +450,8 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_pcm_runtime *runtime = substream->runtime; struct snd_soc_platform *platform = rtd->platform; + struct snd_soc_component *component; + struct snd_soc_rtdcom_list *rtdcom; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; struct snd_soc_dai *codec_dai; const char *codec_dai_name = "multicodec"; @@ -458,10 +460,12 @@ static int soc_pcm_open(struct snd_pcm_substream *substream) pinctrl_pm_select_default_state(cpu_dai->dev); for (i = 0; i < rtd->num_codecs; i++) pinctrl_pm_select_default_state(rtd->codec_dais[i]->dev); - pm_runtime_get_sync(cpu_dai->dev); - for (i = 0; i < rtd->num_codecs; i++) - pm_runtime_get_sync(rtd->codec_dais[i]->dev); - pm_runtime_get_sync(platform->dev); + + for_each_rtdcom(rtd, rtdcom) { + component = rtdcom->component; + + pm_runtime_get_sync(component->dev); + } mutex_lock_nested(&rtd->pcm_mutex, rtd->pcm_subclass); @@ -599,15 +603,13 @@ platform_err: out: mutex_unlock(&rtd->pcm_mutex); - pm_runtime_mark_last_busy(platform->dev); - pm_runtime_put_autosuspend(platform->dev); - for (i = 0; i < rtd->num_codecs; i++) { - pm_runtime_mark_last_busy(rtd->codec_dais[i]->dev); - pm_runtime_put_autosuspend(rtd->codec_dais[i]->dev); + for_each_rtdcom(rtd, rtdcom) { + component = rtdcom->component; + + pm_runtime_mark_last_busy(component->dev); + pm_runtime_put_autosuspend(component->dev); } - pm_runtime_mark_last_busy(cpu_dai->dev); - pm_runtime_put_autosuspend(cpu_dai->dev); for (i = 0; i < rtd->num_codecs; i++) { if (!rtd->codec_dais[i]->active) pinctrl_pm_select_sleep_state(rtd->codec_dais[i]->dev); @@ -655,6 +657,8 @@ static int soc_pcm_close(struct snd_pcm_substream *substream) { struct snd_soc_pcm_runtime *rtd = substream->private_data; struct snd_soc_platform *platform = rtd->platform; + struct snd_soc_component *component; + struct snd_soc_rtdcom_list *rtdcom; struct snd_soc_dai *cpu_dai = rtd->cpu_dai; struct snd_soc_dai *codec_dai; int i; @@ -711,17 +715,13 @@ static int soc_pcm_close(struct 
snd_pcm_substream *substream) mutex_unlock(&rtd->pcm_mutex); - pm_runtime_mark_last_busy(platform->dev); - pm_runtime_put_autosuspend(platform->dev); + for_each_rtdcom(rtd, rtdcom) { + component = rtdcom->component; - for (i = 0; i < rtd->num_codecs; i++) { - pm_runtime_mark_last_busy(rtd->codec_dais[i]->dev); - pm_runtime_put_autosuspend(rtd->codec_dais[i]->dev); + pm_runtime_mark_last_busy(component->dev); + pm_runtime_put_autosuspend(component->dev); } - pm_runtime_mark_last_busy(cpu_dai->dev); - pm_runtime_put_autosuspend(cpu_dai->dev); - for (i = 0; i < rtd->num_codecs; i++) { if (!rtd->codec_dais[i]->active) pinctrl_pm_select_sleep_state(rtd->codec_dais[i]->dev); -- cgit v1.2.3 From fc66ce0b72046318c4c4a66e67a2362193df6de1 Mon Sep 17 00:00:00 2001 From: Sekhar Nori Date: Thu, 10 Aug 2017 09:02:37 -0700 Subject: ARM: OMAP2+: Add pdata-quirks for MMC/SD on DRA74x EVM The DRA74x Rev H EVM comes with revision 2.0 silicon. However, earlier versions of the EVM can come with either revision 1.1 or revision 1.0 silicon. The device-tree file is written to support rev 2.0 silicon; pdata quirks are then used to override the settings needed for PG 1.1 silicon. PG 1.1 silicon has limitations w.r.t. the frequencies at which MMC1/2/3 can operate, as well as different IOdelay numbers. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Sekhar Nori Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/pdata-quirks.c | 31 +++++++++++++++++++++++++++++++ include/linux/platform_data/hsmmc-omap.h | 3 +++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index 9700a8ef0f16..6b433fce65a5 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -434,6 +434,26 @@ static void __init omap5_uevm_legacy_init(void) } #endif +#ifdef CONFIG_SOC_DRA7XX +static struct omap_hsmmc_platform_data dra7_hsmmc_data_mmc1; +static struct omap_hsmmc_platform_data dra7_hsmmc_data_mmc2; +static struct omap_hsmmc_platform_data dra7_hsmmc_data_mmc3; + +static void __init dra7x_evm_mmc_quirk(void) +{ + if (omap_rev() == DRA752_REV_ES1_1 || omap_rev() == DRA752_REV_ES1_0) { + dra7_hsmmc_data_mmc1.version = "rev11"; + dra7_hsmmc_data_mmc1.max_freq = 96000000; + + dra7_hsmmc_data_mmc2.version = "rev11"; + dra7_hsmmc_data_mmc2.max_freq = 48000000; + + dra7_hsmmc_data_mmc3.version = "rev11"; + dra7_hsmmc_data_mmc3.max_freq = 48000000; + } +} +#endif + static struct pcs_pdata pcs_pdata; void omap_pcs_legacy_init(int irq, void (*rearm)(void)) @@ -560,6 +580,14 @@ static struct of_dev_auxdata omap_auxdata_lookup[] __initdata = { &omap4_iommu_pdata), OF_DEV_AUXDATA("ti,omap4-iommu", 0x55082000, "55082000.mmu", &omap4_iommu_pdata), +#endif +#ifdef CONFIG_SOC_DRA7XX + OF_DEV_AUXDATA("ti,dra7-hsmmc", 0x4809c000, "4809c000.mmc", + &dra7_hsmmc_data_mmc1), + OF_DEV_AUXDATA("ti,dra7-hsmmc", 0x480b4000, "480b4000.mmc", + &dra7_hsmmc_data_mmc2), + OF_DEV_AUXDATA("ti,dra7-hsmmc", 0x480ad000, "480ad000.mmc", + &dra7_hsmmc_data_mmc3), #endif /* Common auxdata */ OF_DEV_AUXDATA("pinctrl-single", 0, NULL, &pcs_pdata), @@ -589,6 +617,9 @@ static struct pdata_init pdata_quirks[] __initdata = { #endif #ifdef CONFIG_SOC_OMAP5 { "ti,omap5-uevm", omap5_uevm_legacy_init, }, +#endif +#ifdef CONFIG_SOC_DRA7XX + { "ti,dra7-evm", dra7x_evm_mmc_quirk, }, #endif { /* sentinel */ }, }; diff --git a/include/linux/platform_data/hsmmc-omap.h b/include/linux/platform_data/hsmmc-omap.h index 8e981be2e2c2..67bded22eae4 100644 ---
a/include/linux/platform_data/hsmmc-omap.h +++ b/include/linux/platform_data/hsmmc-omap.h @@ -70,6 +70,9 @@ struct omap_hsmmc_platform_data { #define HSMMC_HAS_HSPE_SUPPORT (1 << 2) unsigned features; + /* string specifying a particular variant of hardware */ + char *version; + int gpio_cd; /* gpio (card detect) */ int gpio_cod; /* gpio (cover detect) */ int gpio_wp; /* gpio (write protect) */ -- cgit v1.2.3 From eba6130b31bc0c112302d0b1340badb37fd3be1e Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 16 Jun 2017 17:24:29 +0530 Subject: ARM: dts: Add dra7 iodelay configuration Add dra7 iodelay configuration. Signed-off-by: Tony Lindgren Signed-off-by: Nishanth Menon Signed-off-by: Kishon Vijay Abraham I --- arch/arm/boot/dts/dra7.dtsi | 8 ++++++++ include/dt-bindings/pinctrl/dra.h | 3 +++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index 571cacdf0010..32347d7f00d7 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -418,6 +418,14 @@ reg = <0x40d00000 0x100>; }; + dra7_iodelay_core: padconf@4844a000 { + compatible = "ti,dra7-iodelay"; + reg = <0x4844a000 0x0d1c>; + #address-cells = <1>; + #size-cells = <0>; + #pinctrl-cells = <2>; + }; + sdma: dma-controller@4a056000 { compatible = "ti,omap4430-sdma"; reg = <0x4a056000 0x1000>; diff --git a/include/dt-bindings/pinctrl/dra.h b/include/dt-bindings/pinctrl/dra.h index 5c75e80915fc..18ec5df5a581 100644 --- a/include/dt-bindings/pinctrl/dra.h +++ b/include/dt-bindings/pinctrl/dra.h @@ -73,5 +73,8 @@ */ #define DRA7XX_CORE_IOPAD(pa, val) (((pa) & 0xffff) - 0x3400) (val) +/* DRA7 IODELAY configuration parameters */ +#define A_DELAY_PS(val) ((val) & 0xffff) +#define G_DELAY_PS(val) ((val) & 0xffff) #endif -- cgit v1.2.3 From 606799cc5049ae4ccb51ba3242365ca2d411da13 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 10 Aug 2017 12:49:57 -0500 Subject: PCI: Inline and remove pcibios_update_irq() pcibios_update_irq() was a weak function with only one trivial implementation. Inline it and remove the weak function. Signed-off-by: Bjorn Helgaas --- drivers/pci/setup-irq.c | 8 +------- include/linux/pci.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/pci/setup-irq.c b/drivers/pci/setup-irq.c index 69e3b56c32a4..86106c44ce94 100644 --- a/drivers/pci/setup-irq.c +++ b/drivers/pci/setup-irq.c @@ -17,12 +17,6 @@ #include #include "pci.h" -void __weak pcibios_update_irq(struct pci_dev *dev, int irq) -{ - dev_dbg(&dev->dev, "assigning IRQ %02d\n", irq); - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - void pci_assign_irq(struct pci_dev *dev) { u8 pin; @@ -65,5 +59,5 @@ void pci_assign_irq(struct pci_dev *dev) /* Always tell the device, so the driver knows what is the real IRQ to use; the device does not use it. */ - pcibios_update_irq(dev, irq); + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } diff --git a/include/linux/pci.h b/include/linux/pci.h index 69034ab8a68e..a9e8d2f3bd91 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -844,7 +844,6 @@ char *pcibios_setup(char *str); resource_size_t pcibios_align_resource(void *, const struct resource *, resource_size_t, resource_size_t); -void pcibios_update_irq(struct pci_dev *, int irq); /* Weak but can be overriden by arch */ void pci_fixup_cardbus(struct pci_bus *); -- cgit v1.2.3 From 690cbb90a709c1b9389c6cb8e1978e77553ce0fb Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 10 Aug 2017 00:13:07 +0200 Subject: PM / s2idle: Rename PM_SUSPEND_FREEZE to PM_SUSPEND_TO_IDLE To make it clear that the symbol in question refers to suspend-to-idle, rename it from PM_SUSPEND_FREEZE to PM_SUSPEND_TO_IDLE. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 2 +- drivers/regulator/of_regulator.c | 2 +- include/linux/suspend.h | 4 ++-- kernel/power/suspend.c | 44 ++++++++++++++++++++-------------------- kernel/power/suspend_test.c | 4 ++-- 5 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index b363283dfcd9..a0a6fd10fb5f 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -719,7 +719,7 @@ static int lps0_device_attach(struct acpi_device *adev, * suspend mode was not set from the command line. */ if (mem_sleep_default > PM_SUSPEND_MEM) - mem_sleep_current = PM_SUSPEND_FREEZE; + mem_sleep_current = PM_SUSPEND_TO_IDLE; } acpi_handle_debug(adev->handle, "_DSM function mask: 0x%x\n", diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 96bf75458da5..860480ecf2be 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -150,7 +150,7 @@ static void of_get_regulation_constraints(struct device_node *np, suspend_state = &constraints->state_disk; break; case PM_SUSPEND_ON: - case PM_SUSPEND_FREEZE: + case PM_SUSPEND_TO_IDLE: case PM_SUSPEND_STANDBY: default: continue; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 8c3b0b1e6786..d43654be0097 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -33,10 +33,10 @@ static inline void pm_restore_console(void) typedef int __bitwise suspend_state_t; #define PM_SUSPEND_ON ((__force suspend_state_t) 0) -#define PM_SUSPEND_FREEZE ((__force suspend_state_t) 1) +#define PM_SUSPEND_TO_IDLE ((__force suspend_state_t) 1) #define PM_SUSPEND_STANDBY ((__force suspend_state_t) 2) #define PM_SUSPEND_MEM ((__force suspend_state_t) 3) -#define PM_SUSPEND_MIN PM_SUSPEND_FREEZE +#define PM_SUSPEND_MIN PM_SUSPEND_TO_IDLE #define PM_SUSPEND_MAX ((__force suspend_state_t) 4) enum suspend_stat_step { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0639d3a79852..6333078a438b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -35,19 +35,19 @@ #include "power.h" const char * const pm_labels[] = { - [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_TO_IDLE] = "freeze", [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", }; const char *pm_states[PM_SUSPEND_MAX]; static const char * const mem_sleep_labels[] = { - [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_TO_IDLE] = "s2idle", [PM_SUSPEND_STANDBY] = "shallow", [PM_SUSPEND_MEM] = "deep", }; const char *mem_sleep_states[PM_SUSPEND_MAX]; -suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +suspend_state_t mem_sleep_current = PM_SUSPEND_TO_IDLE; suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; suspend_state_t pm_suspend_target_state; EXPORT_SYMBOL_GPL(pm_suspend_target_state); @@ -76,7 +76,7 @@ static void freeze_begin(void) static void freeze_enter(void) { - trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) @@ -103,7 +103,7 @@ static void freeze_enter(void) suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); - trace_suspend_resume(TPS("machine_suspend"), 
PM_SUSPEND_FREEZE, false); + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } static void s2idle_loop(void) @@ -175,19 +175,19 @@ void __init pm_states_init(void) { /* "mem" and "freeze" are always present in /sys/power/state. */ pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; - pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; + pm_states[PM_SUSPEND_TO_IDLE] = pm_labels[PM_SUSPEND_TO_IDLE]; /* * Suspend-to-idle should be supported even without any suspend_ops, * initialize mem_sleep_states[] accordingly here. */ - mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; + mem_sleep_states[PM_SUSPEND_TO_IDLE] = mem_sleep_labels[PM_SUSPEND_TO_IDLE]; } static int __init mem_sleep_default_setup(char *str) { suspend_state_t state; - for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + for (state = PM_SUSPEND_TO_IDLE; state <= PM_SUSPEND_MEM; state++) if (mem_sleep_labels[state] && !strcmp(str, mem_sleep_labels[state])) { mem_sleep_default = state; @@ -239,48 +239,48 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem); static bool sleep_state_supported(suspend_state_t state) { - return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); + return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter); } static int platform_suspend_prepare(suspend_state_t state) { - return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? + return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare ? suspend_ops->prepare() : 0; } static int platform_suspend_prepare_late(suspend_state_t state) { - return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? + return state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->prepare ? freeze_ops->prepare() : 0; } static int platform_suspend_prepare_noirq(suspend_state_t state) { - return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? + return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ? 
suspend_ops->prepare_late() : 0; } static void platform_resume_noirq(suspend_state_t state) { - if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake) suspend_ops->wake(); } static void platform_resume_early(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) + if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->restore) freeze_ops->restore(); } static void platform_resume_finish(suspend_state_t state) { - if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->finish) suspend_ops->finish(); } static int platform_suspend_begin(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) + if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->begin) return freeze_ops->begin(); else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); @@ -290,7 +290,7 @@ static int platform_suspend_begin(suspend_state_t state) static void platform_resume_end(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) + if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->end) freeze_ops->end(); else if (suspend_ops && suspend_ops->end) suspend_ops->end(); @@ -298,13 +298,13 @@ static void platform_resume_end(suspend_state_t state) static void platform_recover(suspend_state_t state) { - if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) + if (state != PM_SUSPEND_TO_IDLE && suspend_ops->recover) suspend_ops->recover(); } static bool platform_suspend_again(suspend_state_t state) { - return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? + return state != PM_SUSPEND_TO_IDLE && suspend_ops->suspend_again ? suspend_ops->suspend_again() : false; } @@ -400,7 +400,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Devices_early_resume; - if (state == PM_SUSPEND_FREEZE && pm_test_level != TEST_PLATFORM) { + if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) { s2idle_loop(); goto Platform_early_resume; } @@ -538,7 +538,7 @@ static int enter_state(suspend_state_t state) int error; trace_suspend_resume(TPS("suspend_enter"), state, true); - if (state == PM_SUSPEND_FREEZE) { + if (state == PM_SUSPEND_TO_IDLE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); @@ -551,7 +551,7 @@ static int enter_state(suspend_state_t state) if (!mutex_trylock(&pm_mutex)) return -EBUSY; - if (state == PM_SUSPEND_FREEZE) + if (state == PM_SUSPEND_TO_IDLE) freeze_begin(); #ifndef CONFIG_SUSPEND_SKIP_SYNC diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 5db217051232..6a897e8b2a88 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -104,9 +104,9 @@ repeat: printk(info_test, pm_states[state]); status = pm_suspend(state); if (status < 0) - state = PM_SUSPEND_FREEZE; + state = PM_SUSPEND_TO_IDLE; } - if (state == PM_SUSPEND_FREEZE) { + if (state == PM_SUSPEND_TO_IDLE) { printk(info_test, pm_states[state]); status = pm_suspend(state); } -- cgit v1.2.3 From f02f4f9d826590f1ef1b087374d3e3cfb7949eac Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 10 Aug 2017 00:13:56 +0200 Subject: PM / s2idle: Rename freeze_state enum and related items Rename the freeze_state enum representing the suspend-to-idle state machine states to s2idle_states and rename the related variables and functions accordingly. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 2 +- include/linux/suspend.h | 20 +++++++++---------- kernel/power/suspend.c | 48 ++++++++++++++++++++++----------------------- kernel/sched/idle.c | 6 +++--- 4 files changed, 38 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 144e6d8fafc8..34aebf3e460b 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -863,7 +863,7 @@ bool pm_wakeup_pending(void) void pm_system_wakeup(void) { atomic_inc(&pm_abort_suspend); - freeze_wake(); + s2idle_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d43654be0097..feb43dfbf7bc 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -237,22 +237,22 @@ static inline bool pm_resume_via_firmware(void) } /* Suspend-to-idle state machnine. */ -enum freeze_state { - FREEZE_STATE_NONE, /* Not suspended/suspending. */ - FREEZE_STATE_ENTER, /* Enter suspend-to-idle. */ - FREEZE_STATE_WAKE, /* Wake up from suspend-to-idle. */ +enum s2idle_states { + S2IDLE_STATE_NONE, /* Not suspended/suspending. */ + S2IDLE_STATE_ENTER, /* Enter suspend-to-idle. */ + S2IDLE_STATE_WAKE, /* Wake up from suspend-to-idle. */ }; -extern enum freeze_state __read_mostly suspend_freeze_state; +extern enum s2idle_states __read_mostly s2idle_state; -static inline bool idle_should_freeze(void) +static inline bool idle_should_enter_s2idle(void) { - return unlikely(suspend_freeze_state == FREEZE_STATE_ENTER); + return unlikely(s2idle_state == S2IDLE_STATE_ENTER); } extern void __init pm_states_init(void); extern void freeze_set_ops(const struct platform_freeze_ops *ops); -extern void freeze_wake(void); +extern void s2idle_wake(void); /** * arch_suspend_disable_irqs - disable IRQs for suspend @@ -284,10 +284,10 @@ static inline bool pm_resume_via_firmware(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } -static inline bool idle_should_freeze(void) { return false; } +static inline bool idle_should_enter_s2idle(void) { return false; } static inline void __init pm_states_init(void) {} static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {} -static inline void freeze_wake(void) {} +static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ /* struct pbe is used for creating lists of pages that should be restored diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6333078a438b..ae9b579c2533 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -57,10 +57,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; static const struct platform_freeze_ops *freeze_ops; -static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); +static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); -enum freeze_state __read_mostly suspend_freeze_state; -static DEFINE_SPINLOCK(suspend_freeze_lock); +enum s2idle_states __read_mostly s2idle_state; +static DEFINE_SPINLOCK(s2idle_lock); void freeze_set_ops(const struct platform_freeze_ops *ops) { @@ -69,21 +69,21 @@ void 
freeze_set_ops(const struct platform_freeze_ops *ops) unlock_system_sleep(); } -static void freeze_begin(void) +static void s2idle_begin(void) { - suspend_freeze_state = FREEZE_STATE_NONE; + s2idle_state = S2IDLE_STATE_NONE; } -static void freeze_enter(void) +static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); - spin_lock_irq(&suspend_freeze_lock); + spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; - suspend_freeze_state = FREEZE_STATE_ENTER; - spin_unlock_irq(&suspend_freeze_lock); + s2idle_state = S2IDLE_STATE_ENTER; + spin_unlock_irq(&s2idle_lock); get_online_cpus(); cpuidle_resume(); @@ -91,17 +91,17 @@ static void freeze_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - wait_event(suspend_freeze_wait_head, - suspend_freeze_state == FREEZE_STATE_WAKE); + wait_event(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); put_online_cpus(); - spin_lock_irq(&suspend_freeze_lock); + spin_lock_irq(&s2idle_lock); out: - suspend_freeze_state = FREEZE_STATE_NONE; - spin_unlock_irq(&suspend_freeze_lock); + s2idle_state = S2IDLE_STATE_NONE; + spin_unlock_irq(&s2idle_lock); trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } @@ -118,12 +118,12 @@ static void s2idle_loop(void) /* * Suspend-to-idle equals * frozen processes + suspended devices + idle processors. - * Thus freeze_enter() should be called right after + * Thus s2idle_enter() should be called right after * all devices have been suspended. */ error = dpm_noirq_suspend_devices(PMSG_SUSPEND); if (!error) - freeze_enter(); + s2idle_enter(); dpm_noirq_resume_devices(PMSG_RESUME); if (error && (error != -EBUSY || !pm_wakeup_pending())) { @@ -148,18 +148,18 @@ static void s2idle_loop(void) pm_pr_dbg("resume from suspend-to-idle\n"); } -void freeze_wake(void) +void s2idle_wake(void) { unsigned long flags; - spin_lock_irqsave(&suspend_freeze_lock, flags); - if (suspend_freeze_state > FREEZE_STATE_NONE) { - suspend_freeze_state = FREEZE_STATE_WAKE; - wake_up(&suspend_freeze_wait_head); + spin_lock_irqsave(&s2idle_lock, flags); + if (s2idle_state > S2IDLE_STATE_NONE) { + s2idle_state = S2IDLE_STATE_WAKE; + wake_up(&s2idle_wait_head); } - spin_unlock_irqrestore(&suspend_freeze_lock, flags); + spin_unlock_irqrestore(&s2idle_lock, flags); } -EXPORT_SYMBOL_GPL(freeze_wake); +EXPORT_SYMBOL_GPL(s2idle_wake); static bool valid_state(suspend_state_t state) { @@ -552,7 +552,7 @@ static int enter_state(suspend_state_t state) return -EBUSY; if (state == PM_SUSPEND_TO_IDLE) - freeze_begin(); + s2idle_begin(); #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6c23e30c0e5c..2ea4b7b23044 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,7 +158,7 @@ static void cpuidle_idle_call(void) } /* - * Suspend-to-idle ("freeze") is a system state in which all user space + * Suspend-to-idle ("s2idle") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only * activity happens here and in iterrupts (if any). In that case bypass * the cpuidle governor and go stratight for the deepest idle state @@ -167,8 +167,8 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. 
*/ - if (idle_should_freeze() || dev->use_deepest_state) { - if (idle_should_freeze()) { + if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle()) { entered_state = cpuidle_enter_freeze(drv, dev); if (entered_state > 0) { local_irq_enable(); -- cgit v1.2.3 From 28ba086ed30fb3fb714598aa029b894c3754fa7b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2017 00:14:45 +0200 Subject: PM / s2idle: Rename ->enter_freeze to ->enter_s2idle Rename the ->enter_freeze cpuidle driver callback to ->enter_s2idle to make it clear that it is used for entering suspend-to-idle and rename the related functions, variables and so on accordingly. Signed-off-by: Rafael J. Wysocki --- arch/arm/mach-tegra/cpuidle-tegra114.c | 4 +- drivers/acpi/processor_idle.c | 6 +- drivers/cpuidle/cpuidle.c | 18 ++-- drivers/cpuidle/dt_idle_states.c | 4 +- drivers/idle/intel_idle.c | 180 ++++++++++++++++----------------- include/linux/cpuidle.h | 8 +- kernel/sched/idle.c | 2 +- 7 files changed, 111 insertions(+), 111 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-tegra/cpuidle-tegra114.c b/arch/arm/mach-tegra/cpuidle-tegra114.c index d3aa9be16621..e3fbcfedf845 100644 --- a/arch/arm/mach-tegra/cpuidle-tegra114.c +++ b/arch/arm/mach-tegra/cpuidle-tegra114.c @@ -60,7 +60,7 @@ static int tegra114_idle_power_down(struct cpuidle_device *dev, return index; } -static void tegra114_idle_enter_freeze(struct cpuidle_device *dev, +static void tegra114_idle_enter_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { @@ -77,7 +77,7 @@ static struct cpuidle_driver tegra_idle_driver = { #ifdef CONFIG_PM_SLEEP [1] = { .enter = tegra114_idle_power_down, - .enter_freeze = tegra114_idle_enter_freeze, + .enter_s2idle = tegra114_idle_enter_s2idle, .exit_latency = 500, .target_residency = 1000, .flags = CPUIDLE_FLAG_TIMER_STOP, diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5c8aa9cf62d7..a9eb9d654b7a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -789,7 +789,7 @@ static int acpi_idle_enter(struct cpuidle_device *dev, return index; } -static void acpi_idle_enter_freeze(struct cpuidle_device *dev, +static void acpi_idle_enter_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { struct acpi_processor_cx *cx = per_cpu(acpi_cstate[index], dev->cpu); @@ -867,14 +867,14 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr) drv->safe_state_index = count; } /* - * Halt-induced C1 is not good for ->enter_freeze, because it + * Halt-induced C1 is not good for ->enter_s2idle, because it * re-enables interrupts on exit. Moreover, C1 is generally not * particularly interesting from the suspend-to-idle angle, so * avoid C1 and the situations in which we may need to fall back * to it altogether. 
*/ if (cx->type != ACPI_STATE_C1 && !acpi_idle_fallback_to_c1(pr)) - state->enter_freeze = acpi_idle_enter_freeze; + state->enter_s2idle = acpi_idle_enter_s2idle; count++; if (count == CPUIDLE_STATE_MAX) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 60bb64f4329d..484cc8909d5c 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -77,7 +77,7 @@ static int find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, unsigned int max_latency, unsigned int forbidden_flags, - bool freeze) + bool s2idle) { unsigned int latency_req = 0; int i, ret = 0; @@ -89,7 +89,7 @@ static int find_deepest_state(struct cpuidle_driver *drv, if (s->disabled || su->disable || s->exit_latency <= latency_req || s->exit_latency > max_latency || (s->flags & forbidden_flags) - || (freeze && !s->enter_freeze)) + || (s2idle && !s->enter_s2idle)) continue; latency_req = s->exit_latency; @@ -128,7 +128,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, } #ifdef CONFIG_SUSPEND -static void enter_freeze_proper(struct cpuidle_driver *drv, +static void enter_s2idle_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { /* @@ -143,7 +143,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, * suspended is generally unsafe. */ stop_critical_timings(); - drv->states[index].enter_freeze(dev, drv, index); + drv->states[index].enter_s2idle(dev, drv, index); WARN_ON(!irqs_disabled()); /* * timekeeping_resume() that will be called by tick_unfreeze() for the @@ -155,25 +155,25 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, } /** - * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle. + * cpuidle_enter_s2idle - Enter an idle state suitable for suspend-to-idle. * @drv: cpuidle driver for the given CPU. * @dev: cpuidle device for the given CPU. * - * If there are states with the ->enter_freeze callback, find the deepest of + * If there are states with the ->enter_s2idle callback, find the deepest of * them and enter it with frozen tick. */ -int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) +int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) { int index; /* - * Find the deepest state with ->enter_freeze present, which guarantees + * Find the deepest state with ->enter_s2idle present, which guarantees * that interrupts won't be enabled when it exits and allows the tick to * be frozen safely. */ index = find_deepest_state(drv, dev, UINT_MAX, 0, true); if (index > 0) - enter_freeze_proper(drv, dev, index); + enter_s2idle_proper(drv, dev, index); return index; } diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index ae8eb0359889..5ba9fab64e7d 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -41,9 +41,9 @@ static int init_state_node(struct cpuidle_state *idle_state, /* * Since this is not a "coupled" state, it's safe to assume interrupts * won't be enabled when it exits allowing the tick to be frozen - * safely. So enter() can be also enter_freeze() callback. + * safely. So enter() can be also enter_s2idle() callback. 
*/ - idle_state->enter_freeze = match_id->data; + idle_state->enter_s2idle = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c2ae819a871c..474cc4330613 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -97,7 +97,7 @@ static const struct idle_cpu *icpu; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; static int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -static void intel_idle_freeze(struct cpuidle_device *dev, +static void intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static struct cpuidle_state *cpuidle_state_table; @@ -132,7 +132,7 @@ static struct cpuidle_state nehalem_cstates[] = { .exit_latency = 3, .target_residency = 6, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -140,7 +140,7 @@ static struct cpuidle_state nehalem_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -148,7 +148,7 @@ static struct cpuidle_state nehalem_cstates[] = { .exit_latency = 20, .target_residency = 80, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -156,7 +156,7 @@ static struct cpuidle_state nehalem_cstates[] = { .exit_latency = 200, .target_residency = 800, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -169,7 +169,7 @@ static struct cpuidle_state snb_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -177,7 +177,7 @@ static struct cpuidle_state snb_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -185,7 +185,7 @@ static struct cpuidle_state snb_cstates[] = { .exit_latency = 80, .target_residency = 211, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -193,7 +193,7 @@ static struct cpuidle_state snb_cstates[] = { .exit_latency = 104, .target_residency = 345, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7", .desc = "MWAIT 0x30", @@ -201,7 +201,7 @@ static struct cpuidle_state snb_cstates[] = { .exit_latency = 109, .target_residency = 345, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -214,7 +214,7 @@ static struct cpuidle_state byt_cstates[] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6N", .desc = "MWAIT 0x58", @@ -222,7 +222,7 @@ static struct cpuidle_state byt_cstates[] = { .exit_latency = 300, .target_residency = 275, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6S", .desc = "MWAIT 0x52", @@ -230,7 +230,7 @@ static struct 
cpuidle_state byt_cstates[] = { .exit_latency = 500, .target_residency = 560, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7", .desc = "MWAIT 0x60", @@ -238,7 +238,7 @@ static struct cpuidle_state byt_cstates[] = { .exit_latency = 1200, .target_residency = 4000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7S", .desc = "MWAIT 0x64", @@ -246,7 +246,7 @@ static struct cpuidle_state byt_cstates[] = { .exit_latency = 10000, .target_residency = 20000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -259,7 +259,7 @@ static struct cpuidle_state cht_cstates[] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6N", .desc = "MWAIT 0x58", @@ -267,7 +267,7 @@ static struct cpuidle_state cht_cstates[] = { .exit_latency = 80, .target_residency = 275, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6S", .desc = "MWAIT 0x52", @@ -275,7 +275,7 @@ static struct cpuidle_state cht_cstates[] = { .exit_latency = 200, .target_residency = 560, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7", .desc = "MWAIT 0x60", @@ -283,7 +283,7 @@ static struct cpuidle_state cht_cstates[] = { .exit_latency = 1200, .target_residency = 4000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7S", .desc = "MWAIT 0x64", @@ -291,7 +291,7 @@ static struct cpuidle_state cht_cstates[] = { .exit_latency = 10000, .target_residency = 20000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -304,7 +304,7 @@ static struct cpuidle_state ivb_cstates[] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -312,7 +312,7 @@ static struct cpuidle_state ivb_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -320,7 +320,7 @@ static struct cpuidle_state ivb_cstates[] = { .exit_latency = 59, .target_residency = 156, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -328,7 +328,7 @@ static struct cpuidle_state ivb_cstates[] = { .exit_latency = 80, .target_residency = 300, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7", .desc = "MWAIT 0x30", @@ -336,7 +336,7 @@ static struct cpuidle_state ivb_cstates[] = { .exit_latency = 87, .target_residency = 300, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -349,7 +349,7 @@ static struct cpuidle_state ivt_cstates[] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -357,7 +357,7 @@ static struct cpuidle_state ivt_cstates[] = { .exit_latency = 10, .target_residency = 80, 
.enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -365,7 +365,7 @@ static struct cpuidle_state ivt_cstates[] = { .exit_latency = 59, .target_residency = 156, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -373,7 +373,7 @@ static struct cpuidle_state ivt_cstates[] = { .exit_latency = 82, .target_residency = 300, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -386,7 +386,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -394,7 +394,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { .exit_latency = 10, .target_residency = 250, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -402,7 +402,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { .exit_latency = 59, .target_residency = 300, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -410,7 +410,7 @@ static struct cpuidle_state ivt_cstates_4s[] = { .exit_latency = 84, .target_residency = 400, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -423,7 +423,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { .exit_latency = 1, .target_residency = 1, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -431,7 +431,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { .exit_latency = 10, .target_residency = 500, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -439,7 +439,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { .exit_latency = 59, .target_residency = 600, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -447,7 +447,7 @@ static struct cpuidle_state ivt_cstates_8s[] = { .exit_latency = 88, .target_residency = 700, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -460,7 +460,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -468,7 +468,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -476,7 +476,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 33, .target_residency = 100, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -484,7 +484,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 133, .target_residency = 400, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + 
.enter_s2idle = intel_idle_s2idle, }, { .name = "C7s", .desc = "MWAIT 0x32", @@ -492,7 +492,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 166, .target_residency = 500, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C8", .desc = "MWAIT 0x40", @@ -500,7 +500,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 300, .target_residency = 900, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C9", .desc = "MWAIT 0x50", @@ -508,7 +508,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 600, .target_residency = 1800, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C10", .desc = "MWAIT 0x60", @@ -516,7 +516,7 @@ static struct cpuidle_state hsw_cstates[] = { .exit_latency = 2600, .target_residency = 7700, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -528,7 +528,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -536,7 +536,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = "MWAIT 0x10", @@ -544,7 +544,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 40, .target_residency = 100, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -552,7 +552,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 133, .target_residency = 400, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7s", .desc = "MWAIT 0x32", @@ -560,7 +560,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 166, .target_residency = 500, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C8", .desc = "MWAIT 0x40", @@ -568,7 +568,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 300, .target_residency = 900, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C9", .desc = "MWAIT 0x50", @@ -576,7 +576,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 600, .target_residency = 1800, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C10", .desc = "MWAIT 0x60", @@ -584,7 +584,7 @@ static struct cpuidle_state bdw_cstates[] = { .exit_latency = 2600, .target_residency = 7700, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -597,7 +597,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -605,7 +605,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C3", .desc = 
"MWAIT 0x10", @@ -613,7 +613,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 70, .target_residency = 100, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -621,7 +621,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 85, .target_residency = 200, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7s", .desc = "MWAIT 0x33", @@ -629,7 +629,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 124, .target_residency = 800, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C8", .desc = "MWAIT 0x40", @@ -637,7 +637,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 200, .target_residency = 800, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C9", .desc = "MWAIT 0x50", @@ -645,7 +645,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 480, .target_residency = 5000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C10", .desc = "MWAIT 0x60", @@ -653,7 +653,7 @@ static struct cpuidle_state skl_cstates[] = { .exit_latency = 890, .target_residency = 5000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -666,7 +666,7 @@ static struct cpuidle_state skx_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -674,7 +674,7 @@ static struct cpuidle_state skx_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -682,7 +682,7 @@ static struct cpuidle_state skx_cstates[] = { .exit_latency = 133, .target_residency = 600, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -695,7 +695,7 @@ static struct cpuidle_state atom_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C2", .desc = "MWAIT 0x10", @@ -703,7 +703,7 @@ static struct cpuidle_state atom_cstates[] = { .exit_latency = 20, .target_residency = 80, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C4", .desc = "MWAIT 0x30", @@ -711,7 +711,7 @@ static struct cpuidle_state atom_cstates[] = { .exit_latency = 100, .target_residency = 400, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x52", @@ -719,7 +719,7 @@ static struct cpuidle_state atom_cstates[] = { .exit_latency = 140, .target_residency = 560, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -731,7 +731,7 @@ static struct cpuidle_state tangier_cstates[] = { .exit_latency = 1, .target_residency = 4, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C4", .desc = "MWAIT 0x30", @@ -739,7 +739,7 @@ static struct cpuidle_state tangier_cstates[] 
= { .exit_latency = 100, .target_residency = 400, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x52", @@ -747,7 +747,7 @@ static struct cpuidle_state tangier_cstates[] = { .exit_latency = 140, .target_residency = 560, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7", .desc = "MWAIT 0x60", @@ -755,7 +755,7 @@ static struct cpuidle_state tangier_cstates[] = { .exit_latency = 1200, .target_residency = 4000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C9", .desc = "MWAIT 0x64", @@ -763,7 +763,7 @@ static struct cpuidle_state tangier_cstates[] = { .exit_latency = 10000, .target_residency = 20000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -775,7 +775,7 @@ static struct cpuidle_state avn_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x51", @@ -783,7 +783,7 @@ static struct cpuidle_state avn_cstates[] = { .exit_latency = 15, .target_residency = 45, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -795,7 +795,7 @@ static struct cpuidle_state knl_cstates[] = { .exit_latency = 1, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze }, + .enter_s2idle = intel_idle_s2idle }, { .name = "C6", .desc = "MWAIT 0x10", @@ -803,7 +803,7 @@ static struct cpuidle_state knl_cstates[] = { .exit_latency = 120, .target_residency = 500, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze }, + .enter_s2idle = intel_idle_s2idle }, { .enter = NULL } }; @@ -816,7 +816,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -824,7 +824,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -832,7 +832,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 133, .target_residency = 133, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C7s", .desc = "MWAIT 0x31", @@ -840,7 +840,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 155, .target_residency = 155, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C8", .desc = "MWAIT 0x40", @@ -848,7 +848,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 1000, .target_residency = 1000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C9", .desc = "MWAIT 0x50", @@ -856,7 +856,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 2000, .target_residency = 2000, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C10", .desc = "MWAIT 0x60", @@ -864,7 +864,7 @@ static struct cpuidle_state bxt_cstates[] = { .exit_latency = 10000, .target_residency = 10000, .enter = 
&intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -877,7 +877,7 @@ static struct cpuidle_state dnv_cstates[] = { .exit_latency = 2, .target_residency = 2, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C1E", .desc = "MWAIT 0x01", @@ -885,7 +885,7 @@ static struct cpuidle_state dnv_cstates[] = { .exit_latency = 10, .target_residency = 20, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .name = "C6", .desc = "MWAIT 0x20", @@ -893,7 +893,7 @@ static struct cpuidle_state dnv_cstates[] = { .exit_latency = 50, .target_residency = 500, .enter = &intel_idle, - .enter_freeze = intel_idle_freeze, }, + .enter_s2idle = intel_idle_s2idle, }, { .enter = NULL } }; @@ -936,12 +936,12 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, } /** - * intel_idle_freeze - simplified "enter" callback routine for suspend-to-idle + * intel_idle_s2idle - simplified "enter" callback routine for suspend-to-idle * @dev: cpuidle_device * @drv: cpuidle driver * @index: state index */ -static void intel_idle_freeze(struct cpuidle_device *dev, +static void intel_idle_s2idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { unsigned long ecx = 1; /* break on interrupt flag */ @@ -1337,7 +1337,7 @@ static void __init intel_idle_cpuidle_driver_init(void) int num_substates, mwait_hint, mwait_cstate; if ((cpuidle_state_table[cstate].enter == NULL) && - (cpuidle_state_table[cstate].enter_freeze == NULL)) + (cpuidle_state_table[cstate].enter_s2idle == NULL)) break; if (cstate + 1 > max_cstate) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index fc1e5d7fc1c7..f0e2e9223e66 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -52,11 +52,11 @@ struct cpuidle_state { int (*enter_dead) (struct cpuidle_device *dev, int index); /* - * CPUs execute ->enter_freeze with the local tick or entire timekeeping + * CPUs execute ->enter_s2idle with the local tick or entire timekeeping * suspended, so it must not re-enable interrupts at any point (even * temporarily) or attempt to change states of clock event devices. 
*/ - void (*enter_freeze) (struct cpuidle_device *dev, + void (*enter_s2idle) (struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); }; @@ -197,14 +197,14 @@ static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); -extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, +extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } -static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, +static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_use_deepest_state(bool enable) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2ea4b7b23044..257f4f0b4532 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -169,7 +169,7 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { - entered_state = cpuidle_enter_freeze(drv, dev); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; -- cgit v1.2.3 From 23d5855f4774f4f7c246a67057ecacc904696d8a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2017 00:15:30 +0200 Subject: PM / s2idle: Rename platform operations structure Rename struct platform_freeze_ops to platform_s2idle_ops to make it clear that the callbacks in it are used during suspend-to-idle suspend/resume transitions and rename the related functions, variables and so on accordingly. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 28 ++++++++++++++-------------- include/linux/suspend.h | 6 +++--- kernel/power/suspend.c | 30 +++++++++++++++--------------- 3 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a0a6fd10fb5f..f7a8abbeac6e 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -737,14 +737,14 @@ static struct acpi_scan_handler lps0_handler = { .attach = lps0_device_attach, }; -static int acpi_freeze_begin(void) +static int acpi_s2idle_begin(void) { acpi_scan_lock_acquire(); s2idle_in_progress = true; return 0; } -static int acpi_freeze_prepare(void) +static int acpi_s2idle_prepare(void) { if (lps0_device_handle) { acpi_sleep_run_lps0_dsm(ACPI_LPS0_SCREEN_OFF); @@ -764,7 +764,7 @@ static int acpi_freeze_prepare(void) return 0; } -static void acpi_freeze_wake(void) +static void acpi_s2idle_wake(void) { /* * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means @@ -778,7 +778,7 @@ static void acpi_freeze_wake(void) } } -static void acpi_freeze_sync(void) +static void acpi_s2idle_sync(void) { /* * Process all pending events in case there are any wakeup ones. 
@@ -791,7 +791,7 @@ static void acpi_freeze_sync(void) s2idle_wakeup = false; } -static void acpi_freeze_restore(void) +static void acpi_s2idle_restore(void) { if (acpi_sci_irq_valid()) disable_irq_wake(acpi_sci_irq); @@ -804,19 +804,19 @@ static void acpi_freeze_restore(void) } } -static void acpi_freeze_end(void) +static void acpi_s2idle_end(void) { s2idle_in_progress = false; acpi_scan_lock_release(); } -static const struct platform_freeze_ops acpi_freeze_ops = { - .begin = acpi_freeze_begin, - .prepare = acpi_freeze_prepare, - .wake = acpi_freeze_wake, - .sync = acpi_freeze_sync, - .restore = acpi_freeze_restore, - .end = acpi_freeze_end, +static const struct platform_s2idle_ops acpi_s2idle_ops = { + .begin = acpi_s2idle_begin, + .prepare = acpi_s2idle_prepare, + .wake = acpi_s2idle_wake, + .sync = acpi_s2idle_sync, + .restore = acpi_s2idle_restore, + .end = acpi_s2idle_end, }; static void acpi_sleep_suspend_setup(void) @@ -831,7 +831,7 @@ static void acpi_sleep_suspend_setup(void) &acpi_suspend_ops_old : &acpi_suspend_ops); acpi_scan_add_handler(&lps0_handler); - freeze_set_ops(&acpi_freeze_ops); + s2idle_set_ops(&acpi_s2idle_ops); } #else /* !CONFIG_SUSPEND */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index feb43dfbf7bc..0d41daf7e89d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -186,7 +186,7 @@ struct platform_suspend_ops { void (*recover)(void); }; -struct platform_freeze_ops { +struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); void (*wake)(void); @@ -251,7 +251,7 @@ static inline bool idle_should_enter_s2idle(void) } extern void __init pm_states_init(void); -extern void freeze_set_ops(const struct platform_freeze_ops *ops); +extern void s2idle_set_ops(const struct platform_s2idle_ops *ops); extern void s2idle_wake(void); /** @@ -286,7 +286,7 @@ static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } static inline bool idle_should_enter_s2idle(void) { return false; } static inline void __init pm_states_init(void) {} -static inline void freeze_set_ops(const struct platform_freeze_ops *ops) {} +static inline void s2idle_set_ops(const struct platform_s2idle_ops *ops) {} static inline void s2idle_wake(void) {} #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ae9b579c2533..3e2b4f519009 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -56,16 +56,16 @@ unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; -static const struct platform_freeze_ops *freeze_ops; +static const struct platform_s2idle_ops *s2idle_ops; static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_SPINLOCK(s2idle_lock); -void freeze_set_ops(const struct platform_freeze_ops *ops) +void s2idle_set_ops(const struct platform_s2idle_ops *ops) { lock_system_sleep(); - freeze_ops = ops; + s2idle_ops = ops; unlock_system_sleep(); } @@ -131,13 +131,13 @@ static void s2idle_loop(void) break; } - if (freeze_ops && freeze_ops->wake) - freeze_ops->wake(); + if (s2idle_ops && s2idle_ops->wake) + s2idle_ops->wake(); dpm_noirq_end(); - if (freeze_ops && freeze_ops->sync) - freeze_ops->sync(); + if (s2idle_ops && s2idle_ops->sync) + s2idle_ops->sync(); if (pm_wakeup_pending()) break; @@ -250,8 +250,8 @@ static int platform_suspend_prepare(suspend_state_t state) static int 
platform_suspend_prepare_late(suspend_state_t state) { - return state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->prepare ? - freeze_ops->prepare() : 0; + return state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->prepare ? + s2idle_ops->prepare() : 0; } static int platform_suspend_prepare_noirq(suspend_state_t state) @@ -268,8 +268,8 @@ static void platform_resume_noirq(suspend_state_t state) static void platform_resume_early(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->restore) - freeze_ops->restore(); + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->restore) + s2idle_ops->restore(); } static void platform_resume_finish(suspend_state_t state) @@ -280,8 +280,8 @@ static void platform_resume_finish(suspend_state_t state) static int platform_suspend_begin(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->begin) - return freeze_ops->begin(); + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->begin) + return s2idle_ops->begin(); else if (suspend_ops && suspend_ops->begin) return suspend_ops->begin(state); else @@ -290,8 +290,8 @@ static int platform_suspend_begin(suspend_state_t state) static void platform_resume_end(suspend_state_t state) { - if (state == PM_SUSPEND_TO_IDLE && freeze_ops && freeze_ops->end) - freeze_ops->end(); + if (state == PM_SUSPEND_TO_IDLE && s2idle_ops && s2idle_ops->end) + s2idle_ops->end(); else if (suspend_ops && suspend_ops->end) suspend_ops->end(); } -- cgit v1.2.3 From 38e44da591303d08b0d965a033e11ade284999d0 Mon Sep 17 00:00:00 2001 From: Brian Bian Date: Wed, 9 Aug 2017 11:45:40 -0700 Subject: thermal: int3400_thermal: process "thermal table changed" event Some BIOSes implement ACPI notification code 0x83 to indicate active relationship table (ART) and/or thermal relationship table (TRT) changes to the INT3400 device. This event needs to be propagated to user space so that it can be handled by the user space thermal daemon.
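For illustration only (none of the following is part of the patch or taken from any kernel tree): a minimal userspace sketch of how such a daemon could observe the event. It subscribes to the kernel uevent multicast group via NETLINK_KOBJECT_UEVENT and prints the NAME=/TEMP=/TRIP=/EVENT= properties that the int3400_notify() handler below packs into its KOBJ_CHANGE uevent; the buffer size and the key filtering are arbitrary choices, and error handling is trimmed for brevity.

  #include <sys/types.h>
  #include <sys/socket.h>
  #include <linux/netlink.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          struct sockaddr_nl sa = {
                  .nl_family = AF_NETLINK,
                  .nl_groups = 1,         /* kernel uevent multicast group */
          };
          char buf[4096], *p;
          ssize_t len;
          int fd;

          fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
          if (fd < 0 || bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                  return 1;

          while ((len = recv(fd, buf, sizeof(buf) - 1, 0)) > 0) {
                  buf[len] = '\0';
                  /*
                   * The payload is "<action>@<devpath>" followed by
                   * NUL-separated KEY=VALUE strings; pick out the
                   * properties int3400_notify() adds.
                   */
                  for (p = buf; p < buf + len; p += strlen(p) + 1)
                          if (!strncmp(p, "NAME=", 5) ||
                              !strncmp(p, "TEMP=", 5) ||
                              !strncmp(p, "TRIP=", 5) ||
                              !strncmp(p, "EVENT=", 6))
                                  printf("%s\n", p);
          }
          close(fd);
          return 0;
  }

On seeing an EVENT= value equal to THERMAL_TABLE_CHANGED, a real daemon would typically re-read the ART/TRT through its usual ACPI interfaces rather than just printing the properties.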
Signed-off-by: Brian Bian Signed-off-by: Zhang Rui --- drivers/thermal/int340x_thermal/int3400_thermal.c | 41 +++++++++++++++++++++++ include/linux/thermal.h | 1 + 2 files changed, 42 insertions(+) (limited to 'include') diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c index b2ffbed1229e..8ee38f55c7f3 100644 --- a/drivers/thermal/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -16,6 +16,8 @@ #include #include "acpi_thermal_rel.h" +#define INT3400_THERMAL_TABLE_CHANGED 0x83 + enum int3400_thermal_uuid { INT3400_THERMAL_PASSIVE_1, INT3400_THERMAL_ACTIVE, @@ -185,6 +187,35 @@ static int int3400_thermal_run_osc(acpi_handle handle, return result; } +static void int3400_notify(acpi_handle handle, + u32 event, + void *data) +{ + struct int3400_thermal_priv *priv = data; + char *thermal_prop[5]; + + if (!priv) + return; + + switch (event) { + case INT3400_THERMAL_TABLE_CHANGED: + thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s", + priv->thermal->type); + thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d", + priv->thermal->temperature); + thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP="); + thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d", + THERMAL_TABLE_CHANGED); + thermal_prop[4] = NULL; + kobject_uevent_env(&priv->thermal->device.kobj, KOBJ_CHANGE, + thermal_prop); + break; + default: + dev_err(&priv->adev->dev, "Unsupported event [0x%x]\n", event); + break; + } +} + static int int3400_thermal_get_temp(struct thermal_zone_device *thermal, int *temp) { @@ -290,6 +321,12 @@ static int int3400_thermal_probe(struct platform_device *pdev) if (result) goto free_zone; + result = acpi_install_notify_handler( + priv->adev->handle, ACPI_DEVICE_NOTIFY, int3400_notify, + (void *)priv); + if (result) + goto free_zone; + return 0; free_zone: @@ -306,6 +343,10 @@ static int int3400_thermal_remove(struct platform_device *pdev) { struct int3400_thermal_priv *priv = platform_get_drvdata(pdev); + acpi_remove_notify_handler( + priv->adev->handle, ACPI_DEVICE_NOTIFY, + int3400_notify); + if (!priv->rel_misc_dev_res) acpi_thermal_rel_misc_device_remove(priv->adev->handle); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index dab11f97e1c6..fd5b959c753c 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -102,6 +102,7 @@ enum thermal_notify_event { THERMAL_DEVICE_DOWN, /* Thermal device is down */ THERMAL_DEVICE_UP, /* Thermal device is up after a down event */ THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */ + THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */ }; struct thermal_zone_device_ops { -- cgit v1.2.3 From 077fbac405bfc6d41419ad6c1725804ad4e9887c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 11 Aug 2017 02:11:33 +0900 Subject: net: xfrm: support setting an output mark. On systems that use mark-based routing it may be necessary for routing lookups to use marks in order for packets to be routed correctly. An example of such a system is Android, which uses socket marks to route packets via different networks. Currently, routing lookups in tunnel mode always use a mark of zero, making routing incorrect on such systems. This patch adds a new output_mark element to the xfrm state and a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output mark differs from the existing xfrm mark in two ways: 1. 
The xfrm mark is used to match xfrm policies and states, while the xfrm output mark is used to set the mark (and influence the routing) of the packets emitted by those states. 2. The existing mark is constrained to be a subset of the bits of the originating socket or transformed packet, but the output mark is arbitrary and depends only on the state. The use of a separate mark provides additional flexibility. For example: - A packet subject to two transforms (e.g., transport mode inside tunnel mode) can have two different output marks applied to it, one for the transport mode SA and one for the tunnel mode SA. - On a system where socket marks determine routing, the packets emitted by an IPsec tunnel can be routed based on a mark that is determined by the tunnel, not by the marks of the unencrypted packets. - Support for setting the output marks can be introduced without breaking any existing setups that employ both mark-based routing and xfrm tunnel mode. Simply changing the code to use the xfrm mark for routing output packets could change behaviour in a way that breaks these setups. If the output mark is unspecified or set to zero, the mark is not set or changed. Tested: make allyesconfig; make -j64 Tested: https://android-review.googlesource.com/452776 Signed-off-by: Lorenzo Colitti Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 9 ++++++--- include/uapi/linux/xfrm.h | 1 + net/ipv4/xfrm4_policy.c | 14 +++++++++----- net/ipv6/xfrm6_policy.c | 9 ++++++--- net/xfrm/xfrm_device.c | 3 ++- net/xfrm/xfrm_output.c | 3 +++ net/xfrm/xfrm_policy.c | 17 +++++++++-------- net/xfrm/xfrm_user.c | 11 +++++++++++ 8 files changed, 47 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 18d7de34a5c3..9c7b70cce6d6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -165,6 +165,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; + u32 output_mark; } props; struct xfrm_lifetime_cfg lft; @@ -298,10 +299,12 @@ struct xfrm_policy_afinfo { struct dst_entry *(*dst_lookup)(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr); + const xfrm_address_t *daddr, + u32 mark); int (*get_saddr)(struct net *net, int oif, xfrm_address_t *saddr, - xfrm_address_t *daddr); + xfrm_address_t *daddr, + u32 mark); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); @@ -1640,7 +1643,7 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, - int family); + int family, u32 mark); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 2b384ff09fa0..5fe7370a2bef 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -304,6 +304,7 @@ enum xfrm_attr_type_t { XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ + XFRMA_OUTPUT_MARK, /* __u32 */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4aefb149fe0a..d7bf0b041885 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -20,7 +20,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t
*daddr, + u32 mark) { struct rtable *rt; @@ -28,6 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); + fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; @@ -42,20 +44,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); } static int xfrm4_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index f44b25a48478..11d1314ab6c5 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -27,7 +27,8 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi6 fl6; struct dst_entry *dst; @@ -36,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + fl6.flowi6_mark = mark; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -52,12 +54,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, } static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 1904127f5fb8..acf00104ef31 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -79,7 +79,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, daddr = &x->props.saddr; } - dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family); + dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, + x->props.family, x->props.output_mark); if (IS_ERR(dst)) return 0; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 8c0b6722aaa8..31a2e6d34dba 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,6 +66,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } + if (x->props.output_mark) + skb->mark = x->props.output_mark; + err = x->outer_mode->output(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 06c3bf7ab86b..1de52f36caf5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -122,7 +122,7 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const 
xfrm_address_t *saddr, const xfrm_address_t *daddr, - int family) + int family, u32 mark) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -131,7 +131,7 @@ struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); + dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); rcu_read_unlock(); @@ -143,7 +143,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, - int family) + int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; @@ -159,7 +159,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family); + dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -1340,14 +1340,14 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, - xfrm_address_t *remote, unsigned short family) + xfrm_address_t *remote, unsigned short family, u32 mark) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, oif, local, remote); + err = afinfo->get_saddr(net, oif, local, remote, mark); rcu_read_unlock(); return err; } @@ -1378,7 +1378,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, - tmpl->encap_family); + tmpl->encap_family, 0); if (error) goto fail; local = &tmp; @@ -1598,7 +1598,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family); + &saddr, &daddr, family, + xfrm[i]->props.output_mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ffe8d5ef09eb..cc3268d814b4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -584,6 +584,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); + if (attrs[XFRMA_OUTPUT_MARK]) + x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) goto error; @@ -899,6 +902,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); + if (x->props.output_mark) { + ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); + if (ret) + goto out; + } out: return ret; } @@ -2454,6 +2462,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, + [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2673,6 +2682,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) l += nla_total_size(sizeof(x->xso)); + if (x->props.output_mark) + l += 
nla_total_size(sizeof(x->props.output_mark)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); -- cgit v1.2.3 From 3e48930cc74f0c212ee1838f89ad0ca7fcf2fea1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 11 Aug 2017 05:49:01 -0700 Subject: cgroup: misc changes Misc trivial changes to prepare for future changes. No functional difference. * Expose cgroup_get(), cgroup_tryget() and cgroup_parent(). * Implement task_dfl_cgroup() which dereferences css_set->dfl_cgrp. * Rename cgroup_stats_show() to cgroup_stat_show() for consistency with the file name. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 24 ++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 23 ++--------------------- 2 files changed, 26 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 79faa6467f76..085056e562b1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -398,6 +398,16 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) percpu_ref_put_many(&css->refcnt, n); } +static inline void cgroup_get(struct cgroup *cgrp) +{ + css_get(&cgrp->self); +} + +static inline bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); @@ -510,6 +520,20 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } +static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) +{ + return task_css_set(task)->dfl_cgrp; +} + +static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *parent_css = cgrp->self.parent; + + if (parent_css) + return container_of(parent_css, struct cgroup, self); + return NULL; +} + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f5ca55db1fe1..c038ccf95b5d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -319,15 +319,6 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static struct cgroup *cgroup_parent(struct cgroup *cgrp) -{ - struct cgroup_subsys_state *parent_css = cgrp->self.parent; - - if (parent_css) - return container_of(parent_css, struct cgroup, self); - return NULL; -} - static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; @@ -534,22 +525,12 @@ out_unlock: return css; } -static void __maybe_unused cgroup_get(struct cgroup *cgrp) -{ - css_get(&cgrp->self); -} - static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); } -static bool cgroup_tryget(struct cgroup *cgrp) -{ - return css_tryget(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -3306,7 +3287,7 @@ static int cgroup_events_show(struct seq_file *seq, void *v) return 0; } -static int cgroup_stats_show(struct seq_file *seq, void *v) +static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; @@ -4423,7 +4404,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.stat", - .seq_show = cgroup_stats_show, + .seq_show = cgroup_stat_show, }, { } /* terminate */ }; -- cgit v1.2.3 From afa6c45429f6e5ddd1eb6b77a36358f9c4b789da Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:45 +0800 
Subject: sctp: remove the unused typedef sctp_packet_phandler_t Remove this function typedef; there are no places using it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index fbe6e81b889b..73e9509c057e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -657,8 +657,6 @@ struct sctp_sockaddr_entry { #define SCTP_ADDRESS_TICK_DELAY 500 -typedef struct sctp_chunk *(sctp_packet_phandler_t)(struct sctp_association *); - /* This structure holds lists of chunks as we are assembling for * transmission. */ -- cgit v1.2.3 From edf903f83ebca988e04a39f515ab6eacb92055df Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:46 +0800 Subject: sctp: remove the typedef sctp_sender_hb_info_t This patch is to remove the typedef sctp_sender_hb_info_t, and replace with struct sctp_sender_hb_info in the places where it's using this typedef. It is also to use sizeof(variable) instead of sizeof(type). Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/sm_make_chunk.c | 6 +++--- net/sctp/sm_sideeffect.c | 4 ++-- net/sctp/sm_statefuns.c | 12 +++++------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 73e9509c057e..60033a263193 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -371,12 +371,12 @@ union sctp_params { * chunk is sent and the destination transport address to which this * HEARTBEAT is sent (see Section 8.3). */ -typedef struct sctp_sender_hb_info { +struct sctp_sender_hb_info { struct sctp_paramhdr param_hdr; union sctp_addr daddr; unsigned long sent_at; __u64 hb_nonce; -} sctp_sender_hb_info_t; +}; int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 3a8fb1dffbc1..51de638a88b2 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1142,10 +1142,10 @@ nodata: /* Make a HEARTBEAT chunk.
*/ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, - const struct sctp_transport *transport) + const struct sctp_transport *transport) { + struct sctp_sender_hb_info hbinfo; struct sctp_chunk *retval; - sctp_sender_hb_info_t hbinfo; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); @@ -1154,7 +1154,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; - hbinfo.param_hdr.length = htons(sizeof(sctp_sender_hb_info_t)); + hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 4a12d29d9aa1..5e8e41879b03 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -714,7 +714,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, struct sctp_transport *t, struct sctp_chunk *chunk) { - sctp_sender_hb_info_t *hbinfo; + struct sctp_sender_hb_info *hbinfo; int was_unconfirmed = 0; /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the @@ -768,7 +768,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, if (t->rto_pending == 0) t->rto_pending = 1; - hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; + hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data; sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); /* Update the heartbeat timer. */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index ac6aaa046529..af93419209df 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1155,27 +1155,25 @@ sctp_disposition_t sctp_sf_backbeat_8_3(struct net *net, void *arg, sctp_cmd_seq_t *commands) { + struct sctp_sender_hb_info *hbinfo; struct sctp_chunk *chunk = arg; - union sctp_addr from_addr; struct sctp_transport *link; - sctp_sender_hb_info_t *hbinfo; unsigned long max_interval; + union sctp_addr from_addr; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); /* Make sure that the HEARTBEAT-ACK chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr) + - sizeof(sctp_sender_hb_info_t))) + sizeof(*hbinfo))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); - hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; + hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data; /* Make sure that the length of the parameter is what we expect */ - if (ntohs(hbinfo->param_hdr.length) != - sizeof(sctp_sender_hb_info_t)) { + if (ntohs(hbinfo->param_hdr.length) != sizeof(*hbinfo)) return SCTP_DISPOSITION_DISCARD; - } from_addr = hbinfo->daddr; link = sctp_assoc_lookup_paddr(asoc, &from_addr); -- cgit v1.2.3 From 74439f344b1becd57ec822bc0e2c1a4cbf240a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:47 +0800 Subject: sctp: remove the typedef sctp_endpoint_type_t This patch is to remove the typedef sctp_endpoint_type_t, and replace with enum sctp_endpoint_type in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 60033a263193..937187f3bffc 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1142,10 +1142,10 @@ int sctp_is_ep_boundall(struct sock *sk); /* What type of endpoint? */ -typedef enum { +enum sctp_endpoint_type { SCTP_EP_TYPE_SOCKET, SCTP_EP_TYPE_ASSOCIATION, -} sctp_endpoint_type_t; +}; /* * A common base class to bridge the implmentation view of a @@ -1169,7 +1169,7 @@ struct sctp_ep_common { int hashent; /* Runtime type information. What kind of endpoint is this? */ - sctp_endpoint_type_t type; + enum sctp_endpoint_type type; /* Some fields to help us manage this object. * refcnt - Reference count access to this object. -- cgit v1.2.3 From a05437ac5deb100f94e290ad4c5eef3e78f4b6bb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:48 +0800 Subject: sctp: remove the typedef sctp_cmsgs_t This patch is to remove the typedef sctp_cmsgs_t, and replace with struct sctp_cmsgs in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/socket.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 937187f3bffc..e171d3a3d2b4 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1985,11 +1985,11 @@ int sctp_cmp_addr_exact(const union sctp_addr *ss1, struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc); /* A convenience structure to parse out SCTP specific CMSGs. */ -typedef struct sctp_cmsgs { +struct sctp_cmsgs { struct sctp_initmsg *init; struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; -} sctp_cmsgs_t; +}; /* Structure for tracking memory objects */ typedef struct { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a1e2113806dd..018190655d63 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1593,7 +1593,8 @@ static int sctp_error(struct sock *sk, int flags, int err) */ /* BUG: We do not implement the equivalent of sk_stream_wait_memory(). 
*/ -static int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *); +static int sctp_msghdr_parse(const struct msghdr *msg, + struct sctp_cmsgs *cmsgs); static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { @@ -1609,7 +1610,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) struct sctp_sndrcvinfo *sinfo; struct sctp_initmsg *sinit; sctp_assoc_t associd = 0; - sctp_cmsgs_t cmsgs = { NULL }; + struct sctp_cmsgs cmsgs = { NULL }; enum sctp_scope scope; bool fill_sinfo_ttl = false, wait_connect = false; struct sctp_datamsg *datamsg; @@ -7445,10 +7446,10 @@ static int sctp_autobind(struct sock *sk) * msg_control * points here */ -static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs) +static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) { - struct cmsghdr *cmsg; struct msghdr *my_msg = (struct msghdr *)msg; + struct cmsghdr *cmsg; for_each_cmsghdr(cmsg, my_msg) { if (!CMSG_OK(my_msg, cmsg)) -- cgit v1.2.3 From d38ef5ae35b0960f3219f1cf0203e19819e757c7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:49 +0800 Subject: sctp: remove the typedef sctp_dbg_objcnt_entry_t This patch is to remove the typedef sctp_dbg_objcnt_entry_t, and replace with struct sctp_dbg_objcnt_entry in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- net/sctp/objcnt.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index e171d3a3d2b4..b6d75b37f84d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1992,9 +1992,9 @@ struct sctp_cmsgs { }; /* Structure for tracking memory objects */ -typedef struct { +struct sctp_dbg_objcnt_entry { char *label; atomic_t *counter; -} sctp_dbg_objcnt_entry_t; +}; #endif /* __sctp_structs_h__ */ diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index 105ac3327b28..aeea6da81441 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -57,7 +57,7 @@ SCTP_DBG_OBJCNT(keys); /* An array to make it easy to pretty print the debug information * to the proc fs. */ -static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = { +static struct sctp_dbg_objcnt_entry sctp_dbg_objcnt[] = { SCTP_DBG_OBJCNT_ENTRY(sock), SCTP_DBG_OBJCNT_ENTRY(ep), SCTP_DBG_OBJCNT_ENTRY(assoc), -- cgit v1.2.3 From b7ef2618a0bf75c1e480b05739b0c5f2a42081cd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:50 +0800 Subject: sctp: remove the typedef sctp_socket_type_t This patch is to remove the typedef sctp_socket_type_t, and replace with enum sctp_socket_type in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 3 ++- include/net/sctp/structs.h | 6 +++--- net/sctp/socket.c | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 24ff7931d38c..06b4f515e157 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -550,7 +550,8 @@ static inline int sctp_ep_hashfn(struct net *net, __u16 lport) /* Is a socket of this style? 
*/ #define sctp_style(sk, style) __sctp_style((sk), (SCTP_SOCKET_##style)) -static inline int __sctp_style(const struct sock *sk, sctp_socket_type_t style) +static inline int __sctp_style(const struct sock *sk, + enum sctp_socket_type style) { return sctp_sk(sk)->type == style; } diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index b6d75b37f84d..0477945de1a3 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -150,18 +150,18 @@ extern struct sctp_globals { #define sctp_checksum_disable (sctp_globals.checksum_disable) /* SCTP Socket type: UDP or TCP style. */ -typedef enum { +enum sctp_socket_type { SCTP_SOCKET_UDP = 0, SCTP_SOCKET_UDP_HIGH_BANDWIDTH, SCTP_SOCKET_TCP -} sctp_socket_type_t; +}; /* Per socket SCTP information. */ struct sctp_sock { /* inet_sock has to be the first member of sctp_sock */ struct inet_sock inet; /* What kind of a socket is this? */ - sctp_socket_type_t type; + enum sctp_socket_type type; /* PF_ family specific functions. */ struct sctp_pf *pf; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 018190655d63..c01af72cc603 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -100,8 +100,9 @@ static int sctp_send_asconf(struct sctp_association *asoc, struct sctp_chunk *chunk); static int sctp_do_bind(struct sock *, union sctp_addr *, int); static int sctp_autobind(struct sock *sk); -static void sctp_sock_migrate(struct sock *, struct sock *, - struct sctp_association *, sctp_socket_type_t); +static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, + struct sctp_association *assoc, + enum sctp_socket_type type); static unsigned long sctp_memory_pressure; static atomic_long_t sctp_memory_allocated; @@ -8086,7 +8087,7 @@ static inline void sctp_copy_descendant(struct sock *sk_to, */ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sctp_association *assoc, - sctp_socket_type_t type) + enum sctp_socket_type type) { struct sctp_sock *oldsp = sctp_sk(oldsk); struct sctp_sock *newsp = sctp_sk(newsk); -- cgit v1.2.3 From e2c3108ab25b4dbab3821e8b6084bfb73afb655c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:51 +0800 Subject: sctp: remove the typedef sctp_cmd_t This patch is to remove the typedef sctp_cmd_t, and replace with enum sctp_cmd in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/net/sctp/command.h | 14 +++++++------- net/sctp/sm_sideeffect.c | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 376cb78b6247..4e9e589b8f18 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -196,15 +196,15 @@ static inline sctp_arg_t SCTP_NULL(void) return retval; } -typedef struct { +struct sctp_cmd { sctp_arg_t obj; sctp_verb_t verb; -} sctp_cmd_t; +}; typedef struct { - sctp_cmd_t cmds[SCTP_MAX_NUM_COMMANDS]; - sctp_cmd_t *last_used_slot; - sctp_cmd_t *next_cmd; + struct sctp_cmd cmds[SCTP_MAX_NUM_COMMANDS]; + struct sctp_cmd *last_used_slot; + struct sctp_cmd *next_cmd; } sctp_cmd_seq_t; @@ -228,7 +228,7 @@ static inline int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) static inline void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, sctp_arg_t obj) { - sctp_cmd_t *cmd = seq->last_used_slot - 1; + struct sctp_cmd *cmd = seq->last_used_slot - 1; BUG_ON(cmd < seq->cmds); @@ -240,7 +240,7 @@ static inline void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, /* Return the next command structure in an sctp_cmd_seq. * Return NULL at the end of the sequence. */ -static inline sctp_cmd_t *sctp_next_cmd(sctp_cmd_seq_t *seq) +static inline struct sctp_cmd *sctp_next_cmd(sctp_cmd_seq_t *seq) { if (seq->next_cmd <= seq->last_used_slot) return NULL; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 5e8e41879b03..0cb3d5a723af 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1277,7 +1277,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, struct sctp_sock *sp = sctp_sk(sk); int error = 0; int force; - sctp_cmd_t *cmd; + struct sctp_cmd *cmd; struct sctp_chunk *new_obj; struct sctp_chunk *chunk = NULL; struct sctp_packet *packet; -- cgit v1.2.3 From a85bbeb221d860097859f110ba1321f2b0653f07 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:52 +0800 Subject: sctp: remove the typedef sctp_cmd_seq_t This patch is to remove the typedef sctp_cmd_seq_t, and replace with struct sctp_cmd_seq in the places where it's using this typedef. Note that it doesn't fix many indents although it should, as sctp_disposition_t's removal would mess them up again. So better to fix them when removing sctp_disposition_t in the later patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/command.h | 14 +-- include/net/sctp/sm.h | 2 +- net/sctp/probe.c | 2 +- net/sctp/sm_sideeffect.c | 54 ++++++------ net/sctp/sm_statefuns.c | 216 +++++++++++++++++++++++---------------------- 5 files changed, 145 insertions(+), 143 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index 4e9e589b8f18..cbf6798866ef 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -110,7 +110,7 @@ typedef enum { SCTP_CMD_LAST } sctp_verb_t; -/* How many commands can you put in an sctp_cmd_seq_t? +/* How many commands can you put in an struct sctp_cmd_seq? * This is a rather arbitrary number, ideally derived from a careful * analysis of the state functions, but in reality just taken from * thin air in the hopes othat we don't trigger a kernel panic. 
@@ -201,17 +201,17 @@ struct sctp_cmd { sctp_verb_t verb; }; -typedef struct { +struct sctp_cmd_seq { struct sctp_cmd cmds[SCTP_MAX_NUM_COMMANDS]; struct sctp_cmd *last_used_slot; struct sctp_cmd *next_cmd; -} sctp_cmd_seq_t; +}; /* Initialize a block of memory as a command sequence. * Return 0 if the initialization fails. */ -static inline int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) +static inline int sctp_init_cmd_seq(struct sctp_cmd_seq *seq) { /* cmds[] is filled backwards to simplify the overflow BUG() check */ seq->last_used_slot = seq->cmds + SCTP_MAX_NUM_COMMANDS; @@ -220,12 +220,12 @@ static inline int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) } -/* Add a command to an sctp_cmd_seq_t. +/* Add a command to an struct sctp_cmd_seq. * * Use the SCTP_* constructors defined by SCTP_ARG_CONSTRUCTOR() above * to wrap data which goes in the obj argument. */ -static inline void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, +static inline void sctp_add_cmd_sf(struct sctp_cmd_seq *seq, sctp_verb_t verb, sctp_arg_t obj) { struct sctp_cmd *cmd = seq->last_used_slot - 1; @@ -240,7 +240,7 @@ static inline void sctp_add_cmd_sf(sctp_cmd_seq_t *seq, sctp_verb_t verb, /* Return the next command structure in an sctp_cmd_seq. * Return NULL at the end of the sequence. */ -static inline struct sctp_cmd *sctp_next_cmd(sctp_cmd_seq_t *seq) +static inline struct sctp_cmd *sctp_next_cmd(struct sctp_cmd_seq *seq) { if (seq->next_cmd <= seq->last_used_slot) return NULL; diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 1e7651c3b158..9af64b98d96b 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -75,7 +75,7 @@ typedef sctp_disposition_t (sctp_state_fn_t) (struct net *, const struct sctp_association *, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *); + struct sctp_cmd_seq *); typedef void (sctp_timer_event_t) (unsigned long); typedef struct { sctp_state_fn_t *fn; diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 43837dfc86a7..34097a167431 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -132,7 +132,7 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sk_buff *skb = chunk->skb; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 0cb3d5a723af..6dd5934cbda6 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -58,7 +58,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, struct sctp_association *asoc, void *event_arg, sctp_disposition_t status, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, gfp_t gfp); static int sctp_side_effects(enum sctp_event event_type, union sctp_subtype subtype, @@ -67,7 +67,7 @@ static int sctp_side_effects(enum sctp_event event_type, struct sctp_association **asoc, void *event_arg, sctp_disposition_t status, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, gfp_t gfp); /******************************************************************** @@ -150,7 +150,7 @@ static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, /* Generate SACK if necessary. We call this at the end of a packet. 
*/ static int sctp_gen_sack(struct sctp_association *asoc, int force, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; @@ -506,7 +506,7 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { * notification SHOULD be sent to the upper layer. * */ -static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, +static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_transport *transport, int is_hb) @@ -578,7 +578,7 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands, } /* Worker routine to handle INIT command failure. */ -static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, +static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, unsigned int error) { @@ -601,7 +601,7 @@ static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, } /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ -static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, +static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, enum sctp_event event_type, union sctp_subtype subtype, @@ -645,7 +645,7 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, * since all other cases use "temporary" associations and can do all * their work in statefuns directly. */ -static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, +static int sctp_cmd_process_init(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_init_chunk *peer_init, @@ -667,7 +667,7 @@ static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, } /* Helper function to break out starting up of heartbeat timers. */ -static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds, +static void sctp_cmd_hb_timers_start(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; @@ -680,7 +680,7 @@ static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds, sctp_transport_reset_hb_timer(t); } -static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds, +static void sctp_cmd_hb_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; @@ -695,7 +695,7 @@ static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds, } /* Helper function to stop any pending T3-RTX timers */ -static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds, +static void sctp_cmd_t3_rtx_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; @@ -709,7 +709,7 @@ static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds, /* Helper function to handle the reception of an HEARTBEAT ACK. */ -static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, +static void sctp_cmd_transport_on(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_transport *t, struct sctp_chunk *chunk) @@ -780,7 +780,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, /* Helper function to process the process SACK command. */ -static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, +static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { @@ -802,7 +802,7 @@ static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, /* Helper function to set the timeout value for T2-SHUTDOWN timer and to set * the transport for a shutdown chunk. 
*/ -static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds, +static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { @@ -819,7 +819,7 @@ static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds, asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; } -static void sctp_cmd_assoc_update(sctp_cmd_seq_t *cmds, +static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_association *new) { @@ -842,7 +842,7 @@ static void sctp_cmd_assoc_update(sctp_cmd_seq_t *cmds, } /* Helper function to change the state of an association. */ -static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, +static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, enum sctp_state state) { @@ -902,7 +902,7 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, } /* Helper function to delete an association. */ -static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds, +static void sctp_cmd_delete_tcb(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; @@ -924,9 +924,9 @@ static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds, * destination address (we use active path instead of primary path just * because primary path may be inactive. */ -static void sctp_cmd_setup_t4(sctp_cmd_seq_t *cmds, - struct sctp_association *asoc, - struct sctp_chunk *chunk) +static void sctp_cmd_setup_t4(struct sctp_cmd_seq *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) { struct sctp_transport *t; @@ -936,7 +936,7 @@ static void sctp_cmd_setup_t4(sctp_cmd_seq_t *cmds, } /* Process an incoming Operation Error Chunk. */ -static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds, +static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { @@ -1025,9 +1025,9 @@ static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error) } /* Helper function to generate an association change event */ -static void sctp_cmd_assoc_change(sctp_cmd_seq_t *commands, - struct sctp_association *asoc, - u8 state) +static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, + struct sctp_association *asoc, + u8 state) { struct sctp_ulpevent *ev; @@ -1040,7 +1040,7 @@ static void sctp_cmd_assoc_change(sctp_cmd_seq_t *commands, } /* Helper function to generate an adaptation indication event */ -static void sctp_cmd_adaptation_ind(sctp_cmd_seq_t *commands, +static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; @@ -1145,7 +1145,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) { - sctp_cmd_seq_t commands; + struct sctp_cmd_seq commands; const sctp_sm_table_entry_t *state_fn; sctp_disposition_t status; int error = 0; @@ -1184,7 +1184,7 @@ static int sctp_side_effects(enum sctp_event event_type, struct sctp_association **asoc, void *event_arg, sctp_disposition_t status, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, gfp_t gfp) { int error; @@ -1270,7 +1270,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, struct sctp_association *asoc, void *event_arg, sctp_disposition_t status, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, gfp_t gfp) { struct sock *sk = ep->base.sk; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index af93419209df..93b6f42a9252 100644 --- a/net/sctp/sm_statefuns.c +++ 
b/net/sctp/sm_statefuns.c @@ -67,7 +67,7 @@ static struct sctp_packet *sctp_abort_pkt_new(struct net *net, size_t paylen); static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, const struct sctp_association *asoc, const struct sctp_chunk *chunk); @@ -75,30 +75,30 @@ static void sctp_send_stale_cookie_err(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, struct sctp_chunk *err_chunk); static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, __be16 error, int sk_err, const struct sctp_association *asoc, struct sctp_transport *transport); @@ -108,7 +108,7 @@ static sctp_disposition_t sctp_sf_abort_violation( const struct sctp_endpoint *ep, const struct sctp_association *asoc, void *arg, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, const __u8 *payload, const size_t paylen); @@ -118,7 +118,7 @@ static sctp_disposition_t sctp_sf_violation_chunklen( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static sctp_disposition_t sctp_sf_violation_paramlen( struct net *net, @@ -126,7 +126,7 @@ static sctp_disposition_t sctp_sf_violation_paramlen( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, void *ext, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static sctp_disposition_t sctp_sf_violation_ctsn( struct net *net, @@ -134,7 +134,7 @@ static sctp_disposition_t sctp_sf_violation_ctsn( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static sctp_disposition_t sctp_sf_violation_chunk( struct net *net, @@ -142,7 +142,7 @@ static sctp_disposition_t sctp_sf_violation_chunk( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); static enum sctp_ierror sctp_sf_authenticate( struct net *net, @@ -156,7 +156,7 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands); + struct sctp_cmd_seq *commands); /* Small helper function that checks if the chunk length * is of the appropriate length. 
The 'required_length' argument @@ -219,7 +219,7 @@ sctp_disposition_t sctp_sf_do_4_C(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_ulpevent *ev; @@ -305,7 +305,7 @@ sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg, *repl, *err_chunk; struct sctp_unrecognized_param *unk_param; @@ -499,7 +499,7 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_init_chunk *initchunk; @@ -648,7 +648,7 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_association *new_asoc; @@ -875,7 +875,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_ulpevent *ev; @@ -953,7 +953,7 @@ static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = (struct sctp_transport *) arg; struct sctp_chunk *reply; @@ -979,7 +979,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = (struct sctp_transport *) arg; @@ -1026,7 +1026,7 @@ sctp_disposition_t sctp_sf_send_reconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = arg; @@ -1078,7 +1078,7 @@ sctp_disposition_t sctp_sf_beat_8_3(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_paramhdr *param_hdr; struct sctp_chunk *chunk = arg; @@ -1153,7 +1153,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_sender_hb_info *hbinfo; struct sctp_chunk *chunk = arg; @@ -1225,7 +1225,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(struct net *net, */ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa, struct sctp_chunk *init, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { int len; struct sctp_packet *pkt; @@ -1290,7 +1290,7 @@ static bool list_has_sctp_addr(const struct list_head *list, static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, const struct sctp_association *asoc, struct sctp_chunk *init, - sctp_cmd_seq_t *commands) + struct 
sctp_cmd_seq *commands) { struct net *net = sock_net(new_asoc->base.sk); struct sctp_transport *new_addr; @@ -1415,7 +1415,7 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, - void *arg, sctp_cmd_seq_t *commands) + void *arg, struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg, *repl, *err_chunk; struct sctp_unrecognized_param *unk_param; @@ -1627,7 +1627,7 @@ sctp_disposition_t sctp_sf_do_5_2_1_siminit(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* Call helper to do the real work for both simulataneous and * duplicate INIT chunk handling. @@ -1681,7 +1681,7 @@ sctp_disposition_t sctp_sf_do_5_2_2_dupinit(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* Call helper to do the real work for both simulataneous and * duplicate INIT chunk handling. @@ -1703,7 +1703,8 @@ sctp_disposition_t sctp_sf_do_5_2_3_initack(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, - void *arg, sctp_cmd_seq_t *commands) + void *arg, + struct sctp_cmd_seq *commands) { /* Per the above section, we'll discard the chunk if we have an * endpoint. If this is an OOTB INIT-ACK, treat it as such. @@ -1723,7 +1724,7 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { struct sctp_init_chunk *peer_init; @@ -1838,7 +1839,7 @@ static sctp_disposition_t sctp_sf_do_dupcook_b(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { struct sctp_init_chunk *peer_init; @@ -1909,7 +1910,7 @@ static sctp_disposition_t sctp_sf_do_dupcook_c(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { /* The cookie should be silently discarded. 
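The sm_statefuns.c hunks here are purely mechanical: every declaration trades sctp_cmd_seq_t * for struct sctp_cmd_seq *. A side benefit of having a struct tag is that a file can pass pointers to the type after a one-line forward declaration, which the anonymous typedef'd struct never allowed. A toy demonstration follows; log_commands() and the placeholder layout are hypothetical, not kernel code.

#include <stdio.h>

/* An incomplete type is enough to pass pointers around; the anonymous
 * "typedef struct { ... } sctp_cmd_seq_t" form had no tag to declare. */
struct sctp_cmd_seq;

/* hypothetical helper: needs only the pointer, not the layout */
static void log_commands(const struct sctp_cmd_seq *commands)
{
	printf("command sequence at %p\n", (const void *)commands);
}

/* stand-in layout for this demo; the real one lives in command.h */
struct sctp_cmd_seq {
	int placeholder;
};

int main(void)
{
	struct sctp_cmd_seq seq = { 0 };

	log_commands(&seq);
	return 0;
}

In a tree the size of the kernel, that difference shows up as fewer header dependencies between translation units.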
@@ -1931,7 +1932,7 @@ static sctp_disposition_t sctp_sf_do_dupcook_d(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { struct sctp_ulpevent *ev = NULL, *ai_ev = NULL; @@ -2027,7 +2028,7 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { sctp_disposition_t retval; struct sctp_chunk *chunk = arg; @@ -2146,7 +2147,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -2188,7 +2189,7 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -2239,7 +2240,7 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* The same T2 timer, so we should be able to use * common function with the SHUTDOWN-SENT state. @@ -2266,7 +2267,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_errhdr *err; @@ -2330,7 +2331,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { int attempts = asoc->init_err_counter + 1; struct sctp_chunk *chunk = arg, *reply; @@ -2452,7 +2453,7 @@ sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -2489,7 +2490,7 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; unsigned int len; @@ -2527,7 +2528,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; unsigned int len; @@ -2566,7 +2567,7 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { return sctp_stop_t1_and_abort(net, commands, SCTP_ERROR_NO_ERROR, ENOPROTOOPT, asoc, @@ -2581,7 +2582,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* There is a single T1 timer, so we should be able to use * common function with the COOKIE-WAIT state. 
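One pattern worth noticing in these hunks: because all state functions share a single signature, states that need identical handling become thin wrappers around a common tail, as with sctp_sf_cookie_wait_abort() and sctp_sf_cookie_echoed_abort() both ending in sctp_stop_t1_and_abort() (there is only one T1 timer). A compressed sketch of that funneling, with stand-in types and a placeholder error code rather than the kernel's:

#include <stdio.h>

/* stand-in types; not the kernel's definitions */
struct cmd_seq { int n_cmds; };
enum disposition { DISP_CONSUME, DISP_ABORT };

/* the shared tail both states funnel into, in the spirit of
 * sctp_stop_t1_and_abort(); the error code is a placeholder */
static enum disposition stop_t1_and_abort(struct cmd_seq *commands, int err)
{
	commands->n_cmds++;   /* queue: stop the T1 timer */
	commands->n_cmds++;   /* queue: report err to the socket */
	printf("aborting, err=%d, %d commands queued\n", err, commands->n_cmds);
	return DISP_ABORT;
}

static enum disposition sf_cookie_wait_abort(void *arg, struct cmd_seq *c)
{
	(void)arg;
	return stop_t1_and_abort(c, 1);
}

/* same single T1 timer, so COOKIE-ECHOED reuses the COOKIE-WAIT path */
static enum disposition sf_cookie_echoed_abort(void *arg, struct cmd_seq *c)
{
	return sf_cookie_wait_abort(arg, c);
}

int main(void)
{
	struct cmd_seq c = { 0 };

	return sf_cookie_echoed_abort(NULL, &c) == DISP_ABORT ? 0 : 1;
}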
@@ -2595,7 +2596,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(struct net *net, * This is common code called by several sctp_sf_*_abort() functions above. */ static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, __be16 error, int sk_err, const struct sctp_association *asoc, struct sctp_transport *transport) @@ -2653,7 +2654,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; sctp_disposition_t disposition; @@ -2742,7 +2743,7 @@ sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_shutdownhdr *sdh; @@ -2795,7 +2796,7 @@ sctp_disposition_t sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = (struct sctp_chunk *) arg; struct sctp_chunk *reply; @@ -2859,7 +2860,7 @@ sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_cwrhdr *cwr; @@ -2915,7 +2916,7 @@ sctp_disposition_t sctp_sf_do_ecne(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_ecnehdr *ecne; @@ -2972,7 +2973,7 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; sctp_arg_t force = SCTP_NOFORCE(); @@ -3092,7 +3093,7 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; int error; @@ -3183,7 +3184,7 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_sackhdr *sackh; @@ -3257,7 +3258,7 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_packet *packet = NULL; struct sctp_chunk *chunk = arg; @@ -3307,7 +3308,7 @@ sctp_disposition_t sctp_sf_operr_notify(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_errhdr *err; @@ -3345,7 +3346,7 @@ sctp_disposition_t sctp_sf_do_9_2_final(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_chunk *reply; @@ -3428,7 +3429,7 @@ sctp_disposition_t 
sctp_sf_ootb(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sk_buff *skb = chunk->skb; @@ -3521,7 +3522,7 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_packet *packet = NULL; struct sctp_chunk *chunk = arg; @@ -3583,7 +3584,7 @@ sctp_disposition_t sctp_sf_do_8_5_1_E_sa(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -3607,7 +3608,7 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_chunk *asconf_ack = NULL; @@ -3725,7 +3726,7 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *asconf_ack = arg; struct sctp_chunk *last_asconf = asoc->addip_last_asconf; @@ -3843,7 +3844,7 @@ sctp_disposition_t sctp_sf_do_reconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_paramhdr *err_param = NULL; struct sctp_chunk *chunk = arg; @@ -3920,7 +3921,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_fwdtsn_hdr *fwdtsn_hdr; @@ -3991,7 +3992,7 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_fwdtsn_hdr *fwdtsn_hdr; @@ -4158,7 +4159,7 @@ sctp_disposition_t sctp_sf_eat_auth(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_authhdr *auth_hdr; @@ -4255,7 +4256,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *unk_chunk = arg; struct sctp_chunk *err_chunk; @@ -4335,7 +4336,7 @@ sctp_disposition_t sctp_sf_discard_chunk(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -4375,7 +4376,7 @@ sctp_disposition_t sctp_sf_pdiscard(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); @@ -4403,7 +4404,7 @@ sctp_disposition_t sctp_sf_violation(struct net *net, const struct 
sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -4423,7 +4424,7 @@ static sctp_disposition_t sctp_sf_abort_violation( const struct sctp_endpoint *ep, const struct sctp_association *asoc, void *arg, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, const __u8 *payload, const size_t paylen) { @@ -4541,7 +4542,7 @@ static sctp_disposition_t sctp_sf_violation_chunklen( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { static const char err_str[] = "The following chunk had invalid length:"; @@ -4561,7 +4562,7 @@ static sctp_disposition_t sctp_sf_violation_paramlen( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, void *ext, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_paramhdr *param = ext; @@ -4604,7 +4605,7 @@ static sctp_disposition_t sctp_sf_violation_ctsn( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { static const char err_str[] = "The cumulative tsn ack beyond the max tsn currently sent:"; @@ -4624,7 +4625,7 @@ static sctp_disposition_t sctp_sf_violation_chunk( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { static const char err_str[] = "The following chunk violates protocol:"; @@ -4699,7 +4700,7 @@ sctp_disposition_t sctp_sf_do_prm_asoc(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *repl; struct sctp_association *my_asoc; @@ -4811,7 +4812,7 @@ sctp_disposition_t sctp_sf_do_prm_send(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_datamsg *msg = arg; @@ -4851,7 +4852,7 @@ sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { int disposition; @@ -4907,7 +4908,7 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* From 9.1 Abort of an Association * Upon receipt of the ABORT primitive from its upper @@ -4944,7 +4945,7 @@ sctp_disposition_t sctp_sf_error_closed(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-EINVAL)); return SCTP_DISPOSITION_CONSUME; @@ -4958,7 +4959,7 @@ sctp_disposition_t sctp_sf_error_shutdown(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-ESHUTDOWN)); @@ -4985,7 +4986,7 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, 
SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); @@ -5019,7 +5020,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, - void *arg, sctp_cmd_seq_t *commands) + void *arg, struct sctp_cmd_seq *commands) { /* There is a single T1 timer, so we should be able to use * common function with the COOKIE-WAIT state. @@ -5047,7 +5048,7 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *abort = arg; @@ -5096,7 +5097,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_prm_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* There is a single T1 timer, so we should be able to use * common function with the COOKIE-WAIT state. @@ -5122,7 +5123,7 @@ sctp_disposition_t sctp_sf_shutdown_pending_prm_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* Stop the T5-shutdown guard timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, @@ -5149,7 +5150,7 @@ sctp_disposition_t sctp_sf_shutdown_sent_prm_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* Stop the T2-shutdown timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, @@ -5180,7 +5181,7 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { /* The same T2 timer, so we should be able to use * common function with the SHUTDOWN-SENT state. 
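All of these primitive and timer handlers feed one engine: sctp_do_sm(), seen earlier in the sm_sideeffect.c hunks, puts a struct sctp_cmd_seq on its stack, runs the state function it looked up, and then interprets whatever commands the function queued, with the returned disposition deciding how errors propagate. Below is a toy version of that run-then-interpret loop, using forward-filled storage for brevity; none of the helper names are the kernel's.

#include <stdio.h>

enum verb { CMD_TIMER_STOP, CMD_REPLY };
enum disposition { DISP_CONSUME, DISP_NOMEM };

struct cmd { enum verb verb; };
struct cmd_seq { struct cmd cmds[8]; int n; };

/* forward-filled for brevity; the real array fills backwards */
static void add_cmd(struct cmd_seq *seq, enum verb v)
{
	seq->cmds[seq->n++].verb = v;
}

/* a toy state function: queue side effects, report a disposition */
static enum disposition sf_example(struct cmd_seq *commands)
{
	add_cmd(commands, CMD_TIMER_STOP);
	add_cmd(commands, CMD_REPLY);
	return DISP_CONSUME;
}

/* a toy sctp_do_sm(): run the handler, then drain its command list */
static int do_sm(void)
{
	struct cmd_seq commands = { .n = 0 };
	enum disposition status = sf_example(&commands);
	int i;

	for (i = 0; i < commands.n; i++)
		printf("side effect: verb %d\n", commands.cmds[i].verb);
	return status == DISP_NOMEM ? -1 : 0;
}

int main(void)
{
	return do_sm();
}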
@@ -5216,7 +5217,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, (struct sctp_transport *)arg, commands)) @@ -5248,7 +5249,7 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -5264,7 +5265,8 @@ sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, - void *arg, sctp_cmd_seq_t *commands) + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -5283,7 +5285,7 @@ sctp_disposition_t sctp_sf_ignore_primitive( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { pr_debug("%s: primitive type:%d is ignored\n", __func__, type.primitive); @@ -5307,7 +5309,7 @@ sctp_disposition_t sctp_sf_do_no_pending_tsn( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_ulpevent *event; @@ -5339,7 +5341,7 @@ sctp_disposition_t sctp_sf_do_9_2_start_shutdown( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *reply; @@ -5409,7 +5411,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = (struct sctp_chunk *) arg; struct sctp_chunk *reply; @@ -5482,7 +5484,7 @@ sctp_disposition_t sctp_sf_ignore_other(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { pr_debug("%s: the event other type:%d is ignored\n", __func__, type.other); @@ -5510,7 +5512,7 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = arg; @@ -5598,7 +5600,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { SCTP_INC_STATS(net, SCTP_MIB_DELAY_SACK_EXPIREDS); sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); @@ -5629,7 +5631,7 @@ sctp_disposition_t sctp_sf_t1_init_timer_expire(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *repl = NULL; struct sctp_bind_addr *bp; @@ -5693,7 +5695,7 @@ sctp_disposition_t sctp_sf_t1_cookie_timer_expire(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *repl = NULL; int attempts = asoc->init_err_counter + 1; @@ -5743,7 +5745,7 @@ sctp_disposition_t sctp_sf_t2_timer_expire(struct net *net, const struct sctp_association *asoc, const union sctp_subtype 
type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *reply = NULL; @@ -5814,7 +5816,7 @@ sctp_disposition_t sctp_sf_t4_timer_expire( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = asoc->addip_last_asconf; struct sctp_transport *transport = chunk->transport; @@ -5885,7 +5887,7 @@ sctp_disposition_t sctp_sf_t5_timer_expire(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_chunk *reply = NULL; @@ -5922,7 +5924,7 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire( const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { int disposition; @@ -5964,7 +5966,7 @@ sctp_disposition_t sctp_sf_not_impl(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { return SCTP_DISPOSITION_NOT_IMPL; } @@ -5982,7 +5984,7 @@ sctp_disposition_t sctp_sf_bug(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { return SCTP_DISPOSITION_BUG; } @@ -6003,7 +6005,7 @@ sctp_disposition_t sctp_sf_timer_ignore(struct net *net, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { pr_debug("%s: timer %d ignored\n", __func__, type.chunk); @@ -6169,7 +6171,7 @@ static void sctp_send_stale_cookie_err(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands, + struct sctp_cmd_seq *commands, struct sctp_chunk *err_chunk) { struct sctp_packet *packet; @@ -6198,7 +6200,7 @@ static void sctp_send_stale_cookie_err(struct net *net, /* Process a data chunk */ static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_chunk *chunk, - sctp_cmd_seq_t *commands) + struct sctp_cmd_seq *commands) { struct sctp_datahdr *data_hdr; struct sctp_chunk *err; -- cgit v1.2.3 From c488b7704ed0eed18e11f9b685931558735f2a68 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:53 +0800 Subject: sctp: remove the typedef sctp_arg_t This patch is to remove the typedef sctp_arg_t, and replace with union sctp_arg in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/command.h | 26 +++++++++++++------------- net/sctp/sm_statefuns.c | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index cbf6798866ef..f5fc425b5a4f 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -117,7 +117,7 @@ typedef enum { */ #define SCTP_MAX_NUM_COMMANDS 20 -typedef union { +union sctp_arg { void *zero_all; /* Set to NULL to clear the entire union */ __s32 i32; __u32 u32; @@ -137,24 +137,24 @@ typedef union { struct sctp_packet *packet; struct sctp_sackhdr *sackh; struct sctp_datamsg *msg; -} sctp_arg_t; +}; /* We are simulating ML type constructors here. * * SCTP_ARG_CONSTRUCTOR(NAME, TYPE, ELT) builds a function called * SCTP_NAME() which takes an argument of type TYPE and returns an - * sctp_arg_t. 
It does this by inserting the sole argument into the - * ELT union element of a local sctp_arg_t. + * union sctp_arg. It does this by inserting the sole argument into + * the ELT union element of a local union sctp_arg. * * E.g., SCTP_ARG_CONSTRUCTOR(I32, __s32, i32) builds SCTP_I32(arg), - * which takes an __s32 and returns a sctp_arg_t containing the + * which takes an __s32 and returns a union sctp_arg containing the * __s32. So, after foo = SCTP_I32(arg), foo.i32 == arg. */ #define SCTP_ARG_CONSTRUCTOR(name, type, elt) \ -static inline sctp_arg_t \ +static inline union sctp_arg \ SCTP_## name (type arg) \ -{ sctp_arg_t retval;\ +{ union sctp_arg retval;\ retval.zero_all = NULL;\ retval.elt = arg;\ return retval;\ @@ -179,25 +179,25 @@ SCTP_ARG_CONSTRUCTOR(PACKET, struct sctp_packet *, packet) SCTP_ARG_CONSTRUCTOR(SACKH, struct sctp_sackhdr *, sackh) SCTP_ARG_CONSTRUCTOR(DATAMSG, struct sctp_datamsg *, msg) -static inline sctp_arg_t SCTP_FORCE(void) +static inline union sctp_arg SCTP_FORCE(void) { return SCTP_I32(1); } -static inline sctp_arg_t SCTP_NOFORCE(void) +static inline union sctp_arg SCTP_NOFORCE(void) { return SCTP_I32(0); } -static inline sctp_arg_t SCTP_NULL(void) +static inline union sctp_arg SCTP_NULL(void) { - sctp_arg_t retval; + union sctp_arg retval; retval.zero_all = NULL; return retval; } struct sctp_cmd { - sctp_arg_t obj; + union sctp_arg obj; sctp_verb_t verb; }; @@ -226,7 +226,7 @@ static inline int sctp_init_cmd_seq(struct sctp_cmd_seq *seq) * to wrap data which goes in the obj argument. */ static inline void sctp_add_cmd_sf(struct sctp_cmd_seq *seq, sctp_verb_t verb, - sctp_arg_t obj) + union sctp_arg obj) { struct sctp_cmd *cmd = seq->last_used_slot - 1; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 93b6f42a9252..3394c4d34ea4 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2975,8 +2975,8 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, void *arg, struct sctp_cmd_seq *commands) { + union sctp_arg force = SCTP_NOFORCE(); struct sctp_chunk *chunk = arg; - sctp_arg_t force = SCTP_NOFORCE(); int error; if (!sctp_vtag_verify(chunk, asoc)) { -- cgit v1.2.3 From e08af95df1130883762b388a19bb150ae5d16c09 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:54 +0800 Subject: sctp: remove the typedef sctp_verb_t This patch is to remove the typedef sctp_verb_t, and replace with enum sctp_verb in the places where it's using this typedef. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/command.h | 10 +++++----- net/sctp/sm_statefuns.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index f5fc425b5a4f..b55c6a48a206 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -40,7 +40,7 @@ #include -typedef enum { +enum sctp_verb { SCTP_CMD_NOP = 0, /* Do nothing. */ SCTP_CMD_NEW_ASOC, /* Register a new association. */ SCTP_CMD_DELETE_TCB, /* Delete the current association. */ @@ -108,7 +108,7 @@ typedef enum { SCTP_CMD_PURGE_ASCONF_QUEUE, /* Purge all asconf queues.*/ SCTP_CMD_SET_ASOC, /* Restore association context */ SCTP_CMD_LAST -} sctp_verb_t; +}; /* How many commands can you put in an struct sctp_cmd_seq? 
* This is a rather arbitrary number, ideally derived from a careful @@ -198,7 +198,7 @@ static inline union sctp_arg SCTP_NULL(void) struct sctp_cmd { union sctp_arg obj; - sctp_verb_t verb; + enum sctp_verb verb; }; struct sctp_cmd_seq { @@ -225,8 +225,8 @@ static inline int sctp_init_cmd_seq(struct sctp_cmd_seq *seq) * Use the SCTP_* constructors defined by SCTP_ARG_CONSTRUCTOR() above * to wrap data which goes in the obj argument. */ -static inline void sctp_add_cmd_sf(struct sctp_cmd_seq *seq, sctp_verb_t verb, - union sctp_arg obj) +static inline void sctp_add_cmd_sf(struct sctp_cmd_seq *seq, + enum sctp_verb verb, union sctp_arg obj) { struct sctp_cmd *cmd = seq->last_used_slot - 1; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3394c4d34ea4..adc1dde34bfe 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6205,7 +6205,7 @@ static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_datahdr *data_hdr; struct sctp_chunk *err; size_t datalen; - sctp_verb_t deliver; + enum sctp_verb deliver; int tmp; __u32 tsn; struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; -- cgit v1.2.3 From eb662a6a9b2a4ee3c76bbfc4c23f4dc56859b0f3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:55 +0800 Subject: sctp: remove the unused typedef sctp_sm_command_t Remove this typedef including the struct, there is even no places using it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 9af64b98d96b..3ca75a6c4b5e 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -65,11 +65,6 @@ typedef enum { SCTP_DISPOSITION_BUG, /* This is a bug. */ } sctp_disposition_t; -typedef struct { - int name; - int action; -} sctp_sm_command_t; - typedef sctp_disposition_t (sctp_state_fn_t) (struct net *, const struct sctp_endpoint *, const struct sctp_association *, -- cgit v1.2.3 From 8ee821aea39c6bf4142c9319adecea6d3e1af4a2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:56 +0800 Subject: sctp: remove the typedef sctp_sm_table_entry_t This patch is to remove the typedef sctp_sm_table_entry_t, and replace with struct sctp_sm_table_entry in the places where it's using this typedef. It is also to fix some indents. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 6 +++--- net/sctp/sm_sideeffect.c | 2 +- net/sctp/sm_statetable.c | 42 +++++++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 3ca75a6c4b5e..7ad240228a0f 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -72,10 +72,10 @@ typedef sctp_disposition_t (sctp_state_fn_t) (struct net *, void *arg, struct sctp_cmd_seq *); typedef void (sctp_timer_event_t) (unsigned long); -typedef struct { +struct sctp_sm_table_entry { sctp_state_fn_t *fn; const char *name; -} sctp_sm_table_entry_t; +}; /* A naming convention of "sctp_sf_xxx" applies to all the state functions * currently in use. @@ -170,7 +170,7 @@ sctp_state_fn_t sctp_sf_autoclose_timer_expire; /* Prototypes for utility support functions. 
*/ __u8 sctp_get_chunk_type(struct sctp_chunk *chunk); -const sctp_sm_table_entry_t *sctp_sm_lookup_event( +const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event event_type, enum sctp_state state, diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 6dd5934cbda6..2bc1204becbd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1146,7 +1146,7 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, void *event_arg, gfp_t gfp) { struct sctp_cmd_seq commands; - const sctp_sm_table_entry_t *state_fn; + const struct sctp_sm_table_entry *state_fn; sctp_disposition_t status; int error = 0; typedef const char *(printfn_t)(union sctp_subtype); diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index d437f3801399..79b6bee5b768 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -45,27 +45,27 @@ #include #include -static const sctp_sm_table_entry_t +static const struct sctp_sm_table_entry primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; -static const sctp_sm_table_entry_t +static const struct sctp_sm_table_entry other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; -static const sctp_sm_table_entry_t +static const struct sctp_sm_table_entry timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; -static const sctp_sm_table_entry_t *sctp_chunk_event_lookup( +static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state); -static const sctp_sm_table_entry_t bug = { +static const struct sctp_sm_table_entry bug = { .fn = sctp_sf_bug, .name = "sctp_sf_bug" }; #define DO_LOOKUP(_max, _type, _table) \ ({ \ - const sctp_sm_table_entry_t *rtn; \ + const struct sctp_sm_table_entry *rtn; \ \ if ((event_subtype._type > (_max))) { \ pr_warn("table %p possible attack: event %d exceeds max %d\n", \ @@ -77,7 +77,7 @@ static const sctp_sm_table_entry_t bug = { rtn; \ }) -const sctp_sm_table_entry_t *sctp_sm_lookup_event( +const struct sctp_sm_table_entry *sctp_sm_lookup_event( struct net *net, enum sctp_event event_type, enum sctp_state state, @@ -394,7 +394,8 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event( * * For base protocol (RFC 2960). */ -static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_DATA, TYPE_SCTP_INIT, TYPE_SCTP_INIT_ACK, @@ -453,7 +454,8 @@ static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ -static const sctp_sm_table_entry_t addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_ASCONF, TYPE_SCTP_ASCONF_ACK, }; /*state_fn_t addip_chunk_event_table[][] */ @@ -480,7 +482,8 @@ static const sctp_sm_table_entry_t addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_ /* The primary index for this table is the chunk type. * The secondary index for this table is the state. 
*/ -static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_FWD_TSN, }; /*state_fn_t prsctp_chunk_event_table[][] */ @@ -506,7 +509,8 @@ static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUN /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ -static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +reconf_chunk_event_table[SCTP_NUM_RECONF_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_RECONF, }; /*state_fn_t reconf_chunk_event_table[][] */ @@ -532,11 +536,12 @@ static const sctp_sm_table_entry_t reconf_chunk_event_table[SCTP_NUM_RECONF_CHUN /* The primary index for this table is the chunk type. * The secondary index for this table is the state. */ -static const sctp_sm_table_entry_t auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +auth_chunk_event_table[SCTP_NUM_AUTH_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_AUTH, }; /*state_fn_t auth_chunk_event_table[][] */ -static const sctp_sm_table_entry_t +static const struct sctp_sm_table_entry chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* SCTP_STATE_CLOSED */ TYPE_SCTP_FUNC(sctp_sf_ootb), @@ -693,7 +698,8 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { /* The primary index for this table is the primitive type. * The secondary index for this table is the state. */ -static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_PRIMITIVE_ASSOCIATE, TYPE_SCTP_PRIMITIVE_SHUTDOWN, TYPE_SCTP_PRIMITIVE_ABORT, @@ -741,7 +747,8 @@ static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE TYPE_SCTP_FUNC(sctp_sf_ignore_other), \ } -static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_OTHER_NO_PENDING_TSN, TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, }; @@ -955,7 +962,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_ TYPE_SCTP_FUNC(sctp_sf_timer_ignore), \ } -static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { +static const struct sctp_sm_table_entry +timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { TYPE_SCTP_EVENT_TIMEOUT_NONE, TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, @@ -969,7 +977,7 @@ static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][S TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, }; -static const sctp_sm_table_entry_t *sctp_chunk_event_lookup( +static const struct sctp_sm_table_entry *sctp_chunk_event_lookup( struct net *net, enum sctp_cid cid, enum sctp_state state) -- cgit v1.2.3 From 172a1599ba88df7147f6503a75686fb89c8a1f3f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:57 +0800 Subject: sctp: remove the typedef sctp_disposition_t This patch is to remove the typedef sctp_disposition_t, and replace with enum sctp_disposition in the places where it's using this 
typedef. It's also to fix the indent for many functions' defination. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 19 +- net/sctp/probe.c | 13 +- net/sctp/sm_sideeffect.c | 50 +- net/sctp/sm_statefuns.c | 1316 ++++++++++++++++++++++++---------------------- 4 files changed, 717 insertions(+), 681 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 7ad240228a0f..33077f317995 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -53,7 +53,7 @@ /* * Possible values for the disposition are: */ -typedef enum { +enum sctp_disposition { SCTP_DISPOSITION_DISCARD, /* No further processing. */ SCTP_DISPOSITION_CONSUME, /* Process return values normally. */ SCTP_DISPOSITION_NOMEM, /* We ran out of memory--recover. */ @@ -63,14 +63,15 @@ typedef enum { SCTP_DISPOSITION_NOT_IMPL, /* This entry is not implemented. */ SCTP_DISPOSITION_ERROR, /* This is plain old user error. */ SCTP_DISPOSITION_BUG, /* This is a bug. */ -} sctp_disposition_t; - -typedef sctp_disposition_t (sctp_state_fn_t) (struct net *, - const struct sctp_endpoint *, - const struct sctp_association *, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *); +}; + +typedef enum sctp_disposition (sctp_state_fn_t) ( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); typedef void (sctp_timer_event_t) (unsigned long); struct sctp_sm_table_entry { sctp_state_fn_t *fn; diff --git a/net/sctp/probe.c b/net/sctp/probe.c index 34097a167431..1280f85a598d 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -127,12 +127,13 @@ static const struct file_operations sctpprobe_fops = { .llseek = noop_llseek, }; -static sctp_disposition_t jsctp_sf_eat_sack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition jsctp_sf_eat_sack( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sk_buff *skb = chunk->skb; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 2bc1204becbd..e6a2974e020e 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -57,7 +57,7 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, - sctp_disposition_t status, + enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); static int sctp_side_effects(enum sctp_event event_type, @@ -66,7 +66,7 @@ static int sctp_side_effects(enum sctp_event event_type, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, - sctp_disposition_t status, + enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); @@ -97,8 +97,8 @@ static void sctp_do_ecn_ce_work(struct sctp_association *asoc, * that was originally marked with the CE bit. 
*/ static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc, - __u32 lowest_tsn, - struct sctp_chunk *chunk) + __u32 lowest_tsn, + struct sctp_chunk *chunk) { struct sctp_chunk *repl; @@ -152,9 +152,9 @@ static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, static int sctp_gen_sack(struct sctp_association *asoc, int force, struct sctp_cmd_seq *commands) { + struct sctp_transport *trans = asoc->peer.last_data_from; __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; - struct sctp_transport *trans = asoc->peer.last_data_from; int error = 0; if (force || @@ -244,11 +244,11 @@ nomem: */ void sctp_generate_t3_rtx_event(unsigned long peer) { - int error; struct sctp_transport *transport = (struct sctp_transport *) peer; struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); + int error; /* Check whether a task is in the sock. */ @@ -361,12 +361,12 @@ static void sctp_generate_autoclose_event(unsigned long data) */ void sctp_generate_heartbeat_event(unsigned long data) { - int error = 0; struct sctp_transport *transport = (struct sctp_transport *) data; struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); u32 elapsed, timeout; + int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -406,7 +406,7 @@ out_unlock: */ void sctp_generate_proto_unreach_event(unsigned long data) { - struct sctp_transport *transport = (struct sctp_transport *) data; + struct sctp_transport *transport = (struct sctp_transport *)data; struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -472,7 +472,7 @@ out_unlock: /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(unsigned long data) { - struct sctp_association *asoc = (struct sctp_association *) data; + struct sctp_association *asoc = (struct sctp_association *)data; sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } @@ -610,6 +610,7 @@ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, { struct sctp_ulpevent *event; struct sctp_chunk *abort; + /* Cancel any partial delivery in progress. 
*/ sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); @@ -991,6 +992,7 @@ static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; + /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) { sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); @@ -1003,8 +1005,8 @@ static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq, static void sctp_cmd_del_non_primary(struct sctp_association *asoc) { struct sctp_transport *t; - struct list_head *pos; struct list_head *temp; + struct list_head *pos; list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); @@ -1145,15 +1147,15 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) { - struct sctp_cmd_seq commands; - const struct sctp_sm_table_entry *state_fn; - sctp_disposition_t status; - int error = 0; typedef const char *(printfn_t)(union sctp_subtype); static printfn_t *table[] = { NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, }; printfn_t *debug_fn __attribute__ ((unused)) = table[event_type]; + const struct sctp_sm_table_entry *state_fn; + struct sctp_cmd_seq commands; + enum sctp_disposition status; + int error = 0; /* Look up the state function, run it, and then process the * side effects. These three steps are the heart of lksctp. @@ -1183,7 +1185,7 @@ static int sctp_side_effects(enum sctp_event event_type, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, - sctp_disposition_t status, + enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { @@ -1269,23 +1271,21 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, - sctp_disposition_t status, + enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { - struct sock *sk = ep->base.sk; - struct sctp_sock *sp = sctp_sk(sk); - int error = 0; - int force; - struct sctp_cmd *cmd; - struct sctp_chunk *new_obj; - struct sctp_chunk *chunk = NULL; + struct sctp_sock *sp = sctp_sk(ep->base.sk); + struct sctp_chunk *chunk = NULL, *new_obj; struct sctp_packet *packet; + struct sctp_sackhdr sackh; struct timer_list *timer; - unsigned long timeout; struct sctp_transport *t; - struct sctp_sackhdr sackh; + unsigned long timeout; + struct sctp_cmd *cmd; int local_cork = 0; + int error = 0; + int force; if (SCTP_EVENT_T_TIMEOUT != event_type) chunk = event_arg; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index adc1dde34bfe..8f8ccded13e4 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -59,37 +59,41 @@ #include <net/sctp/sm.h> #include <net/sctp/structs.h> -static struct sctp_packet *sctp_abort_pkt_new(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - struct sctp_chunk *chunk, - const void *payload, - size_t paylen); +static struct sctp_packet *sctp_abort_pkt_new( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + const void *payload, size_t paylen); static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_cmd_seq *commands); -static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, - const struct sctp_association *asoc, - const struct sctp_chunk *chunk); +static struct sctp_packet *sctp_ootb_pkt_new( + struct net *net, + const struct sctp_association *asoc, + const
struct sctp_chunk *chunk); static void sctp_send_stale_cookie_err(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_cmd_seq *commands, struct sctp_chunk *err_chunk); -static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands); -static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands); -static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, +static enum sctp_disposition sctp_sf_do_5_2_6_stale( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_shut_8_4_5( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_tabort_8_4_8( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -97,61 +101,63 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, struct sctp_cmd_seq *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); -static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net, - struct sctp_cmd_seq *commands, - __be16 error, int sk_err, - const struct sctp_association *asoc, - struct sctp_transport *transport); +static enum sctp_disposition sctp_stop_t1_and_abort( + struct net *net, + struct sctp_cmd_seq *commands, + __be16 error, int sk_err, + const struct sctp_association *asoc, + struct sctp_transport *transport); -static sctp_disposition_t sctp_sf_abort_violation( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - void *arg, - struct sctp_cmd_seq *commands, - const __u8 *payload, - const size_t paylen); +static enum sctp_disposition sctp_sf_abort_violation( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + void *arg, + struct sctp_cmd_seq *commands, + const __u8 *payload, + const size_t paylen); -static sctp_disposition_t sctp_sf_violation_chunklen( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_violation_chunklen( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); -static sctp_disposition_t sctp_sf_violation_paramlen( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, void *ext, - struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_violation_paramlen( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, void *ext, + struct sctp_cmd_seq *commands); -static sctp_disposition_t sctp_sf_violation_ctsn( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype 
type, - void *arg, - struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_violation_ctsn( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); -static sctp_disposition_t sctp_sf_violation_chunk( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands); +static enum sctp_disposition sctp_sf_violation_chunk( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands); static enum sctp_ierror sctp_sf_authenticate( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - struct sctp_chunk *chunk); + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + struct sctp_chunk *chunk); -static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, +static enum sctp_disposition __sctp_sf_do_9_1_abort( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -165,8 +171,8 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, * false = Invalid length * */ -static inline bool -sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length) +static inline bool sctp_chunk_length_valid(struct sctp_chunk *chunk, + __u16 required_length) { __u16 chunk_length = ntohs(chunk->chunk_hdr->length); @@ -214,12 +220,11 @@ sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length) * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_4_C(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_4_C(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_ulpevent *ev; @@ -300,12 +305,12 @@ sctp_disposition_t sctp_sf_do_4_C(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_5_1B_init(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg, *repl, *err_chunk; struct sctp_unrecognized_param *unk_param; @@ -494,15 +499,15 @@ nomem: * * The return value is the disposition of the chunk. 
*/ -sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = arg; struct sctp_init_chunk *initchunk; + struct sctp_chunk *chunk = arg; struct sctp_chunk *err_chunk; struct sctp_packet *packet; @@ -644,20 +649,21 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_5_1D_ce(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = arg; + struct sctp_ulpevent *ev, *ai_ev = NULL; struct sctp_association *new_asoc; struct sctp_init_chunk *peer_init; - struct sctp_chunk *repl; - struct sctp_ulpevent *ev, *ai_ev = NULL; - int error = 0; + struct sctp_chunk *chunk = arg; struct sctp_chunk *err_chk_p; + struct sctp_chunk *repl; struct sock *sk; + int error = 0; /* If the packet is an OOTB packet which is temporarily on the * control endpoint, respond with an ABORT. @@ -871,11 +877,12 @@ nomem: * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_5_1E_ca(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_ulpevent *ev; @@ -949,11 +956,12 @@ nomem: } /* Generate and sendout a heartbeat packet. */ -static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_heartbeat( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = (struct sctp_transport *) arg; struct sctp_chunk *reply; @@ -974,12 +982,12 @@ static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, } /* Generate a HEARTBEAT packet on the given transport. */ -sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_sendbeat_8_3(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = (struct sctp_transport *) arg; @@ -1022,11 +1030,12 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(struct net *net, } /* resend asoc strreset_chunk. 
*/ -sctp_disposition_t sctp_sf_send_reconf(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_send_reconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = arg; @@ -1073,12 +1082,11 @@ sctp_disposition_t sctp_sf_send_reconf(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_beat_8_3(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_beat_8_3(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { struct sctp_paramhdr *param_hdr; struct sctp_chunk *chunk = arg; @@ -1148,12 +1156,12 @@ nomem: * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_backbeat_8_3(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_sender_hb_info *hbinfo; struct sctp_chunk *chunk = arg; @@ -1227,13 +1235,13 @@ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa, struct sctp_chunk *init, struct sctp_cmd_seq *commands) { - int len; - struct sctp_packet *pkt; + struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family); union sctp_addr_param *addrparm; struct sctp_errhdr *errhdr; - struct sctp_endpoint *ep; char buffer[sizeof(*errhdr) + sizeof(*addrparm)]; - struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family); + struct sctp_endpoint *ep; + struct sctp_packet *pkt; + int len; /* Build the error on the stack. We are way to malloc crazy * throughout the code today. @@ -1410,18 +1418,19 @@ static char sctp_tietags_compare(struct sctp_association *new_asoc, /* Common helper routine for both duplicate and simulataneous INIT * chunk handling. */ -static sctp_disposition_t sctp_sf_do_unexpected_init( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_do_unexpected_init( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg, *repl, *err_chunk; struct sctp_unrecognized_param *unk_param; struct sctp_association *new_asoc; + enum sctp_disposition retval; struct sctp_packet *packet; - sctp_disposition_t retval; int len; /* 6.10 Bundling @@ -1622,12 +1631,13 @@ cleanup: * * The return value is the disposition of the chunk. 
*/ -sctp_disposition_t sctp_sf_do_5_2_1_siminit(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_5_2_1_siminit( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* Call helper to do the real work for both simulataneous and * duplicate INIT chunk handling. @@ -1676,7 +1686,8 @@ sctp_disposition_t sctp_sf_do_5_2_1_siminit(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_5_2_2_dupinit(struct net *net, +enum sctp_disposition sctp_sf_do_5_2_2_dupinit( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -1699,12 +1710,13 @@ sctp_disposition_t sctp_sf_do_5_2_2_dupinit(struct net *net, * An unexpected INIT ACK usually indicates the processing of an old or * duplicated INIT chunk. */ -sctp_disposition_t sctp_sf_do_5_2_3_initack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_5_2_3_initack( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* Per the above section, we'll discard the chunk if we have an * endpoint. If this is an OOTB INIT-ACK, treat it as such. @@ -1720,7 +1732,8 @@ sctp_disposition_t sctp_sf_do_5_2_3_initack(struct net *net, * Section 5.2.4 * A) In this case, the peer may have restarted. */ -static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, +static enum sctp_disposition sctp_sf_do_dupcook_a( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, @@ -1728,10 +1741,10 @@ static sctp_disposition_t sctp_sf_do_dupcook_a(struct net *net, struct sctp_association *new_asoc) { struct sctp_init_chunk *peer_init; + enum sctp_disposition disposition; struct sctp_ulpevent *ev; struct sctp_chunk *repl; struct sctp_chunk *err; - sctp_disposition_t disposition; /* new_asoc is a brand-new association, so these are not yet * side effects--it is safe to run them here. @@ -1835,7 +1848,8 @@ nomem: * after responding to the local endpoint's INIT */ /* This case represents an initialization collision. */ -static sctp_disposition_t sctp_sf_do_dupcook_b(struct net *net, +static enum sctp_disposition sctp_sf_do_dupcook_b( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, @@ -1906,7 +1920,8 @@ nomem: * but a new tag of its own. */ /* This case represents an initialization collision. */ -static sctp_disposition_t sctp_sf_do_dupcook_c(struct net *net, +static enum sctp_disposition sctp_sf_do_dupcook_c( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, @@ -1928,7 +1943,8 @@ static sctp_disposition_t sctp_sf_do_dupcook_c(struct net *net, * enter the ESTABLISHED state, if it has not already done so. */ /* This case represents an initialization collision. 
*/ -static sctp_disposition_t sctp_sf_do_dupcook_d(struct net *net, +static enum sctp_disposition sctp_sf_do_dupcook_d( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, @@ -2023,19 +2039,20 @@ nomem: * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, +enum sctp_disposition sctp_sf_do_5_2_4_dupcook( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands) { - sctp_disposition_t retval; - struct sctp_chunk *chunk = arg; struct sctp_association *new_asoc; + struct sctp_chunk *chunk = arg; + enum sctp_disposition retval; + struct sctp_chunk *err_chk_p; int error = 0; char action; - struct sctp_chunk *err_chk_p; /* Make sure that the chunk has a valid length from the protocol * perspective. In this case check to make sure we have at least @@ -2141,13 +2158,13 @@ nomem: * * See sctp_sf_do_9_1_abort(). */ -sctp_disposition_t sctp_sf_shutdown_pending_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_shutdown_pending_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -2184,7 +2201,8 @@ sctp_disposition_t sctp_sf_shutdown_pending_abort( * * See sctp_sf_do_9_1_abort(). */ -sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, +enum sctp_disposition sctp_sf_shutdown_sent_abort( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -2234,13 +2252,13 @@ sctp_disposition_t sctp_sf_shutdown_sent_abort(struct net *net, * * See sctp_sf_do_9_1_abort(). */ -sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_shutdown_ack_sent_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* The same T2 timer, so we should be able to use * common function with the SHUTDOWN-SENT state. @@ -2262,7 +2280,8 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, +enum sctp_disposition sctp_sf_cookie_echoed_err( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -2326,12 +2345,13 @@ sctp_disposition_t sctp_sf_cookie_echoed_err(struct net *net, * * The return value is the disposition of the chunk. 
*/ -static sctp_disposition_t sctp_sf_do_5_2_6_stale(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_do_5_2_6_stale( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { int attempts = asoc->init_err_counter + 1; struct sctp_chunk *chunk = arg, *reply; @@ -2448,7 +2468,8 @@ nomem: * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, +enum sctp_disposition sctp_sf_do_9_1_abort( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -2485,16 +2506,17 @@ sctp_disposition_t sctp_sf_do_9_1_abort(struct net *net, return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } -static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, +static enum sctp_disposition __sctp_sf_do_9_1_abort( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands) { + __be16 error = SCTP_ERROR_NO_ERROR; struct sctp_chunk *chunk = arg; unsigned int len; - __be16 error = SCTP_ERROR_NO_ERROR; /* See if we have an error cause code in the chunk. */ len = ntohs(chunk->chunk_hdr->length); @@ -2523,16 +2545,17 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, * * See sctp_sf_do_9_1_abort() above. */ -sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_cookie_wait_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { + __be16 error = SCTP_ERROR_NO_ERROR; struct sctp_chunk *chunk = arg; unsigned int len; - __be16 error = SCTP_ERROR_NO_ERROR; if (!sctp_vtag_verify_either(chunk, asoc)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2562,7 +2585,8 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(struct net *net, /* * Process an incoming ICMP as an ABORT. (COOKIE-WAIT state) */ -sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(struct net *net, +enum sctp_disposition sctp_sf_cookie_wait_icmp_abort( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -2577,12 +2601,13 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(struct net *net, /* * Process an ABORT. (COOKIE-ECHOED state) */ -sctp_disposition_t sctp_sf_cookie_echoed_abort(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_cookie_echoed_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* There is a single T1 timer, so we should be able to use * common function with the COOKIE-WAIT state. @@ -2595,11 +2620,12 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(struct net *net, * * This is common code called by several sctp_sf_*_abort() functions above. 
*/ -static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net, - struct sctp_cmd_seq *commands, - __be16 error, int sk_err, - const struct sctp_association *asoc, - struct sctp_transport *transport) +static enum sctp_disposition sctp_stop_t1_and_abort( + struct net *net, + struct sctp_cmd_seq *commands, + __be16 error, int sk_err, + const struct sctp_association *asoc, + struct sctp_transport *transport) { pr_debug("%s: ABORT received (INIT)\n", __func__); @@ -2649,15 +2675,16 @@ static sctp_disposition_t sctp_stop_t1_and_abort(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_9_2_shutdown(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) -{ +enum sctp_disposition sctp_sf_do_9_2_shutdown( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) +{ + enum sctp_disposition disposition; struct sctp_chunk *chunk = arg; - sctp_disposition_t disposition; struct sctp_shutdownhdr *sdh; struct sctp_ulpevent *ev; __u32 ctsn; @@ -2738,12 +2765,13 @@ out: * The Cumulative TSN Ack of the received SHUTDOWN chunk * MUST be processed. */ -sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_2_shut_ctsn( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_shutdownhdr *sdh; @@ -2791,14 +2819,15 @@ sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(struct net *net, * that belong to this association, it should discard the INIT chunk and * retransmit the SHUTDOWN ACK chunk. */ -sctp_disposition_t sctp_sf_do_9_2_reshutack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_2_reshutack( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = (struct sctp_chunk *) arg; + struct sctp_chunk *chunk = arg; struct sctp_chunk *reply; /* Make sure that the chunk has a valid length */ @@ -2855,12 +2884,12 @@ nomem: * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_ecn_cwr(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_cwrhdr *cwr; @@ -2911,12 +2940,11 @@ sctp_disposition_t sctp_sf_do_ecn_cwr(struct net *net, * * The return value is the disposition of the chunk. 
*/ -sctp_disposition_t sctp_sf_do_ecne(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_ecne(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_ecnehdr *ecne; @@ -2968,12 +2996,12 @@ sctp_disposition_t sctp_sf_do_ecne(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_eat_data_6_2(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { union sctp_arg force = SCTP_NOFORCE(); struct sctp_chunk *chunk = arg; @@ -3088,12 +3116,13 @@ discard_noforce: * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_eat_data_fast_4_4( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; int error; @@ -3179,12 +3208,12 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_sackhdr *sackh; @@ -3253,7 +3282,8 @@ sctp_disposition_t sctp_sf_eat_sack_6_2(struct net *net, * * The return value is the disposition of the chunk. */ -static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, +static enum sctp_disposition sctp_sf_tabort_8_4_8( + struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, const union sctp_subtype type, @@ -3303,12 +3333,12 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_operr_notify(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_operr_notify(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_errhdr *err; @@ -3341,12 +3371,12 @@ sctp_disposition_t sctp_sf_operr_notify(struct net *net, * * The return value is the disposition. 
*/ -sctp_disposition_t sctp_sf_do_9_2_final(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_2_final(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_chunk *reply; @@ -3424,20 +3454,19 @@ nomem: * receiver of the OOTB packet shall discard the OOTB packet and take * no further action. */ -sctp_disposition_t sctp_sf_ootb(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_ootb(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sk_buff *skb = chunk->skb; struct sctp_chunkhdr *ch; struct sctp_errhdr *err; - __u8 *ch_end; - int ootb_shut_ack = 0; int ootb_cookie_ack = 0; + int ootb_shut_ack = 0; + __u8 *ch_end; SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); @@ -3513,16 +3542,17 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, * (endpoint, asoc, type, arg, commands) * * Outputs - * (sctp_disposition_t) + * (enum sctp_disposition) * * The return value is the disposition of the chunk. */ -static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_shut_8_4_5( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_packet *packet = NULL; struct sctp_chunk *chunk = arg; @@ -3579,12 +3609,12 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, * chunks. --piggy ] * */ -sctp_disposition_t sctp_sf_do_8_5_1_E_sa(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -3604,17 +3634,18 @@ sctp_disposition_t sctp_sf_do_8_5_1_E_sa(struct net *net, } /* ADDIP Section 4.2 Upon reception of an ASCONF Chunk. 
*/ -sctp_disposition_t sctp_sf_do_asconf(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_asconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = arg; - struct sctp_chunk *asconf_ack = NULL; - struct sctp_paramhdr *err_param = NULL; - struct sctp_addiphdr *hdr; - __u32 serial; + struct sctp_paramhdr *err_param = NULL; + struct sctp_chunk *asconf_ack = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_addiphdr *hdr; + __u32 serial; if (!sctp_vtag_verify(chunk, asoc)) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, @@ -3721,19 +3752,19 @@ sctp_disposition_t sctp_sf_do_asconf(struct net *net, * When building TLV parameters for the ASCONF Chunk that will add or * delete IP addresses the D0 to D13 rules should be applied: */ -sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *asconf_ack = arg; - struct sctp_chunk *last_asconf = asoc->addip_last_asconf; - struct sctp_chunk *abort; - struct sctp_paramhdr *err_param = NULL; - struct sctp_addiphdr *addip_hdr; - __u32 sent_serial, rcvd_serial; + struct sctp_chunk *last_asconf = asoc->addip_last_asconf; + struct sctp_paramhdr *err_param = NULL; + struct sctp_chunk *asconf_ack = arg; + struct sctp_addiphdr *addip_hdr; + __u32 sent_serial, rcvd_serial; + struct sctp_chunk *abort; if (!sctp_vtag_verify(asconf_ack, asoc)) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, @@ -3840,11 +3871,12 @@ sctp_disposition_t sctp_sf_do_asconf_ack(struct net *net, } /* RE-CONFIG Section 5.2 Upon reception of an RECONF Chunk. */ -sctp_disposition_t sctp_sf_do_reconf(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_reconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_paramhdr *err_param = NULL; struct sctp_chunk *chunk = arg; @@ -3916,15 +3948,15 @@ sctp_disposition_t sctp_sf_do_reconf(struct net *net, * * The return value is the disposition of the chunk. 
*/ -sctp_disposition_t sctp_sf_eat_fwd_tsn(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_eat_fwd_tsn(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = arg; struct sctp_fwdtsn_hdr *fwdtsn_hdr; + struct sctp_chunk *chunk = arg; struct sctp_fwdtsn_skip *skip; __u16 len; __u32 tsn; @@ -3986,16 +4018,16 @@ discard_noforce: return SCTP_DISPOSITION_DISCARD; } -sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_eat_fwd_tsn_fast( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = arg; struct sctp_fwdtsn_hdr *fwdtsn_hdr; + struct sctp_chunk *chunk = arg; struct sctp_fwdtsn_skip *skip; __u16 len; __u32 tsn; @@ -4079,18 +4111,17 @@ gen_shutdown: * The return value is the disposition of the chunk. */ static enum sctp_ierror sctp_sf_authenticate( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - struct sctp_chunk *chunk) + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + struct sctp_chunk *chunk) { struct sctp_authhdr *auth_hdr; + __u8 *save_digest, *digest; struct sctp_hmac *hmac; unsigned int sig_len; __u16 key_id; - __u8 *save_digest; - __u8 *digest; /* Pull in the auth header, so we can do some more verification */ auth_hdr = (struct sctp_authhdr *)chunk->skb->data; @@ -4154,12 +4185,11 @@ nomem: return SCTP_IERROR_NOMEM; } -sctp_disposition_t sctp_sf_eat_auth(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_eat_auth(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_authhdr *auth_hdr; @@ -4251,12 +4281,12 @@ sctp_disposition_t sctp_sf_eat_auth(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_unk_chunk(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_unk_chunk(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *unk_chunk = arg; struct sctp_chunk *err_chunk; @@ -4331,12 +4361,12 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, * * The return value is the disposition of the chunk. 
*/ -sctp_disposition_t sctp_sf_discard_chunk(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_discard_chunk(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -4371,12 +4401,11 @@ sctp_disposition_t sctp_sf_discard_chunk(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_pdiscard(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_pdiscard(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); @@ -4399,12 +4428,12 @@ sctp_disposition_t sctp_sf_pdiscard(struct net *net, * We simply tag the chunk as a violation. The state machine will log * the violation and continue. */ -sctp_disposition_t sctp_sf_violation(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_violation(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -4419,14 +4448,14 @@ sctp_disposition_t sctp_sf_violation(struct net *net, /* * Common function to handle a protocol violation. */ -static sctp_disposition_t sctp_sf_abort_violation( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - void *arg, - struct sctp_cmd_seq *commands, - const __u8 *payload, - const size_t paylen) +static enum sctp_disposition sctp_sf_abort_violation( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + void *arg, + struct sctp_cmd_seq *commands, + const __u8 *payload, + const size_t paylen) { struct sctp_packet *packet = NULL; struct sctp_chunk *chunk = arg; @@ -4536,18 +4565,18 @@ nomem: * * Generate an ABORT chunk and terminate the association. */ -static sctp_disposition_t sctp_sf_violation_chunklen( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_violation_chunklen( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { static const char err_str[] = "The following chunk had invalid length:"; return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, - sizeof(err_str)); + sizeof(err_str)); } /* @@ -4556,17 +4585,17 @@ static sctp_disposition_t sctp_sf_violation_chunklen( * or accumulated length in multi parameters exceeds the end of the chunk, * the length is considered as invalid. 
*/ -static sctp_disposition_t sctp_sf_violation_paramlen( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, void *ext, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_violation_paramlen( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, void *ext, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = arg; struct sctp_paramhdr *param = ext; struct sctp_chunk *abort = NULL; + struct sctp_chunk *chunk = arg; if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc)) goto discard; @@ -4599,18 +4628,18 @@ nomem: * We inform the other end by sending an ABORT with a Protocol Violation * error code. */ -static sctp_disposition_t sctp_sf_violation_ctsn( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_violation_ctsn( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { static const char err_str[] = "The cumulative tsn ack beyond the max tsn currently sent:"; return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, - sizeof(err_str)); + sizeof(err_str)); } /* Handle protocol violation of an invalid chunk bundling. For example, @@ -4619,13 +4648,13 @@ static sctp_disposition_t sctp_sf_violation_ctsn( * statement from the specs. Additionally, there might be an attacker * on the path and we may not want to continue this communication. */ -static sctp_disposition_t sctp_sf_violation_chunk( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition sctp_sf_violation_chunk( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { static const char err_str[] = "The following chunk violates protocol:"; @@ -4633,7 +4662,7 @@ static sctp_disposition_t sctp_sf_violation_chunk( return sctp_sf_violation(net, ep, asoc, type, arg, commands); return sctp_sf_abort_violation(net, ep, asoc, arg, commands, err_str, - sizeof(err_str)); + sizeof(err_str)); } /*************************************************************************** * These are the state functions for handling primitive (Section 10) events. @@ -4695,15 +4724,15 @@ static sctp_disposition_t sctp_sf_violation_chunk( * * The return value is a disposition. */ -sctp_disposition_t sctp_sf_do_prm_asoc(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_prm_asoc(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *repl; struct sctp_association *my_asoc; + struct sctp_chunk *repl; /* The comment below says that we enter COOKIE-WAIT AFTER * sending the INIT, but that doesn't actually work in our @@ -4807,12 +4836,12 @@ nomem: * * The return value is the disposition. 
*/ -sctp_disposition_t sctp_sf_do_prm_send(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_prm_send(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_datamsg *msg = arg; @@ -4846,15 +4875,15 @@ sctp_disposition_t sctp_sf_do_prm_send(struct net *net, * * The return value is the disposition. */ -sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_2_prm_shutdown( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - int disposition; + enum sctp_disposition disposition; /* From 9.2 Shutdown of an Association * Upon receipt of the SHUTDOWN primitive from its upper @@ -4872,6 +4901,7 @@ sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type, arg, commands); } + return disposition; } @@ -4902,13 +4932,13 @@ sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( * * The return value is the disposition. */ -sctp_disposition_t sctp_sf_do_9_1_prm_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_1_prm_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* From 9.1 Abort of an Association * Upon receipt of the ABORT primitive from its upper @@ -4940,12 +4970,12 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( } /* We tried an illegal operation on an association which is closed. */ -sctp_disposition_t sctp_sf_error_closed(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_error_closed(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-EINVAL)); return SCTP_DISPOSITION_CONSUME; @@ -4954,12 +4984,13 @@ sctp_disposition_t sctp_sf_error_closed(struct net *net, /* We tried an illegal operation on an association which is shutting * down. 
*/ -sctp_disposition_t sctp_sf_error_shutdown(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_error_shutdown( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-ESHUTDOWN)); @@ -4980,13 +5011,13 @@ sctp_disposition_t sctp_sf_error_shutdown(struct net *net, * Outputs * (timers) */ -sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_cookie_wait_prm_shutdown( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); @@ -5015,12 +5046,13 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown( * Outputs * (timers) */ -sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_cookie_echoed_prm_shutdown( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* There is a single T1 timer, so we should be able to use * common function with the COOKIE-WAIT state. @@ -5042,13 +5074,13 @@ sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown( * Outputs * (timers) */ -sctp_disposition_t sctp_sf_cookie_wait_prm_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_cookie_wait_prm_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *abort = arg; @@ -5091,13 +5123,13 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( * Outputs * (timers) */ -sctp_disposition_t sctp_sf_cookie_echoed_prm_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_cookie_echoed_prm_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* There is a single T1 timer, so we should be able to use * common function with the COOKIE-WAIT state. 
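For reference, the shape of the conversion in isolation: a named enum replaces the anonymous typedef'd one, and state functions return the enum directly, so forward declarations no longer need the typedef. The sketch below is a minimal stand-alone illustration, not kernel code; the demo_* names and the single void *arg parameter are simplified stand-ins for the (net, ep, asoc, type, arg, commands) prototypes in the patch above.

#include <stdio.h>

/* Before (removed by the patch): typedef enum { ... } sctp_disposition_t;
 * After: a named enum usable directly in declarations and return types.
 */
enum sctp_disposition {
	SCTP_DISPOSITION_DISCARD,	/* No further processing. */
	SCTP_DISPOSITION_CONSUME,	/* Process return values normally. */
};

/* State functions return the named enum; the argument list is cut down
 * to a single void * for this sketch.
 */
typedef enum sctp_disposition (demo_state_fn_t)(void *arg);

static enum sctp_disposition demo_discard(void *arg)
{
	(void)arg;		/* A real handler would inspect the chunk. */
	return SCTP_DISPOSITION_DISCARD;
}

/* Dispatch-table entry mirroring struct sctp_sm_table_entry. */
struct demo_sm_table_entry {
	demo_state_fn_t *fn;
	const char *name;
};

int main(void)
{
	struct demo_sm_table_entry entry = { demo_discard, "demo_discard" };
	enum sctp_disposition status = entry.fn(NULL);

	printf("%s returned %d\n", entry.name, status);
	return 0;
}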
@@ -5117,13 +5149,13 @@ sctp_disposition_t sctp_sf_cookie_echoed_prm_abort( * Outputs * (timers) */ -sctp_disposition_t sctp_sf_shutdown_pending_prm_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_shutdown_pending_prm_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* Stop the T5-shutdown guard timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, @@ -5144,13 +5176,13 @@ sctp_disposition_t sctp_sf_shutdown_pending_prm_abort( * Outputs * (timers) */ -sctp_disposition_t sctp_sf_shutdown_sent_prm_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_shutdown_sent_prm_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* Stop the T2-shutdown timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, @@ -5175,13 +5207,13 @@ sctp_disposition_t sctp_sf_shutdown_sent_prm_abort( * Outputs * (timers) */ -sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_shutdown_ack_sent_prm_abort( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { /* The same T2 timer, so we should be able to use * common function with the SHUTDOWN-SENT state. @@ -5211,7 +5243,7 @@ sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort( * o destination transport address - the transport address of the * association on which a heartbeat should be issued. 
*/ -sctp_disposition_t sctp_sf_do_prm_requestheartbeat( +enum sctp_disposition sctp_sf_do_prm_requestheartbeat( struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, @@ -5244,12 +5276,12 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat( * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do A1 to A9 */ -sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_prm_asconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -5261,12 +5293,12 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net, } /* RE-CONFIG Section 5.1 RECONF Chunk Procedures */ -sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_prm_reconf(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; @@ -5279,13 +5311,13 @@ sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net, * * The return value is the disposition of the primitive. */ -sctp_disposition_t sctp_sf_ignore_primitive( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_ignore_primitive( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { pr_debug("%s: primitive type:%d is ignored\n", __func__, type.primitive); @@ -5303,13 +5335,13 @@ sctp_disposition_t sctp_sf_ignore_primitive( * subscribes to this event, if there is no data to be sent or * retransmit, the stack will immediately send up this notification. */ -sctp_disposition_t sctp_sf_do_no_pending_tsn( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_no_pending_tsn( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_ulpevent *event; @@ -5335,13 +5367,13 @@ sctp_disposition_t sctp_sf_do_no_pending_tsn( * * The return value is the disposition. */ -sctp_disposition_t sctp_sf_do_9_2_start_shutdown( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_2_start_shutdown( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *reply; @@ -5405,15 +5437,15 @@ nomem: * * The return value is the disposition. 
*/ -sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_9_2_shutdown_ack( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *chunk = (struct sctp_chunk *) arg; + struct sctp_chunk *chunk = arg; struct sctp_chunk *reply; /* There are 2 ways of getting here: @@ -5479,12 +5511,12 @@ nomem: * * The return value is the disposition of the event. */ -sctp_disposition_t sctp_sf_ignore_other(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_ignore_other(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { pr_debug("%s: the event other type:%d is ignored\n", __func__, type.other); @@ -5507,12 +5539,12 @@ sctp_disposition_t sctp_sf_ignore_other(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_6_3_3_rtx(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_transport *transport = arg; @@ -5595,12 +5627,12 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net, * allow. However, an SCTP transmitter MUST NOT be more aggressive than * the following algorithms allow. 
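The first hunk above also drops an explicit cast: in C, a void * converts implicitly to any object pointer type, so "(struct sctp_chunk *) arg" was pure noise. A standalone illustration:

#include <stdio.h>

struct msg {
	int id;
};

static void handler(void *arg)
{
	struct msg *m = arg;	/* implicit conversion from void *, no cast */

	printf("%d\n", m->id);
}

int main(void)
{
	struct msg m = { .id = 42 };

	handler(&m);
	return 0;
}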
*/ -sctp_disposition_t sctp_sf_do_6_2_sack(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_do_6_2_sack(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { SCTP_INC_STATS(net, SCTP_MIB_DELAY_SACK_EXPIREDS); sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); @@ -5626,16 +5658,17 @@ sctp_disposition_t sctp_sf_do_6_2_sack(struct net *net, * (timers, events) * */ -sctp_disposition_t sctp_sf_t1_init_timer_expire(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_t1_init_timer_expire( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { + int attempts = asoc->init_err_counter + 1; struct sctp_chunk *repl = NULL; struct sctp_bind_addr *bp; - int attempts = asoc->init_err_counter + 1; pr_debug("%s: timer T1 expired (INIT)\n", __func__); @@ -5690,15 +5723,16 @@ sctp_disposition_t sctp_sf_t1_init_timer_expire(struct net *net, * (timers, events) * */ -sctp_disposition_t sctp_sf_t1_cookie_timer_expire(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_t1_cookie_timer_expire( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - struct sctp_chunk *repl = NULL; int attempts = asoc->init_err_counter + 1; + struct sctp_chunk *repl = NULL; pr_debug("%s: timer T1 expired (COOKIE-ECHO)\n", __func__); @@ -5740,12 +5774,13 @@ sctp_disposition_t sctp_sf_t1_cookie_timer_expire(struct net *net, * the T2-Shutdown timer, giving its peer ample opportunity to transmit * all of its queued DATA chunks that have not yet been sent. 
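Beyond the return type, these hunks reorder the local variable blocks into the networking subsystem's customary layout: the longest declaration line first, the shortest last (informally, "reverse Christmas tree"). The change is purely cosmetic; a sketch with made-up names:

#include <stddef.h>

struct demo_transport;	/* an incomplete type is fine for a pointer */

static void demo_declaration_order(void)
{
	struct demo_transport *longest_declaration_first = NULL;
	unsigned int medium_length_next = 0;
	int shortest_last = 0;

	(void)longest_declaration_first;
	(void)medium_length_next;
	(void)shortest_last;
}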
*/ -sctp_disposition_t sctp_sf_t2_timer_expire(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_t2_timer_expire( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *reply = NULL; @@ -5810,13 +5845,13 @@ nomem: * ADDIP Section 4.1 ASCONF CHunk Procedures * If the T4 RTO timer expires the endpoint should do B1 to B5 */ -sctp_disposition_t sctp_sf_t4_timer_expire( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_t4_timer_expire( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = asoc->addip_last_asconf; struct sctp_transport *transport = chunk->transport; @@ -5882,12 +5917,13 @@ sctp_disposition_t sctp_sf_t4_timer_expire( * At the expiration of this timer the sender SHOULD abort the association * by sending an ABORT chunk. */ -sctp_disposition_t sctp_sf_t5_timer_expire(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_t5_timer_expire( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *reply = NULL; @@ -5918,15 +5954,15 @@ nomem: * The work that needs to be done is same as when SHUTDOWN is initiated by * the user. So this routine looks same as sctp_sf_do_9_2_prm_shutdown(). */ -sctp_disposition_t sctp_sf_autoclose_timer_expire( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_autoclose_timer_expire( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { - int disposition; + enum sctp_disposition disposition; SCTP_INC_STATS(net, SCTP_MIB_AUTOCLOSE_EXPIREDS); @@ -5946,6 +5982,7 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire( disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type, arg, commands); } + return disposition; } @@ -5961,12 +5998,11 @@ sctp_disposition_t sctp_sf_autoclose_timer_expire( * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_not_impl(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_not_impl(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { return SCTP_DISPOSITION_NOT_IMPL; } @@ -5979,12 +6015,11 @@ sctp_disposition_t sctp_sf_not_impl(struct net *net, * * The return value is the disposition of the chunk. 
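Several handlers in this area bail out through a shared nomem: label, the kernel's usual goto-based cleanup idiom: error-only labels, resources released in reverse order of acquisition. A self-contained example of the shape (names are ours):

#include <stdlib.h>

static int demo_build_reply(void)
{
	char *hdr, *body;

	hdr = malloc(16);
	if (!hdr)
		goto nomem_hdr;
	body = malloc(64);
	if (!body)
		goto nomem_body;
	/* ... fill and hand off hdr and body ... */
	free(body);
	free(hdr);
	return 0;
nomem_body:
	free(hdr);	/* unwind in reverse order of allocation */
nomem_hdr:
	return -1;
}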
*/ -sctp_disposition_t sctp_sf_bug(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_bug(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, struct sctp_cmd_seq *commands) { return SCTP_DISPOSITION_BUG; } @@ -6000,12 +6035,12 @@ sctp_disposition_t sctp_sf_bug(struct net *net, * * The return value is the disposition of the chunk. */ -sctp_disposition_t sctp_sf_timer_ignore(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +enum sctp_disposition sctp_sf_timer_ignore(struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, + void *arg, + struct sctp_cmd_seq *commands) { pr_debug("%s: timer %d ignored\n", __func__, type.chunk); @@ -6020,9 +6055,9 @@ sctp_disposition_t sctp_sf_timer_ignore(struct net *net, static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk) { struct sctp_sackhdr *sack; + __u16 num_dup_tsns; unsigned int len; __u16 num_blocks; - __u16 num_dup_tsns; /* Protect ourselves from reading too far into * the skb from a bogus sender. @@ -6044,12 +6079,12 @@ static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk) /* Create an ABORT packet to be sent as a response, with the specified * error causes. */ -static struct sctp_packet *sctp_abort_pkt_new(struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - struct sctp_chunk *chunk, - const void *payload, - size_t paylen) +static struct sctp_packet *sctp_abort_pkt_new( + struct net *net, + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + const void *payload, size_t paylen) { struct sctp_packet *packet; struct sctp_chunk *abort; @@ -6086,14 +6121,14 @@ static struct sctp_packet *sctp_abort_pkt_new(struct net *net, } /* Allocate a packet for responding in the OOTB conditions. */ -static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, - const struct sctp_association *asoc, - const struct sctp_chunk *chunk) +static struct sctp_packet *sctp_ootb_pkt_new( + struct net *net, + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) { - struct sctp_packet *packet; struct sctp_transport *transport; - __u16 sport; - __u16 dport; + struct sctp_packet *packet; + __u16 sport, dport; __u32 vtag; /* Get the source and destination port from the inbound packet. 
*/ @@ -6202,18 +6237,17 @@ static int sctp_eat_data(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_cmd_seq *commands) { - struct sctp_datahdr *data_hdr; - struct sctp_chunk *err; - size_t datalen; - enum sctp_verb deliver; - int tmp; - __u32 tsn; struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); - u16 ssn; - u16 sid; + struct sctp_datahdr *data_hdr; + struct sctp_chunk *err; + enum sctp_verb deliver; + size_t datalen; u8 ordered = 0; + u16 ssn, sid; + __u32 tsn; + int tmp; data_hdr = (struct sctp_datahdr *)chunk->skb->data; chunk->subh.data_hdr = data_hdr; -- cgit v1.2.3 From 327c0dab8d1301cd866816de8c7f32eb872cabac Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 11 Aug 2017 10:23:58 +0800 Subject: sctp: fix some indents in sm_make_chunk.c There are some badly indented function definitions in sm_make_chunk.c. They have been there since the beginning; they were probably introduced when the typedef sctp_chunk_t was replaced with struct sctp_chunk. This patchset is a good opportunity to fix them, along with some other badly indented function definitions in sm_make_chunk.c. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 158 ++++++++++----------- net/sctp/sm_make_chunk.c | 348 +++++++++++++++++++++++------------------------ 2 files changed, 249 insertions(+), 257 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 33077f317995..2db3d3a9ce1d 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -184,68 +184,69 @@ __u32 sctp_generate_verification_tag(void); void sctp_populate_tie_tags(__u8 *cookie, __u32 curTag, __u32 hisTag); /* Prototypes for chunk-building functions.
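The hunks that follow realign these prototypes so that continuation lines sit directly under the opening parenthesis, the kernel's preferred style, and name every parameter. In isolation, with made-up types, the target layout is:

struct demo_assoc;
struct demo_chunk;

/* Continuation lines aligned to the opening parenthesis. */
struct demo_chunk *demo_make_cookie_echo(const struct demo_assoc *asoc,
					 const struct demo_chunk *chunk);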
*/ -struct sctp_chunk *sctp_make_init(const struct sctp_association *, - const struct sctp_bind_addr *, - gfp_t gfp, int vparam_len); -struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *, - const struct sctp_chunk *, - const gfp_t gfp, - const int unkparam_len); -struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *, - const struct sctp_chunk *); -struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *, - const struct sctp_chunk *); -struct sctp_chunk *sctp_make_cwr(const struct sctp_association *, +struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, + const struct sctp_bind_addr *bp, + gfp_t gfp, int vparam_len); +struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const gfp_t gfp, const int unkparam_len); +struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, + const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, - const struct sctp_chunk *); -struct sctp_chunk * sctp_make_datafrag_empty(struct sctp_association *, - const struct sctp_sndrcvinfo *sinfo, - int len, const __u8 flags, - __u16 ssn, gfp_t gfp); -struct sctp_chunk *sctp_make_ecne(const struct sctp_association *, - const __u32); -struct sctp_chunk *sctp_make_sack(const struct sctp_association *); + const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, + const struct sctp_sndrcvinfo *sinfo, + int len, const __u8 flags, + __u16 ssn, gfp_t gfp); +struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, + const __u32 lowest_tsn); +struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc); struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, - const struct sctp_chunk *); -struct sctp_chunk *sctp_make_shutdown_complete(const struct sctp_association *, - const struct sctp_chunk *); -void sctp_init_cause(struct sctp_chunk *, __be16 cause, size_t); -struct sctp_chunk *sctp_make_abort(const struct sctp_association *, - const struct sctp_chunk *, - const size_t hint); -struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *, - const struct sctp_chunk *, - __u32 tsn); -struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *, - struct msghdr *, size_t msg_len); -struct sctp_chunk *sctp_make_abort_violation(const struct sctp_association *, - const struct sctp_chunk *, - const __u8 *, - const size_t ); -struct sctp_chunk *sctp_make_violation_paramlen(const struct sctp_association *, - const struct sctp_chunk *, - struct sctp_paramhdr *); -struct sctp_chunk *sctp_make_violation_max_retrans(const struct sctp_association *, - const struct sctp_chunk *); -struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *, - const struct sctp_transport *); -struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *, - const struct sctp_chunk *, - const void *payload, - const size_t paylen); -struct sctp_chunk *sctp_make_op_error(const struct sctp_association *, - const struct sctp_chunk *chunk, - __be16 cause_code, - const void *payload, - size_t paylen, - size_t reserve_tail); - -struct sctp_chunk *sctp_make_asconf_update_ip(struct 
sctp_association *, - union sctp_addr *, - struct sockaddr *, - int, __be16); + const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_shutdown_complete( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk); +void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen); +struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const size_t hint); +struct sctp_chunk *sctp_make_abort_no_data(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + __u32 tsn); +struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, + struct msghdr *msg, size_t msg_len); +struct sctp_chunk *sctp_make_abort_violation( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const __u8 *payload, + const size_t paylen); +struct sctp_chunk *sctp_make_violation_paramlen( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + struct sctp_paramhdr *param); +struct sctp_chunk *sctp_make_violation_max_retrans( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, + const struct sctp_transport *transport); +struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const void *payload, + const size_t paylen); +struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + __be16 cause_code, const void *payload, + size_t paylen, size_t reserve_tail); + +struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, + union sctp_addr *laddr, + struct sockaddr *addrs, + int addrcnt, __be16 flags); struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr); bool sctp_verify_asconf(const struct sctp_association *asoc, @@ -259,27 +260,25 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc); -struct sctp_chunk *sctp_make_strreset_req( - const struct sctp_association *asoc, - __u16 stream_num, __u16 *stream_list, - bool out, bool in); +struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc, + __u16 stream_num, __u16 *stream_list, + bool out, bool in); struct sctp_chunk *sctp_make_strreset_tsnreq( - const struct sctp_association *asoc); + const struct sctp_association *asoc); struct sctp_chunk *sctp_make_strreset_addstrm( - const struct sctp_association *asoc, - __u16 out, __u16 in); -struct sctp_chunk *sctp_make_strreset_resp( - const struct sctp_association *asoc, - __u32 result, __u32 sn); -struct sctp_chunk *sctp_make_strreset_tsnresp( - struct sctp_association *asoc, - __u32 result, __u32 sn, - __u32 sender_tsn, __u32 receiver_tsn); + const struct sctp_association *asoc, + __u16 out, __u16 in); +struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, + __u32 result, __u32 sn); +struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, + __u32 result, __u32 sn, + __u32 sender_tsn, + __u32 receiver_tsn); bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp); -void sctp_chunk_assign_tsn(struct sctp_chunk *); -void sctp_chunk_assign_ssn(struct sctp_chunk *); +void sctp_chunk_assign_tsn(struct sctp_chunk *chunk); +void 
sctp_chunk_assign_ssn(struct sctp_chunk *chunk); /* Prototypes for stream-processing functions. */ struct sctp_chunk *sctp_process_strreset_outreq( @@ -322,11 +321,12 @@ void sctp_generate_proto_unreach_event(unsigned long peer); void sctp_ootb_pkt_free(struct sctp_packet *packet); -struct sctp_association *sctp_unpack_cookie(const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - struct sctp_chunk *chunk, - gfp_t gfp, int *err, - struct sctp_chunk **err_chk_p); +struct sctp_association *sctp_unpack_cookie( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + gfp_t gfp, int *err, + struct sctp_chunk **err_chk_p); /* 3rd level prototypes */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 51de638a88b2..ca8f196b6c6c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -132,8 +132,8 @@ static const struct sctp_paramhdr prsctp_param = { * provided chunk, as most cause codes will be embedded inside an * abort chunk. */ -void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, - size_t paylen) +void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, + size_t paylen) { struct sctp_errhdr err; __u16 len; @@ -151,7 +151,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, * if there isn't enough space in the op error chunk */ static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code, - size_t paylen) + size_t paylen) { struct sctp_errhdr err; __u16 len; @@ -213,32 +213,31 @@ static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code, * Supported Address Types (Note 4) Optional 12 */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, - const struct sctp_bind_addr *bp, - gfp_t gfp, int vparam_len) + const struct sctp_bind_addr *bp, + gfp_t gfp, int vparam_len) { struct net *net = sock_net(asoc->base.sk); + struct sctp_supported_ext_param ext_param; + struct sctp_adaptation_ind_param aiparam; + struct sctp_paramhdr *auth_chunks = NULL; + struct sctp_paramhdr *auth_hmacs = NULL; + struct sctp_supported_addrs_param sat; struct sctp_endpoint *ep = asoc->ep; - struct sctp_inithdr init; - union sctp_params addrs; - size_t chunksize; struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; + struct sctp_inithdr init; + union sctp_params addrs; struct sctp_sock *sp; - struct sctp_supported_addrs_param sat; + __u8 extensions[4]; + size_t chunksize; __be16 types[2]; - struct sctp_adaptation_ind_param aiparam; - struct sctp_supported_ext_param ext_param; int num_ext = 0; - __u8 extensions[4]; - struct sctp_paramhdr *auth_chunks = NULL, - *auth_hmacs = NULL; /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 1: The INIT chunks can contain multiple addresses that * can be IPv4 and/or IPv6 in any combination. */ - retval = NULL; /* Convert the provided bind address list to raw format. 
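One detail worth noticing in the sctp_make_init() hunk above: the stray "retval = NULL;" statement is deleted because the declaration now carries the initializer, making the assignment dead code. Compactly:

#include <stddef.h>

struct demo_chunk;

static struct demo_chunk *demo_make_init(void)
{
	struct demo_chunk *retval = NULL;	/* initialized here... */

	/* ...so a second "retval = NULL;" would be redundant. */
	return retval;
}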
*/ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); @@ -380,26 +379,24 @@ nodata: } struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - gfp_t gfp, int unkparam_len) + const struct sctp_chunk *chunk, + gfp_t gfp, int unkparam_len) { + struct sctp_supported_ext_param ext_param; + struct sctp_adaptation_ind_param aiparam; + struct sctp_paramhdr *auth_chunks = NULL; + struct sctp_paramhdr *auth_random = NULL; + struct sctp_paramhdr *auth_hmacs = NULL; + struct sctp_chunk *retval = NULL; + struct sctp_cookie_param *cookie; struct sctp_inithdr initack; - struct sctp_chunk *retval; union sctp_params addrs; struct sctp_sock *sp; - int addrs_len; - struct sctp_cookie_param *cookie; - int cookie_len; + __u8 extensions[4]; size_t chunksize; - struct sctp_adaptation_ind_param aiparam; - struct sctp_supported_ext_param ext_param; int num_ext = 0; - __u8 extensions[4]; - struct sctp_paramhdr *auth_chunks = NULL, - *auth_hmacs = NULL, - *auth_random = NULL; - - retval = NULL; + int cookie_len; + int addrs_len; /* Note: there may be no addresses to embed. */ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); @@ -562,11 +559,11 @@ nomem_cookie: * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, - const struct sctp_chunk *chunk) + const struct sctp_chunk *chunk) { struct sctp_chunk *retval; - void *cookie; int cookie_len; + void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; @@ -614,7 +611,7 @@ nodata: * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, - const struct sctp_chunk *chunk) + const struct sctp_chunk *chunk) { struct sctp_chunk *retval; @@ -659,8 +656,8 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, - const __u32 lowest_tsn, - const struct sctp_chunk *chunk) + const __u32 lowest_tsn, + const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; @@ -694,7 +691,7 @@ nodata: /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, - const __u32 lowest_tsn) + const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; @@ -715,9 +712,9 @@ nodata: * parameters. However, do not populate the data payload. 
*/ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, - const struct sctp_sndrcvinfo *sinfo, - int data_len, __u8 flags, __u16 ssn, - gfp_t gfp) + const struct sctp_sndrcvinfo *sinfo, + int data_len, __u8 flags, __u16 ssn, + gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; @@ -755,15 +752,15 @@ nodata: */ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) { - struct sctp_chunk *retval; - struct sctp_sackhdr sack; - int len; - __u32 ctsn; - __u16 num_gabs, num_dup_tsns; - struct sctp_association *aptr = (struct sctp_association *)asoc; struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; + struct sctp_association *aptr = (struct sctp_association *)asoc; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; + __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; + struct sctp_chunk *retval; + struct sctp_sackhdr sack; + __u32 ctsn; + int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); @@ -879,7 +876,7 @@ nodata: } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, - const struct sctp_chunk *chunk) + const struct sctp_chunk *chunk) { struct sctp_chunk *retval; @@ -902,8 +899,8 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, } struct sctp_chunk *sctp_make_shutdown_complete( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; @@ -936,8 +933,8 @@ struct sctp_chunk *sctp_make_shutdown_complete( * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - const size_t hint) + const struct sctp_chunk *chunk, + const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; @@ -973,8 +970,9 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, /* Helper to create ABORT with a NO_USER_DATA error. */ struct sctp_chunk *sctp_make_abort_no_data( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk, __u32 tsn) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + __u32 tsn) { struct sctp_chunk *retval; __be32 payload; @@ -1054,8 +1052,8 @@ err_chunk: static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { - void *target; int chunklen = ntohs(chunk->chunk_hdr->length); + void *target; target = skb_put(chunk->skb, len); @@ -1073,10 +1071,10 @@ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. 
*/ struct sctp_chunk *sctp_make_abort_violation( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - const __u8 *payload, - const size_t paylen) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const __u8 *payload, + const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; @@ -1099,14 +1097,14 @@ end: } struct sctp_chunk *sctp_make_violation_paramlen( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - struct sctp_paramhdr *param) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + struct sctp_paramhdr *param) { - struct sctp_chunk *retval; static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); + struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) @@ -1122,12 +1120,12 @@ nodata: } struct sctp_chunk *sctp_make_violation_max_retrans( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) { - struct sctp_chunk *retval; static const char error[] = "Association exceeded its max_retans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); + struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) @@ -1171,8 +1169,9 @@ nodata: } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - const void *payload, const size_t paylen) + const struct sctp_chunk *chunk, + const void *payload, + const size_t paylen) { struct sctp_chunk *retval; @@ -1203,9 +1202,9 @@ nodata: * This routine can be used for containing multiple causes in the chunk. */ static struct sctp_chunk *sctp_make_op_error_space( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - size_t size) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + size_t size) { struct sctp_chunk *retval; @@ -1237,8 +1236,8 @@ nodata: * to report all the errors, if the incoming chunk is large */ static inline struct sctp_chunk *sctp_make_op_error_fixed( - const struct sctp_association *asoc, - const struct sctp_chunk *chunk) + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) { size_t size = asoc ? asoc->pathmtu : 0; @@ -1250,9 +1249,9 @@ static inline struct sctp_chunk *sctp_make_op_error_fixed( /* Create an Operation Error chunk. 
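The payload_len computations above lean on a C detail: sizeof applied to a string array counts the terminating NUL byte, so sizeof(error) is strlen(error) + 1. A runnable reminder:

#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char error[] = "bad length";

	/* prints "11 10": sizeof includes the trailing NUL, strlen does not */
	printf("%zu %zu\n", sizeof(error), strlen(error));
	return 0;
}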
*/ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, - const struct sctp_chunk *chunk, - __be16 cause_code, const void *payload, - size_t paylen, size_t reserve_tail) + const struct sctp_chunk *chunk, + __be16 cause_code, const void *payload, + size_t paylen, size_t reserve_tail) { struct sctp_chunk *retval; @@ -1271,9 +1270,9 @@ nodata: struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) { - struct sctp_chunk *retval; - struct sctp_hmac *hmac_desc; struct sctp_authhdr auth_hdr; + struct sctp_hmac *hmac_desc; + struct sctp_chunk *retval; __u8 *hmac; /* Get the first hmac that the peer told us to use */ @@ -1319,8 +1318,8 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) * */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, - const struct sctp_association *asoc, - struct sock *sk, gfp_t gfp) + const struct sctp_association *asoc, + struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; @@ -1372,11 +1371,11 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, - __u8 type, __u8 flags, int paylen, - gfp_t gfp) + __u8 type, __u8 flags, int paylen, + gfp_t gfp) { - struct sctp_chunk *retval; struct sctp_chunkhdr *chunk_hdr; + struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; @@ -1470,9 +1469,9 @@ void sctp_chunk_put(struct sctp_chunk *ch) */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { - void *target; int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; + void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); @@ -1525,11 +1524,10 @@ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { - struct sctp_datamsg *msg; - struct sctp_chunk *lchunk; struct sctp_stream *stream; - __u16 ssn; - __u16 sid; + struct sctp_chunk *lchunk; + struct sctp_datamsg *msg; + __u16 ssn, sid; if (chunk->has_ssn) return; @@ -1574,8 +1572,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, - struct sctp_chunk *chunk, - gfp_t gfp) + struct sctp_chunk *chunk, + gfp_t gfp) { struct sctp_association *asoc; enum sctp_scope scope; @@ -1602,8 +1600,8 @@ static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, - int *cookie_len, - const __u8 *raw_addrs, int addrs_len) + int *cookie_len, const __u8 *raw_addrs, + int addrs_len) { struct sctp_signed_cookie *cookie; struct sctp_cookie_param *retval; @@ -1690,19 +1688,19 @@ nodata: /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. 
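sctp_addto_chunk() above computes padlen from SCTP_PAD4(), reflecting that SCTP chunks are padded out to 4-byte boundaries. A self-contained sketch of the arithmetic (the macro here is our stand-in, not the kernel's definition):

#include <stdio.h>

#define DEMO_PAD4(x)	(((x) + 3U) & ~3U)	/* round up to a multiple of 4 */

int main(void)
{
	unsigned int chunklen = 13;
	unsigned int padlen = DEMO_PAD4(chunklen) - chunklen;

	printf("pad %u bytes\n", padlen);	/* prints: pad 3 bytes */
	return 0;
}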
*/ struct sctp_association *sctp_unpack_cookie( - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - struct sctp_chunk *chunk, gfp_t gfp, - int *error, struct sctp_chunk **errp) + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, gfp_t gfp, + int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; + int headersize, bodysize, fixed_size; struct sctp_signed_cookie *cookie; + struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; - int headersize, bodysize, fixed_size; __u8 *digest = ep->digest; - unsigned int len; enum sctp_scope scope; - struct sk_buff *skb = chunk->skb; + unsigned int len; ktime_t kt; /* Header size is static data prior to the actual cookie, including @@ -1974,8 +1972,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc, static int sctp_verify_ext_param(struct net *net, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); - int have_auth = 0; int have_asconf = 0; + int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { @@ -2005,10 +2003,10 @@ static int sctp_verify_ext_param(struct net *net, union sctp_params param) } static void sctp_process_ext_param(struct sctp_association *asoc, - union sctp_params param) + union sctp_params param) { - struct net *net = sock_net(asoc->base.sk); __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); + struct net *net = sock_net(asoc->base.sk); int i; for (i = 0; i < num_ext; i++) { @@ -2309,13 +2307,13 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_init_chunk *peer_init, gfp_t gfp) { struct net *net = sock_net(asoc->base.sk); - union sctp_params param; struct sctp_transport *transport; struct list_head *pos, *temp; - struct sctp_af *af; + union sctp_params param; union sctp_addr addr; - char *cookie; + struct sctp_af *af; int src_match = 0; + char *cookie; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. @@ -2499,16 +2497,15 @@ static int sctp_process_param(struct sctp_association *asoc, gfp_t gfp) { struct net *net = sock_net(asoc->base.sk); - union sctp_addr addr; - int i; - __u16 sat; - int retval = 1; - enum sctp_scope scope; - u32 stale; - struct sctp_af *af; + struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; struct sctp_transport *t; - struct sctp_endpoint *ep = asoc->ep; + enum sctp_scope scope; + union sctp_addr addr; + struct sctp_af *af; + int retval = 1, i; + u32 stale; + __u16 sat; /* We maintain all INIT parameters in network byte order all the * time. 
This allows us to not worry about whether the parameters @@ -2806,22 +2803,20 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, * */ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, - union sctp_addr *laddr, - struct sockaddr *addrs, - int addrcnt, - __be16 flags) + union sctp_addr *laddr, + struct sockaddr *addrs, + int addrcnt, __be16 flags) { + union sctp_addr_param addr_param; struct sctp_addip_param param; - struct sctp_chunk *retval; - union sctp_addr_param addr_param; - union sctp_addr *addr; - void *addr_buf; - struct sctp_af *af; - int paramlen = sizeof(param); - int addr_param_len = 0; - int totallen = 0; - int i; - int del_pickup = 0; + int paramlen = sizeof(param); + struct sctp_chunk *retval; + int addr_param_len = 0; + union sctp_addr *addr; + int totallen = 0, i; + int del_pickup = 0; + struct sctp_af *af; + void *addr_buf; /* Get total length of all the address parameters. */ addr_buf = addrs; @@ -2897,12 +2892,12 @@ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { + struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); + union sctp_addr_param addrparam; struct sctp_addip_param param; - struct sctp_chunk *retval; - int len = sizeof(param); - union sctp_addr_param addrparam; - int addrlen; - struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); + struct sctp_chunk *retval; + int len = sizeof(param); + int addrlen; addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) @@ -2946,9 +2941,9 @@ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { - struct sctp_addiphdr asconf; - struct sctp_chunk *retval; - int length = sizeof(asconf) + vparam_len; + struct sctp_addiphdr asconf; + struct sctp_chunk *retval; + int length = sizeof(asconf) + vparam_len; /* Create the chunk. 
*/ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, @@ -2970,10 +2965,10 @@ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, struct sctp_addip_param *asconf_param) { struct sctp_addip_param ack_param; - struct sctp_errhdr err_param; - int asconf_param_len = 0; - int err_param_len = 0; - __be16 response_type; + struct sctp_errhdr err_param; + int asconf_param_len = 0; + int err_param_len = 0; + __be16 response_type; if (SCTP_ERROR_NO_ERROR == err_code) { response_type = SCTP_PARAM_SUCCESS_REPORT; @@ -3011,10 +3006,10 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_chunk *asconf, struct sctp_addip_param *asconf_param) { + union sctp_addr_param *addr_param; struct sctp_transport *peer; - struct sctp_af *af; union sctp_addr addr; - union sctp_addr_param *addr_param; + struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); @@ -3142,8 +3137,8 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_paramhdr **errp) { struct sctp_addip_chunk *addip; - union sctp_params param; bool addr_param_seen = false; + union sctp_params param; addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip, addip_hdr.params) { @@ -3209,16 +3204,15 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { + union sctp_addr_param *addr_param; struct sctp_addip_chunk *addip; + struct sctp_chunk *asconf_ack; bool all_param_pass = true; + struct sctp_addiphdr *hdr; + int length = 0, chunk_len; union sctp_params param; - struct sctp_addiphdr *hdr; - union sctp_addr_param *addr_param; - struct sctp_chunk *asconf_ack; - __be16 err_code; - int length = 0; - int chunk_len; - __u32 serial; + __be16 err_code; + __u32 serial; addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - @@ -3295,12 +3289,12 @@ done: static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_addip_param *asconf_param) { - struct sctp_af *af; - union sctp_addr addr; struct sctp_bind_addr *bp = &asoc->base.bind_addr; union sctp_addr_param *addr_param; - struct sctp_transport *transport; struct sctp_sockaddr_entry *saddr; + struct sctp_transport *transport; + union sctp_addr addr; + struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); @@ -3357,10 +3351,10 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, int no_err) { struct sctp_addip_param *asconf_ack_param; - struct sctp_errhdr *err_param; - int length; - int asconf_ack_len; - __be16 err_code; + struct sctp_errhdr *err_param; + int asconf_ack_len; + __be16 err_code; + int length; if (no_err) err_code = SCTP_ERROR_NO_ERROR; @@ -3409,15 +3403,15 @@ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack) { - struct sctp_chunk *asconf = asoc->addip_last_asconf; - union sctp_addr_param *addr_param; - struct sctp_addip_param *asconf_param; - int length = 0; - int asconf_len = asconf->skb->len; - int all_param_pass = 0; - int no_err = 1; - int retval = 0; - __be16 err_code = SCTP_ERROR_NO_ERROR; + struct sctp_chunk *asconf = asoc->addip_last_asconf; + struct sctp_addip_param *asconf_param; + __be16 err_code = SCTP_ERROR_NO_ERROR; + union sctp_addr_param *addr_param; + int asconf_len = asconf->skb->len; + int all_param_pass = 0; + int length 
= 0; + int no_err = 1; + int retval = 0; /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. @@ -3544,9 +3538,8 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ -static struct sctp_chunk *sctp_make_reconf( - const struct sctp_association *asoc, - int length) +static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, + int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; @@ -3597,9 +3590,9 @@ static struct sctp_chunk *sctp_make_reconf( * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( - const struct sctp_association *asoc, - __u16 stream_num, __u16 *stream_list, - bool out, bool in) + const struct sctp_association *asoc, + __u16 stream_num, __u16 *stream_list, + bool out, bool in) { struct sctp_strreset_outreq outreq; __u16 stream_len = stream_num * 2; @@ -3651,7 +3644,7 @@ struct sctp_chunk *sctp_make_strreset_req( * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( - const struct sctp_association *asoc) + const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); @@ -3682,8 +3675,8 @@ struct sctp_chunk *sctp_make_strreset_tsnreq( * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_addstrm( - const struct sctp_association *asoc, - __u16 out, __u16 in) + const struct sctp_association *asoc, + __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); @@ -3727,9 +3720,8 @@ struct sctp_chunk *sctp_make_strreset_addstrm( * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ -struct sctp_chunk *sctp_make_strreset_resp( - const struct sctp_association *asoc, - __u32 result, __u32 sn) +struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, + __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); @@ -3764,10 +3756,10 @@ struct sctp_chunk *sctp_make_strreset_resp( * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ -struct sctp_chunk *sctp_make_strreset_tsnresp( - struct sctp_association *asoc, - __u32 result, __u32 sn, - __u32 sender_tsn, __u32 receiver_tsn) +struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, + __u32 result, __u32 sn, + __u32 sender_tsn, + __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); -- cgit v1.2.3 From 591b6bb605785c12a21e8b07a08a277065b655a5 Mon Sep 17 00:00:00 2001 From: Andrey Korolyov Date: Thu, 10 Aug 2017 13:21:14 +0300 Subject: cs5536: add support for IDE controller variant Several legacy devices, such as Geode-based Cisco ASA appliances and the DB800 development board, have a CS5536 IDE controller with a different PCI ID than the existing one. Using pata_generic is not always feasible: at least the DB800 requires the MSR quirk from pata_cs5536 in order to work with the vendor firmware.
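The fix itself, shown in the patch below, is mechanical: the same driver data is registered under one more PCI device ID. A hedged, self-contained sketch of the id-table pattern (plain C stand-ins for the kernel's pci_device_id machinery; 0x1022 is AMD's PCI vendor ID, and 0x2092/0x209A are the device IDs from this patch):

#include <stdint.h>

struct demo_pci_id {
	uint16_t vendor;
	uint16_t device;
	unsigned long driver_data;
};

/* Two device IDs sharing one driver configuration; the table is
 * zero-terminated, matching the kernel convention. */
static const struct demo_pci_id demo_ids[] = {
	{ 0x1022, 0x209a, 9 },	/* existing CS5536 IDE controller */
	{ 0x1022, 0x2092, 9 },	/* newly added variant */
	{ 0, 0, 0 },
};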
Signed-off-by: Andrey Korolyov CC: stable@vger.kernel.org Signed-off-by: Tejun Heo --- drivers/ata/pata_amd.c | 1 + drivers/ata/pata_cs5536.c | 1 + include/linux/pci_ids.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/ata/pata_amd.c b/drivers/ata/pata_amd.c index 8d4d959a821c..8706533db57b 100644 --- a/drivers/ata/pata_amd.c +++ b/drivers/ata/pata_amd.c @@ -616,6 +616,7 @@ static const struct pci_device_id amd[] = { { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE), 8 }, { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE), 8 }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_IDE), 9 }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_DEV_IDE), 9 }, { }, }; diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c index 6c15a554efbe..dc1255294628 100644 --- a/drivers/ata/pata_cs5536.c +++ b/drivers/ata/pata_cs5536.c @@ -289,6 +289,7 @@ static int cs5536_init_one(struct pci_dev *dev, const struct pci_device_id *id) static const struct pci_device_id cs5536[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_IDE), }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_DEV_IDE), }, { }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c71e532da458..4adf6161ec77 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -576,6 +576,7 @@ #define PCI_DEVICE_ID_AMD_CS5536_EHC 0x2095 #define PCI_DEVICE_ID_AMD_CS5536_UDC 0x2096 #define PCI_DEVICE_ID_AMD_CS5536_UOC 0x2097 +#define PCI_DEVICE_ID_AMD_CS5536_DEV_IDE 0x2092 #define PCI_DEVICE_ID_AMD_CS5536_IDE 0x209A #define PCI_DEVICE_ID_AMD_LX_VIDEO 0x2081 #define PCI_DEVICE_ID_AMD_LX_AES 0x2082 -- cgit v1.2.3 From 861932ecc36063196c80ca3e240502b0f6d0e977 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 9 Aug 2017 14:30:31 +0200 Subject: net: sched: Add helpers to identify classids Offloading drivers need to know which qdisc class a filter is added to. Currently they only need to identify ingress, clsact->ingress and clsact->egress. So provide these helpers. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2579c209ea51..259bc191ba59 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -5,6 +5,7 @@ #include #include #include +#include #define DEFAULT_TX_QUEUE_LEN 1000 @@ -132,4 +133,17 @@ static inline unsigned int psched_mtu(const struct net_device *dev) return dev->mtu + dev->hard_header_len; } +static inline bool is_classid_clsact_ingress(u32 classid) +{ + /* This also returns true for ingress qdisc */ + return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && + TC_H_MIN(classid) != TC_H_MIN(TC_H_MIN_EGRESS); +} + +static inline bool is_classid_clsact_egress(u32 classid) +{ + return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && + TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); +} + #endif -- cgit v1.2.3 From 7690f2a51d8afe51ac97e7fae66b081f192a7158 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 9 Aug 2017 14:30:32 +0200 Subject: net: sched: propagate classid down to offload drivers Drivers need the classid to decide whether they support a specific qdisc+class combination. So propagate it down via the tc_cls_common_offload struct. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0f78e6560b2d..1f1de20e584f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -410,6 +410,7 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + u32 classid; }; static inline void @@ -420,6 +421,7 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; + cls_common->classid = tp->classid; } struct tc_cls_u32_knode { -- cgit v1.2.3 From 237f79d24ebe1eb9b5651b7342ba5cc9d9b8f222 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 9 Aug 2017 14:30:34 +0200 Subject: net: sched: remove handle propagation down to the drivers There is no longer any need to use the handle in drivers, so remove it from the tc_cls_common_offload struct. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 1f1de20e584f..bd9dd79357fe 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -406,7 +406,6 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) #endif /* CONFIG_NET_CLS_IND */ struct tc_cls_common_offload { - u32 handle; u32 chain_index; __be16 protocol; u32 prio; @@ -417,7 +416,6 @@ static inline void tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, const struct tcf_proto *tp) { - cls_common->handle = tp->q->handle; cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; -- cgit v1.2.3 From 7b06e8aed283081010596c98a67f06c595affe51 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 9 Aug 2017 14:30:35 +0200 Subject: net: sched: remove cops->tcf_cl_offload cops->tcf_cl_offload is no longer needed, as drivers check what they can and cannot offload using the classid identification helpers. So remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 14 +++----------- include/net/sch_generic.h | 1 - net/sched/cls_bpf.c | 4 ++-- net/sched/cls_flower.c | 8 ++++---- net/sched/cls_matchall.c | 4 ++-- net/sched/cls_u32.c | 8 ++++---- net/sched/sch_ingress.c | 12 ------------ 7 files changed, 15 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bd9dd79357fe..e80edd8879ef 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -457,19 +457,12 @@ struct tc_cls_u32_offload { }; }; -static inline bool tc_can_offload(const struct net_device *dev, - const struct tcf_proto *tp) +static inline bool tc_can_offload(const struct net_device *dev) { - const struct Qdisc *sch = tp->q; - const struct Qdisc_class_ops *cops = sch->ops->cl_ops; - if (!(dev->features & NETIF_F_HW_TC)) return false; if (!dev->netdev_ops->ndo_setup_tc) return false; - if (cops && cops->tcf_cl_offload) - return cops->tcf_cl_offload(tp->classid); - return true; } @@ -478,12 +471,11 @@ static inline bool tc_skip_hw(u32 flags) return (flags & TCA_CLS_FLAGS_SKIP_HW) ?
true : false; } -static inline bool tc_should_offload(const struct net_device *dev, - const struct tcf_proto *tp, u32 flags) +static inline bool tc_should_offload(const struct net_device *dev, u32 flags) { if (tc_skip_hw(flags)) return false; - return tc_can_offload(dev, tp); + return tc_can_offload(dev); } static inline bool tc_skip_sw(u32 flags) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e79f5ad1c5f3..5865db91976b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -156,7 +156,6 @@ struct Qdisc_class_ops { /* Filter manipulation */ struct tcf_block * (*tcf_block)(struct Qdisc *, unsigned long); - bool (*tcf_cl_offload)(u32 classid); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index db17b68df94e..6f2dffe30f25 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -178,7 +178,7 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, (oldprog && tc_skip_sw(oldprog->gen_flags)); if (oldprog && oldprog->offloaded) { - if (tc_should_offload(dev, tp, prog->gen_flags)) { + if (tc_should_offload(dev, prog->gen_flags)) { cmd = TC_CLSBPF_REPLACE; } else if (!tc_skip_sw(prog->gen_flags)) { obj = oldprog; @@ -187,7 +187,7 @@ static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, return -EINVAL; } } else { - if (!tc_should_offload(dev, tp, prog->gen_flags)) + if (!tc_should_offload(dev, prog->gen_flags)) return skip_sw ? -EINVAL : 0; cmd = TC_CLSBPF_ADD; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d2551a03c542..052e902dc71c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -227,7 +227,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = f->hw_dev; - if (!tc_can_offload(dev, tp)) + if (!tc_can_offload(dev)) return; tc_cls_common_offload_init(&cls_flower.common, tp); @@ -246,9 +246,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, struct tc_cls_flower_offload cls_flower = {}; int err; - if (!tc_can_offload(dev, tp)) { + if (!tc_can_offload(dev)) { if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || - (f->hw_dev && !tc_can_offload(f->hw_dev, tp))) { + (f->hw_dev && !tc_can_offload(f->hw_dev))) { f->hw_dev = dev; return tc_skip_sw(f->flags) ? 
-EINVAL : 0; } @@ -281,7 +281,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = f->hw_dev; - if (!tc_can_offload(dev, tp)) + if (!tc_can_offload(dev)) return; tc_cls_common_offload_init(&cls_flower.common, tp); diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index d44e26fdae84..d4dc387f7a56 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -92,7 +92,7 @@ static void mall_destroy(struct tcf_proto *tp) if (!head) return; - if (tc_should_offload(dev, tp, head->flags)) + if (tc_should_offload(dev, head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head); call_rcu(&head->rcu, mall_destroy_rcu); @@ -172,7 +172,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (err) goto err_set_parms; - if (tc_should_offload(dev, tp, flags)) { + if (tc_should_offload(dev, flags)) { err = mall_replace_hw_filter(tp, new, (unsigned long) new); if (err) { if (tc_skip_sw(flags)) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 5a3f78181526..af22742d2847 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -433,7 +433,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload cls_u32 = {}; - if (!tc_should_offload(dev, tp, 0)) + if (!tc_should_offload(dev, 0)) return; tc_cls_common_offload_init(&cls_u32.common, tp); @@ -450,7 +450,7 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, struct tc_cls_u32_offload cls_u32 = {}; int err; - if (!tc_should_offload(dev, tp, flags)) + if (!tc_should_offload(dev, flags)) return tc_skip_sw(flags) ? -EINVAL : 0; tc_cls_common_offload_init(&cls_u32.common, tp); @@ -471,7 +471,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload cls_u32 = {}; - if (!tc_should_offload(dev, tp, 0)) + if (!tc_should_offload(dev, 0)) return; tc_cls_common_offload_init(&cls_u32.common, tp); @@ -490,7 +490,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, struct tc_cls_u32_offload cls_u32 = {}; int err; - if (!tc_should_offload(dev, tp, flags)) + if (!tc_should_offload(dev, flags)) return tc_skip_sw(flags) ? 
-EINVAL : 0; tc_cls_common_offload_init(&cls_u32.common, tp); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index d8a9bebcab90..a15c543c3569 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -32,11 +32,6 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid) { return TC_H_MIN(classid) + 1; } -static bool ingress_cl_offload(u32 classid) -{ - return true; -} - static unsigned long ingress_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { @@ -103,7 +98,6 @@ static const struct Qdisc_class_ops ingress_class_ops = { .put = ingress_put, .walk = ingress_walk, .tcf_block = ingress_tcf_block, - .tcf_cl_offload = ingress_cl_offload, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, }; @@ -134,11 +128,6 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid) { } } -static bool clsact_cl_offload(u32 classid) -{ - return TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS); -} - static unsigned long clsact_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { @@ -198,7 +187,6 @@ static const struct Qdisc_class_ops clsact_class_ops = { .put = ingress_put, .walk = ingress_walk, .tcf_block = clsact_tcf_block, - .tcf_cl_offload = clsact_cl_offload, .bind_tcf = clsact_bind_filter, .unbind_tcf = ingress_put, }; -- cgit v1.2.3 From ad729bc9acfb7c47112964b4877ef5404578ed13 Mon Sep 17 00:00:00 2001 From: Andreas Born Date: Thu, 10 Aug 2017 06:41:44 +0200 Subject: bonding: require speed/duplex only for 802.3ad, alb and tlb The patch c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") sets the link state to down if bond_update_speed_duplex() cannot retrieve speed and duplex settings. Presumably the patch was written with 802.3ad mode in mind, which relies on link speed/duplex settings. For other modes like active-backup these settings are not required. Thus, for those other modes only, this patch reintroduces support for slaves that do not support reporting speed or duplex, such as wireless devices. This fixes the regression reported in bug 196547 (https://bugzilla.kernel.org/show_bug.cgi?id=196547). Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") Signed-off-by: Andreas Born Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller
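For illustration, a standalone sketch of the mode check introduced below (the real helpers take a const struct bonding *; modes are reduced to an enum here, so names and values are illustrative):

#include <stdbool.h>
#include <stdio.h>

enum bond_mode {
	BOND_MODE_8023AD,
	BOND_MODE_TLB,
	BOND_MODE_ALB,
	BOND_MODE_ACTIVEBACKUP,
};

static bool bond_is_lb(enum bond_mode mode)
{
	return mode == BOND_MODE_TLB || mode == BOND_MODE_ALB;
}

/* Only 802.3ad, TLB and ALB actually consume speed/duplex, so only
 * these modes force the link down when the query fails. */
static bool bond_needs_speed_duplex(enum bond_mode mode)
{
	return mode == BOND_MODE_8023AD || bond_is_lb(mode);
}

int main(void)
{
	/* A wireless slave that cannot report speed/duplex stays up in
	 * active-backup, but would be taken down in 802.3ad. */
	printf("%d\n", bond_needs_speed_duplex(BOND_MODE_8023AD));      /* 1 */
	printf("%d\n", bond_needs_speed_duplex(BOND_MODE_ACTIVEBACKUP)); /* 0 */
	return 0;
}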
--- drivers/net/bonding/bond_main.c | 6 ++++-- include/net/bonding.h | 5 +++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9bee6c1c70cc..85bb272d2a34 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1569,7 +1569,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) new_slave->delay = 0; new_slave->link_failure_count = 0; - if (bond_update_speed_duplex(new_slave)) + if (bond_update_speed_duplex(new_slave) && + bond_needs_speed_duplex(bond)) new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - @@ -2140,7 +2141,8 @@ static void bond_miimon_commit(struct bonding *bond) continue; case BOND_LINK_UP: - if (bond_update_speed_duplex(slave)) { + if (bond_update_speed_duplex(slave) && + bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; netdev_warn(bond->dev, "failed to get link speed/duplex for %s\n", diff --git a/include/net/bonding.h b/include/net/bonding.h index b00508d22e0a..b2e68657a216 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -277,6 +277,11 @@ static inline bool bond_is_lb(const struct bonding *bond) BOND_MODE(bond) == BOND_MODE_ALB; } +static inline bool bond_needs_speed_duplex(const struct bonding *bond) +{ + return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); +} + static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_TLB) && -- cgit v1.2.3 From e4dde4127396f0c8f1c2e11b3ecc5baf4f8628bf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 11 Aug 2017 18:31:24 +0200 Subject: net: fix compilation when busy poll is not enabled MIN_NAPI_ID is used in various places outside of CONFIG_NET_RX_BUSY_POLL wrapping, so when it's not set we run into build errors such as: net/core/dev.c: In function 'dev_get_by_napi_id': net/core/dev.c:886:16: error: ‘MIN_NAPI_ID’ undeclared (first use in this function) if (napi_id < MIN_NAPI_ID) ^~~~~~~~~~~ Thus, have MIN_NAPI_ID always defined to fix these errors. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/busy_poll.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 8ffd434676b7..71c72a939bf8 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -29,18 +29,18 @@ #include #include -#ifdef CONFIG_NET_RX_BUSY_POLL - -struct napi_struct; -extern unsigned int sysctl_net_busy_read __read_mostly; -extern unsigned int sysctl_net_busy_poll __read_mostly; - /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +#ifdef CONFIG_NET_RX_BUSY_POLL + +struct napi_struct; +extern unsigned int sysctl_net_busy_read __read_mostly; +extern unsigned int sysctl_net_busy_poll __read_mostly; + static inline bool net_busy_loop_on(void) { return sysctl_net_busy_poll; -- cgit v1.2.3 From fd851ba9caa9a63fdbb72a2e6ed5560c0989e999 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Aug 2017 10:48:53 -0700 Subject: udp: harden copy_linear_skb() syzkaller got crashes with CONFIG_HARDENED_USERCOPY=y configs. 
The issue here is that recvfrom() can be used with a user buffer of Z bytes and an SO_PEEK_OFF of X bytes, on an skb with Y bytes, under the following condition: Z < X < Y kernel BUG at mm/usercopy.c:72! invalid opcode: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 0 PID: 2917 Comm: syzkaller842281 Not tainted 4.13.0-rc3+ #16 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d2fa40c0 task.stack: ffff8801d1fe8000 RIP: 0010:report_usercopy mm/usercopy.c:64 [inline] RIP: 0010:__check_object_size+0x3ad/0x500 mm/usercopy.c:264 RSP: 0018:ffff8801d1fef8a8 EFLAGS: 00010286 RAX: 0000000000000078 RBX: ffffffff847102c0 RCX: 0000000000000000 RDX: 0000000000000078 RSI: 1ffff1003a3fded5 RDI: ffffed003a3fdf09 RBP: ffff8801d1fef998 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d1ea480e R13: fffffffffffffffa R14: ffffffff84710280 R15: dffffc0000000000 FS: 0000000001360880(0000) GS:ffff8801dc000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000202ecfe4 CR3: 00000001d1ff8000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: check_object_size include/linux/thread_info.h:108 [inline] check_copy_size include/linux/thread_info.h:139 [inline] copy_to_iter include/linux/uio.h:105 [inline] copy_linear_skb include/net/udp.h:371 [inline] udpv6_recvmsg+0x1040/0x1af0 net/ipv6/udp.c:395 inet_recvmsg+0x14c/0x5f0 net/ipv4/af_inet.c:793 sock_recvmsg_nosec net/socket.c:792 [inline] sock_recvmsg+0xc9/0x110 net/socket.c:799 SYSC_recvfrom+0x2d6/0x570 net/socket.c:1788 SyS_recvfrom+0x40/0x50 net/socket.c:1760 entry_SYSCALL_64_fastpath+0x1f/0xbe Fixes: b65ac44674dd ("udp: try to avoid 2 cache miss on dequeue") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Signed-off-by: David S. Miller
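To make the failing condition concrete, a userspace sketch of the hardened helper (copy_to_iter() is stubbed out; only the added arithmetic check matters):

#include <stdio.h>

#define EINVAL 22

/* Reduced model of copy_linear_skb(): len is the user buffer size (Z)
 * and off the SO_PEEK_OFF offset (X); with Z < X the subtraction goes
 * negative before the skb payload (Y) is even touched. */
static int copy_linear_skb(int len, int off)
{
	int copy = len - off;

	if (copy < 0)   /* the check this patch adds */
		return -EINVAL;
	/* real code: n = copy_to_iter(skb->data + off, copy, to); ... */
	return 0;
}

int main(void)
{
	/* Z = 100, X = 200 (Y = 300 in the skb): Z < X < Y */
	printf("%d\n", copy_linear_skb(100, 200)); /* -22, instead of BUG() */
	return 0;
}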
--- include/net/udp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index cc8036987dcb..e9b1d1eacb59 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -368,6 +368,8 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, { int n, copy = len - off; + if (copy < 0) + return -EINVAL; n = copy_to_iter(skb->data + off, copy, to); if (n == copy) return 0; -- cgit v1.2.3 From 45fff14bab00eea97e0f60a2a5ba3eca97bd476a Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Thu, 3 Aug 2017 11:14:13 +0200 Subject: iio: trigger: stm32-timer: add support for STM32H7 Add support for STM32H7 timer triggers: - Add new valids_table - Introduce compatible, with configuration data - Extend up to 15 timers, available on STM32H7 Signed-off-by: Fabrice Gasnier Signed-off-by: Jonathan Cameron --- drivers/iio/trigger/stm32-timer-trigger.c | 55 +++++++++++++++++++++++++-- include/linux/iio/timer/stm32-timer-trigger.h | 2 + 2 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index d22bc56dd9fc..33890f9ad11b 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -13,6 +13,7 @@ #include #include #include +#include #define MAX_TRIGGERS 7 #define MAX_VALIDS 5 @@ -31,6 +32,9 @@ static const void *triggers_table[][MAX_TRIGGERS] = { { }, /* timer 10 */ { }, /* timer 11 */ { TIM12_TRGO, TIM12_CH1, TIM12_CH2,}, + { }, /* timer 13 */ + { }, /* timer 14 */ + { TIM15_TRGO,}, }; /* List the triggers accepted by each timer */ @@ -49,6 +53,24 @@ static const void *valids_table[][MAX_VALIDS] = { { TIM4_TRGO, TIM5_TRGO,}, }; +static const void *stm32h7_valids_table[][MAX_VALIDS] = { + { TIM15_TRGO, TIM2_TRGO, TIM3_TRGO, TIM4_TRGO,}, + { TIM1_TRGO, TIM8_TRGO, TIM3_TRGO, TIM4_TRGO,}, + { TIM1_TRGO, TIM2_TRGO, TIM15_TRGO, TIM4_TRGO,}, + { TIM1_TRGO, TIM2_TRGO, TIM3_TRGO, TIM8_TRGO,}, + { TIM1_TRGO, TIM8_TRGO, TIM3_TRGO, TIM4_TRGO,}, + { }, /* timer 6 */ + { }, /* timer 7 */ + { TIM1_TRGO, TIM2_TRGO, TIM4_TRGO, TIM5_TRGO,}, + { }, /* timer 9 */ + { }, /* timer 10 */ + { }, /* timer 11 */ + { TIM4_TRGO, TIM5_TRGO,}, + { }, /* timer 13 */ + { }, /* timer 14 */ + { TIM1_TRGO, TIM3_TRGO,}, +}; + struct stm32_timer_trigger { struct device *dev; struct regmap *regmap; @@ -59,6 +81,11 @@ struct stm32_timer_trigger { bool has_trgo2; }; +struct stm32_timer_trigger_cfg { + const void *(*valids_table)[MAX_VALIDS]; + const unsigned int num_valids_table; +}; + static bool stm32_timer_is_trgo2_name(const char *name) { return !!strstr(name, "trgo2"); } @@ -734,18 +761,22 @@ static int stm32_timer_trigger_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct stm32_timer_trigger *priv; struct stm32_timers *ddata = dev_get_drvdata(pdev->dev.parent); + const struct stm32_timer_trigger_cfg *cfg; unsigned int index; int ret; if (of_property_read_u32(dev->of_node, "reg", &index)) return -EINVAL; + cfg = (const struct stm32_timer_trigger_cfg *) + of_match_device(dev->driver->of_match_table, dev)->data; + if (index >= ARRAY_SIZE(triggers_table) || - index >= ARRAY_SIZE(valids_table)) + index >= cfg->num_valids_table) return -EINVAL; /* Create an IIO device only if we have triggers to be validated */ - if (*valids_table[index]) + if (*cfg->valids_table[index]) priv = stm32_setup_counter_device(dev); else priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); @@ -758,7 +789,7 @@ 
static int stm32_timer_trigger_probe(struct platform_device *pdev) priv->clk = ddata->clk; priv->max_arr = ddata->max_arr; priv->triggers = triggers_table[index]; - priv->valids = valids_table[index]; + priv->valids = cfg->valids_table[index]; stm32_timer_detect_trgo2(priv); ret = stm32_setup_iio_triggers(priv); @@ -770,8 +801,24 @@ static int stm32_timer_trigger_probe(struct platform_device *pdev) return 0; } +static const struct stm32_timer_trigger_cfg stm32_timer_trg_cfg = { + .valids_table = valids_table, + .num_valids_table = ARRAY_SIZE(valids_table), +}; + +static const struct stm32_timer_trigger_cfg stm32h7_timer_trg_cfg = { + .valids_table = stm32h7_valids_table, + .num_valids_table = ARRAY_SIZE(stm32h7_valids_table), +}; + static const struct of_device_id stm32_trig_of_match[] = { - { .compatible = "st,stm32-timer-trigger", }, + { + .compatible = "st,stm32-timer-trigger", + .data = (void *)&stm32_timer_trg_cfg, + }, { + .compatible = "st,stm32h7-timer-trigger", + .data = (void *)&stm32h7_timer_trg_cfg, + }, { /* end node */ }, }; MODULE_DEVICE_TABLE(of, stm32_trig_of_match); diff --git a/include/linux/iio/timer/stm32-timer-trigger.h b/include/linux/iio/timer/stm32-timer-trigger.h index fa7d786ed99e..20f465353a7c 100644 --- a/include/linux/iio/timer/stm32-timer-trigger.h +++ b/include/linux/iio/timer/stm32-timer-trigger.h @@ -59,6 +59,8 @@ #define TIM12_CH1 "tim12_ch1" #define TIM12_CH2 "tim12_ch2" +#define TIM15_TRGO "tim15_trgo" + bool is_stm32_timer_trigger(struct iio_trigger *trig); #endif -- cgit v1.2.3 From 25892d6eac9cb752af95bcd539ed5e1158eba760 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Thu, 3 Aug 2017 11:14:14 +0200 Subject: iio: trigger: stm32-timer: add output compare triggers Add output compare trigger sources available on some instances. Signed-off-by: Fabrice Gasnier Signed-off-by: Jonathan Cameron
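The name matching this patch switches to can be exercised on its own; a standalone sketch whose two helpers are copied from the patch below (the surrounding harness is illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool stm32_timer_is_trgo2_name(const char *name)
{
	return !!strstr(name, "trgo2");
}

/* "trgo" is a substring of "trgo2", hence the second strstr() test. */
static bool stm32_timer_is_trgo_name(const char *name)
{
	return (!!strstr(name, "trgo") && !strstr(name, "trgo2"));
}

int main(void)
{
	const char *names[] = { "tim1_trgo", "tim1_trgo2", "tim10_oc1" };

	for (unsigned int i = 0; i < 3; i++)
		printf("%-10s trgo=%d trgo2=%d\n", names[i],
		       stm32_timer_is_trgo_name(names[i]),
		       stm32_timer_is_trgo2_name(names[i]));
	return 0; /* tim1_trgo: 1/0, tim1_trgo2: 0/1, tim10_oc1: 0/0 */
}

With this, the sampling-frequency and master-mode attributes follow the trgo/trgo2 names rather than list position, which is what lets the new OC1 entries slot into the trigger tables freely.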
--- drivers/iio/trigger/stm32-timer-trigger.c | 31 +++++++++++++++++---------- include/linux/iio/timer/stm32-timer-trigger.h | 12 +++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index 33890f9ad11b..a9bc5b603b86 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -29,12 +29,14 @@ static const void *triggers_table[][MAX_TRIGGERS] = { { TIM7_TRGO,}, { TIM8_TRGO, TIM8_TRGO2, TIM8_CH1, TIM8_CH2, TIM8_CH3, TIM8_CH4,}, { TIM9_TRGO, TIM9_CH1, TIM9_CH2,}, - { }, /* timer 10 */ - { }, /* timer 11 */ + { TIM10_OC1,}, + { TIM11_OC1,}, { TIM12_TRGO, TIM12_CH1, TIM12_CH2,}, - { }, /* timer 13 */ - { }, /* timer 14 */ + { TIM13_OC1,}, + { TIM14_OC1,}, { TIM15_TRGO,}, + { TIM16_OC1,}, + { TIM17_OC1,}, }; /* List the triggers accepted by each timer */ @@ -47,10 +49,10 @@ static const void *valids_table[][MAX_VALIDS] = { { }, /* timer 6 */ { }, /* timer 7 */ { TIM1_TRGO, TIM2_TRGO, TIM4_TRGO, TIM5_TRGO,}, - { TIM2_TRGO, TIM3_TRGO,}, + { TIM2_TRGO, TIM3_TRGO, TIM10_OC1, TIM11_OC1,}, { }, /* timer 10 */ { }, /* timer 11 */ - { TIM4_TRGO, TIM5_TRGO,}, + { TIM4_TRGO, TIM5_TRGO, TIM13_OC1, TIM14_OC1,}, }; static const void *stm32h7_valids_table[][MAX_VALIDS] = { @@ -65,10 +67,12 @@ static const void *stm32h7_valids_table[][MAX_VALIDS] = { { }, /* timer 9 */ { }, /* timer 10 */ { }, /* timer 11 */ - { TIM4_TRGO, TIM5_TRGO,}, + { TIM4_TRGO, TIM5_TRGO, TIM13_OC1, TIM14_OC1,}, { }, /* timer 13 */ { }, /* timer 14 */ - { TIM1_TRGO, TIM3_TRGO,}, + { TIM1_TRGO, TIM3_TRGO, TIM16_OC1, TIM17_OC1,}, + { }, /* timer 16 */ + { }, /* timer 17 */ }; struct stm32_timer_trigger { @@ -91,6 +95,11 @@ static bool stm32_timer_is_trgo2_name(const char *name) return !!strstr(name, "trgo2"); } +static bool stm32_timer_is_trgo_name(const char *name) +{ + return (!!strstr(name, "trgo") && !strstr(name, "trgo2")); +} + static int stm32_timer_start(struct stm32_timer_trigger *priv, struct iio_trigger *trig, unsigned int frequency) @@ -355,6 +364,7 @@ static int stm32_setup_iio_triggers(struct stm32_timer_trigger *priv) while (cur && *cur) { struct iio_trigger *trig; + bool cur_is_trgo = stm32_timer_is_trgo_name(*cur); bool cur_is_trgo2 = stm32_timer_is_trgo2_name(*cur); if (cur_is_trgo2 && !priv->has_trgo2) { @@ -371,10 +381,9 @@ static int stm32_setup_iio_triggers(struct stm32_timer_trigger *priv) /* * sampling frequency and master mode attributes - * should only be available on trgo trigger which - * is always the first in the list. 
+ * should only be available on trgo/trgo2 triggers */ - if (cur == priv->triggers || cur_is_trgo2) + if (cur_is_trgo || cur_is_trgo2) trig->dev.groups = stm32_trigger_attr_groups; iio_trigger_set_drvdata(trig, priv); diff --git a/include/linux/iio/timer/stm32-timer-trigger.h b/include/linux/iio/timer/stm32-timer-trigger.h index 20f465353a7c..d68add80ab86 100644 --- a/include/linux/iio/timer/stm32-timer-trigger.h +++ b/include/linux/iio/timer/stm32-timer-trigger.h @@ -55,12 +55,24 @@ #define TIM9_CH1 "tim9_ch1" #define TIM9_CH2 "tim9_ch2" +#define TIM10_OC1 "tim10_oc1" + +#define TIM11_OC1 "tim11_oc1" + #define TIM12_TRGO "tim12_trgo" #define TIM12_CH1 "tim12_ch1" #define TIM12_CH2 "tim12_ch2" +#define TIM13_OC1 "tim13_oc1" + +#define TIM14_OC1 "tim14_oc1" + #define TIM15_TRGO "tim15_trgo" +#define TIM16_OC1 "tim16_oc1" + +#define TIM17_OC1 "tim17_oc1" + bool is_stm32_timer_trigger(struct iio_trigger *trig); #endif -- cgit v1.2.3 From d4092d76a4a4e57b65910899948a83cc8646c5a5 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 4 Aug 2017 17:29:10 +0200 Subject: mtd: nand: Rename nand.h into rawnand.h We are planning to share more code between different NAND based devices (SPI NAND, OneNAND and raw NANDs), but before doing that we need to move the existing include/linux/mtd/nand.h file into include/linux/mtd/rawnand.h so we can later create a nand.h header containing all common structure and function prototypes. Signed-off-by: Boris Brezillon Signed-off-by: Peter Pan Acked-by: Vladimir Zapolskiy Acked-by: Alexander Sverdlin Acked-by: Wenyou Yang Acked-by: Krzysztof Kozlowski Acked-by: Han Xu Acked-by: H Hartley Sweeten Acked-by: Shawn Guo Acked-by: Gregory CLEMENT Acked-by: Neil Armstrong Acked-by: Masahiro Yamada Acked-By: Harvey Hunt Acked-by: Tony Lindgren Acked-by: Krzysztof Halasa --- Documentation/driver-api/mtdnand.rst | 8 +- MAINTAINERS | 2 +- arch/arm/mach-davinci/board-da850-evm.c | 2 +- arch/arm/mach-davinci/board-dm355-evm.c | 2 +- arch/arm/mach-davinci/board-dm355-leopard.c | 2 +- arch/arm/mach-davinci/board-dm365-evm.c | 2 +- arch/arm/mach-davinci/board-dm644x-evm.c | 2 +- arch/arm/mach-davinci/board-dm646x-evm.c | 2 +- arch/arm/mach-davinci/board-sffsdr.c | 2 +- arch/arm/mach-dove/dove-db-setup.c | 2 +- arch/arm/mach-ep93xx/snappercl15.c | 2 +- arch/arm/mach-ep93xx/ts72xx.c | 2 +- arch/arm/mach-imx/mach-qong.c | 2 +- arch/arm/mach-ixp4xx/ixdp425-setup.c | 2 +- arch/arm/mach-mmp/aspenite.c | 2 +- arch/arm/mach-omap1/board-fsample.c | 2 +- arch/arm/mach-omap1/board-h2.c | 2 +- arch/arm/mach-omap1/board-h3.c | 2 +- arch/arm/mach-omap1/board-nand.c | 2 +- arch/arm/mach-omap1/board-perseus2.c | 2 +- arch/arm/mach-orion5x/db88f5281-setup.c | 2 +- arch/arm/mach-orion5x/kurobox_pro-setup.c | 2 +- arch/arm/mach-orion5x/ts209-setup.c | 2 +- arch/arm/mach-orion5x/ts78xx-setup.c | 2 +- arch/arm/mach-pxa/balloon3.c | 2 +- arch/arm/mach-pxa/em-x270.c | 2 +- arch/arm/mach-pxa/eseries.c | 2 +- arch/arm/mach-pxa/palmtx.c | 2 +- arch/arm/mach-pxa/tosa.c | 2 +- arch/arm/mach-s3c24xx/common-smdk.c | 2 +- arch/arm/mach-s3c24xx/mach-anubis.c | 2 +- arch/arm/mach-s3c24xx/mach-at2440evb.c | 2 +- arch/arm/mach-s3c24xx/mach-bast.c | 2 +- arch/arm/mach-s3c24xx/mach-gta02.c | 2 +- arch/arm/mach-s3c24xx/mach-jive.c | 2 +- arch/arm/mach-s3c24xx/mach-mini2440.c | 2 +- arch/arm/mach-s3c24xx/mach-osiris.c | 2 +- arch/arm/mach-s3c24xx/mach-qt2410.c | 2 +- arch/arm/mach-s3c24xx/mach-rx3715.c | 2 +- arch/arm/mach-s3c24xx/mach-vstms.c | 2 +- arch/blackfin/mach-bf537/boards/dnp5370.c | 2 +- 
arch/blackfin/mach-bf537/boards/stamp.c | 2 +- arch/blackfin/mach-bf561/boards/acvilon.c | 2 +- arch/cris/arch-v32/drivers/mach-a3/nandflash.c | 2 +- arch/cris/arch-v32/drivers/mach-fs/nandflash.c | 2 +- arch/mips/alchemy/devboards/db1200.c | 2 +- arch/mips/alchemy/devboards/db1300.c | 2 +- arch/mips/alchemy/devboards/db1550.c | 2 +- arch/mips/include/asm/mach-jz4740/jz4740_nand.h | 2 +- arch/mips/netlogic/xlr/platform-flash.c | 2 +- arch/mips/pnx833x/common/platform.c | 2 +- arch/mips/rb532/devices.c | 2 +- arch/sh/boards/mach-migor/setup.c | 2 +- drivers/mtd/inftlcore.c | 2 +- drivers/mtd/nand/ams-delta.c | 2 +- drivers/mtd/nand/atmel/nand-controller.c | 2 +- drivers/mtd/nand/atmel/pmecc.c | 2 +- drivers/mtd/nand/au1550nd.c | 2 +- drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h | 2 +- drivers/mtd/nand/bf5xx_nand.c | 2 +- drivers/mtd/nand/brcmnand/brcmnand.c | 2 +- drivers/mtd/nand/cafe_nand.c | 2 +- drivers/mtd/nand/cmx270_nand.c | 2 +- drivers/mtd/nand/cs553x_nand.c | 2 +- drivers/mtd/nand/davinci_nand.c | 2 +- drivers/mtd/nand/denali.h | 2 +- drivers/mtd/nand/diskonchip.c | 2 +- drivers/mtd/nand/docg4.c | 2 +- drivers/mtd/nand/fsl_elbc_nand.c | 2 +- drivers/mtd/nand/fsl_ifc_nand.c | 2 +- drivers/mtd/nand/fsl_upm.c | 2 +- drivers/mtd/nand/fsmc_nand.c | 2 +- drivers/mtd/nand/gpio.c | 2 +- drivers/mtd/nand/gpmi-nand/gpmi-nand.h | 2 +- drivers/mtd/nand/hisi504_nand.c | 2 +- drivers/mtd/nand/jz4740_nand.c | 2 +- drivers/mtd/nand/jz4780_nand.c | 2 +- drivers/mtd/nand/lpc32xx_mlc.c | 2 +- drivers/mtd/nand/lpc32xx_slc.c | 2 +- drivers/mtd/nand/mpc5121_nfc.c | 2 +- drivers/mtd/nand/mtk_nand.c | 2 +- drivers/mtd/nand/mxc_nand.c | 2 +- drivers/mtd/nand/nand_amd.c | 2 +- drivers/mtd/nand/nand_base.c | 2 +- drivers/mtd/nand/nand_bbt.c | 2 +- drivers/mtd/nand/nand_bch.c | 2 +- drivers/mtd/nand/nand_ecc.c | 2 +- drivers/mtd/nand/nand_hynix.c | 2 +- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_macronix.c | 2 +- drivers/mtd/nand/nand_micron.c | 2 +- drivers/mtd/nand/nand_samsung.c | 2 +- drivers/mtd/nand/nand_timings.c | 2 +- drivers/mtd/nand/nand_toshiba.c | 2 +- drivers/mtd/nand/nandsim.c | 2 +- drivers/mtd/nand/ndfc.c | 2 +- drivers/mtd/nand/nuc900_nand.c | 2 +- drivers/mtd/nand/omap2.c | 2 +- drivers/mtd/nand/orion_nand.c | 2 +- drivers/mtd/nand/oxnas_nand.c | 2 +- drivers/mtd/nand/pasemi_nand.c | 2 +- drivers/mtd/nand/plat_nand.c | 2 +- drivers/mtd/nand/pxa3xx_nand.c | 2 +- drivers/mtd/nand/qcom_nandc.c | 2 +- drivers/mtd/nand/r852.h | 2 +- drivers/mtd/nand/s3c2410.c | 2 +- drivers/mtd/nand/sh_flctl.c | 2 +- drivers/mtd/nand/sharpsl.c | 2 +- drivers/mtd/nand/sm_common.c | 2 +- drivers/mtd/nand/socrates_nand.c | 2 +- drivers/mtd/nand/sunxi_nand.c | 2 +- drivers/mtd/nand/tango_nand.c | 2 +- drivers/mtd/nand/tmio_nand.c | 2 +- drivers/mtd/nand/txx9ndfmc.c | 2 +- drivers/mtd/nand/vf610_nfc.c | 2 +- drivers/mtd/nand/xway_nand.c | 2 +- drivers/mtd/nftlcore.c | 2 +- drivers/mtd/nftlmount.c | 2 +- drivers/mtd/ssfdc.c | 2 +- drivers/mtd/tests/nandbiterrs.c | 2 +- drivers/staging/mt29f_spinand/mt29f_spinand.c | 2 +- fs/jffs2/wbuf.c | 2 +- include/linux/mtd/nand-gpio.h | 2 +- include/linux/mtd/nand.h | 1331 ----------------------- include/linux/mtd/rawnand.h | 1329 ++++++++++++++++++++++ include/linux/mtd/sh_flctl.h | 2 +- include/linux/mtd/sharpsl.h | 2 +- include/linux/platform_data/mtd-davinci.h | 2 +- include/linux/platform_data/mtd-nand-s3c2410.h | 2 +- 129 files changed, 1459 insertions(+), 1461 deletions(-) delete mode 100644 include/linux/mtd/nand.h create mode 100644 
include/linux/mtd/rawnand.h (limited to 'include') diff --git a/Documentation/driver-api/mtdnand.rst b/Documentation/driver-api/mtdnand.rst index e9afa586d15e..2a5191b6d445 100644 --- a/Documentation/driver-api/mtdnand.rst +++ b/Documentation/driver-api/mtdnand.rst @@ -516,7 +516,7 @@ mirrored table is performed. The most important field in the nand_bbt_descr structure is the options field. The options define most of the table properties. Use the -predefined constants from nand.h to define the options. +predefined constants from rawnand.h to define the options. - Number of bits per block @@ -843,7 +843,7 @@ Chip option constants Constants for chip id table ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -These constants are defined in nand.h. They are OR-ed together to +These constants are defined in rawnand.h. They are OR-ed together to describe the chip functionality:: /* Buswitdh is 16 bit */ @@ -865,7 +865,7 @@ describe the chip functionality:: Constants for runtime options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -These constants are defined in nand.h. They are OR-ed together to +These constants are defined in rawnand.h. They are OR-ed together to describe the functionality:: /* The hw ecc generator provides a syndrome instead a ecc value on read @@ -956,7 +956,7 @@ developer. Each struct member has a short description which is marked with an [XXX] identifier. See the chapter "Documentation hints" for an explanation. -.. kernel-doc:: include/linux/mtd/nand.h +.. kernel-doc:: include/linux/mtd/rawnand.h :internal: Public Functions Provided diff --git a/MAINTAINERS b/MAINTAINERS index 205d3977ac46..bffb38373550 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9034,7 +9034,7 @@ T: git git://git.infradead.org/linux-mtd.git nand/fixes T: git git://git.infradead.org/l2-mtd.git nand/next S: Maintained F: drivers/mtd/nand/ -F: include/linux/mtd/nand*.h +F: include/linux/mtd/*nand*.h NATSEMI ETHERNET DRIVER (DP8381x) S: Orphan diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c index b5625d009288..f54410388194 100644 --- a/arch/arm/mach-davinci/board-da850-evm.c +++ b/arch/arm/mach-davinci/board-da850-evm.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-davinci/board-dm355-evm.c b/arch/arm/mach-davinci/board-dm355-evm.c index 18296a99c4d2..62e7bc3018f0 100644 --- a/arch/arm/mach-davinci/board-dm355-evm.c +++ b/arch/arm/mach-davinci/board-dm355-evm.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-davinci/board-dm355-leopard.c b/arch/arm/mach-davinci/board-dm355-leopard.c index 284ff27c1b32..be997243447b 100644 --- a/arch/arm/mach-davinci/board-dm355-leopard.c +++ b/arch/arm/mach-davinci/board-dm355-leopard.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-davinci/board-dm365-evm.c b/arch/arm/mach-davinci/board-dm365-evm.c index 0464999b7137..e75741fb2c1d 100644 --- a/arch/arm/mach-davinci/board-dm365-evm.c +++ b/arch/arm/mach-davinci/board-dm365-evm.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c index 70e00dbeec96..b07c9b18d427 100644 --- a/arch/arm/mach-davinci/board-dm644x-evm.c +++ b/arch/arm/mach-davinci/board-dm644x-evm.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git 
a/arch/arm/mach-davinci/board-dm646x-evm.c b/arch/arm/mach-davinci/board-dm646x-evm.c index 1d76e7480a42..cb0a41e83582 100644 --- a/arch/arm/mach-davinci/board-dm646x-evm.c +++ b/arch/arm/mach-davinci/board-dm646x-evm.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-davinci/board-sffsdr.c b/arch/arm/mach-davinci/board-sffsdr.c index 41c7c9615791..d85accf7f760 100644 --- a/arch/arm/mach-davinci/board-sffsdr.c +++ b/arch/arm/mach-davinci/board-sffsdr.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-dove/dove-db-setup.c b/arch/arm/mach-dove/dove-db-setup.c index bcb678fd2415..8971c3c0f0fe 100644 --- a/arch/arm/mach-dove/dove-db-setup.c +++ b/arch/arm/mach-dove/dove-db-setup.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-ep93xx/snappercl15.c b/arch/arm/mach-ep93xx/snappercl15.c index b2db791b3b38..8b29398f4dc7 100644 --- a/arch/arm/mach-ep93xx/snappercl15.c +++ b/arch/arm/mach-ep93xx/snappercl15.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c index 55b186ef863a..8745162ec05d 100644 --- a/arch/arm/mach-ep93xx/ts72xx.c +++ b/arch/arm/mach-ep93xx/ts72xx.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-imx/mach-qong.c b/arch/arm/mach-imx/mach-qong.c index 8c2cbd693d21..42a700053103 100644 --- a/arch/arm/mach-imx/mach-qong.c +++ b/arch/arm/mach-imx/mach-qong.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-ixp4xx/ixdp425-setup.c b/arch/arm/mach-ixp4xx/ixdp425-setup.c index 508c2d7786e2..93b89291c06b 100644 --- a/arch/arm/mach-ixp4xx/ixdp425-setup.c +++ b/arch/arm/mach-ixp4xx/ixdp425-setup.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-mmp/aspenite.c b/arch/arm/mach-mmp/aspenite.c index 5db0edf716dd..d2283009a5ff 100644 --- a/arch/arm/mach-mmp/aspenite.c +++ b/arch/arm/mach-mmp/aspenite.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-omap1/board-fsample.c b/arch/arm/mach-omap1/board-fsample.c index fad95b74bb65..b93ad58b0a63 100644 --- a/arch/arm/mach-omap1/board-fsample.c +++ b/arch/arm/mach-omap1/board-fsample.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/board-h2.c b/arch/arm/mach-omap1/board-h2.c index 675254ee4b1e..a444b139bff5 100644 --- a/arch/arm/mach-omap1/board-h2.c +++ b/arch/arm/mach-omap1/board-h2.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/board-h3.c b/arch/arm/mach-omap1/board-h3.c index e62f9d454f10..a618a49a30b8 100644 --- a/arch/arm/mach-omap1/board-h3.c +++ b/arch/arm/mach-omap1/board-h3.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/board-nand.c b/arch/arm/mach-omap1/board-nand.c index 7684f9203474..1bffbb4e050f 100644 --- a/arch/arm/mach-omap1/board-nand.c +++ b/arch/arm/mach-omap1/board-nand.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include "common.h" diff --git a/arch/arm/mach-omap1/board-perseus2.c b/arch/arm/mach-omap1/board-perseus2.c index 
150b57ba42bf..e994a78bdd09 100644 --- a/arch/arm/mach-omap1/board-perseus2.c +++ b/arch/arm/mach-omap1/board-perseus2.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-orion5x/db88f5281-setup.c b/arch/arm/mach-orion5x/db88f5281-setup.c index 12f74b46e2ff..3f5863de766a 100644 --- a/arch/arm/mach-orion5x/db88f5281-setup.c +++ b/arch/arm/mach-orion5x/db88f5281-setup.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-orion5x/kurobox_pro-setup.c b/arch/arm/mach-orion5x/kurobox_pro-setup.c index 9dc3f59bed9c..83d43cff4bd7 100644 --- a/arch/arm/mach-orion5x/kurobox_pro-setup.c +++ b/arch/arm/mach-orion5x/kurobox_pro-setup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-orion5x/ts209-setup.c b/arch/arm/mach-orion5x/ts209-setup.c index 7bd671b2854c..0c315515dd2d 100644 --- a/arch/arm/mach-orion5x/ts209-setup.c +++ b/arch/arm/mach-orion5x/ts209-setup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-orion5x/ts78xx-setup.c b/arch/arm/mach-orion5x/ts78xx-setup.c index 7ef80a8304c0..94778739e38f 100644 --- a/arch/arm/mach-orion5x/ts78xx-setup.c +++ b/arch/arm/mach-orion5x/ts78xx-setup.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c index 1467c1d1e541..d6d92f388f14 100644 --- a/arch/arm/mach-pxa/balloon3.c +++ b/arch/arm/mach-pxa/balloon3.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-pxa/em-x270.c b/arch/arm/mach-pxa/em-x270.c index 811a7317f3ea..6d28035ebba5 100644 --- a/arch/arm/mach-pxa/em-x270.c +++ b/arch/arm/mach-pxa/em-x270.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c index fa9d71d194f0..91f7c3e40065 100644 --- a/arch/arm/mach-pxa/eseries.c +++ b/arch/arm/mach-pxa/eseries.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-pxa/palmtx.c b/arch/arm/mach-pxa/palmtx.c index 36646975b5d2..47e3e38e9bec 100644 --- a/arch/arm/mach-pxa/palmtx.c +++ b/arch/arm/mach-pxa/palmtx.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index 13de6602966f..6a386fd6363e 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-s3c24xx/common-smdk.c b/arch/arm/mach-s3c24xx/common-smdk.c index 9e0bc46e90ec..0e116c92bf01 100644 --- a/arch/arm/mach-s3c24xx/common-smdk.c +++ b/arch/arm/mach-s3c24xx/common-smdk.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-anubis.c b/arch/arm/mach-s3c24xx/mach-anubis.c index 029ef1b58925..c14cab361922 100644 --- a/arch/arm/mach-s3c24xx/mach-anubis.c +++ b/arch/arm/mach-s3c24xx/mach-anubis.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-at2440evb.c b/arch/arm/mach-s3c24xx/mach-at2440evb.c index 7b28eb623fc1..ebdbafb9382a 100644 --- a/arch/arm/mach-s3c24xx/mach-at2440evb.c +++ 
b/arch/arm/mach-s3c24xx/mach-at2440evb.c @@ -41,7 +41,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-bast.c b/arch/arm/mach-s3c24xx/mach-bast.c index 5185036765db..704dc84b3480 100644 --- a/arch/arm/mach-s3c24xx/mach-bast.c +++ b/arch/arm/mach-s3c24xx/mach-bast.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-gta02.c b/arch/arm/mach-s3c24xx/mach-gta02.c index b0ed401da3a3..afe18baf0c84 100644 --- a/arch/arm/mach-s3c24xx/mach-gta02.c +++ b/arch/arm/mach-s3c24xx/mach-gta02.c @@ -50,7 +50,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-jive.c b/arch/arm/mach-s3c24xx/mach-jive.c index f5b5c49b56ac..17821976f769 100644 --- a/arch/arm/mach-s3c24xx/mach-jive.c +++ b/arch/arm/mach-s3c24xx/mach-jive.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-mini2440.c b/arch/arm/mach-s3c24xx/mach-mini2440.c index 71af8d2fd320..15140d34f927 100644 --- a/arch/arm/mach-s3c24xx/mach-mini2440.c +++ b/arch/arm/mach-s3c24xx/mach-mini2440.c @@ -49,7 +49,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-osiris.c b/arch/arm/mach-s3c24xx/mach-osiris.c index 70b0eb7d3134..a6657e720430 100644 --- a/arch/arm/mach-s3c24xx/mach-osiris.c +++ b/arch/arm/mach-s3c24xx/mach-osiris.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-qt2410.c b/arch/arm/mach-s3c24xx/mach-qt2410.c index 868c82087403..84e3a9c53184 100644 --- a/arch/arm/mach-s3c24xx/mach-qt2410.c +++ b/arch/arm/mach-s3c24xx/mach-qt2410.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-rx3715.c b/arch/arm/mach-s3c24xx/mach-rx3715.c index a39fb9780dd3..b5ba615cf9dd 100644 --- a/arch/arm/mach-s3c24xx/mach-rx3715.c +++ b/arch/arm/mach-s3c24xx/mach-rx3715.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/mach-vstms.c b/arch/arm/mach-s3c24xx/mach-vstms.c index f5e6322145fa..1adc957edf0f 100644 --- a/arch/arm/mach-s3c24xx/mach-vstms.c +++ b/arch/arm/mach-s3c24xx/mach-vstms.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/mach-bf537/boards/dnp5370.c b/arch/blackfin/mach-bf537/boards/dnp5370.c index e79b3b810c39..c4a8ffb15417 100644 --- a/arch/blackfin/mach-bf537/boards/dnp5370.c +++ b/arch/blackfin/mach-bf537/boards/dnp5370.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/mach-bf537/boards/stamp.c b/arch/blackfin/mach-bf537/boards/stamp.c index 7528148dc492..400e6693643e 100644 --- a/arch/blackfin/mach-bf537/boards/stamp.c +++ b/arch/blackfin/mach-bf537/boards/stamp.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/blackfin/mach-bf561/boards/acvilon.c b/arch/blackfin/mach-bf561/boards/acvilon.c index 37f8f25a1347..696cc9d7820a 100644 --- a/arch/blackfin/mach-bf561/boards/acvilon.c +++ b/arch/blackfin/mach-bf561/boards/acvilon.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/mach-a3/nandflash.c b/arch/cris/arch-v32/drivers/mach-a3/nandflash.c index 3f646c787e58..925a98eb6d68 100644 --- 
a/arch/cris/arch-v32/drivers/mach-a3/nandflash.c +++ b/arch/cris/arch-v32/drivers/mach-a3/nandflash.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/mach-fs/nandflash.c b/arch/cris/arch-v32/drivers/mach-fs/nandflash.c index a74540514bdb..53b56a429dde 100644 --- a/arch/cris/arch-v32/drivers/mach-fs/nandflash.c +++ b/arch/cris/arch-v32/drivers/mach-fs/nandflash.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/alchemy/devboards/db1200.c b/arch/mips/alchemy/devboards/db1200.c index 992442a03d8b..83831002c832 100644 --- a/arch/mips/alchemy/devboards/db1200.c +++ b/arch/mips/alchemy/devboards/db1200.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/alchemy/devboards/db1300.c b/arch/mips/alchemy/devboards/db1300.c index a5504f57cb00..3e7fbdbdb3c4 100644 --- a/arch/mips/alchemy/devboards/db1300.c +++ b/arch/mips/alchemy/devboards/db1300.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/alchemy/devboards/db1550.c b/arch/mips/alchemy/devboards/db1550.c index 1c01d6eadb08..421bd5793f7e 100644 --- a/arch/mips/alchemy/devboards/db1550.c +++ b/arch/mips/alchemy/devboards/db1550.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/include/asm/mach-jz4740/jz4740_nand.h b/arch/mips/include/asm/mach-jz4740/jz4740_nand.h index 7f7b0fc554da..f381d465e768 100644 --- a/arch/mips/include/asm/mach-jz4740/jz4740_nand.h +++ b/arch/mips/include/asm/mach-jz4740/jz4740_nand.h @@ -16,7 +16,7 @@ #ifndef __ASM_MACH_JZ4740_JZ4740_NAND_H__ #define __ASM_MACH_JZ4740_JZ4740_NAND_H__ -#include +#include #include #define JZ_NAND_NUM_BANKS 4 diff --git a/arch/mips/netlogic/xlr/platform-flash.c b/arch/mips/netlogic/xlr/platform-flash.c index f03131fec41d..4d1b4c003376 100644 --- a/arch/mips/netlogic/xlr/platform-flash.c +++ b/arch/mips/netlogic/xlr/platform-flash.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/pnx833x/common/platform.c b/arch/mips/pnx833x/common/platform.c index 7cf4eb50fc72..a7a4e9f5146d 100644 --- a/arch/mips/pnx833x/common/platform.c +++ b/arch/mips/pnx833x/common/platform.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/rb532/devices.c b/arch/mips/rb532/devices.c index 0966adccf520..32ea3e6731d6 100644 --- a/arch/mips/rb532/devices.c +++ b/arch/mips/rb532/devices.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index 5de60a77eaa1..0bcbe58b11e9 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 8db740d6eb08..57ef1fb42a04 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c index 5d6c26f3cf7f..dcec9cf4983f 100644 --- a/drivers/mtd/nand/ams-delta.c +++ b/drivers/mtd/nand/ams-delta.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include 
#include diff --git a/drivers/mtd/nand/atmel/nand-controller.c b/drivers/mtd/nand/atmel/nand-controller.c index d922a88e407f..6606270b9b9b 100644 --- a/drivers/mtd/nand/atmel/nand-controller.c +++ b/drivers/mtd/nand/atmel/nand-controller.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/atmel/pmecc.c b/drivers/mtd/nand/atmel/pmecc.c index 55a8ee5306ea..27a969a8f105 100644 --- a/drivers/mtd/nand/atmel/pmecc.c +++ b/drivers/mtd/nand/atmel/pmecc.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/au1550nd.c b/drivers/mtd/nand/au1550nd.c index 9bf6d9915694..9d4a28fa6b73 100644 --- a/drivers/mtd/nand/au1550nd.c +++ b/drivers/mtd/nand/au1550nd.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h b/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h index 8ea75710a854..c8834767ab6d 100644 --- a/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h +++ b/drivers/mtd/nand/bcm47xxnflash/bcm47xxnflash.h @@ -6,7 +6,7 @@ #endif #include -#include +#include struct bcm47xxnflash { struct bcma_drv_cc *cc; diff --git a/drivers/mtd/nand/bf5xx_nand.c b/drivers/mtd/nand/bf5xx_nand.c index 3962f55bd034..5655dca6ce43 100644 --- a/drivers/mtd/nand/bf5xx_nand.c +++ b/drivers/mtd/nand/bf5xx_nand.c @@ -49,7 +49,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c index 7419c5ce63f8..e0eb51d8c012 100644 --- a/drivers/mtd/nand/brcmnand/brcmnand.c +++ b/drivers/mtd/nand/brcmnand/brcmnand.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c index 2fd733eba0a3..bc558c438a57 100644 --- a/drivers/mtd/nand/cafe_nand.c +++ b/drivers/mtd/nand/cafe_nand.c @@ -13,7 +13,7 @@ #include #undef DEBUG #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/cmx270_nand.c b/drivers/mtd/nand/cmx270_nand.c index 949b9400dcb7..1fc435f994e1 100644 --- a/drivers/mtd/nand/cmx270_nand.c +++ b/drivers/mtd/nand/cmx270_nand.c @@ -18,7 +18,7 @@ * CM-X270 board. 
*/ -#include +#include #include #include #include diff --git a/drivers/mtd/nand/cs553x_nand.c b/drivers/mtd/nand/cs553x_nand.c index 594b28684138..d48877540f14 100644 --- a/drivers/mtd/nand/cs553x_nand.c +++ b/drivers/mtd/nand/cs553x_nand.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c index 7b26e53b95b1..ccc8c43abcff 100644 --- a/drivers/mtd/nand/davinci_nand.c +++ b/drivers/mtd/nand/davinci_nand.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/denali.h b/drivers/mtd/nand/denali.h index 237cc706b0fb..9239e6793e6e 100644 --- a/drivers/mtd/nand/denali.h +++ b/drivers/mtd/nand/denali.h @@ -21,7 +21,7 @@ #define __DENALI_H__ #include -#include +#include #define DEVICE_RESET 0x0 #define DEVICE_RESET__BANK(bank) BIT(bank) diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c index a023ab9e9cbf..c3aa53caab5c 100644 --- a/drivers/mtd/nand/diskonchip.c +++ b/drivers/mtd/nand/diskonchip.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/docg4.c b/drivers/mtd/nand/docg4.c index a27a84fbfb84..2436cbc71662 100644 --- a/drivers/mtd/nand/docg4.c +++ b/drivers/mtd/nand/docg4.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c index b9ac16f05057..17db2f90aa2c 100644 --- a/drivers/mtd/nand/fsl_elbc_nand.c +++ b/drivers/mtd/nand/fsl_elbc_nand.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/fsl_ifc_nand.c b/drivers/mtd/nand/fsl_ifc_nand.c index 59408ec2c69f..9e03bac7f34c 100644 --- a/drivers/mtd/nand/fsl_ifc_nand.c +++ b/drivers/mtd/nand/fsl_ifc_nand.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/fsl_upm.c b/drivers/mtd/nand/fsl_upm.c index d85fa2555b68..a88e2cf66e0f 100644 --- a/drivers/mtd/nand/fsl_upm.c +++ b/drivers/mtd/nand/fsl_upm.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/fsmc_nand.c b/drivers/mtd/nand/fsmc_nand.c index 9d8b051d3187..eac15d9bf49e 100644 --- a/drivers/mtd/nand/fsmc_nand.c +++ b/drivers/mtd/nand/fsmc_nand.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c index 85294f150f4f..fd3648952b5a 100644 --- a/drivers/mtd/nand/gpio.c +++ b/drivers/mtd/nand/gpio.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/gpmi-nand/gpmi-nand.h b/drivers/mtd/nand/gpmi-nand/gpmi-nand.h index 9df0ad64e7e0..a45e4ce13d10 100644 --- a/drivers/mtd/nand/gpmi-nand/gpmi-nand.h +++ b/drivers/mtd/nand/gpmi-nand/gpmi-nand.h @@ -17,7 +17,7 @@ #ifndef __DRIVERS_MTD_NAND_GPMI_NAND_H #define __DRIVERS_MTD_NAND_GPMI_NAND_H -#include +#include #include #include #include diff --git a/drivers/mtd/nand/hisi504_nand.c b/drivers/mtd/nand/hisi504_nand.c index 530caa80b1b6..d9ee1a7e6956 100644 --- a/drivers/mtd/nand/hisi504_nand.c +++ b/drivers/mtd/nand/hisi504_nand.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/jz4740_nand.c b/drivers/mtd/nand/jz4740_nand.c index 
0d06a1f07d82..ad827d4af3e9 100644 --- a/drivers/mtd/nand/jz4740_nand.c +++ b/drivers/mtd/nand/jz4740_nand.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/jz4780_nand.c b/drivers/mtd/nand/jz4780_nand.c index 8bc835f71b26..e69f6ae4c539 100644 --- a/drivers/mtd/nand/jz4780_nand.c +++ b/drivers/mtd/nand/jz4780_nand.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/lpc32xx_mlc.c b/drivers/mtd/nand/lpc32xx_mlc.c index 846a66c1b133..bffa01cd0d38 100644 --- a/drivers/mtd/nand/lpc32xx_mlc.c +++ b/drivers/mtd/nand/lpc32xx_mlc.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/lpc32xx_slc.c b/drivers/mtd/nand/lpc32xx_slc.c index a0669a33f8fe..7c8402f023df 100644 --- a/drivers/mtd/nand/lpc32xx_slc.c +++ b/drivers/mtd/nand/lpc32xx_slc.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/mpc5121_nfc.c b/drivers/mtd/nand/mpc5121_nfc.c index 0e86fb6277c3..b6b97cc9fba6 100644 --- a/drivers/mtd/nand/mpc5121_nfc.c +++ b/drivers/mtd/nand/mpc5121_nfc.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/mtk_nand.c b/drivers/mtd/nand/mtk_nand.c index f7ae99464375..d86a7d131cc0 100644 --- a/drivers/mtd/nand/mtk_nand.c +++ b/drivers/mtd/nand/mtk_nand.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c index a764d5ca7536..30c71915fd7b 100644 --- a/drivers/mtd/nand/mxc_nand.c +++ b/drivers/mtd/nand/mxc_nand.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/nand_amd.c b/drivers/mtd/nand/nand_amd.c index 170403a3bfa8..22f060f38123 100644 --- a/drivers/mtd/nand/nand_amd.c +++ b/drivers/mtd/nand/nand_amd.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. */ -#include +#include static void amd_nand_decode_id(struct nand_chip *chip) { diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 5fa5ddc94834..8093df278692 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 7695efea65f2..2915b6739bf8 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/nand_bch.c b/drivers/mtd/nand/nand_bch.c index 44763f87eae4..505441c9373b 100644 --- a/drivers/mtd/nand/nand_bch.c +++ b/drivers/mtd/nand/nand_bch.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index d1770b066396..7613a0388044 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include #else diff --git a/drivers/mtd/nand/nand_hynix.c b/drivers/mtd/nand/nand_hynix.c index b12dc7325378..b735cc8ec104 100644 --- a/drivers/mtd/nand/nand_hynix.c +++ b/drivers/mtd/nand/nand_hynix.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. 
*/ -#include +#include #include #include diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 92e2cf8e9ff9..5423c3bb388e 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -6,7 +6,7 @@ * published by the Free Software Foundation. * */ -#include +#include #include #define LP_OPTIONS 0 diff --git a/drivers/mtd/nand/nand_macronix.c b/drivers/mtd/nand/nand_macronix.c index 84855c3e1a02..d290ff2a6d2f 100644 --- a/drivers/mtd/nand/nand_macronix.c +++ b/drivers/mtd/nand/nand_macronix.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. */ -#include +#include static int macronix_nand_init(struct nand_chip *chip) { diff --git a/drivers/mtd/nand/nand_micron.c b/drivers/mtd/nand/nand_micron.c index c30ab60f8e1b..abf6a3c376e8 100644 --- a/drivers/mtd/nand/nand_micron.c +++ b/drivers/mtd/nand/nand_micron.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. */ -#include +#include /* * Special Micron status bit that indicates when the block has been diff --git a/drivers/mtd/nand/nand_samsung.c b/drivers/mtd/nand/nand_samsung.c index 1e0755997762..d348f0129ae7 100644 --- a/drivers/mtd/nand/nand_samsung.c +++ b/drivers/mtd/nand/nand_samsung.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. */ -#include +#include static void samsung_nand_decode_id(struct nand_chip *chip) { diff --git a/drivers/mtd/nand/nand_timings.c b/drivers/mtd/nand/nand_timings.c index f06312df3669..90228b9735bd 100644 --- a/drivers/mtd/nand/nand_timings.c +++ b/drivers/mtd/nand/nand_timings.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include static const struct nand_data_interface onfi_sdr_timings[] = { /* Mode 0 */ diff --git a/drivers/mtd/nand/nand_toshiba.c b/drivers/mtd/nand/nand_toshiba.c index fa787ba38dcd..57df857074e6 100644 --- a/drivers/mtd/nand/nand_toshiba.c +++ b/drivers/mtd/nand/nand_toshiba.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. 
*/ -#include +#include static void toshiba_nand_decode_id(struct nand_chip *chip) { diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 03a0d057bf2f..5ba46354bf0f 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c index 28e6118362f7..d8a806894937 100644 --- a/drivers/mtd/nand/ndfc.c +++ b/drivers/mtd/nand/ndfc.c @@ -22,7 +22,7 @@ * */ #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/nuc900_nand.c b/drivers/mtd/nand/nuc900_nand.c index 8f64011d32ef..7bb4d2ea9342 100644 --- a/drivers/mtd/nand/nuc900_nand.c +++ b/drivers/mtd/nand/nuc900_nand.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #define REG_FMICSR 0x00 diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c index 084934a9f19c..54540c8fa1a2 100644 --- a/drivers/mtd/nand/omap2.c +++ b/drivers/mtd/nand/omap2.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/orion_nand.c b/drivers/mtd/nand/orion_nand.c index 209170ed2b76..c48c872f29ee 100644 --- a/drivers/mtd/nand/orion_nand.c +++ b/drivers/mtd/nand/orion_nand.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/oxnas_nand.c b/drivers/mtd/nand/oxnas_nand.c index 1b207aac840c..f14eec3886d9 100644 --- a/drivers/mtd/nand/oxnas_nand.c +++ b/drivers/mtd/nand/oxnas_nand.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/pasemi_nand.c b/drivers/mtd/nand/pasemi_nand.c index 074b8b01289e..a47a7e4bd25a 100644 --- a/drivers/mtd/nand/pasemi_nand.c +++ b/drivers/mtd/nand/pasemi_nand.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/plat_nand.c b/drivers/mtd/nand/plat_nand.c index 791de3e4bbb6..925a1323604d 100644 --- a/drivers/mtd/nand/plat_nand.c +++ b/drivers/mtd/nand/plat_nand.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include struct plat_nand_data { diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index 74dae4bbdac8..85cff68643e0 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/qcom_nandc.c b/drivers/mtd/nand/qcom_nandc.c index 88af7145a51a..d2751b3a2972 100644 --- a/drivers/mtd/nand/qcom_nandc.c +++ b/drivers/mtd/nand/qcom_nandc.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mtd/nand/r852.h b/drivers/mtd/nand/r852.h index d042ddb71a8b..8713c57f6207 100644 --- a/drivers/mtd/nand/r852.h +++ b/drivers/mtd/nand/r852.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c index 9e0c849607b9..4c383eeec6f6 100644 --- a/drivers/mtd/nand/s3c2410.c +++ b/drivers/mtd/nand/s3c2410.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index 891ac7b99305..2637b9052fe7 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff 
diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c
index 064ca1757589..737efe83cd36 100644
--- a/drivers/mtd/nand/sharpsl.c
+++ b/drivers/mtd/nand/sharpsl.c
@@ -17,7 +17,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/sm_common.c b/drivers/mtd/nand/sm_common.c
index 5939dff253c2..c378705c6e2b 100644
--- a/drivers/mtd/nand/sm_common.c
+++ b/drivers/mtd/nand/sm_common.c
@@ -7,7 +7,7 @@
  * published by the Free Software Foundation.
  */
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include "sm_common.h"
diff --git a/drivers/mtd/nand/socrates_nand.c b/drivers/mtd/nand/socrates_nand.c
index 72369bd079af..575997d0ef8a 100644
--- a/drivers/mtd/nand/socrates_nand.c
+++ b/drivers/mtd/nand/socrates_nand.c
@@ -13,7 +13,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/sunxi_nand.c b/drivers/mtd/nand/sunxi_nand.c
index d0b6f8f9f297..393d5dce633d 100644
--- a/drivers/mtd/nand/sunxi_nand.c
+++ b/drivers/mtd/nand/sunxi_nand.c
@@ -31,7 +31,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/tango_nand.c b/drivers/mtd/nand/tango_nand.c
index 9d40b793b1c4..766906f03943 100644
--- a/drivers/mtd/nand/tango_nand.c
+++ b/drivers/mtd/nand/tango_nand.c
@@ -11,7 +11,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/tmio_nand.c b/drivers/mtd/nand/tmio_nand.c
index fc5e773f8b60..c9dd682fb353 100644
--- a/drivers/mtd/nand/tmio_nand.c
+++ b/drivers/mtd/nand/tmio_nand.c
@@ -34,7 +34,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/txx9ndfmc.c b/drivers/mtd/nand/txx9ndfmc.c
index 0a14fda2e41b..b567d212fe7d 100644
--- a/drivers/mtd/nand/txx9ndfmc.c
+++ b/drivers/mtd/nand/txx9ndfmc.c
@@ -16,7 +16,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/vf610_nfc.c b/drivers/mtd/nand/vf610_nfc.c
index 744ab10e8962..8ba9b6de5fa6 100644
--- a/drivers/mtd/nand/vf610_nfc.c
+++ b/drivers/mtd/nand/vf610_nfc.c
@@ -31,7 +31,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/drivers/mtd/nand/xway_nand.c b/drivers/mtd/nand/xway_nand.c
index ddee4005248c..9926b4e3d69d 100644
--- a/drivers/mtd/nand/xway_nand.c
+++ b/drivers/mtd/nand/xway_nand.c
@@ -7,7 +7,7 @@
  * Copyright © 2016 Hauke Mehrtens
  */
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c
index e21161353e76..1f1a61168b3d 100644
--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -34,7 +34,7 @@
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c
index a5dfbfbebfca..184c8fbfe465 100644
--- a/drivers/mtd/nftlmount.c
+++ b/drivers/mtd/nftlmount.c
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #define SECTORSIZE 512
diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c
index 41b13d1cdcc4..95f0bf95f095 100644
--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -16,7 +16,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 struct ssfdcr_record {
diff --git a/drivers/mtd/tests/nandbiterrs.c b/drivers/mtd/tests/nandbiterrs.c
index f26dec896afa..5f03b8c885a9 100644
--- a/drivers/mtd/tests/nandbiterrs.c
+++ b/drivers/mtd/tests/nandbiterrs.c
@@ -47,7 +47,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include "mtd_test.h"
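
Every driver hunk in this patch is the same mechanical one-line substitution,
so a single illustration covers them all. As a hedged sketch (the driver name
foo_nand.c and its other includes are hypothetical, not taken from this
patch), a typical raw NAND driver's include block looks like this once the
rename is applied:

	/* foo_nand.c: hypothetical driver, for illustration only */
	#include <linux/module.h>
	#include <linux/mtd/mtd.h>	/* generic MTD API: unchanged by this patch */
	#include <linux/mtd/rawnand.h>	/* raw NAND core API: was <linux/mtd/nand.h> */

No symbol, type, or function declared by the header changes, so drivers need
no edit beyond the include line: the header contents move over unchanged
apart from the old file-path line in the top comment, as the nand.h removal
and rawnand.h addition below show.
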
diff --git a/drivers/staging/mt29f_spinand/mt29f_spinand.c b/drivers/staging/mt29f_spinand/mt29f_spinand.c
index a4e3ae8f0c85..13eaf16ecd16 100644
--- a/drivers/staging/mt29f_spinand/mt29f_spinand.c
+++ b/drivers/staging/mt29f_spinand/mt29f_spinand.c
@@ -18,7 +18,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include "mt29f_spinand.h"
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index b25d28a21212..48d9522e209c 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -17,7 +17,7 @@
 #include
 #include
 #include
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 #include
 #include
 #include
diff --git a/include/linux/mtd/nand-gpio.h b/include/linux/mtd/nand-gpio.h
index 51534e50f7fc..be4f45d89be2 100644
--- a/include/linux/mtd/nand-gpio.h
+++ b/include/linux/mtd/nand-gpio.h
@@ -1,7 +1,7 @@
 #ifndef __LINUX_MTD_NAND_GPIO_H
 #define __LINUX_MTD_NAND_GPIO_H
-#include <linux/mtd/nand.h>
+#include <linux/mtd/rawnand.h>
 struct gpio_nand_platdata {
 	int gpio_nce;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
deleted file mode 100644
index 892148c448cc..000000000000
--- a/include/linux/mtd/nand.h
+++ /dev/null
@@ -1,1331 +0,0 @@
-/*
- * linux/include/linux/mtd/nand.h
- *
- * Copyright © 2000-2010 David Woodhouse
- * Steven J. Hill
- * Thomas Gleixner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Info:
- * Contains standard defines and IDs for NAND flash devices
- *
- * Changelog:
- * See git changelog.
- */
-#ifndef __LINUX_MTD_NAND_H
-#define __LINUX_MTD_NAND_H
-
-#include <linux/wait.h>
-#include <linux/spinlock.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/flashchip.h>
-#include <linux/mtd/bbm.h>
-
-struct mtd_info;
-struct nand_flash_dev;
-struct device_node;
-
-/* Scan and identify a NAND device */
-int nand_scan(struct mtd_info *mtd, int max_chips);
-/*
- * Separate phases of nand_scan(), allowing board driver to intervene
- * and override command or ECC setup according to flash type.
- */
-int nand_scan_ident(struct mtd_info *mtd, int max_chips,
-		    struct nand_flash_dev *table);
-int nand_scan_tail(struct mtd_info *mtd);
-
-/* Unregister the MTD device and free resources held by the NAND device */
-void nand_release(struct mtd_info *mtd);
-
-/* Internal helper for board drivers which need to override command function */
-void nand_wait_ready(struct mtd_info *mtd);
-
-/* locks all blocks present in the device */
-int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
-
-/* unlocks specified locked blocks */
-int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len);
-
-/* The maximum number of NAND chips in an array */
-#define NAND_MAX_CHIPS 8
-
-/*
- * Constants for hardware specific CLE/ALE/NCE function
- *
- * These are bits which can be or'ed to set/clear multiple
- * bits in one go.
- */ -/* Select the chip by setting nCE to low */ -#define NAND_NCE 0x01 -/* Select the command latch by setting CLE to high */ -#define NAND_CLE 0x02 -/* Select the address latch by setting ALE to high */ -#define NAND_ALE 0x04 - -#define NAND_CTRL_CLE (NAND_NCE | NAND_CLE) -#define NAND_CTRL_ALE (NAND_NCE | NAND_ALE) -#define NAND_CTRL_CHANGE 0x80 - -/* - * Standard NAND flash commands - */ -#define NAND_CMD_READ0 0 -#define NAND_CMD_READ1 1 -#define NAND_CMD_RNDOUT 5 -#define NAND_CMD_PAGEPROG 0x10 -#define NAND_CMD_READOOB 0x50 -#define NAND_CMD_ERASE1 0x60 -#define NAND_CMD_STATUS 0x70 -#define NAND_CMD_SEQIN 0x80 -#define NAND_CMD_RNDIN 0x85 -#define NAND_CMD_READID 0x90 -#define NAND_CMD_ERASE2 0xd0 -#define NAND_CMD_PARAM 0xec -#define NAND_CMD_GET_FEATURES 0xee -#define NAND_CMD_SET_FEATURES 0xef -#define NAND_CMD_RESET 0xff - -#define NAND_CMD_LOCK 0x2a -#define NAND_CMD_UNLOCK1 0x23 -#define NAND_CMD_UNLOCK2 0x24 - -/* Extended commands for large page devices */ -#define NAND_CMD_READSTART 0x30 -#define NAND_CMD_RNDOUTSTART 0xE0 -#define NAND_CMD_CACHEDPROG 0x15 - -#define NAND_CMD_NONE -1 - -/* Status bits */ -#define NAND_STATUS_FAIL 0x01 -#define NAND_STATUS_FAIL_N1 0x02 -#define NAND_STATUS_TRUE_READY 0x20 -#define NAND_STATUS_READY 0x40 -#define NAND_STATUS_WP 0x80 - -#define NAND_DATA_IFACE_CHECK_ONLY -1 - -/* - * Constants for ECC_MODES - */ -typedef enum { - NAND_ECC_NONE, - NAND_ECC_SOFT, - NAND_ECC_HW, - NAND_ECC_HW_SYNDROME, - NAND_ECC_HW_OOB_FIRST, - NAND_ECC_ON_DIE, -} nand_ecc_modes_t; - -enum nand_ecc_algo { - NAND_ECC_UNKNOWN, - NAND_ECC_HAMMING, - NAND_ECC_BCH, -}; - -/* - * Constants for Hardware ECC - */ -/* Reset Hardware ECC for read */ -#define NAND_ECC_READ 0 -/* Reset Hardware ECC for write */ -#define NAND_ECC_WRITE 1 -/* Enable Hardware ECC before syndrome is read back from flash */ -#define NAND_ECC_READSYN 2 - -/* - * Enable generic NAND 'page erased' check. This check is only done when - * ecc.correct() returns -EBADMSG. - * Set this flag if your implementation does not fix bitflips in erased - * pages and you want to rely on the default implementation. - */ -#define NAND_ECC_GENERIC_ERASED_CHECK BIT(0) -#define NAND_ECC_MAXIMIZE BIT(1) -/* - * If your controller already sends the required NAND commands when - * reading or writing a page, then the framework is not supposed to - * send READ0 and SEQIN/PAGEPROG respectively. - */ -#define NAND_ECC_CUSTOM_PAGE_ACCESS BIT(2) - -/* Bit mask for flags passed to do_nand_read_ecc */ -#define NAND_GET_DEVICE 0x80 - - -/* - * Option constants for bizarre disfunctionality and real - * features. - */ -/* Buswidth is 16 bit */ -#define NAND_BUSWIDTH_16 0x00000002 -/* Chip has cache program function */ -#define NAND_CACHEPRG 0x00000008 -/* - * Chip requires ready check on read (for auto-incremented sequential read). - * True only for small page devices; large page devices do not support - * autoincrement. - */ -#define NAND_NEED_READRDY 0x00000100 - -/* Chip does not allow subpage writes */ -#define NAND_NO_SUBPAGE_WRITE 0x00000200 - -/* Device is one of 'new' xD cards that expose fake nand command set */ -#define NAND_BROKEN_XD 0x00000400 - -/* Device behaves just like nand, but is readonly */ -#define NAND_ROM 0x00000800 - -/* Device supports subpage reads */ -#define NAND_SUBPAGE_READ 0x00001000 - -/* - * Some MLC NANDs need data scrambling to limit bitflips caused by repeated - * patterns. 
- */ -#define NAND_NEED_SCRAMBLING 0x00002000 - -/* Options valid for Samsung large page devices */ -#define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG - -/* Macros to identify the above */ -#define NAND_HAS_CACHEPROG(chip) ((chip->options & NAND_CACHEPRG)) -#define NAND_HAS_SUBPAGE_READ(chip) ((chip->options & NAND_SUBPAGE_READ)) -#define NAND_HAS_SUBPAGE_WRITE(chip) !((chip)->options & NAND_NO_SUBPAGE_WRITE) - -/* Non chip related options */ -/* This option skips the bbt scan during initialization. */ -#define NAND_SKIP_BBTSCAN 0x00010000 -/* - * This option is defined if the board driver allocates its own buffers - * (e.g. because it needs them DMA-coherent). - */ -#define NAND_OWN_BUFFERS 0x00020000 -/* Chip may not exist, so silence any errors in scan */ -#define NAND_SCAN_SILENT_NODEV 0x00040000 -/* - * Autodetect nand buswidth with readid/onfi. - * This suppose the driver will configure the hardware in 8 bits mode - * when calling nand_scan_ident, and update its configuration - * before calling nand_scan_tail. - */ -#define NAND_BUSWIDTH_AUTO 0x00080000 -/* - * This option could be defined by controller drivers to protect against - * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers - */ -#define NAND_USE_BOUNCE_BUFFER 0x00100000 - -/* - * In case your controller is implementing ->cmd_ctrl() and is relying on the - * default ->cmdfunc() implementation, you may want to let the core handle the - * tCCS delay which is required when a column change (RNDIN or RNDOUT) is - * requested. - * If your controller already takes care of this delay, you don't need to set - * this flag. - */ -#define NAND_WAIT_TCCS 0x00200000 - -/* Options set by nand scan */ -/* Nand scan has allocated controller struct */ -#define NAND_CONTROLLER_ALLOC 0x80000000 - -/* Cell info constants */ -#define NAND_CI_CHIPNR_MSK 0x03 -#define NAND_CI_CELLTYPE_MSK 0x0C -#define NAND_CI_CELLTYPE_SHIFT 2 - -/* Keep gcc happy */ -struct nand_chip; - -/* ONFI features */ -#define ONFI_FEATURE_16_BIT_BUS (1 << 0) -#define ONFI_FEATURE_EXT_PARAM_PAGE (1 << 7) - -/* ONFI timing mode, used in both asynchronous and synchronous mode */ -#define ONFI_TIMING_MODE_0 (1 << 0) -#define ONFI_TIMING_MODE_1 (1 << 1) -#define ONFI_TIMING_MODE_2 (1 << 2) -#define ONFI_TIMING_MODE_3 (1 << 3) -#define ONFI_TIMING_MODE_4 (1 << 4) -#define ONFI_TIMING_MODE_5 (1 << 5) -#define ONFI_TIMING_MODE_UNKNOWN (1 << 6) - -/* ONFI feature address */ -#define ONFI_FEATURE_ADDR_TIMING_MODE 0x1 - -/* Vendor-specific feature address (Micron) */ -#define ONFI_FEATURE_ADDR_READ_RETRY 0x89 -#define ONFI_FEATURE_ON_DIE_ECC 0x90 -#define ONFI_FEATURE_ON_DIE_ECC_EN BIT(3) - -/* ONFI subfeature parameters length */ -#define ONFI_SUBFEATURE_PARAM_LEN 4 - -/* ONFI optional commands SET/GET FEATURES supported? 
*/ -#define ONFI_OPT_CMD_SET_GET_FEATURES (1 << 2) - -struct nand_onfi_params { - /* rev info and features block */ - /* 'O' 'N' 'F' 'I' */ - u8 sig[4]; - __le16 revision; - __le16 features; - __le16 opt_cmd; - u8 reserved0[2]; - __le16 ext_param_page_length; /* since ONFI 2.1 */ - u8 num_of_param_pages; /* since ONFI 2.1 */ - u8 reserved1[17]; - - /* manufacturer information block */ - char manufacturer[12]; - char model[20]; - u8 jedec_id; - __le16 date_code; - u8 reserved2[13]; - - /* memory organization block */ - __le32 byte_per_page; - __le16 spare_bytes_per_page; - __le32 data_bytes_per_ppage; - __le16 spare_bytes_per_ppage; - __le32 pages_per_block; - __le32 blocks_per_lun; - u8 lun_count; - u8 addr_cycles; - u8 bits_per_cell; - __le16 bb_per_lun; - __le16 block_endurance; - u8 guaranteed_good_blocks; - __le16 guaranteed_block_endurance; - u8 programs_per_page; - u8 ppage_attr; - u8 ecc_bits; - u8 interleaved_bits; - u8 interleaved_ops; - u8 reserved3[13]; - - /* electrical parameter block */ - u8 io_pin_capacitance_max; - __le16 async_timing_mode; - __le16 program_cache_timing_mode; - __le16 t_prog; - __le16 t_bers; - __le16 t_r; - __le16 t_ccs; - __le16 src_sync_timing_mode; - u8 src_ssync_features; - __le16 clk_pin_capacitance_typ; - __le16 io_pin_capacitance_typ; - __le16 input_pin_capacitance_typ; - u8 input_pin_capacitance_max; - u8 driver_strength_support; - __le16 t_int_r; - __le16 t_adl; - u8 reserved4[8]; - - /* vendor */ - __le16 vendor_revision; - u8 vendor[88]; - - __le16 crc; -} __packed; - -#define ONFI_CRC_BASE 0x4F4E - -/* Extended ECC information Block Definition (since ONFI 2.1) */ -struct onfi_ext_ecc_info { - u8 ecc_bits; - u8 codeword_size; - __le16 bb_per_lun; - __le16 block_endurance; - u8 reserved[2]; -} __packed; - -#define ONFI_SECTION_TYPE_0 0 /* Unused section. */ -#define ONFI_SECTION_TYPE_1 1 /* for additional sections. */ -#define ONFI_SECTION_TYPE_2 2 /* for ECC information. */ -struct onfi_ext_section { - u8 type; - u8 length; -} __packed; - -#define ONFI_EXT_SECTION_MAX 8 - -/* Extended Parameter Page Definition (since ONFI 2.1) */ -struct onfi_ext_param_page { - __le16 crc; - u8 sig[4]; /* 'E' 'P' 'P' 'S' */ - u8 reserved0[10]; - struct onfi_ext_section sections[ONFI_EXT_SECTION_MAX]; - - /* - * The actual size of the Extended Parameter Page is in - * @ext_param_page_length of nand_onfi_params{}. - * The following are the variable length sections. - * So we do not add any fields below. Please see the ONFI spec. 
- */ -} __packed; - -struct jedec_ecc_info { - u8 ecc_bits; - u8 codeword_size; - __le16 bb_per_lun; - __le16 block_endurance; - u8 reserved[2]; -} __packed; - -/* JEDEC features */ -#define JEDEC_FEATURE_16_BIT_BUS (1 << 0) - -struct nand_jedec_params { - /* rev info and features block */ - /* 'J' 'E' 'S' 'D' */ - u8 sig[4]; - __le16 revision; - __le16 features; - u8 opt_cmd[3]; - __le16 sec_cmd; - u8 num_of_param_pages; - u8 reserved0[18]; - - /* manufacturer information block */ - char manufacturer[12]; - char model[20]; - u8 jedec_id[6]; - u8 reserved1[10]; - - /* memory organization block */ - __le32 byte_per_page; - __le16 spare_bytes_per_page; - u8 reserved2[6]; - __le32 pages_per_block; - __le32 blocks_per_lun; - u8 lun_count; - u8 addr_cycles; - u8 bits_per_cell; - u8 programs_per_page; - u8 multi_plane_addr; - u8 multi_plane_op_attr; - u8 reserved3[38]; - - /* electrical parameter block */ - __le16 async_sdr_speed_grade; - __le16 toggle_ddr_speed_grade; - __le16 sync_ddr_speed_grade; - u8 async_sdr_features; - u8 toggle_ddr_features; - u8 sync_ddr_features; - __le16 t_prog; - __le16 t_bers; - __le16 t_r; - __le16 t_r_multi_plane; - __le16 t_ccs; - __le16 io_pin_capacitance_typ; - __le16 input_pin_capacitance_typ; - __le16 clk_pin_capacitance_typ; - u8 driver_strength_support; - __le16 t_adl; - u8 reserved4[36]; - - /* ECC and endurance block */ - u8 guaranteed_good_blocks; - __le16 guaranteed_block_endurance; - struct jedec_ecc_info ecc_info[4]; - u8 reserved5[29]; - - /* reserved */ - u8 reserved6[148]; - - /* vendor */ - __le16 vendor_rev_num; - u8 reserved7[88]; - - /* CRC for Parameter Page */ - __le16 crc; -} __packed; - -/** - * struct nand_id - NAND id structure - * @data: buffer containing the id bytes. Currently 8 bytes large, but can - * be extended if required. - * @len: ID length. - */ -struct nand_id { - u8 data[8]; - int len; -}; - -/** - * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices - * @lock: protection lock - * @active: the mtd device which holds the controller currently - * @wq: wait queue to sleep on if a NAND operation is in - * progress used instead of the per chip wait queue - * when a hw controller is available. - */ -struct nand_hw_control { - spinlock_t lock; - struct nand_chip *active; - wait_queue_head_t wq; -}; - -static inline void nand_hw_control_init(struct nand_hw_control *nfc) -{ - nfc->active = NULL; - spin_lock_init(&nfc->lock); - init_waitqueue_head(&nfc->wq); -} - -/** - * struct nand_ecc_step_info - ECC step information of ECC engine - * @stepsize: data bytes per ECC step - * @strengths: array of supported strengths - * @nstrengths: number of supported strengths - */ -struct nand_ecc_step_info { - int stepsize; - const int *strengths; - int nstrengths; -}; - -/** - * struct nand_ecc_caps - capability of ECC engine - * @stepinfos: array of ECC step information - * @nstepinfos: number of ECC step information - * @calc_ecc_bytes: driver's hook to calculate ECC bytes per step - */ -struct nand_ecc_caps { - const struct nand_ecc_step_info *stepinfos; - int nstepinfos; - int (*calc_ecc_bytes)(int step_size, int strength); -}; - -/* a shorthand to generate struct nand_ecc_caps with only one ECC stepsize */ -#define NAND_ECC_CAPS_SINGLE(__name, __calc, __step, ...) 
\ -static const int __name##_strengths[] = { __VA_ARGS__ }; \ -static const struct nand_ecc_step_info __name##_stepinfo = { \ - .stepsize = __step, \ - .strengths = __name##_strengths, \ - .nstrengths = ARRAY_SIZE(__name##_strengths), \ -}; \ -static const struct nand_ecc_caps __name = { \ - .stepinfos = &__name##_stepinfo, \ - .nstepinfos = 1, \ - .calc_ecc_bytes = __calc, \ -} - -/** - * struct nand_ecc_ctrl - Control structure for ECC - * @mode: ECC mode - * @algo: ECC algorithm - * @steps: number of ECC steps per page - * @size: data bytes per ECC step - * @bytes: ECC bytes per step - * @strength: max number of correctible bits per ECC step - * @total: total number of ECC bytes per page - * @prepad: padding information for syndrome based ECC generators - * @postpad: padding information for syndrome based ECC generators - * @options: ECC specific options (see NAND_ECC_XXX flags defined above) - * @priv: pointer to private ECC control data - * @hwctl: function to control hardware ECC generator. Must only - * be provided if an hardware ECC is available - * @calculate: function for ECC calculation or readback from ECC hardware - * @correct: function for ECC correction, matching to ECC generator (sw/hw). - * Should return a positive number representing the number of - * corrected bitflips, -EBADMSG if the number of bitflips exceed - * ECC strength, or any other error code if the error is not - * directly related to correction. - * If -EBADMSG is returned the input buffers should be left - * untouched. - * @read_page_raw: function to read a raw page without ECC. This function - * should hide the specific layout used by the ECC - * controller and always return contiguous in-band and - * out-of-band data even if they're not stored - * contiguously on the NAND chip (e.g. - * NAND_ECC_HW_SYNDROME interleaves in-band and - * out-of-band data). - * @write_page_raw: function to write a raw page without ECC. This function - * should hide the specific layout used by the ECC - * controller and consider the passed data as contiguous - * in-band and out-of-band data. ECC controller is - * responsible for doing the appropriate transformations - * to adapt to its specific layout (e.g. - * NAND_ECC_HW_SYNDROME interleaves in-band and - * out-of-band data). - * @read_page: function to read a page according to the ECC generator - * requirements; returns maximum number of bitflips corrected in - * any single ECC step, -EIO hw error - * @read_subpage: function to read parts of the page covered by ECC; - * returns same as read_page() - * @write_subpage: function to write parts of the page covered by ECC. - * @write_page: function to write a page according to the ECC generator - * requirements. 
- * @write_oob_raw: function to write chip OOB data without ECC - * @read_oob_raw: function to read chip OOB data without ECC - * @read_oob: function to read chip OOB data - * @write_oob: function to write chip OOB data - */ -struct nand_ecc_ctrl { - nand_ecc_modes_t mode; - enum nand_ecc_algo algo; - int steps; - int size; - int bytes; - int total; - int strength; - int prepad; - int postpad; - unsigned int options; - void *priv; - void (*hwctl)(struct mtd_info *mtd, int mode); - int (*calculate)(struct mtd_info *mtd, const uint8_t *dat, - uint8_t *ecc_code); - int (*correct)(struct mtd_info *mtd, uint8_t *dat, uint8_t *read_ecc, - uint8_t *calc_ecc); - int (*read_page_raw)(struct mtd_info *mtd, struct nand_chip *chip, - uint8_t *buf, int oob_required, int page); - int (*write_page_raw)(struct mtd_info *mtd, struct nand_chip *chip, - const uint8_t *buf, int oob_required, int page); - int (*read_page)(struct mtd_info *mtd, struct nand_chip *chip, - uint8_t *buf, int oob_required, int page); - int (*read_subpage)(struct mtd_info *mtd, struct nand_chip *chip, - uint32_t offs, uint32_t len, uint8_t *buf, int page); - int (*write_subpage)(struct mtd_info *mtd, struct nand_chip *chip, - uint32_t offset, uint32_t data_len, - const uint8_t *data_buf, int oob_required, int page); - int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip, - const uint8_t *buf, int oob_required, int page); - int (*write_oob_raw)(struct mtd_info *mtd, struct nand_chip *chip, - int page); - int (*read_oob_raw)(struct mtd_info *mtd, struct nand_chip *chip, - int page); - int (*read_oob)(struct mtd_info *mtd, struct nand_chip *chip, int page); - int (*write_oob)(struct mtd_info *mtd, struct nand_chip *chip, - int page); -}; - -static inline int nand_standard_page_accessors(struct nand_ecc_ctrl *ecc) -{ - return !(ecc->options & NAND_ECC_CUSTOM_PAGE_ACCESS); -} - -/** - * struct nand_buffers - buffer structure for read/write - * @ecccalc: buffer pointer for calculated ECC, size is oobsize. - * @ecccode: buffer pointer for ECC read from flash, size is oobsize. - * @databuf: buffer pointer for data, size is (page size + oobsize). - * - * Do not change the order of buffers. databuf and oobrbuf must be in - * consecutive order. - */ -struct nand_buffers { - uint8_t *ecccalc; - uint8_t *ecccode; - uint8_t *databuf; -}; - -/** - * struct nand_sdr_timings - SDR NAND chip timings - * - * This struct defines the timing requirements of a SDR NAND chip. - * These information can be found in every NAND datasheets and the timings - * meaning are described in the ONFI specifications: - * www.onfi.org/~/media/ONFI/specs/onfi_3_1_spec.pdf (chapter 4.15 Timing - * Parameters) - * - * All these timings are expressed in picoseconds. 
- * - * @tBERS_max: Block erase time - * @tCCS_min: Change column setup time - * @tPROG_max: Page program time - * @tR_max: Page read time - * @tALH_min: ALE hold time - * @tADL_min: ALE to data loading time - * @tALS_min: ALE setup time - * @tAR_min: ALE to RE# delay - * @tCEA_max: CE# access time - * @tCEH_min: CE# high hold time - * @tCH_min: CE# hold time - * @tCHZ_max: CE# high to output hi-Z - * @tCLH_min: CLE hold time - * @tCLR_min: CLE to RE# delay - * @tCLS_min: CLE setup time - * @tCOH_min: CE# high to output hold - * @tCS_min: CE# setup time - * @tDH_min: Data hold time - * @tDS_min: Data setup time - * @tFEAT_max: Busy time for Set Features and Get Features - * @tIR_min: Output hi-Z to RE# low - * @tITC_max: Interface and Timing Mode Change time - * @tRC_min: RE# cycle time - * @tREA_max: RE# access time - * @tREH_min: RE# high hold time - * @tRHOH_min: RE# high to output hold - * @tRHW_min: RE# high to WE# low - * @tRHZ_max: RE# high to output hi-Z - * @tRLOH_min: RE# low to output hold - * @tRP_min: RE# pulse width - * @tRR_min: Ready to RE# low (data only) - * @tRST_max: Device reset time, measured from the falling edge of R/B# to the - * rising edge of R/B#. - * @tWB_max: WE# high to SR[6] low - * @tWC_min: WE# cycle time - * @tWH_min: WE# high hold time - * @tWHR_min: WE# high to RE# low - * @tWP_min: WE# pulse width - * @tWW_min: WP# transition to WE# low - */ -struct nand_sdr_timings { - u32 tBERS_max; - u32 tCCS_min; - u32 tPROG_max; - u32 tR_max; - u32 tALH_min; - u32 tADL_min; - u32 tALS_min; - u32 tAR_min; - u32 tCEA_max; - u32 tCEH_min; - u32 tCH_min; - u32 tCHZ_max; - u32 tCLH_min; - u32 tCLR_min; - u32 tCLS_min; - u32 tCOH_min; - u32 tCS_min; - u32 tDH_min; - u32 tDS_min; - u32 tFEAT_max; - u32 tIR_min; - u32 tITC_max; - u32 tRC_min; - u32 tREA_max; - u32 tREH_min; - u32 tRHOH_min; - u32 tRHW_min; - u32 tRHZ_max; - u32 tRLOH_min; - u32 tRP_min; - u32 tRR_min; - u64 tRST_max; - u32 tWB_max; - u32 tWC_min; - u32 tWH_min; - u32 tWHR_min; - u32 tWP_min; - u32 tWW_min; -}; - -/** - * enum nand_data_interface_type - NAND interface timing type - * @NAND_SDR_IFACE: Single Data Rate interface - */ -enum nand_data_interface_type { - NAND_SDR_IFACE, -}; - -/** - * struct nand_data_interface - NAND interface timing - * @type: type of the timing - * @timings: The timing, type according to @type - */ -struct nand_data_interface { - enum nand_data_interface_type type; - union { - struct nand_sdr_timings sdr; - } timings; -}; - -/** - * nand_get_sdr_timings - get SDR timing from data interface - * @conf: The data interface - */ -static inline const struct nand_sdr_timings * -nand_get_sdr_timings(const struct nand_data_interface *conf) -{ - if (conf->type != NAND_SDR_IFACE) - return ERR_PTR(-EINVAL); - - return &conf->timings.sdr; -} - -/** - * struct nand_manufacturer_ops - NAND Manufacturer operations - * @detect: detect the NAND memory organization and capabilities - * @init: initialize all vendor specific fields (like the ->read_retry() - * implementation) if any. - * @cleanup: the ->init() function may have allocated resources, ->cleanup() - * is here to let vendor specific code release those resources. 
- */ -struct nand_manufacturer_ops { - void (*detect)(struct nand_chip *chip); - int (*init)(struct nand_chip *chip); - void (*cleanup)(struct nand_chip *chip); -}; - -/** - * struct nand_chip - NAND Private Flash Chip Data - * @mtd: MTD device registered to the MTD framework - * @IO_ADDR_R: [BOARDSPECIFIC] address to read the 8 I/O lines of the - * flash device - * @IO_ADDR_W: [BOARDSPECIFIC] address to write the 8 I/O lines of the - * flash device. - * @read_byte: [REPLACEABLE] read one byte from the chip - * @read_word: [REPLACEABLE] read one word from the chip - * @write_byte: [REPLACEABLE] write a single byte to the chip on the - * low 8 I/O lines - * @write_buf: [REPLACEABLE] write data from the buffer to the chip - * @read_buf: [REPLACEABLE] read data from the chip into the buffer - * @select_chip: [REPLACEABLE] select chip nr - * @block_bad: [REPLACEABLE] check if a block is bad, using OOB markers - * @block_markbad: [REPLACEABLE] mark a block bad - * @cmd_ctrl: [BOARDSPECIFIC] hardwarespecific function for controlling - * ALE/CLE/nCE. Also used to write command and address - * @dev_ready: [BOARDSPECIFIC] hardwarespecific function for accessing - * device ready/busy line. If set to NULL no access to - * ready/busy is available and the ready/busy information - * is read from the chip status register. - * @cmdfunc: [REPLACEABLE] hardwarespecific function for writing - * commands to the chip. - * @waitfunc: [REPLACEABLE] hardwarespecific function for wait on - * ready. - * @setup_read_retry: [FLASHSPECIFIC] flash (vendor) specific function for - * setting the read-retry mode. Mostly needed for MLC NAND. - * @ecc: [BOARDSPECIFIC] ECC control structure - * @buffers: buffer structure for read/write - * @buf_align: minimum buffer alignment required by a platform - * @hwcontrol: platform-specific hardware control structure - * @erase: [REPLACEABLE] erase function - * @scan_bbt: [REPLACEABLE] function to scan bad block table - * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transferring - * data from array to read regs (tR). - * @state: [INTERN] the current state of the NAND device - * @oob_poi: "poison value buffer," used for laying out OOB data - * before writing - * @page_shift: [INTERN] number of address bits in a page (column - * address bits). - * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock - * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry - * @chip_shift: [INTERN] number of address bits in one chip - * @options: [BOARDSPECIFIC] various chip options. They can partly - * be set to inform nand_scan about special functionality. - * See the defines for further explanation. - * @bbt_options: [INTERN] bad block specific options. All options used - * here must come from bbm.h. By default, these options - * will be copied to the appropriate nand_bbt_descr's. - * @badblockpos: [INTERN] position of the bad block marker in the oob - * area. - * @badblockbits: [INTERN] minimum number of set bits in a good block's - * bad block marker position; i.e., BBM == 11110111b is - * not bad when badblockbits == 7 - * @bits_per_cell: [INTERN] number of bits per cell. i.e., 1 means SLC. - * @ecc_strength_ds: [INTERN] ECC correctability from the datasheet. - * Minimum amount of bit errors per @ecc_step_ds guaranteed - * to be correctable. If unknown, set to zero. - * @ecc_step_ds: [INTERN] ECC step required by the @ecc_strength_ds, - * also from the datasheet. It is the recommended ECC step - * size, if known; if unknown, set to zero. 
- * @onfi_timing_mode_default: [INTERN] default ONFI timing mode. This field is - * set to the actually used ONFI mode if the chip is - * ONFI compliant or deduced from the datasheet if - * the NAND chip is not ONFI compliant. - * @numchips: [INTERN] number of physical chips - * @chipsize: [INTERN] the size of one chip for multichip arrays - * @pagemask: [INTERN] page number mask = number of (pages / chip) - 1 - * @pagebuf: [INTERN] holds the pagenumber which is currently in - * data_buf. - * @pagebuf_bitflips: [INTERN] holds the bitflip count for the page which is - * currently in data_buf. - * @subpagesize: [INTERN] holds the subpagesize - * @id: [INTERN] holds NAND ID - * @onfi_version: [INTERN] holds the chip ONFI version (BCD encoded), - * non 0 if ONFI supported. - * @jedec_version: [INTERN] holds the chip JEDEC version (BCD encoded), - * non 0 if JEDEC supported. - * @onfi_params: [INTERN] holds the ONFI page parameter when ONFI is - * supported, 0 otherwise. - * @jedec_params: [INTERN] holds the JEDEC parameter page when JEDEC is - * supported, 0 otherwise. - * @max_bb_per_die: [INTERN] the max number of bad blocks each die of a - * this nand device will encounter their life times. - * @blocks_per_die: [INTERN] The number of PEBs in a die - * @data_interface: [INTERN] NAND interface timing information - * @read_retries: [INTERN] the number of read retry modes supported - * @onfi_set_features: [REPLACEABLE] set the features for ONFI nand - * @onfi_get_features: [REPLACEABLE] get the features for ONFI nand - * @setup_data_interface: [OPTIONAL] setup the data interface and timing. If - * chipnr is set to %NAND_DATA_IFACE_CHECK_ONLY this - * means the configuration should not be applied but - * only checked. - * @bbt: [INTERN] bad block table pointer - * @bbt_td: [REPLACEABLE] bad block table descriptor for flash - * lookup. - * @bbt_md: [REPLACEABLE] bad block table mirror descriptor - * @badblock_pattern: [REPLACEABLE] bad block scan pattern used for initial - * bad block scan. - * @controller: [REPLACEABLE] a pointer to a hardware controller - * structure which is shared among multiple independent - * devices. 
- * @priv: [OPTIONAL] pointer to private chip data - * @manufacturer: [INTERN] Contains manufacturer information - */ - -struct nand_chip { - struct mtd_info mtd; - void __iomem *IO_ADDR_R; - void __iomem *IO_ADDR_W; - - uint8_t (*read_byte)(struct mtd_info *mtd); - u16 (*read_word)(struct mtd_info *mtd); - void (*write_byte)(struct mtd_info *mtd, uint8_t byte); - void (*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len); - void (*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len); - void (*select_chip)(struct mtd_info *mtd, int chip); - int (*block_bad)(struct mtd_info *mtd, loff_t ofs); - int (*block_markbad)(struct mtd_info *mtd, loff_t ofs); - void (*cmd_ctrl)(struct mtd_info *mtd, int dat, unsigned int ctrl); - int (*dev_ready)(struct mtd_info *mtd); - void (*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, - int page_addr); - int(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this); - int (*erase)(struct mtd_info *mtd, int page); - int (*scan_bbt)(struct mtd_info *mtd); - int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip, - int feature_addr, uint8_t *subfeature_para); - int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip, - int feature_addr, uint8_t *subfeature_para); - int (*setup_read_retry)(struct mtd_info *mtd, int retry_mode); - int (*setup_data_interface)(struct mtd_info *mtd, int chipnr, - const struct nand_data_interface *conf); - - - int chip_delay; - unsigned int options; - unsigned int bbt_options; - - int page_shift; - int phys_erase_shift; - int bbt_erase_shift; - int chip_shift; - int numchips; - uint64_t chipsize; - int pagemask; - int pagebuf; - unsigned int pagebuf_bitflips; - int subpagesize; - uint8_t bits_per_cell; - uint16_t ecc_strength_ds; - uint16_t ecc_step_ds; - int onfi_timing_mode_default; - int badblockpos; - int badblockbits; - - struct nand_id id; - int onfi_version; - int jedec_version; - union { - struct nand_onfi_params onfi_params; - struct nand_jedec_params jedec_params; - }; - u16 max_bb_per_die; - u32 blocks_per_die; - - struct nand_data_interface *data_interface; - - int read_retries; - - flstate_t state; - - uint8_t *oob_poi; - struct nand_hw_control *controller; - - struct nand_ecc_ctrl ecc; - struct nand_buffers *buffers; - unsigned long buf_align; - struct nand_hw_control hwcontrol; - - uint8_t *bbt; - struct nand_bbt_descr *bbt_td; - struct nand_bbt_descr *bbt_md; - - struct nand_bbt_descr *badblock_pattern; - - void *priv; - - struct { - const struct nand_manufacturer *desc; - void *priv; - } manufacturer; -}; - -extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; -extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops; - -static inline void nand_set_flash_node(struct nand_chip *chip, - struct device_node *np) -{ - mtd_set_of_node(&chip->mtd, np); -} - -static inline struct device_node *nand_get_flash_node(struct nand_chip *chip) -{ - return mtd_get_of_node(&chip->mtd); -} - -static inline struct nand_chip *mtd_to_nand(struct mtd_info *mtd) -{ - return container_of(mtd, struct nand_chip, mtd); -} - -static inline struct mtd_info *nand_to_mtd(struct nand_chip *chip) -{ - return &chip->mtd; -} - -static inline void *nand_get_controller_data(struct nand_chip *chip) -{ - return chip->priv; -} - -static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) -{ - chip->priv = priv; -} - -static inline void nand_set_manufacturer_data(struct nand_chip *chip, - void *priv) -{ - chip->manufacturer.priv = priv; -} - -static inline void 
*nand_get_manufacturer_data(struct nand_chip *chip) -{ - return chip->manufacturer.priv; -} - -/* - * NAND Flash Manufacturer ID Codes - */ -#define NAND_MFR_TOSHIBA 0x98 -#define NAND_MFR_ESMT 0xc8 -#define NAND_MFR_SAMSUNG 0xec -#define NAND_MFR_FUJITSU 0x04 -#define NAND_MFR_NATIONAL 0x8f -#define NAND_MFR_RENESAS 0x07 -#define NAND_MFR_STMICRO 0x20 -#define NAND_MFR_HYNIX 0xad -#define NAND_MFR_MICRON 0x2c -#define NAND_MFR_AMD 0x01 -#define NAND_MFR_MACRONIX 0xc2 -#define NAND_MFR_EON 0x92 -#define NAND_MFR_SANDISK 0x45 -#define NAND_MFR_INTEL 0x89 -#define NAND_MFR_ATO 0x9b -#define NAND_MFR_WINBOND 0xef - -/* The maximum expected count of bytes in the NAND ID sequence */ -#define NAND_MAX_ID_LEN 8 - -/* - * A helper for defining older NAND chips where the second ID byte fully - * defined the chip, including the geometry (chip size, eraseblock size, page - * size). All these chips have 512 bytes NAND page size. - */ -#define LEGACY_ID_NAND(nm, devid, chipsz, erasesz, opts) \ - { .name = (nm), {{ .dev_id = (devid) }}, .pagesize = 512, \ - .chipsize = (chipsz), .erasesize = (erasesz), .options = (opts) } - -/* - * A helper for defining newer chips which report their page size and - * eraseblock size via the extended ID bytes. - * - * The real difference between LEGACY_ID_NAND and EXTENDED_ID_NAND is that with - * EXTENDED_ID_NAND, manufacturers overloaded the same device ID so that the - * device ID now only represented a particular total chip size (and voltage, - * buswidth), and the page size, eraseblock size, and OOB size could vary while - * using the same device ID. - */ -#define EXTENDED_ID_NAND(nm, devid, chipsz, opts) \ - { .name = (nm), {{ .dev_id = (devid) }}, .chipsize = (chipsz), \ - .options = (opts) } - -#define NAND_ECC_INFO(_strength, _step) \ - { .strength_ds = (_strength), .step_ds = (_step) } -#define NAND_ECC_STRENGTH(type) ((type)->ecc.strength_ds) -#define NAND_ECC_STEP(type) ((type)->ecc.step_ds) - -/** - * struct nand_flash_dev - NAND Flash Device ID Structure - * @name: a human-readable name of the NAND chip - * @dev_id: the device ID (the second byte of the full chip ID array) - * @mfr_id: manufecturer ID part of the full chip ID array (refers the same - * memory address as @id[0]) - * @dev_id: device ID part of the full chip ID array (refers the same memory - * address as @id[1]) - * @id: full device ID array - * @pagesize: size of the NAND page in bytes; if 0, then the real page size (as - * well as the eraseblock size) is determined from the extended NAND - * chip ID array) - * @chipsize: total chip size in MiB - * @erasesize: eraseblock size in bytes (determined from the extended ID if 0) - * @options: stores various chip bit options - * @id_len: The valid length of the @id. - * @oobsize: OOB size - * @ecc: ECC correctability and step information from the datasheet. - * @ecc.strength_ds: The ECC correctability from the datasheet, same as the - * @ecc_strength_ds in nand_chip{}. - * @ecc.step_ds: The ECC step required by the @ecc.strength_ds, same as the - * @ecc_step_ds in nand_chip{}, also from the datasheet. - * For example, the "4bit ECC for each 512Byte" can be set with - * NAND_ECC_INFO(4, 512). - * @onfi_timing_mode_default: the default ONFI timing mode entered after a NAND - * reset. Should be deduced from timings described - * in the datasheet. 
- * - */ -struct nand_flash_dev { - char *name; - union { - struct { - uint8_t mfr_id; - uint8_t dev_id; - }; - uint8_t id[NAND_MAX_ID_LEN]; - }; - unsigned int pagesize; - unsigned int chipsize; - unsigned int erasesize; - unsigned int options; - uint16_t id_len; - uint16_t oobsize; - struct { - uint16_t strength_ds; - uint16_t step_ds; - } ecc; - int onfi_timing_mode_default; -}; - -/** - * struct nand_manufacturer - NAND Flash Manufacturer structure - * @name: Manufacturer name - * @id: manufacturer ID code of device. - * @ops: manufacturer operations -*/ -struct nand_manufacturer { - int id; - char *name; - const struct nand_manufacturer_ops *ops; -}; - -const struct nand_manufacturer *nand_get_manufacturer(u8 id); - -static inline const char * -nand_manufacturer_name(const struct nand_manufacturer *manufacturer) -{ - return manufacturer ? manufacturer->name : "Unknown"; -} - -extern struct nand_flash_dev nand_flash_ids[]; - -extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; -extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; -extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; -extern const struct nand_manufacturer_ops micron_nand_manuf_ops; -extern const struct nand_manufacturer_ops amd_nand_manuf_ops; -extern const struct nand_manufacturer_ops macronix_nand_manuf_ops; - -int nand_default_bbt(struct mtd_info *mtd); -int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); -int nand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt); -int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, - int allowbbt); -int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, - size_t *retlen, uint8_t *buf); - -/** - * struct platform_nand_chip - chip level device structure - * @nr_chips: max. number of chips to scan for - * @chip_offset: chip number offset - * @nr_partitions: number of partitions pointed to by partitions (or zero) - * @partitions: mtd partition list - * @chip_delay: R/B delay value in us - * @options: Option flags, e.g. 16bit buswidth - * @bbt_options: BBT option flags, e.g. NAND_BBT_USE_FLASH - * @part_probe_types: NULL-terminated array of probe types - */ -struct platform_nand_chip { - int nr_chips; - int chip_offset; - int nr_partitions; - struct mtd_partition *partitions; - int chip_delay; - unsigned int options; - unsigned int bbt_options; - const char **part_probe_types; -}; - -/* Keep gcc happy */ -struct platform_device; - -/** - * struct platform_nand_ctrl - controller level device structure - * @probe: platform specific function to probe/setup hardware - * @remove: platform specific function to remove/teardown hardware - * @hwcontrol: platform specific hardware control structure - * @dev_ready: platform specific function to read ready/busy pin - * @select_chip: platform specific chip select function - * @cmd_ctrl: platform specific function for controlling - * ALE/CLE/nCE. 
Also used to write command and address - * @write_buf: platform specific function for write buffer - * @read_buf: platform specific function for read buffer - * @read_byte: platform specific function to read one byte from chip - * @priv: private data to transport driver specific settings - * - * All fields are optional and depend on the hardware driver requirements - */ -struct platform_nand_ctrl { - int (*probe)(struct platform_device *pdev); - void (*remove)(struct platform_device *pdev); - void (*hwcontrol)(struct mtd_info *mtd, int cmd); - int (*dev_ready)(struct mtd_info *mtd); - void (*select_chip)(struct mtd_info *mtd, int chip); - void (*cmd_ctrl)(struct mtd_info *mtd, int dat, unsigned int ctrl); - void (*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len); - void (*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len); - unsigned char (*read_byte)(struct mtd_info *mtd); - void *priv; -}; - -/** - * struct platform_nand_data - container structure for platform-specific data - * @chip: chip level chip structure - * @ctrl: controller level device structure - */ -struct platform_nand_data { - struct platform_nand_chip chip; - struct platform_nand_ctrl ctrl; -}; - -/* return the supported features. */ -static inline int onfi_feature(struct nand_chip *chip) -{ - return chip->onfi_version ? le16_to_cpu(chip->onfi_params.features) : 0; -} - -/* return the supported asynchronous timing mode. */ -static inline int onfi_get_async_timing_mode(struct nand_chip *chip) -{ - if (!chip->onfi_version) - return ONFI_TIMING_MODE_UNKNOWN; - return le16_to_cpu(chip->onfi_params.async_timing_mode); -} - -/* return the supported synchronous timing mode. */ -static inline int onfi_get_sync_timing_mode(struct nand_chip *chip) -{ - if (!chip->onfi_version) - return ONFI_TIMING_MODE_UNKNOWN; - return le16_to_cpu(chip->onfi_params.src_sync_timing_mode); -} - -int onfi_init_data_interface(struct nand_chip *chip, - struct nand_data_interface *iface, - enum nand_data_interface_type type, - int timing_mode); - -/* - * Check if it is a SLC nand. - * The !nand_is_slc() can be used to check the MLC/TLC nand chips. - * We do not distinguish the MLC and TLC now. - */ -static inline bool nand_is_slc(struct nand_chip *chip) -{ - return chip->bits_per_cell == 1; -} - -/** - * Check if the opcode's address should be sent only on the lower 8 bits - * @command: opcode to check - */ -static inline int nand_opcode_8bits(unsigned int command) -{ - switch (command) { - case NAND_CMD_READID: - case NAND_CMD_PARAM: - case NAND_CMD_GET_FEATURES: - case NAND_CMD_SET_FEATURES: - return 1; - default: - break; - } - return 0; -} - -/* return the supported JEDEC features. */ -static inline int jedec_feature(struct nand_chip *chip) -{ - return chip->jedec_version ? le16_to_cpu(chip->jedec_params.features) - : 0; -} - -/* get timing characteristics from ONFI timing mode. */ -const struct nand_sdr_timings *onfi_async_timing_mode_to_sdr_timings(int mode); -/* get data interface from ONFI timing mode 0, used after reset. 
 */
-const struct nand_data_interface *nand_get_default_data_interface(void);
-
-int nand_check_erased_ecc_chunk(void *data, int datalen,
-				void *ecc, int ecclen,
-				void *extraoob, int extraooblen,
-				int threshold);
-
-int nand_check_ecc_caps(struct nand_chip *chip,
-			const struct nand_ecc_caps *caps, int oobavail);
-
-int nand_match_ecc_req(struct nand_chip *chip,
-		       const struct nand_ecc_caps *caps, int oobavail);
-
-int nand_maximize_ecc(struct nand_chip *chip,
-		      const struct nand_ecc_caps *caps, int oobavail);
-
-/* Default write_oob implementation */
-int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
-
-/* Default write_oob syndrome implementation */
-int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-			    int page);
-
-/* Default read_oob implementation */
-int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
-
-/* Default read_oob syndrome implementation */
-int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
-			   int page);
-
-/* Stub used by drivers that do not support GET/SET FEATURES operations */
-int nand_onfi_get_set_features_notsupp(struct mtd_info *mtd,
-				       struct nand_chip *chip, int addr,
-				       u8 *subfeature_param);
-
-/* Default read_page_raw implementation */
-int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
-		       uint8_t *buf, int oob_required, int page);
-
-/* Default write_page_raw implementation */
-int nand_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
-			const uint8_t *buf, int oob_required, int page);
-
-/* Reset and initialize a NAND device */
-int nand_reset(struct nand_chip *chip, int chipnr);
-
-/* Free resources held by the NAND device */
-void nand_cleanup(struct nand_chip *chip);
-
-/* Default extended ID decoding function */
-void nand_decode_ext_id(struct nand_chip *chip);
-#endif /* __LINUX_MTD_NAND_H */
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
new file mode 100644
index 000000000000..b371dc0914a7
--- /dev/null
+++ b/include/linux/mtd/rawnand.h
@@ -0,0 +1,1329 @@
+/*
+ * Copyright © 2000-2010 David Woodhouse
+ * Steven J. Hill
+ * Thomas Gleixner
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Info:
+ * Contains standard defines and IDs for NAND flash devices
+ *
+ * Changelog:
+ * See git changelog.
+ */
+#ifndef __LINUX_MTD_RAWNAND_H
+#define __LINUX_MTD_RAWNAND_H
+
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/flashchip.h>
+#include <linux/mtd/bbm.h>
+
+struct mtd_info;
+struct nand_flash_dev;
+struct device_node;
+
+/* Scan and identify a NAND device */
+int nand_scan(struct mtd_info *mtd, int max_chips);
+/*
+ * Separate phases of nand_scan(), allowing board driver to intervene
+ * and override command or ECC setup according to flash type.
+ */ +int nand_scan_ident(struct mtd_info *mtd, int max_chips, + struct nand_flash_dev *table); +int nand_scan_tail(struct mtd_info *mtd); + +/* Unregister the MTD device and free resources held by the NAND device */ +void nand_release(struct mtd_info *mtd); + +/* Internal helper for board drivers which need to override command function */ +void nand_wait_ready(struct mtd_info *mtd); + +/* locks all blocks present in the device */ +int nand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); + +/* unlocks specified locked blocks */ +int nand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); + +/* The maximum number of NAND chips in an array */ +#define NAND_MAX_CHIPS 8 + +/* + * Constants for hardware specific CLE/ALE/NCE function + * + * These are bits which can be or'ed to set/clear multiple + * bits in one go. + */ +/* Select the chip by setting nCE to low */ +#define NAND_NCE 0x01 +/* Select the command latch by setting CLE to high */ +#define NAND_CLE 0x02 +/* Select the address latch by setting ALE to high */ +#define NAND_ALE 0x04 + +#define NAND_CTRL_CLE (NAND_NCE | NAND_CLE) +#define NAND_CTRL_ALE (NAND_NCE | NAND_ALE) +#define NAND_CTRL_CHANGE 0x80 + +/* + * Standard NAND flash commands + */ +#define NAND_CMD_READ0 0 +#define NAND_CMD_READ1 1 +#define NAND_CMD_RNDOUT 5 +#define NAND_CMD_PAGEPROG 0x10 +#define NAND_CMD_READOOB 0x50 +#define NAND_CMD_ERASE1 0x60 +#define NAND_CMD_STATUS 0x70 +#define NAND_CMD_SEQIN 0x80 +#define NAND_CMD_RNDIN 0x85 +#define NAND_CMD_READID 0x90 +#define NAND_CMD_ERASE2 0xd0 +#define NAND_CMD_PARAM 0xec +#define NAND_CMD_GET_FEATURES 0xee +#define NAND_CMD_SET_FEATURES 0xef +#define NAND_CMD_RESET 0xff + +#define NAND_CMD_LOCK 0x2a +#define NAND_CMD_UNLOCK1 0x23 +#define NAND_CMD_UNLOCK2 0x24 + +/* Extended commands for large page devices */ +#define NAND_CMD_READSTART 0x30 +#define NAND_CMD_RNDOUTSTART 0xE0 +#define NAND_CMD_CACHEDPROG 0x15 + +#define NAND_CMD_NONE -1 + +/* Status bits */ +#define NAND_STATUS_FAIL 0x01 +#define NAND_STATUS_FAIL_N1 0x02 +#define NAND_STATUS_TRUE_READY 0x20 +#define NAND_STATUS_READY 0x40 +#define NAND_STATUS_WP 0x80 + +#define NAND_DATA_IFACE_CHECK_ONLY -1 + +/* + * Constants for ECC_MODES + */ +typedef enum { + NAND_ECC_NONE, + NAND_ECC_SOFT, + NAND_ECC_HW, + NAND_ECC_HW_SYNDROME, + NAND_ECC_HW_OOB_FIRST, + NAND_ECC_ON_DIE, +} nand_ecc_modes_t; + +enum nand_ecc_algo { + NAND_ECC_UNKNOWN, + NAND_ECC_HAMMING, + NAND_ECC_BCH, +}; + +/* + * Constants for Hardware ECC + */ +/* Reset Hardware ECC for read */ +#define NAND_ECC_READ 0 +/* Reset Hardware ECC for write */ +#define NAND_ECC_WRITE 1 +/* Enable Hardware ECC before syndrome is read back from flash */ +#define NAND_ECC_READSYN 2 + +/* + * Enable generic NAND 'page erased' check. This check is only done when + * ecc.correct() returns -EBADMSG. + * Set this flag if your implementation does not fix bitflips in erased + * pages and you want to rely on the default implementation. + */ +#define NAND_ECC_GENERIC_ERASED_CHECK BIT(0) +#define NAND_ECC_MAXIMIZE BIT(1) +/* + * If your controller already sends the required NAND commands when + * reading or writing a page, then the framework is not supposed to + * send READ0 and SEQIN/PAGEPROG respectively. + */ +#define NAND_ECC_CUSTOM_PAGE_ACCESS BIT(2) + +/* Bit mask for flags passed to do_nand_read_ecc */ +#define NAND_GET_DEVICE 0x80 + + +/* + * Option constants for bizarre disfunctionality and real + * features. 
+ */ +/* Buswidth is 16 bit */ +#define NAND_BUSWIDTH_16 0x00000002 +/* Chip has cache program function */ +#define NAND_CACHEPRG 0x00000008 +/* + * Chip requires ready check on read (for auto-incremented sequential read). + * True only for small page devices; large page devices do not support + * autoincrement. + */ +#define NAND_NEED_READRDY 0x00000100 + +/* Chip does not allow subpage writes */ +#define NAND_NO_SUBPAGE_WRITE 0x00000200 + +/* Device is one of 'new' xD cards that expose fake nand command set */ +#define NAND_BROKEN_XD 0x00000400 + +/* Device behaves just like nand, but is readonly */ +#define NAND_ROM 0x00000800 + +/* Device supports subpage reads */ +#define NAND_SUBPAGE_READ 0x00001000 + +/* + * Some MLC NANDs need data scrambling to limit bitflips caused by repeated + * patterns. + */ +#define NAND_NEED_SCRAMBLING 0x00002000 + +/* Options valid for Samsung large page devices */ +#define NAND_SAMSUNG_LP_OPTIONS NAND_CACHEPRG + +/* Macros to identify the above */ +#define NAND_HAS_CACHEPROG(chip) ((chip->options & NAND_CACHEPRG)) +#define NAND_HAS_SUBPAGE_READ(chip) ((chip->options & NAND_SUBPAGE_READ)) +#define NAND_HAS_SUBPAGE_WRITE(chip) !((chip)->options & NAND_NO_SUBPAGE_WRITE) + +/* Non chip related options */ +/* This option skips the bbt scan during initialization. */ +#define NAND_SKIP_BBTSCAN 0x00010000 +/* + * This option is defined if the board driver allocates its own buffers + * (e.g. because it needs them DMA-coherent). + */ +#define NAND_OWN_BUFFERS 0x00020000 +/* Chip may not exist, so silence any errors in scan */ +#define NAND_SCAN_SILENT_NODEV 0x00040000 +/* + * Autodetect nand buswidth with readid/onfi. + * This suppose the driver will configure the hardware in 8 bits mode + * when calling nand_scan_ident, and update its configuration + * before calling nand_scan_tail. + */ +#define NAND_BUSWIDTH_AUTO 0x00080000 +/* + * This option could be defined by controller drivers to protect against + * kmap'ed, vmalloc'ed highmem buffers being passed from upper layers + */ +#define NAND_USE_BOUNCE_BUFFER 0x00100000 + +/* + * In case your controller is implementing ->cmd_ctrl() and is relying on the + * default ->cmdfunc() implementation, you may want to let the core handle the + * tCCS delay which is required when a column change (RNDIN or RNDOUT) is + * requested. + * If your controller already takes care of this delay, you don't need to set + * this flag. 
+ */ +#define NAND_WAIT_TCCS 0x00200000 + +/* Options set by nand scan */ +/* Nand scan has allocated controller struct */ +#define NAND_CONTROLLER_ALLOC 0x80000000 + +/* Cell info constants */ +#define NAND_CI_CHIPNR_MSK 0x03 +#define NAND_CI_CELLTYPE_MSK 0x0C +#define NAND_CI_CELLTYPE_SHIFT 2 + +/* Keep gcc happy */ +struct nand_chip; + +/* ONFI features */ +#define ONFI_FEATURE_16_BIT_BUS (1 << 0) +#define ONFI_FEATURE_EXT_PARAM_PAGE (1 << 7) + +/* ONFI timing mode, used in both asynchronous and synchronous mode */ +#define ONFI_TIMING_MODE_0 (1 << 0) +#define ONFI_TIMING_MODE_1 (1 << 1) +#define ONFI_TIMING_MODE_2 (1 << 2) +#define ONFI_TIMING_MODE_3 (1 << 3) +#define ONFI_TIMING_MODE_4 (1 << 4) +#define ONFI_TIMING_MODE_5 (1 << 5) +#define ONFI_TIMING_MODE_UNKNOWN (1 << 6) + +/* ONFI feature address */ +#define ONFI_FEATURE_ADDR_TIMING_MODE 0x1 + +/* Vendor-specific feature address (Micron) */ +#define ONFI_FEATURE_ADDR_READ_RETRY 0x89 +#define ONFI_FEATURE_ON_DIE_ECC 0x90 +#define ONFI_FEATURE_ON_DIE_ECC_EN BIT(3) + +/* ONFI subfeature parameters length */ +#define ONFI_SUBFEATURE_PARAM_LEN 4 + +/* ONFI optional commands SET/GET FEATURES supported? */ +#define ONFI_OPT_CMD_SET_GET_FEATURES (1 << 2) + +struct nand_onfi_params { + /* rev info and features block */ + /* 'O' 'N' 'F' 'I' */ + u8 sig[4]; + __le16 revision; + __le16 features; + __le16 opt_cmd; + u8 reserved0[2]; + __le16 ext_param_page_length; /* since ONFI 2.1 */ + u8 num_of_param_pages; /* since ONFI 2.1 */ + u8 reserved1[17]; + + /* manufacturer information block */ + char manufacturer[12]; + char model[20]; + u8 jedec_id; + __le16 date_code; + u8 reserved2[13]; + + /* memory organization block */ + __le32 byte_per_page; + __le16 spare_bytes_per_page; + __le32 data_bytes_per_ppage; + __le16 spare_bytes_per_ppage; + __le32 pages_per_block; + __le32 blocks_per_lun; + u8 lun_count; + u8 addr_cycles; + u8 bits_per_cell; + __le16 bb_per_lun; + __le16 block_endurance; + u8 guaranteed_good_blocks; + __le16 guaranteed_block_endurance; + u8 programs_per_page; + u8 ppage_attr; + u8 ecc_bits; + u8 interleaved_bits; + u8 interleaved_ops; + u8 reserved3[13]; + + /* electrical parameter block */ + u8 io_pin_capacitance_max; + __le16 async_timing_mode; + __le16 program_cache_timing_mode; + __le16 t_prog; + __le16 t_bers; + __le16 t_r; + __le16 t_ccs; + __le16 src_sync_timing_mode; + u8 src_ssync_features; + __le16 clk_pin_capacitance_typ; + __le16 io_pin_capacitance_typ; + __le16 input_pin_capacitance_typ; + u8 input_pin_capacitance_max; + u8 driver_strength_support; + __le16 t_int_r; + __le16 t_adl; + u8 reserved4[8]; + + /* vendor */ + __le16 vendor_revision; + u8 vendor[88]; + + __le16 crc; +} __packed; + +#define ONFI_CRC_BASE 0x4F4E + +/* Extended ECC information Block Definition (since ONFI 2.1) */ +struct onfi_ext_ecc_info { + u8 ecc_bits; + u8 codeword_size; + __le16 bb_per_lun; + __le16 block_endurance; + u8 reserved[2]; +} __packed; + +#define ONFI_SECTION_TYPE_0 0 /* Unused section. */ +#define ONFI_SECTION_TYPE_1 1 /* for additional sections. */ +#define ONFI_SECTION_TYPE_2 2 /* for ECC information. */ +struct onfi_ext_section { + u8 type; + u8 length; +} __packed; + +#define ONFI_EXT_SECTION_MAX 8 + +/* Extended Parameter Page Definition (since ONFI 2.1) */ +struct onfi_ext_param_page { + __le16 crc; + u8 sig[4]; /* 'E' 'P' 'P' 'S' */ + u8 reserved0[10]; + struct onfi_ext_section sections[ONFI_EXT_SECTION_MAX]; + + /* + * The actual size of the Extended Parameter Page is in + * @ext_param_page_length of nand_onfi_params{}. 
+ * The following are the variable length sections. + * So we do not add any fields below. Please see the ONFI spec. + */ +} __packed; + +struct jedec_ecc_info { + u8 ecc_bits; + u8 codeword_size; + __le16 bb_per_lun; + __le16 block_endurance; + u8 reserved[2]; +} __packed; + +/* JEDEC features */ +#define JEDEC_FEATURE_16_BIT_BUS (1 << 0) + +struct nand_jedec_params { + /* rev info and features block */ + /* 'J' 'E' 'S' 'D' */ + u8 sig[4]; + __le16 revision; + __le16 features; + u8 opt_cmd[3]; + __le16 sec_cmd; + u8 num_of_param_pages; + u8 reserved0[18]; + + /* manufacturer information block */ + char manufacturer[12]; + char model[20]; + u8 jedec_id[6]; + u8 reserved1[10]; + + /* memory organization block */ + __le32 byte_per_page; + __le16 spare_bytes_per_page; + u8 reserved2[6]; + __le32 pages_per_block; + __le32 blocks_per_lun; + u8 lun_count; + u8 addr_cycles; + u8 bits_per_cell; + u8 programs_per_page; + u8 multi_plane_addr; + u8 multi_plane_op_attr; + u8 reserved3[38]; + + /* electrical parameter block */ + __le16 async_sdr_speed_grade; + __le16 toggle_ddr_speed_grade; + __le16 sync_ddr_speed_grade; + u8 async_sdr_features; + u8 toggle_ddr_features; + u8 sync_ddr_features; + __le16 t_prog; + __le16 t_bers; + __le16 t_r; + __le16 t_r_multi_plane; + __le16 t_ccs; + __le16 io_pin_capacitance_typ; + __le16 input_pin_capacitance_typ; + __le16 clk_pin_capacitance_typ; + u8 driver_strength_support; + __le16 t_adl; + u8 reserved4[36]; + + /* ECC and endurance block */ + u8 guaranteed_good_blocks; + __le16 guaranteed_block_endurance; + struct jedec_ecc_info ecc_info[4]; + u8 reserved5[29]; + + /* reserved */ + u8 reserved6[148]; + + /* vendor */ + __le16 vendor_rev_num; + u8 reserved7[88]; + + /* CRC for Parameter Page */ + __le16 crc; +} __packed; + +/** + * struct nand_id - NAND id structure + * @data: buffer containing the id bytes. Currently 8 bytes large, but can + * be extended if required. + * @len: ID length. + */ +struct nand_id { + u8 data[8]; + int len; +}; + +/** + * struct nand_hw_control - Control structure for hardware controller (e.g. ECC generator) shared among independent devices + * @lock: protection lock + * @active: the mtd device which holds the controller currently + * @wq: wait queue to sleep on if a NAND operation is in + * progress; used instead of the per-chip wait queue + * when a hw controller is available. + */ +struct nand_hw_control { + spinlock_t lock; + struct nand_chip *active; + wait_queue_head_t wq; +}; + +static inline void nand_hw_control_init(struct nand_hw_control *nfc) +{ + nfc->active = NULL; + spin_lock_init(&nfc->lock); + init_waitqueue_head(&nfc->wq); +} + +/** + * struct nand_ecc_step_info - ECC step information of ECC engine + * @stepsize: data bytes per ECC step + * @strengths: array of supported strengths + * @nstrengths: number of supported strengths + */ +struct nand_ecc_step_info { + int stepsize; + const int *strengths; + int nstrengths; +}; + +/** + * struct nand_ecc_caps - capability of ECC engine + * @stepinfos: array of ECC step information + * @nstepinfos: number of ECC step information + * @calc_ecc_bytes: driver's hook to calculate ECC bytes per step + */ +struct nand_ecc_caps { + const struct nand_ecc_step_info *stepinfos; + int nstepinfos; + int (*calc_ecc_bytes)(int step_size, int strength); +}; + +/* a shorthand to generate struct nand_ecc_caps with only one ECC stepsize */ +#define NAND_ECC_CAPS_SINGLE(__name, __calc, __step, ...)
\ +static const int __name##_strengths[] = { __VA_ARGS__ }; \ +static const struct nand_ecc_step_info __name##_stepinfo = { \ + .stepsize = __step, \ + .strengths = __name##_strengths, \ + .nstrengths = ARRAY_SIZE(__name##_strengths), \ +}; \ +static const struct nand_ecc_caps __name = { \ + .stepinfos = &__name##_stepinfo, \ + .nstepinfos = 1, \ + .calc_ecc_bytes = __calc, \ +} + +/** + * struct nand_ecc_ctrl - Control structure for ECC + * @mode: ECC mode + * @algo: ECC algorithm + * @steps: number of ECC steps per page + * @size: data bytes per ECC step + * @bytes: ECC bytes per step + * @strength: max number of correctable bits per ECC step + * @total: total number of ECC bytes per page + * @prepad: padding information for syndrome based ECC generators + * @postpad: padding information for syndrome based ECC generators + * @options: ECC specific options (see NAND_ECC_XXX flags defined above) + * @priv: pointer to private ECC control data + * @hwctl: function to control hardware ECC generator. Must only + * be provided if a hardware ECC is available + * @calculate: function for ECC calculation or readback from ECC hardware + * @correct: function for ECC correction, matching to ECC generator (sw/hw). + * Should return a positive number representing the number of + * corrected bitflips, -EBADMSG if the number of bitflips exceeds + * ECC strength, or any other error code if the error is not + * directly related to correction. + * If -EBADMSG is returned the input buffers should be left + * untouched. + * @read_page_raw: function to read a raw page without ECC. This function + * should hide the specific layout used by the ECC + * controller and always return contiguous in-band and + * out-of-band data even if they're not stored + * contiguously on the NAND chip (e.g. + * NAND_ECC_HW_SYNDROME interleaves in-band and + * out-of-band data). + * @write_page_raw: function to write a raw page without ECC. This function + * should hide the specific layout used by the ECC + * controller and consider the passed data as contiguous + * in-band and out-of-band data. ECC controller is + * responsible for doing the appropriate transformations + * to adapt to its specific layout (e.g. + * NAND_ECC_HW_SYNDROME interleaves in-band and + * out-of-band data). + * @read_page: function to read a page according to the ECC generator + * requirements; returns maximum number of bitflips corrected in + * any single ECC step, or -EIO on hw error + * @read_subpage: function to read parts of the page covered by ECC; + * returns same as read_page() + * @write_subpage: function to write parts of the page covered by ECC. + * @write_page: function to write a page according to the ECC generator + * requirements.
+ * @write_oob_raw: function to write chip OOB data without ECC + * @read_oob_raw: function to read chip OOB data without ECC + * @read_oob: function to read chip OOB data + * @write_oob: function to write chip OOB data + */ +struct nand_ecc_ctrl { + nand_ecc_modes_t mode; + enum nand_ecc_algo algo; + int steps; + int size; + int bytes; + int total; + int strength; + int prepad; + int postpad; + unsigned int options; + void *priv; + void (*hwctl)(struct mtd_info *mtd, int mode); + int (*calculate)(struct mtd_info *mtd, const uint8_t *dat, + uint8_t *ecc_code); + int (*correct)(struct mtd_info *mtd, uint8_t *dat, uint8_t *read_ecc, + uint8_t *calc_ecc); + int (*read_page_raw)(struct mtd_info *mtd, struct nand_chip *chip, + uint8_t *buf, int oob_required, int page); + int (*write_page_raw)(struct mtd_info *mtd, struct nand_chip *chip, + const uint8_t *buf, int oob_required, int page); + int (*read_page)(struct mtd_info *mtd, struct nand_chip *chip, + uint8_t *buf, int oob_required, int page); + int (*read_subpage)(struct mtd_info *mtd, struct nand_chip *chip, + uint32_t offs, uint32_t len, uint8_t *buf, int page); + int (*write_subpage)(struct mtd_info *mtd, struct nand_chip *chip, + uint32_t offset, uint32_t data_len, + const uint8_t *data_buf, int oob_required, int page); + int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip, + const uint8_t *buf, int oob_required, int page); + int (*write_oob_raw)(struct mtd_info *mtd, struct nand_chip *chip, + int page); + int (*read_oob_raw)(struct mtd_info *mtd, struct nand_chip *chip, + int page); + int (*read_oob)(struct mtd_info *mtd, struct nand_chip *chip, int page); + int (*write_oob)(struct mtd_info *mtd, struct nand_chip *chip, + int page); +}; + +static inline int nand_standard_page_accessors(struct nand_ecc_ctrl *ecc) +{ + return !(ecc->options & NAND_ECC_CUSTOM_PAGE_ACCESS); +} + +/** + * struct nand_buffers - buffer structure for read/write + * @ecccalc: buffer pointer for calculated ECC, size is oobsize. + * @ecccode: buffer pointer for ECC read from flash, size is oobsize. + * @databuf: buffer pointer for data, size is (page size + oobsize). + * + * Do not change the order of buffers. databuf and oobrbuf must be in + * consecutive order. + */ +struct nand_buffers { + uint8_t *ecccalc; + uint8_t *ecccode; + uint8_t *databuf; +}; + +/** + * struct nand_sdr_timings - SDR NAND chip timings + * + * This struct defines the timing requirements of an SDR NAND chip. + * This information can be found in every NAND datasheet, and the meaning of + * the timings is described in the ONFI specifications: + * www.onfi.org/~/media/ONFI/specs/onfi_3_1_spec.pdf (chapter 4.15 Timing + * Parameters) + * + * All these timings are expressed in picoseconds.
+ * + * @tBERS_max: Block erase time + * @tCCS_min: Change column setup time + * @tPROG_max: Page program time + * @tR_max: Page read time + * @tALH_min: ALE hold time + * @tADL_min: ALE to data loading time + * @tALS_min: ALE setup time + * @tAR_min: ALE to RE# delay + * @tCEA_max: CE# access time + * @tCEH_min: CE# high hold time + * @tCH_min: CE# hold time + * @tCHZ_max: CE# high to output hi-Z + * @tCLH_min: CLE hold time + * @tCLR_min: CLE to RE# delay + * @tCLS_min: CLE setup time + * @tCOH_min: CE# high to output hold + * @tCS_min: CE# setup time + * @tDH_min: Data hold time + * @tDS_min: Data setup time + * @tFEAT_max: Busy time for Set Features and Get Features + * @tIR_min: Output hi-Z to RE# low + * @tITC_max: Interface and Timing Mode Change time + * @tRC_min: RE# cycle time + * @tREA_max: RE# access time + * @tREH_min: RE# high hold time + * @tRHOH_min: RE# high to output hold + * @tRHW_min: RE# high to WE# low + * @tRHZ_max: RE# high to output hi-Z + * @tRLOH_min: RE# low to output hold + * @tRP_min: RE# pulse width + * @tRR_min: Ready to RE# low (data only) + * @tRST_max: Device reset time, measured from the falling edge of R/B# to the + * rising edge of R/B#. + * @tWB_max: WE# high to SR[6] low + * @tWC_min: WE# cycle time + * @tWH_min: WE# high hold time + * @tWHR_min: WE# high to RE# low + * @tWP_min: WE# pulse width + * @tWW_min: WP# transition to WE# low + */ +struct nand_sdr_timings { + u32 tBERS_max; + u32 tCCS_min; + u32 tPROG_max; + u32 tR_max; + u32 tALH_min; + u32 tADL_min; + u32 tALS_min; + u32 tAR_min; + u32 tCEA_max; + u32 tCEH_min; + u32 tCH_min; + u32 tCHZ_max; + u32 tCLH_min; + u32 tCLR_min; + u32 tCLS_min; + u32 tCOH_min; + u32 tCS_min; + u32 tDH_min; + u32 tDS_min; + u32 tFEAT_max; + u32 tIR_min; + u32 tITC_max; + u32 tRC_min; + u32 tREA_max; + u32 tREH_min; + u32 tRHOH_min; + u32 tRHW_min; + u32 tRHZ_max; + u32 tRLOH_min; + u32 tRP_min; + u32 tRR_min; + u64 tRST_max; + u32 tWB_max; + u32 tWC_min; + u32 tWH_min; + u32 tWHR_min; + u32 tWP_min; + u32 tWW_min; +}; + +/** + * enum nand_data_interface_type - NAND interface timing type + * @NAND_SDR_IFACE: Single Data Rate interface + */ +enum nand_data_interface_type { + NAND_SDR_IFACE, +}; + +/** + * struct nand_data_interface - NAND interface timing + * @type: type of the timing + * @timings: The timing, type according to @type + */ +struct nand_data_interface { + enum nand_data_interface_type type; + union { + struct nand_sdr_timings sdr; + } timings; +}; + +/** + * nand_get_sdr_timings - get SDR timing from data interface + * @conf: The data interface + */ +static inline const struct nand_sdr_timings * +nand_get_sdr_timings(const struct nand_data_interface *conf) +{ + if (conf->type != NAND_SDR_IFACE) + return ERR_PTR(-EINVAL); + + return &conf->timings.sdr; +} + +/** + * struct nand_manufacturer_ops - NAND Manufacturer operations + * @detect: detect the NAND memory organization and capabilities + * @init: initialize all vendor specific fields (like the ->read_retry() + * implementation) if any. + * @cleanup: the ->init() function may have allocated resources, ->cleanup() + * is here to let vendor specific code release those resources. 
+ */ +struct nand_manufacturer_ops { + void (*detect)(struct nand_chip *chip); + int (*init)(struct nand_chip *chip); + void (*cleanup)(struct nand_chip *chip); +}; + +/** + * struct nand_chip - NAND Private Flash Chip Data + * @mtd: MTD device registered to the MTD framework + * @IO_ADDR_R: [BOARDSPECIFIC] address to read the 8 I/O lines of the + * flash device + * @IO_ADDR_W: [BOARDSPECIFIC] address to write the 8 I/O lines of the + * flash device. + * @read_byte: [REPLACEABLE] read one byte from the chip + * @read_word: [REPLACEABLE] read one word from the chip + * @write_byte: [REPLACEABLE] write a single byte to the chip on the + * low 8 I/O lines + * @write_buf: [REPLACEABLE] write data from the buffer to the chip + * @read_buf: [REPLACEABLE] read data from the chip into the buffer + * @select_chip: [REPLACEABLE] select chip nr + * @block_bad: [REPLACEABLE] check if a block is bad, using OOB markers + * @block_markbad: [REPLACEABLE] mark a block bad + * @cmd_ctrl: [BOARDSPECIFIC] hardware specific function for controlling + * ALE/CLE/nCE. Also used to write command and address + * @dev_ready: [BOARDSPECIFIC] hardware specific function for accessing + * device ready/busy line. If set to NULL no access to + * ready/busy is available and the ready/busy information + * is read from the chip status register. + * @cmdfunc: [REPLACEABLE] hardware specific function for writing + * commands to the chip. + * @waitfunc: [REPLACEABLE] hardware specific function to wait for the + * chip to become ready. + * @setup_read_retry: [FLASHSPECIFIC] flash (vendor) specific function for + * setting the read-retry mode. Mostly needed for MLC NAND. + * @ecc: [BOARDSPECIFIC] ECC control structure + * @buffers: buffer structure for read/write + * @buf_align: minimum buffer alignment required by a platform + * @hwcontrol: platform-specific hardware control structure + * @erase: [REPLACEABLE] erase function + * @scan_bbt: [REPLACEABLE] function to scan bad block table + * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transferring + * data from array to read regs (tR). + * @state: [INTERN] the current state of the NAND device + * @oob_poi: "poison value buffer," used for laying out OOB data + * before writing + * @page_shift: [INTERN] number of address bits in a page (column + * address bits). + * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock + * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry + * @chip_shift: [INTERN] number of address bits in one chip + * @options: [BOARDSPECIFIC] various chip options. They can partly + * be set to inform nand_scan about special functionality. + * See the defines for further explanation. + * @bbt_options: [INTERN] bad block specific options. All options used + * here must come from bbm.h. By default, these options + * will be copied to the appropriate nand_bbt_descr's. + * @badblockpos: [INTERN] position of the bad block marker in the oob + * area. + * @badblockbits: [INTERN] minimum number of set bits in a good block's + * bad block marker position; i.e., BBM == 11110111b is + * not bad when badblockbits == 7 + * @bits_per_cell: [INTERN] number of bits per cell. i.e., 1 means SLC. + * @ecc_strength_ds: [INTERN] ECC correctability from the datasheet. + * Minimum amount of bit errors per @ecc_step_ds guaranteed + * to be correctable. If unknown, set to zero. + * @ecc_step_ds: [INTERN] ECC step required by the @ecc_strength_ds, + * also from the datasheet. It is the recommended ECC step + * size, if known; if unknown, set to zero.
+ * @onfi_timing_mode_default: [INTERN] default ONFI timing mode. This field is + * set to the actually used ONFI mode if the chip is + * ONFI compliant or deduced from the datasheet if + * the NAND chip is not ONFI compliant. + * @numchips: [INTERN] number of physical chips + * @chipsize: [INTERN] the size of one chip for multichip arrays + * @pagemask: [INTERN] page number mask = number of (pages / chip) - 1 + * @pagebuf: [INTERN] holds the page number which is currently in + * data_buf. + * @pagebuf_bitflips: [INTERN] holds the bitflip count for the page which is + * currently in data_buf. + * @subpagesize: [INTERN] holds the subpagesize + * @id: [INTERN] holds NAND ID + * @onfi_version: [INTERN] holds the chip ONFI version (BCD encoded), + * non-zero if ONFI is supported. + * @jedec_version: [INTERN] holds the chip JEDEC version (BCD encoded), + * non-zero if JEDEC is supported. + * @onfi_params: [INTERN] holds the ONFI parameter page when ONFI is + * supported, 0 otherwise. + * @jedec_params: [INTERN] holds the JEDEC parameter page when JEDEC is + * supported, 0 otherwise. + * @max_bb_per_die: [INTERN] the max number of bad blocks each die of + * this NAND device may encounter over its lifetime. + * @blocks_per_die: [INTERN] The number of PEBs in a die + * @data_interface: [INTERN] NAND interface timing information + * @read_retries: [INTERN] the number of read retry modes supported + * @onfi_set_features: [REPLACEABLE] set the features for ONFI nand + * @onfi_get_features: [REPLACEABLE] get the features for ONFI nand + * @setup_data_interface: [OPTIONAL] setup the data interface and timing. If + * chipnr is set to %NAND_DATA_IFACE_CHECK_ONLY this + * means the configuration should not be applied but + * only checked. + * @bbt: [INTERN] bad block table pointer + * @bbt_td: [REPLACEABLE] bad block table descriptor for flash + * lookup. + * @bbt_md: [REPLACEABLE] bad block table mirror descriptor + * @badblock_pattern: [REPLACEABLE] bad block scan pattern used for initial + * bad block scan. + * @controller: [REPLACEABLE] a pointer to a hardware controller + * structure which is shared among multiple independent + * devices.
+ * @priv: [OPTIONAL] pointer to private chip data + * @manufacturer: [INTERN] Contains manufacturer information + */ + +struct nand_chip { + struct mtd_info mtd; + void __iomem *IO_ADDR_R; + void __iomem *IO_ADDR_W; + + uint8_t (*read_byte)(struct mtd_info *mtd); + u16 (*read_word)(struct mtd_info *mtd); + void (*write_byte)(struct mtd_info *mtd, uint8_t byte); + void (*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len); + void (*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len); + void (*select_chip)(struct mtd_info *mtd, int chip); + int (*block_bad)(struct mtd_info *mtd, loff_t ofs); + int (*block_markbad)(struct mtd_info *mtd, loff_t ofs); + void (*cmd_ctrl)(struct mtd_info *mtd, int dat, unsigned int ctrl); + int (*dev_ready)(struct mtd_info *mtd); + void (*cmdfunc)(struct mtd_info *mtd, unsigned command, int column, + int page_addr); + int(*waitfunc)(struct mtd_info *mtd, struct nand_chip *this); + int (*erase)(struct mtd_info *mtd, int page); + int (*scan_bbt)(struct mtd_info *mtd); + int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip, + int feature_addr, uint8_t *subfeature_para); + int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip, + int feature_addr, uint8_t *subfeature_para); + int (*setup_read_retry)(struct mtd_info *mtd, int retry_mode); + int (*setup_data_interface)(struct mtd_info *mtd, int chipnr, + const struct nand_data_interface *conf); + + + int chip_delay; + unsigned int options; + unsigned int bbt_options; + + int page_shift; + int phys_erase_shift; + int bbt_erase_shift; + int chip_shift; + int numchips; + uint64_t chipsize; + int pagemask; + int pagebuf; + unsigned int pagebuf_bitflips; + int subpagesize; + uint8_t bits_per_cell; + uint16_t ecc_strength_ds; + uint16_t ecc_step_ds; + int onfi_timing_mode_default; + int badblockpos; + int badblockbits; + + struct nand_id id; + int onfi_version; + int jedec_version; + union { + struct nand_onfi_params onfi_params; + struct nand_jedec_params jedec_params; + }; + u16 max_bb_per_die; + u32 blocks_per_die; + + struct nand_data_interface *data_interface; + + int read_retries; + + flstate_t state; + + uint8_t *oob_poi; + struct nand_hw_control *controller; + + struct nand_ecc_ctrl ecc; + struct nand_buffers *buffers; + unsigned long buf_align; + struct nand_hw_control hwcontrol; + + uint8_t *bbt; + struct nand_bbt_descr *bbt_td; + struct nand_bbt_descr *bbt_md; + + struct nand_bbt_descr *badblock_pattern; + + void *priv; + + struct { + const struct nand_manufacturer *desc; + void *priv; + } manufacturer; +}; + +extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; +extern const struct mtd_ooblayout_ops nand_ooblayout_lp_ops; + +static inline void nand_set_flash_node(struct nand_chip *chip, + struct device_node *np) +{ + mtd_set_of_node(&chip->mtd, np); +} + +static inline struct device_node *nand_get_flash_node(struct nand_chip *chip) +{ + return mtd_get_of_node(&chip->mtd); +} + +static inline struct nand_chip *mtd_to_nand(struct mtd_info *mtd) +{ + return container_of(mtd, struct nand_chip, mtd); +} + +static inline struct mtd_info *nand_to_mtd(struct nand_chip *chip) +{ + return &chip->mtd; +} + +static inline void *nand_get_controller_data(struct nand_chip *chip) +{ + return chip->priv; +} + +static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) +{ + chip->priv = priv; +} + +static inline void nand_set_manufacturer_data(struct nand_chip *chip, + void *priv) +{ + chip->manufacturer.priv = priv; +} + +static inline void 
*nand_get_manufacturer_data(struct nand_chip *chip) +{ + return chip->manufacturer.priv; +} + +/* + * NAND Flash Manufacturer ID Codes + */ +#define NAND_MFR_TOSHIBA 0x98 +#define NAND_MFR_ESMT 0xc8 +#define NAND_MFR_SAMSUNG 0xec +#define NAND_MFR_FUJITSU 0x04 +#define NAND_MFR_NATIONAL 0x8f +#define NAND_MFR_RENESAS 0x07 +#define NAND_MFR_STMICRO 0x20 +#define NAND_MFR_HYNIX 0xad +#define NAND_MFR_MICRON 0x2c +#define NAND_MFR_AMD 0x01 +#define NAND_MFR_MACRONIX 0xc2 +#define NAND_MFR_EON 0x92 +#define NAND_MFR_SANDISK 0x45 +#define NAND_MFR_INTEL 0x89 +#define NAND_MFR_ATO 0x9b +#define NAND_MFR_WINBOND 0xef + +/* The maximum expected count of bytes in the NAND ID sequence */ +#define NAND_MAX_ID_LEN 8 + +/* + * A helper for defining older NAND chips where the second ID byte fully + * defined the chip, including the geometry (chip size, eraseblock size, page + * size). All these chips have a 512 byte NAND page size. + */ +#define LEGACY_ID_NAND(nm, devid, chipsz, erasesz, opts) \ + { .name = (nm), {{ .dev_id = (devid) }}, .pagesize = 512, \ + .chipsize = (chipsz), .erasesize = (erasesz), .options = (opts) } + +/* + * A helper for defining newer chips which report their page size and + * eraseblock size via the extended ID bytes. + * + * The real difference between LEGACY_ID_NAND and EXTENDED_ID_NAND is that with + * EXTENDED_ID_NAND, manufacturers overloaded the same device ID so that the + * device ID now only represented a particular total chip size (and voltage, + * buswidth), and the page size, eraseblock size, and OOB size could vary while + * using the same device ID. + */ +#define EXTENDED_ID_NAND(nm, devid, chipsz, opts) \ + { .name = (nm), {{ .dev_id = (devid) }}, .chipsize = (chipsz), \ + .options = (opts) } + +#define NAND_ECC_INFO(_strength, _step) \ + { .strength_ds = (_strength), .step_ds = (_step) } +#define NAND_ECC_STRENGTH(type) ((type)->ecc.strength_ds) +#define NAND_ECC_STEP(type) ((type)->ecc.step_ds) + +/** + * struct nand_flash_dev - NAND Flash Device ID Structure + * @name: a human-readable name of the NAND chip + * @mfr_id: manufacturer ID part of the full chip ID array (refers to the same + * memory address as @id[0]) + * @dev_id: device ID part of the full chip ID array (refers to the same memory + * address as @id[1]) + * @id: full device ID array + * @pagesize: size of the NAND page in bytes; if 0, then the real page size (as + * well as the eraseblock size) is determined from the extended NAND + * chip ID array + * @chipsize: total chip size in MiB + * @erasesize: eraseblock size in bytes (determined from the extended ID if 0) + * @options: stores various chip bit options + * @id_len: The valid length of the @id. + * @oobsize: OOB size + * @ecc: ECC correctability and step information from the datasheet. + * @ecc.strength_ds: The ECC correctability from the datasheet, same as the + * @ecc_strength_ds in nand_chip{}. + * @ecc.step_ds: The ECC step required by the @ecc.strength_ds, same as the + * @ecc_step_ds in nand_chip{}, also from the datasheet. + * For example, the "4bit ECC for each 512Byte" can be set with + * NAND_ECC_INFO(4, 512). + * @onfi_timing_mode_default: the default ONFI timing mode entered after a NAND + * reset. Should be deduced from timings described + * in the datasheet.
+ * + */ +struct nand_flash_dev { + char *name; + union { + struct { + uint8_t mfr_id; + uint8_t dev_id; + }; + uint8_t id[NAND_MAX_ID_LEN]; + }; + unsigned int pagesize; + unsigned int chipsize; + unsigned int erasesize; + unsigned int options; + uint16_t id_len; + uint16_t oobsize; + struct { + uint16_t strength_ds; + uint16_t step_ds; + } ecc; + int onfi_timing_mode_default; +}; + +/** + * struct nand_manufacturer - NAND Flash Manufacturer structure + * @name: Manufacturer name + * @id: manufacturer ID code of device. + * @ops: manufacturer operations +*/ +struct nand_manufacturer { + int id; + char *name; + const struct nand_manufacturer_ops *ops; +}; + +const struct nand_manufacturer *nand_get_manufacturer(u8 id); + +static inline const char * +nand_manufacturer_name(const struct nand_manufacturer *manufacturer) +{ + return manufacturer ? manufacturer->name : "Unknown"; +} + +extern struct nand_flash_dev nand_flash_ids[]; + +extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; +extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; +extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; +extern const struct nand_manufacturer_ops micron_nand_manuf_ops; +extern const struct nand_manufacturer_ops amd_nand_manuf_ops; +extern const struct nand_manufacturer_ops macronix_nand_manuf_ops; + +int nand_default_bbt(struct mtd_info *mtd); +int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); +int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); +int nand_isbad_bbt(struct mtd_info *mtd, loff_t offs, int allowbbt); +int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, + int allowbbt); +int nand_do_read(struct mtd_info *mtd, loff_t from, size_t len, + size_t *retlen, uint8_t *buf); + +/** + * struct platform_nand_chip - chip level device structure + * @nr_chips: max. number of chips to scan for + * @chip_offset: chip number offset + * @nr_partitions: number of partitions pointed to by partitions (or zero) + * @partitions: mtd partition list + * @chip_delay: R/B delay value in us + * @options: Option flags, e.g. 16bit buswidth + * @bbt_options: BBT option flags, e.g. NAND_BBT_USE_FLASH + * @part_probe_types: NULL-terminated array of probe types + */ +struct platform_nand_chip { + int nr_chips; + int chip_offset; + int nr_partitions; + struct mtd_partition *partitions; + int chip_delay; + unsigned int options; + unsigned int bbt_options; + const char **part_probe_types; +}; + +/* Keep gcc happy */ +struct platform_device; + +/** + * struct platform_nand_ctrl - controller level device structure + * @probe: platform specific function to probe/setup hardware + * @remove: platform specific function to remove/teardown hardware + * @hwcontrol: platform specific hardware control structure + * @dev_ready: platform specific function to read ready/busy pin + * @select_chip: platform specific chip select function + * @cmd_ctrl: platform specific function for controlling + * ALE/CLE/nCE. 
Also used to write command and address + * @write_buf: platform specific function for write buffer + * @read_buf: platform specific function for read buffer + * @read_byte: platform specific function to read one byte from chip + * @priv: private data to transport driver specific settings + * + * All fields are optional and depend on the hardware driver requirements + */ +struct platform_nand_ctrl { + int (*probe)(struct platform_device *pdev); + void (*remove)(struct platform_device *pdev); + void (*hwcontrol)(struct mtd_info *mtd, int cmd); + int (*dev_ready)(struct mtd_info *mtd); + void (*select_chip)(struct mtd_info *mtd, int chip); + void (*cmd_ctrl)(struct mtd_info *mtd, int dat, unsigned int ctrl); + void (*write_buf)(struct mtd_info *mtd, const uint8_t *buf, int len); + void (*read_buf)(struct mtd_info *mtd, uint8_t *buf, int len); + unsigned char (*read_byte)(struct mtd_info *mtd); + void *priv; +}; + +/** + * struct platform_nand_data - container structure for platform-specific data + * @chip: chip level chip structure + * @ctrl: controller level device structure + */ +struct platform_nand_data { + struct platform_nand_chip chip; + struct platform_nand_ctrl ctrl; +}; + +/* return the supported features. */ +static inline int onfi_feature(struct nand_chip *chip) +{ + return chip->onfi_version ? le16_to_cpu(chip->onfi_params.features) : 0; +} + +/* return the supported asynchronous timing mode. */ +static inline int onfi_get_async_timing_mode(struct nand_chip *chip) +{ + if (!chip->onfi_version) + return ONFI_TIMING_MODE_UNKNOWN; + return le16_to_cpu(chip->onfi_params.async_timing_mode); +} + +/* return the supported synchronous timing mode. */ +static inline int onfi_get_sync_timing_mode(struct nand_chip *chip) +{ + if (!chip->onfi_version) + return ONFI_TIMING_MODE_UNKNOWN; + return le16_to_cpu(chip->onfi_params.src_sync_timing_mode); +} + +int onfi_init_data_interface(struct nand_chip *chip, + struct nand_data_interface *iface, + enum nand_data_interface_type type, + int timing_mode); + +/* + * Check if it is an SLC nand. + * !nand_is_slc() can be used to check for MLC/TLC nand chips. + * We do not distinguish between MLC and TLC for now. + */ +static inline bool nand_is_slc(struct nand_chip *chip) +{ + return chip->bits_per_cell == 1; +} + +/** + * Check if the opcode's address should be sent only on the lower 8 bits + * @command: opcode to check + */ +static inline int nand_opcode_8bits(unsigned int command) +{ + switch (command) { + case NAND_CMD_READID: + case NAND_CMD_PARAM: + case NAND_CMD_GET_FEATURES: + case NAND_CMD_SET_FEATURES: + return 1; + default: + break; + } + return 0; +} + +/* return the supported JEDEC features. */ +static inline int jedec_feature(struct nand_chip *chip) +{ + return chip->jedec_version ? le16_to_cpu(chip->jedec_params.features) + : 0; +} + +/* get timing characteristics from ONFI timing mode. */ +const struct nand_sdr_timings *onfi_async_timing_mode_to_sdr_timings(int mode); +/* get data interface from ONFI timing mode 0, used after reset.
*/ +const struct nand_data_interface *nand_get_default_data_interface(void); + +int nand_check_erased_ecc_chunk(void *data, int datalen, + void *ecc, int ecclen, + void *extraoob, int extraooblen, + int threshold); + +int nand_check_ecc_caps(struct nand_chip *chip, + const struct nand_ecc_caps *caps, int oobavail); + +int nand_match_ecc_req(struct nand_chip *chip, + const struct nand_ecc_caps *caps, int oobavail); + +int nand_maximize_ecc(struct nand_chip *chip, + const struct nand_ecc_caps *caps, int oobavail); + +/* Default write_oob implementation */ +int nand_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page); + +/* Default write_oob syndrome implementation */ +int nand_write_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + int page); + +/* Default read_oob implementation */ +int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page); + +/* Default read_oob syndrome implementation */ +int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + int page); + +/* Stub used by drivers that do not support GET/SET FEATURES operations */ +int nand_onfi_get_set_features_notsupp(struct mtd_info *mtd, + struct nand_chip *chip, int addr, + u8 *subfeature_param); + +/* Default read_page_raw implementation */ +int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip, + uint8_t *buf, int oob_required, int page); + +/* Default write_page_raw implementation */ +int nand_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip, + const uint8_t *buf, int oob_required, int page); + +/* Reset and initialize a NAND device */ +int nand_reset(struct nand_chip *chip, int chipnr); + +/* Free resources held by the NAND device */ +void nand_cleanup(struct nand_chip *chip); + +/* Default extended ID decoding function */ +void nand_decode_ext_id(struct nand_chip *chip); +#endif /* __LINUX_MTD_RAWNAND_H */ diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h index 2251add65fa7..c759d403cbc0 100644 --- a/include/linux/mtd/sh_flctl.h +++ b/include/linux/mtd/sh_flctl.h @@ -22,7 +22,7 @@ #include #include -#include <linux/mtd/nand.h> +#include <linux/mtd/rawnand.h> #include #include diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h index 65e91d0fa981..72a79c7d0e08 100644 --- a/include/linux/mtd/sharpsl.h +++ b/include/linux/mtd/sharpsl.h @@ -8,7 +8,7 @@ * published by the Free Software Foundation. */ -#include <linux/mtd/nand.h> +#include <linux/mtd/rawnand.h> #include #include diff --git a/include/linux/platform_data/mtd-davinci.h b/include/linux/platform_data/mtd-davinci.h index 1cf555aef896..f1a2cf655bdb 100644 --- a/include/linux/platform_data/mtd-davinci.h +++ b/include/linux/platform_data/mtd-davinci.h @@ -28,7 +28,7 @@ #ifndef __ARCH_ARM_DAVINCI_NAND_H #define __ARCH_ARM_DAVINCI_NAND_H -#include <linux/mtd/nand.h> +#include <linux/mtd/rawnand.h> #define NANDFCR_OFFSET 0x60 #define NANDFSR_OFFSET 0x64 diff --git a/include/linux/platform_data/mtd-nand-s3c2410.h b/include/linux/platform_data/mtd-nand-s3c2410.h index f01659026b26..f8c553f92655 100644 --- a/include/linux/platform_data/mtd-nand-s3c2410.h +++ b/include/linux/platform_data/mtd-nand-s3c2410.h @@ -12,7 +12,7 @@ #ifndef __MTD_NAND_S3C2410_H #define __MTD_NAND_S3C2410_H -#include <linux/mtd/nand.h> +#include <linux/mtd/rawnand.h> /** * struct s3c2410_nand_set - define a set of one or more nand chips -- cgit v1.2.3 From 0a1c7959acd9674a0e4e59f911f3e5fbf25fd693 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 17 May 2017 17:22:18 +0200 Subject: gpu: drm: tc35876x: move header file out of I2C realm include/linux/i2c is not for client devices. Move the header file to a more appropriate location.
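For board files, only the include path changes; the platform data itself is untouched. As a rough sketch of a consumer (the device name, I2C address and GPIO numbers here are illustrative placeholders, not taken from a real board):

#include <linux/i2c.h>
#include <linux/init.h>
#include <linux/platform_data/tc35876x.h>

/* Hypothetical wiring; real GPIO numbers are board specific. */
static struct tc35876x_platform_data bridge_pdata = {
	.gpio_bridge_reset = 42,
	.gpio_panel_bl_en = 43,
	.gpio_panel_vadd = 44,
};

static struct i2c_board_info bridge_info __initdata = {
	I2C_BOARD_INFO("tc35876x-bridge", 0x2c),
	.platform_data = &bridge_pdata,
};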
Signed-off-by: Wolfram Sang Acked-by: Patrik Jakobsson --- arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c | 2 +- drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c | 2 +- include/linux/i2c/tc35876x.h | 11 ----------- include/linux/platform_data/tc35876x.h | 11 +++++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) delete mode 100644 include/linux/i2c/tc35876x.h create mode 100644 include/linux/platform_data/tc35876x.h (limited to 'include') diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c index b1526b95fd43..2905376559f1 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c @@ -11,7 +11,7 @@ */ #include -#include <linux/i2c/tc35876x.h> +#include <linux/platform_data/tc35876x.h> #include /*tc35876x DSI_LVDS bridge chip and panel platform data*/ diff --git a/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c index 771ff66711af..37c997e24b9e 100644 --- a/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c +++ b/drivers/gpu/drm/gma500/tc35876x-dsi-lvds.c @@ -26,7 +26,7 @@ #include "mdfld_output.h" #include "mdfld_dsi_pkg_sender.h" #include "tc35876x-dsi-lvds.h" -#include <linux/i2c/tc35876x.h> +#include <linux/platform_data/tc35876x.h> #include #include #include diff --git a/include/linux/i2c/tc35876x.h b/include/linux/i2c/tc35876x.h deleted file mode 100644 index cd6a51c71e7e..000000000000 --- a/include/linux/i2c/tc35876x.h +++ /dev/null @@ -1,11 +0,0 @@ - -#ifndef _TC35876X_H -#define _TC35876X_H - -struct tc35876x_platform_data { - int gpio_bridge_reset; - int gpio_panel_bl_en; - int gpio_panel_vadd; -}; - -#endif /* _TC35876X_H */ diff --git a/include/linux/platform_data/tc35876x.h b/include/linux/platform_data/tc35876x.h new file mode 100644 index 000000000000..cd6a51c71e7e --- /dev/null +++ b/include/linux/platform_data/tc35876x.h @@ -0,0 +1,11 @@ + +#ifndef _TC35876X_H +#define _TC35876X_H + +struct tc35876x_platform_data { + int gpio_bridge_reset; + int gpio_panel_bl_en; + int gpio_panel_vadd; +}; + +#endif /* _TC35876X_H */ -- cgit v1.2.3 From 7c4974786f4794178f04e96318fc3b2f2850cbc6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 11 Aug 2017 19:41:17 +0800 Subject: net: export some generic xdp helpers This patch exports some generic XDP helpers to drivers, letting a driver run XDP on a specific skb. This is useful when a packet is hard to process at the page level directly (e.g. a jumbo/GSO frame). With this patch, a driver no longer needs to forbid installing an XDP program when the configuration is unsuitable. Instead, it can defer XDP for packets that are hard to process directly until after the skb is created. Signed-off-by: Jason Wang Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 14 ++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d238d54c484..0f1c4cb2441e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); } +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index 1024d3741d12..40b28e417072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3919,7 +3919,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. */ -static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -3940,13 +3940,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) kfree_skb(skb); } } +EXPORT_SYMBOL_GPL(generic_xdp_tx); static struct static_key generic_xdp_needed __read_mostly; -static int do_xdp_generic(struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { - struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); - if (xdp_prog) { u32 act = netif_receive_generic_xdp(skb, xdp_prog); int err; @@ -3971,6 +3970,7 @@ out_redir: kfree_skb(skb); return XDP_DROP; } +EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { @@ -3981,7 +3981,8 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4502,7 +4503,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(skb); + int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + skb); if (ret != XDP_PASS) { rcu_read_unlock(); -- cgit v1.2.3 From 34fc75bfc616f1c1fbab56508c3f48f4b97c97ea Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Fri, 11 Aug 2017 16:24:15 +0200 Subject: uapi/linux/quota.h: Do not include linux/errno.h linux/errno.h is very sensitive to coordination with libc headers. Nothing in linux/quota.h needs it, so this change allows using this header in more contexts. 
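As a minimal userspace sketch of what this enables (assuming a recent glibc whose <sys/quota.h> pulls in <linux/quota.h>; the device path is a placeholder):

#include <errno.h>      /* libc errno machinery, no longer clashing */
#include <stdio.h>
#include <sys/quota.h>  /* brings in <linux/quota.h> */

int main(void)
{
	/* Q_SYNC takes no id or addr argument; flush quotas for one fs. */
	if (quotactl(QCMD(Q_SYNC, USRQUOTA), "/dev/sda1", 0, NULL) < 0) {
		perror("quotactl");
		return 1;
	}
	return 0;
}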
Signed-off-by: Florian Weimer Signed-off-by: Jan Kara --- include/uapi/linux/quota.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index 4d2489ef6f10..f17c9636a859 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -33,7 +33,6 @@ #ifndef _UAPI_LINUX_QUOTA_ #define _UAPI_LINUX_QUOTA_ -#include #include #define __DQUOT_VERSION__ "dquot_6.6.0" -- cgit v1.2.3 From 2be03aedbb6d78929c9e3be0d1850830303744f7 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Aug 2017 10:23:25 +0200 Subject: i2c: mux: pca954x: move header file out of I2C realm include/linux/i2c is not for client devices. Move the header file to a more appropriate location. Signed-off-by: Wolfram Sang Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-pca9541.c | 2 +- drivers/i2c/muxes/i2c-mux-pca954x.c | 2 +- include/linux/i2c/pca954x.h | 48 ----------------------------------- include/linux/platform_data/pca954x.h | 48 +++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 50 deletions(-) delete mode 100644 include/linux/i2c/pca954x.h create mode 100644 include/linux/platform_data/pca954x.h (limited to 'include') diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index be961f2855c7..6a39adaf433f 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -20,9 +20,9 @@ #include #include #include -#include #include #include +#include #include /* diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index f1751c290af6..7b992db38021 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -39,13 +39,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include #include diff --git a/include/linux/i2c/pca954x.h b/include/linux/i2c/pca954x.h deleted file mode 100644 index 1712677d5904..000000000000 --- a/include/linux/i2c/pca954x.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * - * pca954x.h - I2C multiplexer/switch support - * - * Copyright (c) 2008-2009 Rodolfo Giometti - * Copyright (c) 2008-2009 Eurotech S.p.A. - * Michael Lawnick - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#ifndef _LINUX_I2C_PCA954X_H -#define _LINUX_I2C_PCA954X_H - -/* Platform data for the PCA954x I2C multiplexers */ - -/* Per channel initialisation data: - * @adap_id: bus number for the adapter. 0 = don't care - * @deselect_on_exit: set this entry to 1, if your H/W needs deselection - * of this channel after transaction. 
- * - */ -struct pca954x_platform_mode { - int adap_id; - unsigned int deselect_on_exit:1; - unsigned int class; -}; - -/* Per mux/switch data, used with i2c_register_board_info */ -struct pca954x_platform_data { - struct pca954x_platform_mode *modes; - int num_modes; -}; - -#endif /* _LINUX_I2C_PCA954X_H */ diff --git a/include/linux/platform_data/pca954x.h b/include/linux/platform_data/pca954x.h new file mode 100644 index 000000000000..1712677d5904 --- /dev/null +++ b/include/linux/platform_data/pca954x.h @@ -0,0 +1,48 @@ +/* + * + * pca954x.h - I2C multiplexer/switch support + * + * Copyright (c) 2008-2009 Rodolfo Giometti + * Copyright (c) 2008-2009 Eurotech S.p.A. + * Michael Lawnick + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#ifndef _LINUX_I2C_PCA954X_H +#define _LINUX_I2C_PCA954X_H + +/* Platform data for the PCA954x I2C multiplexers */ + +/* Per channel initialisation data: + * @adap_id: bus number for the adapter. 0 = don't care + * @deselect_on_exit: set this entry to 1, if your H/W needs deselection + * of this channel after transaction. + * + */ +struct pca954x_platform_mode { + int adap_id; + unsigned int deselect_on_exit:1; + unsigned int class; +}; + +/* Per mux/switch data, used with i2c_register_board_info */ +struct pca954x_platform_data { + struct pca954x_platform_mode *modes; + int num_modes; +}; + +#endif /* _LINUX_I2C_PCA954X_H */ -- cgit v1.2.3 From 26b1083b4545ec08a7dcfe759323a2e142d60d75 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Aug 2017 10:23:26 +0200 Subject: i2c: mux: mlxcpld: move header file out of I2C realm include/linux/i2c is not for client devices. Move the header file to a more appropriate location. Signed-off-by: Wolfram Sang Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-mlxcpld.c | 2 +- include/linux/i2c/mlxcpld.h | 52 ------------------------------- include/linux/platform_data/x86/mlxcpld.h | 52 +++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 53 deletions(-) delete mode 100644 include/linux/i2c/mlxcpld.h create mode 100644 include/linux/platform_data/x86/mlxcpld.h (limited to 'include') diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c index e53f2abd1350..12ad8d65faf6 100644 --- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c +++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c @@ -38,9 +38,9 @@ #include #include #include +#include #include #include -#include #define CPLD_MUX_MAX_NCHANS 8 diff --git a/include/linux/i2c/mlxcpld.h b/include/linux/i2c/mlxcpld.h deleted file mode 100644 index b08dcb183fca..000000000000 --- a/include/linux/i2c/mlxcpld.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * mlxcpld.h - Mellanox I2C multiplexer support in CPLD - * - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. 
- * Copyright (c) 2016 Michael Shych - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _LINUX_I2C_MLXCPLD_H -#define _LINUX_I2C_MLXCPLD_H - -/* Platform data for the CPLD I2C multiplexers */ - -/* mlxcpld_mux_plat_data - per mux data, used with i2c_register_board_info - * @adap_ids - adapter array - * @num_adaps - number of adapters - * @sel_reg_addr - mux select register offset in CPLD space - */ -struct mlxcpld_mux_plat_data { - int *adap_ids; - int num_adaps; - int sel_reg_addr; -}; - -#endif /* _LINUX_I2C_MLXCPLD_H */ diff --git a/include/linux/platform_data/x86/mlxcpld.h b/include/linux/platform_data/x86/mlxcpld.h new file mode 100644 index 000000000000..b08dcb183fca --- /dev/null +++ b/include/linux/platform_data/x86/mlxcpld.h @@ -0,0 +1,52 @@ +/* + * mlxcpld.h - Mellanox I2C multiplexer support in CPLD + * + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Michael Shych + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_I2C_MLXCPLD_H +#define _LINUX_I2C_MLXCPLD_H + +/* Platform data for the CPLD I2C multiplexers */ + +/* mlxcpld_mux_plat_data - per mux data, used with i2c_register_board_info + * @adap_ids - adapter array + * @num_adaps - number of adapters + * @sel_reg_addr - mux select register offset in CPLD space + */ +struct mlxcpld_mux_plat_data { + int *adap_ids; + int num_adaps; + int sel_reg_addr; +}; + +#endif /* _LINUX_I2C_MLXCPLD_H */ -- cgit v1.2.3 From c4aee3e1b0de8e732fafd0d13e75ab7bdbc606ef Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Sun, 21 May 2017 18:48:07 +0200 Subject: i2c: mux: pinctrl: remove platform_data No platform (at least no upstreamed platform) has ever used this platform_data. Just drop it and simplify the code. Reviewed-by: Stephen Warren Acked-by: Wolfram Sang Signed-off-by: Peter Rosin --- drivers/i2c/muxes/Kconfig | 1 + drivers/i2c/muxes/i2c-mux-pinctrl.c | 219 ++++++++++++------------------------ include/linux/i2c-mux-pinctrl.h | 41 ------- 3 files changed, 72 insertions(+), 189 deletions(-) delete mode 100644 include/linux/i2c-mux-pinctrl.h (limited to 'include') diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 2c64d0e0740f..1bba95ecc7c0 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -76,6 +76,7 @@ config I2C_MUX_PCA954x config I2C_MUX_PINCTRL tristate "pinctrl-based I2C multiplexer" depends on PINCTRL + depends on OF || COMPILE_TEST help If you say yes to this option, support will be included for an I2C multiplexer that uses the pinctrl subsystem, i.e. pin multiplexing. 
diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c index 7c0c264b07bc..aa4a3bf9507f 100644 --- a/drivers/i2c/muxes/i2c-mux-pinctrl.c +++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c @@ -20,14 +20,12 @@ #include #include #include -#include #include #include #include #include "../../pinctrl/core.h" struct i2c_mux_pinctrl { - struct i2c_mux_pinctrl_platform_data *pdata; struct pinctrl *pinctrl; struct pinctrl_state **states; struct pinctrl_state *state_idle; @@ -47,80 +45,6 @@ static int i2c_mux_pinctrl_deselect(struct i2c_mux_core *muxc, u32 chan) return pinctrl_select_state(mux->pinctrl, mux->state_idle); } -#ifdef CONFIG_OF -static int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, - struct platform_device *pdev) -{ - struct device_node *np = pdev->dev.of_node; - int num_names, i, ret; - struct device_node *adapter_np; - struct i2c_adapter *adapter; - - if (!np) - return 0; - - mux->pdata = devm_kzalloc(&pdev->dev, sizeof(*mux->pdata), GFP_KERNEL); - if (!mux->pdata) - return -ENOMEM; - - num_names = of_property_count_strings(np, "pinctrl-names"); - if (num_names < 0) { - dev_err(&pdev->dev, "Cannot parse pinctrl-names: %d\n", - num_names); - return num_names; - } - - mux->pdata->pinctrl_states = devm_kzalloc(&pdev->dev, - sizeof(*mux->pdata->pinctrl_states) * num_names, - GFP_KERNEL); - if (!mux->pdata->pinctrl_states) - return -ENOMEM; - - for (i = 0; i < num_names; i++) { - ret = of_property_read_string_index(np, "pinctrl-names", i, - &mux->pdata->pinctrl_states[mux->pdata->bus_count]); - if (ret < 0) { - dev_err(&pdev->dev, "Cannot parse pinctrl-names: %d\n", - ret); - return ret; - } - if (!strcmp(mux->pdata->pinctrl_states[mux->pdata->bus_count], - "idle")) { - if (i != num_names - 1) { - dev_err(&pdev->dev, - "idle state must be last\n"); - return -EINVAL; - } - mux->pdata->pinctrl_state_idle = "idle"; - } else { - mux->pdata->bus_count++; - } - } - - adapter_np = of_parse_phandle(np, "i2c-parent", 0); - if (!adapter_np) { - dev_err(&pdev->dev, "Cannot parse i2c-parent\n"); - return -ENODEV; - } - adapter = of_find_i2c_adapter_by_node(adapter_np); - of_node_put(adapter_np); - if (!adapter) { - dev_err(&pdev->dev, "Cannot find parent bus\n"); - return -EPROBE_DEFER; - } - mux->pdata->parent_bus_num = i2c_adapter_id(adapter); - put_device(&adapter->dev); - - return 0; -} -#else -static inline int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, - struct platform_device *pdev) -{ - return 0; -} -#endif - static struct i2c_adapter *i2c_mux_pinctrl_root_adapter( struct pinctrl_state *state) { @@ -141,110 +65,109 @@ static struct i2c_adapter *i2c_mux_pinctrl_root_adapter( return root; } +static struct i2c_adapter *i2c_mux_pinctrl_parent_adapter(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct device_node *parent_np; + struct i2c_adapter *parent; + + parent_np = of_parse_phandle(np, "i2c-parent", 0); + if (!parent_np) { + dev_err(dev, "Cannot parse i2c-parent\n"); + return ERR_PTR(-ENODEV); + } + parent = of_find_i2c_adapter_by_node(parent_np); + of_node_put(parent_np); + if (!parent) + return ERR_PTR(-EPROBE_DEFER); + + return parent; +} + static int i2c_mux_pinctrl_probe(struct platform_device *pdev) { + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; struct i2c_mux_core *muxc; struct i2c_mux_pinctrl *mux; + struct i2c_adapter *parent; struct i2c_adapter *root; - int i, ret; - - mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); - if (!mux) { - ret = -ENOMEM; - goto err; - } + int num_names, 
i, ret; + const char *name; - mux->pdata = dev_get_platdata(&pdev->dev); - if (!mux->pdata) { - ret = i2c_mux_pinctrl_parse_dt(mux, pdev); - if (ret < 0) - goto err; - } - if (!mux->pdata) { - dev_err(&pdev->dev, "Missing platform data\n"); - ret = -ENODEV; - goto err; + num_names = of_property_count_strings(np, "pinctrl-names"); + if (num_names < 0) { + dev_err(dev, "Cannot parse pinctrl-names: %d\n", + num_names); + return num_names; } - mux->states = devm_kzalloc(&pdev->dev, - sizeof(*mux->states) * mux->pdata->bus_count, - GFP_KERNEL); - if (!mux->states) { - dev_err(&pdev->dev, "Cannot allocate states\n"); - ret = -ENOMEM; - goto err; - } + parent = i2c_mux_pinctrl_parent_adapter(dev); + if (IS_ERR(parent)) + return PTR_ERR(parent); - muxc = i2c_mux_alloc(NULL, &pdev->dev, mux->pdata->bus_count, 0, 0, - i2c_mux_pinctrl_select, NULL); + muxc = i2c_mux_alloc(parent, dev, num_names, + sizeof(*mux) + num_names * sizeof(*mux->states), + 0, i2c_mux_pinctrl_select, NULL); if (!muxc) { ret = -ENOMEM; - goto err; + goto err_put_parent; } - muxc->priv = mux; + mux = i2c_mux_priv(muxc); + mux->states = (struct pinctrl_state **)(mux + 1); platform_set_drvdata(pdev, muxc); - mux->pinctrl = devm_pinctrl_get(&pdev->dev); + mux->pinctrl = devm_pinctrl_get(dev); if (IS_ERR(mux->pinctrl)) { ret = PTR_ERR(mux->pinctrl); - dev_err(&pdev->dev, "Cannot get pinctrl: %d\n", ret); - goto err; + dev_err(dev, "Cannot get pinctrl: %d\n", ret); + goto err_put_parent; } - for (i = 0; i < mux->pdata->bus_count; i++) { - mux->states[i] = pinctrl_lookup_state(mux->pinctrl, - mux->pdata->pinctrl_states[i]); + + for (i = 0; i < num_names; i++) { + ret = of_property_read_string_index(np, "pinctrl-names", i, + &name); + if (ret < 0) { + dev_err(dev, "Cannot parse pinctrl-names: %d\n", ret); + goto err_put_parent; + } + + mux->states[i] = pinctrl_lookup_state(mux->pinctrl, name); if (IS_ERR(mux->states[i])) { ret = PTR_ERR(mux->states[i]); - dev_err(&pdev->dev, - "Cannot look up pinctrl state %s: %d\n", - mux->pdata->pinctrl_states[i], ret); - goto err; - } - } - if (mux->pdata->pinctrl_state_idle) { - mux->state_idle = pinctrl_lookup_state(mux->pinctrl, - mux->pdata->pinctrl_state_idle); - if (IS_ERR(mux->state_idle)) { - ret = PTR_ERR(mux->state_idle); - dev_err(&pdev->dev, - "Cannot look up pinctrl state %s: %d\n", - mux->pdata->pinctrl_state_idle, ret); - goto err; + dev_err(dev, "Cannot look up pinctrl state %s: %d\n", + name, ret); + goto err_put_parent; } - muxc->deselect = i2c_mux_pinctrl_deselect; - } + if (strcmp(name, "idle")) + continue; - muxc->parent = i2c_get_adapter(mux->pdata->parent_bus_num); - if (!muxc->parent) { - dev_err(&pdev->dev, "Parent adapter (%d) not found\n", - mux->pdata->parent_bus_num); - ret = -EPROBE_DEFER; - goto err; + if (i != num_names - 1) { + dev_err(dev, "idle state must be last\n"); + ret = -EINVAL; + goto err_put_parent; + } + mux->state_idle = mux->states[i]; + muxc->deselect = i2c_mux_pinctrl_deselect; } root = i2c_root_adapter(&muxc->parent->dev); muxc->mux_locked = true; - for (i = 0; i < mux->pdata->bus_count; i++) { + for (i = 0; i < num_names; i++) { if (root != i2c_mux_pinctrl_root_adapter(mux->states[i])) { muxc->mux_locked = false; break; } } - if (muxc->mux_locked && mux->pdata->pinctrl_state_idle && - root != i2c_mux_pinctrl_root_adapter(mux->state_idle)) - muxc->mux_locked = false; - if (muxc->mux_locked) - dev_info(&pdev->dev, "mux-locked i2c mux\n"); + dev_info(dev, "mux-locked i2c mux\n"); - for (i = 0; i < mux->pdata->bus_count; i++) { - u32 bus = 
mux->pdata->base_bus_num ? - (mux->pdata->base_bus_num + i) : 0; - - ret = i2c_mux_add_adapter(muxc, bus, i, 0); + /* Do not add any adapter for the idle state (if it's there at all). */ + for (i = 0; i < num_names - !!mux->state_idle; i++) { + ret = i2c_mux_add_adapter(muxc, 0, i, 0); if (ret) goto err_del_adapter; } @@ -253,8 +176,9 @@ static int i2c_mux_pinctrl_probe(struct platform_device *pdev) err_del_adapter: i2c_mux_del_adapters(muxc); +err_put_parent: i2c_put_adapter(muxc->parent); -err: + return ret; } @@ -264,16 +188,15 @@ static int i2c_mux_pinctrl_remove(struct platform_device *pdev) i2c_mux_del_adapters(muxc); i2c_put_adapter(muxc->parent); + return 0; } -#ifdef CONFIG_OF static const struct of_device_id i2c_mux_pinctrl_of_match[] = { { .compatible = "i2c-mux-pinctrl", }, {}, }; MODULE_DEVICE_TABLE(of, i2c_mux_pinctrl_of_match); -#endif static struct platform_driver i2c_mux_pinctrl_driver = { .driver = { diff --git a/include/linux/i2c-mux-pinctrl.h b/include/linux/i2c-mux-pinctrl.h deleted file mode 100644 index a65c86429e84..000000000000 --- a/include/linux/i2c-mux-pinctrl.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * i2c-mux-pinctrl platform data - * - * Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef _LINUX_I2C_MUX_PINCTRL_H -#define _LINUX_I2C_MUX_PINCTRL_H - -/** - * struct i2c_mux_pinctrl_platform_data - Platform data for i2c-mux-pinctrl - * @parent_bus_num: Parent I2C bus number - * @base_bus_num: Base I2C bus number for the child busses. 0 for dynamic. - * @bus_count: Number of child busses. Also the number of elements in - * @pinctrl_states - * @pinctrl_states: The names of the pinctrl state to select for each child bus - * @pinctrl_state_idle: The pinctrl state to select when no child bus is being - * accessed. If NULL, the most recently used pinctrl state will be left - * selected. - */ -struct i2c_mux_pinctrl_platform_data { - int parent_bus_num; - int base_bus_num; - int bus_count; - const char **pinctrl_states; - const char *pinctrl_state_idle; -}; - -#endif -- cgit v1.2.3 From 3f713b7c223ebe5094973ce6e0272bd97363b552 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 4 Aug 2017 11:22:31 +0900 Subject: pinctrl: move const qualifier before struct Update subsystem wide for consistency. 
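For context, this change is purely stylistic: in C, the const qualifier may be written on either side of the struct keyword without changing the type. A minimal illustrative sketch (these declarations are editorial, not part of the series):

struct pinctrl_map;

/*
 * Both parameters below have the identical type "pointer to const
 * struct pinctrl_map"; the series merely standardizes on writing the
 * qualifier first.
 */
void show_map_new(const struct pinctrl_map *map);	/* preferred */
void show_map_old(struct pinctrl_map const *map);	/* equivalent */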
Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- drivers/pinctrl/berlin/berlin.c | 4 ++-- drivers/pinctrl/core.c | 14 +++++++------- drivers/pinctrl/core.h | 6 +++--- drivers/pinctrl/pinconf.c | 12 ++++++------ drivers/pinctrl/pinconf.h | 24 ++++++++++++------------ drivers/pinctrl/pinmux.c | 14 +++++++------- drivers/pinctrl/pinmux.h | 29 ++++++++++++++--------------- include/linux/pinctrl/machine.h | 4 ++-- 8 files changed, 53 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/pinctrl/berlin/berlin.c b/drivers/pinctrl/berlin/berlin.c index 8f0dc02f7624..cc3bd2efafe3 100644 --- a/drivers/pinctrl/berlin/berlin.c +++ b/drivers/pinctrl/berlin/berlin.c @@ -206,8 +206,8 @@ static int berlin_pinctrl_add_function(struct berlin_pinctrl *pctrl, static int berlin_pinctrl_build_state(struct platform_device *pdev) { struct berlin_pinctrl *pctrl = platform_get_drvdata(pdev); - struct berlin_desc_group const *desc_group; - struct berlin_desc_function const *desc_function; + const struct berlin_desc_group *desc_group; + const struct berlin_desc_function *desc_function; int i, max_functions = 0; pctrl->nfunctions = 0; diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index c5e2c5705058..89b9ca77daea 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -264,7 +264,7 @@ static int pinctrl_register_one_pin(struct pinctrl_dev *pctldev, } static int pinctrl_register_pins(struct pinctrl_dev *pctldev, - struct pinctrl_pin_desc const *pins, + const struct pinctrl_pin_desc *pins, unsigned num_descs) { unsigned i; @@ -907,7 +907,7 @@ static struct pinctrl_state *create_state(struct pinctrl *p, } static int add_setting(struct pinctrl *p, struct pinctrl_dev *pctldev, - struct pinctrl_map const *map) + const struct pinctrl_map *map) { struct pinctrl_state *state; struct pinctrl_setting *setting; @@ -995,7 +995,7 @@ static struct pinctrl *create_pinctrl(struct device *dev, const char *devname; struct pinctrl_maps *maps_node; int i; - struct pinctrl_map const *map; + const struct pinctrl_map *map; int ret; /* @@ -1321,7 +1321,7 @@ void devm_pinctrl_put(struct pinctrl *p) } EXPORT_SYMBOL_GPL(devm_pinctrl_put); -int pinctrl_register_map(struct pinctrl_map const *maps, unsigned num_maps, +int pinctrl_register_map(const struct pinctrl_map *maps, unsigned num_maps, bool dup) { int i, ret; @@ -1402,13 +1402,13 @@ int pinctrl_register_map(struct pinctrl_map const *maps, unsigned num_maps, * function will perform a shallow copy for the mapping entries. 
* @num_maps: the number of maps in the mapping table */ -int pinctrl_register_mappings(struct pinctrl_map const *maps, +int pinctrl_register_mappings(const struct pinctrl_map *maps, unsigned num_maps) { return pinctrl_register_map(maps, num_maps, true); } -void pinctrl_unregister_map(struct pinctrl_map const *map) +void pinctrl_unregister_map(const struct pinctrl_map *map) { struct pinctrl_maps *maps_node; @@ -1702,7 +1702,7 @@ static int pinctrl_maps_show(struct seq_file *s, void *what) { struct pinctrl_maps *maps_node; int i; - struct pinctrl_map const *map; + const struct pinctrl_map *map; seq_puts(s, "Pinctrl maps:\n"); diff --git a/drivers/pinctrl/core.h b/drivers/pinctrl/core.h index 1c35de59a658..7880c3adc450 100644 --- a/drivers/pinctrl/core.h +++ b/drivers/pinctrl/core.h @@ -179,7 +179,7 @@ struct pin_desc { */ struct pinctrl_maps { struct list_head node; - struct pinctrl_map const *maps; + const struct pinctrl_map *maps; unsigned num_maps; }; @@ -243,9 +243,9 @@ extern struct pinctrl_gpio_range * pinctrl_find_gpio_range_from_pin_nolock(struct pinctrl_dev *pctldev, unsigned int pin); -int pinctrl_register_map(struct pinctrl_map const *maps, unsigned num_maps, +int pinctrl_register_map(const struct pinctrl_map *maps, unsigned num_maps, bool dup); -void pinctrl_unregister_map(struct pinctrl_map const *map); +void pinctrl_unregister_map(const struct pinctrl_map *map); extern int pinctrl_force_sleep(struct pinctrl_dev *pctldev); extern int pinctrl_force_default(struct pinctrl_dev *pctldev); diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c index 7fc417e4ae96..92f363793f35 100644 --- a/drivers/pinctrl/pinconf.c +++ b/drivers/pinctrl/pinconf.c @@ -37,7 +37,7 @@ int pinconf_check_ops(struct pinctrl_dev *pctldev) return 0; } -int pinconf_validate_map(struct pinctrl_map const *map, int i) +int pinconf_validate_map(const struct pinctrl_map *map, int i) { if (!map->data.configs.group_or_pin) { pr_err("failed to register map %s (%d): no group/pin given\n", @@ -106,7 +106,7 @@ unlock: return ret; } -int pinconf_map_to_setting(struct pinctrl_map const *map, +int pinconf_map_to_setting(const struct pinctrl_map *map, struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; @@ -143,11 +143,11 @@ int pinconf_map_to_setting(struct pinctrl_map const *map, return 0; } -void pinconf_free_setting(struct pinctrl_setting const *setting) +void pinconf_free_setting(const struct pinctrl_setting *setting) { } -int pinconf_apply_setting(struct pinctrl_setting const *setting) +int pinconf_apply_setting(const struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; const struct pinconf_ops *ops = pctldev->desc->confops; @@ -235,7 +235,7 @@ static void pinconf_show_config(struct seq_file *s, struct pinctrl_dev *pctldev, } } -void pinconf_show_map(struct seq_file *s, struct pinctrl_map const *map) +void pinconf_show_map(struct seq_file *s, const struct pinctrl_map *map) { struct pinctrl_dev *pctldev; @@ -259,7 +259,7 @@ void pinconf_show_map(struct seq_file *s, struct pinctrl_map const *map) } void pinconf_show_setting(struct seq_file *s, - struct pinctrl_setting const *setting) + const struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; const struct pinctrl_ops *pctlops = pctldev->desc->pctlops; diff --git a/drivers/pinctrl/pinconf.h b/drivers/pinctrl/pinconf.h index bf8aff9abf32..6c722505f893 100644 --- a/drivers/pinctrl/pinconf.h +++ b/drivers/pinctrl/pinconf.h @@ -14,11 +14,11 @@ #ifdef CONFIG_PINCONF int 
pinconf_check_ops(struct pinctrl_dev *pctldev); -int pinconf_validate_map(struct pinctrl_map const *map, int i); -int pinconf_map_to_setting(struct pinctrl_map const *map, +int pinconf_validate_map(const struct pinctrl_map *map, int i); +int pinconf_map_to_setting(const struct pinctrl_map *map, struct pinctrl_setting *setting); -void pinconf_free_setting(struct pinctrl_setting const *setting); -int pinconf_apply_setting(struct pinctrl_setting const *setting); +void pinconf_free_setting(const struct pinctrl_setting *setting); +int pinconf_apply_setting(const struct pinctrl_setting *setting); int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin, unsigned long *configs, size_t nconfigs); @@ -39,22 +39,22 @@ static inline int pinconf_check_ops(struct pinctrl_dev *pctldev) return 0; } -static inline int pinconf_validate_map(struct pinctrl_map const *map, int i) +static inline int pinconf_validate_map(const struct pinctrl_map *map, int i) { return 0; } -static inline int pinconf_map_to_setting(struct pinctrl_map const *map, +static inline int pinconf_map_to_setting(const struct pinctrl_map *map, struct pinctrl_setting *setting) { return 0; } -static inline void pinconf_free_setting(struct pinctrl_setting const *setting) +static inline void pinconf_free_setting(const struct pinctrl_setting *setting) { } -static inline int pinconf_apply_setting(struct pinctrl_setting const *setting) +static inline int pinconf_apply_setting(const struct pinctrl_setting *setting) { return 0; } @@ -69,21 +69,21 @@ static inline int pinconf_set_config(struct pinctrl_dev *pctldev, unsigned pin, #if defined(CONFIG_PINCONF) && defined(CONFIG_DEBUG_FS) -void pinconf_show_map(struct seq_file *s, struct pinctrl_map const *map); +void pinconf_show_map(struct seq_file *s, const struct pinctrl_map *map); void pinconf_show_setting(struct seq_file *s, - struct pinctrl_setting const *setting); + const struct pinctrl_setting *setting); void pinconf_init_device_debugfs(struct dentry *devroot, struct pinctrl_dev *pctldev); #else static inline void pinconf_show_map(struct seq_file *s, - struct pinctrl_map const *map) + const struct pinctrl_map *map) { } static inline void pinconf_show_setting(struct seq_file *s, - struct pinctrl_setting const *setting) + const struct pinctrl_setting *setting) { } diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 16b3ae5e4f44..36d5da9dc587 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -61,7 +61,7 @@ int pinmux_check_ops(struct pinctrl_dev *pctldev) return 0; } -int pinmux_validate_map(struct pinctrl_map const *map, int i) +int pinmux_validate_map(const struct pinctrl_map *map, int i) { if (!map->data.mux.function) { pr_err("failed to register map %s (%d): no function given\n", @@ -312,7 +312,7 @@ static int pinmux_func_name_to_selector(struct pinctrl_dev *pctldev, return -EINVAL; } -int pinmux_map_to_setting(struct pinctrl_map const *map, +int pinmux_map_to_setting(const struct pinctrl_map *map, struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; @@ -372,12 +372,12 @@ int pinmux_map_to_setting(struct pinctrl_map const *map, return 0; } -void pinmux_free_setting(struct pinctrl_setting const *setting) +void pinmux_free_setting(const struct pinctrl_setting *setting) { /* This function is currently unused */ } -int pinmux_enable_setting(struct pinctrl_setting const *setting) +int pinmux_enable_setting(const struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; const struct pinctrl_ops 
*pctlops = pctldev->desc->pctlops; @@ -458,7 +458,7 @@ err_pin_request: return ret; } -void pinmux_disable_setting(struct pinctrl_setting const *setting) +void pinmux_disable_setting(const struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; const struct pinctrl_ops *pctlops = pctldev->desc->pctlops; @@ -627,7 +627,7 @@ static int pinmux_pins_show(struct seq_file *s, void *what) return 0; } -void pinmux_show_map(struct seq_file *s, struct pinctrl_map const *map) +void pinmux_show_map(struct seq_file *s, const struct pinctrl_map *map) { seq_printf(s, "group %s\nfunction %s\n", map->data.mux.group ? map->data.mux.group : "(default)", @@ -635,7 +635,7 @@ void pinmux_show_map(struct seq_file *s, struct pinctrl_map const *map) } void pinmux_show_setting(struct seq_file *s, - struct pinctrl_setting const *setting) + const struct pinctrl_setting *setting) { struct pinctrl_dev *pctldev = setting->pctldev; const struct pinmux_ops *pmxops = pctldev->desc->pmxops; diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index 248d8ea30e26..a331fcdbedd9 100644 --- a/drivers/pinctrl/pinmux.h +++ b/drivers/pinctrl/pinmux.h @@ -14,7 +14,7 @@ int pinmux_check_ops(struct pinctrl_dev *pctldev); -int pinmux_validate_map(struct pinctrl_map const *map, int i); +int pinmux_validate_map(const struct pinctrl_map *map, int i); int pinmux_request_gpio(struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range, @@ -25,11 +25,11 @@ int pinmux_gpio_direction(struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range, unsigned pin, bool input); -int pinmux_map_to_setting(struct pinctrl_map const *map, +int pinmux_map_to_setting(const struct pinctrl_map *map, struct pinctrl_setting *setting); -void pinmux_free_setting(struct pinctrl_setting const *setting); -int pinmux_enable_setting(struct pinctrl_setting const *setting); -void pinmux_disable_setting(struct pinctrl_setting const *setting); +void pinmux_free_setting(const struct pinctrl_setting *setting); +int pinmux_enable_setting(const struct pinctrl_setting *setting); +void pinmux_disable_setting(const struct pinctrl_setting *setting); #else @@ -38,7 +38,7 @@ static inline int pinmux_check_ops(struct pinctrl_dev *pctldev) return 0; } -static inline int pinmux_validate_map(struct pinctrl_map const *map, int i) +static inline int pinmux_validate_map(const struct pinctrl_map *map, int i) { return 0; } @@ -63,23 +63,22 @@ static inline int pinmux_gpio_direction(struct pinctrl_dev *pctldev, return 0; } -static inline int pinmux_map_to_setting(struct pinctrl_map const *map, +static inline int pinmux_map_to_setting(const struct pinctrl_map *map, struct pinctrl_setting *setting) { return 0; } -static inline void pinmux_free_setting(struct pinctrl_setting const *setting) +static inline void pinmux_free_setting(const struct pinctrl_setting *setting) { } -static inline int pinmux_enable_setting(struct pinctrl_setting const *setting) +static inline int pinmux_enable_setting(const struct pinctrl_setting *setting) { return 0; } -static inline void pinmux_disable_setting( - struct pinctrl_setting const *setting) +static inline void pinmux_disable_setting(const struct pinctrl_setting *setting) { } @@ -87,21 +86,21 @@ static inline void pinmux_disable_setting( #if defined(CONFIG_PINMUX) && defined(CONFIG_DEBUG_FS) -void pinmux_show_map(struct seq_file *s, struct pinctrl_map const *map); +void pinmux_show_map(struct seq_file *s, const struct pinctrl_map *map); void pinmux_show_setting(struct seq_file *s, - struct pinctrl_setting const 
*setting); + const struct pinctrl_setting *setting); void pinmux_init_device_debugfs(struct dentry *devroot, struct pinctrl_dev *pctldev); #else static inline void pinmux_show_map(struct seq_file *s, - struct pinctrl_map const *map) + const struct pinctrl_map *map) { } static inline void pinmux_show_setting(struct seq_file *s, - struct pinctrl_setting const *setting) + const struct pinctrl_setting *setting) { } diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h index e5b1716f98cc..7fa5d87190c2 100644 --- a/include/linux/pinctrl/machine.h +++ b/include/linux/pinctrl/machine.h @@ -152,12 +152,12 @@ struct pinctrl_map { #ifdef CONFIG_PINCTRL -extern int pinctrl_register_mappings(struct pinctrl_map const *map, +extern int pinctrl_register_mappings(const struct pinctrl_map *map, unsigned num_maps); extern void pinctrl_provide_dummies(void); #else -static inline int pinctrl_register_mappings(struct pinctrl_map const *map, +static inline int pinctrl_register_mappings(const struct pinctrl_map *map, unsigned num_maps) { return 0; -- cgit v1.2.3 From 950d55f5bf7991222e34428b5f779bd030b6a42a Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 24 Jul 2017 16:57:22 +0200 Subject: gpio: Cleanup kerneldoc Some kerneldoc has become stale or wasn't quite correct from the outset. Fix up the most serious issues to silence warnings when building the documentation. Signed-off-by: Thierry Reding Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 82 +++++++++++++++++++++++++++++++++------------ include/linux/gpio/driver.h | 3 +- 2 files changed, 61 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 528539d31274..18bba1748a35 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -84,7 +84,12 @@ static inline void desc_set_label(struct gpio_desc *d, const char *label) } /** - * Convert a GPIO number to its descriptor + * gpio_to_desc - Convert a GPIO number to its descriptor + * @gpio: global GPIO number + * + * Returns: + * The GPIO descriptor associated with the given GPIO, or %NULL if no GPIO + * with the given number exists in the system. */ struct gpio_desc *gpio_to_desc(unsigned gpio) { @@ -111,7 +116,14 @@ struct gpio_desc *gpio_to_desc(unsigned gpio) EXPORT_SYMBOL_GPL(gpio_to_desc); /** - * Get the GPIO descriptor corresponding to the given hw number for this chip. + * gpiochip_get_desc - get the GPIO descriptor corresponding to the given + * hardware number for this chip + * @chip: GPIO chip + * @hwnum: hardware number of the GPIO for this chip + * + * Returns: + * A pointer to the GPIO descriptor or %ERR_PTR(-EINVAL) if no GPIO exists + * in the given chip for the specified hardware number. */ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *chip, u16 hwnum) @@ -125,9 +137,14 @@ struct gpio_desc *gpiochip_get_desc(struct gpio_chip *chip, } /** - * Convert a GPIO descriptor to the integer namespace. + * desc_to_gpio - convert a GPIO descriptor to the integer namespace + * @desc: GPIO descriptor + * * This should disappear in the future but is needed since we still - * use GPIO numbers for error messages and sysfs nodes + * use GPIO numbers for error messages and sysfs nodes. + * + * Returns: + * The global GPIO number for the GPIO specified by its descriptor. 
*/ int desc_to_gpio(const struct gpio_desc *desc) { @@ -254,7 +271,7 @@ static int gpiodev_add_to_list(struct gpio_device *gdev) return -EBUSY; } -/** +/* * Convert a GPIO name to its descriptor */ static struct gpio_desc *gpio_name_to_desc(const char * const name) @@ -878,7 +895,7 @@ out_free_le: return ret; } -/** +/* * gpio_ioctl() - ioctl handler for the GPIO chardev */ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1077,11 +1094,9 @@ static void gpiochip_setup_devs(void) /** * gpiochip_add_data() - register a gpio_chip * @chip: the chip to register, with chip->base initialized - * Context: potentially before irqs will work + * @data: driver-private data associated with this chip * - * Returns a negative errno if the chip can't be registered, such as - * because the chip->base is invalid or already associated with a - * different chip. Otherwise it returns zero as a success code. + * Context: potentially before irqs will work * * When gpiochip_add_data() is called very early during boot, so that GPIOs * can be freely used, the chip->parent device must be registered before @@ -1093,6 +1108,11 @@ static void gpiochip_setup_devs(void) * * If chip->base is negative, this requests dynamic assignment of * a range of valid GPIOs. + * + * Returns: + * A negative errno if the chip can't be registered, such as because the + * chip->base is invalid or already associated with a different chip. + * Otherwise it returns zero as a success code. */ int gpiochip_add_data(struct gpio_chip *chip, void *data) { @@ -1298,6 +1318,10 @@ EXPORT_SYMBOL_GPL(gpiochip_add_data); /** * gpiochip_get_data() - get per-subdriver data for the chip + * @chip: GPIO chip + * + * Returns: + * The per-subdriver data for the chip. */ void *gpiochip_get_data(struct gpio_chip *chip) { @@ -1381,13 +1405,16 @@ static int devm_gpio_chip_match(struct device *dev, void *res, void *data) * devm_gpiochip_add_data() - Resource manager piochip_add_data() * @dev: the device pointer on which irq_chip belongs to. * @chip: the chip to register, with chip->base initialized - * Context: potentially before irqs will work + * @data: driver-private data associated with this chip * - * Returns a negative errno if the chip can't be registered, such as - * because the chip->base is invalid or already associated with a - * different chip. Otherwise it returns zero as a success code. + * Context: potentially before irqs will work * * The gpio chip automatically be released when the device is unbound. + * + * Returns: + * A negative errno if the chip can't be registered, such as because the + * chip->base is invalid or already associated with a different chip. + * Otherwise it returns zero as a success code. */ int devm_gpiochip_add_data(struct device *dev, struct gpio_chip *chip, void *data) @@ -1433,7 +1460,7 @@ EXPORT_SYMBOL_GPL(devm_gpiochip_remove); /** * gpiochip_find() - iterator for locating a specific gpio_chip * @data: data to pass to match function - * @callback: Callback function to check gpio_chip + * @match: Callback function to check gpio_chip * * Similar to bus_find_device. It returns a reference to a gpio_chip as * determined by a user supplied @match callback. 
The callback should return @@ -1924,11 +1951,14 @@ EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping * @chip: the gpiochip to add the range for - * @pinctrl_name: the dev_name() of the pin controller to map to + * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range + * + * Returns: + * 0 on success, or a negative error-code on failure. */ int gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, @@ -2173,7 +2203,8 @@ EXPORT_SYMBOL_GPL(gpiochip_is_requested); /** * gpiochip_request_own_desc - Allow GPIO chip to request its own descriptor - * @desc: GPIO descriptor to request + * @chip: GPIO chip + * @hwnum: hardware number of the GPIO for which to request the descriptor * @label: label for the GPIO * * Function allows GPIO chip drivers to request and use their own GPIO @@ -2181,6 +2212,10 @@ EXPORT_SYMBOL_GPL(gpiochip_is_requested); * function will not increase reference count of the GPIO chip module. This * allows the GPIO chip module to be unloaded as needed (we assume that the * GPIO chip driver handles freeing the GPIOs it has requested). + * + * Returns: + * A pointer to the GPIO descriptor, or an ERR_PTR()-encoded negative error + * code on failure. */ struct gpio_desc *gpiochip_request_own_desc(struct gpio_chip *chip, u16 hwnum, const char *label) @@ -2362,12 +2397,13 @@ int gpiod_direction_output(struct gpio_desc *desc, int value) EXPORT_SYMBOL_GPL(gpiod_direction_output); /** - * gpiod_set_debounce - sets @debounce time for a @gpio - * @gpio: the gpio to set debounce time - * @debounce: debounce time is microseconds + * gpiod_set_debounce - sets @debounce time for a GPIO + * @desc: descriptor of the GPIO for which to set debounce time + * @debounce: debounce time in microseconds * - * returns -ENOTSUPP if the controller does not support setting - * debounce. + * Returns: + * 0 on success, %-ENOTSUPP if the controller doesn't support setting the + * debounce time. */ int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) { @@ -3316,6 +3352,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_index); * @propname: name of the firmware property representing the GPIO * @index: index of the GPIO to obtain in the consumer * @dflags: GPIO initialization flags + * @label: label to attach to the requested GPIO * * This function can be used for drivers that get their configuration * from firmware. @@ -3324,6 +3361,7 @@ EXPORT_SYMBOL_GPL(gpiod_get_index); * underlying firmware interface and then makes sure that the GPIO * descriptor is requested before it is returned to the caller. * + * Returns: * On successful request the GPIO pin is configured in accordance with * provided @dflags. 
* diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index af20369ec8e7..ad4150d075c3 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -327,11 +327,10 @@ int gpiochip_generic_config(struct gpio_chip *chip, unsigned offset, /** * struct gpio_pin_range - pin range controlled by a gpio chip - * @head: list for maintaining set of pin ranges, used internally + * @node: list for maintaining set of pin ranges, used internally * @pctldev: pinctrl device which handles corresponding pins * @range: actual range of pins controlled by a gpio controller */ - struct gpio_pin_range { struct list_head node; struct pinctrl_dev *pctldev; -- cgit v1.2.3 From 67049c505017d7b380e16aa4f4b02344c2be0d55 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 24 Jul 2017 16:57:23 +0200 Subject: gpio: of: Improve kerneldoc Add descriptions for missing fields and fix up some parameter references to match the code. Signed-off-by: Thierry Reding Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 7 +++---- include/linux/gpio/driver.h | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 8cde02ed5fd5..bfcd20699ec8 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -273,14 +273,13 @@ static int of_gpiochip_scan_gpios(struct gpio_chip *chip) } /** - * of_gpio_simple_xlate - translate gpio_spec to the GPIO number and flags + * of_gpio_simple_xlate - translate gpiospec to the GPIO number and flags * @gc: pointer to the gpio_chip structure - * @np: device node of the GPIO chip - * @gpio_spec: gpio specifier as found in the device tree + * @gpiospec: GPIO specifier as found in the device tree * @flags: a flags pointer to fill in * * This is simple translation function, suitable for the most 1:1 mapped - * gpio chips. This function performs only one sanity check: whether gpio + * GPIO chips. This function performs only one sanity check: whether GPIO * is less than ngpios (that is specified in the gpio_chip). */ int of_gpio_simple_xlate(struct gpio_chip *gc, diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index ad4150d075c3..fe66c9306caf 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -180,8 +180,27 @@ struct gpio_chip { * If CONFIG_OF is enabled, then all GPIO controllers described in the * device tree automatically may have an OF translation */ + + /** + * @of_node: + * + * Pointer to a device tree node representing this GPIO controller. + */ struct device_node *of_node; + + /** + * @of_gpio_n_cells: + * + * Number of cells used to form the GPIO specifier. + */ int of_gpio_n_cells; + + /** + * @of_xlate: + * + * Callback to translate a device tree GPIO specifier into a chip- + * relative GPIO number and flags. + */ int (*of_xlate)(struct gpio_chip *gc, const struct of_phandle_args *gpiospec, u32 *flags); #endif -- cgit v1.2.3 From e3b445d77819b8fb41e3ceae9dbd49c8c8427c5d Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 24 Jul 2017 16:57:28 +0200 Subject: gpio: Use unsigned int for of_gpio_n_cells The cell count for GPIO specifiers can never be negative, so make the field unsigned. 
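To make the newly documented @of_gpio_n_cells and @of_xlate fields concrete, here is a minimal sketch of a custom translation callback for a two-cell specifier (the foo_ naming is illustrative, not from this series):

/*
 * Sketch: translate a two-cell <hwnum flags> specifier into a
 * chip-relative GPIO number, much like of_gpio_simple_xlate() does.
 */
static int foo_gpio_of_xlate(struct gpio_chip *gc,
			     const struct of_phandle_args *gpiospec,
			     u32 *flags)
{
	if (gc->of_gpio_n_cells != 2 || gpiospec->args_count != 2)
		return -EINVAL;

	if (gpiospec->args[0] >= gc->ngpio)
		return -EINVAL;

	if (flags)
		*flags = gpiospec->args[1];

	return gpiospec->args[0];
}

A driver would then set gc->of_gpio_n_cells = 2 and gc->of_xlate = foo_gpio_of_xlate before registering the chip.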
Signed-off-by: Thierry Reding Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index fe66c9306caf..c97f8325e8bf 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -193,7 +193,7 @@ struct gpio_chip { * * Number of cells used to form the GPIO specifier. */ - int of_gpio_n_cells; + unsigned int of_gpio_n_cells; /** * @of_xlate: -- cgit v1.2.3 From 4f04ff03dc48b81b436abf4f8600f9553825e39f Mon Sep 17 00:00:00 2001 From: Chen Zhong Date: Mon, 7 Aug 2017 15:24:36 +0800 Subject: soc: mediatek: add header files required for MT7622 SCPSYS dt-binding Add the relevant header file required for the dt-bindings of SCPSYS power domain control for all subsystems found on the MT7622 SoC. Signed-off-by: Chen Zhong Signed-off-by: Sean Wang Reviewed-by: Ulf Hansson Signed-off-by: Matthias Brugger --- include/dt-bindings/power/mt7622-power.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/dt-bindings/power/mt7622-power.h (limited to 'include') diff --git a/include/dt-bindings/power/mt7622-power.h b/include/dt-bindings/power/mt7622-power.h new file mode 100644 index 000000000000..1b639269790c --- /dev/null +++ b/include/dt-bindings/power/mt7622-power.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2017 MediaTek Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See http://www.gnu.org/licenses/gpl-2.0.html for more details. + */ + +#ifndef _DT_BINDINGS_POWER_MT7622_POWER_H +#define _DT_BINDINGS_POWER_MT7622_POWER_H + +#define MT7622_POWER_DOMAIN_ETHSYS 0 +#define MT7622_POWER_DOMAIN_HIF0 1 +#define MT7622_POWER_DOMAIN_HIF1 2 +#define MT7622_POWER_DOMAIN_WB 3 + +#endif /* _DT_BINDINGS_POWER_MT7622_POWER_H */ -- cgit v1.2.3 From 52510ee934885ec846faaa5d029329c9ba0e6ecc Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Mon, 7 Aug 2017 15:24:37 +0800 Subject: soc: mediatek: add SCPSYS power domain driver for MediaTek MT7622 SoC Add an SCPSYS power domain driver for the MT7622 SoC, which has four power domains: ETHSYS for Ethernet including the embedded switch, WBSYS for Wi-Fi and Bluetooth, HIF0SYS for PCIe and SATA, and HIF1SYS for USB. These subsystems can be selectively power-gated when the corresponding function is no longer in use, in order to minimize power dissipation.
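As a rough sketch of how the pieces fit together (illustrative only, not code from this series; it assumes the infracfg regmap has already been looked up), each domain's bus_prot_mask is asserted through the existing infracfg helpers before the domain is powered down and cleared again on power-up:

#include <linux/regmap.h>
#include <linux/soc/mediatek/infracfg.h>

/*
 * Sketch: quiesce AXI traffic of the Ethernet domain (dt-binding index
 * MT7622_POWER_DOMAIN_ETHSYS) around a power transition. The scpsys
 * driver below does the equivalent internally, driven by each domain's
 * bus_prot_mask.
 */
static int mt7622_ethsys_bus_protect(struct regmap *infracfg, bool power_off)
{
	if (power_off)
		return mtk_infracfg_set_bus_protection(infracfg,
				MT7622_TOP_AXI_PROT_EN_ETHSYS);

	return mtk_infracfg_clear_bus_protection(infracfg,
			MT7622_TOP_AXI_PROT_EN_ETHSYS);
}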
Signed-off-by: Chen Zhong Signed-off-by: Sean Wang Reviewed-by: Ulf Hansson Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mtk-scpsys.c | 71 +++++++++++++++++++++++++++++++++++ include/linux/soc/mediatek/infracfg.h | 7 ++++ 2 files changed, 78 insertions(+) (limited to 'include') diff --git a/drivers/soc/mediatek/mtk-scpsys.c b/drivers/soc/mediatek/mtk-scpsys.c index 6268b28ed44d..e1ce8b1b5090 100644 --- a/drivers/soc/mediatek/mtk-scpsys.c +++ b/drivers/soc/mediatek/mtk-scpsys.c @@ -22,6 +22,7 @@ #include #include +#include #include #define SPM_VDE_PWR_CON 0x0210 @@ -39,6 +40,11 @@ #define SPM_MFG_2D_PWR_CON 0x02c0 #define SPM_MFG_ASYNC_PWR_CON 0x02c4 #define SPM_USB_PWR_CON 0x02cc +#define SPM_ETHSYS_PWR_CON 0x02e0 /* MT7622 */ +#define SPM_HIF0_PWR_CON 0x02e4 /* MT7622 */ +#define SPM_HIF1_PWR_CON 0x02e8 /* MT7622 */ +#define SPM_WB_PWR_CON 0x02ec /* MT7622 */ + #define SPM_PWR_STATUS 0x060c #define SPM_PWR_STATUS_2ND 0x0610 @@ -64,6 +70,10 @@ #define PWR_STATUS_MFG_ASYNC BIT(23) #define PWR_STATUS_AUDIO BIT(24) #define PWR_STATUS_USB BIT(25) +#define PWR_STATUS_ETHSYS BIT(24) /* MT7622 */ +#define PWR_STATUS_HIF0 BIT(25) /* MT7622 */ +#define PWR_STATUS_HIF1 BIT(26) /* MT7622 */ +#define PWR_STATUS_WB BIT(27) /* MT7622 */ enum clk_id { CLK_NONE, @@ -73,6 +83,7 @@ enum clk_id { CLK_VENC_LT, CLK_ETHIF, CLK_VDEC, + CLK_HIFSEL, CLK_MAX, }; @@ -84,6 +95,7 @@ static const char * const clk_names[] = { "venc_lt", "ethif", "vdec", + "hif_sel", NULL, }; @@ -652,6 +664,53 @@ static const struct scp_subdomain scp_subdomain_mt6797[] = { {MT6797_POWER_DOMAIN_MM, MT6797_POWER_DOMAIN_MJC}, }; +/* + * MT7622 power domain support + */ + +static const struct scp_domain_data scp_domain_data_mt7622[] = { + [MT7622_POWER_DOMAIN_ETHSYS] = { + .name = "ethsys", + .sta_mask = PWR_STATUS_ETHSYS, + .ctl_offs = SPM_ETHSYS_PWR_CON, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + .clk_id = {CLK_NONE}, + .bus_prot_mask = MT7622_TOP_AXI_PROT_EN_ETHSYS, + .active_wakeup = true, + }, + [MT7622_POWER_DOMAIN_HIF0] = { + .name = "hif0", + .sta_mask = PWR_STATUS_HIF0, + .ctl_offs = SPM_HIF0_PWR_CON, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + .clk_id = {CLK_HIFSEL}, + .bus_prot_mask = MT7622_TOP_AXI_PROT_EN_HIF0, + .active_wakeup = true, + }, + [MT7622_POWER_DOMAIN_HIF1] = { + .name = "hif1", + .sta_mask = PWR_STATUS_HIF1, + .ctl_offs = SPM_HIF1_PWR_CON, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + .clk_id = {CLK_HIFSEL}, + .bus_prot_mask = MT7622_TOP_AXI_PROT_EN_HIF1, + .active_wakeup = true, + }, + [MT7622_POWER_DOMAIN_WB] = { + .name = "wb", + .sta_mask = PWR_STATUS_WB, + .ctl_offs = SPM_WB_PWR_CON, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + .clk_id = {CLK_NONE}, + .bus_prot_mask = MT7622_TOP_AXI_PROT_EN_WB, + .active_wakeup = true, + }, +}; + /* * MT8173 power domain support */ @@ -771,6 +830,15 @@ static const struct scp_soc_data mt6797_data = { } }; +static const struct scp_soc_data mt7622_data = { + .domains = scp_domain_data_mt7622, + .num_domains = ARRAY_SIZE(scp_domain_data_mt7622), + .regs = { + .pwr_sta_offs = SPM_PWR_STATUS, + .pwr_sta2nd_offs = SPM_PWR_STATUS_2ND + } +}; + static const struct scp_soc_data mt8173_data = { .domains = scp_domain_data_mt8173, .num_domains = ARRAY_SIZE(scp_domain_data_mt8173), @@ -793,6 +861,9 @@ static const struct of_device_id of_scpsys_match_tbl[] = { }, { .compatible = "mediatek,mt6797-scpsys", .data = &mt6797_data, + }, { + .compatible = "mediatek,mt7622-scpsys", + 
.data = &mt7622_data, }, { .compatible = "mediatek,mt8173-scpsys", .data = &mt8173_data, diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index a5714e93fb34..a0182ec2a621 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -20,6 +20,13 @@ #define MT8173_TOP_AXI_PROT_EN_MFG_M1 BIT(22) #define MT8173_TOP_AXI_PROT_EN_MFG_SNOOP_OUT BIT(23) +#define MT7622_TOP_AXI_PROT_EN_ETHSYS (BIT(3) | BIT(17)) +#define MT7622_TOP_AXI_PROT_EN_HIF0 (BIT(24) | BIT(25)) +#define MT7622_TOP_AXI_PROT_EN_HIF1 (BIT(26) | BIT(27) | \ + BIT(28)) +#define MT7622_TOP_AXI_PROT_EN_WB (BIT(2) | BIT(6) | \ + BIT(7) | BIT(8)) + int mtk_infracfg_set_bus_protection(struct regmap *infracfg, u32 mask); int mtk_infracfg_clear_bus_protection(struct regmap *infracfg, u32 mask); -- cgit v1.2.3 From 561099a1a2e992a482a8318c0c9c5af26222e5cd Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 8 Aug 2017 15:42:39 +0200 Subject: leds: pca955x: add GPIO support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCA955x family of chips are I2C LED blinkers whose pins not used to control LEDs can be used as general purpose I/Os (GPIOs). The following adds such support by defining different operation modes for the pins (see the bindings documentation for more details). The pca955x driver is then extended with a gpio_chip when some of the pins are operating as GPIOs. The default operating mode is to behave as an LED. The GPIO support is conditional on CONFIG_LEDS_PCA955X_GPIO. Signed-off-by: Cédric Le Goater Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- drivers/leds/Kconfig | 11 +++ drivers/leds/leds-pca955x.c | 137 ++++++++++++++++++++++++++++---- include/dt-bindings/leds/leds-pca955x.h | 16 ++++ 3 files changed, 147 insertions(+), 17 deletions(-) create mode 100644 include/dt-bindings/leds/leds-pca955x.h (limited to 'include') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 594b24d410c3..3013cd35c65e 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -377,6 +377,17 @@ config LEDS_PCA955X LED driver chips accessed via the I2C bus. Supported devices include PCA9550, PCA9551, PCA9552, and PCA9553. +config LEDS_PCA955X_GPIO + bool "Enable GPIO support for PCA955X" + depends on LEDS_PCA955X + depends on GPIOLIB + help + Allow unused pins on PCA955X to be used as gpio. + + To use a pin as gpio the pin type should be set to + PCA955X_TYPE_GPIO in the device tree. + + config LEDS_PCA963X tristate "LED support for PCA963x I2C chip" depends on LEDS_CLASS diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c index 5a5d3765cfd3..f062d1e7640f 100644 --- a/drivers/leds/leds-pca955x.c +++ b/drivers/leds/leds-pca955x.c @@ -53,6 +53,8 @@ #include #include +#include + /* LED select registers determine the source that drives LED outputs */ #define PCA955X_LS_LED_ON 0x0 /* Output LOW */ #define PCA955X_LS_LED_OFF 0x1 /* Output HI-Z */ @@ -118,6 +120,9 @@ struct pca955x { struct pca955x_led *leds; struct pca955x_chipdef *chipdef; struct i2c_client *client; +#ifdef CONFIG_LEDS_PCA955X_GPIO + struct gpio_chip gpio; +#endif }; struct pca955x_led { @@ -125,6 +130,7 @@ struct pca955x_led { struct led_classdev led_cdev; int led_num; /* 0 ..
15 potentially */ char name[32]; + u32 type; const char *default_trigger; }; @@ -259,6 +265,65 @@ static int pca955x_led_set(struct led_classdev *led_cdev, return 0; } +#ifdef CONFIG_LEDS_PCA955X_GPIO +/* + * Read the INPUT register, which contains the state of LEDs. + */ +static u8 pca955x_read_input(struct i2c_client *client, int n) +{ + return (u8)i2c_smbus_read_byte_data(client, n); +} + +static int pca955x_gpio_request_pin(struct gpio_chip *gc, unsigned int offset) +{ + struct pca955x *pca955x = gpiochip_get_data(gc); + struct pca955x_led *led = &pca955x->leds[offset]; + + if (led->type == PCA955X_TYPE_GPIO) + return 0; + + return -EBUSY; +} + +static void pca955x_gpio_set_value(struct gpio_chip *gc, unsigned int offset, + int val) +{ + struct pca955x *pca955x = gpiochip_get_data(gc); + struct pca955x_led *led = &pca955x->leds[offset]; + + if (val) + pca955x_led_set(&led->led_cdev, LED_FULL); + else + pca955x_led_set(&led->led_cdev, LED_OFF); +} + +static int pca955x_gpio_get_value(struct gpio_chip *gc, unsigned int offset) +{ + struct pca955x *pca955x = gpiochip_get_data(gc); + struct pca955x_led *led = &pca955x->leds[offset]; + u8 reg = pca955x_read_input(pca955x->client, led->led_num / 8); + + return !!(reg & (1 << (led->led_num % 8))); +} + +static int pca955x_gpio_direction_input(struct gpio_chip *gc, + unsigned int offset) +{ + /* To use as input ensure pin is not driven */ + pca955x_gpio_set_value(gc, offset, 0); + + return 0; +} + +static int pca955x_gpio_direction_output(struct gpio_chip *gc, + unsigned int offset, int val) +{ + pca955x_gpio_set_value(gc, offset, val); + + return 0; +} +#endif /* CONFIG_LEDS_PCA955X_GPIO */ + #if IS_ENABLED(CONFIG_OF) static struct pca955x_platform_data * pca955x_pdata_of_init(struct i2c_client *client, struct pca955x_chipdef *chip) @@ -297,6 +362,8 @@ pca955x_pdata_of_init(struct i2c_client *client, struct pca955x_chipdef *chip) snprintf(pdata->leds[reg].name, sizeof(pdata->leds[reg].name), "%s", name); + pdata->leds[reg].type = PCA955X_TYPE_LED; + of_property_read_u32(child, "type", &pdata->leds[reg].type); of_property_read_string(child, "linux,default-trigger", &pdata->leds[reg].default_trigger); } @@ -332,6 +399,7 @@ static int pca955x_probe(struct i2c_client *client, struct i2c_adapter *adapter; int i, err; struct pca955x_platform_data *pdata; + int ngpios = 0; if (id) { chip = &pca955x_chipdefs[id->driver_data]; @@ -392,9 +460,19 @@ static int pca955x_probe(struct i2c_client *client, pca955x_led = &pca955x->leds[i]; pca955x_led->led_num = i; pca955x_led->pca955x = pca955x; - - /* Platform data can specify LED names and default triggers */ - if (pdata) { + pca955x_led->type = pdata->leds[i].type; + + switch (pca955x_led->type) { + case PCA955X_TYPE_NONE: + break; + case PCA955X_TYPE_GPIO: + ngpios++; + break; + case PCA955X_TYPE_LED: + /* + * Platform data can specify LED names and + * default triggers + */ if (pdata->leds[i].name) snprintf(pca955x_led->name, sizeof(pca955x_led->name), "pca955x:%s", @@ -402,23 +480,20 @@ static int pca955x_probe(struct i2c_client *client, if (pdata->leds[i].default_trigger) pca955x_led->led_cdev.default_trigger = pdata->leds[i].default_trigger; - } else { - snprintf(pca955x_led->name, sizeof(pca955x_led->name), - "pca955x:%d", i); - } - pca955x_led->led_cdev.name = pca955x_led->name; - pca955x_led->led_cdev.brightness_set_blocking = pca955x_led_set; + pca955x_led->led_cdev.name = pca955x_led->name; + pca955x_led->led_cdev.brightness_set_blocking = + pca955x_led_set; - err = 
devm_led_classdev_register(&client->dev, - &pca955x_led->led_cdev); - if (err) - return err; - } + err = devm_led_classdev_register(&client->dev, + &pca955x_led->led_cdev); + if (err) + return err; - /* Turn off LEDs */ - for (i = 0; i < pca95xx_num_led_regs(chip->bits); i++) - pca955x_write_ls(client, i, 0x55); + /* Turn off LED */ + pca955x_led_set(&pca955x_led->led_cdev, LED_OFF); + } + } /* PWM0 is used for half brightness or 50% duty cycle */ pca955x_write_pwm(client, 0, 255-LED_HALF); @@ -430,6 +505,34 @@ static int pca955x_probe(struct i2c_client *client, pca955x_write_psc(client, 0, 0); pca955x_write_psc(client, 1, 0); +#ifdef CONFIG_LEDS_PCA955X_GPIO + if (ngpios) { + pca955x->gpio.label = "gpio-pca955x"; + pca955x->gpio.direction_input = pca955x_gpio_direction_input; + pca955x->gpio.direction_output = pca955x_gpio_direction_output; + pca955x->gpio.set = pca955x_gpio_set_value; + pca955x->gpio.get = pca955x_gpio_get_value; + pca955x->gpio.request = pca955x_gpio_request_pin; + pca955x->gpio.can_sleep = 1; + pca955x->gpio.base = -1; + pca955x->gpio.ngpio = ngpios; + pca955x->gpio.parent = &client->dev; + pca955x->gpio.owner = THIS_MODULE; + + err = devm_gpiochip_add_data(&client->dev, &pca955x->gpio, + pca955x); + if (err) { + /* Use data->gpio.dev as a flag for freeing gpiochip */ + pca955x->gpio.parent = NULL; + dev_warn(&client->dev, "could not add gpiochip\n"); + return err; + } + dev_info(&client->dev, "gpios %i...%i\n", + pca955x->gpio.base, pca955x->gpio.base + + pca955x->gpio.ngpio - 1); + } +#endif + return 0; } diff --git a/include/dt-bindings/leds/leds-pca955x.h b/include/dt-bindings/leds/leds-pca955x.h new file mode 100644 index 000000000000..78cb7e979de7 --- /dev/null +++ b/include/dt-bindings/leds/leds-pca955x.h @@ -0,0 +1,16 @@ +/* + * This header provides constants for pca955x LED bindings. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _DT_BINDINGS_LEDS_PCA955X_H +#define _DT_BINDINGS_LEDS_PCA955X_H + +#define PCA955X_TYPE_NONE 0 +#define PCA955X_TYPE_LED 1 +#define PCA955X_TYPE_GPIO 2 + +#endif /* _DT_BINDINGS_LEDS_PCA955X_H */ -- cgit v1.2.3 From d612b1fd8010d0d67b5287fe146b8b55bcbb8655 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:53 +0000 Subject: seccomp: Operation for checking if an action is available Userspace code that needs to check if the kernel supports a given action may not be able to use the /proc/sys/kernel/seccomp/actions_avail sysctl. The process may be running in a sandbox and, therefore, sufficient filesystem access may not be available. This patch adds an operation to the seccomp(2) syscall that allows userspace code to ask the kernel if a given action is available. If the action is supported by the kernel, 0 is returned. If the action is not supported by the kernel, -1 is returned with errno set to EOPNOTSUPP. If this check is attempted on a kernel that doesn't support this new operation, -1 is returned with errno set to EINVAL, meaning that userspace code can differentiate between the two error cases.
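A minimal userspace probe for the new operation might look like the sketch below (illustrative, not part of the patch; it assumes uapi headers from a kernel carrying this change and calls syscall(2) directly, since libc ships no seccomp() wrapper):

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/seccomp.h>

int main(void)
{
	__u32 action = SECCOMP_RET_TRAP;

	/* Returns 0 when the kernel recognizes the queried action. */
	if (syscall(__NR_seccomp, SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0)
		printf("SECCOMP_RET_TRAP is available\n");
	else if (errno == EOPNOTSUPP)
		printf("action not supported by this kernel\n");
	else if (errno == EINVAL)
		printf("kernel predates SECCOMP_GET_ACTION_AVAIL\n");

	return 0;
}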
Signed-off-by: Tyler Hicks Suggested-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 5 ++-- kernel/seccomp.c | 26 +++++++++++++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 36 +++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0f238a43ff1e..aaad61cc46bc 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -11,8 +11,9 @@ #define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */ /* Valid operations for seccomp syscall. */ -#define SECCOMP_SET_MODE_STRICT 0 -#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_SET_MODE_STRICT 0 +#define SECCOMP_SET_MODE_FILTER 1 +#define SECCOMP_GET_ACTION_AVAIL 2 /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f19f41e4e50..7a6089f66fed 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -808,6 +808,27 @@ static inline long seccomp_set_mode_filter(unsigned int flags, } #endif +static long seccomp_get_action_avail(const char __user *uaction) +{ + u32 action; + + if (copy_from_user(&action, uaction, sizeof(action))) + return -EFAULT; + + switch (action) { + case SECCOMP_RET_KILL: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + case SECCOMP_RET_ALLOW: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + /* Common entry point for both prctl and syscall. */ static long do_seccomp(unsigned int op, unsigned int flags, const char __user *uargs) @@ -819,6 +840,11 @@ static long do_seccomp(unsigned int op, unsigned int flags, return seccomp_set_mode_strict(); case SECCOMP_SET_MODE_FILTER: return seccomp_set_mode_filter(flags, uargs); + case SECCOMP_GET_ACTION_AVAIL: + if (flags != 0) + return -EINVAL; + + return seccomp_get_action_avail(uargs); default: return -EINVAL; } diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 2fb49d99588d..1f2888f6678b 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1731,6 +1731,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) #define SECCOMP_SET_MODE_FILTER 1 #endif +#ifndef SECCOMP_GET_ACTION_AVAIL +#define SECCOMP_GET_ACTION_AVAIL 2 +#endif + #ifndef SECCOMP_FILTER_FLAG_TSYNC #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif @@ -2469,6 +2473,38 @@ TEST(syscall_restart) _metadata->passed = 0; } +TEST(get_action_avail) +{ + __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, + SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, + SECCOMP_RET_ALLOW }; + __u32 unknown_action = 0x10000000U; + int i; + long ret; + + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + ASSERT_NE(EINVAL, errno) { + TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!"); + } + EXPECT_EQ(ret, 0); + + for (i = 0; i < ARRAY_SIZE(actions); i++) { + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]); + EXPECT_EQ(ret, 0) { + TH_LOG("Expected action (0x%X) not available!", + actions[i]); + } + } + + /* Check that an unknown action is handled properly (EOPNOTSUPP) */ + ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action); + EXPECT_EQ(ret, -1); + EXPECT_EQ(errno, EOPNOTSUPP); +} + /* * TODO: * - add microbenchmarks -- cgit v1.2.3 From 0ddec0fc8900201c0897b87b762b7c420436662f Mon Sep 17 00:00:00 2001 
From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:54 +0000 Subject: seccomp: Sysctl to configure actions that are allowed to be logged Administrators can write to this sysctl to set the seccomp actions that are allowed to be logged. Any actions not found in this sysctl will not be logged. For example, all SECCOMP_RET_KILL, SECCOMP_RET_TRAP, and SECCOMP_RET_ERRNO actions would be loggable if "kill trap errno" were written to the sysctl. SECCOMP_RET_TRACE actions would not be logged since their string representation ("trace") wasn't present in the sysctl value. The path to the sysctl is: /proc/sys/kernel/seccomp/actions_logged The actions_avail sysctl can be read to discover the valid action names that can be written to the actions_logged sysctl with the exception of "allow". SECCOMP_RET_ALLOW actions cannot be configured for logging. The default setting for the sysctl is to allow all actions to be logged except SECCOMP_RET_ALLOW. While only SECCOMP_RET_KILL actions are currently logged, an upcoming patch will allow applications to request additional actions to be logged. There's one important exception to this sysctl. If a task is specifically being audited, meaning that an audit context has been allocated for the task, seccomp will log all actions other than SECCOMP_RET_ALLOW despite the value of actions_logged. This exception preserves the existing auditing behavior of tasks with an allocated audit context. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if audit_enabled && task-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 18 +++ include/linux/audit.h | 6 +- kernel/seccomp.c | 171 ++++++++++++++++++++++++- 3 files changed, 187 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 35fc7cbf1d95..2d1d8ab04ac5 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -187,6 +187,24 @@ directory. Here's a description of each file in that directory: program was built, differs from the set of actions actually supported in the current running kernel. +``actions_logged``: + A read-write ordered list of seccomp return values (refer to the + ``SECCOMP_RET_*`` macros above) that are allowed to be logged. Writes + to the file do not need to be in ordered form but reads from the file + will be ordered in the same way as the actions_avail sysctl. + + It is important to note that the value of ``actions_logged`` does not + prevent certain actions from being logged when the audit subsystem is + configured to audit a task. If the action is not found in + ``actions_logged`` list, the final decision on whether to audit the + action for that task is ultimately left up to the audit subsystem to + decide for all seccomp return values other than ``SECCOMP_RET_ALLOW``. + + The ``allow`` string is not accepted in the ``actions_logged`` sysctl + as it is not possible to log ``SECCOMP_RET_ALLOW`` actions. Attempting + to write ``allow`` to the sysctl will result in an EINVAL being + returned.
+ Adding architecture support =========================== diff --git a/include/linux/audit.h b/include/linux/audit.h index 2150bdccfbab..8c30f06d639d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -314,11 +314,7 @@ void audit_core_dumps(long signr); static inline void audit_seccomp(unsigned long syscall, long signr, int code) { - if (!audit_enabled) - return; - - /* Force a record to be reported if a signal was delivered. */ - if (signr || unlikely(!audit_dummy_context())) + if (audit_enabled && unlikely(!audit_dummy_context())) __audit_seccomp(syscall, signr, code); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7a6089f66fed..54357e361aea 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -522,6 +522,45 @@ static void seccomp_send_sigsys(int syscall, int reason) } #endif /* CONFIG_SECCOMP_FILTER */ +/* For use with seccomp_actions_logged */ +#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_TRAP (1 << 2) +#define SECCOMP_LOG_ERRNO (1 << 3) +#define SECCOMP_LOG_TRACE (1 << 4) +#define SECCOMP_LOG_ALLOW (1 << 5) + +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + +static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +{ + bool log = false; + + switch (action) { + case SECCOMP_RET_ALLOW: + case SECCOMP_RET_TRAP: + case SECCOMP_RET_ERRNO: + case SECCOMP_RET_TRACE: + break; + case SECCOMP_RET_KILL: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL; + } + + /* + * Force an audit message to be emitted when the action is RET_KILL and + * the action is allowed to be logged by the admin. + */ + if (log) + return __audit_seccomp(syscall, signr, action); + + /* + * Let the audit subsystem decide if the action should be audited based + * on whether the current task itself is being audited. + */ + return audit_seccomp(syscall, signr, action); +} + /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit @@ -547,7 +586,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); do_exit(SIGKILL); } @@ -656,7 +695,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - audit_seccomp(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action); /* Dump core only if this is the last remaining thread. 
*/ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -673,7 +712,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - audit_seccomp(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action); return -1; } #else @@ -978,6 +1017,127 @@ static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRACE_NAME " " SECCOMP_RET_ALLOW_NAME; +struct seccomp_log_name { + u32 log; + const char *name; +}; + +static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, + { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, + { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, + { } +}; + +static bool seccomp_names_from_actions_logged(char *names, size_t size, + u32 actions_logged) +{ + const struct seccomp_log_name *cur; + bool append_space = false; + + for (cur = seccomp_log_names; cur->name && size; cur++) { + ssize_t ret; + + if (!(actions_logged & cur->log)) + continue; + + if (append_space) { + ret = strscpy(names, " ", size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } else + append_space = true; + + ret = strscpy(names, cur->name, size); + if (ret < 0) + return false; + + names += ret; + size -= ret; + } + + return true; +} + +static bool seccomp_action_logged_from_name(u32 *action_logged, + const char *name) +{ + const struct seccomp_log_name *cur; + + for (cur = seccomp_log_names; cur->name; cur++) { + if (!strcmp(cur->name, name)) { + *action_logged = cur->log; + return true; + } + } + + return false; +} + +static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) +{ + char *name; + + *actions_logged = 0; + while ((name = strsep(&names, " ")) && *name) { + u32 action_logged = 0; + + if (!seccomp_action_logged_from_name(&action_logged, name)) + return false; + + *actions_logged |= action_logged; + } + + return true; +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + int ret; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + memset(names, 0, sizeof(names)); + + if (!write) { + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged)) + return -EINVAL; + } + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + ret = proc_dostring(&table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) { + u32 actions_logged; + + if (!seccomp_actions_logged_from_names(&actions_logged, + table.data)) + return -EINVAL; + + if (actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; + + seccomp_actions_logged = actions_logged; + } + + return 0; +} + static struct ctl_path seccomp_sysctl_path[] = { { .procname = "kernel", }, { .procname = "seccomp", }, @@ -992,6 +1152,11 @@ static struct ctl_table seccomp_sysctl_table[] = { .mode = 0444, .proc_handler = proc_dostring, }, + { + .procname = "actions_logged", + .mode = 0644, + .proc_handler = seccomp_actions_logged_handler, + }, { } }; -- cgit v1.2.3 From e66a39977985b1e69e17c4042cb290768eca9b02 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:56 +0000 Subject: seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW Add a new filter flag, SECCOMP_FILTER_FLAG_LOG, that enables logging for all actions except for SECCOMP_RET_ALLOW for the 
given filter. SECCOMP_RET_KILL actions are always logged when "kill" is in the actions_logged sysctl, and SECCOMP_RET_ALLOW actions are never logged, regardless of this flag. This flag can be used to create noisy filters that result in all non-allowed actions being logged. A process may have one noisy filter, which is loaded with this flag, as well as a quiet filter that's not loaded with this flag. This allows the actions in a set of filters to be selectively conveyed to the admin. Since a system could have a large number of allocated seccomp_filter structs, struct packing was taken into consideration. On 64 bit x86, the new log member takes up one byte of an existing four byte hole in the struct. On 32 bit x86, the new log member creates a new four byte hole (unavoidable) and consumes one of those bytes. Unfortunately, the tests added for SECCOMP_FILTER_FLAG_LOG are not capable of inspecting the audit log to verify that the actions taken in the filter were logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- include/linux/seccomp.h | 3 +- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 26 +++++++--- tools/testing/selftests/seccomp/seccomp_bpf.c | 69 ++++++++++++++++++++++++++- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index ecc296c137cd..c8bef436b61d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,7 +3,8 @@ #include <uapi/linux/seccomp.h> -#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC) +#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ SECCOMP_FILTER_FLAG_LOG) #ifdef CONFIG_SECCOMP diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index aaad61cc46bc..19a611d0712e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -17,6 +17,7 @@ /* Valid flags for SECCOMP_SET_MODE_FILTER */ #define SECCOMP_FILTER_FLAG_TSYNC 1 +#define SECCOMP_FILTER_FLAG_LOG 2 /* * All BPF programs must return a 32-bit value. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 54357e361aea..ed9fde418fc4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,7 @@ * get/put helpers should be used when accessing an instance * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. + * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @@ -59,6 +60,7 @@ */ struct seccomp_filter { refcount_t usage; + bool log; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -452,6 +454,10 @@ static long seccomp_attach_filter(unsigned int flags, return ret; } + /* Set log flag, if present. */ + if (flags & SECCOMP_FILTER_FLAG_LOG) + filter->log = true; + /* * If there is an existing filter, make it the prev and don't drop its * task reference.
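As an illustration (not part of the patch), here is how userspace might combine the two mechanisms above. A minimal sketch, assuming uapi headers that already carry SECCOMP_FILTER_FLAG_LOG, and using a raw syscall(2) since glibc has no seccomp(2) wrapper at this point:

#include <errno.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* Deny uname(2) with EPERM; request logging for the whole filter. */
static int install_noisy_filter(void)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_uname, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = (unsigned short)(sizeof(insns) / sizeof(insns[0])),
		.filter = insns,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;
	/* Raw syscall; no libc wrapper for seccomp(2) exists yet. */
	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
		       SECCOMP_FILTER_FLAG_LOG, &prog);
}

With the filter loaded this way, the RET_ERRNO events are audited only while "errno" remains in the actions_logged sysctl introduced above; removing it there silences this filter without reloading it.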
@@ -532,15 +538,22 @@ static void seccomp_send_sigsys(int syscall, int reason) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; -static inline void seccomp_log(unsigned long syscall, long signr, u32 action) +static inline void seccomp_log(unsigned long syscall, long signr, u32 action, + bool requested) { bool log = false; switch (action) { case SECCOMP_RET_ALLOW: + break; case SECCOMP_RET_TRAP: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; + break; case SECCOMP_RET_ERRNO: + log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; + break; case SECCOMP_RET_TRACE: + log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; case SECCOMP_RET_KILL: default: @@ -548,8 +561,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action) } /* - * Force an audit message to be emitted when the action is RET_KILL and - * the action is allowed to be logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL or + * the FILTER_FLAG_LOG bit was set and the action is allowed to be + * logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -586,7 +600,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); do_exit(SIGKILL); } @@ -695,7 +709,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, case SECCOMP_RET_KILL: default: - seccomp_log(this_syscall, SIGSYS, action); + seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { siginfo_t info; @@ -712,7 +726,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, unreachable(); skip: - seccomp_log(this_syscall, 0, action); + seccomp_log(this_syscall, 0, action, match ? 
match->log : false); return -1; } #else diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index abf708e09892..1c8c22ce7740 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1739,6 +1739,10 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) #define SECCOMP_FILTER_FLAG_TSYNC 1 #endif +#ifndef SECCOMP_FILTER_FLAG_LOG +#define SECCOMP_FILTER_FLAG_LOG 2 +#endif + #ifndef seccomp int seccomp(unsigned int op, unsigned int flags, void *args) { @@ -1844,7 +1848,8 @@ TEST(seccomp_syscall_mode_lock) */ TEST(detect_seccomp_filter_flags) { - unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC }; + unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC, + SECCOMP_FILTER_FLAG_LOG }; unsigned int flag, all_flags; int i; long ret; @@ -2533,6 +2538,67 @@ TEST(syscall_restart) _metadata->passed = 0; } +TEST_SIGNAL(filter_flag_log, SIGSYS) +{ + struct sock_filter allow_filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_filter kill_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog allow_prog = { + .len = (unsigned short)ARRAY_SIZE(allow_filter), + .filter = allow_filter, + }; + struct sock_fprog kill_prog = { + .len = (unsigned short)ARRAY_SIZE(kill_filter), + .filter = kill_filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */ + ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG, + &allow_prog); + ASSERT_NE(ENOSYS, errno) { + TH_LOG("Kernel does not support seccomp syscall!"); + } + EXPECT_NE(0, ret) { + TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!"); + } + EXPECT_EQ(EINVAL, errno) { + TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!"); + } + + /* Verify that a simple, permissive filter can be added with no flags */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog); + EXPECT_EQ(0, ret); + + /* See if the same filter can be added with the FILTER_FLAG_LOG flag */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, + &allow_prog); + ASSERT_NE(EINVAL, errno) { + TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!"); + } + EXPECT_EQ(0, ret); + + /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */ + ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, + &kill_prog); + EXPECT_EQ(0, ret); + + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* getpid() should never return. */ + EXPECT_EQ(0, syscall(__NR_getpid)); +} + TEST(get_action_avail) { __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, @@ -2573,6 +2639,7 @@ TEST(get_action_avail) * - endianness checking when appropriate * - 64-bit arg prodding * - arch value testing (x86 modes especially) + * - verify that FILTER_FLAG_LOG filters generate log messages * - ... */ -- cgit v1.2.3 From 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 11 Aug 2017 04:33:57 +0000 Subject: seccomp: Action to log before allowing Add a new action, SECCOMP_RET_LOG, that logs a syscall before allowing the syscall. At the implementation level, this action is identical to the existing SECCOMP_RET_ALLOW action. 
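To make the intended filter shape concrete (a sketch, not from the series; the allowed syscalls are arbitrary examples), a development build can make SECCOMP_RET_LOG the default action:

#include <stddef.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

/* Allow a few known-needed syscalls outright; log, but still allow,
 * everything else while the allowlist is being developed. */
static struct sock_filter log_by_default[] = {
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
		 offsetof(struct seccomp_data, nr)),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read, 2, 0),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 1, 0),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_LOG),
};

Every syscall that falls through to SECCOMP_RET_LOG shows up in the log, and that list feeds directly back into new allow rules.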
However, it can be very useful when initially developing a seccomp filter for an application. The developer can set the default action to be SECCOMP_RET_LOG, maybe mark any obviously needed syscalls with SECCOMP_RET_ALLOW, and then put the application through its paces. A list of syscalls that triggered the default action (SECCOMP_RET_LOG) can be easily gleaned from the logs and that list can be used to build the syscall whitelist. Finally, the developer can change the default action to the desired value. This provides a more friendly experience than seeing the application get killed, then updating the filter and rebuilding the app, seeing the application get killed due to a different syscall, then updating the filter and rebuilding the app, etc. The functionality is similar to what's supported by the various LSMs. SELinux has permissive mode, AppArmor has complain mode, SMACK has bring-up mode, etc. SECCOMP_RET_LOG is given a lower value than SECCOMP_RET_ALLOW as allow while logging is slightly more restrictive than quietly allowing. Unfortunately, the tests added for SECCOMP_RET_LOG are not capable of inspecting the audit log to verify that the syscall was logged. With this patch, the logic for deciding if an action will be logged is: if action == RET_ALLOW: do not log else if action == RET_KILL && RET_KILL in actions_logged: log else if action == RET_LOG && RET_LOG in actions_logged: log else if filter-requests-logging && action in actions_logged: log else if audit_enabled && process-is-being-audited: log else: do not log Signed-off-by: Tyler Hicks Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 9 +++ include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 23 ++++-- tools/testing/selftests/seccomp/seccomp_bpf.c | 98 +++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index 2d1d8ab04ac5..f4977357daf2 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -141,6 +141,15 @@ In precedence order, they are: allow use of ptrace, even of other sandboxed processes, without extreme care; ptracers can use this mechanism to escape.) +``SECCOMP_RET_LOG``: + Results in the system call being executed after it is logged. This + should be used by application developers to learn which syscalls their + application needs without having to iterate through multiple test and + development cycles to build the list. + + This action will only be logged if "log" is present in the + actions_logged sysctl string. + ``SECCOMP_RET_ALLOW``: Results in the system call being executed. diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 19a611d0712e..f94433263e4b 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -31,6 +31,7 @@ #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. 
*/ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ed9fde418fc4..59cde2ed3b92 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -533,10 +533,12 @@ static void seccomp_send_sigsys(int syscall, int reason) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) -#define SECCOMP_LOG_ALLOW (1 << 5) +#define SECCOMP_LOG_LOG (1 << 5) +#define SECCOMP_LOG_ALLOW (1 << 6) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE; + SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | + SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) @@ -555,15 +557,18 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; + case SECCOMP_RET_LOG: + log = seccomp_actions_logged & SECCOMP_LOG_LOG; + break; case SECCOMP_RET_KILL: default: log = seccomp_actions_logged & SECCOMP_LOG_KILL; } /* - * Force an audit message to be emitted when the action is RET_KILL or - * the FILTER_FLAG_LOG bit was set and the action is allowed to be - * logged by the admin. + * Force an audit message to be emitted when the action is RET_KILL, + * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is + * allowed to be logged by the admin. */ if (log) return __audit_seccomp(syscall, signr, action); @@ -699,6 +704,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; + case SECCOMP_RET_LOG: + seccomp_log(this_syscall, 0, action, true); + return 0; + case SECCOMP_RET_ALLOW: /* * Note that the "match" filter will always be NULL for @@ -873,6 +882,7 @@ static long seccomp_get_action_avail(const char __user *uaction) case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: + case SECCOMP_RET_LOG: case SECCOMP_RET_ALLOW: break; default: @@ -1023,12 +1033,14 @@ out: #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" +#define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { @@ -1041,6 +1053,7 @@ static const struct seccomp_log_name seccomp_log_names[] = { { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, + { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME }, { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME }, { } }; diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 1c8c22ce7740..7372958eccb5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -74,7 +74,12 @@ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#endif +#ifndef SECCOMP_RET_LOG +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#endif +#ifndef SECCOMP_RET_ACTION /* Masks for the return value sections. 
*/ #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU @@ -342,6 +347,28 @@ TEST(empty_prog) EXPECT_EQ(EINVAL, errno); } +TEST(log_all) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG), + }; + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; + pid_t parent = getppid(); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + ASSERT_EQ(0, ret); + + /* getppid() should succeed and be logged (no check for logging) */ + EXPECT_EQ(parent, syscall(__NR_getppid)); +} + TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS) { struct sock_filter filter[] = { @@ -756,6 +783,7 @@ TEST_F(TRAP, handler) FIXTURE_DATA(precedence) { struct sock_fprog allow; + struct sock_fprog log; struct sock_fprog trace; struct sock_fprog error; struct sock_fprog trap; @@ -767,6 +795,13 @@ FIXTURE_SETUP(precedence) struct sock_filter allow_insns[] = { BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; + struct sock_filter log_insns[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG), + }; struct sock_filter trace_insns[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), @@ -803,6 +838,7 @@ FIXTURE_SETUP(precedence) memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \ self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns) FILTER_ALLOC(allow); + FILTER_ALLOC(log); FILTER_ALLOC(trace); FILTER_ALLOC(error); FILTER_ALLOC(trap); @@ -813,6 +849,7 @@ FIXTURE_TEARDOWN(precedence) { #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter) FILTER_FREE(allow); + FILTER_FREE(log); FILTER_FREE(trace); FILTER_FREE(error); FILTER_FREE(trap); @@ -830,6 +867,8 @@ TEST_F(precedence, allow_ok) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -854,6 +893,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -885,6 +926,8 @@ TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); @@ -906,6 +949,8 @@ TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -931,6 +976,8 @@ TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS) ASSERT_EQ(0, 
ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -952,6 +999,8 @@ TEST_F(precedence, errno_is_third) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); @@ -970,6 +1019,8 @@ TEST_F(precedence, errno_is_third_in_any_order) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error); ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); @@ -992,6 +1043,8 @@ TEST_F(precedence, trace_is_fourth) ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace); ASSERT_EQ(0, ret); /* Should work just fine. */ @@ -1013,12 +1066,54 @@ TEST_F(precedence, trace_is_fourth_in_any_order) ASSERT_EQ(0, ret); ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); /* Should work just fine. */ EXPECT_EQ(parent, syscall(__NR_getppid)); /* No ptracer */ EXPECT_EQ(-1, syscall(__NR_getpid)); } +TEST_F(precedence, log_is_fifth) +{ + pid_t mypid, parent; + long ret; + + mypid = getpid(); + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* Should also work just fine */ + EXPECT_EQ(mypid, syscall(__NR_getpid)); +} + +TEST_F(precedence, log_is_fifth_in_any_order) +{ + pid_t mypid, parent; + long ret; + + mypid = getpid(); + parent = getppid(); + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log); + ASSERT_EQ(0, ret); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow); + ASSERT_EQ(0, ret); + /* Should work just fine. */ + EXPECT_EQ(parent, syscall(__NR_getppid)); + /* Should also work just fine */ + EXPECT_EQ(mypid, syscall(__NR_getpid)); +} + #ifndef PTRACE_O_TRACESECCOMP #define PTRACE_O_TRACESECCOMP 0x00000080 #endif @@ -2603,7 +2698,7 @@ TEST(get_action_avail) { __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, - SECCOMP_RET_ALLOW }; + SECCOMP_RET_LOG, SECCOMP_RET_ALLOW }; __u32 unknown_action = 0x10000000U; int i; long ret; @@ -2640,6 +2735,7 @@ TEST(get_action_avail) * - 64-bit arg prodding * - arch value testing (x86 modes especially) * - verify that FILTER_FLAG_LOG filters generate log messages + * - verify that RET_LOG generates log messages * - ... 
*/ -- cgit v1.2.3 From fd76875ca289a3d4722f266fd2d5532a27083903 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 12:53:18 -0700 Subject: seccomp: Rename SECCOMP_RET_KILL to SECCOMP_RET_KILL_THREAD In preparation for adding SECCOMP_RET_KILL_PROCESS, rename SECCOMP_RET_KILL to the more accurate SECCOMP_RET_KILL_THREAD. The existing selftest values are intentionally left as SECCOMP_RET_KILL just to be sure we're exercising the alias. Signed-off-by: Kees Cook --- Documentation/networking/filter.txt | 2 +- Documentation/userspace-api/seccomp_filter.rst | 4 +-- include/uapi/linux/seccomp.h | 3 +- kernel/seccomp.c | 39 ++++++++++++++------------ samples/seccomp/bpf-direct.c | 4 +-- samples/seccomp/bpf-helper.h | 2 +- tools/testing/selftests/seccomp/seccomp_bpf.c | 17 ++++++----- 7 files changed, 39 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index b69b205501de..73aa0f12156d 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -337,7 +337,7 @@ Examples for low-level BPF: jeq #14, good /* __NR_rt_sigprocmask */ jeq #13, good /* __NR_rt_sigaction */ jeq #35, good /* __NR_nanosleep */ - bad: ret #0 /* SECCOMP_RET_KILL */ + bad: ret #0 /* SECCOMP_RET_KILL_THREAD */ good: ret #0x7fff0000 /* SECCOMP_RET_ALLOW */ The above example code can be placed into a file (here called "foo"), and diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index f4977357daf2..d76396f2d8ed 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -87,11 +87,11 @@ Return values A seccomp filter may return any of the following values. If multiple filters exist, the return value for the evaluation of a given system call will always use the highest precedent value. (For example, -``SECCOMP_RET_KILL`` will always take precedence.) +``SECCOMP_RET_KILL_THREAD`` will always take precedence.) In precedence order, they are: -``SECCOMP_RET_KILL``: +``SECCOMP_RET_KILL_THREAD``: Results in the task exiting immediately without executing the system call. The exit status of the task (``status & 0x7f``) will be ``SIGSYS``, not ``SIGKILL``. diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index f94433263e4b..5a03f699eb17 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -27,7 +27,8 @@ * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 59cde2ed3b92..95ac54cff00f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open. 
*/ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL; + return SECCOMP_RET_KILL_THREAD; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,15 +529,17 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 0) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP | - SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | + SECCOMP_LOG_TRAP | + SECCOMP_LOG_ERRNO | + SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, @@ -560,13 +562,13 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: - log = seccomp_actions_logged & SECCOMP_LOG_KILL; + log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; } /* - * Force an audit message to be emitted when the action is RET_KILL, + * Force an audit message to be emitted when the action is RET_KILL_*, * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is * allowed to be logged by the admin. */ @@ -605,7 +607,7 @@ static void __secure_computing_strict(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true); + seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } @@ -716,7 +718,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ return 0; - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. 
*/ @@ -878,7 +880,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { - case SECCOMP_RET_KILL: + case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: case SECCOMP_RET_TRACE: @@ -1029,19 +1031,20 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ -#define SECCOMP_RET_KILL_NAME "kill" +#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" #define SECCOMP_RET_TRACE_NAME "trace" #define SECCOMP_RET_LOG_NAME "log" #define SECCOMP_RET_ALLOW_NAME "allow" -static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME " " - SECCOMP_RET_TRAP_NAME " " - SECCOMP_RET_ERRNO_NAME " " - SECCOMP_RET_TRACE_NAME " " - SECCOMP_RET_LOG_NAME " " - SECCOMP_RET_ALLOW_NAME; +static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_THREAD_NAME " " + SECCOMP_RET_TRAP_NAME " " + SECCOMP_RET_ERRNO_NAME " " + SECCOMP_RET_TRACE_NAME " " + SECCOMP_RET_LOG_NAME " " + SECCOMP_RET_ALLOW_NAME; struct seccomp_log_name { u32 log; @@ -1049,7 +1052,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { - { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME }, + { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME }, diff --git a/samples/seccomp/bpf-direct.c b/samples/seccomp/bpf-direct.c index 151ec3f52189..235ce3c49ee9 100644 --- a/samples/seccomp/bpf-direct.c +++ b/samples/seccomp/bpf-direct.c @@ -129,7 +129,7 @@ static int install_filter(void) /* Check that read is only using stdin. */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, STDIN_FILENO, 4, 0), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), /* Check that write is only using stdout */ BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_arg(0)), @@ -139,7 +139,7 @@ static int install_filter(void) BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRAP), - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD), }; struct sock_fprog prog = { .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), diff --git a/samples/seccomp/bpf-helper.h b/samples/seccomp/bpf-helper.h index 1d8de9edd858..83dbe79cbe2c 100644 --- a/samples/seccomp/bpf-helper.h +++ b/samples/seccomp/bpf-helper.h @@ -44,7 +44,7 @@ void seccomp_bpf_print(struct sock_filter *filter, size_t count); #define ALLOW \ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW) #define DENY \ - BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL_THREAD) #define JUMP(labels, label) \ BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \ JUMP_JT, JUMP_JF) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 7372958eccb5..a3ba39a32449 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -68,15 +68,18 @@ #define SECCOMP_MODE_FILTER 2 #endif +#ifndef SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#endif #ifndef SECCOMP_RET_KILL -#define SECCOMP_RET_KILL 0x00000000U /* kill the task immediately */ -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ 
-#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ #endif #ifndef SECCOMP_RET_LOG -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ #endif #ifndef SECCOMP_RET_ACTION @@ -2696,7 +2699,7 @@ TEST_SIGNAL(filter_flag_log, SIGSYS) TEST(get_action_avail) { - __u32 actions[] = { SECCOMP_RET_KILL, SECCOMP_RET_TRAP, + __u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP, SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE, SECCOMP_RET_LOG, SECCOMP_RET_ALLOW }; __u32 unknown_action = 0x10000000U; -- cgit v1.2.3 From 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:01:39 -0700 Subject: seccomp: Introduce SECCOMP_RET_KILL_PROCESS This introduces the BPF return value for SECCOMP_RET_KILL_PROCESS to kill an entire process. This cannot yet be reached by seccomp, but it changes the default-kill behavior (for unknown return values) from kill-thread to kill-process. Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 18 ++++++++++-------- kernel/seccomp.c | 22 ++++++++++++++------ 2 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 5a03f699eb17..7e77c92df78a 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -22,18 +22,20 @@ /* * All BPF programs must return a 32-bit value. * The bottom 16-bits are for optional return data. - * The upper 16-bits are ordered from least permissive values to most. + * The upper 16-bits are ordered from least permissive values to most, + * as a signed value (so 0x80000000 is negative). * * The ordering ensures that a min_t() over composed return values always * selects the least permissive choice. */ -#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ -#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD -#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ -#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ -#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ -#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ -#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ +#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */ +#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */ +#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD +#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ +#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ +#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ +#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ +#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ #define SECCOMP_RET_ACTION 0x7fff0000U diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 95ac54cff00f..5c7299b9d953 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -192,7 +192,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, /* Ensure unexpected behavior doesn't result in failing open.
*/ if (unlikely(WARN_ON(f == NULL))) - return SECCOMP_RET_KILL_THREAD; + return SECCOMP_RET_KILL_PROCESS; if (!sd) { populate_seccomp_data(&sd_local); @@ -529,14 +529,16 @@ static void seccomp_send_sigsys(int syscall, int reason) #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ -#define SECCOMP_LOG_KILL_THREAD (1 << 0) +#define SECCOMP_LOG_KILL_PROCESS (1 << 0) +#define SECCOMP_LOG_KILL_THREAD (1 << 1) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) -static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_THREAD | +static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | + SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE | @@ -563,8 +565,11 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; case SECCOMP_RET_KILL_THREAD: - default: log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; + break; + case SECCOMP_RET_KILL_PROCESS: + default: + log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; } /* @@ -719,10 +724,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL_THREAD: + case SECCOMP_RET_KILL_PROCESS: default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (get_nr_threads(current) == 1) { + if (action == SECCOMP_RET_KILL_PROCESS || + get_nr_threads(current) == 1) { siginfo_t info; /* Show the original registers in the dump. */ @@ -731,7 +738,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - do_exit(SIGSYS); + if (action == SECCOMP_RET_KILL_PROCESS) + do_group_exit(SIGSYS); + else + do_exit(SIGSYS); } unreachable(); -- cgit v1.2.3 From 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Aug 2017 13:12:11 -0700 Subject: seccomp: Implement SECCOMP_RET_KILL_PROCESS action Right now, SECCOMP_RET_KILL_THREAD (née SECCOMP_RET_KILL) kills the current thread. There have been a few requests for this to kill the entire process (the thread group). This cannot simply be changed (discovered when adding coredump support, since coredumping kills the entire process) because there are userspace programs depending on the thread-kill behavior. Instead, implement SECCOMP_RET_KILL_PROCESS, which is 0x80000000, and can be processed as "-1" by the kernel, below the existing RET_KILL that is ABI-set to "0". For userspace, SECCOMP_RET_ACTION_FULL is added to expand the mask to the signed bit. Old userspace using the SECCOMP_RET_ACTION mask will see SECCOMP_RET_KILL_PROCESS as 0 still, but this would only be visible when examining the siginfo in a core dump from a RET_KILL_*, where it will think it was thread-killed instead of process-killed. Attempts to introduce this behavior via other ways (filter flags, seccomp struct flags, masked RET_DATA bits) all come with weird side-effects and baggage. This change preserves the central behavioral expectations of the seccomp filter engine without putting too great a burden on changes needed in userspace to use the new action.
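To make the mask arithmetic concrete, a standalone sketch (editorial, using only the uapi values this patch introduces):

#include <stdint.h>
#include <stdio.h>

#define SECCOMP_RET_ACTION_FULL		0xffff0000U
#define SECCOMP_RET_ACTION		0x7fff0000U
#define SECCOMP_RET_KILL_PROCESS	0x80000000U
#define SECCOMP_RET_KILL_THREAD		0x00000000U

int main(void)
{
	uint32_t kp = SECCOMP_RET_KILL_PROCESS;

	/* The old mask drops the sign bit, so KILL_PROCESS reads as 0,
	 * i.e. as KILL_THREAD, which is what old userspace observes. */
	printf("old view: %#x\n", kp & SECCOMP_RET_ACTION);

	/* Under the full mask, compared as s32, KILL_PROCESS is negative
	 * and sorts below every action in least-permissive-wins
	 * selection; this prints 1. */
	printf("kp below kt: %d\n",
	       (int32_t)(kp & SECCOMP_RET_ACTION_FULL) <
	       (int32_t)SECCOMP_RET_KILL_THREAD);
	return 0;
}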
The new action is discoverable by userspace through either the new actions_avail sysctl or through the SECCOMP_GET_ACTION_AVAIL seccomp operation. If used without checking for availability, old kernels will treat RET_KILL_PROCESS as RET_KILL_THREAD (since the old mask will produce RET_KILL_THREAD). Cc: Paul Moore Cc: Fabricio Voznika Signed-off-by: Kees Cook --- Documentation/userspace-api/seccomp_filter.rst | 7 ++++++- include/uapi/linux/seccomp.h | 1 + kernel/seccomp.c | 9 +++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst index d76396f2d8ed..099c412951d6 100644 --- a/Documentation/userspace-api/seccomp_filter.rst +++ b/Documentation/userspace-api/seccomp_filter.rst @@ -87,10 +87,15 @@ Return values A seccomp filter may return any of the following values. If multiple filters exist, the return value for the evaluation of a given system call will always use the highest precedent value. (For example, -``SECCOMP_RET_KILL_THREAD`` will always take precedence.) +``SECCOMP_RET_KILL_PROCESS`` will always take precedence.) In precedence order, they are: +``SECCOMP_RET_KILL_PROCESS``: + Results in the entire process exiting immediately without executing + the system call. The exit status of the task (``status & 0x7f``) + will be ``SIGSYS``, not ``SIGKILL``. + ``SECCOMP_RET_KILL_THREAD``: Results in the task exiting immediately without executing the system call. The exit status of the task (``status & 0x7f``) will diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 7e77c92df78a..f6bc1dea3247 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -38,6 +38,7 @@ #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ /* Masks for the return value sections. */ +#define SECCOMP_RET_ACTION_FULL 0xffff0000U #define SECCOMP_RET_ACTION 0x7fff0000U #define SECCOMP_RET_DATA 0x0000ffffU diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5c7299b9d953..c24579dfa7a1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -181,6 +181,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. 
*/ +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { @@ -206,7 +207,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); - if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) { + if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } @@ -650,7 +651,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, filter_ret = seccomp_run_filters(sd, &match); data = filter_ret & SECCOMP_RET_DATA; - action = filter_ret & SECCOMP_RET_ACTION; + action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: @@ -890,6 +891,7 @@ static long seccomp_get_action_avail(const char __user *uaction) return -EFAULT; switch (action) { + case SECCOMP_RET_KILL_PROCESS: case SECCOMP_RET_KILL_THREAD: case SECCOMP_RET_TRAP: case SECCOMP_RET_ERRNO: @@ -1041,6 +1043,7 @@ out: #ifdef CONFIG_SYSCTL /* Human readable action names for friendly sysctl interaction */ +#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process" #define SECCOMP_RET_KILL_THREAD_NAME "kill_thread" #define SECCOMP_RET_TRAP_NAME "trap" #define SECCOMP_RET_ERRNO_NAME "errno" @@ -1049,6 +1052,7 @@ out: #define SECCOMP_RET_ALLOW_NAME "allow" static const char seccomp_actions_avail[] = + SECCOMP_RET_KILL_PROCESS_NAME " " SECCOMP_RET_KILL_THREAD_NAME " " SECCOMP_RET_TRAP_NAME " " SECCOMP_RET_ERRNO_NAME " " @@ -1062,6 +1066,7 @@ struct seccomp_log_name { }; static const struct seccomp_log_name seccomp_log_names[] = { + { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME }, { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME }, { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME }, { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME }, -- cgit v1.2.3 From a99b646afa8a02571ea298bedca6592d818229cd Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Tue, 15 Aug 2017 11:23:23 +0800 Subject: PCI: Disable PCIe Relaxed Ordering if unsupported When bit 4 is set in the PCIe Device Control register, the device is permitted to use relaxed ordering. On some platforms, using relaxed ordering can cause performance problems or, due to errata, data corruption. In such cases, devices must avoid using relaxed ordering. The patch adds a new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING to indicate that the Relaxed Ordering (RO) attribute should not be used for Transaction Layer Packets (TLPs) targeted towards these affected root complexes. This patch checks whether any node in the hierarchy indicates that using relaxed ordering is not safe, and in that case turns relaxed ordering off by clearing the enable bit for this device. Signed-off-by: Casey Leedom Signed-off-by: Ding Tianhong Acked-by: Ashok Raj Acked-by: Alexander Duyck Acked-by: Casey Leedom Signed-off-by: David S.
Miller --- drivers/pci/probe.c | 43 +++++++++++++++++++++++++++++++++++++++++++ drivers/pci/quirks.c | 11 +++++++++++ include/linux/pci.h | 3 +++ 3 files changed, 57 insertions(+) (limited to 'include') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index c31310db0404..e6a917b4acd3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1762,6 +1762,48 @@ static void pci_configure_extended_tags(struct pci_dev *dev) PCI_EXP_DEVCTL_EXT_TAG); } +/** + * pcie_relaxed_ordering_enabled - Probe for PCIe relaxed ordering enable + * @dev: PCI device to query + * + * Returns true if the device has enabled relaxed ordering attribute. + */ +bool pcie_relaxed_ordering_enabled(struct pci_dev *dev) +{ + u16 v; + + pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &v); + + return !!(v & PCI_EXP_DEVCTL_RELAX_EN); +} +EXPORT_SYMBOL(pcie_relaxed_ordering_enabled); + +static void pci_configure_relaxed_ordering(struct pci_dev *dev) +{ + struct pci_dev *root; + + /* PCI_EXP_DEVICE_RELAX_EN is RsvdP in VFs */ + if (dev->is_virtfn) + return; + + if (!pcie_relaxed_ordering_enabled(dev)) + return; + + /* + * For now, we only deal with Relaxed Ordering issues with Root + * Ports. Peer-to-Peer DMA is another can of worms. + */ + root = pci_find_pcie_root_port(dev); + if (!root) + return; + + if (root->dev_flags & PCI_DEV_FLAGS_NO_RELAXED_ORDERING) { + pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_RELAX_EN); + dev_info(&dev->dev, "Disable Relaxed Ordering because the Root Port didn't support it\n"); + } +} + static void pci_configure_device(struct pci_dev *dev) { struct hotplug_params hpp; @@ -1769,6 +1811,7 @@ static void pci_configure_device(struct pci_dev *dev) pci_configure_mps(dev); pci_configure_extended_tags(dev); + pci_configure_relaxed_ordering(dev); memset(&hpp, 0, sizeof(hpp)); ret = pci_get_hp_params(dev, &hpp); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b4cf6b..61b59bfa7bfc 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4015,6 +4015,17 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(0x1797, 0x6868, PCI_CLASS_NOT_DEFINED, 8, DECLARE_PCI_FIXUP_CLASS_EARLY(0x1797, 0x6869, PCI_CLASS_NOT_DEFINED, 8, quirk_tw686x_class); +/* + * Some devices have problems with Transaction Layer Packets with the Relaxed + * Ordering Attribute set. Such devices should mark themselves and other + * Device Drivers should check before sending TLPs with RO set. + */ +static void quirk_relaxedordering_disable(struct pci_dev *dev) +{ + dev->dev_flags |= PCI_DEV_FLAGS_NO_RELAXED_ORDERING; + dev_info(&dev->dev, "Disable Relaxed Ordering Attributes to avoid PCIe Completion erratum\n"); +} + /* * Per PCIe r3.0, sec 2.2.9, "Completion headers must supply the same * values for the Attribute as were supplied in the header of the diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..29606fb89464 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -188,6 +188,8 @@ enum pci_dev_flags { * the direct_complete optimization. 
*/ PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), + /* Don't use Relaxed Ordering for TLPs directed at this device */ + PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 12), }; enum pci_irq_reroute_variant { @@ -1125,6 +1127,7 @@ bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); void pci_d3cold_enable(struct pci_dev *dev); void pci_d3cold_disable(struct pci_dev *dev); +bool pcie_relaxed_ordering_enabled(struct pci_dev *dev); /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); -- cgit v1.2.3 From 42b7305905be52e467bbc346b0f2f95ad44eb1a0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 14 Aug 2017 21:31:38 +0200 Subject: udp: fix linear skb reception with PEEK_OFF copy_linear_skb() is broken; both of its callers actually expect 'len' to be the amount we are trying to copy, not the offset of the end. Fix it keeping the meanings of arguments in sync with what the callers (both of them) expect. Also restore a saner behavior on EFAULT (i.e. preserving the iov_iter position in case of failure): The commit fd851ba9caa9 ("udp: harden copy_linear_skb()") avoids the more destructive effect of the buggy copy_linear_skb(), e.g. no more invalid memory access, but said function still behaves incorrectly: when peeking with offset it can fail with EINVAL instead of copying the appropriate amount of memory. Reported-by: Sasha Levin Fixes: b65ac44674dd ("udp: try to avoid 2 cache miss on dequeue") Fixes: fd851ba9caa9 ("udp: harden copy_linear_skb()") Signed-off-by: Al Viro Acked-by: Paolo Abeni Tested-by: Sasha Levin Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index e9b1d1eacb59..586de4b811b5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -366,14 +366,13 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb) static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { - int n, copy = len - off; + int n; - if (copy < 0) - return -EINVAL; - n = copy_to_iter(skb->data + off, copy, to); - if (n == copy) + n = copy_to_iter(skb->data + off, len, to); + if (n == len) return 0; + iov_iter_revert(to, n); return -EFAULT; } -- cgit v1.2.3 From 0335a9554b4d6acc2d17efe752bff54ce59b57eb Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Aug 2017 18:34:22 +0200 Subject: mfd: dm355evm_msp: Move header file out of I2C realm include/linux/i2c is not for client devices. Move the header file to a more appropriate location. 
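(Not part of the patch: for out-of-tree users of this header, the move only changes the include path. The bracketed paths are elided in the diff context above, but follow from the file rename:)

-#include <linux/i2c/dm355evm_msp.h>
+#include <linux/mfd/dm355evm_msp.h>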
Signed-off-by: Wolfram Sang Acked-by: Alexandre Belloni Acked-by: Dmitry Torokhov Signed-off-by: Lee Jones --- drivers/input/misc/dm355evm_keys.c | 2 +- drivers/mfd/dm355evm_msp.c | 2 +- drivers/rtc/rtc-dm355evm.c | 2 +- include/linux/i2c/dm355evm_msp.h | 79 -------------------------------------- include/linux/mfd/dm355evm_msp.h | 79 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 82 deletions(-) delete mode 100644 include/linux/i2c/dm355evm_msp.h create mode 100644 include/linux/mfd/dm355evm_msp.h (limited to 'include') diff --git a/drivers/input/misc/dm355evm_keys.c b/drivers/input/misc/dm355evm_keys.c index bab256ef32b9..c803db64a376 100644 --- a/drivers/input/misc/dm355evm_keys.c +++ b/drivers/input/misc/dm355evm_keys.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include diff --git a/drivers/mfd/dm355evm_msp.c b/drivers/mfd/dm355evm_msp.c index 86eca614507b..2a2756709f22 100644 --- a/drivers/mfd/dm355evm_msp.c +++ b/drivers/mfd/dm355evm_msp.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index f225cd873ff6..97d8259b9494 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include diff --git a/include/linux/i2c/dm355evm_msp.h b/include/linux/i2c/dm355evm_msp.h deleted file mode 100644 index 372470350fab..000000000000 --- a/include/linux/i2c/dm355evm_msp.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * dm355evm_msp.h - support MSP430 microcontroller on DM355EVM board - */ -#ifndef __LINUX_I2C_DM355EVM_MSP -#define __LINUX_I2C_DM355EVM_MSP - -/* - * Written against Spectrum's writeup for the A4 firmware revision, - * and tweaked to match source and rev D2 schematics by removing CPLD - * and NOR flash hooks (which were last appropriate in rev B boards). - * - * Note that the firmware supports a flavor of write posting ... to be - * sure a write completes, issue another read or write. - */ - -/* utilities to access "registers" emulated by msp430 firmware */ -extern int dm355evm_msp_write(u8 value, u8 reg); -extern int dm355evm_msp_read(u8 reg); - - -/* command/control registers */ -#define DM355EVM_MSP_COMMAND 0x00 -# define MSP_COMMAND_NULL 0 -# define MSP_COMMAND_RESET_COLD 1 -# define MSP_COMMAND_RESET_WARM 2 -# define MSP_COMMAND_RESET_WARM_I 3 -# define MSP_COMMAND_POWEROFF 4 -# define MSP_COMMAND_IR_REINIT 5 -#define DM355EVM_MSP_STATUS 0x01 -# define MSP_STATUS_BAD_OFFSET BIT(0) -# define MSP_STATUS_BAD_COMMAND BIT(1) -# define MSP_STATUS_POWER_ERROR BIT(2) -# define MSP_STATUS_RXBUF_OVERRUN BIT(3) -#define DM355EVM_MSP_RESET 0x02 /* 0 bits == in reset */ -# define MSP_RESET_DC5 BIT(0) -# define MSP_RESET_TVP5154 BIT(2) -# define MSP_RESET_IMAGER BIT(3) -# define MSP_RESET_ETHERNET BIT(4) -# define MSP_RESET_SYS BIT(5) -# define MSP_RESET_AIC33 BIT(7) - -/* GPIO registers ... 
bit patterns mostly match the source MSP ports */ -#define DM355EVM_MSP_LED 0x03 /* active low (MSP P4) */ -#define DM355EVM_MSP_SWITCH1 0x04 /* (MSP P5, masked) */ -# define MSP_SWITCH1_SW6_1 BIT(0) -# define MSP_SWITCH1_SW6_2 BIT(1) -# define MSP_SWITCH1_SW6_3 BIT(2) -# define MSP_SWITCH1_SW6_4 BIT(3) -# define MSP_SWITCH1_J1 BIT(4) /* NTSC/PAL */ -# define MSP_SWITCH1_MSP_INT BIT(5) /* active low */ -#define DM355EVM_MSP_SWITCH2 0x05 /* (MSP P6, masked) */ -# define MSP_SWITCH2_SW10 BIT(3) -# define MSP_SWITCH2_SW11 BIT(4) -# define MSP_SWITCH2_SW12 BIT(5) -# define MSP_SWITCH2_SW13 BIT(6) -# define MSP_SWITCH2_SW14 BIT(7) -#define DM355EVM_MSP_SDMMC 0x06 /* (MSP P2, masked) */ -# define MSP_SDMMC_0_WP BIT(1) -# define MSP_SDMMC_0_CD BIT(2) /* active low */ -# define MSP_SDMMC_1_WP BIT(3) -# define MSP_SDMMC_1_CD BIT(4) /* active low */ -#define DM355EVM_MSP_FIRMREV 0x07 /* not a GPIO (out of order) */ -#define DM355EVM_MSP_VIDEO_IN 0x08 /* (MSP P3, masked) */ -# define MSP_VIDEO_IMAGER BIT(7) /* low == tvp5146 */ - -/* power supply registers are currently omitted */ - -/* RTC registers */ -#define DM355EVM_MSP_RTC_0 0x12 /* LSB */ -#define DM355EVM_MSP_RTC_1 0x13 -#define DM355EVM_MSP_RTC_2 0x14 -#define DM355EVM_MSP_RTC_3 0x15 /* MSB */ - -/* input event queue registers; code == ((HIGH << 8) | LOW) */ -#define DM355EVM_MSP_INPUT_COUNT 0x16 /* decrement by reading LOW */ -#define DM355EVM_MSP_INPUT_HIGH 0x17 -#define DM355EVM_MSP_INPUT_LOW 0x18 - -#endif /* __LINUX_I2C_DM355EVM_MSP */ diff --git a/include/linux/mfd/dm355evm_msp.h b/include/linux/mfd/dm355evm_msp.h new file mode 100644 index 000000000000..372470350fab --- /dev/null +++ b/include/linux/mfd/dm355evm_msp.h @@ -0,0 +1,79 @@ +/* + * dm355evm_msp.h - support MSP430 microcontroller on DM355EVM board + */ +#ifndef __LINUX_I2C_DM355EVM_MSP +#define __LINUX_I2C_DM355EVM_MSP + +/* + * Written against Spectrum's writeup for the A4 firmware revision, + * and tweaked to match source and rev D2 schematics by removing CPLD + * and NOR flash hooks (which were last appropriate in rev B boards). + * + * Note that the firmware supports a flavor of write posting ... to be + * sure a write completes, issue another read or write. + */ + +/* utilities to access "registers" emulated by msp430 firmware */ +extern int dm355evm_msp_write(u8 value, u8 reg); +extern int dm355evm_msp_read(u8 reg); + + +/* command/control registers */ +#define DM355EVM_MSP_COMMAND 0x00 +# define MSP_COMMAND_NULL 0 +# define MSP_COMMAND_RESET_COLD 1 +# define MSP_COMMAND_RESET_WARM 2 +# define MSP_COMMAND_RESET_WARM_I 3 +# define MSP_COMMAND_POWEROFF 4 +# define MSP_COMMAND_IR_REINIT 5 +#define DM355EVM_MSP_STATUS 0x01 +# define MSP_STATUS_BAD_OFFSET BIT(0) +# define MSP_STATUS_BAD_COMMAND BIT(1) +# define MSP_STATUS_POWER_ERROR BIT(2) +# define MSP_STATUS_RXBUF_OVERRUN BIT(3) +#define DM355EVM_MSP_RESET 0x02 /* 0 bits == in reset */ +# define MSP_RESET_DC5 BIT(0) +# define MSP_RESET_TVP5154 BIT(2) +# define MSP_RESET_IMAGER BIT(3) +# define MSP_RESET_ETHERNET BIT(4) +# define MSP_RESET_SYS BIT(5) +# define MSP_RESET_AIC33 BIT(7) + +/* GPIO registers ... 
bit patterns mostly match the source MSP ports */ +#define DM355EVM_MSP_LED 0x03 /* active low (MSP P4) */ +#define DM355EVM_MSP_SWITCH1 0x04 /* (MSP P5, masked) */ +# define MSP_SWITCH1_SW6_1 BIT(0) +# define MSP_SWITCH1_SW6_2 BIT(1) +# define MSP_SWITCH1_SW6_3 BIT(2) +# define MSP_SWITCH1_SW6_4 BIT(3) +# define MSP_SWITCH1_J1 BIT(4) /* NTSC/PAL */ +# define MSP_SWITCH1_MSP_INT BIT(5) /* active low */ +#define DM355EVM_MSP_SWITCH2 0x05 /* (MSP P6, masked) */ +# define MSP_SWITCH2_SW10 BIT(3) +# define MSP_SWITCH2_SW11 BIT(4) +# define MSP_SWITCH2_SW12 BIT(5) +# define MSP_SWITCH2_SW13 BIT(6) +# define MSP_SWITCH2_SW14 BIT(7) +#define DM355EVM_MSP_SDMMC 0x06 /* (MSP P2, masked) */ +# define MSP_SDMMC_0_WP BIT(1) +# define MSP_SDMMC_0_CD BIT(2) /* active low */ +# define MSP_SDMMC_1_WP BIT(3) +# define MSP_SDMMC_1_CD BIT(4) /* active low */ +#define DM355EVM_MSP_FIRMREV 0x07 /* not a GPIO (out of order) */ +#define DM355EVM_MSP_VIDEO_IN 0x08 /* (MSP P3, masked) */ +# define MSP_VIDEO_IMAGER BIT(7) /* low == tvp5146 */ + +/* power supply registers are currently omitted */ + +/* RTC registers */ +#define DM355EVM_MSP_RTC_0 0x12 /* LSB */ +#define DM355EVM_MSP_RTC_1 0x13 +#define DM355EVM_MSP_RTC_2 0x14 +#define DM355EVM_MSP_RTC_3 0x15 /* MSB */ + +/* input event queue registers; code == ((HIGH << 8) | LOW) */ +#define DM355EVM_MSP_INPUT_COUNT 0x16 /* decrement by reading LOW */ +#define DM355EVM_MSP_INPUT_HIGH 0x17 +#define DM355EVM_MSP_INPUT_LOW 0x18 + +#endif /* __LINUX_I2C_DM355EVM_MSP */ -- cgit v1.2.3 From 9787076c43a86f3465d46aa344cc3e2f607ae72f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Aug 2017 18:34:23 +0200 Subject: mfd: tps65010: Move header file out of I2C realm include/linux/i2c is not for client devices. Move the header file to a more appropriate location. 
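(Likewise editorial: downstream users of this header need only the new include path, per the rename above:)

-#include <linux/i2c/tps65010.h>
+#include <linux/mfd/tps65010.h>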
Signed-off-by: Wolfram Sang Acked-by: Greg Kroah-Hartman Acked-by: Krzysztof Kozlowski Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Tony Lindgren Signed-off-by: Lee Jones --- arch/arm/mach-omap1/board-h2-mmc.c | 2 +- arch/arm/mach-omap1/board-h2.c | 2 +- arch/arm/mach-omap1/board-h3-mmc.c | 2 +- arch/arm/mach-omap1/board-h3.c | 2 +- arch/arm/mach-omap1/board-osk.c | 2 +- arch/arm/mach-s3c24xx/mach-osiris-dvs.c | 2 +- arch/arm/mach-s3c24xx/mach-osiris.c | 2 +- drivers/mfd/tps65010.c | 2 +- drivers/usb/host/ohci-omap.c | 2 +- drivers/usb/phy/phy-isp1301-omap.c | 2 +- drivers/video/fbdev/omap/lcd_h3.c | 2 +- include/linux/i2c/tps65010.h | 205 -------------------------------- include/linux/mfd/tps65010.h | 205 ++++++++++++++++++++++++++++++++ 13 files changed, 216 insertions(+), 216 deletions(-) delete mode 100644 include/linux/i2c/tps65010.h create mode 100644 include/linux/mfd/tps65010.h (limited to 'include') diff --git a/arch/arm/mach-omap1/board-h2-mmc.c b/arch/arm/mach-omap1/board-h2-mmc.c index 357be2debc9d..91bda9c802ff 100644 --- a/arch/arm/mach-omap1/board-h2-mmc.c +++ b/arch/arm/mach-omap1/board-h2-mmc.c @@ -14,7 +14,7 @@ #include #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include "board-h2.h" #include "mmc.h" diff --git a/arch/arm/mach-omap1/board-h2.c b/arch/arm/mach-omap1/board-h2.c index 675254ee4b1e..dece47d76282 100644 --- a/arch/arm/mach-omap1/board-h2.c +++ b/arch/arm/mach-omap1/board-h2.c @@ -28,7 +28,7 @@ #include #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include #include #include diff --git a/arch/arm/mach-omap1/board-h3-mmc.c b/arch/arm/mach-omap1/board-h3-mmc.c index 4f58bfa5e754..692c267a9a90 100644 --- a/arch/arm/mach-omap1/board-h3-mmc.c +++ b/arch/arm/mach-omap1/board-h3-mmc.c @@ -14,7 +14,7 @@ #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include "common.h" #include "board-h3.h" diff --git a/arch/arm/mach-omap1/board-h3.c b/arch/arm/mach-omap1/board-h3.c index e62f9d454f10..6d32beeb2d88 100644 --- a/arch/arm/mach-omap1/board-h3.c +++ b/arch/arm/mach-omap1/board-h3.c @@ -28,7 +28,7 @@ #include #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include #include #include diff --git a/arch/arm/mach-omap1/board-osk.c b/arch/arm/mach-omap1/board-osk.c index 4dfb99504810..f20361b8ffb6 100644 --- a/arch/arm/mach-omap1/board-osk.c +++ b/arch/arm/mach-omap1/board-osk.c @@ -38,7 +38,7 @@ #include #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include #include diff --git a/arch/arm/mach-s3c24xx/mach-osiris-dvs.c b/arch/arm/mach-s3c24xx/mach-osiris-dvs.c index 262ab0744748..6cac7da15e2b 100644 --- a/arch/arm/mach-s3c24xx/mach-osiris-dvs.c +++ b/arch/arm/mach-s3c24xx/mach-osiris-dvs.c @@ -17,7 +17,7 @@ #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include #include diff --git a/arch/arm/mach-s3c24xx/mach-osiris.c b/arch/arm/mach-s3c24xx/mach-osiris.c index 70b0eb7d3134..64b1a0b7b803 100644 --- a/arch/arm/mach-s3c24xx/mach-osiris.c +++ b/arch/arm/mach-s3c24xx/mach-osiris.c @@ -24,7 +24,7 @@ #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include #include diff --git a/drivers/mfd/tps65010.c b/drivers/mfd/tps65010.c index d829a6131f09..2ab67386b4ef 100644 --- a/drivers/mfd/tps65010.c +++ b/drivers/mfd/tps65010.c @@ -32,7 +32,7 @@ #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index a4d814b7f380..91393ec7d850 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -53,7 +53,7 @@ #define DRIVER_DESC "OHCI OMAP driver" #ifdef CONFIG_TPS65010 -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #else #define LOW 0 diff --git a/drivers/usb/phy/phy-isp1301-omap.c
b/drivers/usb/phy/phy-isp1301-omap.c index 042c5a8fd423..c6052c814bcc 100644 --- a/drivers/usb/phy/phy-isp1301-omap.c +++ b/drivers/usb/phy/phy-isp1301-omap.c @@ -96,7 +96,7 @@ struct isp1301 { #if IS_REACHABLE(CONFIG_TPS65010) -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #else diff --git a/drivers/video/fbdev/omap/lcd_h3.c b/drivers/video/fbdev/omap/lcd_h3.c index 9d2da146813e..796f4634c4c6 100644 --- a/drivers/video/fbdev/omap/lcd_h3.c +++ b/drivers/video/fbdev/omap/lcd_h3.c @@ -21,7 +21,7 @@ #include #include -#include <linux/i2c/tps65010.h> +#include <linux/mfd/tps65010.h> #include #include "omapfb.h" diff --git a/include/linux/i2c/tps65010.h b/include/linux/i2c/tps65010.h deleted file mode 100644 index 08aa92278d71..000000000000 --- a/include/linux/i2c/tps65010.h +++ /dev/null @@ -1,205 +0,0 @@ -/* linux/i2c/tps65010.h - * - * Functions to access TPS65010 power management device. - * - * Copyright (C) 2004 Dirk Behme - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN - * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON - * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA.
- */ - -#ifndef __LINUX_I2C_TPS65010_H -#define __LINUX_I2C_TPS65010_H - -/* - * ---------------------------------------------------------------------------- - * Registers, all 8 bits - * ---------------------------------------------------------------------------- - */ - -#define TPS_CHGSTATUS 0x01 -# define TPS_CHG_USB (1 << 7) -# define TPS_CHG_AC (1 << 6) -# define TPS_CHG_THERM (1 << 5) -# define TPS_CHG_TERM (1 << 4) -# define TPS_CHG_TAPER_TMO (1 << 3) -# define TPS_CHG_CHG_TMO (1 << 2) -# define TPS_CHG_PRECHG_TMO (1 << 1) -# define TPS_CHG_TEMP_ERR (1 << 0) -#define TPS_REGSTATUS 0x02 -# define TPS_REG_ONOFF (1 << 7) -# define TPS_REG_COVER (1 << 6) -# define TPS_REG_UVLO (1 << 5) -# define TPS_REG_NO_CHG (1 << 4) /* tps65013 */ -# define TPS_REG_PG_LD02 (1 << 3) -# define TPS_REG_PG_LD01 (1 << 2) -# define TPS_REG_PG_MAIN (1 << 1) -# define TPS_REG_PG_CORE (1 << 0) -#define TPS_MASK1 0x03 -#define TPS_MASK2 0x04 -#define TPS_ACKINT1 0x05 -#define TPS_ACKINT2 0x06 -#define TPS_CHGCONFIG 0x07 -# define TPS_CHARGE_POR (1 << 7) /* 65010/65012 */ -# define TPS65013_AUA (1 << 7) /* 65011/65013 */ -# define TPS_CHARGE_RESET (1 << 6) -# define TPS_CHARGE_FAST (1 << 5) -# define TPS_CHARGE_CURRENT (3 << 3) -# define TPS_VBUS_500MA (1 << 2) -# define TPS_VBUS_CHARGING (1 << 1) -# define TPS_CHARGE_ENABLE (1 << 0) -#define TPS_LED1_ON 0x08 -#define TPS_LED1_PER 0x09 -#define TPS_LED2_ON 0x0a -#define TPS_LED2_PER 0x0b -#define TPS_VDCDC1 0x0c -# define TPS_ENABLE_LP (1 << 3) -#define TPS_VDCDC2 0x0d -# define TPS_LP_COREOFF (1 << 7) -# define TPS_VCORE_1_8V (7<<4) -# define TPS_VCORE_1_5V (6 << 4) -# define TPS_VCORE_1_4V (5 << 4) -# define TPS_VCORE_1_3V (4 << 4) -# define TPS_VCORE_1_2V (3 << 4) -# define TPS_VCORE_1_1V (2 << 4) -# define TPS_VCORE_1_0V (1 << 4) -# define TPS_VCORE_0_85V (0 << 4) -# define TPS_VCORE_LP_1_2V (3 << 2) -# define TPS_VCORE_LP_1_1V (2 << 2) -# define TPS_VCORE_LP_1_0V (1 << 2) -# define TPS_VCORE_LP_0_85V (0 << 2) -# define TPS_VIB (1 << 1) -# define TPS_VCORE_DISCH (1 << 0) -#define TPS_VREGS1 0x0e -# define TPS_LDO2_ENABLE (1 << 7) -# define TPS_LDO2_OFF (1 << 6) -# define TPS_VLDO2_3_0V (3 << 4) -# define TPS_VLDO2_2_75V (2 << 4) -# define TPS_VLDO2_2_5V (1 << 4) -# define TPS_VLDO2_1_8V (0 << 4) -# define TPS_LDO1_ENABLE (1 << 3) -# define TPS_LDO1_OFF (1 << 2) -# define TPS_VLDO1_3_0V (3 << 0) -# define TPS_VLDO1_2_75V (2 << 0) -# define TPS_VLDO1_2_5V (1 << 0) -# define TPS_VLDO1_ADJ (0 << 0) -#define TPS_MASK3 0x0f -#define TPS_DEFGPIO 0x10 - -/* - * ---------------------------------------------------------------------------- - * Macros used by exported functions - * ---------------------------------------------------------------------------- - */ - -#define LED1 1 -#define LED2 2 -#define OFF 0 -#define ON 1 -#define BLINK 2 -#define GPIO1 1 -#define GPIO2 2 -#define GPIO3 3 -#define GPIO4 4 -#define LOW 0 -#define HIGH 1 - -/* - * ---------------------------------------------------------------------------- - * Exported functions - * ---------------------------------------------------------------------------- - */ - -/* Draw from VBUS: - * 0 mA -- DON'T DRAW (might supply power instead) - * 100 mA -- usb unit load (slowest charge rate) - * 500 mA -- usb high power (fast battery charge) - */ -extern int tps65010_set_vbus_draw(unsigned mA); - -/* tps65010_set_gpio_out_value parameter: - * gpio: GPIO1, GPIO2, GPIO3 or GPIO4 - * value: LOW or HIGH - */ -extern int tps65010_set_gpio_out_value(unsigned gpio, unsigned value); - -/* tps65010_set_led parameter: 
- * led: LED1 or LED2 - * mode: ON, OFF or BLINK - */ -extern int tps65010_set_led(unsigned led, unsigned mode); - -/* tps65010_set_vib parameter: - * value: ON or OFF - */ -extern int tps65010_set_vib(unsigned value); - -/* tps65010_set_low_pwr parameter: - * mode: ON or OFF - */ -extern int tps65010_set_low_pwr(unsigned mode); - -/* tps65010_config_vregs1 parameter: - * value to be written to VREGS1 register - * Note: The complete register is written, set all bits you need - */ -extern int tps65010_config_vregs1(unsigned value); - -/* tps65013_set_low_pwr parameter: - * mode: ON or OFF - */ -extern int tps65013_set_low_pwr(unsigned mode); - -/* tps65010_set_vdcdc2 - * value to be written to VDCDC2 - */ -extern int tps65010_config_vdcdc2(unsigned value); - -struct i2c_client; - -/** - * struct tps65010_board - packages GPIO and LED lines - * @base: the GPIO number to assign to GPIO-1 - * @outmask: bit (N-1) is set to allow GPIO-N to be used as an - * (open drain) output - * @setup: optional callback issued once the GPIOs are valid - * @teardown: optional callback issued before the GPIOs are invalidated - * @context: optional parameter passed to setup() and teardown() - * - * Board data may be used to package the GPIO (and LED) lines for use - * in by the generic GPIO and LED frameworks. The first four GPIOs - * starting at gpio_base are GPIO1..GPIO4. The next two are LED1/nPG - * and LED2 (with hardware blinking capability, not currently exposed). - * - * The @setup callback may be used with the kind of board-specific glue - * which hands the (now-valid) GPIOs to other drivers, or which puts - * devices in their initial states using these GPIOs. - */ -struct tps65010_board { - int base; - unsigned outmask; - - int (*setup)(struct i2c_client *client, void *context); - int (*teardown)(struct i2c_client *client, void *context); - void *context; -}; - -#endif /* __LINUX_I2C_TPS65010_H */ - diff --git a/include/linux/mfd/tps65010.h b/include/linux/mfd/tps65010.h new file mode 100644 index 000000000000..a1fb9bc5311d --- /dev/null +++ b/include/linux/mfd/tps65010.h @@ -0,0 +1,205 @@ +/* linux/mfd/tps65010.h + * + * Functions to access TPS65010 power management device. + * + * Copyright (C) 2004 Dirk Behme + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN + * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __LINUX_I2C_TPS65010_H +#define __LINUX_I2C_TPS65010_H + +/* + * ---------------------------------------------------------------------------- + * Registers, all 8 bits + * ---------------------------------------------------------------------------- + */ + +#define TPS_CHGSTATUS 0x01 +# define TPS_CHG_USB (1 << 7) +# define TPS_CHG_AC (1 << 6) +# define TPS_CHG_THERM (1 << 5) +# define TPS_CHG_TERM (1 << 4) +# define TPS_CHG_TAPER_TMO (1 << 3) +# define TPS_CHG_CHG_TMO (1 << 2) +# define TPS_CHG_PRECHG_TMO (1 << 1) +# define TPS_CHG_TEMP_ERR (1 << 0) +#define TPS_REGSTATUS 0x02 +# define TPS_REG_ONOFF (1 << 7) +# define TPS_REG_COVER (1 << 6) +# define TPS_REG_UVLO (1 << 5) +# define TPS_REG_NO_CHG (1 << 4) /* tps65013 */ +# define TPS_REG_PG_LD02 (1 << 3) +# define TPS_REG_PG_LD01 (1 << 2) +# define TPS_REG_PG_MAIN (1 << 1) +# define TPS_REG_PG_CORE (1 << 0) +#define TPS_MASK1 0x03 +#define TPS_MASK2 0x04 +#define TPS_ACKINT1 0x05 +#define TPS_ACKINT2 0x06 +#define TPS_CHGCONFIG 0x07 +# define TPS_CHARGE_POR (1 << 7) /* 65010/65012 */ +# define TPS65013_AUA (1 << 7) /* 65011/65013 */ +# define TPS_CHARGE_RESET (1 << 6) +# define TPS_CHARGE_FAST (1 << 5) +# define TPS_CHARGE_CURRENT (3 << 3) +# define TPS_VBUS_500MA (1 << 2) +# define TPS_VBUS_CHARGING (1 << 1) +# define TPS_CHARGE_ENABLE (1 << 0) +#define TPS_LED1_ON 0x08 +#define TPS_LED1_PER 0x09 +#define TPS_LED2_ON 0x0a +#define TPS_LED2_PER 0x0b +#define TPS_VDCDC1 0x0c +# define TPS_ENABLE_LP (1 << 3) +#define TPS_VDCDC2 0x0d +# define TPS_LP_COREOFF (1 << 7) +# define TPS_VCORE_1_8V (7<<4) +# define TPS_VCORE_1_5V (6 << 4) +# define TPS_VCORE_1_4V (5 << 4) +# define TPS_VCORE_1_3V (4 << 4) +# define TPS_VCORE_1_2V (3 << 4) +# define TPS_VCORE_1_1V (2 << 4) +# define TPS_VCORE_1_0V (1 << 4) +# define TPS_VCORE_0_85V (0 << 4) +# define TPS_VCORE_LP_1_2V (3 << 2) +# define TPS_VCORE_LP_1_1V (2 << 2) +# define TPS_VCORE_LP_1_0V (1 << 2) +# define TPS_VCORE_LP_0_85V (0 << 2) +# define TPS_VIB (1 << 1) +# define TPS_VCORE_DISCH (1 << 0) +#define TPS_VREGS1 0x0e +# define TPS_LDO2_ENABLE (1 << 7) +# define TPS_LDO2_OFF (1 << 6) +# define TPS_VLDO2_3_0V (3 << 4) +# define TPS_VLDO2_2_75V (2 << 4) +# define TPS_VLDO2_2_5V (1 << 4) +# define TPS_VLDO2_1_8V (0 << 4) +# define TPS_LDO1_ENABLE (1 << 3) +# define TPS_LDO1_OFF (1 << 2) +# define TPS_VLDO1_3_0V (3 << 0) +# define TPS_VLDO1_2_75V (2 << 0) +# define TPS_VLDO1_2_5V (1 << 0) +# define TPS_VLDO1_ADJ (0 << 0) +#define TPS_MASK3 0x0f +#define TPS_DEFGPIO 0x10 + +/* + * ---------------------------------------------------------------------------- + * Macros used by exported functions + * ---------------------------------------------------------------------------- + */ + +#define LED1 1 +#define LED2 2 +#define OFF 0 +#define ON 1 +#define BLINK 2 +#define GPIO1 1 +#define GPIO2 2 +#define GPIO3 3 +#define GPIO4 4 +#define LOW 0 +#define HIGH 1 + +/* + * ---------------------------------------------------------------------------- + * Exported functions + * ---------------------------------------------------------------------------- + */ + +/* Draw from VBUS: + * 0 mA -- DON'T DRAW (might supply power instead) + * 100 mA -- usb unit load (slowest charge rate) + * 500 mA -- usb high power (fast battery charge) + */ +extern int tps65010_set_vbus_draw(unsigned mA); + +/* tps65010_set_gpio_out_value parameter: + * gpio: GPIO1, GPIO2, GPIO3 or GPIO4 + * value: LOW or HIGH + */ +extern int tps65010_set_gpio_out_value(unsigned gpio, unsigned value); + +/* tps65010_set_led parameter: 
+ * led: LED1 or LED2 + * mode: ON, OFF or BLINK + */ +extern int tps65010_set_led(unsigned led, unsigned mode); + +/* tps65010_set_vib parameter: + * value: ON or OFF + */ +extern int tps65010_set_vib(unsigned value); + +/* tps65010_set_low_pwr parameter: + * mode: ON or OFF + */ +extern int tps65010_set_low_pwr(unsigned mode); + +/* tps65010_config_vregs1 parameter: + * value to be written to VREGS1 register + * Note: The complete register is written, set all bits you need + */ +extern int tps65010_config_vregs1(unsigned value); + +/* tps65013_set_low_pwr parameter: + * mode: ON or OFF + */ +extern int tps65013_set_low_pwr(unsigned mode); + +/* tps65010_config_vdcdc2 parameter: + * value to be written to VDCDC2 + */ +extern int tps65010_config_vdcdc2(unsigned value); + +struct i2c_client; + +/** + * struct tps65010_board - packages GPIO and LED lines + * @base: the GPIO number to assign to GPIO-1 + * @outmask: bit (N-1) is set to allow GPIO-N to be used as an + * (open drain) output + * @setup: optional callback issued once the GPIOs are valid + * @teardown: optional callback issued before the GPIOs are invalidated + * @context: optional parameter passed to setup() and teardown() + * + * Board data may be used to package the GPIO (and LED) lines for use + * by the generic GPIO and LED frameworks. The first four GPIOs + * starting at gpio_base are GPIO1..GPIO4. The next two are LED1/nPG + * and LED2 (with hardware blinking capability, not currently exposed). + * + * The @setup callback may be used with the kind of board-specific glue + * which hands the (now-valid) GPIOs to other drivers, or which puts + * devices in their initial states using these GPIOs. + */ +struct tps65010_board { + int base; + unsigned outmask; + + int (*setup)(struct i2c_client *client, void *context); + int (*teardown)(struct i2c_client *client, void *context); + void *context; +}; + +#endif /* __LINUX_I2C_TPS65010_H */ + -- cgit v1.2.3 From d7b5f5cc5eb438270ba6c86207cc74fb492e1a57 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Tue, 15 Aug 2017 08:38:37 +0800 Subject: pinctrl: qcom: spmi-gpio: Add support for GPIO LV/MV subtype GPIO LV (low voltage)/MV (medium voltage) subtypes have different features and register mappings from the 4CH/8CH subtypes. Add support for the LV and MV subtypes. Signed-off-by: Fenglin Wu Acked-by: Rob Herring Acked-by: Bjorn Andersson Signed-off-by: Linus Walleij --- .../devicetree/bindings/pinctrl/qcom,pmic-gpio.txt | 17 +- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 281 +++++++++++++++++---- include/dt-bindings/pinctrl/qcom,pmic-gpio.h | 2 + 3 files changed, 250 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt index d6f8adbf25c6..e5b8ff7f545b 100644 --- a/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt +++ b/Documentation/devicetree/bindings/pinctrl/qcom,pmic-gpio.txt @@ -100,7 +100,10 @@ to specify in a pin configuration subnode: "dtest1", "dtest2", "dtest3", - "dtest4" + "dtest4", + And the following values are supported by the LV/MV GPIO subtypes: + "func3", + "func4" - bias-disable: Usage: optional @@ -185,6 +188,18 @@ to specify in a pin configuration subnode: Value type: Definition: The specified pins are configured in open-source mode. +- qcom,analog-pass: + Usage: optional + Value type: + Definition: The specified pins are configured in analog-pass-through mode.
+ +- qcom,atest: + Usage: optional + Value type: + Definition: Selects the ATEST rail to route to the GPIO when it is configured + in analog-pass-through mode. + Valid values are 1-4 corresponding to ATEST1 to ATEST4. + Example: pm8921_gpio: gpio@150 { diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 664b641fd776..6b21832d45e6 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -40,6 +40,8 @@ #define PMIC_GPIO_SUBTYPE_GPIOC_4CH 0x5 #define PMIC_GPIO_SUBTYPE_GPIO_8CH 0x9 #define PMIC_GPIO_SUBTYPE_GPIOC_8CH 0xd +#define PMIC_GPIO_SUBTYPE_GPIO_LV 0x10 +#define PMIC_GPIO_SUBTYPE_GPIO_MV 0x11 #define PMIC_MPP_REG_RT_STS 0x10 #define PMIC_MPP_REG_RT_STS_VAL_MASK 0x1 @@ -48,8 +50,10 @@ #define PMIC_GPIO_REG_MODE_CTL 0x40 #define PMIC_GPIO_REG_DIG_VIN_CTL 0x41 #define PMIC_GPIO_REG_DIG_PULL_CTL 0x42 +#define PMIC_GPIO_REG_LV_MV_DIG_OUT_SOURCE_CTL 0x44 #define PMIC_GPIO_REG_DIG_OUT_CTL 0x45 #define PMIC_GPIO_REG_EN_CTL 0x46 +#define PMIC_GPIO_REG_LV_MV_ANA_PASS_THRU_SEL 0x4A /* PMIC_GPIO_REG_MODE_CTL */ #define PMIC_GPIO_REG_MODE_VALUE_SHIFT 0x1 @@ -58,6 +62,12 @@ #define PMIC_GPIO_REG_MODE_DIR_SHIFT 4 #define PMIC_GPIO_REG_MODE_DIR_MASK 0x7 +#define PMIC_GPIO_MODE_DIGITAL_INPUT 0 +#define PMIC_GPIO_MODE_DIGITAL_OUTPUT 1 +#define PMIC_GPIO_MODE_DIGITAL_INPUT_OUTPUT 2 +#define PMIC_GPIO_MODE_ANALOG_PASS_THRU 3 +#define PMIC_GPIO_REG_LV_MV_MODE_DIR_MASK 0x3 + /* PMIC_GPIO_REG_DIG_VIN_CTL */ #define PMIC_GPIO_REG_VIN_SHIFT 0 #define PMIC_GPIO_REG_VIN_MASK 0x7 @@ -69,6 +79,11 @@ #define PMIC_GPIO_PULL_DOWN 4 #define PMIC_GPIO_PULL_DISABLE 5 +/* PMIC_GPIO_REG_LV_MV_DIG_OUT_SOURCE_CTL for LV/MV */ +#define PMIC_GPIO_LV_MV_OUTPUT_INVERT 0x80 +#define PMIC_GPIO_LV_MV_OUTPUT_INVERT_SHIFT 7 +#define PMIC_GPIO_LV_MV_OUTPUT_SOURCE_SEL_MASK 0xF + /* PMIC_GPIO_REG_DIG_OUT_CTL */ #define PMIC_GPIO_REG_OUT_STRENGTH_SHIFT 0 #define PMIC_GPIO_REG_OUT_STRENGTH_MASK 0x3 @@ -88,9 +103,28 @@ #define PMIC_GPIO_PHYSICAL_OFFSET 1 +/* PMIC_GPIO_REG_LV_MV_ANA_PASS_THRU_SEL */ +#define PMIC_GPIO_LV_MV_ANA_MUX_SEL_MASK 0x3 + /* Qualcomm specific pin configurations */ #define PMIC_GPIO_CONF_PULL_UP (PIN_CONFIG_END + 1) #define PMIC_GPIO_CONF_STRENGTH (PIN_CONFIG_END + 2) +#define PMIC_GPIO_CONF_ATEST (PIN_CONFIG_END + 3) +#define PMIC_GPIO_CONF_ANALOG_PASS (PIN_CONFIG_END + 4) + +/* The index of each function in pmic_gpio_functions[] array */ +enum pmic_gpio_func_index { + PMIC_GPIO_FUNC_INDEX_NORMAL, + PMIC_GPIO_FUNC_INDEX_PAIRED, + PMIC_GPIO_FUNC_INDEX_FUNC1, + PMIC_GPIO_FUNC_INDEX_FUNC2, + PMIC_GPIO_FUNC_INDEX_FUNC3, + PMIC_GPIO_FUNC_INDEX_FUNC4, + PMIC_GPIO_FUNC_INDEX_DTEST1, + PMIC_GPIO_FUNC_INDEX_DTEST2, + PMIC_GPIO_FUNC_INDEX_DTEST3, + PMIC_GPIO_FUNC_INDEX_DTEST4, +}; /** * struct pmic_gpio_pad - keep current GPIO settings @@ -102,12 +136,15 @@ * open-drain or open-source mode. * @output_enabled: Set to true if GPIO output logic is enabled. * @input_enabled: Set to true if GPIO input buffer logic is enabled. + * @analog_pass: Set to true if GPIO is in analog-pass-through mode. + * @lv_mv_type: Set to true if GPIO subtype is GPIO_LV(0x10) or GPIO_MV(0x11). * @num_sources: Number of power-sources supported by this GPIO. * @power_source: Current power-source used. * @buffer_type: Push-pull, open-drain or open-source. * @pullup: Constant current which flows through GPIO output buffer.
* @strength: No, Low, Medium, High * @function: See pmic_gpio_functions[] + * @atest: the ATEST selection for GPIO analog-pass-through mode */ struct pmic_gpio_pad { u16 base; @@ -117,12 +154,15 @@ struct pmic_gpio_pad { bool have_buffer; bool output_enabled; bool input_enabled; + bool analog_pass; + bool lv_mv_type; unsigned int num_sources; unsigned int power_source; unsigned int buffer_type; unsigned int pullup; unsigned int strength; unsigned int function; + unsigned int atest; }; struct pmic_gpio_state { @@ -135,12 +175,16 @@ struct pmic_gpio_state { static const struct pinconf_generic_params pmic_gpio_bindings[] = { {"qcom,pull-up-strength", PMIC_GPIO_CONF_PULL_UP, 0}, {"qcom,drive-strength", PMIC_GPIO_CONF_STRENGTH, 0}, + {"qcom,atest", PMIC_GPIO_CONF_ATEST, 0}, + {"qcom,analog-pass", PMIC_GPIO_CONF_ANALOG_PASS, 0}, }; #ifdef CONFIG_DEBUG_FS static const struct pin_config_item pmic_conf_items[ARRAY_SIZE(pmic_gpio_bindings)] = { PCONFDUMP(PMIC_GPIO_CONF_PULL_UP, "pull up strength", NULL, true), PCONFDUMP(PMIC_GPIO_CONF_STRENGTH, "drive-strength", NULL, true), + PCONFDUMP(PMIC_GPIO_CONF_ATEST, "atest", NULL, true), + PCONFDUMP(PMIC_GPIO_CONF_ANALOG_PASS, "analog-pass", NULL, true), }; #endif @@ -153,10 +197,16 @@ static const char *const pmic_gpio_groups[] = { }; static const char *const pmic_gpio_functions[] = { - PMIC_GPIO_FUNC_NORMAL, PMIC_GPIO_FUNC_PAIRED, - PMIC_GPIO_FUNC_FUNC1, PMIC_GPIO_FUNC_FUNC2, - PMIC_GPIO_FUNC_DTEST1, PMIC_GPIO_FUNC_DTEST2, - PMIC_GPIO_FUNC_DTEST3, PMIC_GPIO_FUNC_DTEST4, + [PMIC_GPIO_FUNC_INDEX_NORMAL] = PMIC_GPIO_FUNC_NORMAL, + [PMIC_GPIO_FUNC_INDEX_PAIRED] = PMIC_GPIO_FUNC_PAIRED, + [PMIC_GPIO_FUNC_INDEX_FUNC1] = PMIC_GPIO_FUNC_FUNC1, + [PMIC_GPIO_FUNC_INDEX_FUNC2] = PMIC_GPIO_FUNC_FUNC2, + [PMIC_GPIO_FUNC_INDEX_FUNC3] = PMIC_GPIO_FUNC_FUNC3, + [PMIC_GPIO_FUNC_INDEX_FUNC4] = PMIC_GPIO_FUNC_FUNC4, + [PMIC_GPIO_FUNC_INDEX_DTEST1] = PMIC_GPIO_FUNC_DTEST1, + [PMIC_GPIO_FUNC_INDEX_DTEST2] = PMIC_GPIO_FUNC_DTEST2, + [PMIC_GPIO_FUNC_INDEX_DTEST3] = PMIC_GPIO_FUNC_DTEST3, + [PMIC_GPIO_FUNC_INDEX_DTEST4] = PMIC_GPIO_FUNC_DTEST4, }; static int pmic_gpio_read(struct pmic_gpio_state *state, @@ -244,25 +294,67 @@ static int pmic_gpio_set_mux(struct pinctrl_dev *pctldev, unsigned function, unsigned int val; int ret; + if (function > PMIC_GPIO_FUNC_INDEX_DTEST4) { + pr_err("function: %d is not defined\n", function); + return -EINVAL; + } + pad = pctldev->desc->pins[pin].drv_data; + /* + * Non-LV/MV subtypes only support 2 special functions, + * offsetting the dtestx function values by 2 + */ + if (!pad->lv_mv_type) { + if (function == PMIC_GPIO_FUNC_INDEX_FUNC3 || + function == PMIC_GPIO_FUNC_INDEX_FUNC4) { + pr_err("Non-LV/MV subtype doesn't have func3/func4\n"); + return -EINVAL; + } + if (function >= PMIC_GPIO_FUNC_INDEX_DTEST1) + function -= (PMIC_GPIO_FUNC_INDEX_DTEST1 - + PMIC_GPIO_FUNC_INDEX_FUNC3); + } pad->function = function; - val = 0; - if (pad->output_enabled) { - if (pad->input_enabled) - val = 2; - else - val = 1; - } + if (pad->analog_pass) + val = PMIC_GPIO_MODE_ANALOG_PASS_THRU; + else if (pad->output_enabled && pad->input_enabled) + val = PMIC_GPIO_MODE_DIGITAL_INPUT_OUTPUT; + else if (pad->output_enabled) + val = PMIC_GPIO_MODE_DIGITAL_OUTPUT; + else + val = PMIC_GPIO_MODE_DIGITAL_INPUT; - val = val << PMIC_GPIO_REG_MODE_DIR_SHIFT; - val |= pad->function << PMIC_GPIO_REG_MODE_FUNCTION_SHIFT; - val |= pad->out_value & PMIC_GPIO_REG_MODE_VALUE_SHIFT; + if (pad->lv_mv_type) { + ret = pmic_gpio_write(state, pad, + PMIC_GPIO_REG_MODE_CTL, val); + if (ret < 
0) + return ret; - ret = pmic_gpio_write(state, pad, PMIC_GPIO_REG_MODE_CTL, val); - if (ret < 0) - return ret; + val = pad->atest - 1; + ret = pmic_gpio_write(state, pad, + PMIC_GPIO_REG_LV_MV_ANA_PASS_THRU_SEL, val); + if (ret < 0) + return ret; + + val = pad->out_value + << PMIC_GPIO_LV_MV_OUTPUT_INVERT_SHIFT; + val |= pad->function + & PMIC_GPIO_LV_MV_OUTPUT_SOURCE_SEL_MASK; + ret = pmic_gpio_write(state, pad, + PMIC_GPIO_REG_LV_MV_DIG_OUT_SOURCE_CTL, val); + if (ret < 0) + return ret; + } else { + val = val << PMIC_GPIO_REG_MODE_DIR_SHIFT; + val |= pad->function << PMIC_GPIO_REG_MODE_FUNCTION_SHIFT; + val |= pad->out_value & PMIC_GPIO_REG_MODE_VALUE_SHIFT; + + ret = pmic_gpio_write(state, pad, PMIC_GPIO_REG_MODE_CTL, val); + if (ret < 0) + return ret; + } val = pad->is_enabled << PMIC_GPIO_REG_MASTER_EN_SHIFT; @@ -322,6 +414,12 @@ static int pmic_gpio_config_get(struct pinctrl_dev *pctldev, case PMIC_GPIO_CONF_STRENGTH: arg = pad->strength; break; + case PMIC_GPIO_CONF_ATEST: + arg = pad->atest; + break; + case PMIC_GPIO_CONF_ANALOG_PASS: + arg = pad->analog_pass; + break; default: return -EINVAL; } @@ -396,6 +494,16 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, return -EINVAL; pad->strength = arg; break; + case PMIC_GPIO_CONF_ATEST: + if (!pad->lv_mv_type || arg > 4) + return -EINVAL; + pad->atest = arg; + break; + case PMIC_GPIO_CONF_ANALOG_PASS: + if (!pad->lv_mv_type) + return -EINVAL; + pad->analog_pass = true; + break; default: return -EINVAL; } @@ -420,19 +528,46 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, if (ret < 0) return ret; - val = 0; - if (pad->output_enabled) { - if (pad->input_enabled) - val = 2; - else - val = 1; - } + if (pad->analog_pass) + val = PMIC_GPIO_MODE_ANALOG_PASS_THRU; + else if (pad->output_enabled && pad->input_enabled) + val = PMIC_GPIO_MODE_DIGITAL_INPUT_OUTPUT; + else if (pad->output_enabled) + val = PMIC_GPIO_MODE_DIGITAL_OUTPUT; + else + val = PMIC_GPIO_MODE_DIGITAL_INPUT; - val = val << PMIC_GPIO_REG_MODE_DIR_SHIFT; - val |= pad->function << PMIC_GPIO_REG_MODE_FUNCTION_SHIFT; - val |= pad->out_value & PMIC_GPIO_REG_MODE_VALUE_SHIFT; + if (pad->lv_mv_type) { + ret = pmic_gpio_write(state, pad, + PMIC_GPIO_REG_MODE_CTL, val); + if (ret < 0) + return ret; + + val = pad->atest - 1; + ret = pmic_gpio_write(state, pad, + PMIC_GPIO_REG_LV_MV_ANA_PASS_THRU_SEL, val); + if (ret < 0) + return ret; + + val = pad->out_value + << PMIC_GPIO_LV_MV_OUTPUT_INVERT_SHIFT; + val |= pad->function + & PMIC_GPIO_LV_MV_OUTPUT_SOURCE_SEL_MASK; + ret = pmic_gpio_write(state, pad, + PMIC_GPIO_REG_LV_MV_DIG_OUT_SOURCE_CTL, val); + if (ret < 0) + return ret; + } else { + val = val << PMIC_GPIO_REG_MODE_DIR_SHIFT; + val |= pad->function << PMIC_GPIO_REG_MODE_FUNCTION_SHIFT; + val |= pad->out_value & PMIC_GPIO_REG_MODE_VALUE_SHIFT; - return pmic_gpio_write(state, pad, PMIC_GPIO_REG_MODE_CTL, val); + ret = pmic_gpio_write(state, pad, PMIC_GPIO_REG_MODE_CTL, val); + if (ret < 0) + return ret; + } + + return ret; } static void pmic_gpio_config_dbg_show(struct pinctrl_dev *pctldev, @@ -440,7 +575,7 @@ static void pmic_gpio_config_dbg_show(struct pinctrl_dev *pctldev, { struct pmic_gpio_state *state = pinctrl_dev_get_drvdata(pctldev); struct pmic_gpio_pad *pad; - int ret, val; + int ret, val, function; static const char *const biases[] = { "pull-up 30uA", "pull-up 1.5uA", "pull-up 31.5uA", @@ -462,7 +597,6 @@ static void pmic_gpio_config_dbg_show(struct pinctrl_dev *pctldev, if (val < 0 || !(val >> 
PMIC_GPIO_REG_MASTER_EN_SHIFT)) { seq_puts(s, " ---"); } else { - if (pad->input_enabled) { ret = pmic_gpio_read(state, pad, PMIC_MPP_REG_RT_STS); if (ret < 0) @@ -471,14 +605,28 @@ static void pmic_gpio_config_dbg_show(struct pinctrl_dev *pctldev, ret &= PMIC_MPP_REG_RT_STS_VAL_MASK; pad->out_value = ret; } - - seq_printf(s, " %-4s", pad->output_enabled ? "out" : "in"); - seq_printf(s, " %-7s", pmic_gpio_functions[pad->function]); + /* + * For the non-LV/MV subtypes only 2 special functions are + * available, offsetting the dtest function values by 2. + */ + function = pad->function; + if (!pad->lv_mv_type && + pad->function >= PMIC_GPIO_FUNC_INDEX_FUNC3) + function += PMIC_GPIO_FUNC_INDEX_DTEST1 - + PMIC_GPIO_FUNC_INDEX_FUNC3; + + if (pad->analog_pass) + seq_puts(s, " analog-pass"); + else + seq_printf(s, " %-4s", + pad->output_enabled ? "out" : "in"); + seq_printf(s, " %-7s", pmic_gpio_functions[function]); seq_printf(s, " vin-%d", pad->power_source); seq_printf(s, " %-27s", biases[pad->pullup]); seq_printf(s, " %-10s", buffer_types[pad->buffer_type]); seq_printf(s, " %-4s", pad->out_value ? "high" : "low"); seq_printf(s, " %-7s", strengths[pad->strength]); + seq_printf(s, " atest-%d", pad->atest); } } @@ -618,40 +766,71 @@ static int pmic_gpio_populate(struct pmic_gpio_state *state, case PMIC_GPIO_SUBTYPE_GPIOC_8CH: pad->num_sources = 8; break; + case PMIC_GPIO_SUBTYPE_GPIO_LV: + pad->num_sources = 1; + pad->have_buffer = true; + pad->lv_mv_type = true; + break; + case PMIC_GPIO_SUBTYPE_GPIO_MV: + pad->num_sources = 2; + pad->have_buffer = true; + pad->lv_mv_type = true; + break; default: dev_err(state->dev, "unknown GPIO type 0x%x\n", subtype); return -ENODEV; } - val = pmic_gpio_read(state, pad, PMIC_GPIO_REG_MODE_CTL); - if (val < 0) - return val; + if (pad->lv_mv_type) { + val = pmic_gpio_read(state, pad, + PMIC_GPIO_REG_LV_MV_DIG_OUT_SOURCE_CTL); + if (val < 0) + return val; + + pad->out_value = !!(val & PMIC_GPIO_LV_MV_OUTPUT_INVERT); + pad->function = val & PMIC_GPIO_LV_MV_OUTPUT_SOURCE_SEL_MASK; + + val = pmic_gpio_read(state, pad, PMIC_GPIO_REG_MODE_CTL); + if (val < 0) + return val; + + dir = val & PMIC_GPIO_REG_LV_MV_MODE_DIR_MASK; + } else { + val = pmic_gpio_read(state, pad, PMIC_GPIO_REG_MODE_CTL); + if (val < 0) + return val; - pad->out_value = val & PMIC_GPIO_REG_MODE_VALUE_SHIFT; + pad->out_value = val & PMIC_GPIO_REG_MODE_VALUE_SHIFT; + + dir = val >> PMIC_GPIO_REG_MODE_DIR_SHIFT; + dir &= PMIC_GPIO_REG_MODE_DIR_MASK; + pad->function = val >> PMIC_GPIO_REG_MODE_FUNCTION_SHIFT; + pad->function &= PMIC_GPIO_REG_MODE_FUNCTION_MASK; + } - dir = val >> PMIC_GPIO_REG_MODE_DIR_SHIFT; - dir &= PMIC_GPIO_REG_MODE_DIR_MASK; switch (dir) { - case 0: + case PMIC_GPIO_MODE_DIGITAL_INPUT: pad->input_enabled = true; pad->output_enabled = false; break; - case 1: + case PMIC_GPIO_MODE_DIGITAL_OUTPUT: pad->input_enabled = false; pad->output_enabled = true; break; - case 2: + case PMIC_GPIO_MODE_DIGITAL_INPUT_OUTPUT: pad->input_enabled = true; pad->output_enabled = true; break; + case PMIC_GPIO_MODE_ANALOG_PASS_THRU: + if (!pad->lv_mv_type) + return -ENODEV; + pad->analog_pass = true; + break; default: dev_err(state->dev, "unknown GPIO direction\n"); return -ENODEV; } - pad->function = val >> PMIC_GPIO_REG_MODE_FUNCTION_SHIFT; - pad->function &= PMIC_GPIO_REG_MODE_FUNCTION_MASK; - val = pmic_gpio_read(state, pad, PMIC_GPIO_REG_DIG_VIN_CTL); if (val < 0) return val; @@ -666,16 +845,20 @@ static int pmic_gpio_populate(struct pmic_gpio_state *state, pad->pullup = val >> 
PMIC_GPIO_REG_PULL_SHIFT; pad->pullup &= PMIC_GPIO_REG_PULL_MASK; - val = pmic_gpio_read(state, pad, PMIC_GPIO_REG_DIG_OUT_CTL); - if (val < 0) - return val; - pad->strength = val >> PMIC_GPIO_REG_OUT_STRENGTH_SHIFT; pad->strength &= PMIC_GPIO_REG_OUT_STRENGTH_MASK; pad->buffer_type = val >> PMIC_GPIO_REG_OUT_TYPE_SHIFT; pad->buffer_type &= PMIC_GPIO_REG_OUT_TYPE_MASK; + if (pad->lv_mv_type) { + val = pmic_gpio_read(state, pad, + PMIC_GPIO_REG_LV_MV_ANA_PASS_THRU_SEL); + if (val < 0) + return val; + pad->atest = (val & PMIC_GPIO_LV_MV_ANA_MUX_SEL_MASK) + 1; + } + /* Pin could be disabled with PIN_CONFIG_BIAS_HIGH_IMPEDANCE */ pad->is_enabled = true; return 0; diff --git a/include/dt-bindings/pinctrl/qcom,pmic-gpio.h b/include/dt-bindings/pinctrl/qcom,pmic-gpio.h index d33f17c8a515..b8ff8824e21b 100644 --- a/include/dt-bindings/pinctrl/qcom,pmic-gpio.h +++ b/include/dt-bindings/pinctrl/qcom,pmic-gpio.h @@ -98,6 +98,8 @@ #define PMIC_GPIO_FUNC_PAIRED "paired" #define PMIC_GPIO_FUNC_FUNC1 "func1" #define PMIC_GPIO_FUNC_FUNC2 "func2" +#define PMIC_GPIO_FUNC_FUNC3 "func3" +#define PMIC_GPIO_FUNC_FUNC4 "func4" #define PMIC_GPIO_FUNC_DTEST1 "dtest1" #define PMIC_GPIO_FUNC_DTEST2 "dtest2" #define PMIC_GPIO_FUNC_DTEST3 "dtest3" -- cgit v1.2.3 From b3c4ec71ec413c2e5bfb028bdf1737af07f1fde0 Mon Sep 17 00:00:00 2001 From: Abdulhadi Mohamed Date: Wed, 19 Jul 2017 16:31:10 +0100 Subject: usb: gadget: f_hid: {GET,SET} PROTOCOL Support The current f_hid driver doesn't handle GET_PROTOCOL and SET_PROTOCOL requests, which are required to operate HID gadgets in BOOT mode. This patch implements this feature for devices that have the same implementation for REPORT and BOOT mode so that these devices are recognized by older BIOSes. Signed-off-by: Abdulhadi Mohamed Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_hid.c | 17 ++++++++++++++++- include/linux/hid.h | 6 ++++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index 5eea44823ca0..d8e359ef6eb1 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -44,6 +44,7 @@ struct f_hidg { /* configuration */ unsigned char bInterfaceSubClass; unsigned char bInterfaceProtocol; + unsigned char protocol; unsigned short report_desc_length; char *report_desc; unsigned short report_length; @@ -527,7 +528,9 @@ static int hidg_setup(struct usb_function *f, case ((USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE) << 8 | HID_REQ_GET_PROTOCOL): VDBG(cdev, "get_protocol\n"); - goto stall; + length = min_t(unsigned int, length, 1); + ((u8 *) req->buf)[0] = hidg->protocol; + goto respond; break; case ((USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE) << 8 @@ -539,6 +542,17 @@ static int hidg_setup(struct usb_function *f, case ((USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE) << 8 | HID_REQ_SET_PROTOCOL): VDBG(cdev, "set_protocol\n"); + if (value > HID_REPORT_PROTOCOL) + goto stall; + length = 0; + /* + * We assume that programs implementing the Boot protocol + * are also compatible with the Report Protocol + */ + if (hidg->bInterfaceSubClass == USB_INTERFACE_SUBCLASS_BOOT) { + hidg->protocol = value; + goto respond; + } goto stall; break; @@ -768,6 +782,7 @@ static int hidg_bind(struct usb_configuration *c, struct usb_function *f) /* set descriptor dynamic values */ hidg_interface_desc.bInterfaceSubClass = hidg->bInterfaceSubClass; hidg_interface_desc.bInterfaceProtocol = hidg->bInterfaceProtocol; + 
hidg->protocol = HID_REPORT_PROTOCOL; hidg_ss_in_ep_desc.wMaxPacketSize = cpu_to_le16(hidg->report_length); hidg_ss_in_comp_desc.wBytesPerInterval = cpu_to_le16(hidg->report_length); diff --git a/include/linux/hid.h b/include/linux/hid.h index 5006f9b5d837..6519cdc4c7d3 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -362,6 +362,12 @@ struct hid_item { #define HID_GROUP_WACOM 0x0101 #define HID_GROUP_LOGITECH_DJ_DEVICE 0x0102 +/* + * HID protocol status + */ +#define HID_REPORT_PROTOCOL 1 +#define HID_BOOT_PROTOCOL 0 + /* * This is the global environment of the parser. This information is * persistent for main-items. The global environment can be saved and -- cgit v1.2.3 From 31fe084ffaaf8abece14f8ca28e5e3b4e2bf97b6 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Tue, 1 Aug 2017 02:00:56 -0700 Subject: usb: gadget: core: unmap request from DMA only if previously mapped In the SG case this is already handled since a non-zero request->num_mapped_sgs is a clear indicator that dma_map_sg() had been called. While it would be nice to do the same for the singly mapped case by simply checking for non-zero request->dma, it's conceivable that 0 is a valid dma_addr_t handle. Hence add a flag 'dma_mapped' to struct usb_request and use this to determine the need to call dma_unmap_single(). Otherwise, if a request is not DMA mapped then the result of calling usb_gadget_unmap_request() would safely be a no-op. Signed-off-by: Jack Pham Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc/core.c | 5 ++++- include/linux/usb/gadget.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index e6f04eee95c4..c1cef6a11ecb 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -812,6 +812,8 @@ int usb_gadget_map_request_by_dev(struct device *dev, dev_err(dev, "failed to map buffer\n"); return -EFAULT; } + + req->dma_mapped = 1; } return 0; @@ -836,9 +838,10 @@ void usb_gadget_unmap_request_by_dev(struct device *dev, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->num_mapped_sgs = 0; - } else { + } else if (req->dma_mapped) { dma_unmap_single(dev, req->dma, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + req->dma_mapped = 0; } } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 1a4a4bacfae6..21468a722c4a 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -48,6 +48,7 @@ struct usb_ep; * by adding a zero length packet as needed; * @short_not_ok: When reading data, makes short packets be * treated as errors (queue stops advancing till cleanup). + * @dma_mapped: Indicates if request has been mapped to DMA (internal) * @complete: Function called when request completes, so this request and * its buffer may be re-used. The function will always be called with * interrupts disabled, and it must not sleep. 
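/*
 * Illustrative aside, not part of the patch: why a flag instead of testing
 * req->dma.  On some platforms 0 can be a valid dma_addr_t, so a non-zero
 * check cannot distinguish "never mapped" from "mapped at address 0":
 *
 *	if (req->dma)			// unsafe: a valid mapping at 0 is skipped
 *		dma_unmap_single(...);
 *
 *	if (req->dma_mapped) {		// explicit bookkeeping, as done above
 *		dma_unmap_single(dev, req->dma, req->length,
 *				 is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 *		req->dma_mapped = 0;
 *	}
 */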
@@ -103,6 +104,7 @@ struct usb_request { unsigned no_interrupt:1; unsigned zero:1; unsigned short_not_ok:1; + unsigned dma_mapped:1; void (*complete)(struct usb_ep *ep, struct usb_request *req); -- cgit v1.2.3 From a551e27368dea202cbef3e8861c21d965427cfe6 Mon Sep 17 00:00:00 2001 From: Chenglin Xu Date: Tue, 15 Aug 2017 17:09:15 +0800 Subject: regulator: mt6380: Add support for MT6380 The MT6380 is a regulator found on boards with the MediaTek MT7622 SoC. It is connected as a slave to the SoC through the MediaTek PMIC wrapper, which is the common interface connecting the various MediaTek-made PMICs. Signed-off-by: Chenglin Xu Signed-off-by: Sean Wang Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 9 + drivers/regulator/Makefile | 1 + drivers/regulator/mt6380-regulator.c | 352 +++++++++++++++++++++++++++++ include/linux/regulator/mt6380-regulator.h | 32 +++ 4 files changed, 394 insertions(+) create mode 100644 drivers/regulator/mt6380-regulator.c create mode 100644 include/linux/regulator/mt6380-regulator.h (limited to 'include') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 99b9362331b5..1205e82971d0 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -559,6 +559,15 @@ config REGULATOR_MT6323 This driver supports the control of different power rails of device through regulator interface. +config REGULATOR_MT6380 + tristate "MediaTek MT6380 PMIC" + depends on MTK_PMIC_WRAP + help + Say y here to select this option to enable the power regulator of + MediaTek MT6380 PMIC. + This driver supports the control of different power rails of device + through regulator interface. + config REGULATOR_MT6397 tristate "MediaTek MT6397 PMIC" depends on MFD_MT6397 diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 95b1e86ae692..81514b8c3932 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o obj-$(CONFIG_REGULATOR_MC13XXX_CORE) += mc13xxx-regulator-core.o obj-$(CONFIG_REGULATOR_MT6311) += mt6311-regulator.o obj-$(CONFIG_REGULATOR_MT6323) += mt6323-regulator.o +obj-$(CONFIG_REGULATOR_MT6380) += mt6380-regulator.o obj-$(CONFIG_REGULATOR_MT6397) += mt6397-regulator.o obj-$(CONFIG_REGULATOR_QCOM_RPM) += qcom_rpm-regulator.o obj-$(CONFIG_REGULATOR_QCOM_SMD_RPM) += qcom_smd-regulator.o diff --git a/drivers/regulator/mt6380-regulator.c b/drivers/regulator/mt6380-regulator.c new file mode 100644 index 000000000000..127dd720cbcc --- /dev/null +++ b/drivers/regulator/mt6380-regulator.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2017 MediaTek Inc. + * Author: Chenglin Xu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* PMIC Registers */ +#define MT6380_ALDO_CON_0 0x0000 +#define MT6380_BTLDO_CON_0 0x0004 +#define MT6380_COMP_CON_0 0x0008 +#define MT6380_CPUBUCK_CON_0 0x000C +#define MT6380_CPUBUCK_CON_1 0x0010 +#define MT6380_CPUBUCK_CON_2 0x0014 +#define MT6380_DDRLDO_CON_0 0x0018 +#define MT6380_MLDO_CON_0 0x001C +#define MT6380_PALDO_CON_0 0x0020 +#define MT6380_PHYLDO_CON_0 0x0024 +#define MT6380_SIDO_CON_0 0x0028 +#define MT6380_SIDO_CON_1 0x002C +#define MT6380_SIDO_CON_2 0x0030 +#define MT6380_SLDO_CON_0 0x0034 +#define MT6380_TLDO_CON_0 0x0038 +#define MT6380_STARTUP_CON_0 0x003C +#define MT6380_STARTUP_CON_1 0x0040 +#define MT6380_SMPS_TOP_CON_0 0x0044 +#define MT6380_SMPS_TOP_CON_1 0x0048 +#define MT6380_ANA_CTRL_0 0x0050 +#define MT6380_ANA_CTRL_1 0x0054 +#define MT6380_ANA_CTRL_2 0x0058 +#define MT6380_ANA_CTRL_3 0x005C +#define MT6380_ANA_CTRL_4 0x0060 +#define MT6380_SPK_CON9 0x0064 +#define MT6380_SPK_CON11 0x0068 +#define MT6380_SPK_CON12 0x006A +#define MT6380_CLK_CTRL 0x0070 +#define MT6380_PINMUX_CTRL 0x0074 +#define MT6380_IO_CTRL 0x0078 +#define MT6380_SLP_MODE_CTRL_0 0x007C +#define MT6380_SLP_MODE_CTRL_1 0x0080 +#define MT6380_SLP_MODE_CTRL_2 0x0084 +#define MT6380_SLP_MODE_CTRL_3 0x0088 +#define MT6380_SLP_MODE_CTRL_4 0x008C +#define MT6380_SLP_MODE_CTRL_5 0x0090 +#define MT6380_SLP_MODE_CTRL_6 0x0094 +#define MT6380_SLP_MODE_CTRL_7 0x0098 +#define MT6380_SLP_MODE_CTRL_8 0x009C +#define MT6380_FCAL_CTRL_0 0x00A0 +#define MT6380_FCAL_CTRL_1 0x00A4 +#define MT6380_LDO_CTRL_0 0x00A8 +#define MT6380_LDO_CTRL_1 0x00AC +#define MT6380_LDO_CTRL_2 0x00B0 +#define MT6380_LDO_CTRL_3 0x00B4 +#define MT6380_LDO_CTRL_4 0x00B8 +#define MT6380_DEBUG_CTRL_0 0x00BC +#define MT6380_EFU_CTRL_0 0x0200 +#define MT6380_EFU_CTRL_1 0x0201 +#define MT6380_EFU_CTRL_2 0x0202 +#define MT6380_EFU_CTRL_3 0x0203 +#define MT6380_EFU_CTRL_4 0x0204 +#define MT6380_EFU_CTRL_5 0x0205 +#define MT6380_EFU_CTRL_6 0x0206 +#define MT6380_EFU_CTRL_7 0x0207 +#define MT6380_EFU_CTRL_8 0x0208 + +#define MT6380_REGULATOR_MODE_AUTO 0 +#define MT6380_REGULATOR_MODE_FORCE_PWM 1 + +/* + * mt6380 regulators' information + * + * @desc: standard fields of regulator description + * @vselon_reg: Register sections for hardware control mode of bucks + * @modeset_reg: Register for controlling the buck/LDO control mode + * @modeset_mask: Mask for controlling the buck/LDO control mode + */ +struct mt6380_regulator_info { + struct regulator_desc desc; + u32 vselon_reg; + u32 modeset_reg; + u32 modeset_mask; +}; + +#define MT6380_BUCK(match, vreg, min, max, step, volt_ranges, enreg, \ + vosel, vosel_mask, enbit, voselon, _modeset_reg, \ + _modeset_mask) \ +[MT6380_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6380_volt_range_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6380_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = ((max) - (min)) / (step) + 1, \ + .linear_ranges = volt_ranges, \ + .n_linear_ranges = ARRAY_SIZE(volt_ranges), \ + .vsel_reg = vosel, \ + .vsel_mask = vosel_mask, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + }, \ + .vselon_reg = voselon, \ + .modeset_reg = _modeset_reg, \ + .modeset_mask = _modeset_mask, \ +} + +#define MT6380_LDO(match, vreg, ldo_volt_table, enreg, enbit, vosel, \ + vosel_mask, _modeset_reg, _modeset_mask) \ +[MT6380_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6380_volt_table_ops, \ + 
.type = REGULATOR_VOLTAGE, \ + .id = MT6380_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = ARRAY_SIZE(ldo_volt_table), \ + .volt_table = ldo_volt_table, \ + .vsel_reg = vosel, \ + .vsel_mask = vosel_mask, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + }, \ + .modeset_reg = _modeset_reg, \ + .modeset_mask = _modeset_mask, \ +} + +#define MT6380_REG_FIXED(match, vreg, enreg, enbit, volt, \ + _modeset_reg, _modeset_mask) \ +[MT6380_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6380_volt_fixed_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6380_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = 1, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + .min_uV = volt, \ + }, \ + .modeset_reg = _modeset_reg, \ + .modeset_mask = _modeset_mask, \ +} + +static const struct regulator_linear_range buck_volt_range1[] = { + REGULATOR_LINEAR_RANGE(600000, 0, 0xfe, 6250), +}; + +static const struct regulator_linear_range buck_volt_range2[] = { + REGULATOR_LINEAR_RANGE(600000, 0, 0xfe, 6250), +}; + +static const struct regulator_linear_range buck_volt_range3[] = { + REGULATOR_LINEAR_RANGE(1200000, 0, 0x3c, 25000), +}; + +static const u32 ldo_volt_table1[] = { + 1400000, 1350000, 1300000, 1250000, 1200000, 1150000, 1100000, 1050000, +}; + +static const u32 ldo_volt_table2[] = { + 2200000, 3300000, +}; + +static const u32 ldo_volt_table3[] = { + 1240000, 1390000, 1540000, 1840000, +}; + +static const u32 ldo_volt_table4[] = { + 2200000, 3300000, +}; + +static int mt6380_regulator_set_mode(struct regulator_dev *rdev, + unsigned int mode) +{ + int ret, val = 0; + struct mt6380_regulator_info *info = rdev_get_drvdata(rdev); + + switch (mode) { + case REGULATOR_MODE_NORMAL: + val = MT6380_REGULATOR_MODE_AUTO; + break; + case REGULATOR_MODE_FAST: + val = MT6380_REGULATOR_MODE_FORCE_PWM; + break; + default: + return -EINVAL; + } + + val <<= ffs(info->modeset_mask) - 1; + + ret = regmap_update_bits(rdev->regmap, info->modeset_reg, + info->modeset_mask, val); + + return ret; +} + +static unsigned int mt6380_regulator_get_mode(struct regulator_dev *rdev) +{ + unsigned int val; + unsigned int mode; + int ret; + struct mt6380_regulator_info *info = rdev_get_drvdata(rdev); + + ret = regmap_read(rdev->regmap, info->modeset_reg, &val); + if (ret < 0) + return ret; + + val &= info->modeset_mask; + val >>= ffs(info->modeset_mask) - 1; + + switch (val) { + case MT6380_REGULATOR_MODE_AUTO: + mode = REGULATOR_MODE_NORMAL; + break; + case MT6380_REGULATOR_MODE_FORCE_PWM: + mode = REGULATOR_MODE_FAST; + break; + default: + return -EINVAL; + } + + return mode; +} + +static const struct regulator_ops mt6380_volt_range_ops = { + .list_voltage = regulator_list_voltage_linear_range, + .map_voltage = regulator_map_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6380_regulator_set_mode, + .get_mode = mt6380_regulator_get_mode, +}; + +static const struct regulator_ops mt6380_volt_table_ops = { + .list_voltage = regulator_list_voltage_table, + .map_voltage = regulator_map_voltage_iterate, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = 
regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6380_regulator_set_mode, + .get_mode = mt6380_regulator_get_mode, +}; + +static const struct regulator_ops mt6380_volt_fixed_ops = { + .list_voltage = regulator_list_voltage_linear, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .set_mode = mt6380_regulator_set_mode, + .get_mode = mt6380_regulator_get_mode, +}; + +/* The array is indexed by id(MT6380_ID_XXX) */ +static struct mt6380_regulator_info mt6380_regulators[] = { + MT6380_BUCK("buck-vcore1", VCPU, 600000, 1393750, 6250, + buck_volt_range1, MT6380_ANA_CTRL_3, MT6380_ANA_CTRL_1, + 0xfe, 3, MT6380_ANA_CTRL_1, + MT6380_CPUBUCK_CON_0, 0x8000000), + MT6380_BUCK("buck-vcore", VCORE, 600000, 1393750, 6250, + buck_volt_range2, MT6380_ANA_CTRL_3, MT6380_ANA_CTRL_2, + 0xfe, 2, MT6380_ANA_CTRL_2, MT6380_SIDO_CON_0, 0x1000000), + MT6380_BUCK("buck-vrf", VRF, 1200000, 1575000, 25000, + buck_volt_range3, MT6380_ANA_CTRL_3, MT6380_SIDO_CON_0, + 0x78, 1, MT6380_SIDO_CON_0, MT6380_SIDO_CON_0, 0x8000), + MT6380_LDO("ldo-vm", VMLDO, ldo_volt_table1, MT6380_LDO_CTRL_0, + 1, MT6380_MLDO_CON_0, 0xE000, MT6380_ANA_CTRL_1, 0x4000000), + MT6380_LDO("ldo-va", VALDO, ldo_volt_table2, MT6380_LDO_CTRL_0, + 2, MT6380_ALDO_CON_0, 0x400, MT6380_ALDO_CON_0, 0x20), + MT6380_REG_FIXED("ldo-vphy", VPHYLDO, MT6380_LDO_CTRL_0, 7, 1800000, + MT6380_PHYLDO_CON_0, 0x80), + MT6380_LDO("ldo-vddr", VDDRLDO, ldo_volt_table3, MT6380_LDO_CTRL_0, + 8, MT6380_DDRLDO_CON_0, 0x3000, MT6380_DDRLDO_CON_0, 0x80), + MT6380_LDO("ldo-vt", VTLDO, ldo_volt_table4, MT6380_LDO_CTRL_0, 3, + MT6380_TLDO_CON_0, 0x400, MT6380_TLDO_CON_0, 0x20), +}; + +static int mt6380_regulator_probe(struct platform_device *pdev) +{ + struct regmap *regmap = dev_get_regmap(pdev->dev.parent, NULL); + struct regulator_config config = {}; + struct regulator_dev *rdev; + int i; + + for (i = 0; i < MT6380_MAX_REGULATOR; i++) { + config.dev = &pdev->dev; + config.driver_data = &mt6380_regulators[i]; + config.regmap = regmap; + rdev = devm_regulator_register(&pdev->dev, + &mt6380_regulators[i].desc, + &config); + if (IS_ERR(rdev)) { + dev_err(&pdev->dev, "failed to register %s\n", + mt6380_regulators[i].desc.name); + return PTR_ERR(rdev); + } + } + return 0; +} + +static const struct platform_device_id mt6380_platform_ids[] = { + {"mt6380-regulator", 0}, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(platform, mt6380_platform_ids); + +static const struct of_device_id mt6380_of_match[] = { + { .compatible = "mediatek,mt6380-regulator", }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, mt6380_of_match); + +static struct platform_driver mt6380_regulator_driver = { + .driver = { + .name = "mt6380-regulator", + .of_match_table = of_match_ptr(mt6380_of_match), + }, + .probe = mt6380_regulator_probe, + .id_table = mt6380_platform_ids, +}; + +module_platform_driver(mt6380_regulator_driver); + +MODULE_AUTHOR("Chenglin Xu "); +MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6380 PMIC"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/regulator/mt6380-regulator.h b/include/linux/regulator/mt6380-regulator.h new file mode 100644 index 000000000000..465182da6315 --- /dev/null +++ b/include/linux/regulator/mt6380-regulator.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 MediaTek Inc. 
+ * Author: Chenglin Xu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_REGULATOR_mt6380_H +#define __LINUX_REGULATOR_mt6380_H + +enum { + MT6380_ID_VCPU = 0, + MT6380_ID_VCORE, + MT6380_ID_VRF, + MT6380_ID_VMLDO, + MT6380_ID_VALDO, + MT6380_ID_VPHYLDO, + MT6380_ID_VDDRLDO, + MT6380_ID_VTLDO, + MT6380_ID_RG_MAX, +}; + +#define MT6380_MAX_REGULATOR MT6380_ID_RG_MAX + +#endif /* __LINUX_REGULATOR_mt6380_H */ -- cgit v1.2.3 From 2926a2aa5c14fb2add75e6584845b1c03022235f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Aug 2017 17:19:26 +0200 Subject: iommu: Fix wrong freeing of iommu_device->dev The struct iommu_device has a 'struct device' embedded into it, not as a pointer, but the whole struct. In the conversion of the iommu drivers to use struct iommu_device it was forgotten that the release function for that struct device simply calls kfree() on the pointer. This frees memory that was never allocated and causes memory corruption. To fix this issue, use a pointer to struct device instead of embedding the whole struct. This needs some updates in the iommu sysfs code as well as the Intel VT-d and AMD IOMMU drivers. Reported-by: Sebastian Ott Fixes: 39ab9555c241 ('iommu: Add sysfs bindings for struct iommu_device') Cc: stable@vger.kernel.org # >= v4.11 Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_types.h | 4 +++- drivers/iommu/intel-iommu.c | 4 +++- drivers/iommu/iommu-sysfs.c | 32 ++++++++++++++++++++------------ include/linux/iommu.h | 12 +++++++++++- 4 files changed, 37 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 294a409e283b..d6b873b57054 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -574,7 +574,9 @@ struct amd_iommu { static inline struct amd_iommu *dev_to_amd_iommu(struct device *dev) { - return container_of(dev, struct amd_iommu, iommu.dev); + struct iommu_device *iommu = dev_to_iommu_device(dev); + + return container_of(iommu, struct amd_iommu, iommu); } #define ACPIHID_UID_LEN 256 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 687f18f65cea..3e8636f1220e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4736,7 +4736,9 @@ static void intel_disable_iommus(void) static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) { - return container_of(dev, struct intel_iommu, iommu.dev); + struct iommu_device *iommu_dev = dev_to_iommu_device(dev); + + return container_of(iommu_dev, struct intel_iommu, iommu); } static ssize_t intel_iommu_show_version(struct device *dev, diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c index c58351ed61c1..36d1a7ce7fc4 100644 --- a/drivers/iommu/iommu-sysfs.c +++ b/drivers/iommu/iommu-sysfs.c @@ -62,32 +62,40 @@ int iommu_device_sysfs_add(struct iommu_device *iommu, va_list vargs; int ret; - device_initialize(&iommu->dev); + iommu->dev = kzalloc(sizeof(*iommu->dev), GFP_KERNEL); + if (!iommu->dev) + return -ENOMEM; - iommu->dev.class = &iommu_class; - iommu->dev.parent = parent; - iommu->dev.groups = 
groups; + device_initialize(iommu->dev); + + iommu->dev->class = &iommu_class; + iommu->dev->parent = parent; + iommu->dev->groups = groups; va_start(vargs, fmt); - ret = kobject_set_name_vargs(&iommu->dev.kobj, fmt, vargs); + ret = kobject_set_name_vargs(&iommu->dev->kobj, fmt, vargs); va_end(vargs); if (ret) goto error; - ret = device_add(&iommu->dev); + ret = device_add(iommu->dev); if (ret) goto error; + dev_set_drvdata(iommu->dev, iommu); + return 0; error: - put_device(&iommu->dev); + put_device(iommu->dev); return ret; } void iommu_device_sysfs_remove(struct iommu_device *iommu) { - device_unregister(&iommu->dev); + dev_set_drvdata(iommu->dev, NULL); + device_unregister(iommu->dev); + iommu->dev = NULL; } /* * IOMMU drivers can indicate a device is managed by a given IOMMU using @@ -102,14 +110,14 @@ int iommu_device_link(struct iommu_device *iommu, struct device *link) if (!iommu || IS_ERR(iommu)) return -ENODEV; - ret = sysfs_add_link_to_group(&iommu->dev.kobj, "devices", + ret = sysfs_add_link_to_group(&iommu->dev->kobj, "devices", &link->kobj, dev_name(link)); if (ret) return ret; - ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev.kobj, "iommu"); + ret = sysfs_create_link_nowarn(&link->kobj, &iommu->dev->kobj, "iommu"); if (ret) - sysfs_remove_link_from_group(&iommu->dev.kobj, "devices", + sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link)); return ret; @@ -121,5 +129,5 @@ void iommu_device_unlink(struct iommu_device *iommu, struct device *link) return; sysfs_remove_link(&link->kobj, "iommu"); - sysfs_remove_link_from_group(&iommu->dev.kobj, "devices", dev_name(link)); + sysfs_remove_link_from_group(&iommu->dev->kobj, "devices", dev_name(link)); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2cb54adc4a33..176f7569d874 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -240,7 +240,7 @@ struct iommu_device { struct list_head list; const struct iommu_ops *ops; struct fwnode_handle *fwnode; - struct device dev; + struct device *dev; }; int iommu_device_register(struct iommu_device *iommu); @@ -265,6 +265,11 @@ static inline void iommu_device_set_fwnode(struct iommu_device *iommu, iommu->fwnode = fwnode; } +static inline struct iommu_device *dev_to_iommu_device(struct device *dev) +{ + return (struct iommu_device *)dev_get_drvdata(dev); +} + #define IOMMU_GROUP_NOTIFY_ADD_DEVICE 1 /* Device added */ #define IOMMU_GROUP_NOTIFY_DEL_DEVICE 2 /* Pre Device removed */ #define IOMMU_GROUP_NOTIFY_BIND_DRIVER 3 /* Pre Driver bind */ @@ -589,6 +594,11 @@ static inline void iommu_device_set_fwnode(struct iommu_device *iommu, { } +static inline struct iommu_device *dev_to_iommu_device(struct device *dev) +{ + return NULL; +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } -- cgit v1.2.3 From 129f6c4820dd68be26b516ba6d6e471e63855d47 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 21 Jul 2017 22:26:25 +0200 Subject: mtd: only use __xipram annotation when XIP_KERNEL is set When XIP_KERNEL is enabled, some functions are defined in the .data ELF section because we require them to be in RAM whenever we communicate with the flash chip. 
However this causes problems when FTRACE is enabled and gcc emits calls to __gnu_mcount_nc in the function prolog: drivers/built-in.o: In function `cfi_chip_setup': :(.data+0x272fc): relocation truncated to fit: R_ARM_CALL against symbol `__gnu_mcount_nc' defined in .text section in arch/arm/kernel/built-in.o drivers/built-in.o: In function `cfi_probe_chip': :(.data+0x27de8): relocation truncated to fit: R_ARM_CALL against symbol `__gnu_mcount_nc' defined in .text section in arch/arm/kernel/built-in.o /tmp/ccY172rP.s: Assembler messages: /tmp/ccY172rP.s:70: Warning: ignoring changed section attributes for .data /tmp/ccY172rP.s: Error: 1 warning, treating warnings as errors make[5]: *** [drivers/mtd/chips/cfi_probe.o] Error 1 /tmp/ccK4rjeO.s: Assembler messages: /tmp/ccK4rjeO.s:421: Warning: ignoring changed section attributes for .data /tmp/ccK4rjeO.s: Error: 1 warning, treating warnings as errors make[5]: *** [drivers/mtd/chips/cfi_util.o] Error 1 /tmp/ccUvhCYR.s: Assembler messages: /tmp/ccUvhCYR.s:1895: Warning: ignoring changed section attributes for .data /tmp/ccUvhCYR.s: Error: 1 warning, treating warnings as errors Specifically, this does not work because the .data section is not marked executable, which leads LD to not generate trampolines for long calls. This moves the __xipram functions into their own .xiptext section instead. The section is still placed next to .data and located in RAM but is marked executable, which avoids the build errors. Also, we only need to place the XIP functions into a separate section if both CONFIG_XIP_KERNEL and CONFIG_MTD_XIP are set: When only MTD_XIP is used, the whole kernel is still in RAM and we do not need to worry about pulling the rug out from under it. When only XIP_KERNEL but not MTD_XIP is set, the kernel is in some form of ROM, but we never write to it. Note that MTD_XIP has been broken on ARM since around 2011 or 2012. I have sent another patch[2] to fix compilation, which I plan to merge through arm-soc unless there are objections. The obvious alternative to that would be to completely rip out the MTD_XIP support from the kernel, since obviously nobody has been using it in a long while. Link: [1] https://patchwork.kernel.org/patch/8109771/ Link: [2] https://patchwork.kernel.org/patch/9855225/ Signed-off-by: Arnd Bergmann Signed-off-by: Boris Brezillon --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/mtd/xip.h | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index da0be9a8d1de..ff507c178327 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -203,6 +203,7 @@ * pull in .data..stuff which has its own requirements. Same for bss. */ #define DATA_DATA \ + *(.xiptext) \ *(.data .data.[0-9a-zA-Z_]*) \ *(.ref.data) \ *(.data..shared_aligned) /* percpu related */ \ diff --git a/include/linux/mtd/xip.h b/include/linux/mtd/xip.h index abed4dec5c2f..e373690cce0a 100644 --- a/include/linux/mtd/xip.h +++ b/include/linux/mtd/xip.h @@ -30,7 +30,9 @@ * obviously not be running from flash. The __xipram is therefore marking * those functions so they get relocated to ram. */ -#define __xipram noinline __attribute__ ((__section__ (".data"))) +#ifdef CONFIG_XIP_KERNEL +#define __xipram noinline __attribute__ ((__section__ (".xiptext"))) +#endif /* * Each architecture has to provide the following macros.
They must access @@ -90,10 +92,10 @@ #define xip_cpu_idle() do { } while (0) #endif -#else +#endif /* CONFIG_MTD_XIP */ +#ifndef __xipram #define __xipram - -#endif /* CONFIG_MTD_XIP */ +#endif #endif /* __LINUX_MTD_XIP_H__ */ -- cgit v1.2.3 From 44dd8a989c787e9077745417140aa132bfe45bf5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 15 Aug 2017 19:07:53 +0800 Subject: include: uapi: usb: Introduce USB charger type and state definition Introducing the USB charger type and state definitions helps to support USB charging, which will be added in the USB phy core. Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi --- include/uapi/linux/usb/charger.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/uapi/linux/usb/charger.h (limited to 'include') diff --git a/include/uapi/linux/usb/charger.h b/include/uapi/linux/usb/charger.h new file mode 100644 index 000000000000..5f72af35b3ed --- /dev/null +++ b/include/uapi/linux/usb/charger.h @@ -0,0 +1,31 @@ +/* + * This file defines the USB charger type and state that are needed for + * USB device APIs. + */ + +#ifndef _UAPI__LINUX_USB_CHARGER_H +#define _UAPI__LINUX_USB_CHARGER_H + +/* + * USB charger type: + * SDP (Standard Downstream Port) + * DCP (Dedicated Charging Port) + * CDP (Charging Downstream Port) + * ACA (Accessory Charger Adapters) + */ +enum usb_charger_type { + UNKNOWN_TYPE, + SDP_TYPE, + DCP_TYPE, + CDP_TYPE, + ACA_TYPE, +}; + +/* USB charger state */ +enum usb_charger_state { + USB_CHARGER_DEFAULT, + USB_CHARGER_PRESENT, + USB_CHARGER_ABSENT, +}; + +#endif /* _UAPI__LINUX_USB_CHARGER_H */ -- cgit v1.2.3 From a9081a008f84819ab2f3da596bf89afa16beea94 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 15 Aug 2017 19:07:54 +0800 Subject: usb: phy: Add USB charger support This patch introduces usb charger support based on the usb phy, as an enhancement for power drivers. The basic concept of the usb charger is that, when a usb charger is added or removed (reported through an extcon device state change), the usb charger framework notifies power users so they can set the current limitation. Power users can register a notifier on the usb phy by issuing usb_register_notifier() to get notified of charger status changes or charger current changes. We notify power users what current can be drawn according to the charger type, and there are now two methods to get the charger type. One is to get the charger type from the extcon subsystem, which also signals a charger state change. The other is to get the charger type from USB controller or PMIC detection, in which case charger state changes should be reported by issuing usb_phy_set_charger_state(). Signed-off-by: Baolin Wang Signed-off-by: Felipe Balbi --- drivers/usb/phy/phy.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/phy.h | 49 +++++++++ 2 files changed, 321 insertions(+) (limited to 'include') diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c index 032f5afaad4b..2dc48bb2553c 100644 --- a/drivers/usb/phy/phy.c +++ b/drivers/usb/phy/phy.c @@ -18,6 +18,18 @@ #include +/* Default current range by charger type.
*/ +#define DEFAULT_SDP_CUR_MIN 2 +#define DEFAULT_SDP_CUR_MAX 500 +#define DEFAULT_SDP_CUR_MIN_SS 150 +#define DEFAULT_SDP_CUR_MAX_SS 900 +#define DEFAULT_DCP_CUR_MIN 500 +#define DEFAULT_DCP_CUR_MAX 5000 +#define DEFAULT_CDP_CUR_MIN 1500 +#define DEFAULT_CDP_CUR_MAX 5000 +#define DEFAULT_ACA_CUR_MIN 1500 +#define DEFAULT_ACA_CUR_MAX 5000 + static LIST_HEAD(phy_list); static LIST_HEAD(phy_bind_list); static DEFINE_SPINLOCK(phy_lock); @@ -77,6 +89,221 @@ static struct usb_phy *__of_usb_find_phy(struct device_node *node) return ERR_PTR(-EPROBE_DEFER); } +static void usb_phy_set_default_current(struct usb_phy *usb_phy) +{ + usb_phy->chg_cur.sdp_min = DEFAULT_SDP_CUR_MIN; + usb_phy->chg_cur.sdp_max = DEFAULT_SDP_CUR_MAX; + usb_phy->chg_cur.dcp_min = DEFAULT_DCP_CUR_MIN; + usb_phy->chg_cur.dcp_max = DEFAULT_DCP_CUR_MAX; + usb_phy->chg_cur.cdp_min = DEFAULT_CDP_CUR_MIN; + usb_phy->chg_cur.cdp_max = DEFAULT_CDP_CUR_MAX; + usb_phy->chg_cur.aca_min = DEFAULT_ACA_CUR_MIN; + usb_phy->chg_cur.aca_max = DEFAULT_ACA_CUR_MAX; +} + +/** + * usb_phy_notify_charger_work - notify the USB charger state + * @work - the charger work to notify the USB charger state + * + * This work can be issued when the USB charger state or the USB charger + * current has changed; we then notify power users of the current that + * can be drawn and report the charger state to userspace. + * + * If we get the charger type from the extcon subsystem, the charger state + * is reported to power users automatically by usb_phy_get_charger_type(), + * which is issued from the extcon subsystem. + * + * If we get the charger type from ->charger_detect() instead of the extcon + * subsystem, the usb phy driver should issue usb_phy_set_charger_state() + * whenever the charger state changes.
+ */ +static void usb_phy_notify_charger_work(struct work_struct *work) +{ + struct usb_phy *usb_phy = container_of(work, struct usb_phy, chg_work); + char uchger_state[50] = { 0 }; + char *envp[] = { uchger_state, NULL }; + unsigned int min, max; + + switch (usb_phy->chg_state) { + case USB_CHARGER_PRESENT: + usb_phy_get_charger_current(usb_phy, &min, &max); + + atomic_notifier_call_chain(&usb_phy->notifier, max, usb_phy); + snprintf(uchger_state, ARRAY_SIZE(uchger_state), + "USB_CHARGER_STATE=%s", "USB_CHARGER_PRESENT"); + break; + case USB_CHARGER_ABSENT: + usb_phy_set_default_current(usb_phy); + + atomic_notifier_call_chain(&usb_phy->notifier, 0, usb_phy); + snprintf(uchger_state, ARRAY_SIZE(uchger_state), + "USB_CHARGER_STATE=%s", "USB_CHARGER_ABSENT"); + break; + default: + dev_warn(usb_phy->dev, "Unknown USB charger state: %d\n", + usb_phy->chg_state); + return; + } + + kobject_uevent_env(&usb_phy->dev->kobj, KOBJ_CHANGE, envp); +} + +static void __usb_phy_get_charger_type(struct usb_phy *usb_phy) +{ + if (extcon_get_state(usb_phy->edev, EXTCON_CHG_USB_SDP) > 0) { + usb_phy->chg_type = SDP_TYPE; + usb_phy->chg_state = USB_CHARGER_PRESENT; + } else if (extcon_get_state(usb_phy->edev, EXTCON_CHG_USB_CDP) > 0) { + usb_phy->chg_type = CDP_TYPE; + usb_phy->chg_state = USB_CHARGER_PRESENT; + } else if (extcon_get_state(usb_phy->edev, EXTCON_CHG_USB_DCP) > 0) { + usb_phy->chg_type = DCP_TYPE; + usb_phy->chg_state = USB_CHARGER_PRESENT; + } else if (extcon_get_state(usb_phy->edev, EXTCON_CHG_USB_ACA) > 0) { + usb_phy->chg_type = ACA_TYPE; + usb_phy->chg_state = USB_CHARGER_PRESENT; + } else { + usb_phy->chg_type = UNKNOWN_TYPE; + usb_phy->chg_state = USB_CHARGER_ABSENT; + } + + schedule_work(&usb_phy->chg_work); +} + +/** + * usb_phy_get_charger_type - get charger type from extcon subsystem + * @nb - the notifier block to determine charger type + * @state - the cable state + * @data - private data + * + * Determine the charger type from the extcon subsystem, which also means + * the charger state has changed; we should then notify power users of + * this event. + */ +static int usb_phy_get_charger_type(struct notifier_block *nb, + unsigned long state, void *data) +{ + struct usb_phy *usb_phy = container_of(nb, struct usb_phy, type_nb); + + __usb_phy_get_charger_type(usb_phy); + return NOTIFY_OK; +} + +/** + * usb_phy_set_charger_current - set the USB charger current + * @usb_phy - the USB phy to be used + * @mA - the current to be set + * + * Usually we only change the default charger current after USB has finished + * enumeration as an SDP charger. For an SDP charger, usb_phy_set_power() + * will issue this function to change the charger current after setting the + * USB configuration, or on USB suspend/resume. For other charger types, the + * default charger current should be used, and issuing this function to + * change the charger current is not suggested. + * + * When the USB charger current has been changed, we need to notify the + * power users. + */ +void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) +{ + switch (usb_phy->chg_type) { + case SDP_TYPE: + if (usb_phy->chg_cur.sdp_max == mA) + return; + + usb_phy->chg_cur.sdp_max = (mA > DEFAULT_SDP_CUR_MAX_SS) ? + DEFAULT_SDP_CUR_MAX_SS : mA; + break; + case DCP_TYPE: + if (usb_phy->chg_cur.dcp_max == mA) + return; + + usb_phy->chg_cur.dcp_max = (mA > DEFAULT_DCP_CUR_MAX) ? + DEFAULT_DCP_CUR_MAX : mA; + break; + case CDP_TYPE: + if (usb_phy->chg_cur.cdp_max == mA) + return; + + usb_phy->chg_cur.cdp_max = (mA > DEFAULT_CDP_CUR_MAX) ?
+ DEFAULT_CDP_CUR_MAX : mA; + break; + case ACA_TYPE: + if (usb_phy->chg_cur.aca_max == mA) + return; + + usb_phy->chg_cur.aca_max = (mA > DEFAULT_ACA_CUR_MAX) ? + DEFAULT_ACA_CUR_MAX : mA; + break; + default: + return; + } + + schedule_work(&usb_phy->chg_work); +} +EXPORT_SYMBOL_GPL(usb_phy_set_charger_current); + +/** + * usb_phy_get_charger_current - get the USB charger current + * @usb_phy - the USB phy to be used + * @min - the minimum current + * @max - the maximum current + * + * Usually we notify power users of the maximum current, but in some special + * cases power users also need the minimum current value. Power users can + * then issue this function to get the suitable currents. + */ +void usb_phy_get_charger_current(struct usb_phy *usb_phy, + unsigned int *min, unsigned int *max) +{ + switch (usb_phy->chg_type) { + case SDP_TYPE: + *min = usb_phy->chg_cur.sdp_min; + *max = usb_phy->chg_cur.sdp_max; + break; + case DCP_TYPE: + *min = usb_phy->chg_cur.dcp_min; + *max = usb_phy->chg_cur.dcp_max; + break; + case CDP_TYPE: + *min = usb_phy->chg_cur.cdp_min; + *max = usb_phy->chg_cur.cdp_max; + break; + case ACA_TYPE: + *min = usb_phy->chg_cur.aca_min; + *max = usb_phy->chg_cur.aca_max; + break; + default: + *min = 0; + *max = 0; + break; + } +} +EXPORT_SYMBOL_GPL(usb_phy_get_charger_current); + +/** + * usb_phy_set_charger_state - set the USB charger state + * @usb_phy - the USB phy to be used + * @state - the new state to be set for the charger + * + * The usb phy driver can issue this function when it detects that the + * charger state has changed; in this case the charger type should be + * obtained from ->charger_detect(). + */ +void usb_phy_set_charger_state(struct usb_phy *usb_phy, + enum usb_charger_state state) +{ + if (usb_phy->chg_state == state || !usb_phy->charger_detect) + return; + + usb_phy->chg_state = state; + if (usb_phy->chg_state == USB_CHARGER_PRESENT) + usb_phy->chg_type = usb_phy->charger_detect(usb_phy); + else + usb_phy->chg_type = UNKNOWN_TYPE; + + schedule_work(&usb_phy->chg_work); +} +EXPORT_SYMBOL_GPL(usb_phy_set_charger_state); + static void devm_usb_phy_release(struct device *dev, void *res) { struct usb_phy *phy = *(struct usb_phy **)res; @@ -124,6 +351,44 @@ static int usb_add_extcon(struct usb_phy *x) "register VBUS notifier failed\n"); return ret; } + } else { + x->type_nb.notifier_call = usb_phy_get_charger_type; + + ret = devm_extcon_register_notifier(x->dev, x->edev, + EXTCON_CHG_USB_SDP, + &x->type_nb); + if (ret) { + dev_err(x->dev, + "register extcon USB SDP failed.\n"); + return ret; + } + + ret = devm_extcon_register_notifier(x->dev, x->edev, + EXTCON_CHG_USB_CDP, + &x->type_nb); + if (ret) { + dev_err(x->dev, + "register extcon USB CDP failed.\n"); + return ret; + } + + ret = devm_extcon_register_notifier(x->dev, x->edev, + EXTCON_CHG_USB_DCP, + &x->type_nb); + if (ret) { + dev_err(x->dev, + "register extcon USB DCP failed.\n"); + return ret; + } + + ret = devm_extcon_register_notifier(x->dev, x->edev, + EXTCON_CHG_USB_ACA, + &x->type_nb); + if (ret) { + dev_err(x->dev, + "register extcon USB ACA failed.\n"); + return ret; + } } if (x->id_nb.notifier_call) { @@ -145,6 +410,13 @@ static int usb_add_extcon(struct usb_phy *x) } } + usb_phy_set_default_current(x); + INIT_WORK(&x->chg_work, usb_phy_notify_charger_work); + x->chg_type = UNKNOWN_TYPE; + x->chg_state = USB_CHARGER_DEFAULT; + if (x->type_nb.notifier_call) + __usb_phy_get_charger_type(x); + return 0; } diff --git a/include/linux/usb/phy.h
b/include/linux/usb/phy.h index 299245105610..de881b171ba9 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -12,6 +12,7 @@ #include #include #include +#include enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, @@ -72,6 +73,17 @@ struct usb_phy_io_ops { int (*write)(struct usb_phy *x, u32 val, u32 reg); }; +struct usb_charger_current { + unsigned int sdp_min; + unsigned int sdp_max; + unsigned int dcp_min; + unsigned int dcp_max; + unsigned int cdp_min; + unsigned int cdp_max; + unsigned int aca_min; + unsigned int aca_max; +}; + struct usb_phy { struct device *dev; const char *label; @@ -91,6 +103,13 @@ struct usb_phy { struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; + struct notifier_block type_nb; + + /* Support USB charger */ + enum usb_charger_type chg_type; + enum usb_charger_state chg_state; + struct usb_charger_current chg_cur; + struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; @@ -129,6 +148,12 @@ struct usb_phy { enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); + + /* + * Charger detection method can be implemented if you need to + * manually detect the charger type. + */ + enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /** @@ -219,6 +244,12 @@ extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern int usb_bind_phy(const char *dev_name, u8 index, const char *phy_dev_name); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); +extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, + unsigned int mA); +extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, + unsigned int *min, unsigned int *max); +extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, + enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { @@ -270,11 +301,29 @@ static inline int usb_bind_phy(const char *dev_name, u8 index, static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } + +static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, + unsigned int mA) +{ +} + +static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, + unsigned int *min, + unsigned int *max) +{ +} + +static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, + enum usb_charger_state state) +{ +} #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { + usb_phy_set_charger_current(x, mA); + if (x && x->set_power) return x->set_power(x, mA); return 0; -- cgit v1.2.3 From e24a1307ba1f99fc62a0bd61d5e87fcfb6d5503d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 28 Jul 2017 10:31:25 +0530 Subject: mm/hugetlb: Allow arch to override and call the weak function When running in guest mode ppc64 supports a different mechanism for hugetlb allocation/reservation. The LPAR management application called HMC can be used to reserve a set of hugepages and we pass the details of reserved pages via device tree to the guest (more details in htab_dt_scan_hugepage_blocks()). We do the memblock_reserve of the range and later in the boot sequence, we add the reserved range to huge_boot_pages. But to enable 16G hugetlb on baremetal config (when we are not running as guest) we want to do memblock reservation during boot.
Generic code already does this. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0ed8e41aaf11..8bbbd37ab105 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -358,6 +358,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); /* arch callback */ +int __init __alloc_bootmem_huge_page(struct hstate *h); int __init alloc_bootmem_huge_page(struct hstate *h); void __init hugetlb_bad_size(void); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc48ee783dd9..b97e6494d74d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, return page; } -int __weak alloc_bootmem_huge_page(struct hstate *h) +int alloc_bootmem_huge_page(struct hstate *h) + __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); +int __alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes, node; -- cgit v1.2.3 From dee83046e73cb7ebbbae955c1ef0f4f55a0f44f9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 10:51:02 -0400 Subject: NFS: Remove unused function nfs_page_group_lock_wait() Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 21 --------------------- include/linux/nfs_page.h | 1 - 2 files changed, 22 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a6f2bbd709ba..ced7974622dd 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -165,27 +165,6 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) return -EAGAIN; } -/* - * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it - * @req - a request in the group - * - * This is a blocking call to wait for the group lock to be cleared. - */ -void -nfs_page_group_lock_wait(struct nfs_page *req) -{ - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - if (!test_bit(PG_HEADLOCK, &head->wb_flags)) - return; - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - wait_on_bit(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); -} - /* * nfs_page_group_unlock - unlock the head of the page group * @req - request in group that is to be unlocked diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index d67b67ae6c8b..de1d24cedaa2 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -140,7 +140,6 @@ extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern int nfs_page_group_lock(struct nfs_page *, bool); -extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); -- cgit v1.2.3 From 1344b7ea172b4911a8ee8a6ff26c5bc6b5abb302 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 17 Jul 2017 10:54:14 -0400 Subject: NFS: Remove unused parameter from nfs_page_group_lock() nfs_page_group_lock() is now always called with the 'nonblock' parameter set to 'false'.
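For illustration, a minimal sketch (not part of the patch itself) of the blocking lock pattern that remains once the nonblock path is gone; the flag and field names are those declared in include/linux/nfs_page.h, while the demo_ prefix is hypothetical:

static int demo_page_group_lock(struct nfs_page *req)
{
	struct nfs_page *head = req->wb_head;

	WARN_ON_ONCE(head != head->wb_head);

	/* Fast path: take the bit lock if it is currently free. */
	if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
		return 0;

	/* Record contention so the unlock path knows to wake waiters. */
	set_bit(PG_CONTENDED1, &head->wb_flags);
	smp_mb__after_atomic();
	return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
				TASK_UNINTERRUPTIBLE);
}

With only one behaviour left, the 'nonblock' flag carries no information, so dropping it simplifies every call site.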
Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 31 +++++++++++-------------------- fs/nfs/write.c | 6 +++--- include/linux/nfs_page.h | 2 +- 3 files changed, 15 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ced7974622dd..af6731dd4324 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked - * @nonblock - if true don't block waiting for lock * - * this lock must be held if modifying the page group list + * this lock must be held when traversing or modifying the page + * group list * - * return 0 on success, < 0 on error: -EDELAY if nonblocking or the - * result from wait_on_bit_lock - * - * NOTE: calling with nonblock=false should always have set the - * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock - * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req, bool nonblock) +nfs_page_group_lock(struct nfs_page *req) { struct nfs_page *head = req->wb_head; @@ -155,14 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) return 0; - if (!nonblock) { - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + set_bit(PG_CONTENDED1, &head->wb_flags); + smp_mb__after_atomic(); + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); - } - - return -EAGAIN; } /* @@ -225,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -1016,7 +1007,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, unsigned int bytes_left = 0; unsigned int offset, pgbase; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); subreq = req; bytes_left = subreq->wb_bytes; @@ -1038,7 +1029,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); continue; } @@ -1135,7 +1126,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); /* find the last request */ for (lastreq = req->wb_head; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 20d44ea328b6..0f418d825185 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -271,7 +271,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) unsigned int pos = 0; unsigned int len = nfs_page_length(req->wb_page); - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); do { tmp = nfs_page_group_search_locked(req->wb_head, pos); @@ -480,7 +480,7 @@ try_again: } spin_unlock(&inode->i_lock); - ret = nfs_page_group_lock(head, false); + ret = nfs_page_group_lock(head); if (ret < 0) { nfs_unlock_and_release_request(head); return ERR_PTR(ret); @@ -501,7 +501,7 @@ try_again: nfs_page_group_unlock(head); ret = nfs_wait_on_request(subreq); if (!ret) - ret = nfs_page_group_lock(head, false); + ret = nfs_page_group_lock(head); if (ret < 0) { nfs_unroll_locks(inode, 
head, subreq); nfs_release_request(subreq); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index de1d24cedaa2..2f4fdafb6746 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -139,7 +139,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); -extern int nfs_page_group_lock(struct nfs_page *, bool); +extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); -- cgit v1.2.3 From e824f99adaaf1ed0e03eac8574599af6d992163d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 11:53:49 -0400 Subject: NFSv4: Use a mutex to protect the per-inode commit lists The commit lists can get very large, so using the inode->i_lock can end up affecting general metadata performance. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++-- fs/nfs/inode.c | 1 + fs/nfs/pnfs_nfs.c | 15 +++++++-------- fs/nfs/write.c | 24 ++++++++++++------------ include/linux/nfs_fs.h | 1 + 5 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6fb9fad2d1e6..d2972d537469 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -616,13 +616,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 109279d6d91b..34d9ebbc0dfd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2016,6 +2016,7 @@ static void init_once(void *foo) nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); + mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 25f28fa64c57..2cdee8ce2094 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -98,14 +98,13 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } @@ -119,7 +118,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +141,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(&cinfo->inode->i_lock); + 
lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -162,7 +161,7 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, int nwritten; int i; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { nwritten = pnfs_generic_transfer_commit_list(&b->written, @@ -953,12 +952,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -975,7 +974,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 8d8fa6d4cfcc..5ab5ca24b48a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -195,7 +195,7 @@ nfs_page_find_swap_request(struct page *page) struct nfs_page *req = NULL; if (!PageSwapCache(page)) return NULL; - spin_lock(&inode->i_lock); + mutex_lock(&nfsi->commit_mutex); if (PageSwapCache(page)) { req = nfs_page_search_commits_for_head_request_locked(nfsi, page); @@ -204,7 +204,7 @@ nfs_page_find_swap_request(struct page *page) kref_get(&req->wb_kref); } } - spin_unlock(&inode->i_lock); + mutex_unlock(&nfsi->commit_mutex); return req; } @@ -856,7 +856,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. + * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the + * nfs_page lock. 
*/ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -884,9 +885,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); if (req->wb_page) nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -964,11 +965,11 @@ nfs_clear_request_commit(struct nfs_page *req) struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); - spin_lock(&inode->i_lock); + mutex_lock(&NFS_I(inode)->commit_mutex); if (!pnfs_clear_request_commit(req, &cinfo)) { nfs_request_remove_commit_list(req, &cinfo); } - spin_unlock(&inode->i_lock); + mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_clear_page_commit(req->wb_page); } } @@ -1027,7 +1028,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo) return cinfo->mds->ncommit; } -/* cinfo->inode->i_lock held by caller */ +/* NFS_I(cinfo->inode)->commit_mutex held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -1039,13 +1040,12 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } @@ -1065,7 +1065,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; @@ -1073,7 +1073,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); return ret; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5cc91d6381a3..121a702888b4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -163,6 +163,7 @@ struct nfs_inode { /* Readers: in-flight sillydelete RPC calls */ /* Writers: rmdir */ struct rw_semaphore rmdir_sem; + struct mutex commit_mutex; #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; -- cgit v1.2.3 From a6b6d5b85abf4914bbceade5dddd54c345c64136 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 15:39:46 -0400 Subject: NFS: Use an atomic_long_t to count the number of requests Rather than forcing us to take the inode->i_lock just in order to bump the number. 
Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/delegation.c | 2 +- fs/nfs/inode.c | 7 +++---- fs/nfs/pagelist.c | 4 +--- fs/nfs/write.c | 18 +++++------------- include/linux/nfs_fs.h | 4 ++-- 6 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 5427cdf04c5a..14358de173fb 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -51,7 +51,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; - if (nfsi->nrequests != 0) + if (nfs_have_writebacks(inode)) res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d7df5e67b0c1..606dd3871f66 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1089,7 +1089,7 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode) delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || !(delegation->type & FMODE_WRITE)) goto out; - if (nfsi->nrequests < delegation->pagemod_limit) + if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit) ret = false; out: rcu_read_unlock(); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 34d9ebbc0dfd..0480eb02299a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1285,7 +1285,6 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); unsigned long ret = 0; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) @@ -1315,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->nrequests == 0) { + && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); ret |= NFS_INO_INVALID_ATTR; } @@ -1823,7 +1822,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? 
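A minimal sketch of the conversion pattern this commit applies, under the assumption that nothing else is serialized by the same lock (the demo_ names are hypothetical):

#include <linux/atomic.h>
#include <linux/types.h>

struct demo_inode {
	atomic_long_t nrequests;	/* was: unsigned long guarded by i_lock */
};

static void demo_add_request(struct demo_inode *di)
{
	/* Before: spin_lock(&inode->i_lock); nrequests++; spin_unlock(...). */
	atomic_long_inc(&di->nrequests);
}

static bool demo_have_writebacks(struct demo_inode *di)
{
	/* Readers no longer need the lock either. */
	return atomic_long_read(&di->nrequests) != 0;
}

The atomic is appropriate here because the counter is only ever incremented, decremented, and compared against zero; no other state is updated under the same critical section.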
*/ - if (nfsi->nrequests == 0 || new_isize > cur_isize) { + if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; @@ -2012,7 +2011,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); - nfsi->nrequests = 0; + atomic_long_set(&nfsi->nrequests, 0); nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index af6731dd4324..ec97c301899b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -258,9 +258,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) inode = page_file_mapping(req->wb_page)->host; set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5ab5ca24b48a..08093552f115 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -434,9 +434,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { nfs_release_request(subreq); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests--; - spin_unlock(&inode->i_lock); + atomic_long_dec(&NFS_I(inode)->nrequests); } /* subreq is now totally disconnected from page group or any @@ -567,9 +565,7 @@ try_again: if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { set_bit(PG_INODE_REF, &head->wb_flags); kref_get(&head->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } nfs_page_group_unlock(head); @@ -755,7 +751,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) nfs_lock_request(req); spin_lock(&inode->i_lock); - if (!nfsi->nrequests && + if (!nfs_have_writebacks(inode) && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) inode->i_version++; /* @@ -767,7 +763,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); } - nfsi->nrequests++; + atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. 
* This flag also informs pgio layer when to bump nrequests when @@ -786,6 +782,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; + atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; @@ -795,11 +792,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) ClearPagePrivate(head->wb_page); clear_bit(PG_MAPPED, &head->wb_flags); } - nfsi->nrequests--; - spin_unlock(&inode->i_lock); - } else { - spin_lock(&inode->i_lock); - nfsi->nrequests--; spin_unlock(&inode->i_lock); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 121a702888b4..238fdc4c46df 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -154,7 +154,7 @@ struct nfs_inode { */ __be32 cookieverf[2]; - unsigned long nrequests; + atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; /* Open contexts for shared mmap writes */ @@ -511,7 +511,7 @@ extern void nfs_commit_free(struct nfs_commit_data *data); static inline int nfs_have_writebacks(struct inode *inode) { - return NFS_I(inode)->nrequests != 0; + return atomic_long_read(&NFS_I(inode)->nrequests) != 0; } /* -- cgit v1.2.3 From 5cb953d4b1e70a09084f71594c45d47458346bc2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 1 Aug 2017 17:04:12 -0400 Subject: NFS: Use an atomic_long_t to count the number of commits Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- fs/nfs/write.c | 12 +++++++----- include/linux/nfs_xdr.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0480eb02299a..134d9f560240 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2012,7 +2012,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->nrequests, 0); - nfsi->commit_info.ncommit = 0; + atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); mutex_init(&nfsi->commit_mutex); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 08093552f115..12479c25028e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -857,7 +857,7 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, { set_bit(PG_CLEAN, &req->wb_flags); nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + atomic_long_inc(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); @@ -903,7 +903,7 @@ nfs_request_remove_commit_list(struct nfs_page *req, if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) return; nfs_list_remove_request(req); - cinfo->mds->ncommit--; + atomic_long_dec(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); @@ -1017,7 +1017,7 @@ out: unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { - return cinfo->mds->ncommit; + return atomic_long_read(&cinfo->mds->ncommit); } /* NFS_I(cinfo->inode)->commit_mutex held by caller */ @@ -1057,8 +1057,10 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; + if (!atomic_long_read(&cinfo->mds->ncommit)) + return 0; mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - if (cinfo->mds->ncommit > 0) { + if (atomic_long_read(&cinfo->mds->ncommit) > 0) { const int max = INT_MAX; ret = nfs_scan_commit_list(&cinfo->mds->list, dst, @@ -1890,7 +1892,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; /* no commits means nothing needs 
to be done */ - if (!nfsi->commit_info.ncommit) + if (!atomic_long_read(&nfsi->commit_info.ncommit)) return ret; if (wbc->sync_mode == WB_SYNC_NONE) { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 62cbcb842f99..164d5359d4ab 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1476,7 +1476,7 @@ struct nfs_pgio_header { struct nfs_mds_commit_info { atomic_t rpcs_out; - unsigned long ncommit; + atomic_long_t ncommit; struct list_head list; }; -- cgit v1.2.3 From 5d2405227a9eaea48e8cc95756a06d407b11f141 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 4 Aug 2017 13:19:17 -0700 Subject: lib: Add xxhash module Adds xxhash kernel module with xxh32 and xxh64 hashes. xxhash is an extremely fast non-cryptographic hash algorithm for checksumming. The zstd compression and decompression modules added in the next patch require xxhash. I extracted it out from zstd since it is useful on its own. I copied the code from the upstream XXHash source repository and translated it into kernel style. I ran benchmarks and tests in the kernel and tests in userland. I benchmarked xxhash as a special character device. I ran it in four modes: no-op, xxh32, xxh64, and crc32. The no-op mode simply copies the data to kernel space and ignores it. The xxh32, xxh64, and crc32 modes compute hashes on the copied data. I also ran it with four different buffer sizes. The benchmark file is located in the upstream zstd source repository under `contrib/linux-kernel/xxhash_test.c` [1]. I ran the benchmarks on an Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and an SSD. I benchmarked using the file `filesystem.squashfs` from `ubuntu-16.10-desktop-amd64.iso`, which is 1,536,217,088 B large. Run the following commands for the benchmark:

modprobe xxhash_test
mknod xxhash_test c 245 0
time cp filesystem.squashfs xxhash_test

The time reported is that of the userland `cp`. The GB/s is computed with 1,536,217,088 B / time(buffer size, hash), which includes the time to copy from userland. The Adjusted GB/s is computed with 1,536,217,088 B / (time(buffer size, hash) - time(buffer size, none)).

| Buffer Size (B) | Hash  | Time (s) | GB/s | Adjusted GB/s |
|-----------------|-------|----------|------|---------------|
| 1024            | none  | 0.408    | 3.77 | -             |
| 1024            | xxh32 | 0.649    | 2.37 | 6.37          |
| 1024            | xxh64 | 0.542    | 2.83 | 11.46         |
| 1024            | crc32 | 1.290    | 1.19 | 1.74          |
| 4096            | none  | 0.380    | 4.04 | -             |
| 4096            | xxh32 | 0.645    | 2.38 | 5.79          |
| 4096            | xxh64 | 0.500    | 3.07 | 12.80         |
| 4096            | crc32 | 1.168    | 1.32 | 1.95          |
| 8192            | none  | 0.351    | 4.38 | -             |
| 8192            | xxh32 | 0.614    | 2.50 | 5.84          |
| 8192            | xxh64 | 0.464    | 3.31 | 13.60         |
| 8192            | crc32 | 1.163    | 1.32 | 1.89          |
| 16384           | none  | 0.346    | 4.43 | -             |
| 16384           | xxh32 | 0.590    | 2.60 | 6.30          |
| 16384           | xxh64 | 0.466    | 3.30 | 12.80         |
| 16384           | crc32 | 1.183    | 1.30 | 1.84          |

Tested in userland using the test-suite in the zstd repo under `contrib/linux-kernel/test/XXHashUserlandTest.cpp` [2] by mocking the kernel functions. A line in each branch of every function in `xxhash.c` was commented out to ensure that the test-suite fails. Additionally tested while testing zstd and with SMHasher [3].
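For reference, a hedged usage sketch of the one-shot and streaming interfaces this patch declares in include/linux/xxhash.h (the demo_ function is hypothetical, and error returns from xxh64_update() are ignored for brevity):

#include <linux/types.h>
#include <linux/xxhash.h>

static uint64_t demo_hash_two_buffers(const void *a, size_t alen,
				      const void *b, size_t blen)
{
	struct xxh64_state state;
	const uint64_t seed = 0;
	uint64_t whole;

	/* One-shot form: hash a single contiguous buffer in one call. */
	whole = xxh64(a, alen, seed);
	(void)whole;

	/* Streaming form: feed buffers incrementally, then take the digest. */
	xxh64_reset(&state, seed);
	xxh64_update(&state, a, alen);
	xxh64_update(&state, b, blen);
	return xxh64_digest(&state);
}

The streaming form suits callers, such as compression modules, whose input arrives in chunks; the one-shot form avoids the state setup when the whole buffer is already in hand.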
[1] https://phabricator.intern.facebook.com/P57526246 [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/XXHashUserlandTest.cpp [3] https://github.com/aappleby/smhasher zstd source repository: https://github.com/facebook/zstd XXHash source repository: https://github.com/cyan4973/xxhash Signed-off-by: Nick Terrell Signed-off-by: Chris Mason --- include/linux/xxhash.h | 236 +++++++++++++++++++++++ lib/Kconfig | 3 + lib/Makefile | 1 + lib/xxhash.c | 500 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 740 insertions(+) create mode 100644 include/linux/xxhash.h create mode 100644 lib/xxhash.c (limited to 'include') diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h new file mode 100644 index 000000000000..9e1f42cb57e9 --- /dev/null +++ b/include/linux/xxhash.h @@ -0,0 +1,236 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + * + * You can contact the author at: + * - xxHash homepage: http://cyan4973.github.io/xxHash/ + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * Notice extracted from xxHash homepage: + * + * xxHash is an extremely fast Hash algorithm, running at RAM speed limits. + * It also successfully passes all tests from the SMHasher suite. + * + * Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 + * Duo @3GHz) + * + * Name Speed Q.Score Author + * xxHash 5.4 GB/s 10 + * CrapWow 3.2 GB/s 2 Andrew + * MurmurHash 3a 2.7 GB/s 10 Austin Appleby + * SpookyHash 2.0 GB/s 10 Bob Jenkins + * SBox 1.4 GB/s 9 Bret Mulvey + * Lookup3 1.2 GB/s 9 Bob Jenkins + * SuperFastHash 1.2 GB/s 1 Paul Hsieh + * CityHash64 1.05 GB/s 10 Pike & Alakuijala + * FNV 0.55 GB/s 5 Fowler, Noll, Vo + * CRC32 0.43 GB/s 9 + * MD5-32 0.33 GB/s 10 Ronald L.
Rivest + * SHA1-32 0.28 GB/s 10 + * + * Q.Score is a measure of quality of the hash function. + * It depends on successfully passing SMHasher test set. + * 10 is a perfect score. + * + * A 64-bit version, named xxh64, offers much better speed, + * but for 64-bit applications only. + * Name Speed on 64 bits Speed on 32 bits + * xxh64 13.8 GB/s 1.9 GB/s + * xxh32 6.8 GB/s 6.0 GB/s + */ + +#ifndef XXHASH_H +#define XXHASH_H + +#include <linux/types.h> + +/*-**************************** + * Simple Hash Functions + *****************************/ + +/** + * xxh32() - calculate the 32-bit hash of the input with a given seed. + * + * @input: The data to hash. + * @length: The length of the data to hash. + * @seed: The seed can be used to alter the result predictably. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s + * + * Return: The 32-bit hash of the data. + */ +uint32_t xxh32(const void *input, size_t length, uint32_t seed); + +/** + * xxh64() - calculate the 64-bit hash of the input with a given seed. + * + * @input: The data to hash. + * @length: The length of the data to hash. + * @seed: The seed can be used to alter the result predictably. + * + * This function runs 2x faster on 64-bit systems, but slower on 32-bit systems. + * + * Return: The 64-bit hash of the data. + */ +uint64_t xxh64(const void *input, size_t length, uint64_t seed); + +/*-**************************** + * Streaming Hash Functions + *****************************/ + +/* + * These definitions are only meant to allow allocation of XXH state + * statically, on stack, or in a struct for example. + * Do not use members directly. + */ + +/** + * struct xxh32_state - private xxh32 state, do not use members directly + */ +struct xxh32_state { + uint32_t total_len_32; + uint32_t large_len; + uint32_t v1; + uint32_t v2; + uint32_t v3; + uint32_t v4; + uint32_t mem32[4]; + uint32_t memsize; +}; + +/** + * struct xxh64_state - private xxh64 state, do not use members directly + */ +struct xxh64_state { + uint64_t total_len; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t mem64[4]; + uint32_t memsize; +}; + +/** + * xxh32_reset() - reset the xxh32 state to start a new hashing operation + * + * @state: The xxh32 state to reset. + * @seed: Initialize the hash state with this seed. + * + * Call this function on any xxh32_state to prepare for a new hashing operation. + */ +void xxh32_reset(struct xxh32_state *state, uint32_t seed); + +/** + * xxh32_update() - hash the data given and update the xxh32 state + * + * @state: The xxh32 state to update. + * @input: The data to hash. + * @length: The length of the data to hash. + * + * After calling xxh32_reset() call xxh32_update() as many times as necessary. + * + * Return: Zero on success, otherwise an error code. + */ +int xxh32_update(struct xxh32_state *state, const void *input, size_t length); + +/** + * xxh32_digest() - produce the current xxh32 hash + * + * @state: Produce the current xxh32 hash of this state. + * + * A hash value can be produced at any time. It is still possible to continue + * inserting input into the hash state after a call to xxh32_digest(), and + * generate new hashes later on, by calling xxh32_digest() again. + * + * Return: The xxh32 hash stored in the state. + */ +uint32_t xxh32_digest(const struct xxh32_state *state); + +/** + * xxh64_reset() - reset the xxh64 state to start a new hashing operation + * + * @state: The xxh64 state to reset. + * @seed: Initialize the hash state with this seed.
+ */ +void xxh64_reset(struct xxh64_state *state, uint64_t seed); + +/** + * xxh64_update() - hash the data given and update the xxh64 state + * @state: The xxh64 state to update. + * @input: The data to hash. + * @length: The length of the data to hash. + * + * After calling xxh64_reset() call xxh64_update() as many times as necessary. + * + * Return: Zero on success, otherwise an error code. + */ +int xxh64_update(struct xxh64_state *state, const void *input, size_t length); + +/** + * xxh64_digest() - produce the current xxh64 hash + * + * @state: Produce the current xxh64 hash of this state. + * + * A hash value can be produced at any time. It is still possible to continue + * inserting input into the hash state after a call to xxh64_digest(), and + * generate new hashes later on, by calling xxh64_digest() again. + * + * Return: The xxh64 hash stored in the state. + */ +uint64_t xxh64_digest(const struct xxh64_state *state); + +/*-************************** + * Utils + ***************************/ + +/** + * xxh32_copy_state() - copy the source state into the destination state + * + * @src: The source xxh32 state. + * @dst: The destination xxh32 state. + */ +void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src); + +/** + * xxh64_copy_state() - copy the source state into the destination state + * + * @src: The source xxh64 state. + * @dst: The destination xxh64 state. + */ +void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src); + +#endif /* XXHASH_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 6762529ad9e4..5e7541f46081 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -192,6 +192,9 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config XXHASH + tristate + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH diff --git a/lib/Makefile b/lib/Makefile index 40c18372b301..d06b68ae55a3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_CRC4) += crc4.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_XXHASH) += xxhash.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_842_COMPRESS) += 842/ diff --git a/lib/xxhash.c b/lib/xxhash.c new file mode 100644 index 000000000000..aa61e2a3802f --- /dev/null +++ b/lib/xxhash.c @@ -0,0 +1,500 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at:
+ * - xxHash homepage: http://cyan4973.github.io/xxHash/
+ * - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+#include <asm/unaligned.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/xxhash.h>
+
+/*-*************************************
+ * Macros
+ **************************************/
+#define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r)))
+#define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r)))
+
+#ifdef __LITTLE_ENDIAN
+# define XXH_CPU_LITTLE_ENDIAN 1
+#else
+# define XXH_CPU_LITTLE_ENDIAN 0
+#endif
+
+/*-*************************************
+ * Constants
+ **************************************/
+static const uint32_t PRIME32_1 = 2654435761U;
+static const uint32_t PRIME32_2 = 2246822519U;
+static const uint32_t PRIME32_3 = 3266489917U;
+static const uint32_t PRIME32_4 =  668265263U;
+static const uint32_t PRIME32_5 =  374761393U;
+
+static const uint64_t PRIME64_1 = 11400714785074694791ULL;
+static const uint64_t PRIME64_2 = 14029467366897019727ULL;
+static const uint64_t PRIME64_3 =  1609587929392839161ULL;
+static const uint64_t PRIME64_4 =  9650029242287828579ULL;
+static const uint64_t PRIME64_5 =  2870177450012600261ULL;
+
+/*-**************************
+ * Utils
+ ***************************/
+void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh32_copy_state);
+
+void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+EXPORT_SYMBOL(xxh64_copy_state);
+
+/*-***************************
+ * Simple Hash Functions
+ ****************************/
+static uint32_t xxh32_round(uint32_t seed, const uint32_t input)
+{
+	seed += input * PRIME32_2;
+	seed = xxh_rotl32(seed, 13);
+	seed *= PRIME32_1;
+	return seed;
+}
+
+uint32_t xxh32(const void *input, const size_t len, const uint32_t seed)
+{
+	const uint8_t *p = (const uint8_t *)input;
+	const uint8_t *b_end = p + len;
+	uint32_t h32;
+
+	if (len >= 16) {
+		const uint8_t *const limit = b_end - 16;
+		uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
+		uint32_t v2 = seed + PRIME32_2;
+		uint32_t v3 = seed + 0;
+		uint32_t v4 = seed - PRIME32_1;
+
+		do {
+			v1 = xxh32_round(v1, get_unaligned_le32(p));
+			p += 4;
+			v2 = xxh32_round(v2, get_unaligned_le32(p));
+			p += 4;
+			v3 = xxh32_round(v3, get_unaligned_le32(p));
+			p += 4;
+			v4 = xxh32_round(v4, get_unaligned_le32(p));
+			p += 4;
+		} while (p <= limit);
+
+		h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) +
+			xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18);
+	} else {
+		h32 = seed + PRIME32_5;
+	}
+
+	h32 += (uint32_t)len;
+
+	while (p + 4 <= b_end) {
+		h32 +=
get_unaligned_le32(p) * PRIME32_3; + h32 = xxh_rotl32(h32, 17) * PRIME32_4; + p += 4; + } + + while (p < b_end) { + h32 += (*p) * PRIME32_5; + h32 = xxh_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} +EXPORT_SYMBOL(xxh32); + +static uint64_t xxh64_round(uint64_t acc, const uint64_t input) +{ + acc += input * PRIME64_2; + acc = xxh_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val) +{ + val = xxh64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + uint64_t h64; + + if (len >= 32) { + const uint8_t *const limit = b_end - 32; + uint64_t v1 = seed + PRIME64_1 + PRIME64_2; + uint64_t v2 = seed + PRIME64_2; + uint64_t v3 = seed + 0; + uint64_t v4 = seed - PRIME64_1; + + do { + v1 = xxh64_round(v1, get_unaligned_le64(p)); + p += 8; + v2 = xxh64_round(v2, get_unaligned_le64(p)); + p += 8; + v3 = xxh64_round(v3, get_unaligned_le64(p)); + p += 8; + v4 = xxh64_round(v4, get_unaligned_le64(p)); + p += 8; + } while (p <= limit); + + h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); + h64 = xxh64_merge_round(h64, v1); + h64 = xxh64_merge_round(h64, v2); + h64 = xxh64_merge_round(h64, v3); + h64 = xxh64_merge_round(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (uint64_t)len; + + while (p + 8 <= b_end) { + const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); + + h64 ^= k1; + h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; + p += 8; + } + + if (p + 4 <= b_end) { + h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; + h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p += 4; + } + + while (p < b_end) { + h64 ^= (*p) * PRIME64_5; + h64 = xxh_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} +EXPORT_SYMBOL(xxh64); + +/*-************************************************** + * Advanced Hash Functions + ***************************************************/ +void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) +{ + /* use a local state for memcpy() to avoid strict-aliasing warnings */ + struct xxh32_state state; + + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); +} +EXPORT_SYMBOL(xxh32_reset); + +void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) +{ + /* use a local state for memcpy() to avoid strict-aliasing warnings */ + struct xxh64_state state; + + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); +} +EXPORT_SYMBOL(xxh64_reset); + +int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + + if (input == NULL) + return -EINVAL; + + state->total_len_32 += (uint32_t)len; + state->large_len |= (len >= 16) | (state->total_len_32 >= 16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + memcpy((uint8_t 
*)(state->mem32) + state->memsize, input, len); + state->memsize += (uint32_t)len; + return 0; + } + + if (state->memsize) { /* some data left from previous update */ + const uint32_t *p32 = state->mem32; + + memcpy((uint8_t *)(state->mem32) + state->memsize, input, + 16 - state->memsize); + + state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); + p32++; + state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); + p32++; + state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); + p32++; + state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); + p32++; + + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= b_end - 16) { + const uint8_t *const limit = b_end - 16; + uint32_t v1 = state->v1; + uint32_t v2 = state->v2; + uint32_t v3 = state->v3; + uint32_t v4 = state->v4; + + do { + v1 = xxh32_round(v1, get_unaligned_le32(p)); + p += 4; + v2 = xxh32_round(v2, get_unaligned_le32(p)); + p += 4; + v3 = xxh32_round(v3, get_unaligned_le32(p)); + p += 4; + v4 = xxh32_round(v4, get_unaligned_le32(p)); + p += 4; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < b_end) { + memcpy(state->mem32, p, (size_t)(b_end-p)); + state->memsize = (uint32_t)(b_end-p); + } + + return 0; +} +EXPORT_SYMBOL(xxh32_update); + +uint32_t xxh32_digest(const struct xxh32_state *state) +{ + const uint8_t *p = (const uint8_t *)state->mem32; + const uint8_t *const b_end = (const uint8_t *)(state->mem32) + + state->memsize; + uint32_t h32; + + if (state->large_len) { + h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + + xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + PRIME32_5; + } + + h32 += state->total_len_32; + + while (p + 4 <= b_end) { + h32 += get_unaligned_le32(p) * PRIME32_3; + h32 = xxh_rotl32(h32, 17) * PRIME32_4; + p += 4; + } + + while (p < b_end) { + h32 += (*p) * PRIME32_5; + h32 = xxh_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} +EXPORT_SYMBOL(xxh32_digest); + +int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) +{ + const uint8_t *p = (const uint8_t *)input; + const uint8_t *const b_end = p + len; + + if (input == NULL) + return -EINVAL; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + memcpy(((uint8_t *)state->mem64) + state->memsize, input, len); + state->memsize += (uint32_t)len; + return 0; + } + + if (state->memsize) { /* tmp buffer is full */ + uint64_t *p64 = state->mem64; + + memcpy(((uint8_t *)p64) + state->memsize, input, + 32 - state->memsize); + + state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64)); + p64++; + state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64)); + p64++; + state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64)); + p64++; + state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64)); + + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p + 32 <= b_end) { + const uint8_t *const limit = b_end - 32; + uint64_t v1 = state->v1; + uint64_t v2 = state->v2; + uint64_t v3 = state->v3; + uint64_t v4 = state->v4; + + do { + v1 = xxh64_round(v1, get_unaligned_le64(p)); + p += 8; + v2 = xxh64_round(v2, get_unaligned_le64(p)); + p += 8; + v3 = xxh64_round(v3, get_unaligned_le64(p)); + p += 8; + v4 = xxh64_round(v4, get_unaligned_le64(p)); + p += 8; + } while (p <= limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = 
v3;
+		state->v4 = v4;
+	}
+
+	if (p < b_end) {
+		memcpy(state->mem64, p, (size_t)(b_end-p));
+		state->memsize = (uint32_t)(b_end - p);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xxh64_update);
+
+uint64_t xxh64_digest(const struct xxh64_state *state)
+{
+	const uint8_t *p = (const uint8_t *)state->mem64;
+	const uint8_t *const b_end = (const uint8_t *)state->mem64 +
+		state->memsize;
+	uint64_t h64;
+
+	if (state->total_len >= 32) {
+		const uint64_t v1 = state->v1;
+		const uint64_t v2 = state->v2;
+		const uint64_t v3 = state->v3;
+		const uint64_t v4 = state->v4;
+
+		h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) +
+			xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18);
+		h64 = xxh64_merge_round(h64, v1);
+		h64 = xxh64_merge_round(h64, v2);
+		h64 = xxh64_merge_round(h64, v3);
+		h64 = xxh64_merge_round(h64, v4);
+	} else {
+		h64 = state->v3 + PRIME64_5;
+	}
+
+	h64 += (uint64_t)state->total_len;
+
+	while (p + 8 <= b_end) {
+		const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p));
+
+		h64 ^= k1;
+		h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;
+		p += 8;
+	}
+
+	if (p + 4 <= b_end) {
+		h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1;
+		h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+		p += 4;
+	}
+
+	while (p < b_end) {
+		h64 ^= (*p) * PRIME64_5;
+		h64 = xxh_rotl64(h64, 11) * PRIME64_1;
+		p++;
+	}
+
+	h64 ^= h64 >> 33;
+	h64 *= PRIME64_2;
+	h64 ^= h64 >> 29;
+	h64 *= PRIME64_3;
+	h64 ^= h64 >> 32;
+
+	return h64;
+}
+EXPORT_SYMBOL(xxh64_digest);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("xxHash");
--
cgit v1.2.3


From 73f3d1b48f5069d46ba48aa28c2898dc93185560 Mon Sep 17 00:00:00 2001
From: Nick Terrell
Date: Wed, 9 Aug 2017 19:35:53 -0700
Subject: lib: Add zstd modules

Add zstd compression and decompression kernel modules. zstd offers a
wide variety of compression speed and quality trade-offs. It can
compress at speeds approaching lz4, and quality approaching lzma. zstd
decompresses at speeds more than twice as fast as zlib, and
decompression speed remains roughly the same across all compression
levels.

The code was ported from the upstream zstd source repository. The
`linux/zstd.h` header was modified to match linux kernel style. The
cross-platform and allocation code was stripped out. Instead zstd
requires the caller to pass a preallocated workspace. The source files
were clang-formatted [1] to match the Linux Kernel style as much as
possible. Otherwise, the code was unmodified. We would like to avoid
as much further manual modification to the source code as possible, so
it will be easier to keep the kernel zstd up to date.

I benchmarked zstd compression as a special character device. I ran
zstd and zlib compression at several levels, as well as performing no
compression, which measures the time spent copying the data to kernel
space. Data is passed to the compressor 4096 B at a time. The benchmark
file is located in the upstream zstd source repository under
`contrib/linux-kernel/zstd_compress_test.c` [2].

I ran the benchmarks on an Ubuntu 14.04 VM with 2 cores and 4 GiB of
RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7
processor, 16 GB of RAM, and an SSD. I benchmarked using `silesia.tar`
[3], which is 211,988,480 B large. Run the following commands for the
benchmark:

    sudo modprobe zstd_compress_test
    sudo mknod zstd_compress_test c 245 0
    sudo cp silesia.tar zstd_compress_test

The time is reported by the time of the userland `cp`.
The MB/s is computed with
1,536,217,008 B / time(buffer size, hash)
which includes the time to copy from userland.
The Adjusted MB/s is computed with
1,536,217,088 B / (time(buffer size, hash) - time(buffer size, none)).
The memory reported is the amount of memory the compressor requests.

| Method   | Size (B)  | Time (s) | Ratio | MB/s    | Adj MB/s | Mem (MB) |
|----------|-----------|----------|-------|---------|----------|----------|
| none     | 211988480 | 0.100    | 1     | 2119.88 | -        | -        |
| zstd -1  | 73645762  | 1.044    | 2.878 | 203.05  | 224.56   | 1.23     |
| zstd -3  | 66988878  | 1.761    | 3.165 | 120.38  | 127.63   | 2.47     |
| zstd -5  | 65001259  | 2.563    | 3.261 | 82.71   | 86.07    | 2.86     |
| zstd -10 | 60165346  | 13.242   | 3.523 | 16.01   | 16.13    | 13.22    |
| zstd -15 | 58009756  | 47.601   | 3.654 | 4.45    | 4.46     | 21.61    |
| zstd -19 | 54014593  | 102.835  | 3.925 | 2.06    | 2.06     | 60.15    |
| zlib -1  | 77260026  | 2.895    | 2.744 | 73.23   | 75.85    | 0.27     |
| zlib -3  | 72972206  | 4.116    | 2.905 | 51.50   | 52.79    | 0.27     |
| zlib -6  | 68190360  | 9.633    | 3.109 | 22.01   | 22.24    | 0.27     |
| zlib -9  | 67613382  | 22.554   | 3.135 | 9.40    | 9.44     | 0.27     |

I benchmarked zstd decompression using the same method on the same
machine. The benchmark file is located in the upstream zstd repo under
`contrib/linux-kernel/zstd_decompress_test.c` [4]. The memory reported
is the amount of memory required to decompress data compressed with the
given compression level. If you know the maximum size of your input, you
can reduce the memory usage of decompression irrespective of the
compression level.

| Method   | Time (s) | MB/s    | Adjusted MB/s | Memory (MB) |
|----------|----------|---------|---------------|-------------|
| none     | 0.025    | 8479.54 | -             | -           |
| zstd -1  | 0.358    | 592.15  | 636.60        | 0.84        |
| zstd -3  | 0.396    | 535.32  | 571.40        | 1.46        |
| zstd -5  | 0.396    | 535.32  | 571.40        | 1.46        |
| zstd -10 | 0.374    | 566.81  | 607.42        | 2.51        |
| zstd -15 | 0.379    | 559.34  | 598.84        | 4.61        |
| zstd -19 | 0.412    | 514.54  | 547.77        | 8.80        |
| zlib -1  | 0.940    | 225.52  | 231.68        | 0.04        |
| zlib -3  | 0.883    | 240.08  | 247.07        | 0.04        |
| zlib -6  | 0.844    | 251.17  | 258.84        | 0.04        |
| zlib -9  | 0.837    | 253.27  | 287.64        | 0.04        |

Tested in userland using the test-suite in the zstd repo under
`contrib/linux-kernel/test/UserlandTest.cpp` [5] by mocking the kernel
functions. Fuzz tested using libfuzzer [6] with the fuzz harnesses under
`contrib/linux-kernel/test/{RoundTripCrash.c,DecompressCrash.c}` [7] [8]
with ASAN, UBSAN, and MSAN. Additionally, it was tested while testing
the BtrFS and SquashFS patches coming next.
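For orientation, a minimal one-shot compression helper built on the new
API might look as follows. This is a sketch only: the level-3 choice,
the kvmalloc() workspace sizing, the error mapping, and the NULL check
on ZSTD_initCCtx() (assumed to fail when the workspace is too small)
are all illustrative, not part of this API's contract.

    #include <linux/errno.h>
    #include <linux/mm.h>
    #include <linux/zstd.h>

    /* Illustrative helper: one-shot zstd compression at level 3. */
    static int example_compress(void *dst, size_t dstCapacity,
				const void *src, size_t srcSize,
				size_t *outSize)
    {
	    ZSTD_parameters const params = ZSTD_getParams(3, srcSize, 0);
	    size_t const wkspSize = ZSTD_CCtxWorkspaceBound(params.cParams);
	    void *wksp = kvmalloc(wkspSize, GFP_KERNEL);
	    ZSTD_CCtx *cctx;

	    if (!wksp)
		    return -ENOMEM;
	    cctx = ZSTD_initCCtx(wksp, wkspSize);
	    if (!cctx) {
		    kvfree(wksp);
		    return -EINVAL;
	    }
	    /* Compressed size is returned, or an error checkable below. */
	    *outSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,
					 src, srcSize, params);
	    kvfree(wksp);
	    return ZSTD_isError(*outSize) ? -EINVAL : 0;
    }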
[1] https://clang.llvm.org/docs/ClangFormat.html
[2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/zstd_compress_test.c
[3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia
[4] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/zstd_decompress_test.c
[5] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/UserlandTest.cpp
[6] http://llvm.org/docs/LibFuzzer.html
[7] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/RoundTripCrash.c
[8] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/test/DecompressCrash.c

zstd source repository: https://github.com/facebook/zstd

Signed-off-by: Nick Terrell
Signed-off-by: Chris Mason
---
 include/linux/zstd.h      | 1157 +++++++++++++++
 lib/Kconfig               |    8 +
 lib/Makefile              |    2 +
 lib/zstd/Makefile         |   18 +
 lib/zstd/bitstream.h      |  374 +++++
 lib/zstd/compress.c       | 3484 +++++++++++++++++++++++++++++++++++++++++++++
 lib/zstd/decompress.c     | 2528 ++++++++++++++++++++++++++++++++
 lib/zstd/entropy_common.c |  243 ++++
 lib/zstd/error_private.h  |   53 +
 lib/zstd/fse.h            |  575 ++++++++
 lib/zstd/fse_compress.c   |  795 +++++++++++
 lib/zstd/fse_decompress.c |  332 +++++
 lib/zstd/huf.h            |  212 +++
 lib/zstd/huf_compress.c   |  770 ++++++++++
 lib/zstd/huf_decompress.c |  960 +++++++++++
 lib/zstd/mem.h            |  151 ++
 lib/zstd/zstd_common.c    |   75 +
 lib/zstd/zstd_internal.h  |  263 ++++
 lib/zstd/zstd_opt.h       | 1014 +++++++++++++
 19 files changed, 13014 insertions(+)
 create mode 100644 include/linux/zstd.h
 create mode 100644 lib/zstd/Makefile
 create mode 100644 lib/zstd/bitstream.h
 create mode 100644 lib/zstd/compress.c
 create mode 100644 lib/zstd/decompress.c
 create mode 100644 lib/zstd/entropy_common.c
 create mode 100644 lib/zstd/error_private.h
 create mode 100644 lib/zstd/fse.h
 create mode 100644 lib/zstd/fse_compress.c
 create mode 100644 lib/zstd/fse_decompress.c
 create mode 100644 lib/zstd/huf.h
 create mode 100644 lib/zstd/huf_compress.c
 create mode 100644 lib/zstd/huf_decompress.c
 create mode 100644 lib/zstd/mem.h
 create mode 100644 lib/zstd/zstd_common.c
 create mode 100644 lib/zstd/zstd_internal.h
 create mode 100644 lib/zstd/zstd_opt.h

(limited to 'include')

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
new file mode 100644
index 000000000000..249575e2485f
--- /dev/null
+++ b/include/linux/zstd.h
@@ -0,0 +1,1157 @@
+/*
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+#ifndef ZSTD_H
+#define ZSTD_H
+
+/* ====== Dependency ======*/
+#include <linux/types.h>	/* size_t */
+
+
+/*-*****************************************************************************
+ * Introduction
+ *
+ * zstd, short for Zstandard, is a fast lossless compression algorithm,
+ * targeting real-time compression scenarios at zlib-level and better
+ * compression ratios. The zstd compression library provides in-memory
+ * compression and decompression functions. The library supports compression
+ * levels from 1 up to ZSTD_maxCLevel() which is 22.
Levels >= 20, labeled + * ultra, should be used with caution, as they require more memory. + * Compression can be done in: + * - a single step, reusing a context (described as Explicit memory management) + * - unbounded multiple steps (described as Streaming compression) + * The compression ratio achievable on small data can be highly improved using + * compression with a dictionary in: + * - a single step (described as Simple dictionary API) + * - a single step, reusing a dictionary (described as Fast dictionary API) + ******************************************************************************/ + +/*====== Helper functions ======*/ + +/** + * enum ZSTD_ErrorCode - zstd error codes + * + * Functions that return size_t can be checked for errors using ZSTD_isError() + * and the ZSTD_ErrorCode can be extracted using ZSTD_getErrorCode(). + */ +typedef enum { + ZSTD_error_no_error, + ZSTD_error_GENERIC, + ZSTD_error_prefix_unknown, + ZSTD_error_version_unsupported, + ZSTD_error_parameter_unknown, + ZSTD_error_frameParameter_unsupported, + ZSTD_error_frameParameter_unsupportedBy32bits, + ZSTD_error_frameParameter_windowTooLarge, + ZSTD_error_compressionParameter_unsupported, + ZSTD_error_init_missing, + ZSTD_error_memory_allocation, + ZSTD_error_stage_wrong, + ZSTD_error_dstSize_tooSmall, + ZSTD_error_srcSize_wrong, + ZSTD_error_corruption_detected, + ZSTD_error_checksum_wrong, + ZSTD_error_tableLog_tooLarge, + ZSTD_error_maxSymbolValue_tooLarge, + ZSTD_error_maxSymbolValue_tooSmall, + ZSTD_error_dictionary_corrupted, + ZSTD_error_dictionary_wrong, + ZSTD_error_dictionaryCreation_failed, + ZSTD_error_maxCode +} ZSTD_ErrorCode; + +/** + * ZSTD_maxCLevel() - maximum compression level available + * + * Return: Maximum compression level available. + */ +int ZSTD_maxCLevel(void); +/** + * ZSTD_compressBound() - maximum compressed size in worst case scenario + * @srcSize: The size of the data to compress. + * + * Return: The maximum compressed size in the worst case scenario. + */ +size_t ZSTD_compressBound(size_t srcSize); +/** + * ZSTD_isError() - tells if a size_t function result is an error code + * @code: The function result to check for error. + * + * Return: Non-zero iff the code is an error. + */ +static __attribute__((unused)) unsigned int ZSTD_isError(size_t code) +{ + return code > (size_t)-ZSTD_error_maxCode; +} +/** + * ZSTD_getErrorCode() - translates an error function result to a ZSTD_ErrorCode + * @functionResult: The result of a function for which ZSTD_isError() is true. + * + * Return: The ZSTD_ErrorCode corresponding to the functionResult or 0 + * if the functionResult isn't an error. + */ +static __attribute__((unused)) ZSTD_ErrorCode ZSTD_getErrorCode( + size_t functionResult) +{ + if (!ZSTD_isError(functionResult)) + return (ZSTD_ErrorCode)0; + return (ZSTD_ErrorCode)(0 - functionResult); +} + +/** + * enum ZSTD_strategy - zstd compression search strategy + * + * From faster to stronger. + */ +typedef enum { + ZSTD_fast, + ZSTD_dfast, + ZSTD_greedy, + ZSTD_lazy, + ZSTD_lazy2, + ZSTD_btlazy2, + ZSTD_btopt, + ZSTD_btopt2 +} ZSTD_strategy; + +/** + * struct ZSTD_compressionParameters - zstd compression parameters + * @windowLog: Log of the largest match distance. Larger means more + * compression, and more memory needed during decompression. + * @chainLog: Fully searched segment. Larger means more compression, slower, + * and more memory (useless for fast). + * @hashLog: Dispatch table. Larger means more compression, + * slower, and more memory. 
+ * @searchLog: Number of searches. Larger means more compression and slower. + * @searchLength: Match length searched. Larger means faster decompression, + * sometimes less compression. + * @targetLength: Acceptable match size for optimal parser (only). Larger means + * more compression, and slower. + * @strategy: The zstd compression strategy. + */ +typedef struct { + unsigned int windowLog; + unsigned int chainLog; + unsigned int hashLog; + unsigned int searchLog; + unsigned int searchLength; + unsigned int targetLength; + ZSTD_strategy strategy; +} ZSTD_compressionParameters; + +/** + * struct ZSTD_frameParameters - zstd frame parameters + * @contentSizeFlag: Controls whether content size will be present in the frame + * header (when known). + * @checksumFlag: Controls whether a 32-bit checksum is generated at the end + * of the frame for error detection. + * @noDictIDFlag: Controls whether dictID will be saved into the frame header + * when using dictionary compression. + * + * The default value is all fields set to 0. + */ +typedef struct { + unsigned int contentSizeFlag; + unsigned int checksumFlag; + unsigned int noDictIDFlag; +} ZSTD_frameParameters; + +/** + * struct ZSTD_parameters - zstd parameters + * @cParams: The compression parameters. + * @fParams: The frame parameters. + */ +typedef struct { + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; +} ZSTD_parameters; + +/** + * ZSTD_getCParams() - returns ZSTD_compressionParameters for selected level + * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). + * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. + * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * + * Return: The selected ZSTD_compressionParameters. + */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, + unsigned long long estimatedSrcSize, size_t dictSize); + +/** + * ZSTD_getParams() - returns ZSTD_parameters for selected level + * @compressionLevel: The compression level from 1 to ZSTD_maxCLevel(). + * @estimatedSrcSize: The estimated source size to compress or 0 if unknown. + * @dictSize: The dictionary size or 0 if a dictionary isn't being used. + * + * The same as ZSTD_getCParams() except also selects the default frame + * parameters (all zero). + * + * Return: The selected ZSTD_parameters. + */ +ZSTD_parameters ZSTD_getParams(int compressionLevel, + unsigned long long estimatedSrcSize, size_t dictSize); + +/*-************************************* + * Explicit memory management + **************************************/ + +/** + * ZSTD_CCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_CCtx + * @cParams: The compression parameters to be used for compression. + * + * If multiple compression parameters might be used, the caller must call + * ZSTD_CCtxWorkspaceBound() for each set of parameters and use the maximum + * size. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCCtx(). + */ +size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CCtx - the zstd compression context + * + * When compressing many times it is recommended to allocate a context just once + * and reuse it for each successive compression operation. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +/** + * ZSTD_initCCtx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. 
Use ZSTD_CCtxWorkspaceBound() to + * determine how large the workspace must be. + * + * Return: A compression context emplaced into workspace. + */ +ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize); + +/** + * ZSTD_compressCCtx() - compress src into dst + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, ZSTD_parameters params); + +/** + * ZSTD_DCtxWorkspaceBound() - amount of memory needed to initialize a ZSTD_DCtx + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDCtx(). + */ +size_t ZSTD_DCtxWorkspaceBound(void); + +/** + * struct ZSTD_DCtx - the zstd decompression context + * + * When decompressing many times it is recommended to allocate a context just + * once and reuse it for each successive decompression operation. + */ +typedef struct ZSTD_DCtx_s ZSTD_DCtx; +/** + * ZSTD_initDCtx() - initialize a zstd decompression context + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Use ZSTD_DCtxWorkspaceBound() to + * determine how large the workspace must be. + * + * Return: A decompression context emplaced into workspace. + */ +ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize); + +/** + * ZSTD_decompressDCtx() - decompress zstd compressed src into dst + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_decompressDCtx(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); + +/*-************************ + * Simple dictionary API + **************************/ + +/** + * ZSTD_compress_usingDict() - compress src into dst using a dictionary + * @ctx: The context. Must have been initialized with a workspace at + * least as large as ZSTD_CCtxWorkspaceBound(params.cParams). + * @dst: The buffer to compress src into. + * @dstCapacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @srcSize: The size of the data to compress. + * @dict: The dictionary to use for compression. + * @dictSize: The size of the dictionary. + * @params: The parameters to use for compression. See ZSTD_getParams(). + * + * Compression using a predefined dictionary. The same dictionary must be used + * during decompression. 
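+ *
+ * A minimal sketch (hypothetical `dict`/`dictSize` buffers and an already
+ * initialized `ctx`; error handling elided):
+ *
+ *	ZSTD_parameters params = ZSTD_getParams(3, srcSize, dictSize);
+ *	size_t const csize = ZSTD_compress_usingDict(ctx, dst, dstCapacity,
+ *		src, srcSize, dict, dictSize, params);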
+ *
+ * Return: The compressed size or an error, which can be checked using
+ *         ZSTD_isError().
+ */
+size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize, const void *dict, size_t dictSize,
+	ZSTD_parameters params);
+
+/**
+ * ZSTD_decompress_usingDict() - decompress src into dst using a dictionary
+ * @ctx:         The decompression context.
+ * @dst:         The buffer to decompress src into.
+ * @dstCapacity: The size of the destination buffer. Must be at least as large
+ *               as the decompressed size. If the caller cannot upper bound the
+ *               decompressed size, then it's better to use the streaming API.
+ * @src:         The zstd compressed data to decompress. Multiple concatenated
+ *               frames and skippable frames are allowed.
+ * @srcSize:     The exact size of the data to decompress.
+ * @dict:        The dictionary to use for decompression. The same dictionary
+ *               must've been used to compress the data.
+ * @dictSize:    The size of the dictionary.
+ *
+ * Return: The decompressed size or an error, which can be checked using
+ *         ZSTD_isError().
+ */
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx *ctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize, const void *dict, size_t dictSize);
+
+/*-**************************
+ * Fast dictionary API
+ ***************************/
+
+/**
+ * ZSTD_CDictWorkspaceBound() - memory needed to initialize a ZSTD_CDict
+ * @cParams: The compression parameters to be used for compression.
+ *
+ * Return: A lower bound on the size of the workspace that is passed to
+ *         ZSTD_initCDict().
+ */
+size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams);
+
+/**
+ * struct ZSTD_CDict - a digested dictionary to be used for compression
+ */
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/**
+ * ZSTD_initCDict() - initialize a digested dictionary for compression
+ * @dictBuffer:    The dictionary to digest. The buffer is referenced by the
+ *                 ZSTD_CDict so it must outlive the returned ZSTD_CDict.
+ * @dictSize:      The size of the dictionary.
+ * @params:        The parameters to use for compression. See ZSTD_getParams().
+ * @workspace:     The workspace. It must outlive the returned ZSTD_CDict.
+ * @workspaceSize: The workspace size. Must be at least
+ *                 ZSTD_CDictWorkspaceBound(params.cParams).
+ *
+ * When compressing multiple messages / blocks with the same dictionary it is
+ * recommended to load it just once. The ZSTD_CDict merely references the
+ * dictBuffer, so it must outlive the returned ZSTD_CDict.
+ *
+ * Return: The digested dictionary emplaced into workspace.
+ */
+ZSTD_CDict *ZSTD_initCDict(const void *dictBuffer, size_t dictSize,
+	ZSTD_parameters params, void *workspace, size_t workspaceSize);
+
+/**
+ * ZSTD_compress_usingCDict() - compress src into dst using a ZSTD_CDict
+ * @ctx:         The context. Must have been initialized with a workspace at
+ *               least as large as ZSTD_CCtxWorkspaceBound(cParams) where
+ *               cParams are the compression parameters used to initialize the
+ *               cdict.
+ * @dst:         The buffer to compress src into.
+ * @dstCapacity: The size of the destination buffer. May be any size, but
+ *               ZSTD_compressBound(srcSize) is guaranteed to be large enough.
+ * @src:         The data to compress.
+ * @srcSize:     The size of the data to compress.
+ * @cdict:       The digested dictionary to use for compression.
+ *
+ * Compression using a digested dictionary. The same dictionary must be used
+ * during decompression.
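+ *
+ * A minimal sketch (assumes `wksp`/`wkspSize` were sized with
+ * ZSTD_CDictWorkspaceBound(params.cParams); error handling elided):
+ *
+ *	ZSTD_CDict *cdict = ZSTD_initCDict(dictBuffer, dictSize, params,
+ *		wksp, wkspSize);
+ *	size_t const csize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity,
+ *		src, srcSize, cdict);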
+ * + * Return: The compressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize, const ZSTD_CDict *cdict); + + +/** + * ZSTD_DDictWorkspaceBound() - memory needed to initialize a ZSTD_DDict + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDDict(). + */ +size_t ZSTD_DDictWorkspaceBound(void); + +/** + * struct ZSTD_DDict - a digested dictionary to be used for decompression + */ +typedef struct ZSTD_DDict_s ZSTD_DDict; + +/** + * ZSTD_initDDict() - initialize a digested dictionary for decompression + * @dictBuffer: The dictionary to digest. The buffer is referenced by the + * ZSTD_DDict so it must outlive the returned ZSTD_DDict. + * @dictSize: The size of the dictionary. + * @workspace: The workspace. It must outlive the returned ZSTD_DDict. + * @workspaceSize: The workspace size. Must be at least + * ZSTD_DDictWorkspaceBound(). + * + * When decompressing multiple messages / blocks with the same dictionary it is + * recommended to load it just once. The ZSTD_DDict merely references the + * dictBuffer, so it must outlive the returned ZSTD_DDict. + * + * Return: The digested dictionary emplaced into workspace. + */ +ZSTD_DDict *ZSTD_initDDict(const void *dictBuffer, size_t dictSize, + void *workspace, size_t workspaceSize); + +/** + * ZSTD_decompress_usingDDict() - decompress src into dst using a ZSTD_DDict + * @ctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dstCapacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @srcSize: The exact size of the data to decompress. + * @ddict: The digested dictionary to use for decompression. The same + * dictionary must've been used to compress the data. + * + * Return: The decompressed size or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, + size_t dstCapacity, const void *src, size_t srcSize, + const ZSTD_DDict *ddict); + + +/*-************************** + * Streaming + ***************************/ + +/** + * struct ZSTD_inBuffer - input buffer for streaming + * @src: Start of the input buffer. + * @size: Size of the input buffer. + * @pos: Position where reading stopped. Will be updated. + * Necessarily 0 <= pos <= size. + */ +typedef struct ZSTD_inBuffer_s { + const void *src; + size_t size; + size_t pos; +} ZSTD_inBuffer; + +/** + * struct ZSTD_outBuffer - output buffer for streaming + * @dst: Start of the output buffer. + * @size: Size of the output buffer. + * @pos: Position where writing stopped. Will be updated. + * Necessarily 0 <= pos <= size. + */ +typedef struct ZSTD_outBuffer_s { + void *dst; + size_t size; + size_t pos; +} ZSTD_outBuffer; + + + +/*-***************************************************************************** + * Streaming compression - HowTo + * + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_initCStream() to initialize a ZSTD_CStream object. + * ZSTD_CStream objects can be reused multiple times on consecutive compression + * operations. 
It is recommended to re-use ZSTD_CStream in situations where many + * streaming operations will be achieved consecutively. Use one separate + * ZSTD_CStream per thread for parallel execution. + * + * Use ZSTD_compressStream() repetitively to consume input stream. + * The function will automatically update both `pos` fields. + * Note that it may not consume the entire input, in which case `pos < size`, + * and it's up to the caller to present again remaining data. + * It returns a hint for the preferred number of bytes to use as an input for + * the next function call. + * + * At any moment, it's possible to flush whatever data remains within internal + * buffer, using ZSTD_flushStream(). `output->pos` will be updated. There might + * still be some content left within the internal buffer if `output->size` is + * too small. It returns the number of bytes left in the internal buffer and + * must be called until it returns 0. + * + * ZSTD_endStream() instructs to finish a frame. It will perform a flush and + * write frame epilogue. The epilogue is required for decoders to consider a + * frame completed. Similar to ZSTD_flushStream(), it may not be able to flush + * the full content if `output->size` is too small. In which case, call again + * ZSTD_endStream() to complete the flush. It returns the number of bytes left + * in the internal buffer and must be called until it returns 0. + ******************************************************************************/ + +/** + * ZSTD_CStreamWorkspaceBound() - memory needed to initialize a ZSTD_CStream + * @cParams: The compression parameters to be used for compression. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initCStream() and ZSTD_initCStream_usingCDict(). + */ +size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams); + +/** + * struct ZSTD_CStream - the zstd streaming compression context + */ +typedef struct ZSTD_CStream_s ZSTD_CStream; + +/*===== ZSTD_CStream management functions =====*/ +/** + * ZSTD_initCStream() - initialize a zstd streaming compression context + * @params: The zstd compression parameters. + * @pledgedSrcSize: If params.fParams.contentSizeFlag == 1 then the caller must + * pass the source size (zero means empty source). Otherwise, + * the caller may optionally pass the source size, or zero if + * unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_CStreamWorkspaceBound(params.cParams) to determine + * how large the workspace must be. + * + * Return: The zstd streaming compression context. + */ +ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, + unsigned long long pledgedSrcSize, void *workspace, + size_t workspaceSize); + +/** + * ZSTD_initCStream_usingCDict() - initialize a streaming compression context + * @cdict: The digested dictionary to use for compression. + * @pledgedSrcSize: Optionally the source size, or zero if unknown. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. Call ZSTD_CStreamWorkspaceBound() + * with the cParams used to initialize the cdict to determine + * how large the workspace must be. + * + * Return: The zstd streaming compression context. 
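+ *
+ * A typical driving loop over either kind of ZSTD_CStream, as a hedged
+ * sketch (`in` and `out` are hypothetical ZSTD_inBuffer/ZSTD_outBuffer
+ * values; error checks are elided and `out` must be drained whenever it
+ * fills):
+ *
+ *	while (in.pos < in.size)
+ *		ZSTD_compressStream(zcs, &out, &in);
+ *	while (ZSTD_endStream(zcs, &out) != 0)
+ *		;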
+ */ +ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, + unsigned long long pledgedSrcSize, void *workspace, + size_t workspaceSize); + +/*===== Streaming compression functions =====*/ +/** + * ZSTD_resetCStream() - reset the context using parameters from creation + * @zcs: The zstd streaming compression context to reset. + * @pledgedSrcSize: Optionally the source size, or zero if unknown. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. If `pledgedSrcSize` is non-zero the frame + * content size is always written into the frame header. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize); +/** + * ZSTD_compressStream() - streaming compress some of input into output + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * @input: Source buffer. `input->pos` is updated to indicate how much data was + * read. Note that it may not consume the entire input, in which case + * `input->pos < input->size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * + * Return: A hint for the number of bytes to use as the input for the next + * function call or an error, which can be checked using + * ZSTD_isError(). + */ +size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, + ZSTD_inBuffer *input); +/** + * ZSTD_flushStream() - flush internal buffers into output + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * ZSTD_flushStream() must be called until it returns 0, meaning all the data + * has been flushed. Since ZSTD_flushStream() causes a block to be ended, + * calling it too often will degrade the compression ratio. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); +/** + * ZSTD_endStream() - flush internal buffers into output and end the frame + * @zcs: The zstd streaming compression context. + * @output: Destination buffer. `output->pos` is updated to indicate how much + * compressed data was written. + * + * ZSTD_endStream() must be called until it returns 0, meaning all the data has + * been flushed and the frame epilogue has been written. + * + * Return: The number of bytes still present within internal buffers or an + * error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output); + +/** + * ZSTD_CStreamInSize() - recommended size for the input buffer + * + * Return: The recommended size for the input buffer. + */ +size_t ZSTD_CStreamInSize(void); +/** + * ZSTD_CStreamOutSize() - recommended size for the output buffer + * + * When the output buffer is at least this large, it is guaranteed to be large + * enough to flush at least one complete compressed block. + * + * Return: The recommended size for the output buffer. 
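+ *
+ * A common sizing sketch (buffer allocation itself is the caller's and is
+ * illustrative only):
+ *
+ *	size_t const inSize = ZSTD_CStreamInSize();
+ *	size_t const outSize = ZSTD_CStreamOutSize();
+ *
+ * Feeding inputs of `inSize` bytes and draining into an `outSize`-byte
+ * buffer ensures ZSTD_flushStream() can always flush one complete block.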
+ */ +size_t ZSTD_CStreamOutSize(void); + + + +/*-***************************************************************************** + * Streaming decompression - HowTo + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_initDStream() to initialize a ZSTD_DStream object. + * ZSTD_DStream objects can be re-used multiple times. + * + * Use ZSTD_decompressStream() repetitively to consume your input. + * The function will update both `pos` fields. + * If `input->pos < input->size`, some input has not been consumed. + * It's up to the caller to present again remaining data. + * If `output->pos < output->size`, decoder has flushed everything it could. + * Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise it returns a suggested next input size that will never load more + * than the current frame. + ******************************************************************************/ + +/** + * ZSTD_DStreamWorkspaceBound() - memory needed to initialize a ZSTD_DStream + * @maxWindowSize: The maximum window size allowed for compressed frames. + * + * Return: A lower bound on the size of the workspace that is passed to + * ZSTD_initDStream() and ZSTD_initDStream_usingDDict(). + */ +size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize); + +/** + * struct ZSTD_DStream - the zstd streaming decompression context + */ +typedef struct ZSTD_DStream_s ZSTD_DStream; +/*===== ZSTD_DStream management functions =====*/ +/** + * ZSTD_initDStream() - initialize a zstd streaming decompression context + * @maxWindowSize: The maximum window size allowed for compressed frames. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine + * how large the workspace must be. + * + * Return: The zstd streaming decompression context. + */ +ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, + size_t workspaceSize); +/** + * ZSTD_initDStream_usingDDict() - initialize streaming decompression context + * @maxWindowSize: The maximum window size allowed for compressed frames. + * @ddict: The digested dictionary to use for decompression. + * @workspace: The workspace to emplace the context into. It must outlive + * the returned context. + * @workspaceSize: The size of workspace. + * Use ZSTD_DStreamWorkspaceBound(maxWindowSize) to determine + * how large the workspace must be. + * + * Return: The zstd streaming decompression context. + */ +ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, + const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize); + +/*===== Streaming decompression functions =====*/ +/** + * ZSTD_resetDStream() - reset the context using parameters from creation + * @zds: The zstd streaming decompression context to reset. + * + * Resets the context using the parameters from creation. Skips dictionary + * loading, since it can be reused. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_resetDStream(ZSTD_DStream *zds); +/** + * ZSTD_decompressStream() - streaming decompress some of input into output + * @zds: The zstd streaming decompression context. + * @output: Destination buffer. `output.pos` is updated to indicate how much + * decompressed data was written. + * @input: Source buffer. `input.pos` is updated to indicate how much data was + * read. 
Note that it may not consume the entire input, in which case + * `input.pos < input.size`, and it's up to the caller to present + * remaining data again. + * + * The `input` and `output` buffers may be any size. Guaranteed to make some + * forward progress if `input` and `output` are not empty. + * ZSTD_decompressStream() will not consume the last byte of the frame until + * the entire frame is flushed. + * + * Return: Returns 0 iff a frame is completely decoded and fully flushed. + * Otherwise returns a hint for the number of bytes to use as the input + * for the next function call or an error, which can be checked using + * ZSTD_isError(). The size hint will never load more than the frame. + */ +size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, + ZSTD_inBuffer *input); + +/** + * ZSTD_DStreamInSize() - recommended size for the input buffer + * + * Return: The recommended size for the input buffer. + */ +size_t ZSTD_DStreamInSize(void); +/** + * ZSTD_DStreamOutSize() - recommended size for the output buffer + * + * When the output buffer is at least this large, it is guaranteed to be large + * enough to flush at least one complete decompressed block. + * + * Return: The recommended size for the output buffer. + */ +size_t ZSTD_DStreamOutSize(void); + + +/* --- Constants ---*/ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* >= v0.8.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50U + +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + +#define ZSTD_WINDOWLOG_MAX_32 27 +#define ZSTD_WINDOWLOG_MAX_64 27 +#define ZSTD_WINDOWLOG_MAX \ + ((unsigned int)(sizeof(size_t) == 4 \ + ? ZSTD_WINDOWLOG_MAX_32 \ + : ZSTD_WINDOWLOG_MAX_64)) +#define ZSTD_WINDOWLOG_MIN 10 +#define ZSTD_HASHLOG_MAX ZSTD_WINDOWLOG_MAX +#define ZSTD_HASHLOG_MIN 6 +#define ZSTD_CHAINLOG_MAX (ZSTD_WINDOWLOG_MAX+1) +#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN +#define ZSTD_HASHLOG3_MAX 17 +#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1) +#define ZSTD_SEARCHLOG_MIN 1 +/* only for ZSTD_fast, other strategies are limited to 6 */ +#define ZSTD_SEARCHLENGTH_MAX 7 +/* only for ZSTD_btopt, other strategies are limited to 4 */ +#define ZSTD_SEARCHLENGTH_MIN 3 +#define ZSTD_TARGETLENGTH_MIN 4 +#define ZSTD_TARGETLENGTH_MAX 999 + +/* for static allocation */ +#define ZSTD_FRAMEHEADERSIZE_MAX 18 +#define ZSTD_FRAMEHEADERSIZE_MIN 6 +static const size_t ZSTD_frameHeaderSize_prefix = 5; +static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN; +static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX; +/* magic number + skippable frame length */ +static const size_t ZSTD_skippableHeaderSize = 8; + + +/*-************************************* + * Compressed size functions + **************************************/ + +/** + * ZSTD_findFrameCompressedSize() - returns the size of a compressed frame + * @src: Source buffer. It should point to the start of a zstd encoded frame + * or a skippable frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * size of the frame. + * + * Return: The compressed size of the frame pointed to by `src` or an error, + * which can be check with ZSTD_isError(). + * Suitable to pass to ZSTD_decompress() or similar functions. 
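+ *
+ * A sketch for walking a buffer of concatenated frames (a hedged
+ * illustration; input validation is elided):
+ *
+ *	const uint8_t *p = src;
+ *	size_t remaining = srcSize;
+ *
+ *	while (remaining > 0) {
+ *		size_t const fSize = ZSTD_findFrameCompressedSize(p, remaining);
+ *
+ *		if (ZSTD_isError(fSize))
+ *			break;
+ *		p += fSize;
+ *		remaining -= fSize;
+ *	}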
+ */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize); + +/*-************************************* + * Decompressed size functions + **************************************/ +/** + * ZSTD_getFrameContentSize() - returns the content size in a zstd frame header + * @src: It should point to the start of a zstd encoded frame. + * @srcSize: The size of the source buffer. It must be at least as large as the + * frame header. `ZSTD_frameHeaderSize_max` is always large enough. + * + * Return: The frame content size stored in the frame header if known. + * `ZSTD_CONTENTSIZE_UNKNOWN` if the content size isn't stored in the + * frame header. `ZSTD_CONTENTSIZE_ERROR` on invalid input. + */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/** + * ZSTD_findDecompressedSize() - returns decompressed size of a series of frames + * @src: It should point to the start of a series of zstd encoded and/or + * skippable frames. + * @srcSize: The exact size of the series of frames. + * + * If any zstd encoded frame in the series doesn't have the frame content size + * set, `ZSTD_CONTENTSIZE_UNKNOWN` is returned. But frame content size is always + * set when using ZSTD_compress(). The decompressed size can be very large. + * If the source is untrusted, the decompressed size could be wrong or + * intentionally modified. Always ensure the result fits within the + * application's authorized limits. ZSTD_findDecompressedSize() handles multiple + * frames, and so it must traverse the input to read each frame header. This is + * efficient as most of the data is skipped, however it does mean that all frame + * data must be present and valid. + * + * Return: Decompressed size of all the data contained in the frames if known. + * `ZSTD_CONTENTSIZE_UNKNOWN` if the decompressed size is unknown. + * `ZSTD_CONTENTSIZE_ERROR` if an error occurred. + */ +unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize); + +/*-************************************* + * Advanced compression functions + **************************************/ +/** + * ZSTD_checkCParams() - ensure parameter values remain within authorized range + * @cParams: The zstd compression parameters. + * + * Return: Zero or an error, which can be checked using ZSTD_isError(). + */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams); + +/** + * ZSTD_adjustCParams() - optimize parameters for a given srcSize and dictSize + * @srcSize: Optionally the estimated source size, or zero if unknown. + * @dictSize: Optionally the estimated dictionary size, or zero if unknown. + * + * Return: The optimized parameters. + */ +ZSTD_compressionParameters ZSTD_adjustCParams( + ZSTD_compressionParameters cParams, unsigned long long srcSize, + size_t dictSize); + +/*--- Advanced decompression functions ---*/ + +/** + * ZSTD_isFrame() - returns true iff the buffer starts with a valid frame + * @buffer: The source buffer to check. + * @size: The size of the source buffer, must be at least 4 bytes. + * + * Return: True iff the buffer starts with a zstd or skippable frame identifier. + */ +unsigned int ZSTD_isFrame(const void *buffer, size_t size); + +/** + * ZSTD_getDictID_fromDict() - returns the dictionary id stored in a dictionary + * @dict: The dictionary buffer. + * @dictSize: The size of the dictionary buffer. + * + * Return: The dictionary id stored within the dictionary or 0 if the + * dictionary is not a zstd dictionary. 
If it returns 0 the
+ *         dictionary can still be loaded as a content-only dictionary.
+ */
+unsigned int ZSTD_getDictID_fromDict(const void *dict, size_t dictSize);
+
+/**
+ * ZSTD_getDictID_fromDDict() - returns the dictionary id stored in a ZSTD_DDict
+ * @ddict: The ddict to find the id of.
+ *
+ * Return: The dictionary id stored within `ddict` or 0 if the dictionary is not
+ *         a zstd dictionary. If it returns 0 `ddict` will be loaded as a
+ *         content-only dictionary.
+ */
+unsigned int ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict);
+
+/**
+ * ZSTD_getDictID_fromFrame() - returns the dictionary id stored in a zstd frame
+ * @src:     Source buffer. It must be a zstd encoded frame.
+ * @srcSize: The size of the source buffer. It must be at least as large as the
+ *           frame header. `ZSTD_frameHeaderSize_max` is always large enough.
+ *
+ * Return: The dictionary id required to decompress the frame stored within
+ *         `src` or 0 if the dictionary id could not be decoded. It can return
+ *         0 if the frame does not require a dictionary, the dictionary id
+ *         wasn't stored in the frame, `src` is not a zstd frame, or `srcSize`
+ *         is too small.
+ */
+unsigned int ZSTD_getDictID_fromFrame(const void *src, size_t srcSize);
+
+/**
+ * struct ZSTD_frameParams - zstd frame parameters stored in the frame header
+ * @frameContentSize: The frame content size, or 0 if not present.
+ * @windowSize:       The window size, or 0 if the frame is a skippable frame.
+ * @dictID:           The dictionary id, or 0 if not present.
+ * @checksumFlag:     Whether a checksum was used.
+ */
+typedef struct {
+	unsigned long long frameContentSize;
+	unsigned int windowSize;
+	unsigned int dictID;
+	unsigned int checksumFlag;
+} ZSTD_frameParams;
+
+/**
+ * ZSTD_getFrameParams() - extracts parameters from a zstd or skippable frame
+ * @fparamsPtr: On success the frame parameters are written here.
+ * @src:        The source buffer. It must point to a zstd or skippable frame.
+ * @srcSize:    The size of the source buffer. `ZSTD_frameHeaderSize_max` is
+ *              always large enough to succeed.
+ *
+ * Return: 0 on success. If more data is required it returns how many bytes
+ *         must be provided to make forward progress. Otherwise it returns
+ *         an error, which can be checked using ZSTD_isError().
+ */
+size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src,
+	size_t srcSize);
+
+/*-*****************************************************************************
+ * Buffer-less and synchronous inner streaming functions
+ *
+ * This is an advanced API, giving full control over buffer management, for
+ * users who need direct control over memory.
+ * But it's also a complex one, with many restrictions (documented below).
+ * Prefer using the normal streaming API for an easier experience.
+ ******************************************************************************/
+
+/*-*****************************************************************************
+ * Buffer-less streaming compression (synchronous mode)
+ *
+ * A ZSTD_CCtx object is required to track streaming operations.
+ * Use ZSTD_initCCtx() to initialize a context.
+ * A ZSTD_CCtx object can be re-used multiple times within successive
+ * compression operations.
+ *
+ * Start by initializing a context.
+ * Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary
+ * compression, or ZSTD_compressBegin_advanced(), for finer parameter control.
+ * It's also possible to duplicate a reference context which has already been
+ * initialized, using ZSTD_copyCCtx().
+ *
+ * Then, consume your input using ZSTD_compressContinue().
+ * There are some important considerations to keep in mind when using this
+ * advanced function:
+ * - ZSTD_compressContinue() has no internal buffer. It uses only externally
+ *   provided buffers.
+ * - The interface is synchronous: input is consumed entirely and produces one
+ *   or more compressed blocks.
+ * - The caller must ensure there is enough space in `dst` to store the
+ *   compressed data under the worst case scenario. Worst case evaluation is
+ *   provided by ZSTD_compressBound().
+ *   ZSTD_compressContinue() doesn't guarantee recovery after a failed
+ *   compression.
+ * - ZSTD_compressContinue() presumes prior input ***is still accessible and
+ *   unmodified*** (up to the maximum distance size, see WindowLog).
+ *   It remembers all previous contiguous blocks, plus one separate memory
+ *   segment (which can itself consist of multiple contiguous blocks).
+ * - ZSTD_compressContinue() detects that prior input has been overwritten when
+ *   the `src` buffer overlaps; in that case, it will "discard" the relevant
+ *   memory section from its history.
+ *
+ * Finish a frame with ZSTD_compressEnd(), which will write the last block(s)
+ * and optional checksum. It's possible to use srcSize==0, in which case it
+ * will write a final empty block to end the frame. Without the last block
+ * mark, frames will be considered unfinished (corrupted) by decoders.
+ *
+ * A `ZSTD_CCtx` object can be reused (via ZSTD_compressBegin()) to compress a
+ * new frame.
+ ******************************************************************************/
+
+/*===== Buffer-less streaming compression functions =====*/
+size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel);
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict,
+	size_t dictSize, int compressionLevel);
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict,
+	size_t dictSize, ZSTD_parameters params,
+	unsigned long long pledgedSrcSize);
+size_t ZSTD_copyCCtx(ZSTD_CCtx *cctx, const ZSTD_CCtx *preparedCCtx,
+	unsigned long long pledgedSrcSize);
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict,
+	unsigned long long pledgedSrcSize);
+size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+
+/*-*****************************************************************************
+ * Buffer-less streaming decompression (synchronous mode)
+ *
+ * A ZSTD_DCtx object is required to track streaming operations.
+ * Use ZSTD_initDCtx() to initialize a context.
+ * A ZSTD_DCtx object can be reused multiple times.
+ *
+ * The first typical operation is to retrieve frame parameters, using
+ * ZSTD_getFrameParams(). It fills a ZSTD_frameParams structure which provides
+ * important information to correctly decode the frame, such as the minimum
+ * rolling buffer size to allocate to decompress data (`windowSize`), and the
+ * dictionary ID used.
+ * Note: content size is optional, it may not be present. 0 means unknown.
+ * Note that these values could be wrong, either because of data malformation,
+ * or because an attacker is deliberately spoofing false information.
+ * As a consequence, check that the values remain within a valid application
+ * range, especially `windowSize`, before allocation. Each application can set
+ * its own limit, depending on local restrictions. For extended
+ * interoperability, it is recommended to support at least 8 MB.
+ * Frame parameters are extracted from the beginning of the compressed frame.
+ * The data fragment must be large enough to ensure successful decoding,
+ * typically `ZSTD_frameHeaderSize_max` bytes.
+ * Result:  0 : successful decoding, the `ZSTD_frameParams` structure is filled.
+ *         >0 : `srcSize` is too small, provide at least this many bytes.
+ *         or an error code, which can be tested using ZSTD_isError().
+ *
+ * Start decompression with ZSTD_decompressBegin() or
+ * ZSTD_decompressBegin_usingDict(). Alternatively, you can copy a prepared
+ * context, using ZSTD_copyDCtx().
+ *
+ * Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue()
+ * alternately.
+ * ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as `srcSize`
+ * to ZSTD_decompressContinue().
+ * ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will
+ * fail.
+ *
+ * The result of ZSTD_decompressContinue() is the number of bytes regenerated
+ * within `dst` (necessarily <= dstCapacity). It can be zero, which is not an
+ * error; it just means ZSTD_decompressContinue() has decoded some metadata
+ * item. It can also be an error code, which can be tested with ZSTD_isError().
+ *
+ * ZSTD_decompressContinue() needs previous data blocks during decompression,
+ * up to `windowSize`. They should preferably be located contiguously, prior to
+ * the current block. Alternatively, a round buffer of sufficient size is also
+ * possible. Sufficient size is determined by the frame parameters.
+ * ZSTD_decompressContinue() is very sensitive to contiguity; if two blocks
+ * don't follow each other, make sure that either the compressor breaks
+ * contiguity at the same place, or that the previous contiguous segment is
+ * large enough to properly handle the maximum back-reference.
+ *
+ * A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ * The context can then be reset to start a new decompression.
+ *
+ * Note: it's possible to know whether the next input to present is a header or
+ * a block, using ZSTD_nextInputType(). This information is not required to
+ * properly decode a frame.
+ *
+ * == Special case: skippable frames ==
+ *
+ * Skippable frames allow integration of user-defined data into a flow of
+ * concatenated frames. Skippable frames will be ignored (skipped) by a
+ * decompressor. The format of skippable frames is as follows:
+ * a) Skippable frame ID - 4 Bytes, little-endian format, any value from
+ *    0x184D2A50 to 0x184D2A5F
+ * b) Frame Size - 4 Bytes, little-endian format, unsigned 32 bits
+ * c) Frame Content - any content (User Data) of length equal to Frame Size
+ * For skippable frames ZSTD_decompressContinue() always returns 0.
+ * For skippable frames ZSTD_getFrameParams() returns
+ * fparamsPtr->windowSize == 0, which means that the frame is skippable.
+ * Note: If fparamsPtr->frameContentSize == 0, it is ambiguous: the frame might
+ *   actually be a zstd encoded frame with no content. For purposes of
+ *   decompression, it is valid in both cases to skip the frame using
+ *   ZSTD_findFrameCompressedSize() to find its size in bytes.
+ * ZSTD_getFrameParams() also returns the skippable frame size in
+ * fparamsPtr->frameContentSize.
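+ *
+ * As an illustration, the decoding loop described above can be sketched as
+ * follows (a sketch only: error handling and `out` bounds bookkeeping are
+ * omitted, and `fillInput` is a hypothetical helper that supplies exactly
+ * `toRead` bytes in `inBuf`):
+ *
+ *   ZSTD_decompressBegin(dctx);
+ *   while ((toRead = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+ *       fillInput(inBuf, toRead);
+ *       got = ZSTD_decompressContinue(dctx, out, outCapacity, inBuf, toRead);
+ *       out = (char *)out + got;  // got may be 0 (metadata, skippable data)
+ *   }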
+ ******************************************************************************/
+
+/*===== Buffer-less streaming decompression functions =====*/
+size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx);
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict,
+	size_t dictSize);
+void ZSTD_copyDCtx(ZSTD_DCtx *dctx, const ZSTD_DCtx *preparedDCtx);
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx);
+size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity,
+	const void *src, size_t srcSize);
+typedef enum {
+	ZSTDnit_frameHeader,
+	ZSTDnit_blockHeader,
+	ZSTDnit_block,
+	ZSTDnit_lastBlock,
+	ZSTDnit_checksum,
+	ZSTDnit_skippableFrame
+} ZSTD_nextInputType_e;
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx);
+
+/*-*****************************************************************************
+ * Block functions
+ *
+ * Block functions produce and decode raw zstd blocks, without frame metadata.
+ * Frame metadata cost is typically ~18 bytes, which can be non-negligible for
+ * very small blocks (< 100 bytes). The user must keep track of the information
+ * required to regenerate the data, such as the compressed and content sizes.
+ *
+ * A few rules to respect:
+ * - Compressing and decompressing require a context structure
+ *   + Use ZSTD_initCCtx() and ZSTD_initDCtx()
+ * - It is necessary to init the context before starting
+ *   + compression : ZSTD_compressBegin()
+ *   + decompression : ZSTD_decompressBegin()
+ *   + variants _usingDict() are also allowed
+ *   + copyCCtx() and copyDCtx() work too
+ * - Block size is limited, it must be <= ZSTD_getBlockSizeMax()
+ *   + If you need to compress more, cut data into multiple blocks
+ *   + Consider using the regular ZSTD_compress() instead, as frame metadata
+ *     costs become negligible when the source size is large.
+ * - When a block is considered not compressible enough, the
+ *   ZSTD_compressBlock() result will be zero; in that case, nothing is
+ *   produced into `dst`.
+ *   + The user must test for such an outcome and deal directly with the
+ *     uncompressed data
+ *   + ZSTD_decompressBlock() doesn't accept uncompressed data as input!
+ *   + In the case of multiple successive blocks, the decoder must be informed
+ *     that an uncompressed block exists so that it can maintain a proper
+ *     history. Use ZSTD_insertBlock() in such a case; a sketch follows this
+ *     list.
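+ *
+ * A minimal compression-side sketch of the rules above (hypothetical, error
+ * checks omitted; `dst` is assumed large enough, see ZSTD_compressBound()):
+ *
+ *   ZSTD_compressBegin(cctx, 3);  // level 3, chosen arbitrarily
+ *   cSize = ZSTD_compressBlock(cctx, dst, dstCapacity, src, srcSize);
+ *   if (cSize == 0) {
+ *       // not compressible: transmit src raw; the decoder must then call
+ *       // ZSTD_insertBlock(dctx, src, srcSize) to keep its history correct
+ *   }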
+ ******************************************************************************/ + +/* Define for static allocation */ +#define ZSTD_BLOCKSIZE_ABSOLUTEMAX (128 * 1024) +/*===== Raw zstd block functions =====*/ +size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx); +size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, + const void *src, size_t srcSize); +size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, + size_t blockSize); + +#endif /* ZSTD_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 5e7541f46081..0d49ed0e6253 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -249,6 +249,14 @@ config LZ4HC_COMPRESS config LZ4_DECOMPRESS tristate +config ZSTD_COMPRESS + select XXHASH + tristate + +config ZSTD_DECOMPRESS + select XXHASH + tristate + source "lib/xz/Kconfig" # diff --git a/lib/Makefile b/lib/Makefile index d06b68ae55a3..d5c8a4ffff80 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -116,6 +116,8 @@ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ obj-$(CONFIG_LZ4_COMPRESS) += lz4/ obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ +obj-$(CONFIG_ZSTD_COMPRESS) += zstd/ +obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile new file mode 100644 index 000000000000..dd0a359c135b --- /dev/null +++ b/lib/zstd/Makefile @@ -0,0 +1,18 @@ +obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o +obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o + +ccflags-y += -O3 + +# Object files unique to zstd_compress and zstd_decompress +zstd_compress-y := fse_compress.o huf_compress.o compress.o +zstd_decompress-y := huf_decompress.o decompress.o + +# These object files are shared between the modules. +# Always add them to zstd_compress. +# Unless both zstd_compress and zstd_decompress are built in +# then also add them to zstd_decompress. +zstd_compress-y += entropy_common.o fse_decompress.o zstd_common.o + +ifneq ($(CONFIG_ZSTD_COMPRESS)$(CONFIG_ZSTD_DECOMPRESS),yy) + zstd_decompress-y += entropy_common.o fse_decompress.o zstd_common.o +endif diff --git a/lib/zstd/bitstream.h b/lib/zstd/bitstream.h new file mode 100644 index 000000000000..a826b99e1d63 --- /dev/null +++ b/lib/zstd/bitstream.h @@ -0,0 +1,374 @@ +/* + * bitstream + * Part of FSE library + * header file (to include) + * Copyright (C) 2013-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+/*
+* This API consists of small unitary functions, which must be inlined for best performance.
+* Since link-time-optimization is not available for all compilers,
+* these functions are defined in a header to be included.
+*/
+
+/*-****************************************
+* Dependencies
+******************************************/
+#include "error_private.h" /* error codes and messages */
+#include "mem.h"	   /* unaligned access routines */
+
+/*=========================================
+* Target specific
+=========================================*/
+#define STREAM_ACCUMULATOR_MIN_32 25
+#define STREAM_ACCUMULATOR_MIN_64 57
+#define STREAM_ACCUMULATOR_MIN ((U32)(ZSTD_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+/*-******************************************
+* bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+* A critical property of these streams is that they encode and decode in **reverse** direction.
+* So the first bit sequence you add will be the last to be read, like a LIFO stack.
+*/
+typedef struct {
+	size_t bitContainer;
+	int bitPos;
+	char *startPtr;
+	char *ptr;
+	char *endPtr;
+} BIT_CStream_t;
+
+ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *dstBuffer, size_t dstCapacity);
+ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits);
+ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC);
+ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC);
+
+/* Start with initCStream, providing the size of the buffer to write into.
+* bitStream will never write outside of this buffer.
+* `dstCapacity` must be >= sizeof(bitC->bitContainer), otherwise @return will be an error code.
+*
+* bits are first added to a local register.
+* The local register is a size_t, hence 64 bits on 64-bit systems, or 32 bits on 32-bit systems.
+* Writing data into memory is an explicit operation, performed by the flushBits function.
+* Hence keep track of how many bits are potentially stored in the local register to avoid register overflow.
+* After a flushBits, a maximum of 7 bits might still be stored in the local register.
+*
+* Avoid storing elements of more than 24 bits if you want compatibility with 32-bit bitstream readers.
+*
+* The last operation is to close the bitStream.
+* The function returns the final size of CStream in bytes.
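+* For example, a minimal sketch (dst and dstCapacity are supplied by the
+* caller; the value and bit count are arbitrary):
+*   BIT_CStream_t bitC;
+*   BIT_initCStream(&bitC, dst, dstCapacity);
+*   BIT_addBits(&bitC, 5, 3);              // queue the 3 low bits of 5
+*   BIT_flushBits(&bitC);                  // commit the local register to memory
+*   streamSize = BIT_closeCStream(&bitC);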
+* If data couldn't fit into `dstBuffer`, it will return 0 (== not storable)
+*/
+
+/*-********************************************
+* bitStream decoding API (read backward)
+**********************************************/
+typedef struct {
+	size_t bitContainer;
+	unsigned bitsConsumed;
+	const char *ptr;
+	const char *start;
+} BIT_DStream_t;
+
+typedef enum {
+	BIT_DStream_unfinished = 0,
+	BIT_DStream_endOfBuffer = 1,
+	BIT_DStream_completed = 2,
+	BIT_DStream_overflow = 3
+} BIT_DStream_status; /* result of BIT_reloadDStream() */
+/* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize);
+ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, unsigned nbBits);
+ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD);
+ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *bitD);
+
+/* Start by invoking BIT_initDStream().
+* A chunk of the bitStream is then stored into a local register.
+* Local register size is 64 bits on 64-bit systems, 32 bits on 32-bit systems (size_t).
+* You can then retrieve bitFields stored into the local register, **in reverse order**.
+* The local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+* A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+* Otherwise, it can be less than that, so proceed accordingly.
+* Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
+
+/*-****************************************
+* unsafe API
+******************************************/
+ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC);
+/* unsafe version; does not check buffer overflow */
+
+ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+/*-**************************************************************
+* Internal functions
+****************************************************************/
+ZSTD_STATIC unsigned BIT_highbit32(register U32 val) { return 31 - __builtin_clz(val); }
+
+/*===== Local Constants =====*/
+static const unsigned BIT_mask[] = {0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
+	0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF}; /* up to 26 bits */
+
+/*-**************************************************************
+* bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ *  `dstCapacity` must be > sizeof(void*)
+ *  @return : 0 if success,
+ *            otherwise an error code (can be tested using ERR_isError()) */
+ZSTD_STATIC size_t BIT_initCStream(BIT_CStream_t *bitC, void *startPtr, size_t dstCapacity)
+{
+	bitC->bitContainer = 0;
+	bitC->bitPos = 0;
+	bitC->startPtr = (char *)startPtr;
+	bitC->ptr = bitC->startPtr;
+	bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->ptr);
+	if (dstCapacity <= sizeof(bitC->ptr))
+		return ERROR(dstSize_tooSmall);
+	return 0;
+}
+
+/*! BIT_addBits() :
+    can add up to 26 bits into `bitC`.
+    Does not check for register overflow !
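+    For example, BIT_addBits(&bitC, 0x2A, 3) queues only the 3 low bits
+    (010b), since `value` is first masked with BIT_mask[nbBits].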
+ */
+ZSTD_STATIC void BIT_addBits(BIT_CStream_t *bitC, size_t value, unsigned nbBits)
+{
+	bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+	bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ *  works only if `value` is _clean_, meaning all high bits above nbBits are 0 */
+ZSTD_STATIC void BIT_addBitsFast(BIT_CStream_t *bitC, size_t value, unsigned nbBits)
+{
+	bitC->bitContainer |= value << bitC->bitPos;
+	bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ *  unsafe version; does not check buffer overflow */
+ZSTD_STATIC void BIT_flushBitsFast(BIT_CStream_t *bitC)
+{
+	size_t const nbBytes = bitC->bitPos >> 3;
+	ZSTD_writeLEST(bitC->ptr, bitC->bitContainer);
+	bitC->ptr += nbBytes;
+	bitC->bitPos &= 7;
+	bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_flushBits() :
+ *  safe version; checks for buffer overflow, and prevents it.
+ *  note : does not signal buffer overflow. This will be revealed later on using BIT_closeCStream() */
+ZSTD_STATIC void BIT_flushBits(BIT_CStream_t *bitC)
+{
+	size_t const nbBytes = bitC->bitPos >> 3;
+	ZSTD_writeLEST(bitC->ptr, bitC->bitContainer);
+	bitC->ptr += nbBytes;
+	if (bitC->ptr > bitC->endPtr)
+		bitC->ptr = bitC->endPtr;
+	bitC->bitPos &= 7;
+	bitC->bitContainer >>= nbBytes * 8; /* if bitPos >= sizeof(bitContainer)*8 --> undefined behavior */
+}
+
+/*! BIT_closeCStream() :
+ *  @return : size of CStream, in bytes,
+ *            or 0 if it could not fit into dstBuffer */
+ZSTD_STATIC size_t BIT_closeCStream(BIT_CStream_t *bitC)
+{
+	BIT_addBitsFast(bitC, 1, 1); /* endMark */
+	BIT_flushBits(bitC);
+
+	if (bitC->ptr >= bitC->endPtr)
+		return 0; /* doesn't fit within authorized budget : cancel */
+
+	return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+*   Initialize a BIT_DStream_t.
+*   `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+*   `srcSize` must be the *exact* size of the bitStream, in bytes.
+*   @return : size of stream (== srcSize) or an errorCode if a problem is detected
+*/
+ZSTD_STATIC size_t BIT_initDStream(BIT_DStream_t *bitD, const void *srcBuffer, size_t srcSize)
+{
+	if (srcSize < 1) {
+		memset(bitD, 0, sizeof(*bitD));
+		return ERROR(srcSize_wrong);
+	}
+
+	if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
+		bitD->start = (const char *)srcBuffer;
+		bitD->ptr = (const char *)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+		bitD->bitContainer = ZSTD_readLEST(bitD->ptr);
+		{
+			BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1];
+			bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
+			if (lastByte == 0)
+				return ERROR(GENERIC); /* endMark not present */
+		}
+	} else {
+		bitD->start = (const char *)srcBuffer;
+		bitD->ptr = bitD->start;
+		bitD->bitContainer = *(const BYTE *)(bitD->start);
+		switch (srcSize) {
+		case 7: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[6]) << (sizeof(bitD->bitContainer) * 8 - 16);
+			/* fallthrough */
+		case 6: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[5]) << (sizeof(bitD->bitContainer) * 8 - 24);
+			/* fallthrough */
+		case 5: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[4]) << (sizeof(bitD->bitContainer) * 8 - 32);
+			/* fallthrough */
+		case 4: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[3]) << 24;
+			/* fallthrough */
+		case 3: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[2]) << 16;
+			/* fallthrough */
+		case 2: bitD->bitContainer += (size_t)(((const BYTE *)(srcBuffer))[1]) << 8;
+			/* fallthrough */
+		default:;
+		}
+		{
+			BYTE const lastByte = ((const BYTE *)srcBuffer)[srcSize - 1];
+			bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+			if (lastByte == 0)
+				return ERROR(GENERIC); /* endMark not present */
+		}
+		bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize) * 8;
+	}
+
+	return srcSize;
+}
+
+ZSTD_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) { return bitContainer >> start; }
+
+ZSTD_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) { return (bitContainer >> start) & BIT_mask[nbBits]; }
+
+ZSTD_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) { return bitContainer & BIT_mask[nbBits]; }
+
+/*! BIT_lookBits() :
+ *  Provides next n bits from the local register.
+ *  The local register is not modified.
+ *  On 32-bits, maxNbBits==24.
+ *  On 64-bits, maxNbBits==56.
+ *  @return : value extracted
+ */
+ZSTD_STATIC size_t BIT_lookBits(const BIT_DStream_t *bitD, U32 nbBits)
+{
+	U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1;
+	return ((bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> 1) >> ((bitMask - nbBits) & bitMask);
+}
+
+/*! BIT_lookBitsFast() :
+*   unsafe version; works only if nbBits >= 1 */
+ZSTD_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t *bitD, U32 nbBits)
+{
+	U32 const bitMask = sizeof(bitD->bitContainer) * 8 - 1;
+	return (bitD->bitContainer << (bitD->bitsConsumed & bitMask)) >> (((bitMask + 1) - nbBits) & bitMask);
+}
+
+ZSTD_STATIC void BIT_skipBits(BIT_DStream_t *bitD, U32 nbBits) { bitD->bitsConsumed += nbBits; }
+
+/*! BIT_readBits() :
+ *  Read (consume) next n bits from the local register and update.
+ *  Pay attention not to read more than nbBits contained in the local register.
+ *  @return : extracted value.
+ */
+ZSTD_STATIC size_t BIT_readBits(BIT_DStream_t *bitD, U32 nbBits)
+{
+	size_t const value = BIT_lookBits(bitD, nbBits);
+	BIT_skipBits(bitD, nbBits);
+	return value;
+}
+
+/*! BIT_readBitsFast() :
+*   unsafe version; works only if nbBits >= 1 */
+ZSTD_STATIC size_t BIT_readBitsFast(BIT_DStream_t *bitD, U32 nbBits)
+{
+	size_t const value = BIT_lookBitsFast(bitD, nbBits);
+	BIT_skipBits(bitD, nbBits);
+	return value;
+}
+
+/*! BIT_reloadDStream() :
+*   Refill `bitD` from the buffer previously set in BIT_initDStream().
+*   This function is safe: it guarantees it will not read beyond the src buffer.
+*   @return : status of `BIT_DStream_t` internal register.
+ if status == BIT_DStream_unfinished, internal register is filled with >= (sizeof(bitD->bitContainer)*8 - 7) bits */ +ZSTD_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t *bitD) +{ + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer) * 8)) /* should not happen => corruption detected */ + return BIT_DStream_overflow; + + if (bitD->ptr >= bitD->start + sizeof(bitD->bitContainer)) { + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = ZSTD_readLEST(bitD->ptr); + return BIT_DStream_unfinished; + } + if (bitD->ptr == bitD->start) { + if (bitD->bitsConsumed < sizeof(bitD->bitContainer) * 8) + return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } + { + U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { + nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */ + result = BIT_DStream_endOfBuffer; + } + bitD->ptr -= nbBytes; + bitD->bitsConsumed -= nbBytes * 8; + bitD->bitContainer = ZSTD_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD) */ + return result; + } +} + +/*! BIT_endOfDStream() : +* @return Tells if DStream has exactly reached its end (all bits consumed). +*/ +ZSTD_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t *DStream) +{ + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer) * 8)); +} + +#endif /* BITSTREAM_H_MODULE */ diff --git a/lib/zstd/compress.c b/lib/zstd/compress.c new file mode 100644 index 000000000000..f9166cf4f7a9 --- /dev/null +++ b/lib/zstd/compress.c @@ -0,0 +1,3484 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). 
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include "fse.h"
+#include "huf.h"
+#include "mem.h"
+#include "zstd_internal.h" /* includes zstd.h */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h> /* memset */
+
+/*-*************************************
+* Constants
+***************************************/
+static const U32 g_searchStrength = 8; /* control skip over incompressible data */
+#define HASH_READ_SIZE 8
+typedef enum { ZSTDcs_created = 0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
+
+/*-*************************************
+* Helper functions
+***************************************/
+size_t ZSTD_compressBound(size_t srcSize) { return FSE_compressBound(srcSize) + 12; }
+
+/*-*************************************
+* Sequence storage
+***************************************/
+static void ZSTD_resetSeqStore(seqStore_t *ssPtr)
+{
+	ssPtr->lit = ssPtr->litStart;
+	ssPtr->sequences = ssPtr->sequencesStart;
+	ssPtr->longLengthID = 0;
+}
+
+/*-*************************************
+* Context memory management
+***************************************/
+struct ZSTD_CCtx_s {
+	const BYTE *nextSrc;  /* next block here to continue on curr prefix */
+	const BYTE *base;     /* All regular indexes relative to this position */
+	const BYTE *dictBase; /* extDict indexes relative to this position */
+	U32 dictLimit;	      /* below that point, need extDict */
+	U32 lowLimit;	      /* below that point, no more data */
+	U32 nextToUpdate;     /* index from which to continue dictionary update */
+	U32 nextToUpdate3;    /* index from which to continue dictionary update */
+	U32 hashLog3;	      /* dispatch table : larger == faster, more memory */
+	U32 loadedDictEnd;    /* index of end of dictionary */
+	U32 forceWindow;      /* force back-references to respect limit of 1<<wLog, even for dictionary */
+	ZSTD_compressionStage_e stage;
+	U32 rep[ZSTD_REP_NUM];
+	U32 repToConfirm[ZSTD_REP_NUM];
+	U32 dictID;
+	ZSTD_parameters params;
+	void *workSpace;
+	size_t workSpaceSize;
+	size_t blockSize;
+	U64 frameContentSize;
+	struct xxh64_state xxhState;
+	ZSTD_customMem customMem;
+
+	seqStore_t seqStore; /* sequences storage ptrs */
+	U32 *hashTable;
+	U32 *hashTable3;
+	U32 *chainTable;
+	HUF_CElt *hufTable;
+	U32 flagStaticTables;
+	HUF_repeat flagStaticHufTable;
+	FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+	FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+	FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+	unsigned tmpCounters[HUF_COMPRESS_WORKSPACE_SIZE_U32];
+};
+
+size_t ZSTD_CCtxWorkspaceBound(ZSTD_compressionParameters cParams)
+{
+	size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << cParams.windowLog);
+	U32 const divider = (cParams.searchLength == 3) ? 3 : 4;
+	size_t const maxNbSeq = blockSize / divider;
+	size_t const tokenSpace = blockSize + 11 * maxNbSeq;
+	size_t const chainSize = (cParams.strategy == ZSTD_fast) ? 0 : (1 << cParams.chainLog);
+	size_t const hSize = ((size_t)1) << cParams.hashLog;
+	U32 const hashLog3 = (cParams.searchLength > 3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, cParams.windowLog);
+	size_t const h3Size = ((size_t)1) << hashLog3;
+	size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32);
+	size_t const optSpace =
+		((MaxML + 1) + (MaxLL + 1) + (MaxOff + 1) + (1 << Litbits)) * sizeof(U32) + (ZSTD_OPT_NUM + 1) * (sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t));
+	size_t const workspaceSize = tableSpace + (256 * sizeof(U32)) /* huffTable */ + tokenSpace +
+				     (((cParams.strategy == ZSTD_btopt) || (cParams.strategy == ZSTD_btopt2)) ?
+									      optSpace : 0);
+
+	return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_CCtx)) + ZSTD_ALIGN(workspaceSize);
+}
+
+static ZSTD_CCtx *ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+	ZSTD_CCtx *cctx;
+	if (!customMem.customAlloc || !customMem.customFree)
+		return NULL;
+	cctx = (ZSTD_CCtx *)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
+	if (!cctx)
+		return NULL;
+	memset(cctx, 0, sizeof(ZSTD_CCtx));
+	cctx->customMem = customMem;
+	return cctx;
+}
+
+ZSTD_CCtx *ZSTD_initCCtx(void *workspace, size_t workspaceSize)
+{
+	ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize);
+	ZSTD_CCtx *cctx = ZSTD_createCCtx_advanced(stackMem);
+	if (cctx) {
+		cctx->workSpace = ZSTD_stackAllocAll(cctx->customMem.opaque, &cctx->workSpaceSize);
+	}
+	return cctx;
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx)
+{
+	if (cctx == NULL)
+		return 0; /* support free on NULL */
+	ZSTD_free(cctx->workSpace, cctx->customMem);
+	ZSTD_free(cctx, cctx->customMem);
+	return 0; /* reserved as a potential error code in the future */
+}
+
+const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx) /* hidden interface */ { return &(ctx->seqStore); }
+
+static ZSTD_parameters ZSTD_getParamsFromCCtx(const ZSTD_CCtx *cctx) { return cctx->params; }
+
+/** ZSTD_checkCParams() :
+    ensure param values remain within authorized range.
+    @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+#define CLAMPCHECK(val, min, max)                                       \
+	{                                                               \
+		if ((val < min) | (val > max))                          \
+			return ERROR(compressionParameter_unsupported); \
+	}
+	CLAMPCHECK(cParams.windowLog, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
+	CLAMPCHECK(cParams.chainLog, ZSTD_CHAINLOG_MIN, ZSTD_CHAINLOG_MAX);
+	CLAMPCHECK(cParams.hashLog, ZSTD_HASHLOG_MIN, ZSTD_HASHLOG_MAX);
+	CLAMPCHECK(cParams.searchLog, ZSTD_SEARCHLOG_MIN, ZSTD_SEARCHLOG_MAX);
+	CLAMPCHECK(cParams.searchLength, ZSTD_SEARCHLENGTH_MIN, ZSTD_SEARCHLENGTH_MAX);
+	CLAMPCHECK(cParams.targetLength, ZSTD_TARGETLENGTH_MIN, ZSTD_TARGETLENGTH_MAX);
+	if ((U32)(cParams.strategy) > (U32)ZSTD_btopt2)
+		return ERROR(compressionParameter_unsupported);
+	return 0;
+}
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+static U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+	U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+	return hashLog - btScale;
+}
+
+/** ZSTD_adjustCParams() :
+    optimize `cPar` for a given input (`srcSize` and `dictSize`).
+    mostly downsizing to reduce memory consumption and initialization.
+    Both `srcSize` and `dictSize` are optional (use 0 if unknown),
+    but if both are 0, no optimization can be done.
+    Note : cPar is considered validated at this stage. Use ZSTD_checkCParams() to ensure that. */
+ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize)
+{
+	if (srcSize + dictSize == 0)
+		return cPar; /* no size information available : no adjustment */
+
+	/* resize params, to use less memory when necessary */
+	{
+		U32 const minSrcSize = (srcSize == 0) ?
500 : 0; + U64 const rSize = srcSize + dictSize + minSrcSize; + if (rSize < ((U64)1 << ZSTD_WINDOWLOG_MAX)) { + U32 const srcLog = MAX(ZSTD_HASHLOG_MIN, ZSTD_highbit32((U32)(rSize)-1) + 1); + if (cPar.windowLog > srcLog) + cPar.windowLog = srcLog; + } + } + if (cPar.hashLog > cPar.windowLog) + cPar.hashLog = cPar.windowLog; + { + U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cycleLog > cPar.windowLog) + cPar.chainLog -= (cycleLog - cPar.windowLog); + } + + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* required for frame header */ + + return cPar; +} + +static U32 ZSTD_equivalentParams(ZSTD_parameters param1, ZSTD_parameters param2) +{ + return (param1.cParams.hashLog == param2.cParams.hashLog) & (param1.cParams.chainLog == param2.cParams.chainLog) & + (param1.cParams.strategy == param2.cParams.strategy) & ((param1.cParams.searchLength == 3) == (param2.cParams.searchLength == 3)); +} + +/*! ZSTD_continueCCtx() : + reuse CCtx without reset (note : requires no dictionary) */ +static size_t ZSTD_continueCCtx(ZSTD_CCtx *cctx, ZSTD_parameters params, U64 frameContentSize) +{ + U32 const end = (U32)(cctx->nextSrc - cctx->base); + cctx->params = params; + cctx->frameContentSize = frameContentSize; + cctx->lowLimit = end; + cctx->dictLimit = end; + cctx->nextToUpdate = end + 1; + cctx->stage = ZSTDcs_init; + cctx->dictID = 0; + cctx->loadedDictEnd = 0; + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + cctx->rep[i] = repStartValue[i]; + } + cctx->seqStore.litLengthSum = 0; /* force reset of btopt stats */ + xxh64_reset(&cctx->xxhState, 0); + return 0; +} + +typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e; + +/*! ZSTD_resetCCtx_advanced() : + note : `params` must be validated */ +static size_t ZSTD_resetCCtx_advanced(ZSTD_CCtx *zc, ZSTD_parameters params, U64 frameContentSize, ZSTD_compResetPolicy_e const crp) +{ + if (crp == ZSTDcrp_continue) + if (ZSTD_equivalentParams(params, zc->params)) { + zc->flagStaticTables = 0; + zc->flagStaticHufTable = HUF_repeat_none; + return ZSTD_continueCCtx(zc, params, frameContentSize); + } + + { + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog); + U32 const divider = (params.cParams.searchLength == 3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = blockSize + 11 * maxNbSeq; + size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog); + size_t const hSize = ((size_t)1) << params.cParams.hashLog; + U32 const hashLog3 = (params.cParams.searchLength > 3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog); + size_t const h3Size = ((size_t)1) << hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + void *ptr; + + /* Check if workSpace is large enough, alloc a new one if needed */ + { + size_t const optSpace = ((MaxML + 1) + (MaxLL + 1) + (MaxOff + 1) + (1 << Litbits)) * sizeof(U32) + + (ZSTD_OPT_NUM + 1) * (sizeof(ZSTD_match_t) + sizeof(ZSTD_optimal_t)); + size_t const neededSpace = tableSpace + (256 * sizeof(U32)) /* huffTable */ + tokenSpace + + (((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) ? 
optSpace : 0); + if (zc->workSpaceSize < neededSpace) { + ZSTD_free(zc->workSpace, zc->customMem); + zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem); + if (zc->workSpace == NULL) + return ERROR(memory_allocation); + zc->workSpaceSize = neededSpace; + } + } + + if (crp != ZSTDcrp_noMemset) + memset(zc->workSpace, 0, tableSpace); /* reset tables only */ + xxh64_reset(&zc->xxhState, 0); + zc->hashLog3 = hashLog3; + zc->hashTable = (U32 *)(zc->workSpace); + zc->chainTable = zc->hashTable + hSize; + zc->hashTable3 = zc->chainTable + chainSize; + ptr = zc->hashTable3 + h3Size; + zc->hufTable = (HUF_CElt *)ptr; + zc->flagStaticTables = 0; + zc->flagStaticHufTable = HUF_repeat_none; + ptr = ((U32 *)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */ + + zc->nextToUpdate = 1; + zc->nextSrc = NULL; + zc->base = NULL; + zc->dictBase = NULL; + zc->dictLimit = 0; + zc->lowLimit = 0; + zc->params = params; + zc->blockSize = blockSize; + zc->frameContentSize = frameContentSize; + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + zc->rep[i] = repStartValue[i]; + } + + if ((params.cParams.strategy == ZSTD_btopt) || (params.cParams.strategy == ZSTD_btopt2)) { + zc->seqStore.litFreq = (U32 *)ptr; + zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1 << Litbits); + zc->seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL + 1); + zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML + 1); + ptr = zc->seqStore.offCodeFreq + (MaxOff + 1); + zc->seqStore.matchTable = (ZSTD_match_t *)ptr; + ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM + 1; + zc->seqStore.priceTable = (ZSTD_optimal_t *)ptr; + ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM + 1; + zc->seqStore.litLengthSum = 0; + } + zc->seqStore.sequencesStart = (seqDef *)ptr; + ptr = zc->seqStore.sequencesStart + maxNbSeq; + zc->seqStore.llCode = (BYTE *)ptr; + zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq; + zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq; + zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq; + + zc->stage = ZSTDcs_init; + zc->dictID = 0; + zc->loadedDictEnd = 0; + + return 0; + } +} + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + cctx->rep[i] = 0; +} + +/*! ZSTD_copyCCtx() : +* Duplicate an existing context `srcCCtx` into another one `dstCCtx`. +* Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). +* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx *dstCCtx, const ZSTD_CCtx *srcCCtx, unsigned long long pledgedSrcSize) +{ + if (srcCCtx->stage != ZSTDcs_init) + return ERROR(stage_wrong); + + memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { + ZSTD_parameters params = srcCCtx->params; + params.fParams.contentSizeFlag = (pledgedSrcSize > 0); + ZSTD_resetCCtx_advanced(dstCCtx, params, pledgedSrcSize, ZSTDcrp_noMemset); + } + + /* copy tables */ + { + size_t const chainSize = (srcCCtx->params.cParams.strategy == ZSTD_fast) ? 
0 : (1 << srcCCtx->params.cParams.chainLog); + size_t const hSize = ((size_t)1) << srcCCtx->params.cParams.hashLog; + size_t const h3Size = (size_t)1 << srcCCtx->hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + memcpy(dstCCtx->workSpace, srcCCtx->workSpace, tableSpace); + } + + /* copy dictionary offsets */ + dstCCtx->nextToUpdate = srcCCtx->nextToUpdate; + dstCCtx->nextToUpdate3 = srcCCtx->nextToUpdate3; + dstCCtx->nextSrc = srcCCtx->nextSrc; + dstCCtx->base = srcCCtx->base; + dstCCtx->dictBase = srcCCtx->dictBase; + dstCCtx->dictLimit = srcCCtx->dictLimit; + dstCCtx->lowLimit = srcCCtx->lowLimit; + dstCCtx->loadedDictEnd = srcCCtx->loadedDictEnd; + dstCCtx->dictID = srcCCtx->dictID; + + /* copy entropy tables */ + dstCCtx->flagStaticTables = srcCCtx->flagStaticTables; + dstCCtx->flagStaticHufTable = srcCCtx->flagStaticHufTable; + if (srcCCtx->flagStaticTables) { + memcpy(dstCCtx->litlengthCTable, srcCCtx->litlengthCTable, sizeof(dstCCtx->litlengthCTable)); + memcpy(dstCCtx->matchlengthCTable, srcCCtx->matchlengthCTable, sizeof(dstCCtx->matchlengthCTable)); + memcpy(dstCCtx->offcodeCTable, srcCCtx->offcodeCTable, sizeof(dstCCtx->offcodeCTable)); + } + if (srcCCtx->flagStaticHufTable) { + memcpy(dstCCtx->hufTable, srcCCtx->hufTable, 256 * 4); + } + + return 0; +} + +/*! ZSTD_reduceTable() : +* reduce table indexes by `reducerValue` */ +static void ZSTD_reduceTable(U32 *const table, U32 const size, U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u] < reducerValue) + table[u] = 0; + else + table[u] -= reducerValue; + } +} + +/*! ZSTD_reduceIndex() : +* rescale all indexes to avoid future overflow (indexes are U32) */ +static void ZSTD_reduceIndex(ZSTD_CCtx *zc, const U32 reducerValue) +{ + { + U32 const hSize = 1 << zc->params.cParams.hashLog; + ZSTD_reduceTable(zc->hashTable, hSize, reducerValue); + } + + { + U32 const chainSize = (zc->params.cParams.strategy == ZSTD_fast) ? 0 : (1 << zc->params.cParams.chainLog); + ZSTD_reduceTable(zc->chainTable, chainSize, reducerValue); + } + + { + U32 const h3Size = (zc->hashLog3) ? 
1 << zc->hashLog3 : 0; + ZSTD_reduceTable(zc->hashTable3, h3Size, reducerValue); + } +} + +/*-******************************************************* +* Block entropic compression +*********************************************************/ + +/* See doc/zstd_compression_format.md for detailed format description */ + +size_t ZSTD_noCompressBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + if (srcSize + ZSTD_blockHeaderSize > dstCapacity) + return ERROR(dstSize_tooSmall); + memcpy((BYTE *)dst + ZSTD_blockHeaderSize, src, srcSize); + ZSTD_writeLE24(dst, (U32)(srcSize << 2) + (U32)bt_raw); + return ZSTD_blockHeaderSize + srcSize; +} + +static size_t ZSTD_noCompressLiterals(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + BYTE *const ostart = (BYTE * const)dst; + U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095); + + if (srcSize + flSize > dstCapacity) + return ERROR(dstSize_tooSmall); + + switch (flSize) { + case 1: /* 2 - 1 - 5 */ ostart[0] = (BYTE)((U32)set_basic + (srcSize << 3)); break; + case 2: /* 2 - 2 - 12 */ ZSTD_writeLE16(ostart, (U16)((U32)set_basic + (1 << 2) + (srcSize << 4))); break; + default: /*note : should not be necessary : flSize is within {1,2,3} */ + case 3: /* 2 - 2 - 20 */ ZSTD_writeLE32(ostart, (U32)((U32)set_basic + (3 << 2) + (srcSize << 4))); break; + } + + memcpy(ostart + flSize, src, srcSize); + return srcSize + flSize; +} + +static size_t ZSTD_compressRleLiteralsBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + BYTE *const ostart = (BYTE * const)dst; + U32 const flSize = 1 + (srcSize > 31) + (srcSize > 4095); + + (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ + + switch (flSize) { + case 1: /* 2 - 1 - 5 */ ostart[0] = (BYTE)((U32)set_rle + (srcSize << 3)); break; + case 2: /* 2 - 2 - 12 */ ZSTD_writeLE16(ostart, (U16)((U32)set_rle + (1 << 2) + (srcSize << 4))); break; + default: /*note : should not be necessary : flSize is necessarily within {1,2,3} */ + case 3: /* 2 - 2 - 20 */ ZSTD_writeLE32(ostart, (U32)((U32)set_rle + (3 << 2) + (srcSize << 4))); break; + } + + ostart[flSize] = *(const BYTE *)src; + return flSize + 1; +} + +static size_t ZSTD_minGain(size_t srcSize) { return (srcSize >> 6) + 2; } + +static size_t ZSTD_compressLiterals(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t const minGain = ZSTD_minGain(srcSize); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE *const ostart = (BYTE *)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +/* small ? don't even attempt compression (speed opt) */ +#define LITERAL_NOENTROPY 63 + { + size_t const minLitSize = zc->flagStaticHufTable == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY; + if (srcSize <= minLitSize) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + + if (dstCapacity < lhSize + 1) + return ERROR(dstSize_tooSmall); /* not enough space for compression */ + { + HUF_repeat repeat = zc->flagStaticHufTable; + int const preferRepeat = zc->params.cParams.strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + if (repeat == HUF_repeat_valid && lhSize == 3) + singleStream = 1; + cLitSize = singleStream ? 
HUF_compress1X_repeat(ostart + lhSize, dstCapacity - lhSize, src, srcSize, 255, 11, zc->tmpCounters, + sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat) + : HUF_compress4X_repeat(ostart + lhSize, dstCapacity - lhSize, src, srcSize, 255, 11, zc->tmpCounters, + sizeof(zc->tmpCounters), zc->hufTable, &repeat, preferRepeat); + if (repeat != HUF_repeat_none) { + hType = set_repeat; + } /* reused the existing table */ + else { + zc->flagStaticHufTable = HUF_repeat_check; + } /* now have a table to reuse */ + } + + if ((cLitSize == 0) | (cLitSize >= srcSize - minGain)) { + zc->flagStaticHufTable = HUF_repeat_none; + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + if (cLitSize == 1) { + zc->flagStaticHufTable = HUF_repeat_none; + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } + + /* Build header */ + switch (lhSize) { + case 3: /* 2 - 2 - 10 - 10 */ + { + U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 14); + ZSTD_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { + U32 const lhc = hType + (2 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 18); + ZSTD_writeLE32(ostart, lhc); + break; + } + default: /* should not be necessary, lhSize is only {3,4,5} */ + case 5: /* 2 - 2 - 18 - 18 */ + { + U32 const lhc = hType + (3 << 2) + ((U32)srcSize << 4) + ((U32)cLitSize << 22); + ZSTD_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + } + return lhSize + cLitSize; +} + +static const BYTE LL_Code[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 17, 17, 18, 18, + 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; + +static const BYTE ML_Code[128] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, + 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42}; + +void ZSTD_seqToCodes(const seqStore_t *seqStorePtr) +{ + BYTE const LL_deltaCode = 19; + BYTE const ML_deltaCode = 36; + const seqDef *const sequences = seqStorePtr->sequencesStart; + BYTE *const llCodeTable = seqStorePtr->llCode; + BYTE *const ofCodeTable = seqStorePtr->ofCode; + BYTE *const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; + for (u = 0; u < nbSeq; u++) { + U32 const llv = sequences[u].litLength; + U32 const mlv = sequences[u].matchLength; + llCodeTable[u] = (llv > 63) ? (BYTE)ZSTD_highbit32(llv) + LL_deltaCode : LL_Code[llv]; + ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); + mlCodeTable[u] = (mlv > 127) ? 
(BYTE)ZSTD_highbit32(mlv) + ML_deltaCode : ML_Code[mlv]; + } + if (seqStorePtr->longLengthID == 1) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthID == 2) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; +} + +ZSTD_STATIC size_t ZSTD_compressSequences_internal(ZSTD_CCtx *zc, void *dst, size_t dstCapacity) +{ + const int longOffsets = zc->params.cParams.windowLog > STREAM_ACCUMULATOR_MIN; + const seqStore_t *seqStorePtr = &(zc->seqStore); + FSE_CTable *CTable_LitLength = zc->litlengthCTable; + FSE_CTable *CTable_OffsetBits = zc->offcodeCTable; + FSE_CTable *CTable_MatchLength = zc->matchlengthCTable; + U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ + const seqDef *const sequences = seqStorePtr->sequencesStart; + const BYTE *const ofCodeTable = seqStorePtr->ofCode; + const BYTE *const llCodeTable = seqStorePtr->llCode; + const BYTE *const mlCodeTable = seqStorePtr->mlCode; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstCapacity; + BYTE *op = ostart; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE *seqHead; + + U32 *count; + S16 *norm; + U32 *workspace; + size_t workspaceSize = sizeof(zc->tmpCounters); + { + size_t spaceUsed32 = 0; + count = (U32 *)zc->tmpCounters + spaceUsed32; + spaceUsed32 += MaxSeq + 1; + norm = (S16 *)((U32 *)zc->tmpCounters + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2; + + workspace = (U32 *)zc->tmpCounters + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + } + + /* Compress literals */ + { + const BYTE *const literals = seqStorePtr->litStart; + size_t const litSize = seqStorePtr->lit - literals; + size_t const cSize = ZSTD_compressLiterals(zc, op, dstCapacity, literals, litSize); + if (ZSTD_isError(cSize)) + return cSize; + op += cSize; + } + + /* Sequences Header */ + if ((oend - op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) + return ERROR(dstSize_tooSmall); + if (nbSeq < 0x7F) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq >> 8) + 0x80), op[1] = (BYTE)nbSeq, op += 2; + else + op[0] = 0xFF, ZSTD_writeLE16(op + 1, (U16)(nbSeq - LONGNBSEQ)), op += 3; + if (nbSeq == 0) + return op - ostart; + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + +#define MIN_SEQ_FOR_DYNAMIC_FSE 64 +#define MAX_SEQ_FOR_STATIC_FSE 1000 + + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + + /* CTable for Literal Lengths */ + { + U32 max = MaxLL; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, llCodeTable, nbSeq, workspace); + if ((mostFrequent == nbSeq) && (nbSeq > 2)) { + *op++ = llCodeTable[0]; + FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); + LLtype = set_rle; + } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { + LLtype = set_repeat; + } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog - 1)))) { + FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, workspace, workspaceSize); + LLtype = set_basic; + } else { + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); + if (count[llCodeTable[nbSeq - 1]] > 1) { + count[llCodeTable[nbSeq - 1]]--; + nbSeq_1--; + } + FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); + { + size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) + return NCountSize; + op += NCountSize; + } + 
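+			/* build the literal-length CTable from the normalized counts emitted just above */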
FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, workspace, workspaceSize); + LLtype = set_compressed; + } + } + + /* CTable for Offsets */ + { + U32 max = MaxOff; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, ofCodeTable, nbSeq, workspace); + if ((mostFrequent == nbSeq) && (nbSeq > 2)) { + *op++ = ofCodeTable[0]; + FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); + Offtype = set_rle; + } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { + Offtype = set_repeat; + } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog - 1)))) { + FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog, workspace, workspaceSize); + Offtype = set_basic; + } else { + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); + if (count[ofCodeTable[nbSeq - 1]] > 1) { + count[ofCodeTable[nbSeq - 1]]--; + nbSeq_1--; + } + FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); + { + size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) + return NCountSize; + op += NCountSize; + } + FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, workspace, workspaceSize); + Offtype = set_compressed; + } + } + + /* CTable for MatchLengths */ + { + U32 max = MaxML; + size_t const mostFrequent = FSE_countFast_wksp(count, &max, mlCodeTable, nbSeq, workspace); + if ((mostFrequent == nbSeq) && (nbSeq > 2)) { + *op++ = *mlCodeTable; + FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); + MLtype = set_rle; + } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { + MLtype = set_repeat; + } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog - 1)))) { + FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, workspace, workspaceSize); + MLtype = set_basic; + } else { + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); + if (count[mlCodeTable[nbSeq - 1]] > 1) { + count[mlCodeTable[nbSeq - 1]]--; + nbSeq_1--; + } + FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max); + { + size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + if (FSE_isError(NCountSize)) + return NCountSize; + op += NCountSize; + } + FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, workspace, workspaceSize); + MLtype = set_compressed; + } + } + + *seqHead = (BYTE)((LLtype << 6) + (Offtype << 4) + (MLtype << 2)); + zc->flagStaticTables = 0; + + /* Encoding Sequences */ + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + CHECK_E(BIT_initCStream(&blockStream, op, oend - op), dstSize_tooSmall); /* not enough space remaining */ + + /* first symbols */ + FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq - 1]); + FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq - 1]); + FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq - 1]); + BIT_addBits(&blockStream, sequences[nbSeq - 1].litLength, LL_bits[llCodeTable[nbSeq - 1]]); + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[nbSeq - 1].matchLength, ML_bits[mlCodeTable[nbSeq - 1]]); + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); + if (longOffsets) { + U32 const ofBits = ofCodeTable[nbSeq - 1]; + int const extraBits = ofBits - 
MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[nbSeq - 1].offset, extraBits); + BIT_flushBits(&blockStream); + } + BIT_addBits(&blockStream, sequences[nbSeq - 1].offset >> extraBits, ofBits - extraBits); + } else { + BIT_addBits(&blockStream, sequences[nbSeq - 1].offset, ofCodeTable[nbSeq - 1]); + } + BIT_flushBits(&blockStream); + + { + size_t n; + for (n = nbSeq - 2; n < nbSeq; n--) { /* intentional underflow */ + BYTE const llCode = llCodeTable[n]; + BYTE const ofCode = ofCodeTable[n]; + BYTE const mlCode = mlCodeTable[n]; + U32 const llBits = LL_bits[llCode]; + U32 const ofBits = ofCode; /* 32b*/ /* 64b*/ + U32 const mlBits = ML_bits[mlCode]; + /* (7)*/ /* (7)*/ + FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */ + FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */ + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); /* (7)*/ + FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */ + if (ZSTD_32bits() || (ofBits + mlBits + llBits >= 64 - 7 - (LLFSELog + MLFSELog + OffFSELog))) + BIT_flushBits(&blockStream); /* (7)*/ + BIT_addBits(&blockStream, sequences[n].litLength, llBits); + if (ZSTD_32bits() && ((llBits + mlBits) > 24)) + BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + if (ZSTD_32bits()) + BIT_flushBits(&blockStream); /* (7)*/ + if (longOffsets) { + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN - 1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_flushBits(&blockStream); /* (7)*/ + } + BIT_addBits(&blockStream, sequences[n].offset >> extraBits, ofBits - extraBits); /* 31 */ + } else { + BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + } + BIT_flushBits(&blockStream); /* (7)*/ + } + } + + FSE_flushCState(&blockStream, &stateMatchLength); + FSE_flushCState(&blockStream, &stateOffsetBits); + FSE_flushCState(&blockStream, &stateLitLength); + + { + size_t const streamSize = BIT_closeCStream(&blockStream); + if (streamSize == 0) + return ERROR(dstSize_tooSmall); /* not enough space */ + op += streamSize; + } + } + return op - ostart; +} + +ZSTD_STATIC size_t ZSTD_compressSequences(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, size_t srcSize) +{ + size_t const cSize = ZSTD_compressSequences_internal(zc, dst, dstCapacity); + size_t const minGain = ZSTD_minGain(srcSize); + size_t const maxCSize = srcSize - minGain; + /* If the srcSize <= dstCapacity, then there is enough space to write a + * raw uncompressed block. Since we ran out of space, the block must not + * be compressible, so fall back to a raw uncompressed block. + */ + int const uncompressibleError = cSize == ERROR(dstSize_tooSmall) && srcSize <= dstCapacity; + int i; + + if (ZSTD_isError(cSize) && !uncompressibleError) + return cSize; + if (cSize >= maxCSize || uncompressibleError) { + zc->flagStaticHufTable = HUF_repeat_none; + return 0; + } + /* confirm repcodes */ + for (i = 0; i < ZSTD_REP_NUM; i++) + zc->rep[i] = zc->repToConfirm[i]; + return cSize; +} + +/*! ZSTD_storeSeq() : + Store a sequence (literal length, literals, offset code and match length code) into seqStore_t. + `offsetCode` : distance to match, or 0 == repCode. 
+	`matchCode` : matchLength - MINMATCH
+*/
+ZSTD_STATIC void ZSTD_storeSeq(seqStore_t *seqStorePtr, size_t litLength, const void *literals, U32 offsetCode, size_t matchCode)
+{
+	/* copy Literals */
+	ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
+	seqStorePtr->lit += litLength;
+
+	/* literal Length */
+	if (litLength > 0xFFFF) {
+		seqStorePtr->longLengthID = 1;
+		seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+	}
+	seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+	/* match offset */
+	seqStorePtr->sequences[0].offset = offsetCode + 1;
+
+	/* match Length */
+	if (matchCode > 0xFFFF) {
+		seqStorePtr->longLengthID = 2;
+		seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+	}
+	seqStorePtr->sequences[0].matchLength = (U16)matchCode;
+
+	seqStorePtr->sequences++;
+}
+
+/*-*************************************
+* Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes(register size_t val)
+{
+	if (ZSTD_isLittleEndian()) {
+		if (ZSTD_64bits()) {
+			return (__builtin_ctzll((U64)val) >> 3);
+		} else { /* 32 bits */
+			return (__builtin_ctz((U32)val) >> 3);
+		}
+	} else { /* Big Endian CPU */
+		if (ZSTD_64bits()) {
+			return (__builtin_clzll(val) >> 3);
+		} else { /* 32 bits */
+			return (__builtin_clz((U32)val) >> 3);
+		}
+	}
+}
+
+static size_t ZSTD_count(const BYTE *pIn, const BYTE *pMatch, const BYTE *const pInLimit)
+{
+	const BYTE *const pStart = pIn;
+	const BYTE *const pInLoopLimit = pInLimit - (sizeof(size_t) - 1);
+
+	while (pIn < pInLoopLimit) {
+		size_t const diff = ZSTD_readST(pMatch) ^ ZSTD_readST(pIn);
+		if (!diff) {
+			pIn += sizeof(size_t);
+			pMatch += sizeof(size_t);
+			continue;
+		}
+		pIn += ZSTD_NbCommonBytes(diff);
+		return (size_t)(pIn - pStart);
+	}
+	if (ZSTD_64bits())
+		if ((pIn < (pInLimit - 3)) && (ZSTD_read32(pMatch) == ZSTD_read32(pIn))) {
+			pIn += 4;
+			pMatch += 4;
+		}
+	if ((pIn < (pInLimit - 1)) && (ZSTD_read16(pMatch) == ZSTD_read16(pIn))) {
+		pIn += 2;
+		pMatch += 2;
+	}
+	if ((pIn < pInLimit) && (*pMatch == *pIn))
+		pIn++;
+	return (size_t)(pIn - pStart);
+}
+
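For illustration, the word-at-a-time trick in ZSTD_count() above is easier to see in isolation. The sketch below is standalone and not part of the patch (demo_count is a made-up name); it assumes a little-endian CPU and the GCC/Clang __builtin_ctzll builtin, mirroring what ZSTD_readST() and ZSTD_NbCommonBytes() do here:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Count how many leading bytes of a and b are equal (little-endian only).
 * XOR one word from each buffer : the lowest set bit of the difference marks
 * the first mismatching byte, so ctz(diff)/8 is the number of equal bytes. */
static size_t demo_count(const unsigned char *a, const unsigned char *b, size_t len)
{
	size_t n = 0;
	while (n + sizeof(uint64_t) <= len) {
		uint64_t wa, wb;
		memcpy(&wa, a + n, sizeof(wa));
		memcpy(&wb, b + n, sizeof(wb));
		if (wa != wb)
			return n + (__builtin_ctzll(wa ^ wb) >> 3);
		n += sizeof(uint64_t);
	}
	while (n < len && a[n] == b[n]) /* byte-by-byte tail */
		n++;
	return n;
}

int main(void)
{
	const unsigned char x[] = "abcdefgh12345678XYZ";
	const unsigned char y[] = "abcdefgh12345678ABC";
	printf("%zu\n", demo_count(x, y, 19)); /* prints 16 */
	return 0;
}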
+/** ZSTD_count_2segments() :
+*  can count match length with `ip` & `match` in 2 different segments.
+*  convention : on reaching mEnd, the match count continues, starting from iStart
+*/
+static size_t ZSTD_count_2segments(const BYTE *ip, const BYTE *match, const BYTE *iEnd, const BYTE *mEnd, const BYTE *iStart)
+{
+	const BYTE *const vEnd = MIN(ip + (mEnd - match), iEnd);
+	size_t const matchLength = ZSTD_count(ip, match, vEnd);
+	if (match + matchLength != mEnd)
+		return matchLength;
+	return matchLength + ZSTD_count(ip + matchLength, iStart, iEnd);
+}
+
+/*-*************************************
+* Hashes
+***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32 - 24)) * prime3bytes) >> (32 - h); }
+ZSTD_STATIC size_t ZSTD_hash3Ptr(const void *ptr, U32 h) { return ZSTD_hash3(ZSTD_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32 - h); }
+static size_t ZSTD_hash4Ptr(const void *ptr, U32 h) { return ZSTD_hash4(ZSTD_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64 - 40)) * prime5bytes) >> (64 - h)); }
+static size_t ZSTD_hash5Ptr(const void *p, U32 h) { return ZSTD_hash5(ZSTD_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64 - 48)) * prime6bytes) >> (64 - h)); }
+static size_t ZSTD_hash6Ptr(const void *p, U32 h) { return ZSTD_hash6(ZSTD_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64 - 56)) * prime7bytes) >> (64 - h)); }
+static size_t ZSTD_hash7Ptr(const void *p, U32 h) { return ZSTD_hash7(ZSTD_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u)*prime8bytes) >> (64 - h)); }
+static size_t ZSTD_hash8Ptr(const void *p, U32 h) { return ZSTD_hash8(ZSTD_readLE64(p), h); }
+
+static size_t ZSTD_hashPtr(const void *p, U32 hBits, U32 mls)
+{
+	switch (mls) {
+	/* case 3: return ZSTD_hash3Ptr(p, hBits); */
+	default:
+	case 4: return ZSTD_hash4Ptr(p, hBits);
+	case 5: return ZSTD_hash5Ptr(p, hBits);
+	case 6: return ZSTD_hash6Ptr(p, hBits);
+	case 7: return ZSTD_hash7Ptr(p, hBits);
+	case 8: return ZSTD_hash8Ptr(p, hBits);
+	}
+}
+
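For illustration, the functions above are multiplicative (Fibonacci-style) hashes: multiply the leading bytes by a large odd constant and keep only the top h bits, so the result always indexes a table of 1 << h entries. A standalone sketch of the 4-byte variant (demo_hash4 is a made-up name, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Same shape as ZSTD_hash4() : multiply by a large odd constant,
 * then keep the top hBits bits, giving an index in [0, 1 << hBits). */
static uint32_t demo_hash4(uint32_t u, uint32_t hBits)
{
	return (u * 2654435761U) >> (32 - hBits);
}

int main(void)
{
	uint32_t const hBits = 17; /* e.g. hashLog = 17 -> 128K-entry table */
	uint32_t const idx = demo_hash4(0x64636261U /* "abcd", little-endian */, hBits);
	printf("index %u < %u\n", idx, 1U << hBits);
	return 0;
}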
+/*-*************************************
+* Fast Scan
+***************************************/
+static void ZSTD_fillHashTable(ZSTD_CCtx *zc, const void *end, const U32 mls)
+{
+	U32 *const hashTable = zc->hashTable;
+	U32 const hBits = zc->params.cParams.hashLog;
+	const BYTE *const base = zc->base;
+	const BYTE *ip = base + zc->nextToUpdate;
+	const BYTE *const iend = ((const BYTE *)end) - HASH_READ_SIZE;
+	const size_t fastHashFillStep = 3;
+
+	while (ip <= iend) {
+		hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
+		ip += fastHashFillStep;
+	}
+}
+
+FORCE_INLINE
+void ZSTD_compressBlock_fast_generic(ZSTD_CCtx *cctx, const void *src, size_t srcSize, const U32 mls)
+{
+	U32 *const hashTable = cctx->hashTable;
+	U32 const hBits = cctx->params.cParams.hashLog;
+	seqStore_t *seqStorePtr = &(cctx->seqStore);
+	const BYTE *const base = cctx->base;
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *ip = istart;
+	const BYTE *anchor = istart;
+	const U32 lowestIndex = cctx->dictLimit;
+	const BYTE *const lowest = base + lowestIndex;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *const ilimit = iend - HASH_READ_SIZE;
+	U32 offset_1 = cctx->rep[0], offset_2 = cctx->rep[1];
+	U32 offsetSaved = 0;
+
+	/* init */
+	ip += (ip == lowest);
+	{
+		U32 const maxRep = (U32)(ip - lowest);
+		if (offset_2 > maxRep)
+			offsetSaved = offset_2, offset_2 = 0;
+		if (offset_1 > maxRep)
+			offsetSaved = offset_1, offset_1 = 0;
+	}
+
+	/* Main Search Loop */
+	while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+		size_t mLength;
+		size_t const h = ZSTD_hashPtr(ip, hBits, mls);
+		U32 const curr = (U32)(ip - base);
+		U32 const matchIndex = hashTable[h];
+		const BYTE *match = base + matchIndex;
+		hashTable[h] = curr; /* update hash table */
+
+		if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) {
+			mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4;
+			ip++;
+			ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+		} else {
+			U32 offset;
+			if ((matchIndex <= lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) {
+				ip += ((ip - anchor) >> g_searchStrength) + 1;
+				continue;
+			}
+			mLength = ZSTD_count(ip + 4, match + 4, iend) + 4;
+			offset = (U32)(ip - match);
+			while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) {
+				ip--;
+				match--;
+				mLength++;
+			} /* catch up */
+			offset_2 = offset_1;
+			offset_1 = offset;
+
+			ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+		}
+
+		/* match found */
+		ip += mLength;
+		anchor = ip;
+
+		if (ip <= ilimit) {
+			/* Fill Table */
+			hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2; /* here because curr+2 could be > iend-8 */
+			hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base);
+			/* check immediate repcode */
+			while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) {
+				/* store sequence */
+				size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4;
+				{
+					U32 const tmpOff = offset_2;
+					offset_2 = offset_1;
+					offset_1 = tmpOff;
+				} /* swap offset_2 <=> offset_1 */
+				hashTable[ZSTD_hashPtr(ip, hBits, mls)] = (U32)(ip - base);
+				ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH);
+				ip += rLength;
+				anchor = ip;
+				continue; /* faster when present ... (?) */
+			}
+		}
+	}
+
+	/* save reps for next block */
+	cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
+	cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+
+	/* Last Literals */
+	{
+		size_t const lastLLSize = iend - anchor;
+		memcpy(seqStorePtr->lit, anchor, lastLLSize);
+		seqStorePtr->lit += lastLLSize;
+	}
+}
+
+static void ZSTD_compressBlock_fast(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	const U32 mls = ctx->params.cParams.searchLength;
+	switch (mls) {
+	default: /* includes case 3 */
+	case 4: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 4); return;
+	case 5: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 5); return;
+	case 6: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 6); return;
+	case 7: ZSTD_compressBlock_fast_generic(ctx, src, srcSize, 7); return;
+	}
+}
+
+static void ZSTD_compressBlock_fast_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 mls)
+{
+	U32 *hashTable = ctx->hashTable;
+	const U32 hBits = ctx->params.cParams.hashLog;
+	seqStore_t *seqStorePtr = &(ctx->seqStore);
+	const BYTE *const base = ctx->base;
+	const BYTE *const dictBase = ctx->dictBase;
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *ip = istart;
+	const BYTE *anchor = istart;
+	const U32 lowestIndex = ctx->lowLimit;
+	const BYTE *const dictStart = dictBase + lowestIndex;
+	const U32 dictLimit = ctx->dictLimit;
+	const BYTE *const lowPrefixPtr = base + dictLimit;
+	const BYTE *const dictEnd = dictBase + dictLimit;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *const ilimit = iend - 8;
+	U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+	/* Search Loop */
+	while (ip < ilimit) { /* < instead of <=, because (ip+1) */
+		const size_t h = ZSTD_hashPtr(ip, hBits, mls);
+		const U32 matchIndex = hashTable[h];
+		const BYTE *matchBase = matchIndex < dictLimit ? dictBase : base;
+		const BYTE *match = matchBase + matchIndex;
+		const U32 curr = (U32)(ip - base);
+		const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */
+		const BYTE *repBase = repIndex < dictLimit ? dictBase : base;
+		const BYTE *repMatch = repBase + repIndex;
+		size_t mLength;
+		hashTable[h] = curr; /* update hash table */
+
+		if ((((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) &&
+		    (ZSTD_read32(repMatch) == ZSTD_read32(ip + 1))) {
+			const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+			mLength = ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repMatchEnd, lowPrefixPtr) + EQUAL_READ32;
+			ip++;
+			ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+		} else {
+			if ((matchIndex < lowestIndex) || (ZSTD_read32(match) != ZSTD_read32(ip))) {
+				ip += ((ip - anchor) >> g_searchStrength) + 1;
+				continue;
+			}
+			{
+				const BYTE *matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+				const BYTE *lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+				U32 offset;
+				mLength = ZSTD_count_2segments(ip + EQUAL_READ32, match + EQUAL_READ32, iend, matchEnd, lowPrefixPtr) + EQUAL_READ32;
+				while (((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) {
+					ip--;
+					match--;
+					mLength++;
+				} /* catch up */
+				offset = curr - matchIndex;
+				offset_2 = offset_1;
+				offset_1 = offset;
+				ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+			}
+		}
+
+		/* found a match : store it */
+		ip += mLength;
+		anchor = ip;
+
+		if (ip <= ilimit) {
+			/* Fill Table */
+			hashTable[ZSTD_hashPtr(base + curr + 2, hBits, mls)] = curr + 2;
+			hashTable[ZSTD_hashPtr(ip - 2, hBits, mls)] = (U32)(ip - 2 - base);
+			/* check immediate repcode */
+			while (ip <= ilimit) {
+				U32 const curr2 = (U32)(ip - base);
+				U32 const repIndex2 = curr2 - offset_2;
+				const BYTE *repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+				if ((((U32)((dictLimit - 1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional underflow */
+				    && (ZSTD_read32(repMatch2) == ZSTD_read32(ip))) {
+					const BYTE *const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+					size_t repLength2 =
+					    ZSTD_count_2segments(ip + EQUAL_READ32, repMatch2 + EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+					U32 tmpOffset = offset_2;
+					offset_2 = offset_1;
+					offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+					ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH);
+					hashTable[ZSTD_hashPtr(ip, hBits, mls)] = curr2;
+					ip += repLength2;
+					anchor = ip;
+					continue;
+				}
+				break;
+			}
+		}
+	}
+
+	/* save reps for next block */
+	ctx->repToConfirm[0] = offset_1;
+	ctx->repToConfirm[1] = offset_2;
+
+	/* Last Literals */
+	{
+		size_t const lastLLSize = iend - anchor;
+		memcpy(seqStorePtr->lit, anchor, lastLLSize);
+		seqStorePtr->lit += lastLLSize;
+	}
+}
+
+static void ZSTD_compressBlock_fast_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	U32 const mls = ctx->params.cParams.searchLength;
+	switch (mls) {
+	default: /* includes case 3 */
+	case 4: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 4); return;
+	case 5: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 5); return;
+	case 6: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 6); return;
+	case 7: ZSTD_compressBlock_fast_extDict_generic(ctx, src, srcSize, 7); return;
+	}
+}
+
+/*-*************************************
+* Double Fast
+***************************************/
+static void ZSTD_fillDoubleHashTable(ZSTD_CCtx *cctx, const void *end, const U32 mls)
+{
+	U32 *const hashLarge = cctx->hashTable;
+	U32 const hBitsL = cctx->params.cParams.hashLog;
+	U32 *const hashSmall = cctx->chainTable;
+	U32 const hBitsS = cctx->params.cParams.chainLog;
+	const BYTE *const base = cctx->base;
+	const BYTE *ip = base + cctx->nextToUpdate;
+	const BYTE *const iend = ((const BYTE *)end) - HASH_READ_SIZE;
+	const size_t fastHashFillStep = 3;
+
+	while (ip <= iend) {
+		hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
+		hashLarge[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
+		ip += fastHashFillStep;
+	}
+}
+
+FORCE_INLINE
+void ZSTD_compressBlock_doubleFast_generic(ZSTD_CCtx *cctx, const void *src, size_t srcSize, const U32 mls)
+{
+	U32 *const hashLong = cctx->hashTable;
+	const U32 hBitsL = cctx->params.cParams.hashLog;
+	U32 *const hashSmall = cctx->chainTable;
+	const U32 hBitsS = cctx->params.cParams.chainLog;
+	seqStore_t *seqStorePtr = &(cctx->seqStore);
+	const BYTE *const base = cctx->base;
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *ip = istart;
+	const BYTE *anchor = istart;
+	const U32 lowestIndex = cctx->dictLimit;
+	const BYTE *const lowest = base + lowestIndex;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *const ilimit = iend - HASH_READ_SIZE;
+	U32 offset_1 = cctx->rep[0], offset_2 = cctx->rep[1];
+	U32 offsetSaved = 0;
+
+	/* init */
+	ip += (ip == lowest);
+	{
+		U32 const maxRep = (U32)(ip - lowest);
+		if (offset_2 > maxRep)
+			offsetSaved = offset_2, offset_2 = 0;
+		if (offset_1 > maxRep)
+			offsetSaved = offset_1, offset_1 = 0;
+	}
+
+	/* Main Search Loop */
+	while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+		size_t mLength;
+		size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+		size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+		U32 const curr = (U32)(ip - base);
+		U32 const matchIndexL = hashLong[h2];
+		U32 const matchIndexS = hashSmall[h];
+		const BYTE *matchLong = base + matchIndexL;
+		const BYTE *match = base + matchIndexS;
+		hashLong[h2] = hashSmall[h] = curr; /* update hash tables */
+
+		if ((offset_1 > 0) & (ZSTD_read32(ip + 1 - offset_1) == ZSTD_read32(ip + 1))) { /* note : by construction, offset_1 <= curr */
+			mLength = ZSTD_count(ip + 1 + 4, ip + 1 + 4 - offset_1, iend) + 4;
+			ip++;
+			ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+		} else {
+			U32 offset;
+			if ((matchIndexL > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) {
+				mLength = ZSTD_count(ip + 8, matchLong + 8, iend) + 8;
+				offset = (U32)(ip - matchLong);
+				while (((ip > anchor) & (matchLong > lowest)) && (ip[-1] == matchLong[-1])) {
+					ip--;
+					matchLong--;
+					mLength++;
+				} /* catch up */
+			} else if ((matchIndexS > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) {
+				size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8);
+				U32 const matchIndex3 = hashLong[h3];
+				const BYTE *match3 = base + matchIndex3;
+				hashLong[h3] = curr + 1;
+				if ((matchIndex3 > lowestIndex) && (ZSTD_read64(match3) == ZSTD_read64(ip + 1))) {
+					mLength = ZSTD_count(ip + 9, match3 + 8, iend) + 8;
+					ip++;
+					offset = (U32)(ip - match3);
+					while (((ip > anchor) & (match3 > lowest)) && (ip[-1] == match3[-1])) {
+						ip--;
+						match3--;
+						mLength++;
+					} /* catch up */
+				} else {
+					mLength = ZSTD_count(ip + 4, match + 4, iend) + 4;
+					offset = (U32)(ip - match);
+					while (((ip > anchor) & (match > lowest)) && (ip[-1] == match[-1])) {
+						ip--;
+						match--;
+						mLength++;
+					} /* catch up */
+				}
+			} else {
+				ip += ((ip - anchor) >> g_searchStrength) + 1;
+				continue;
+			}
+
+			offset_2 = offset_1;
+			offset_1 = offset;
+
+			ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+		}
+
+		/* match found */
+		ip += mLength;
+		anchor = ip;
+
+		if (ip <= ilimit) {
+			/* Fill Table */
+			hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] =
+			    curr + 2; /* here because curr+2 could be > iend-8 */
+			hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base);
+
+			/* check immediate repcode */
+			while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) {
+				/* store sequence */
+				size_t const rLength = ZSTD_count(ip + 4, ip + 4 - offset_2, iend) + 4;
+				{
+					U32 const tmpOff = offset_2;
+					offset_2 = offset_1;
+					offset_1 = tmpOff;
+				} /* swap offset_2 <=> offset_1 */
+				hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip - base);
+				hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip - base);
+				ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, rLength - MINMATCH);
+				ip += rLength;
+				anchor = ip;
+				continue; /* faster when present ... (?) */
+			}
+		}
+	}
+
+	/* save reps for next block */
+	cctx->repToConfirm[0] = offset_1 ? offset_1 : offsetSaved;
+	cctx->repToConfirm[1] = offset_2 ? offset_2 : offsetSaved;
+
+	/* Last Literals */
+	{
+		size_t const lastLLSize = iend - anchor;
+		memcpy(seqStorePtr->lit, anchor, lastLLSize);
+		seqStorePtr->lit += lastLLSize;
+	}
+}
+
+static void ZSTD_compressBlock_doubleFast(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	const U32 mls = ctx->params.cParams.searchLength;
+	switch (mls) {
+	default: /* includes case 3 */
+	case 4: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 4); return;
+	case 5: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 5); return;
+	case 6: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 6); return;
+	case 7: ZSTD_compressBlock_doubleFast_generic(ctx, src, srcSize, 7); return;
+	}
+}
+
+static void ZSTD_compressBlock_doubleFast_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 mls)
+{
+	U32 *const hashLong = ctx->hashTable;
+	U32 const hBitsL = ctx->params.cParams.hashLog;
+	U32 *const hashSmall = ctx->chainTable;
+	U32 const hBitsS = ctx->params.cParams.chainLog;
+	seqStore_t *seqStorePtr = &(ctx->seqStore);
+	const BYTE *const base = ctx->base;
+	const BYTE *const dictBase = ctx->dictBase;
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *ip = istart;
+	const BYTE *anchor = istart;
+	const U32 lowestIndex = ctx->lowLimit;
+	const BYTE *const dictStart = dictBase + lowestIndex;
+	const U32 dictLimit = ctx->dictLimit;
+	const BYTE *const lowPrefixPtr = base + dictLimit;
+	const BYTE *const dictEnd = dictBase + dictLimit;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *const ilimit = iend - 8;
+	U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+	/* Search Loop */
+	while (ip < ilimit) { /* < instead of <=, because (ip+1) */
+		const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+		const U32 matchIndex = hashSmall[hSmall];
+		const BYTE *matchBase = matchIndex < dictLimit ? dictBase : base;
+		const BYTE *match = matchBase + matchIndex;
+
+		const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+		const U32 matchLongIndex = hashLong[hLong];
+		const BYTE *matchLongBase = matchLongIndex < dictLimit ? dictBase : base;
+		const BYTE *matchLong = matchLongBase + matchLongIndex;
+
+		const U32 curr = (U32)(ip - base);
+		const U32 repIndex = curr + 1 - offset_1; /* offset_1 expected <= curr +1 */
+		const BYTE *repBase = repIndex < dictLimit ? dictBase : base;
+		const BYTE *repMatch = repBase + repIndex;
+		size_t mLength;
+		hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */
+
+		if ((((U32)((dictLimit - 1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > lowestIndex)) &&
+		    (ZSTD_read32(repMatch) == ZSTD_read32(ip + 1))) {
+			const BYTE *repMatchEnd = repIndex < dictLimit ? dictEnd : iend;
+			mLength = ZSTD_count_2segments(ip + 1 + 4, repMatch + 4, iend, repMatchEnd, lowPrefixPtr) + 4;
+			ip++;
+			ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, 0, mLength - MINMATCH);
+		} else {
+			if ((matchLongIndex > lowestIndex) && (ZSTD_read64(matchLong) == ZSTD_read64(ip))) {
+				const BYTE *matchEnd = matchLongIndex < dictLimit ? dictEnd : iend;
+				const BYTE *lowMatchPtr = matchLongIndex < dictLimit ? dictStart : lowPrefixPtr;
+				U32 offset;
+				mLength = ZSTD_count_2segments(ip + 8, matchLong + 8, iend, matchEnd, lowPrefixPtr) + 8;
+				offset = curr - matchLongIndex;
+				while (((ip > anchor) & (matchLong > lowMatchPtr)) && (ip[-1] == matchLong[-1])) {
+					ip--;
+					matchLong--;
+					mLength++;
+				} /* catch up */
+				offset_2 = offset_1;
+				offset_1 = offset;
+				ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+
+			} else if ((matchIndex > lowestIndex) && (ZSTD_read32(match) == ZSTD_read32(ip))) {
+				size_t const h3 = ZSTD_hashPtr(ip + 1, hBitsL, 8);
+				U32 const matchIndex3 = hashLong[h3];
+				const BYTE *const match3Base = matchIndex3 < dictLimit ? dictBase : base;
+				const BYTE *match3 = match3Base + matchIndex3;
+				U32 offset;
+				hashLong[h3] = curr + 1;
+				if ((matchIndex3 > lowestIndex) && (ZSTD_read64(match3) == ZSTD_read64(ip + 1))) {
+					const BYTE *matchEnd = matchIndex3 < dictLimit ? dictEnd : iend;
+					const BYTE *lowMatchPtr = matchIndex3 < dictLimit ? dictStart : lowPrefixPtr;
+					mLength = ZSTD_count_2segments(ip + 9, match3 + 8, iend, matchEnd, lowPrefixPtr) + 8;
+					ip++;
+					offset = curr + 1 - matchIndex3;
+					while (((ip > anchor) & (match3 > lowMatchPtr)) && (ip[-1] == match3[-1])) {
+						ip--;
+						match3--;
+						mLength++;
+					} /* catch up */
+				} else {
+					const BYTE *matchEnd = matchIndex < dictLimit ? dictEnd : iend;
+					const BYTE *lowMatchPtr = matchIndex < dictLimit ? dictStart : lowPrefixPtr;
+					mLength = ZSTD_count_2segments(ip + 4, match + 4, iend, matchEnd, lowPrefixPtr) + 4;
+					offset = curr - matchIndex;
+					while (((ip > anchor) & (match > lowMatchPtr)) && (ip[-1] == match[-1])) {
+						ip--;
+						match--;
+						mLength++;
+					} /* catch up */
+				}
+				offset_2 = offset_1;
+				offset_1 = offset;
+				ZSTD_storeSeq(seqStorePtr, ip - anchor, anchor, offset + ZSTD_REP_MOVE, mLength - MINMATCH);
+
+			} else {
+				ip += ((ip - anchor) >> g_searchStrength) + 1;
+				continue;
+			}
+		}
+
+		/* found a match : store it */
+		ip += mLength;
+		anchor = ip;
+
+		if (ip <= ilimit) {
+			/* Fill Table */
+			hashSmall[ZSTD_hashPtr(base + curr + 2, hBitsS, mls)] = curr + 2;
+			hashLong[ZSTD_hashPtr(base + curr + 2, hBitsL, 8)] = curr + 2;
+			hashSmall[ZSTD_hashPtr(ip - 2, hBitsS, mls)] = (U32)(ip - 2 - base);
+			hashLong[ZSTD_hashPtr(ip - 2, hBitsL, 8)] = (U32)(ip - 2 - base);
+			/* check immediate repcode */
+			while (ip <= ilimit) {
+				U32 const curr2 = (U32)(ip - base);
+				U32 const repIndex2 = curr2 - offset_2;
+				const BYTE *repMatch2 = repIndex2 < dictLimit ? dictBase + repIndex2 : base + repIndex2;
+				if ((((U32)((dictLimit - 1) - repIndex2) >= 3) & (repIndex2 > lowestIndex)) /* intentional underflow */
+				    && (ZSTD_read32(repMatch2) == ZSTD_read32(ip))) {
+					const BYTE *const repEnd2 = repIndex2 < dictLimit ? dictEnd : iend;
+					size_t const repLength2 =
+					    ZSTD_count_2segments(ip + EQUAL_READ32, repMatch2 + EQUAL_READ32, iend, repEnd2, lowPrefixPtr) + EQUAL_READ32;
+					U32 tmpOffset = offset_2;
+					offset_2 = offset_1;
+					offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+					ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, repLength2 - MINMATCH);
+					hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = curr2;
+					hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = curr2;
+					ip += repLength2;
+					anchor = ip;
+					continue;
+				}
+				break;
+			}
+		}
+	}
+
+	/* save reps for next block */
+	ctx->repToConfirm[0] = offset_1;
+	ctx->repToConfirm[1] = offset_2;
+
+	/* Last Literals */
+	{
+		size_t const lastLLSize = iend - anchor;
+		memcpy(seqStorePtr->lit, anchor, lastLLSize);
+		seqStorePtr->lit += lastLLSize;
+	}
+}
+
+static void ZSTD_compressBlock_doubleFast_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	U32 const mls = ctx->params.cParams.searchLength;
+	switch (mls) {
+	default: /* includes case 3 */
+	case 4: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 4); return;
+	case 5: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 5); return;
+	case 6: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 6); return;
+	case 7: ZSTD_compressBlock_doubleFast_extDict_generic(ctx, src, srcSize, 7); return;
+	}
+}
+
+/*-*************************************
+* Binary Tree search
+***************************************/
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+*   ip : assumed <= iend-8 .
+*   @return : nb of positions added */
+static U32 ZSTD_insertBt1(ZSTD_CCtx *zc, const BYTE *const ip, const U32 mls, const BYTE *const iend, U32 nbCompares, U32 extDict)
+{
+	U32 *const hashTable = zc->hashTable;
+	U32 const hashLog = zc->params.cParams.hashLog;
+	size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+	U32 *const bt = zc->chainTable;
+	U32 const btLog = zc->params.cParams.chainLog - 1;
+	U32 const btMask = (1 << btLog) - 1;
+	U32 matchIndex = hashTable[h];
+	size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+	const BYTE *const base = zc->base;
+	const BYTE *const dictBase = zc->dictBase;
+	const U32 dictLimit = zc->dictLimit;
+	const BYTE *const dictEnd = dictBase + dictLimit;
+	const BYTE *const prefixStart = base + dictLimit;
+	const BYTE *match;
+	const U32 curr = (U32)(ip - base);
+	const U32 btLow = btMask >= curr ? 0 : curr - btMask;
+	U32 *smallerPtr = bt + 2 * (curr & btMask);
+	U32 *largerPtr = smallerPtr + 1;
+	U32 dummy32; /* to be nullified at the end */
+	U32 const windowLow = zc->lowLimit;
+	U32 matchEndIdx = curr + 8;
+	size_t bestLength = 8;
+
+	hashTable[h] = curr; /* Update Hash Table */
+
+	while (nbCompares-- && (matchIndex > windowLow)) {
+		U32 *const nextPtr = bt + 2 * (matchIndex & btMask);
+		size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+
+		if ((!extDict) || (matchIndex + matchLength >= dictLimit)) {
+			match = base + matchIndex;
+			if (match[matchLength] == ip[matchLength])
+				matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iend) + 1;
+		} else {
+			match = dictBase + matchIndex;
+			matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
+			if (matchIndex + matchLength >= dictLimit)
+				match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+		}
+
+		if (matchLength > bestLength) {
+			bestLength = matchLength;
+			if (matchLength > matchEndIdx - matchIndex)
+				matchEndIdx = matchIndex + (U32)matchLength;
+		}
+
+		if (ip + matchLength == iend) /* equal : no way to know if smaller or larger */
+			break; /* drop, to guarantee consistency; misses a bit of compression, but other solutions can corrupt the tree */
+
+		if (match[matchLength] < ip[matchLength]) { /* necessarily within correct buffer */
+			/* match is smaller than curr */
+			*smallerPtr = matchIndex; /* update smaller idx */
+			commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+			if (matchIndex <= btLow) {
+				smallerPtr = &dummy32;
+				break;
+			} /* beyond tree size, stop the search */
+			smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
+			matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */
+		} else {
+			/* match is larger than curr */
+			*largerPtr = matchIndex;
+			commonLengthLarger = matchLength;
+			if (matchIndex <= btLow) {
+				largerPtr = &dummy32;
+				break;
+			} /* beyond tree size, stop the search */
+			largerPtr = nextPtr;
+			matchIndex = nextPtr[0];
+		}
+	}
+
+	*smallerPtr = *largerPtr = 0;
+	if (bestLength > 384)
+		return MIN(192, (U32)(bestLength - 384)); /* speed optimization */
+	if (matchEndIdx > curr + 8)
+		return matchEndIdx - curr - 8;
+	return 1;
+}
+
+static size_t ZSTD_insertBtAndFindBestMatch(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, size_t *offsetPtr, U32 nbCompares, const U32 mls,
+					    U32 extDict)
+{
+	U32 *const hashTable = zc->hashTable;
+	U32 const hashLog = zc->params.cParams.hashLog;
+	size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+	U32 *const bt = zc->chainTable;
+	U32 const btLog = zc->params.cParams.chainLog - 1;
+	U32 const btMask = (1 << btLog) - 1;
+	U32 matchIndex = hashTable[h];
+	size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+	const BYTE *const base = zc->base;
+	const BYTE *const dictBase = zc->dictBase;
+	const U32 dictLimit = zc->dictLimit;
+	const BYTE *const dictEnd = dictBase + dictLimit;
+	const BYTE *const prefixStart = base + dictLimit;
+	const U32 curr = (U32)(ip - base);
+	const U32 btLow = btMask >= curr ? 0 : curr - btMask;
+	const U32 windowLow = zc->lowLimit;
+	U32 *smallerPtr = bt + 2 * (curr & btMask);
+	U32 *largerPtr = bt + 2 * (curr & btMask) + 1;
+	U32 matchEndIdx = curr + 8;
+	U32 dummy32; /* to be nullified at the end */
+	size_t bestLength = 0;
+
+	hashTable[h] = curr; /* Update Hash Table */
+
+	while (nbCompares-- && (matchIndex > windowLow)) {
+		U32 *const nextPtr = bt + 2 * (matchIndex & btMask);
+		size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+		const BYTE *match;
+
+		if ((!extDict) || (matchIndex + matchLength >= dictLimit)) {
+			match = base + matchIndex;
+			if (match[matchLength] == ip[matchLength])
+				matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iend) + 1;
+		} else {
+			match = dictBase + matchIndex;
+			matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iend, dictEnd, prefixStart);
+			if (matchIndex + matchLength >= dictLimit)
+				match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+		}
+
+		if (matchLength > bestLength) {
+			if (matchLength > matchEndIdx - matchIndex)
+				matchEndIdx = matchIndex + (U32)matchLength;
+			if ((4 * (int)(matchLength - bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)offsetPtr[0] + 1)))
+				bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+			if (ip + matchLength == iend) /* equal : no way to know if smaller or larger */
+				break; /* drop, to guarantee consistency (miss a little bit of compression) */
+		}
+
+		if (match[matchLength] < ip[matchLength]) {
+			/* match is smaller than curr */
+			*smallerPtr = matchIndex; /* update smaller idx */
+			commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+			if (matchIndex <= btLow) {
+				smallerPtr = &dummy32;
+				break;
+			} /* beyond tree size, stop the search */
+			smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */
+			matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */
+		} else {
+			/* match is larger than curr */
+			*largerPtr = matchIndex;
+			commonLengthLarger = matchLength;
+			if (matchIndex <= btLow) {
+				largerPtr = &dummy32;
+				break;
+			} /* beyond tree size, stop the search */
+			largerPtr = nextPtr;
+			matchIndex = nextPtr[0];
+		}
+	}
+
+	*smallerPtr = *largerPtr = 0;
+
+	zc->nextToUpdate = (matchEndIdx > curr + 8) ? matchEndIdx - 8 : curr + 1;
+	return bestLength;
+}
+
+static void ZSTD_updateTree(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, const U32 nbCompares, const U32 mls)
+{
+	const BYTE *const base = zc->base;
+	const U32 target = (U32)(ip - base);
+	U32 idx = zc->nextToUpdate;
+
+	while (idx < target)
+		idx += ZSTD_insertBt1(zc, base + idx, mls, iend, nbCompares, 0);
+}
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 mls)
+{
+	if (ip < zc->base + zc->nextToUpdate)
+		return 0; /* skipped area */
+	ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls);
+	return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 0);
+}
+
+static size_t ZSTD_BtFindBestMatch_selectMLS(ZSTD_CCtx *zc, /* Index table will be updated */
+					     const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 matchLengthSearch)
+{
+	switch (matchLengthSearch) {
+	default: /* includes case 3 */
+	case 4: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
+	case 5: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+	case 7:
+	case 6: return ZSTD_BtFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+	}
+}
+
+static void ZSTD_updateTree_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iend, const U32 nbCompares, const U32 mls)
+{
+	const BYTE *const base = zc->base;
+	const U32 target = (U32)(ip - base);
+	U32 idx = zc->nextToUpdate;
+
+	while (idx < target)
+		idx += ZSTD_insertBt1(zc, base + idx, mls, iend, nbCompares, 1);
+}
+
+/** Tree updater, providing best match */
+static size_t ZSTD_BtFindBestMatch_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+					   const U32 mls)
+{
+	if (ip < zc->base + zc->nextToUpdate)
+		return 0; /* skipped area */
+	ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls);
+	return ZSTD_insertBtAndFindBestMatch(zc, ip, iLimit, offsetPtr, maxNbAttempts, mls, 1);
+}
+
+static size_t ZSTD_BtFindBestMatch_selectMLS_extDict(ZSTD_CCtx *zc, /* Index table will be updated */
+						     const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+						     const U32 matchLengthSearch)
+{
+	switch (matchLengthSearch) {
+	default: /* includes case 3 */
+	case 4: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4);
+	case 5: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5);
+	case 7:
+	case 6: return ZSTD_BtFindBestMatch_extDict(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6);
+	}
+}
+
+/* *********************************
+* Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask) chainTable[(d)&mask]
+
+/* Update chains up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_CCtx *zc, const BYTE *ip, U32 mls)
+{
+	U32 *const hashTable = zc->hashTable;
+	const U32 hashLog = zc->params.cParams.hashLog;
+	U32 *const chainTable = zc->chainTable;
+	const U32 chainMask = (1 << zc->params.cParams.chainLog) - 1;
+	const BYTE *const base = zc->base;
+	const U32 target = (U32)(ip - base);
+	U32 idx = zc->nextToUpdate;
+
+	while (idx < target) { /* catch up */
+		size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);
+		NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+		hashTable[h] = idx;
+		idx++;
+	}
+
+	zc->nextToUpdate = target;
+	return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+/* inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE
+size_t ZSTD_HcFindBestMatch_generic(ZSTD_CCtx *zc, /* Index table will be updated */
+				    const BYTE *const ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts, const U32 mls,
+				    const U32 extDict)
+{
+	U32 *const chainTable = zc->chainTable;
+	const U32 chainSize = (1 << zc->params.cParams.chainLog);
+	const U32 chainMask = chainSize - 1;
+	const BYTE *const base = zc->base;
+	const BYTE *const dictBase = zc->dictBase;
+	const U32 dictLimit = zc->dictLimit;
+	const BYTE *const prefixStart = base + dictLimit;
+	const BYTE *const dictEnd = dictBase + dictLimit;
+	const U32 lowLimit = zc->lowLimit;
+	const U32 curr = (U32)(ip - base);
+	const U32 minChain = curr > chainSize ? curr - chainSize : 0;
+	int nbAttempts = maxNbAttempts;
+	size_t ml = EQUAL_READ32 - 1;
+
+	/* HC4 match finder */
+	U32 matchIndex = ZSTD_insertAndFindFirstIndex(zc, ip, mls);
+
+	for (; (matchIndex > lowLimit) & (nbAttempts > 0); nbAttempts--) {
+		const BYTE *match;
+		size_t currMl = 0;
+		if ((!extDict) || matchIndex >= dictLimit) {
+			match = base + matchIndex;
+			if (match[ml] == ip[ml]) /* potentially better */
+				currMl = ZSTD_count(ip, match, iLimit);
+		} else {
+			match = dictBase + matchIndex;
+			if (ZSTD_read32(match) == ZSTD_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+				currMl = ZSTD_count_2segments(ip + EQUAL_READ32, match + EQUAL_READ32, iLimit, dictEnd, prefixStart) + EQUAL_READ32;
+		}
+
+		/* save best solution */
+		if (currMl > ml) {
+			ml = currMl;
+			*offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+			if (ip + currMl == iLimit)
+				break; /* best possible, and avoid read overflow*/
+		}
+
+		if (matchIndex <= minChain)
+			break;
+		matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+	}
+
+	return ml;
+}
+
+FORCE_INLINE size_t ZSTD_HcFindBestMatch_selectMLS(ZSTD_CCtx *zc, const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+						   const U32 matchLengthSearch)
+{
+	switch (matchLengthSearch) {
+	default: /* includes case 3 */
+	case 4: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 0);
+	case 5: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 0);
+	case 7:
+	case 6: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 0);
+	}
+}
+
+FORCE_INLINE size_t ZSTD_HcFindBestMatch_extDict_selectMLS(ZSTD_CCtx *zc, const BYTE *ip, const BYTE *const iLimit, size_t *offsetPtr, const U32 maxNbAttempts,
+							   const U32 matchLengthSearch)
+{
+	switch (matchLengthSearch) {
+	default: /* includes case 3 */
+	case 4: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 4, 1);
+	case 5: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 5, 1);
+	case 7:
+	case 6: return ZSTD_HcFindBestMatch_generic(zc, ip, iLimit, offsetPtr, maxNbAttempts, 6, 1);
+	}
+}
+
+/* *******************************
+* Common parser - lazy strategy
+*********************************/
+FORCE_INLINE
+void ZSTD_compressBlock_lazy_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 searchMethod, const U32 depth)
+{
+	seqStore_t *seqStorePtr = &(ctx->seqStore);
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *ip = istart;
+	const BYTE *anchor = istart;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *const ilimit = iend - 8;
+	const BYTE *const base = ctx->base + ctx->dictLimit;
+
+	U32 const maxSearches = 1 << ctx->params.cParams.searchLog;
+	U32 const mls = ctx->params.cParams.searchLength;
+
+	typedef size_t (*searchMax_f)(ZSTD_CCtx * zc, const BYTE *ip, const BYTE *iLimit, size_t *offsetPtr, U32 maxNbAttempts, U32 matchLengthSearch);
+	searchMax_f const searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS : ZSTD_HcFindBestMatch_selectMLS;
+	U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1], savedOffset = 0;
+
+	/* init */
+	ip += (ip == base);
+	ctx->nextToUpdate3 = ctx->nextToUpdate;
+	{
+		U32 const maxRep = (U32)(ip - base);
+		if (offset_2 > maxRep)
+			savedOffset = offset_2, offset_2 = 0;
+		if (offset_1 > maxRep)
+			savedOffset = offset_1, offset_1 = 0;
+	}
+
+	/* Match Loop */
+	while (ip < ilimit) {
+		size_t matchLength = 0;
+		size_t offset = 0;
+		const BYTE *start = ip + 1;
+
+		/* check repCode */
+		if ((offset_1 > 0) & (ZSTD_read32(ip + 1) == ZSTD_read32(ip + 1 - offset_1))) {
+			/* repcode : we take it */
+			matchLength = ZSTD_count(ip + 1 + EQUAL_READ32, ip + 1 + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32;
+			if (depth == 0)
+				goto _storeSequence;
+		}
+
+		/* first search (depth 0) */
+		{
+			size_t offsetFound = 99999999;
+			size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+			if (ml2 > matchLength)
+				matchLength = ml2, start = ip, offset = offsetFound;
+		}
+
+		if (matchLength < EQUAL_READ32) {
+			ip += ((ip - anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */
+			continue;
+		}
+
+		/* let's try to find a better solution */
+		if (depth >= 1)
+			while (ip < ilimit) {
+				ip++;
+				if ((offset) && ((offset_1 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_1)))) {
+					size_t const mlRep = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32;
+					int const gain2 = (int)(mlRep * 3);
+					int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offset + 1) + 1);
+					if ((mlRep >= EQUAL_READ32) && (gain2 > gain1))
+						matchLength = mlRep, offset = 0, start = ip;
+				}
+				{
+					size_t offset2 = 99999999;
+					size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+					int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+					int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 4);
+					if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+						matchLength = ml2, offset = offset2, start = ip;
+						continue; /* search a better one */
+					}
+				}
+
+				/* let's find an even better one */
+				if ((depth == 2) && (ip < ilimit)) {
+					ip++;
+					if ((offset) && ((offset_1 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_1)))) {
+						size_t const ml2 = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_1, iend) + EQUAL_READ32;
+						int const gain2 = (int)(ml2 * 4);
+						int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 1);
+						if ((ml2 >= EQUAL_READ32) && (gain2 > gain1))
+							matchLength = ml2, offset = 0, start = ip;
+					}
+					{
+						size_t offset2 = 99999999;
+						size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+						int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+						int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 7);
+						if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+							matchLength = ml2, offset = offset2, start = ip;
+							continue;
+						}
+					}
+				}
+				break; /* nothing found : store previous solution */
+			}
+
+		/* NOTE:
+		 * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
+		 * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
+		 * overflows the pointer, which is undefined behavior.
+		 */
+		/* catch up */
+		if (offset) {
+			while ((start > anchor) && (start > base + offset - ZSTD_REP_MOVE) &&
+			       (start[-1] == (start-offset+ZSTD_REP_MOVE)[-1])) /* only search for offset within prefix */
+			{
+				start--;
+				matchLength++;
+			}
+			offset_2 = offset_1;
+			offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+		}
+
+		/* store sequence */
+_storeSequence:
+		{
+			size_t const litLength = start - anchor;
+			ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH);
+			anchor = ip = start + matchLength;
+		}
+
+		/* check immediate repcode */
+		while ((ip <= ilimit) && ((offset_2 > 0) & (ZSTD_read32(ip) == ZSTD_read32(ip - offset_2)))) {
+			/* store sequence */
+			matchLength = ZSTD_count(ip + EQUAL_READ32, ip + EQUAL_READ32 - offset_2, iend) + EQUAL_READ32;
+			offset = offset_2;
+			offset_2 = offset_1;
+			offset_1 = (U32)offset; /* swap repcodes */
+			ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH);
+			ip += matchLength;
+			anchor = ip;
+			continue; /* faster when present ... (?) */
+		}
+	}
+
+	/* Save reps for next block */
+	ctx->repToConfirm[0] = offset_1 ? offset_1 : savedOffset;
+	ctx->repToConfirm[1] = offset_2 ? offset_2 : savedOffset;
+
+	/* Last Literals */
+	{
+		size_t const lastLLSize = iend - anchor;
+		memcpy(seqStorePtr->lit, anchor, lastLLSize);
+		seqStorePtr->lit += lastLLSize;
+	}
+}
+
+static void ZSTD_compressBlock_btlazy2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 1, 2); }
+
+static void ZSTD_compressBlock_lazy2(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 2); }
+
+static void ZSTD_compressBlock_lazy(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 1); }
+
+static void ZSTD_compressBlock_greedy(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_generic(ctx, src, srcSize, 0, 0); }
+
+FORCE_INLINE
+void ZSTD_compressBlock_lazy_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const U32 searchMethod, const U32 depth)
+{
+	seqStore_t *seqStorePtr = &(ctx->seqStore);
+	const BYTE *const istart = (const BYTE *)src;
+	const BYTE *ip = istart;
+	const BYTE *anchor = istart;
+	const BYTE *const iend = istart + srcSize;
+	const BYTE *const ilimit = iend - 8;
+	const BYTE *const base = ctx->base;
+	const U32 dictLimit = ctx->dictLimit;
+	const U32 lowestIndex = ctx->lowLimit;
+	const BYTE *const prefixStart = base + dictLimit;
+	const BYTE *const dictBase = ctx->dictBase;
+	const BYTE *const dictEnd = dictBase + dictLimit;
+	const BYTE *const dictStart = dictBase + ctx->lowLimit;
+
+	const U32 maxSearches = 1 << ctx->params.cParams.searchLog;
+	const U32 mls = ctx->params.cParams.searchLength;
+
+	typedef size_t (*searchMax_f)(ZSTD_CCtx * zc, const BYTE *ip, const BYTE *iLimit, size_t *offsetPtr, U32 maxNbAttempts, U32 matchLengthSearch);
+	searchMax_f searchMax = searchMethod ? ZSTD_BtFindBestMatch_selectMLS_extDict : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+	U32 offset_1 = ctx->rep[0], offset_2 = ctx->rep[1];
+
+	/* init */
+	ctx->nextToUpdate3 = ctx->nextToUpdate;
+	ip += (ip == prefixStart);
+
+	/* Match Loop */
+	while (ip < ilimit) {
+		size_t matchLength = 0;
+		size_t offset = 0;
+		const BYTE *start = ip + 1;
+		U32 curr = (U32)(ip - base);
+
+		/* check repCode */
+		{
+			const U32 repIndex = (U32)(curr + 1 - offset_1);
+			const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+			const BYTE *const repMatch = repBase + repIndex;
+			if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional underflow */
+				if (ZSTD_read32(ip + 1) == ZSTD_read32(repMatch)) {
+					/* repcode detected, we should take it */
+					const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+					matchLength =
+					    ZSTD_count_2segments(ip + 1 + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+					if (depth == 0)
+						goto _storeSequence;
+				}
+		}
+
+		/* first search (depth 0) */
+		{
+			size_t offsetFound = 99999999;
+			size_t const ml2 = searchMax(ctx, ip, iend, &offsetFound, maxSearches, mls);
+			if (ml2 > matchLength)
+				matchLength = ml2, start = ip, offset = offsetFound;
+		}
+
+		if (matchLength < EQUAL_READ32) {
+			ip += ((ip - anchor) >> g_searchStrength) + 1; /* jump faster over incompressible sections */
+			continue;
+		}
+
+		/* let's try to find a better solution */
+		if (depth >= 1)
+			while (ip < ilimit) {
+				ip++;
+				curr++;
+				/* check repCode */
+				if (offset) {
+					const U32 repIndex = (U32)(curr - offset_1);
+					const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+					const BYTE *const repMatch = repBase + repIndex;
+					if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional underflow */
+						if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) {
+							/* repcode detected */
+							const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+							size_t const repLength =
+							    ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) +
+							    EQUAL_READ32;
+							int const gain2 = (int)(repLength * 3);
+							int const gain1 = (int)(matchLength * 3 - ZSTD_highbit32((U32)offset + 1) + 1);
+							if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+								matchLength = repLength, offset = 0, start = ip;
+						}
+				}
+
+				/* search match, depth 1 */
+				{
+					size_t offset2 = 99999999;
+					size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+					int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+					int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 4);
+					if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+						matchLength = ml2, offset = offset2, start = ip;
+						continue; /* search a better one */
+					}
+				}
+
+				/* let's find an even better one */
+				if ((depth == 2) && (ip < ilimit)) {
+					ip++;
+					curr++;
+					/* check repCode */
+					if (offset) {
+						const U32 repIndex = (U32)(curr - offset_1);
+						const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+						const BYTE *const repMatch = repBase + repIndex;
+						if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional underflow */
+							if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) {
+								/* repcode detected */
+								const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+								size_t repLength = ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend,
+													repEnd, prefixStart) +
+										   EQUAL_READ32;
+								int gain2 = (int)(repLength * 4);
+								int gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 1);
+								if ((repLength >= EQUAL_READ32) && (gain2 > gain1))
+									matchLength = repLength, offset = 0, start = ip;
+							}
+					}
+
+					/* search match, depth 2 */
+					{
+						size_t offset2 = 99999999;
+						size_t const ml2 = searchMax(ctx, ip, iend, &offset2, maxSearches, mls);
+						int const gain2 = (int)(ml2 * 4 - ZSTD_highbit32((U32)offset2 + 1)); /* raw approx */
+						int const gain1 = (int)(matchLength * 4 - ZSTD_highbit32((U32)offset + 1) + 7);
+						if ((ml2 >= EQUAL_READ32) && (gain2 > gain1)) {
+							matchLength = ml2, offset = offset2, start = ip;
+							continue;
+						}
+					}
+				}
+				break; /* nothing found : store previous solution */
+			}
+
+		/* catch up */
+		if (offset) {
+			U32 const matchIndex = (U32)((start - base) - (offset - ZSTD_REP_MOVE));
+			const BYTE *match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+			const BYTE *const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+			while ((start > anchor) && (match > mStart) && (start[-1] == match[-1])) {
+				start--;
+				match--;
+				matchLength++;
+			} /* catch up */
+			offset_2 = offset_1;
+			offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+		}
+
+		/* store sequence */
+_storeSequence:
+		{
+			size_t const litLength = start - anchor;
+			ZSTD_storeSeq(seqStorePtr, litLength, anchor, (U32)offset, matchLength - MINMATCH);
+			anchor = ip = start + matchLength;
+		}
+
+		/* check immediate repcode */
+		while (ip <= ilimit) {
+			const U32 repIndex = (U32)((ip - base) - offset_2);
+			const BYTE *const repBase = repIndex < dictLimit ? dictBase : base;
+			const BYTE *const repMatch = repBase + repIndex;
+			if (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional underflow */
+				if (ZSTD_read32(ip) == ZSTD_read32(repMatch)) {
+					/* repcode detected, we should take it */
+					const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend;
+					matchLength =
+					    ZSTD_count_2segments(ip + EQUAL_READ32, repMatch + EQUAL_READ32, iend, repEnd, prefixStart) + EQUAL_READ32;
+					offset = offset_2;
+					offset_2 = offset_1;
+					offset_1 = (U32)offset; /* swap offset history */
+					ZSTD_storeSeq(seqStorePtr, 0, anchor, 0, matchLength - MINMATCH);
+					ip += matchLength;
+					anchor = ip;
+					continue; /* faster when present ... (?) */
+				}
+			break;
+		}
+	}
+
+	/* Save reps for next block */
+	ctx->repToConfirm[0] = offset_1;
+	ctx->repToConfirm[1] = offset_2;
+
+	/* Last Literals */
+	{
+		size_t const lastLLSize = iend - anchor;
+		memcpy(seqStorePtr->lit, anchor, lastLLSize);
+		seqStorePtr->lit += lastLLSize;
+	}
+}
+
+void ZSTD_compressBlock_greedy_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize) { ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 0); }
+
+static void ZSTD_compressBlock_lazy_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 1);
+}
+
+static void ZSTD_compressBlock_lazy2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 0, 2);
+}
+
+static void ZSTD_compressBlock_btlazy2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+	ZSTD_compressBlock_lazy_extDict_generic(ctx, src, srcSize, 1, 2);
+}
+
+/* The optimal parser */
+#include "zstd_opt.h"
+
+static void ZSTD_compressBlock_btopt(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+	ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0);
+#else
+	(void)ctx;
+	(void)src;
+	(void)srcSize;
+	return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt2(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+	ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1);
+#else
+	(void)ctx;
+	(void)src;
+	(void)srcSize;
+	return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+	ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 0);
+#else
+	(void)ctx;
+	(void)src;
+	(void)srcSize;
+	return;
+#endif
+}
+
+static void ZSTD_compressBlock_btopt2_extDict(ZSTD_CCtx *ctx, const void *src, size_t srcSize)
+{
+#ifdef ZSTD_OPT_H_91842398743
+	ZSTD_compressBlock_opt_extDict_generic(ctx, src, srcSize, 1);
+#else
+	(void)ctx;
+	(void)src;
+	(void)srcSize;
+	return;
+#endif
+}
+
+typedef void (*ZSTD_blockCompressor)(ZSTD_CCtx *ctx, const void *src, size_t srcSize);
+
+static ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, int extDict)
+{
+	static const ZSTD_blockCompressor blockCompressor[2][8] = {
+	    {ZSTD_compressBlock_fast, ZSTD_compressBlock_doubleFast, ZSTD_compressBlock_greedy, ZSTD_compressBlock_lazy, ZSTD_compressBlock_lazy2,
+	     ZSTD_compressBlock_btlazy2, ZSTD_compressBlock_btopt, ZSTD_compressBlock_btopt2},
+	    {ZSTD_compressBlock_fast_extDict, ZSTD_compressBlock_doubleFast_extDict, ZSTD_compressBlock_greedy_extDict, ZSTD_compressBlock_lazy_extDict,
+	     ZSTD_compressBlock_lazy2_extDict, ZSTD_compressBlock_btlazy2_extDict, ZSTD_compressBlock_btopt_extDict, ZSTD_compressBlock_btopt2_extDict}};
+
+	return blockCompressor[extDict][(U32)strat];
+}
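For illustration, ZSTD_selectBlockCompressor() is the single dispatch point for the eight strategies defined above. A minimal sketch of how a caller is expected to use it (compress_one_block is a made-up helper and only compiles in the context of this file; the extDict test mirrors the one in ZSTD_compressBlock_internal() below):

static void compress_one_block(ZSTD_CCtx *zc, const void *src, size_t srcSize)
{
	/* an extDict variant is needed whenever part of the window
	 * still lives in the old (dictionary) buffer */
	int const extDict = zc->lowLimit < zc->dictLimit;
	ZSTD_blockCompressor const bc = ZSTD_selectBlockCompressor(zc->params.cParams.strategy, extDict);

	bc(zc, src, srcSize); /* fills zc->seqStore with literals and sequences */
	/* the stored sequences are then entropy-coded by ZSTD_compressSequences() */
}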
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx *zc, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+	ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->params.cParams.strategy, zc->lowLimit < zc->dictLimit);
+	const BYTE *const base = zc->base;
+	const BYTE *const istart = (const BYTE *)src;
+	const U32 curr = (U32)(istart - base);
+	if (srcSize < MIN_CBLOCK_SIZE + ZSTD_blockHeaderSize + 1)
+		return 0; /* don't even attempt compression below a certain srcSize */
+	ZSTD_resetSeqStore(&(zc->seqStore));
+	if (curr > zc->nextToUpdate + 384)
+		zc->nextToUpdate = curr - MIN(192, (U32)(curr - zc->nextToUpdate - 384)); /* the tree is not updated over very long rep matches; cap the catch-up distance */
+	blockCompressor(zc, src, srcSize);
+	return ZSTD_compressSequences(zc, dst, dstCapacity, srcSize);
+}
+
+/*! ZSTD_compress_generic() :
+*   Compress a chunk of data into one or multiple blocks.
+*   All blocks will be terminated, all input will be consumed.
+*   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+*   Frame is assumed to be already started (header already produced).
+*   @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_generic(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, U32 lastFrameChunk)
+{
+	size_t blockSize = cctx->blockSize;
+	size_t remaining = srcSize;
+	const BYTE *ip = (const BYTE *)src;
+	BYTE *const ostart = (BYTE *)dst;
+	BYTE *op = ostart;
+	U32 const maxDist = 1 << cctx->params.cParams.windowLog;
+
+	if (cctx->params.fParams.checksumFlag && srcSize)
+		xxh64_update(&cctx->xxhState, src, srcSize);
+
+	while (remaining) {
+		U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+		size_t cSize;
+
+		if (dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE)
+			return ERROR(dstSize_tooSmall); /* not enough space to store compressed block */
+		if (remaining < blockSize)
+			blockSize = remaining;
+
+		/* preemptive overflow correction */
+		if (cctx->lowLimit > (3U << 29)) {
+			U32 const cycleMask = (1 << ZSTD_cycleLog(cctx->params.cParams.hashLog, cctx->params.cParams.strategy)) - 1;
+			U32 const curr = (U32)(ip - cctx->base);
+			U32 const newCurr = (curr & cycleMask) + (1 << cctx->params.cParams.windowLog);
+			U32 const correction = curr - newCurr;
+			ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_64 <= 30);
+			ZSTD_reduceIndex(cctx, correction);
+			cctx->base += correction;
+			cctx->dictBase += correction;
+			cctx->lowLimit -= correction;
+			cctx->dictLimit -= correction;
+			if (cctx->nextToUpdate < correction)
+				cctx->nextToUpdate = 0;
+			else
+				cctx->nextToUpdate -= correction;
+		}
+
+		if ((U32)(ip + blockSize - cctx->base) > cctx->loadedDictEnd + maxDist) {
+			/* enforce maxDist */
+			U32 const newLowLimit = (U32)(ip + blockSize - cctx->base) - maxDist;
+			if (cctx->lowLimit < newLowLimit)
+				cctx->lowLimit = newLowLimit;
+			if (cctx->dictLimit < cctx->lowLimit)
+				cctx->dictLimit = cctx->lowLimit;
+		}
+
+		cSize = ZSTD_compressBlock_internal(cctx, op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, ip, blockSize);
+		if (ZSTD_isError(cSize))
+			return cSize;
+
+		if (cSize == 0) { /* block is not compressible */
+			U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw) << 1) + (U32)(blockSize << 3);
+			if (blockSize + ZSTD_blockHeaderSize > dstCapacity)
+				return ERROR(dstSize_tooSmall);
+			ZSTD_writeLE32(op, cBlockHeader24); /* block header is 3 bytes : the 4th written byte is overwritten by the block content */
+			memcpy(op + ZSTD_blockHeaderSize, ip, blockSize);
+			cSize = ZSTD_blockHeaderSize + blockSize;
+		} else {
+			U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed) << 1) + (U32)(cSize << 3);
+			ZSTD_writeLE24(op, cBlockHeader24);
+			cSize += ZSTD_blockHeaderSize;
+		}
+
+		remaining -= blockSize;
+		dstCapacity -= cSize;
+		ip += blockSize;
+		op += cSize;
+	}
+
+	if (lastFrameChunk && (op > ostart))
+		cctx->stage = ZSTDcs_ending;
+	return op - ostart;
+}
+
+static size_t ZSTD_writeFrameHeader(void *dst, size_t dstCapacity, ZSTD_parameters params, U64 pledgedSrcSize, U32 dictID)
+{
+	BYTE *const op = (BYTE *)dst;
+	U32 const dictIDSizeCode = (dictID > 0) + (dictID >= 256) + (dictID >= 65536); /* 0-3 */
+	U32 const checksumFlag = params.fParams.checksumFlag > 0;
+	U32 const windowSize = 1U << params.cParams.windowLog;
+	U32 const singleSegment = params.fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+	BYTE const windowLogByte = (BYTE)((params.cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+	U32 const fcsCode =
+	    params.fParams.contentSizeFlag ? (pledgedSrcSize >= 256) + (pledgedSrcSize >= 65536 + 256) + (pledgedSrcSize >= 0xFFFFFFFFU) : 0; /* 0-3 */
+	BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag << 2) + (singleSegment << 5) + (fcsCode << 6));
+	size_t pos;
+
+	if (dstCapacity < ZSTD_frameHeaderSize_max)
+		return ERROR(dstSize_tooSmall);
+
+	ZSTD_writeLE32(dst, ZSTD_MAGICNUMBER);
+	op[4] = frameHeaderDescriptionByte;
+	pos = 5;
+	if (!singleSegment)
+		op[pos++] = windowLogByte;
+	switch (dictIDSizeCode) {
+	default: /* impossible */
+	case 0: break;
+	case 1:
+		op[pos] = (BYTE)(dictID);
+		pos++;
+		break;
+	case 2:
+		ZSTD_writeLE16(op + pos, (U16)dictID);
+		pos += 2;
+		break;
+	case 3:
+		ZSTD_writeLE32(op + pos, dictID);
+		pos += 4;
+		break;
+	}
+	switch (fcsCode) {
+	default: /* impossible */
+	case 0:
+		if (singleSegment)
+			op[pos++] = (BYTE)(pledgedSrcSize);
+		break;
+	case 1:
+		ZSTD_writeLE16(op + pos, (U16)(pledgedSrcSize - 256));
+		pos += 2;
+		break;
+	case 2:
+		ZSTD_writeLE32(op + pos, (U32)(pledgedSrcSize));
+		pos += 4;
+		break;
+	case 3:
+		ZSTD_writeLE64(op + pos, (U64)(pledgedSrcSize));
+		pos += 8;
+		break;
+	}
+	return pos;
+}
+
+static size_t ZSTD_compressContinue_internal(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, U32 frame, U32 lastFrameChunk)
+{
+	const BYTE *const ip = (const BYTE *)src;
+	size_t fhSize = 0;
+
+	if (cctx->stage == ZSTDcs_created)
+		return ERROR(stage_wrong); /* missing init (ZSTD_compressBegin) */
+
+	if (frame && (cctx->stage == ZSTDcs_init)) {
+		fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, cctx->frameContentSize, cctx->dictID);
+		if (ZSTD_isError(fhSize))
+			return fhSize;
+		dstCapacity -= fhSize;
+		dst = (char *)dst + fhSize;
+		cctx->stage = ZSTDcs_ongoing;
+	}
+
+	/* Check if blocks follow each other */
+	if (src != cctx->nextSrc) {
+		/* not contiguous */
+		ptrdiff_t const delta = cctx->nextSrc - ip;
+		cctx->lowLimit = cctx->dictLimit;
+		cctx->dictLimit = (U32)(cctx->nextSrc - cctx->base);
+		cctx->dictBase = cctx->base;
+		cctx->base -= delta;
+		cctx->nextToUpdate = cctx->dictLimit;
+		if (cctx->dictLimit - cctx->lowLimit < HASH_READ_SIZE)
+			cctx->lowLimit = cctx->dictLimit; /* too small extDict */
+	}
+
+	/* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+	if ((ip + srcSize > cctx->dictBase + cctx->lowLimit) & (ip < cctx->dictBase + cctx->dictLimit)) {
+		ptrdiff_t const highInputIdx = (ip + srcSize) - cctx->dictBase;
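+		/* raise lowLimit to drop the overlapped (presumed clobbered) part of the dictionary, capped at dictLimit */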
+		U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)cctx->dictLimit) ? cctx->dictLimit : (U32)highInputIdx;
+		cctx->lowLimit = lowLimitMax;
+	}
+
+	cctx->nextSrc = ip + srcSize;
+
+	if (srcSize) {
+		size_t const cSize = frame ? ZSTD_compress_generic(cctx, dst, dstCapacity, src, srcSize, lastFrameChunk)
+					   : ZSTD_compressBlock_internal(cctx, dst, dstCapacity, src, srcSize);
+		if (ZSTD_isError(cSize))
+			return cSize;
+		return cSize + fhSize;
+	} else
+		return fhSize;
+}
+
+size_t ZSTD_compressContinue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+	return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 0);
+}
+
+size_t ZSTD_getBlockSizeMax(ZSTD_CCtx *cctx) { return MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, 1 << cctx->params.cParams.windowLog); }
+
+size_t ZSTD_compressBlock(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+	size_t const blockSizeMax = ZSTD_getBlockSizeMax(cctx);
+	if (srcSize > blockSizeMax)
+		return ERROR(srcSize_wrong);
+	return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0, 0);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ *	@return : 0, or an error code
+ */
+static size_t ZSTD_loadDictionaryContent(ZSTD_CCtx *zc, const void *src, size_t srcSize)
+{
+	const BYTE *const ip = (const BYTE *)src;
+	const BYTE *const iend = ip + srcSize;
+
+	/* input becomes curr prefix */
+	zc->lowLimit = zc->dictLimit;
+	zc->dictLimit = (U32)(zc->nextSrc - zc->base);
+	zc->dictBase = zc->base;
+	zc->base += ip - zc->nextSrc;
+	zc->nextToUpdate = zc->dictLimit;
+	zc->loadedDictEnd = zc->forceWindow ? 0 : (U32)(iend - zc->base);
+
+	zc->nextSrc = iend;
+	if (srcSize <= HASH_READ_SIZE)
+		return 0;
+
+	switch (zc->params.cParams.strategy) {
+	case ZSTD_fast: ZSTD_fillHashTable(zc, iend, zc->params.cParams.searchLength); break;
+
+	case ZSTD_dfast: ZSTD_fillDoubleHashTable(zc, iend, zc->params.cParams.searchLength); break;
+
+	case ZSTD_greedy:
+	case ZSTD_lazy:
+	case ZSTD_lazy2:
+		if (srcSize >= HASH_READ_SIZE)
+			ZSTD_insertAndFindFirstIndex(zc, iend - HASH_READ_SIZE, zc->params.cParams.searchLength);
+		break;
+
+	case ZSTD_btlazy2:
+	case ZSTD_btopt:
+	case ZSTD_btopt2:
+		if (srcSize >= HASH_READ_SIZE)
+			ZSTD_updateTree(zc, iend - HASH_READ_SIZE, iend, 1 << zc->params.cParams.searchLog, zc->params.cParams.searchLength);
+		break;
+
+	default:
+		return ERROR(GENERIC); /* strategy doesn't exist; impossible */
+	}
+
+	zc->nextToUpdate = (U32)(iend - zc->base);
+	return 0;
+}
+
+/* Dictionaries that assign zero probability to symbols that show up cause problems
+   during FSE encoding. Refuse dictionaries that assign zero probability to symbols
+   that we may encounter during compression.
+   NOTE: This behavior is not standard and could be improved in the future. */
+static size_t ZSTD_checkDictNCount(short *normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue)
+{
+	U32 s;
+	if (dictMaxSymbolValue < maxSymbolValue)
+		return ERROR(dictionary_corrupted);
+	for (s = 0; s <= maxSymbolValue; ++s) {
+		if (normalizedCounter[s] == 0)
+			return ERROR(dictionary_corrupted);
+	}
+	return 0;
+}
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*!
ZSTD_loadZstdDictionary() : + * @return : 0, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed > 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_CCtx *cctx, const void *dict, size_t dictSize) +{ + const BYTE *dictPtr = (const BYTE *)dict; + const BYTE *const dictEnd = dictPtr + dictSize; + short offcodeNCount[MaxOff + 1]; + unsigned offcodeMaxValue = MaxOff; + + dictPtr += 4; /* skip magic number */ + cctx->dictID = cctx->params.fParams.noDictIDFlag ? 0 : ZSTD_readLE32(dictPtr); + dictPtr += 4; + + { + size_t const hufHeaderSize = HUF_readCTable_wksp(cctx->hufTable, 255, dictPtr, dictEnd - dictPtr, cctx->tmpCounters, sizeof(cctx->tmpCounters)); + if (HUF_isError(hufHeaderSize)) + return ERROR(dictionary_corrupted); + dictPtr += hufHeaderSize; + } + + { + unsigned offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(offcodeHeaderSize)) + return ERROR(dictionary_corrupted); + if (offcodeLog > OffFSELog) + return ERROR(dictionary_corrupted); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + CHECK_E(FSE_buildCTable_wksp(cctx->offcodeCTable, offcodeNCount, offcodeMaxValue, offcodeLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)), + dictionary_corrupted); + dictPtr += offcodeHeaderSize; + } + + { + short matchlengthNCount[MaxML + 1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(matchlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (matchlengthLog > MLFSELog) + return ERROR(dictionary_corrupted); + /* Every match length code must have non-zero probability */ + CHECK_F(ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML)); + CHECK_E( + FSE_buildCTable_wksp(cctx->matchlengthCTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)), + dictionary_corrupted); + dictPtr += matchlengthHeaderSize; + } + + { + short litlengthNCount[MaxLL + 1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(litlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (litlengthLog > LLFSELog) + return ERROR(dictionary_corrupted); + /* Every literal length code must have non-zero probability */ + CHECK_F(ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL)); + CHECK_E(FSE_buildCTable_wksp(cctx->litlengthCTable, litlengthNCount, litlengthMaxValue, litlengthLog, cctx->tmpCounters, sizeof(cctx->tmpCounters)), + dictionary_corrupted); + dictPtr += litlengthHeaderSize; + } + + if (dictPtr + 12 > dictEnd) + return ERROR(dictionary_corrupted); + cctx->rep[0] = ZSTD_readLE32(dictPtr + 0); + cctx->rep[1] = ZSTD_readLE32(dictPtr + 4); + cctx->rep[2] = ZSTD_readLE32(dictPtr + 8); + dictPtr += 12; + + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= dictContentSize + 128 KB must be representable */ + 
CHECK_F(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff))); + /* All repCodes must be <= dictContentSize and != 0*/ + { + U32 u; + for (u = 0; u < 3; u++) { + if (cctx->rep[u] == 0) + return ERROR(dictionary_corrupted); + if (cctx->rep[u] > dictContentSize) + return ERROR(dictionary_corrupted); + } + } + + cctx->flagStaticTables = 1; + cctx->flagStaticHufTable = HUF_repeat_valid; + return ZSTD_loadDictionaryContent(cctx, dictPtr, dictContentSize); + } +} + +/** ZSTD_compress_insertDictionary() : +* @return : 0, or an error code */ +static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx *cctx, const void *dict, size_t dictSize) +{ + if ((dict == NULL) || (dictSize <= 8)) + return 0; + + /* dict as pure content */ + if ((ZSTD_readLE32(dict) != ZSTD_DICT_MAGIC) || (cctx->forceRawDict)) + return ZSTD_loadDictionaryContent(cctx, dict, dictSize); + + /* dict as zstd dictionary */ + return ZSTD_loadZstdDictionary(cctx, dict, dictSize); +} + +/*! ZSTD_compressBegin_internal() : +* @return : 0, or an error code */ +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, ZSTD_parameters params, U64 pledgedSrcSize) +{ + ZSTD_compResetPolicy_e const crp = dictSize ? ZSTDcrp_fullReset : ZSTDcrp_continue; + CHECK_F(ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp)); + return ZSTD_compress_insertDictionary(cctx, dict, dictSize); +} + +/*! ZSTD_compressBegin_advanced() : +* @return : 0, or an error code */ +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + /* compression parameters verification and optimization */ + CHECK_F(ZSTD_checkCParams(params.cParams)); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, pledgedSrcSize); +} + +size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx *cctx, const void *dict, size_t dictSize, int compressionLevel) +{ + ZSTD_parameters const params = ZSTD_getParams(compressionLevel, 0, dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, 0); +} + +size_t ZSTD_compressBegin(ZSTD_CCtx *cctx, int compressionLevel) { return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); } + +/*! ZSTD_writeEpilogue() : +* Ends a frame. 
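+* Writes a final empty raw block if none was marked "last" yet, then the optional 32-bit checksum.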
+* @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity)
+{
+	BYTE *const ostart = (BYTE *)dst;
+	BYTE *op = ostart;
+	size_t fhSize = 0;
+
+	if (cctx->stage == ZSTDcs_created)
+		return ERROR(stage_wrong); /* init missing */
+
+	/* special case : empty frame */
+	if (cctx->stage == ZSTDcs_init) {
+		fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, cctx->params, 0, 0);
+		if (ZSTD_isError(fhSize))
+			return fhSize;
+		dstCapacity -= fhSize;
+		op += fhSize;
+		cctx->stage = ZSTDcs_ongoing;
+	}
+
+	if (cctx->stage != ZSTDcs_ending) {
+		/* write one last empty block, make it the "last" block */
+		U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw) << 1) + 0;
+		if (dstCapacity < 4)
+			return ERROR(dstSize_tooSmall);
+		ZSTD_writeLE32(op, cBlockHeader24);
+		op += ZSTD_blockHeaderSize;
+		dstCapacity -= ZSTD_blockHeaderSize;
+	}
+
+	if (cctx->params.fParams.checksumFlag) {
+		U32 const checksum = (U32)xxh64_digest(&cctx->xxhState);
+		if (dstCapacity < 4)
+			return ERROR(dstSize_tooSmall);
+		ZSTD_writeLE32(op, checksum);
+		op += 4;
+	}
+
+	cctx->stage = ZSTDcs_created; /* return to "created but no init" status */
+	return op - ostart;
+}
+
+size_t ZSTD_compressEnd(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize)
+{
+	size_t endResult;
+	size_t const cSize = ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1, 1);
+	if (ZSTD_isError(cSize))
+		return cSize;
+	endResult = ZSTD_writeEpilogue(cctx, (char *)dst + cSize, dstCapacity - cSize);
+	if (ZSTD_isError(endResult))
+		return endResult;
+	return cSize + endResult;
+}
+
+static size_t ZSTD_compress_internal(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize,
+				     ZSTD_parameters params)
+{
+	CHECK_F(ZSTD_compressBegin_internal(cctx, dict, dictSize, params, srcSize));
+	return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize,
+			       ZSTD_parameters params)
+{
+	return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, dict, dictSize, params);
+}
+
+size_t ZSTD_compressCCtx(ZSTD_CCtx *ctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, ZSTD_parameters params)
+{
+	return ZSTD_compress_internal(ctx, dst, dstCapacity, src, srcSize, NULL, 0, params);
+}
+
+/* ===== Dictionary API ===== */
+
+struct ZSTD_CDict_s {
+	void *dictBuffer;
+	const void *dictContent;
+	size_t dictContentSize;
+	ZSTD_CCtx *refContext;
+}; /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+size_t ZSTD_CDictWorkspaceBound(ZSTD_compressionParameters cParams) { return ZSTD_CCtxWorkspaceBound(cParams) + ZSTD_ALIGN(sizeof(ZSTD_CDict)); }
+
+static ZSTD_CDict *ZSTD_createCDict_advanced(const void *dictBuffer, size_t dictSize, unsigned byReference, ZSTD_parameters params, ZSTD_customMem customMem)
+{
+	if (!customMem.customAlloc || !customMem.customFree)
+		return NULL;
+
+	{
+		ZSTD_CDict *const cdict = (ZSTD_CDict *)ZSTD_malloc(sizeof(ZSTD_CDict), customMem);
+		ZSTD_CCtx *const cctx = ZSTD_createCCtx_advanced(customMem);
+
+		if (!cdict || !cctx) {
+			ZSTD_free(cdict, customMem);
+			ZSTD_freeCCtx(cctx);
+			return NULL;
+		}
+
+		if ((byReference) || (!dictBuffer) || (!dictSize)) {
+			cdict->dictBuffer = NULL;
+			cdict->dictContent = dictBuffer;
+		} else {
+			void *const internalBuffer = ZSTD_malloc(dictSize, customMem);
+			if
(!internalBuffer) { + ZSTD_free(cctx, customMem); + ZSTD_free(cdict, customMem); + return NULL; + } + memcpy(internalBuffer, dictBuffer, dictSize); + cdict->dictBuffer = internalBuffer; + cdict->dictContent = internalBuffer; + } + + { + size_t const errorCode = ZSTD_compressBegin_advanced(cctx, cdict->dictContent, dictSize, params, 0); + if (ZSTD_isError(errorCode)) { + ZSTD_free(cdict->dictBuffer, customMem); + ZSTD_free(cdict, customMem); + ZSTD_freeCCtx(cctx); + return NULL; + } + } + + cdict->refContext = cctx; + cdict->dictContentSize = dictSize; + return cdict; + } +} + +ZSTD_CDict *ZSTD_initCDict(const void *dict, size_t dictSize, ZSTD_parameters params, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + return ZSTD_createCDict_advanced(dict, dictSize, 1, params, stackMem); +} + +size_t ZSTD_freeCDict(ZSTD_CDict *cdict) +{ + if (cdict == NULL) + return 0; /* support free on NULL */ + { + ZSTD_customMem const cMem = cdict->refContext->customMem; + ZSTD_freeCCtx(cdict->refContext); + ZSTD_free(cdict->dictBuffer, cMem); + ZSTD_free(cdict, cMem); + return 0; + } +} + +static ZSTD_parameters ZSTD_getParamsFromCDict(const ZSTD_CDict *cdict) { return ZSTD_getParamsFromCCtx(cdict->refContext); } + +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx *cctx, const ZSTD_CDict *cdict, unsigned long long pledgedSrcSize) +{ + if (cdict->dictContentSize) + CHECK_F(ZSTD_copyCCtx(cctx, cdict->refContext, pledgedSrcSize)) + else { + ZSTD_parameters params = cdict->refContext->params; + params.fParams.contentSizeFlag = (pledgedSrcSize > 0); + CHECK_F(ZSTD_compressBegin_advanced(cctx, NULL, 0, params, pledgedSrcSize)); + } + return 0; +} + +/*! ZSTD_compress_usingCDict() : +* Compression using a digested Dictionary. +* Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. 
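+* Illustrative usage sketch (buffer and workspace names are placeholders, error checks omitted) :
+*   ZSTD_CDict *const cdict = ZSTD_initCDict(dictBuf, dictSize, params, wksp, wkspSize);
+*   size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, src, srcSize, cdict);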
+* Note that compression level is decided during dictionary creation */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx *cctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const ZSTD_CDict *cdict) +{ + CHECK_F(ZSTD_compressBegin_usingCDict(cctx, cdict, srcSize)); + + if (cdict->refContext->params.fParams.contentSizeFlag == 1) { + cctx->params.fParams.contentSizeFlag = 1; + cctx->frameContentSize = srcSize; + } else { + cctx->params.fParams.contentSizeFlag = 0; + } + + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +/* ****************************************************************** +* Streaming +********************************************************************/ + +typedef enum { zcss_init, zcss_load, zcss_flush, zcss_final } ZSTD_cStreamStage; + +struct ZSTD_CStream_s { + ZSTD_CCtx *cctx; + ZSTD_CDict *cdictLocal; + const ZSTD_CDict *cdict; + char *inBuff; + size_t inBuffSize; + size_t inToCompress; + size_t inBuffPos; + size_t inBuffTarget; + size_t blockSize; + char *outBuff; + size_t outBuffSize; + size_t outBuffContentSize; + size_t outBuffFlushedSize; + ZSTD_cStreamStage stage; + U32 checksum; + U32 frameEnded; + U64 pledgedSrcSize; + U64 inputProcessed; + ZSTD_parameters params; + ZSTD_customMem customMem; +}; /* typedef'd to ZSTD_CStream within "zstd.h" */ + +size_t ZSTD_CStreamWorkspaceBound(ZSTD_compressionParameters cParams) +{ + size_t const inBuffSize = (size_t)1 << cParams.windowLog; + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, inBuffSize); + size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; + + return ZSTD_CCtxWorkspaceBound(cParams) + ZSTD_ALIGN(sizeof(ZSTD_CStream)) + ZSTD_ALIGN(inBuffSize) + ZSTD_ALIGN(outBuffSize); +} + +ZSTD_CStream *ZSTD_createCStream_advanced(ZSTD_customMem customMem) +{ + ZSTD_CStream *zcs; + + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + zcs = (ZSTD_CStream *)ZSTD_malloc(sizeof(ZSTD_CStream), customMem); + if (zcs == NULL) + return NULL; + memset(zcs, 0, sizeof(ZSTD_CStream)); + memcpy(&zcs->customMem, &customMem, sizeof(ZSTD_customMem)); + zcs->cctx = ZSTD_createCCtx_advanced(customMem); + if (zcs->cctx == NULL) { + ZSTD_freeCStream(zcs); + return NULL; + } + return zcs; +} + +size_t ZSTD_freeCStream(ZSTD_CStream *zcs) +{ + if (zcs == NULL) + return 0; /* support free on NULL */ + { + ZSTD_customMem const cMem = zcs->customMem; + ZSTD_freeCCtx(zcs->cctx); + zcs->cctx = NULL; + ZSTD_freeCDict(zcs->cdictLocal); + zcs->cdictLocal = NULL; + ZSTD_free(zcs->inBuff, cMem); + zcs->inBuff = NULL; + ZSTD_free(zcs->outBuff, cMem); + zcs->outBuff = NULL; + ZSTD_free(zcs, cMem); + return 0; + } +} + +/*====== Initialization ======*/ + +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; } +size_t ZSTD_CStreamOutSize(void) { return ZSTD_compressBound(ZSTD_BLOCKSIZE_ABSOLUTEMAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */; } + +static size_t ZSTD_resetCStream_internal(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize) +{ + if (zcs->inBuffSize == 0) + return ERROR(stage_wrong); /* zcs has not been init at least once => can't reset */ + + if (zcs->cdict) + CHECK_F(ZSTD_compressBegin_usingCDict(zcs->cctx, zcs->cdict, pledgedSrcSize)) + else + CHECK_F(ZSTD_compressBegin_advanced(zcs->cctx, NULL, 0, zcs->params, pledgedSrcSize)); + + zcs->inToCompress = 0; + zcs->inBuffPos = 0; + zcs->inBuffTarget = zcs->blockSize; + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + zcs->stage = zcss_load; + zcs->frameEnded = 0; + zcs->pledgedSrcSize = 
pledgedSrcSize; + zcs->inputProcessed = 0; + return 0; /* ready to go */ +} + +size_t ZSTD_resetCStream(ZSTD_CStream *zcs, unsigned long long pledgedSrcSize) +{ + + zcs->params.fParams.contentSizeFlag = (pledgedSrcSize > 0); + + return ZSTD_resetCStream_internal(zcs, pledgedSrcSize); +} + +static size_t ZSTD_initCStream_advanced(ZSTD_CStream *zcs, const void *dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + /* allocate buffers */ + { + size_t const neededInBuffSize = (size_t)1 << params.cParams.windowLog; + if (zcs->inBuffSize < neededInBuffSize) { + zcs->inBuffSize = neededInBuffSize; + ZSTD_free(zcs->inBuff, zcs->customMem); + zcs->inBuff = (char *)ZSTD_malloc(neededInBuffSize, zcs->customMem); + if (zcs->inBuff == NULL) + return ERROR(memory_allocation); + } + zcs->blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, neededInBuffSize); + } + if (zcs->outBuffSize < ZSTD_compressBound(zcs->blockSize) + 1) { + zcs->outBuffSize = ZSTD_compressBound(zcs->blockSize) + 1; + ZSTD_free(zcs->outBuff, zcs->customMem); + zcs->outBuff = (char *)ZSTD_malloc(zcs->outBuffSize, zcs->customMem); + if (zcs->outBuff == NULL) + return ERROR(memory_allocation); + } + + if (dict && dictSize >= 8) { + ZSTD_freeCDict(zcs->cdictLocal); + zcs->cdictLocal = ZSTD_createCDict_advanced(dict, dictSize, 0, params, zcs->customMem); + if (zcs->cdictLocal == NULL) + return ERROR(memory_allocation); + zcs->cdict = zcs->cdictLocal; + } else + zcs->cdict = NULL; + + zcs->checksum = params.fParams.checksumFlag > 0; + zcs->params = params; + + return ZSTD_resetCStream_internal(zcs, pledgedSrcSize); +} + +ZSTD_CStream *ZSTD_initCStream(ZSTD_parameters params, unsigned long long pledgedSrcSize, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + ZSTD_CStream *const zcs = ZSTD_createCStream_advanced(stackMem); + if (zcs) { + size_t const code = ZSTD_initCStream_advanced(zcs, NULL, 0, params, pledgedSrcSize); + if (ZSTD_isError(code)) { + return NULL; + } + } + return zcs; +} + +ZSTD_CStream *ZSTD_initCStream_usingCDict(const ZSTD_CDict *cdict, unsigned long long pledgedSrcSize, void *workspace, size_t workspaceSize) +{ + ZSTD_parameters const params = ZSTD_getParamsFromCDict(cdict); + ZSTD_CStream *const zcs = ZSTD_initCStream(params, pledgedSrcSize, workspace, workspaceSize); + if (zcs) { + zcs->cdict = cdict; + if (ZSTD_isError(ZSTD_resetCStream_internal(zcs, pledgedSrcSize))) { + return NULL; + } + } + return zcs; +} + +/*====== Compression ======*/ + +typedef enum { zsf_gather, zsf_flush, zsf_end } ZSTD_flush_e; + +ZSTD_STATIC size_t ZSTD_limitCopy(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + memcpy(dst, src, length); + return length; +} + +static size_t ZSTD_compressStream_generic(ZSTD_CStream *zcs, void *dst, size_t *dstCapacityPtr, const void *src, size_t *srcSizePtr, ZSTD_flush_e const flush) +{ + U32 someMoreWork = 1; + const char *const istart = (const char *)src; + const char *const iend = istart + *srcSizePtr; + const char *ip = istart; + char *const ostart = (char *)dst; + char *const oend = ostart + *dstCapacityPtr; + char *op = ostart; + + while (someMoreWork) { + switch (zcs->stage) { + case zcss_init: + return ERROR(init_missing); /* call ZBUFF_compressInit() first ! 
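+						      (i.e. ZSTD_initCStream() or ZSTD_initCStream_usingCDict() in this port)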
*/ + + case zcss_load: + /* complete inBuffer */ + { + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy(zcs->inBuff + zcs->inBuffPos, toLoad, ip, iend - ip); + zcs->inBuffPos += loaded; + ip += loaded; + if ((zcs->inBuffPos == zcs->inToCompress) || (!flush && (toLoad != loaded))) { + someMoreWork = 0; + break; /* not enough input to get a full block : stop there, wait for more */ + } + } + /* compress curr block (note : this stage cannot be stopped in the middle) */ + { + void *cDst; + size_t cSize; + size_t const iSize = zcs->inBuffPos - zcs->inToCompress; + size_t oSize = oend - op; + if (oSize >= ZSTD_compressBound(iSize)) + cDst = op; /* compress directly into output buffer (avoid flush stage) */ + else + cDst = zcs->outBuff, oSize = zcs->outBuffSize; + cSize = (flush == zsf_end) ? ZSTD_compressEnd(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize) + : ZSTD_compressContinue(zcs->cctx, cDst, oSize, zcs->inBuff + zcs->inToCompress, iSize); + if (ZSTD_isError(cSize)) + return cSize; + if (flush == zsf_end) + zcs->frameEnded = 1; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; /* note : inBuffSize >= blockSize */ + zcs->inToCompress = zcs->inBuffPos; + if (cDst == op) { + op += cSize; + break; + } /* no need to flush */ + zcs->outBuffContentSize = cSize; + zcs->outBuffFlushedSize = 0; + zcs->stage = zcss_flush; /* pass-through to flush stage */ + } + + case zcss_flush: { + size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, oend - op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + op += flushed; + zcs->outBuffFlushedSize += flushed; + if (toFlush != flushed) { + someMoreWork = 0; + break; + } /* dst too small to store flushed data : stop there */ + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + zcs->stage = zcss_load; + break; + } + + case zcss_final: + someMoreWork = 0; /* do nothing */ + break; + + default: + return ERROR(GENERIC); /* impossible */ + } + } + + *srcSizePtr = ip - istart; + *dstCapacityPtr = op - ostart; + zcs->inputProcessed += *srcSizePtr; + if (zcs->frameEnded) + return 0; + { + size_t hintInSize = zcs->inBuffTarget - zcs->inBuffPos; + if (hintInSize == 0) + hintInSize = zcs->blockSize; + return hintInSize; + } +} + +size_t ZSTD_compressStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output, ZSTD_inBuffer *input) +{ + size_t sizeRead = input->size - input->pos; + size_t sizeWritten = output->size - output->pos; + size_t const result = + ZSTD_compressStream_generic(zcs, (char *)(output->dst) + output->pos, &sizeWritten, (const char *)(input->src) + input->pos, &sizeRead, zsf_gather); + input->pos += sizeRead; + output->pos += sizeWritten; + return result; +} + +/*====== Finalize ======*/ + +/*! 
ZSTD_flushStream() : +* @return : amount of data remaining to flush */ +size_t ZSTD_flushStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output) +{ + size_t srcSize = 0; + size_t sizeWritten = output->size - output->pos; + size_t const result = ZSTD_compressStream_generic(zcs, (char *)(output->dst) + output->pos, &sizeWritten, &srcSize, + &srcSize, /* use a valid src address instead of NULL */ + zsf_flush); + output->pos += sizeWritten; + if (ZSTD_isError(result)) + return result; + return zcs->outBuffContentSize - zcs->outBuffFlushedSize; /* remaining to flush */ +} + +size_t ZSTD_endStream(ZSTD_CStream *zcs, ZSTD_outBuffer *output) +{ + BYTE *const ostart = (BYTE *)(output->dst) + output->pos; + BYTE *const oend = (BYTE *)(output->dst) + output->size; + BYTE *op = ostart; + + if ((zcs->pledgedSrcSize) && (zcs->inputProcessed != zcs->pledgedSrcSize)) + return ERROR(srcSize_wrong); /* pledgedSrcSize not respected */ + + if (zcs->stage != zcss_final) { + /* flush whatever remains */ + size_t srcSize = 0; + size_t sizeWritten = output->size - output->pos; + size_t const notEnded = + ZSTD_compressStream_generic(zcs, ostart, &sizeWritten, &srcSize, &srcSize, zsf_end); /* use a valid src address instead of NULL */ + size_t const remainingToFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + op += sizeWritten; + if (remainingToFlush) { + output->pos += sizeWritten; + return remainingToFlush + ZSTD_BLOCKHEADERSIZE /* final empty block */ + (zcs->checksum * 4); + } + /* create epilogue */ + zcs->stage = zcss_final; + zcs->outBuffContentSize = !notEnded ? 0 : ZSTD_compressEnd(zcs->cctx, zcs->outBuff, zcs->outBuffSize, NULL, + 0); /* write epilogue, including final empty block, into outBuff */ + } + + /* flush epilogue */ + { + size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, oend - op, zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + op += flushed; + zcs->outBuffFlushedSize += flushed; + output->pos += op - ostart; + if (toFlush == flushed) + zcs->stage = zcss_init; /* end reached */ + return toFlush - flushed; + } +} + +/*-===== Pre-defined compression levels =====-*/ + +#define ZSTD_DEFAULT_CLEVEL 1 +#define ZSTD_MAX_CLEVEL 22 +int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL + 1] = { + { + /* "default" */ + /* W, C, H, S, L, TL, strat */ + {18, 12, 12, 1, 7, 16, ZSTD_fast}, /* level 0 - never used */ + {19, 13, 14, 1, 7, 16, ZSTD_fast}, /* level 1 */ + {19, 15, 16, 1, 6, 16, ZSTD_fast}, /* level 2 */ + {20, 16, 17, 1, 5, 16, ZSTD_dfast}, /* level 3.*/ + {20, 18, 18, 1, 5, 16, ZSTD_dfast}, /* level 4.*/ + {20, 15, 18, 3, 5, 16, ZSTD_greedy}, /* level 5 */ + {21, 16, 19, 2, 5, 16, ZSTD_lazy}, /* level 6 */ + {21, 17, 20, 3, 5, 16, ZSTD_lazy}, /* level 7 */ + {21, 18, 20, 3, 5, 16, ZSTD_lazy2}, /* level 8 */ + {21, 20, 20, 3, 5, 16, ZSTD_lazy2}, /* level 9 */ + {21, 19, 21, 4, 5, 16, ZSTD_lazy2}, /* level 10 */ + {22, 20, 22, 4, 5, 16, ZSTD_lazy2}, /* level 11 */ + {22, 20, 22, 5, 5, 16, ZSTD_lazy2}, /* level 12 */ + {22, 21, 22, 5, 5, 16, ZSTD_lazy2}, /* level 13 */ + {22, 21, 22, 6, 5, 16, ZSTD_lazy2}, /* level 14 */ + {22, 21, 21, 5, 5, 16, ZSTD_btlazy2}, /* level 15 */ + {23, 22, 22, 5, 5, 16, ZSTD_btlazy2}, /* level 16 */ + {23, 21, 22, 4, 5, 24, ZSTD_btopt}, /* level 17 */ + {23, 23, 22, 6, 5, 32, ZSTD_btopt}, /* level 18 */ + {23, 23, 22, 6, 3, 48, ZSTD_btopt}, /* level 19 */ + {25, 25, 23, 7, 3, 64, ZSTD_btopt2}, /* level 20 */ 
+ {26, 26, 23, 7, 3, 256, ZSTD_btopt2}, /* level 21 */ + {27, 27, 25, 9, 3, 512, ZSTD_btopt2}, /* level 22 */ + }, + { + /* for srcSize <= 256 KB */ + /* W, C, H, S, L, T, strat */ + {0, 0, 0, 0, 0, 0, ZSTD_fast}, /* level 0 - not used */ + {18, 13, 14, 1, 6, 8, ZSTD_fast}, /* level 1 */ + {18, 14, 13, 1, 5, 8, ZSTD_dfast}, /* level 2 */ + {18, 16, 15, 1, 5, 8, ZSTD_dfast}, /* level 3 */ + {18, 15, 17, 1, 5, 8, ZSTD_greedy}, /* level 4.*/ + {18, 16, 17, 4, 5, 8, ZSTD_greedy}, /* level 5.*/ + {18, 16, 17, 3, 5, 8, ZSTD_lazy}, /* level 6.*/ + {18, 17, 17, 4, 4, 8, ZSTD_lazy}, /* level 7 */ + {18, 17, 17, 4, 4, 8, ZSTD_lazy2}, /* level 8 */ + {18, 17, 17, 5, 4, 8, ZSTD_lazy2}, /* level 9 */ + {18, 17, 17, 6, 4, 8, ZSTD_lazy2}, /* level 10 */ + {18, 18, 17, 6, 4, 8, ZSTD_lazy2}, /* level 11.*/ + {18, 18, 17, 7, 4, 8, ZSTD_lazy2}, /* level 12.*/ + {18, 19, 17, 6, 4, 8, ZSTD_btlazy2}, /* level 13 */ + {18, 18, 18, 4, 4, 16, ZSTD_btopt}, /* level 14.*/ + {18, 18, 18, 4, 3, 16, ZSTD_btopt}, /* level 15.*/ + {18, 19, 18, 6, 3, 32, ZSTD_btopt}, /* level 16.*/ + {18, 19, 18, 8, 3, 64, ZSTD_btopt}, /* level 17.*/ + {18, 19, 18, 9, 3, 128, ZSTD_btopt}, /* level 18.*/ + {18, 19, 18, 10, 3, 256, ZSTD_btopt}, /* level 19.*/ + {18, 19, 18, 11, 3, 512, ZSTD_btopt2}, /* level 20.*/ + {18, 19, 18, 12, 3, 512, ZSTD_btopt2}, /* level 21.*/ + {18, 19, 18, 13, 3, 512, ZSTD_btopt2}, /* level 22.*/ + }, + { + /* for srcSize <= 128 KB */ + /* W, C, H, S, L, T, strat */ + {17, 12, 12, 1, 7, 8, ZSTD_fast}, /* level 0 - not used */ + {17, 12, 13, 1, 6, 8, ZSTD_fast}, /* level 1 */ + {17, 13, 16, 1, 5, 8, ZSTD_fast}, /* level 2 */ + {17, 16, 16, 2, 5, 8, ZSTD_dfast}, /* level 3 */ + {17, 13, 15, 3, 4, 8, ZSTD_greedy}, /* level 4 */ + {17, 15, 17, 4, 4, 8, ZSTD_greedy}, /* level 5 */ + {17, 16, 17, 3, 4, 8, ZSTD_lazy}, /* level 6 */ + {17, 15, 17, 4, 4, 8, ZSTD_lazy2}, /* level 7 */ + {17, 17, 17, 4, 4, 8, ZSTD_lazy2}, /* level 8 */ + {17, 17, 17, 5, 4, 8, ZSTD_lazy2}, /* level 9 */ + {17, 17, 17, 6, 4, 8, ZSTD_lazy2}, /* level 10 */ + {17, 17, 17, 7, 4, 8, ZSTD_lazy2}, /* level 11 */ + {17, 17, 17, 8, 4, 8, ZSTD_lazy2}, /* level 12 */ + {17, 18, 17, 6, 4, 8, ZSTD_btlazy2}, /* level 13.*/ + {17, 17, 17, 7, 3, 8, ZSTD_btopt}, /* level 14.*/ + {17, 17, 17, 7, 3, 16, ZSTD_btopt}, /* level 15.*/ + {17, 18, 17, 7, 3, 32, ZSTD_btopt}, /* level 16.*/ + {17, 18, 17, 7, 3, 64, ZSTD_btopt}, /* level 17.*/ + {17, 18, 17, 7, 3, 256, ZSTD_btopt}, /* level 18.*/ + {17, 18, 17, 8, 3, 256, ZSTD_btopt}, /* level 19.*/ + {17, 18, 17, 9, 3, 256, ZSTD_btopt2}, /* level 20.*/ + {17, 18, 17, 10, 3, 256, ZSTD_btopt2}, /* level 21.*/ + {17, 18, 17, 11, 3, 512, ZSTD_btopt2}, /* level 22.*/ + }, + { + /* for srcSize <= 16 KB */ + /* W, C, H, S, L, T, strat */ + {14, 12, 12, 1, 7, 6, ZSTD_fast}, /* level 0 - not used */ + {14, 14, 14, 1, 6, 6, ZSTD_fast}, /* level 1 */ + {14, 14, 14, 1, 4, 6, ZSTD_fast}, /* level 2 */ + {14, 14, 14, 1, 4, 6, ZSTD_dfast}, /* level 3.*/ + {14, 14, 14, 4, 4, 6, ZSTD_greedy}, /* level 4.*/ + {14, 14, 14, 3, 4, 6, ZSTD_lazy}, /* level 5.*/ + {14, 14, 14, 4, 4, 6, ZSTD_lazy2}, /* level 6 */ + {14, 14, 14, 5, 4, 6, ZSTD_lazy2}, /* level 7 */ + {14, 14, 14, 6, 4, 6, ZSTD_lazy2}, /* level 8.*/ + {14, 15, 14, 6, 4, 6, ZSTD_btlazy2}, /* level 9.*/ + {14, 15, 14, 3, 3, 6, ZSTD_btopt}, /* level 10.*/ + {14, 15, 14, 6, 3, 8, ZSTD_btopt}, /* level 11.*/ + {14, 15, 14, 6, 3, 16, ZSTD_btopt}, /* level 12.*/ + {14, 15, 14, 6, 3, 24, ZSTD_btopt}, /* level 13.*/ + {14, 15, 15, 6, 3, 48, ZSTD_btopt}, /* level 14.*/ + {14, 15, 15, 6, 
3, 64, ZSTD_btopt}, /* level 15.*/ + {14, 15, 15, 6, 3, 96, ZSTD_btopt}, /* level 16.*/ + {14, 15, 15, 6, 3, 128, ZSTD_btopt}, /* level 17.*/ + {14, 15, 15, 6, 3, 256, ZSTD_btopt}, /* level 18.*/ + {14, 15, 15, 7, 3, 256, ZSTD_btopt}, /* level 19.*/ + {14, 15, 15, 8, 3, 256, ZSTD_btopt2}, /* level 20.*/ + {14, 15, 15, 9, 3, 256, ZSTD_btopt2}, /* level 21.*/ + {14, 15, 15, 10, 3, 256, ZSTD_btopt2}, /* level 22.*/ + }, +}; + +/*! ZSTD_getCParams() : +* @return ZSTD_compressionParameters structure for a selected compression level, `srcSize` and `dictSize`. +* Size values are optional, provide 0 if not known or unused */ +ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSize, size_t dictSize) +{ + ZSTD_compressionParameters cp; + size_t const addedSize = srcSize ? 0 : 500; + U64 const rSize = srcSize + dictSize ? srcSize + dictSize + addedSize : (U64)-1; + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); /* intentional underflow for srcSizeHint == 0 */ + if (compressionLevel <= 0) + compressionLevel = ZSTD_DEFAULT_CLEVEL; /* 0 == default; no negative compressionLevel yet */ + if (compressionLevel > ZSTD_MAX_CLEVEL) + compressionLevel = ZSTD_MAX_CLEVEL; + cp = ZSTD_defaultCParameters[tableID][compressionLevel]; + if (ZSTD_32bits()) { /* auto-correction, for 32-bits mode */ + if (cp.windowLog > ZSTD_WINDOWLOG_MAX) + cp.windowLog = ZSTD_WINDOWLOG_MAX; + if (cp.chainLog > ZSTD_CHAINLOG_MAX) + cp.chainLog = ZSTD_CHAINLOG_MAX; + if (cp.hashLog > ZSTD_HASHLOG_MAX) + cp.hashLog = ZSTD_HASHLOG_MAX; + } + cp = ZSTD_adjustCParams(cp, srcSize, dictSize); + return cp; +} + +/*! ZSTD_getParams() : +* same as ZSTD_getCParams(), but @return a `ZSTD_parameters` object (instead of `ZSTD_compressionParameters`). 
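+* The cParams field is filled via ZSTD_getCParams() with the same arguments.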
+* All fields of `ZSTD_frameParameters` are set to default (0) */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSize, size_t dictSize)
+{
+	ZSTD_parameters params;
+	ZSTD_compressionParameters const cParams = ZSTD_getCParams(compressionLevel, srcSize, dictSize);
+	memset(&params, 0, sizeof(params));
+	params.cParams = cParams;
+	return params;
+}
+
+EXPORT_SYMBOL(ZSTD_maxCLevel);
+EXPORT_SYMBOL(ZSTD_compressBound);
+
+EXPORT_SYMBOL(ZSTD_CCtxWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initCCtx);
+EXPORT_SYMBOL(ZSTD_compressCCtx);
+EXPORT_SYMBOL(ZSTD_compress_usingDict);
+
+EXPORT_SYMBOL(ZSTD_CDictWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initCDict);
+EXPORT_SYMBOL(ZSTD_compress_usingCDict);
+
+EXPORT_SYMBOL(ZSTD_CStreamWorkspaceBound);
+EXPORT_SYMBOL(ZSTD_initCStream);
+EXPORT_SYMBOL(ZSTD_initCStream_usingCDict);
+EXPORT_SYMBOL(ZSTD_resetCStream);
+EXPORT_SYMBOL(ZSTD_compressStream);
+EXPORT_SYMBOL(ZSTD_flushStream);
+EXPORT_SYMBOL(ZSTD_endStream);
+EXPORT_SYMBOL(ZSTD_CStreamInSize);
+EXPORT_SYMBOL(ZSTD_CStreamOutSize);
+
+EXPORT_SYMBOL(ZSTD_getCParams);
+EXPORT_SYMBOL(ZSTD_getParams);
+EXPORT_SYMBOL(ZSTD_checkCParams);
+EXPORT_SYMBOL(ZSTD_adjustCParams);
+
+EXPORT_SYMBOL(ZSTD_compressBegin);
+EXPORT_SYMBOL(ZSTD_compressBegin_usingDict);
+EXPORT_SYMBOL(ZSTD_compressBegin_advanced);
+EXPORT_SYMBOL(ZSTD_copyCCtx);
+EXPORT_SYMBOL(ZSTD_compressBegin_usingCDict);
+EXPORT_SYMBOL(ZSTD_compressContinue);
+EXPORT_SYMBOL(ZSTD_compressEnd);
+
+EXPORT_SYMBOL(ZSTD_getBlockSizeMax);
+EXPORT_SYMBOL(ZSTD_compressBlock);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Compressor");
diff --git a/lib/zstd/decompress.c b/lib/zstd/decompress.c
new file mode 100644
index 000000000000..b17846725ca0
--- /dev/null
+++ b/lib/zstd/decompress.c
@@ -0,0 +1,2528 @@
+/**
+ * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/* ***************************************************************
+*  Tuning parameters
+*****************************************************************/
+/*!
+* MAXWINDOWSIZE_DEFAULT :
+* maximum window size accepted by DStream, by default.
+* Frames requiring more memory will be rejected.
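+* Can be overridden at build time by defining ZSTD_MAXWINDOWSIZE_DEFAULT first (see the #ifndef guard below).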
+*/
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+#define ZSTD_MAXWINDOWSIZE_DEFAULT ((1 << ZSTD_WINDOWLOG_MAX) + 1) /* defined within zstd.h */
+#endif
+
+/*-*******************************************************
+*  Dependencies
+*********************************************************/
+#include "fse.h"
+#include "huf.h"
+#include "mem.h" /* low level memory routines */
+#include "zstd_internal.h"
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h> /* memcpy, memmove, memset */
+
+#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0)
+
+/*-*************************************
+*  Macros
+***************************************/
+#define ZSTD_isError ERR_isError /* for inlining */
+#define FSE_isError ERR_isError
+#define HUF_isError ERR_isError
+
+/*_*******************************************************
+*  Memory operations
+**********************************************************/
+static void ZSTD_copy4(void *dst, const void *src) { memcpy(dst, src, 4); }
+
+/*-*************************************************************
+*   Context management
+***************************************************************/
+typedef enum {
+	ZSTDds_getFrameHeaderSize,
+	ZSTDds_decodeFrameHeader,
+	ZSTDds_decodeBlockHeader,
+	ZSTDds_decompressBlock,
+	ZSTDds_decompressLastBlock,
+	ZSTDds_checkChecksum,
+	ZSTDds_decodeSkippableHeader,
+	ZSTDds_skipFrame
+} ZSTD_dStage;
+
+typedef struct {
+	FSE_DTable LLTable[FSE_DTABLE_SIZE_U32(LLFSELog)];
+	FSE_DTable OFTable[FSE_DTABLE_SIZE_U32(OffFSELog)];
+	FSE_DTable MLTable[FSE_DTABLE_SIZE_U32(MLFSELog)];
+	HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+	U64 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32 / 2];
+	U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyTables_t;
+
+struct ZSTD_DCtx_s {
+	const FSE_DTable *LLTptr;
+	const FSE_DTable *MLTptr;
+	const FSE_DTable *OFTptr;
+	const HUF_DTable *HUFptr;
+	ZSTD_entropyTables_t entropy;
+	const void *previousDstEnd; /* detect continuity */
+	const void *base;	    /* start of curr segment */
+	const void *vBase;	    /* virtual start of previous segment if it was just before curr one */
+	const void *dictEnd;	    /* end of previous segment */
+	size_t expected;
+	ZSTD_frameParams fParams;
+	blockType_e bType; /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */
+	ZSTD_dStage stage;
+	U32 litEntropy;
+	U32 fseEntropy;
+	struct xxh64_state xxhState;
+	size_t headerSize;
+	U32 dictID;
+	const BYTE *litPtr;
+	ZSTD_customMem customMem;
+	size_t litSize;
+	size_t rleSize;
+	BYTE litBuffer[ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH];
+	BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+size_t ZSTD_DCtxWorkspaceBound(void) { return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_DCtx)); }
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx *dctx)
+{
+	dctx->expected = ZSTD_frameHeaderSize_prefix;
+	dctx->stage = ZSTDds_getFrameHeaderSize;
+	dctx->previousDstEnd = NULL;
+	dctx->base = NULL;
+	dctx->vBase = NULL;
+	dctx->dictEnd = NULL;
+	dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+	dctx->litEntropy = dctx->fseEntropy = 0;
+	dctx->dictID = 0;
+	ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+	memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */
+	dctx->LLTptr = dctx->entropy.LLTable;
+	dctx->MLTptr = dctx->entropy.MLTable;
+	dctx->OFTptr = dctx->entropy.OFTable;
+	dctx->HUFptr = dctx->entropy.hufTable;
+	return
0; +} + +ZSTD_DCtx *ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_DCtx *dctx; + + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + dctx = (ZSTD_DCtx *)ZSTD_malloc(sizeof(ZSTD_DCtx), customMem); + if (!dctx) + return NULL; + memcpy(&dctx->customMem, &customMem, sizeof(customMem)); + ZSTD_decompressBegin(dctx); + return dctx; +} + +ZSTD_DCtx *ZSTD_initDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + return ZSTD_createDCtx_advanced(stackMem); +} + +size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx) +{ + if (dctx == NULL) + return 0; /* support free on NULL */ + ZSTD_free(dctx, dctx->customMem); + return 0; /* reserved as a potential error code in the future */ +} + +void ZSTD_copyDCtx(ZSTD_DCtx *dstDCtx, const ZSTD_DCtx *srcDCtx) +{ + size_t const workSpaceSize = (ZSTD_BLOCKSIZE_ABSOLUTEMAX + WILDCOPY_OVERLENGTH) + ZSTD_frameHeaderSize_max; + memcpy(dstDCtx, srcDCtx, sizeof(ZSTD_DCtx) - workSpaceSize); /* no need to copy workspace */ +} + +static void ZSTD_refDDict(ZSTD_DCtx *dstDCtx, const ZSTD_DDict *ddict); + +/*-************************************************************* +* Decompression section +***************************************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +unsigned ZSTD_isFrame(const void *buffer, size_t size) +{ + if (size < 4) + return 0; + { + U32 const magic = ZSTD_readLE32(buffer); + if (magic == ZSTD_MAGICNUMBER) + return 1; + if ((magic & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) + return 1; + } + return 0; +} + +/** ZSTD_frameHeaderSize() : +* srcSize must be >= ZSTD_frameHeaderSize_prefix. +* @return : size of the Frame Header */ +static size_t ZSTD_frameHeaderSize(const void *src, size_t srcSize) +{ + if (srcSize < ZSTD_frameHeaderSize_prefix) + return ERROR(srcSize_wrong); + { + BYTE const fhd = ((const BYTE *)src)[4]; + U32 const dictID = fhd & 3; + U32 const singleSegment = (fhd >> 5) & 1; + U32 const fcsId = fhd >> 6; + return ZSTD_frameHeaderSize_prefix + !singleSegment + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + (singleSegment && !fcsId); + } +} + +/** ZSTD_getFrameParams() : +* decode Frame Header, or require larger `srcSize`. 
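+* Skippable frames are accepted too : they are reported with windowSize==0.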
+* @return : 0, `fparamsPtr` is correctly filled, +* >0, `srcSize` is too small, result is expected `srcSize`, +* or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameParams(ZSTD_frameParams *fparamsPtr, const void *src, size_t srcSize) +{ + const BYTE *ip = (const BYTE *)src; + + if (srcSize < ZSTD_frameHeaderSize_prefix) + return ZSTD_frameHeaderSize_prefix; + if (ZSTD_readLE32(src) != ZSTD_MAGICNUMBER) { + if ((ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + if (srcSize < ZSTD_skippableHeaderSize) + return ZSTD_skippableHeaderSize; /* magic number + skippable frame length */ + memset(fparamsPtr, 0, sizeof(*fparamsPtr)); + fparamsPtr->frameContentSize = ZSTD_readLE32((const char *)src + 4); + fparamsPtr->windowSize = 0; /* windowSize==0 means a frame is skippable */ + return 0; + } + return ERROR(prefix_unknown); + } + + /* ensure there is enough `srcSize` to fully read/decode frame header */ + { + size_t const fhsize = ZSTD_frameHeaderSize(src, srcSize); + if (srcSize < fhsize) + return fhsize; + } + + { + BYTE const fhdByte = ip[4]; + size_t pos = 5; + U32 const dictIDSizeCode = fhdByte & 3; + U32 const checksumFlag = (fhdByte >> 2) & 1; + U32 const singleSegment = (fhdByte >> 5) & 1; + U32 const fcsID = fhdByte >> 6; + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; + U32 windowSize = 0; + U32 dictID = 0; + U64 frameContentSize = 0; + if ((fhdByte & 0x08) != 0) + return ERROR(frameParameter_unsupported); /* reserved bits, which must be zero */ + if (!singleSegment) { + BYTE const wlByte = ip[pos++]; + U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; + if (windowLog > ZSTD_WINDOWLOG_MAX) + return ERROR(frameParameter_windowTooLarge); /* avoids issue with 1 << windowLog */ + windowSize = (1U << windowLog); + windowSize += (windowSize >> 3) * (wlByte & 7); + } + + switch (dictIDSizeCode) { + default: /* impossible */ + case 0: break; + case 1: + dictID = ip[pos]; + pos++; + break; + case 2: + dictID = ZSTD_readLE16(ip + pos); + pos += 2; + break; + case 3: + dictID = ZSTD_readLE32(ip + pos); + pos += 4; + break; + } + switch (fcsID) { + default: /* impossible */ + case 0: + if (singleSegment) + frameContentSize = ip[pos]; + break; + case 1: frameContentSize = ZSTD_readLE16(ip + pos) + 256; break; + case 2: frameContentSize = ZSTD_readLE32(ip + pos); break; + case 3: frameContentSize = ZSTD_readLE64(ip + pos); break; + } + if (!windowSize) + windowSize = (U32)frameContentSize; + if (windowSize > windowSizeMax) + return ERROR(frameParameter_windowTooLarge); + fparamsPtr->frameContentSize = frameContentSize; + fparamsPtr->windowSize = windowSize; + fparamsPtr->dictID = dictID; + fparamsPtr->checksumFlag = checksumFlag; + } + return 0; +} + +/** ZSTD_getFrameContentSize() : +* compatible with legacy mode +* @return : decompressed size of the single frame pointed to be `src` if known, otherwise +* - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +* - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ + { + ZSTD_frameParams fParams; + if (ZSTD_getFrameParams(&fParams, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (fParams.windowSize == 0) { + /* Either skippable or empty frame, size == 0 either way */ + return 0; + } else if (fParams.frameContentSize != 0) { + return fParams.frameContentSize; + } else { + return ZSTD_CONTENTSIZE_UNKNOWN; + } + } +} + +/** ZSTD_findDecompressedSize() : + * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void *src, size_t srcSize) +{ + { + unsigned long long totalDstSize = 0; + while (srcSize >= ZSTD_frameHeaderSize_prefix) { + const U32 magicNumber = ZSTD_readLE32(src); + + if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t skippableSize; + if (srcSize < ZSTD_skippableHeaderSize) + return ERROR(srcSize_wrong); + skippableSize = ZSTD_readLE32((const BYTE *)src + 4) + ZSTD_skippableHeaderSize; + if (srcSize < skippableSize) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + if (ret >= ZSTD_CONTENTSIZE_ERROR) + return ret; + + /* check for overflow */ + if (totalDstSize + ret < totalDstSize) + return ZSTD_CONTENTSIZE_ERROR; + totalDstSize += ret; + } + { + size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } + + if (srcSize) { + return ZSTD_CONTENTSIZE_ERROR; + } + + return totalDstSize; + } +} + +/** ZSTD_decodeFrameHeader() : +* `headerSize` must be the size provided by ZSTD_frameHeaderSize(). +* @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx *dctx, const void *src, size_t headerSize) +{ + size_t const result = ZSTD_getFrameParams(&(dctx->fParams), src, headerSize); + if (ZSTD_isError(result)) + return result; /* invalid header */ + if (result > 0) + return ERROR(srcSize_wrong); /* headerSize too small */ + if (dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID)) + return ERROR(dictionary_wrong); + if (dctx->fParams.checksumFlag) + xxh64_reset(&dctx->xxhState, 0); + return 0; +} + +typedef struct { + blockType_e blockType; + U32 lastBlock; + U32 origSize; +} blockProperties_t; + +/*! 
ZSTD_getcBlockSize() : +* Provides the size of compressed block from block header `src` */ +size_t ZSTD_getcBlockSize(const void *src, size_t srcSize, blockProperties_t *bpPtr) +{ + if (srcSize < ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + { + U32 const cBlockHeader = ZSTD_readLE24(src); + U32 const cSize = cBlockHeader >> 3; + bpPtr->lastBlock = cBlockHeader & 1; + bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); + bpPtr->origSize = cSize; /* only useful for RLE */ + if (bpPtr->blockType == bt_rle) + return 1; + if (bpPtr->blockType == bt_reserved) + return ERROR(corruption_detected); + return cSize; + } +} + +static size_t ZSTD_copyRawBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + if (srcSize > dstCapacity) + return ERROR(dstSize_tooSmall); + memcpy(dst, src, srcSize); + return srcSize; +} + +static size_t ZSTD_setRleBlock(void *dst, size_t dstCapacity, const void *src, size_t srcSize, size_t regenSize) +{ + if (srcSize != 1) + return ERROR(srcSize_wrong); + if (regenSize > dstCapacity) + return ERROR(dstSize_tooSmall); + memset(dst, *(const BYTE *)src, regenSize); + return regenSize; +} + +/*! ZSTD_decodeLiteralsBlock() : + @return : nb of bytes read from src (< srcSize ) */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx *dctx, const void *src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ +{ + if (srcSize < MIN_CBLOCK_SIZE) + return ERROR(corruption_detected); + + { + const BYTE *const istart = (const BYTE *)src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); + + switch (litEncType) { + case set_repeat: + if (dctx->litEntropy == 0) + return ERROR(dictionary_corrupted); + /* fall-through */ + case set_compressed: + if (srcSize < 5) + return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */ + { + size_t lhSize, litSize, litCSize; + U32 singleStream = 0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = ZSTD_readLE32(istart); + switch (lhlCode) { + case 0: + case 1: + default: /* note : default is impossible, since lhlCode into [0..3] */ + /* 2 - 2 - 10 - 10 */ + singleStream = !lhlCode; + lhSize = 3; + litSize = (lhc >> 4) & 0x3FF; + litCSize = (lhc >> 14) & 0x3FF; + break; + case 2: + /* 2 - 2 - 14 - 14 */ + lhSize = 4; + litSize = (lhc >> 4) & 0x3FFF; + litCSize = lhc >> 18; + break; + case 3: + /* 2 - 2 - 18 - 18 */ + lhSize = 5; + litSize = (lhc >> 4) & 0x3FFFF; + litCSize = (lhc >> 22) + (istart[4] << 10); + break; + } + if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) + return ERROR(corruption_detected); + if (litCSize + lhSize > srcSize) + return ERROR(corruption_detected); + + if (HUF_isError( + (litEncType == set_repeat) + ? (singleStream ? HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr) + : HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart + lhSize, litCSize, dctx->HUFptr)) + : (singleStream + ? 
HUF_decompress1X2_DCtx_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize, + dctx->entropy.workspace, sizeof(dctx->entropy.workspace)) + : HUF_decompress4X_hufOnly_wksp(dctx->entropy.hufTable, dctx->litBuffer, litSize, istart + lhSize, litCSize, + dctx->entropy.workspace, sizeof(dctx->entropy.workspace))))) + return ERROR(corruption_detected); + + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + dctx->litEntropy = 1; + if (litEncType == set_compressed) + dctx->HUFptr = dctx->entropy.hufTable; + memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + return litCSize + lhSize; + } + + case set_basic: { + size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; + switch (lhlCode) { + case 0: + case 2: + default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = ZSTD_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + litSize = ZSTD_readLE24(istart) >> 4; + break; + } + + if (lhSize + litSize + WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ + if (litSize + lhSize > srcSize) + return ERROR(corruption_detected); + memcpy(dctx->litBuffer, istart + lhSize, litSize); + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + return lhSize + litSize; + } + /* direct reference into compressed stream */ + dctx->litPtr = istart + lhSize; + dctx->litSize = litSize; + return lhSize + litSize; + } + + case set_rle: { + U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; + switch (lhlCode) { + case 0: + case 2: + default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = ZSTD_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + litSize = ZSTD_readLE24(istart) >> 4; + if (srcSize < 4) + return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4 */ + break; + } + if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) + return ERROR(corruption_detected); + memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + return lhSize + 1; + } + default: + return ERROR(corruption_detected); /* impossible */ + } + } +} + +typedef union { + FSE_decode_t realData; + U32 alignedBy4; +} FSE_decode_t4; + +static const FSE_decode_t4 LL_defaultDTable[(1 << LL_DEFAULTNORMLOG) + 1] = { + {{LL_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */ + {{0, 0, 4}}, /* 0 : base, symbol, bits */ + {{16, 0, 4}}, + {{32, 1, 5}}, + {{0, 3, 5}}, + {{0, 4, 5}}, + {{0, 6, 5}}, + {{0, 7, 5}}, + {{0, 9, 5}}, + {{0, 10, 5}}, + {{0, 12, 5}}, + {{0, 14, 6}}, + {{0, 16, 5}}, + {{0, 18, 5}}, + {{0, 19, 5}}, + {{0, 21, 5}}, + {{0, 22, 5}}, + {{0, 24, 5}}, + {{32, 25, 5}}, + {{0, 26, 5}}, + {{0, 27, 6}}, + {{0, 29, 6}}, + {{0, 31, 6}}, + {{32, 0, 4}}, + {{0, 1, 4}}, + {{0, 2, 5}}, + {{32, 4, 5}}, + {{0, 5, 5}}, + {{32, 7, 5}}, + {{0, 8, 5}}, + {{32, 10, 5}}, + {{0, 11, 5}}, + {{0, 13, 6}}, + {{32, 16, 5}}, + {{0, 17, 5}}, + {{32, 19, 5}}, + {{0, 20, 5}}, + {{32, 22, 5}}, + {{0, 23, 5}}, + {{0, 25, 4}}, + {{16, 25, 4}}, + {{32, 26, 5}}, + {{0, 28, 6}}, + {{0, 30, 6}}, + {{48, 0, 4}}, + {{16, 1, 4}}, + {{32, 2, 5}}, + {{32, 3, 5}}, + {{32, 5, 5}}, + {{32, 6, 5}}, + {{32, 8, 5}}, + {{32, 9, 5}}, + {{32, 11, 5}}, + {{32, 12, 5}}, + {{0, 15, 6}}, 
+ {{32, 17, 5}}, + {{32, 18, 5}}, + {{32, 20, 5}}, + {{32, 21, 5}}, + {{32, 23, 5}}, + {{32, 24, 5}}, + {{0, 35, 6}}, + {{0, 34, 6}}, + {{0, 33, 6}}, + {{0, 32, 6}}, +}; /* LL_defaultDTable */ + +static const FSE_decode_t4 ML_defaultDTable[(1 << ML_DEFAULTNORMLOG) + 1] = { + {{ML_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */ + {{0, 0, 6}}, /* 0 : base, symbol, bits */ + {{0, 1, 4}}, + {{32, 2, 5}}, + {{0, 3, 5}}, + {{0, 5, 5}}, + {{0, 6, 5}}, + {{0, 8, 5}}, + {{0, 10, 6}}, + {{0, 13, 6}}, + {{0, 16, 6}}, + {{0, 19, 6}}, + {{0, 22, 6}}, + {{0, 25, 6}}, + {{0, 28, 6}}, + {{0, 31, 6}}, + {{0, 33, 6}}, + {{0, 35, 6}}, + {{0, 37, 6}}, + {{0, 39, 6}}, + {{0, 41, 6}}, + {{0, 43, 6}}, + {{0, 45, 6}}, + {{16, 1, 4}}, + {{0, 2, 4}}, + {{32, 3, 5}}, + {{0, 4, 5}}, + {{32, 6, 5}}, + {{0, 7, 5}}, + {{0, 9, 6}}, + {{0, 12, 6}}, + {{0, 15, 6}}, + {{0, 18, 6}}, + {{0, 21, 6}}, + {{0, 24, 6}}, + {{0, 27, 6}}, + {{0, 30, 6}}, + {{0, 32, 6}}, + {{0, 34, 6}}, + {{0, 36, 6}}, + {{0, 38, 6}}, + {{0, 40, 6}}, + {{0, 42, 6}}, + {{0, 44, 6}}, + {{32, 1, 4}}, + {{48, 1, 4}}, + {{16, 2, 4}}, + {{32, 4, 5}}, + {{32, 5, 5}}, + {{32, 7, 5}}, + {{32, 8, 5}}, + {{0, 11, 6}}, + {{0, 14, 6}}, + {{0, 17, 6}}, + {{0, 20, 6}}, + {{0, 23, 6}}, + {{0, 26, 6}}, + {{0, 29, 6}}, + {{0, 52, 6}}, + {{0, 51, 6}}, + {{0, 50, 6}}, + {{0, 49, 6}}, + {{0, 48, 6}}, + {{0, 47, 6}}, + {{0, 46, 6}}, +}; /* ML_defaultDTable */ + +static const FSE_decode_t4 OF_defaultDTable[(1 << OF_DEFAULTNORMLOG) + 1] = { + {{OF_DEFAULTNORMLOG, 1, 1}}, /* header : tableLog, fastMode, fastMode */ + {{0, 0, 5}}, /* 0 : base, symbol, bits */ + {{0, 6, 4}}, + {{0, 9, 5}}, + {{0, 15, 5}}, + {{0, 21, 5}}, + {{0, 3, 5}}, + {{0, 7, 4}}, + {{0, 12, 5}}, + {{0, 18, 5}}, + {{0, 23, 5}}, + {{0, 5, 5}}, + {{0, 8, 4}}, + {{0, 14, 5}}, + {{0, 20, 5}}, + {{0, 2, 5}}, + {{16, 7, 4}}, + {{0, 11, 5}}, + {{0, 17, 5}}, + {{0, 22, 5}}, + {{0, 4, 5}}, + {{16, 8, 4}}, + {{0, 13, 5}}, + {{0, 19, 5}}, + {{0, 1, 5}}, + {{16, 6, 4}}, + {{0, 10, 5}}, + {{0, 16, 5}}, + {{0, 28, 5}}, + {{0, 27, 5}}, + {{0, 26, 5}}, + {{0, 25, 5}}, + {{0, 24, 5}}, +}; /* OF_defaultDTable */ + +/*! 
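Each FSE_decode_t4 entry in the default tables above packs {newStateBase, symbol, nbBits}; the first element of every table is a header carrying tableLog and the fastMode flags rather than a real decode entry. A decode step then emits the symbol and rebuilds the state from nbBits fresh bits, sketched here with a stand-in read_bits callback in place of the library's BIT_readBits/FSE_updateState machinery:

/* Illustrative FSE decode step; fse_entry and fse_step are invented names. */
struct fse_entry { unsigned short newStateBase; unsigned char symbol; unsigned char nbBits; };

static unsigned char fse_step(unsigned *state, const struct fse_entry *table,
			      unsigned (*read_bits)(unsigned nbBits))
{
	struct fse_entry const e = table[*state];

	*state = e.newStateBase + read_bits(e.nbBits);	/* refill low bits from the stream */
	return e.symbol;				/* decoded symbol */
}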
ZSTD_buildSeqTable() : + @return : nb bytes read from src, + or an error code if it fails, testable with ZSTD_isError() +*/ +static size_t ZSTD_buildSeqTable(FSE_DTable *DTableSpace, const FSE_DTable **DTablePtr, symbolEncodingType_e type, U32 max, U32 maxLog, const void *src, + size_t srcSize, const FSE_decode_t4 *defaultTable, U32 flagRepeatTable, void *workspace, size_t workspaceSize) +{ + const void *const tmpPtr = defaultTable; /* bypass strict aliasing */ + switch (type) { + case set_rle: + if (!srcSize) + return ERROR(srcSize_wrong); + if ((*(const BYTE *)src) > max) + return ERROR(corruption_detected); + FSE_buildDTable_rle(DTableSpace, *(const BYTE *)src); + *DTablePtr = DTableSpace; + return 1; + case set_basic: *DTablePtr = (const FSE_DTable *)tmpPtr; return 0; + case set_repeat: + if (!flagRepeatTable) + return ERROR(corruption_detected); + return 0; + default: /* impossible */ + case set_compressed: { + U32 tableLog; + S16 *norm = (S16 *)workspace; + size_t const spaceUsed32 = ALIGN(sizeof(S16) * (MaxSeq + 1), sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(GENERIC); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + { + size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); + if (FSE_isError(headerSize)) + return ERROR(corruption_detected); + if (tableLog > maxLog) + return ERROR(corruption_detected); + FSE_buildDTable_wksp(DTableSpace, norm, max, tableLog, workspace, workspaceSize); + *DTablePtr = DTableSpace; + return headerSize; + } + } + } +} + +size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx *dctx, int *nbSeqPtr, const void *src, size_t srcSize) +{ + const BYTE *const istart = (const BYTE *const)src; + const BYTE *const iend = istart + srcSize; + const BYTE *ip = istart; + + /* check */ + if (srcSize < MIN_SEQUENCES_SIZE) + return ERROR(srcSize_wrong); + + /* SeqHead */ + { + int nbSeq = *ip++; + if (!nbSeq) { + *nbSeqPtr = 0; + return 1; + } + if (nbSeq > 0x7F) { + if (nbSeq == 0xFF) { + if (ip + 2 > iend) + return ERROR(srcSize_wrong); + nbSeq = ZSTD_readLE16(ip) + LONGNBSEQ, ip += 2; + } else { + if (ip >= iend) + return ERROR(srcSize_wrong); + nbSeq = ((nbSeq - 0x80) << 8) + *ip++; + } + } + *nbSeqPtr = nbSeq; + } + + /* FSE table descriptors */ + if (ip + 4 > iend) + return ERROR(srcSize_wrong); /* minimum possible size */ + { + symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ + { + size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, LLtype, MaxLL, LLFSELog, ip, iend - ip, + LL_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace)); + if (ZSTD_isError(llhSize)) + return ERROR(corruption_detected); + ip += llhSize; + } + { + size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, OFtype, MaxOff, OffFSELog, ip, iend - ip, + OF_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace)); + if (ZSTD_isError(ofhSize)) + return ERROR(corruption_detected); + ip += ofhSize; + } + { + size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, MLtype, MaxML, MLFSELog, ip, iend - ip, + ML_defaultDTable, dctx->fseEntropy, dctx->entropy.workspace, sizeof(dctx->entropy.workspace)); + if (ZSTD_isError(mlhSize)) + return ERROR(corruption_detected); + ip += 
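ZSTD_decodeSeqHeaders() above opens with a variable-length sequence count: one byte for 0..127, two bytes for 128..0x7EFF, and three bytes (leading 0xFF) for larger counts offset by LONGNBSEQ (0x7F00 in zstd_internal.h). Restated on its own as an illustrative helper:

/* Returns the header length consumed, or 0 if srcSize is too small. */
static size_t nbseq_decode(const BYTE *ip, size_t srcSize, int *nbSeq)
{
	if (srcSize < 1)
		return 0;
	if (ip[0] <= 0x7F) {			/* 1 byte : 0..127 */
		*nbSeq = ip[0];
		return 1;
	}
	if (ip[0] == 0xFF) {			/* 3 bytes : LE16 + LONGNBSEQ */
		if (srcSize < 3)
			return 0;
		*nbSeq = (ip[1] | (ip[2] << 8)) + LONGNBSEQ;
		return 3;
	}
	if (srcSize < 2)			/* 2 bytes : 128..0x7EFF */
		return 0;
	*nbSeq = ((ip[0] - 0x80) << 8) + ip[1];
	return 2;
}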
mlhSize; + } + } + + return ip - istart; +} + +typedef struct { + size_t litLength; + size_t matchLength; + size_t offset; + const BYTE *match; +} seq_t; + +typedef struct { + BIT_DStream_t DStream; + FSE_DState_t stateLL; + FSE_DState_t stateOffb; + FSE_DState_t stateML; + size_t prevOffset[ZSTD_REP_NUM]; + const BYTE *base; + size_t pos; + uPtrDiff gotoDict; +} seqState_t; + +FORCE_NOINLINE +size_t ZSTD_execSequenceLast7(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base, + const BYTE *const vBase, const BYTE *const dictEnd) +{ + BYTE *const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH; + const BYTE *const iLitEnd = *litPtr + sequence.litLength; + const BYTE *match = oLitEnd - sequence.offset; + + /* check */ + if (oMatchEnd > oend) + return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ + if (iLitEnd > litLimit) + return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (oLitEnd <= oend_w) + return ERROR(GENERIC); /* Precondition */ + + /* copy literals */ + if (op < oend_w) { + ZSTD_wildcopy(op, *litPtr, oend_w - op); + *litPtr += oend_w - op; + op = oend_w; + } + while (op < oLitEnd) + *op++ = *(*litPtr)++; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - base)) { + /* offset beyond prefix */ + if (sequence.offset > (size_t)(oLitEnd - vBase)) + return ERROR(corruption_detected); + match = dictEnd - (base - match); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currPrefixSegment */ + { + size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = base; + } + } + while (op < oMatchEnd) + *op++ = *match++; + return sequenceLength; +} + +static seq_t ZSTD_decodeSequence(seqState_t *seqState) +{ + seq_t seq; + + U32 const llCode = FSE_peekSymbol(&seqState->stateLL); + U32 const mlCode = FSE_peekSymbol(&seqState->stateML); + U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ + + U32 const llBits = LL_bits[llCode]; + U32 const mlBits = ML_bits[mlCode]; + U32 const ofBits = ofCode; + U32 const totalBits = llBits + mlBits + ofBits; + + static const U32 LL_base[MaxLL + 1] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, + 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000}; + + static const U32 ML_base[MaxML + 1] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41, + 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003}; + + static const U32 OF_base[MaxOff + 1] = {0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, 0xFD, 0x1FD, + 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, + 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD}; + + /* sequence */ + { + size_t offset; + if (!ofCode) + offset = 0; + else { + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (ZSTD_32bits()) + 
BIT_reloadDStream(&seqState->DStream); + } + + if (ofCode <= 1) { + offset += (llCode == 0); + if (offset) { + size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } else { + offset = seqState->prevOffset[0]; + } + } else { + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + seq.offset = offset; + } + + seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + if (ZSTD_32bits() && (mlBits + llBits > 24)) + BIT_reloadDStream(&seqState->DStream); + + seq.litLength = LL_base[llCode] + ((llCode > 15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + + /* ANS state update */ + FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (ZSTD_32bits()) + BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + + seq.match = NULL; + + return seq; +} + +FORCE_INLINE +size_t ZSTD_execSequence(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base, + const BYTE *const vBase, const BYTE *const dictEnd) +{ + BYTE *const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH; + const BYTE *const iLitEnd = *litPtr + sequence.litLength; + const BYTE *match = oLitEnd - sequence.offset; + + /* check */ + if (oMatchEnd > oend) + return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ + if (iLitEnd > litLimit) + return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (oLitEnd > oend_w) + return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); + + /* copy Literals */ + ZSTD_copy8(op, *litPtr); + if (sequence.litLength > 8) + ZSTD_wildcopy(op + 8, (*litPtr) + 8, + sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - base)) { + /* offset beyond prefix */ + if (sequence.offset > (size_t)(oLitEnd - vBase)) + return ERROR(corruption_detected); + match = dictEnd + (match - base); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currPrefixSegment */ + { + size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = base; + if (op > oend_w || sequence.matchLength < MINMATCH) { + U32 i; + for (i = 0; i < sequence.matchLength; ++i) + op[i] = match[i]; + return sequenceLength; + } + } + } + /* Requirement: op <= oend_w && 
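The repeat-offset block above is compact but subtle. After the llCode == 0 adjustment, the value selects one of the three most recent offsets, value 3 stands for prevOffset[0] - 1, and only values other than 1 shift the full history. A branchy, illustrative equivalent (repcode_resolve and its rep[] naming are invented):

/* rep[] holds the three most recent offsets, newest first. */
static size_t repcode_resolve(size_t rep[3], unsigned value /* 0..3, after llCode adjust */)
{
	size_t offset;

	switch (value) {
	case 0:	return rep[0];			/* reuse last offset, history untouched */
	case 1:	offset = rep[1]; break;		/* 2nd most recent */
	case 2:	offset = rep[2]; break;		/* 3rd most recent */
	default: offset = rep[0] - 1; break;	/* value 3 : only reachable when litLength == 0 */
	}
	offset += !offset;			/* 0 is invalid : corrupted input, force to 1 */
	if (value != 1)
		rep[2] = rep[1];		/* full rotation for values 2 and 3 */
	rep[1] = rep[0];
	rep[0] = offset;
	return offset;
}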
sequence.matchLength >= MINMATCH */ + + /* match within prefix */ + if (sequence.offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */ + static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */ + int const sub2 = dec64table[sequence.offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[sequence.offset]; + ZSTD_copy4(op + 4, match); + match -= sub2; + } else { + ZSTD_copy8(op, match); + } + op += 8; + match += 8; + + if (oMatchEnd > oend - (16 - MINMATCH)) { + if (op < oend_w) { + ZSTD_wildcopy(op, match, oend_w - op); + match += oend_w - op; + op = oend_w; + } + while (op < oMatchEnd) + *op++ = *match++; + } else { + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */ + } + return sequenceLength; +} + +static size_t ZSTD_decompressSequences(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize) +{ + const BYTE *ip = (const BYTE *)seqStart; + const BYTE *const iend = ip + seqSize; + BYTE *const ostart = (BYTE * const)dst; + BYTE *const oend = ostart + maxDstSize; + BYTE *op = ostart; + const BYTE *litPtr = dctx->litPtr; + const BYTE *const litEnd = litPtr + dctx->litSize; + const BYTE *const base = (const BYTE *)(dctx->base); + const BYTE *const vBase = (const BYTE *)(dctx->vBase); + const BYTE *const dictEnd = (const BYTE *)(dctx->dictEnd); + int nbSeq; + + /* Build Decoding Tables */ + { + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); + if (ZSTD_isError(seqHSize)) + return seqHSize; + ip += seqHSize; + } + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + seqState.prevOffset[i] = dctx->entropy.rep[i]; + } + CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend - ip), corruption_detected); + FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq;) { + nbSeq--; + { + seq_t const sequence = ZSTD_decodeSequence(&seqState); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); + if (ZSTD_isError(oneSeqSize)) + return oneSeqSize; + op += oneSeqSize; + } + } + + /* check if reached exact end */ + if (nbSeq) + return ERROR(corruption_detected); + /* save reps for next block */ + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); + } + } + + /* last literal segment */ + { + size_t const lastLLSize = litEnd - litPtr; + if (lastLLSize > (size_t)(oend - op)) + return ERROR(dstSize_tooSmall); + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + + return op - ostart; +} + +FORCE_INLINE seq_t ZSTD_decodeSequenceLong_generic(seqState_t *seqState, int const longOffsets) +{ + seq_t seq; + + U32 const llCode = FSE_peekSymbol(&seqState->stateLL); + U32 const mlCode = FSE_peekSymbol(&seqState->stateML); + U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ + + U32 const llBits = LL_bits[llCode]; + U32 const mlBits = ML_bits[mlCode]; + U32 const ofBits = ofCode; + U32 const totalBits = llBits + mlBits + ofBits; + + static const U32 LL_base[MaxLL + 1] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 15, 16, 18, + 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000}; + + static const U32 ML_base[MaxML + 1] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 39, 41, + 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, 0x1003, 0x2003, 0x4003, 0x8003, 0x10003}; + + static const U32 OF_base[MaxOff + 1] = {0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, 0xFD, 0x1FD, + 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, + 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD}; + + /* sequence */ + { + size_t offset; + if (!ofCode) + offset = 0; + else { + if (longOffsets) { + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN); + offset = OF_base[ofCode] + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + if (ZSTD_32bits() || extraBits) + BIT_reloadDStream(&seqState->DStream); + if (extraBits) + offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (ZSTD_32bits()) + BIT_reloadDStream(&seqState->DStream); + } + } + + if (ofCode <= 1) { + offset += (llCode == 0); + if (offset) { + size_t temp = (offset == 3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } else { + offset = seqState->prevOffset[0]; + } + } else { + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + seq.offset = offset; + } + + seq.matchLength = ML_base[mlCode] + ((mlCode > 31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + if (ZSTD_32bits() && (mlBits + llBits > 24)) + BIT_reloadDStream(&seqState->DStream); + + seq.litLength = LL_base[llCode] + ((llCode > 15) ? 
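The longOffsets branch above splits a wide offset read in two because a single bitstream refill only guarantees STREAM_ACCUMULATOR_MIN usable bits (25 on 32-bit targets upstream). The shape of that split read, as a stand-alone sketch where read_bits and reload stand in for BIT_readBitsFast and BIT_reloadDStream:

static size_t long_offset_read(size_t base, unsigned ofBits,
			       size_t (*read_bits)(unsigned), void (*reload)(void))
{
	unsigned const safe = (ofBits < STREAM_ACCUMULATOR_MIN) ? ofBits : STREAM_ACCUMULATOR_MIN;
	unsigned const extraBits = ofBits - safe;
	size_t offset = base + (read_bits(safe) << extraBits);	/* high bits first */

	if (extraBits) {
		reload();					/* refill the accumulator */
		offset += read_bits(extraBits);			/* then the low bits */
	}
	return offset;
}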
BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + if (ZSTD_32bits() || (totalBits > 64 - 7 - (LLFSELog + MLFSELog + OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + + { + size_t const pos = seqState->pos + seq.litLength; + seq.match = seqState->base + pos - seq.offset; /* single memory segment */ + if (seq.offset > pos) + seq.match += seqState->gotoDict; /* separate memory segment */ + seqState->pos = pos + seq.matchLength; + } + + /* ANS state update */ + FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (ZSTD_32bits()) + BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + + return seq; +} + +static seq_t ZSTD_decodeSequenceLong(seqState_t *seqState, unsigned const windowSize) +{ + if (ZSTD_highbit32(windowSize) > STREAM_ACCUMULATOR_MIN) { + return ZSTD_decodeSequenceLong_generic(seqState, 1); + } else { + return ZSTD_decodeSequenceLong_generic(seqState, 0); + } +} + +FORCE_INLINE +size_t ZSTD_execSequenceLong(BYTE *op, BYTE *const oend, seq_t sequence, const BYTE **litPtr, const BYTE *const litLimit, const BYTE *const base, + const BYTE *const vBase, const BYTE *const dictEnd) +{ + BYTE *const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE *const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE *const oend_w = oend - WILDCOPY_OVERLENGTH; + const BYTE *const iLitEnd = *litPtr + sequence.litLength; + const BYTE *match = sequence.match; + + /* check */ + if (oMatchEnd > oend) + return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ + if (iLitEnd > litLimit) + return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (oLitEnd > oend_w) + return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); + + /* copy Literals */ + ZSTD_copy8(op, *litPtr); + if (sequence.litLength > 8) + ZSTD_wildcopy(op + 8, (*litPtr) + 8, + sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - base)) { + /* offset beyond prefix */ + if (sequence.offset > (size_t)(oLitEnd - vBase)) + return ERROR(corruption_detected); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currPrefixSegment */ + { + size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = base; + if (op > oend_w || sequence.matchLength < MINMATCH) { + U32 i; + for (i = 0; i < sequence.matchLength; ++i) + op[i] = match[i]; + return sequenceLength; + } + } + } + /* Requirement: op <= oend_w && sequence.matchLength >= MINMATCH */ + + /* match within prefix */ + if (sequence.offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = {0, 1, 2, 1, 4, 4, 4, 4}; /* added */ + static const int dec64table[] = {8, 8, 8, 7, 8, 9, 10, 11}; /* subtracted */ + int const sub2 = dec64table[sequence.offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[sequence.offset]; + ZSTD_copy4(op + 4, match); + 
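/* Why the dec32table/dec64table fix-up is safe: starting from a distance
 * `offset` < 8, the steps (match += dec32table[offset]; copy4; match -= sub2;
 * then op += 8, match += 8) leave a new distance of
 *     offset - dec32table[offset] + dec64table[offset]
 * i.e. 8, 8, 9, 8, 10, 12, 14 for offset = 1..7 : always >= 8 and always a
 * multiple of the original offset, so the repeating pattern laid down by the
 * first 4+4 bytes survives the 8-byte wildcopy that follows.
 * (Worked from the table values above; annotation only, not patch content.) */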
match -= sub2; + } else { + ZSTD_copy8(op, match); + } + op += 8; + match += 8; + + if (oMatchEnd > oend - (16 - MINMATCH)) { + if (op < oend_w) { + ZSTD_wildcopy(op, match, oend_w - op); + match += oend_w - op; + op = oend_w; + } + while (op < oMatchEnd) + *op++ = *match++; + } else { + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8); /* works even if matchLength < 8 */ + } + return sequenceLength; +} + +static size_t ZSTD_decompressSequencesLong(ZSTD_DCtx *dctx, void *dst, size_t maxDstSize, const void *seqStart, size_t seqSize) +{ + const BYTE *ip = (const BYTE *)seqStart; + const BYTE *const iend = ip + seqSize; + BYTE *const ostart = (BYTE * const)dst; + BYTE *const oend = ostart + maxDstSize; + BYTE *op = ostart; + const BYTE *litPtr = dctx->litPtr; + const BYTE *const litEnd = litPtr + dctx->litSize; + const BYTE *const base = (const BYTE *)(dctx->base); + const BYTE *const vBase = (const BYTE *)(dctx->vBase); + const BYTE *const dictEnd = (const BYTE *)(dctx->dictEnd); + unsigned const windowSize = dctx->fParams.windowSize; + int nbSeq; + + /* Build Decoding Tables */ + { + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); + if (ZSTD_isError(seqHSize)) + return seqHSize; + ip += seqHSize; + } + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 4 +#define STOSEQ_MASK (STORED_SEQS - 1) +#define ADVANCED_SEQS 4 + seq_t *sequences = (seq_t *)dctx->entropy.workspace; + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); + seqState_t seqState; + int seqNb; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.workspace) >= sizeof(seq_t) * STORED_SEQS); + dctx->fseEntropy = 1; + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + seqState.prevOffset[i] = dctx->entropy.rep[i]; + } + seqState.base = base; + seqState.pos = (size_t)(op - base); + seqState.gotoDict = (uPtrDiff)dictEnd - (uPtrDiff)base; /* cast to avoid undefined behaviour */ + CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend - ip), corruption_detected); + FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ + for (seqNb = 0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb < seqAdvance; seqNb++) { + sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, windowSize); + } + if (seqNb < seqAdvance) + return ERROR(corruption_detected); + + /* decode and decompress */ + for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNb < nbSeq; seqNb++) { + seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, windowSize); + size_t const oneSeqSize = + ZSTD_execSequenceLong(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd); + if (ZSTD_isError(oneSeqSize)) + return oneSeqSize; + ZSTD_PREFETCH(sequence.match); + sequences[seqNb & STOSEQ_MASK] = sequence; + op += oneSeqSize; + } + if (seqNb < nbSeq) + return ERROR(corruption_detected); + + /* finish queue */ + seqNb -= seqAdvance; + for (; seqNb < nbSeq; seqNb++) { + size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[seqNb & STOSEQ_MASK], &litPtr, litEnd, base, vBase, dictEnd); + if (ZSTD_isError(oneSeqSize)) + return oneSeqSize; + op += oneSeqSize; + } + + /* save reps for next block */ + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); + } + } + + /* last literal segment */ + { + size_t const lastLLSize = 
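The STORED_SEQS/ADVANCED_SEQS machinery above is a 4-deep software pipeline: each iteration decodes one sequence, prefetches its match address, and executes the sequence decoded four iterations earlier, hiding cache misses on large-window matches. The same pattern in isolation, with decode, execute and prefetch as illustrative stand-ins and seq_t as defined earlier:

enum { STORED = 4 };	/* mirrors STORED_SEQS above */

static void pipeline_sketch(int nbSeq, seq_t (*decode)(void),
			    void (*execute)(seq_t), void (*prefetch)(const void *))
{
	seq_t ring[STORED];
	int i;

	for (i = 0; i < nbSeq; i++) {
		seq_t const s = decode();

		prefetch(s.match);				/* issue the load early */
		if (i >= STORED)
			execute(ring[i & (STORED - 1)]);	/* runs 4 sequences behind */
		ring[i & (STORED - 1)] = s;
	}
	for (i = (nbSeq > STORED) ? nbSeq - STORED : 0; i < nbSeq; i++)
		execute(ring[i & (STORED - 1)]);		/* drain the ring */
}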
litEnd - litPtr; + if (lastLLSize > (size_t)(oend - op)) + return ERROR(dstSize_tooSmall); + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + + return op - ostart; +} + +static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ /* blockType == blockCompressed */ + const BYTE *ip = (const BYTE *)src; + + if (srcSize >= ZSTD_BLOCKSIZE_ABSOLUTEMAX) + return ERROR(srcSize_wrong); + + /* Decode literals section */ + { + size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + if (ZSTD_isError(litCSize)) + return litCSize; + ip += litCSize; + srcSize -= litCSize; + } + if (sizeof(size_t) > 4) /* do not enable prefetching on 32-bits x86, as it's performance detrimental */ + /* likely because of register pressure */ + /* if that's the correct cause, then 32-bits ARM should be affected differently */ + /* it would be good to test this on ARM real hardware, to see if prefetch version improves speed */ + if (dctx->fParams.windowSize > (1 << 23)) + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize); + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize); +} + +static void ZSTD_checkContinuity(ZSTD_DCtx *dctx, const void *dst) +{ + if (dst != dctx->previousDstEnd) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->vBase = (const char *)dst - ((const char *)(dctx->previousDstEnd) - (const char *)(dctx->base)); + dctx->base = dst; + dctx->previousDstEnd = dst; + } +} + +size_t ZSTD_decompressBlock(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t dSize; + ZSTD_checkContinuity(dctx, dst); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize); + dctx->previousDstEnd = (char *)dst + dSize; + return dSize; +} + +/** ZSTD_insertBlock() : + insert `src` block into `dctx` history. Useful to track uncompressed blocks. 
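A hedged sketch of how the two exported block entry points fit together when a caller walks a frame's blocks itself: compressed blocks go through ZSTD_decompressBlock(), while stored (raw) blocks are copied by the caller and then registered via ZSTD_insertBlock() so later matches can reference them. handle_block and its parameters are invented for illustration:

static size_t handle_block(ZSTD_DCtx *dctx, void *out, size_t outCap,
			   const void *src, size_t srcSize, int isRaw)
{
	if (isRaw) {
		if (srcSize > outCap)
			return ERROR(dstSize_tooSmall);
		memcpy(out, src, srcSize);
		return ZSTD_insertBlock(dctx, out, srcSize);	/* track history */
	}
	return ZSTD_decompressBlock(dctx, out, outCap, src, srcSize);
}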
*/ +size_t ZSTD_insertBlock(ZSTD_DCtx *dctx, const void *blockStart, size_t blockSize) +{ + ZSTD_checkContinuity(dctx, blockStart); + dctx->previousDstEnd = (const char *)blockStart + blockSize; + return blockSize; +} + +size_t ZSTD_generateNxBytes(void *dst, size_t dstCapacity, BYTE byte, size_t length) +{ + if (length > dstCapacity) + return ERROR(dstSize_tooSmall); + memset(dst, byte, length); + return length; +} + +/** ZSTD_findFrameCompressedSize() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the compressed size of the frame starting at `src` */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) +{ + if (srcSize >= ZSTD_skippableHeaderSize && (ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + return ZSTD_skippableHeaderSize + ZSTD_readLE32((const BYTE *)src + 4); + } else { + const BYTE *ip = (const BYTE *)src; + const BYTE *const ipstart = ip; + size_t remainingSize = srcSize; + ZSTD_frameParams fParams; + + size_t const headerSize = ZSTD_frameHeaderSize(ip, remainingSize); + if (ZSTD_isError(headerSize)) + return headerSize; + + /* Frame Header */ + { + size_t const ret = ZSTD_getFrameParams(&fParams, ip, remainingSize); + if (ZSTD_isError(ret)) + return ret; + if (ret > 0) + return ERROR(srcSize_wrong); + } + + ip += headerSize; + remainingSize -= headerSize; + + /* Loop on each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return cBlockSize; + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ERROR(srcSize_wrong); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= ZSTD_blockHeaderSize + cBlockSize; + + if (blockProperties.lastBlock) + break; + } + + if (fParams.checksumFlag) { /* Frame content checksum */ + if (remainingSize < 4) + return ERROR(srcSize_wrong); + ip += 4; + remainingSize -= 4; + } + + return ip - ipstart; + } +} + +/*! 
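A small usage sketch for ZSTD_findFrameCompressedSize() above: stepping through a buffer of concatenated frames without decompressing them. count_frames is illustrative; per zstd convention, an error code (testable with ZSTD_isError) is returned as-is.

static size_t count_frames(const void *src, size_t srcSize)
{
	size_t nbFrames = 0;

	while (srcSize >= ZSTD_frameHeaderSize_prefix) {
		size_t const fSize = ZSTD_findFrameCompressedSize(src, srcSize);
		if (ZSTD_isError(fSize))
			return fSize;
		src = (const char *)src + fSize;	/* skip to the next frame */
		srcSize -= fSize;
		nbFrames++;
	}
	return nbFrames;	/* trailing bytes shorter than a header are ignored */
}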
ZSTD_decompressFrame() : +* @dctx must be properly initialized */ +static size_t ZSTD_decompressFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void **srcPtr, size_t *srcSizePtr) +{ + const BYTE *ip = (const BYTE *)(*srcPtr); + BYTE *const ostart = (BYTE * const)dst; + BYTE *const oend = ostart + dstCapacity; + BYTE *op = ostart; + size_t remainingSize = *srcSizePtr; + + /* check */ + if (remainingSize < ZSTD_frameHeaderSize_min + ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + + /* Frame Header */ + { + size_t const frameHeaderSize = ZSTD_frameHeaderSize(ip, ZSTD_frameHeaderSize_prefix); + if (ZSTD_isError(frameHeaderSize)) + return frameHeaderSize; + if (remainingSize < frameHeaderSize + ZSTD_blockHeaderSize) + return ERROR(srcSize_wrong); + CHECK_F(ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize)); + ip += frameHeaderSize; + remainingSize -= frameHeaderSize; + } + + /* Loop on each block */ + while (1) { + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return cBlockSize; + + ip += ZSTD_blockHeaderSize; + remainingSize -= ZSTD_blockHeaderSize; + if (cBlockSize > remainingSize) + return ERROR(srcSize_wrong); + + switch (blockProperties.blockType) { + case bt_compressed: decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend - op, ip, cBlockSize); break; + case bt_raw: decodedSize = ZSTD_copyRawBlock(op, oend - op, ip, cBlockSize); break; + case bt_rle: decodedSize = ZSTD_generateNxBytes(op, oend - op, *ip, blockProperties.origSize); break; + case bt_reserved: + default: return ERROR(corruption_detected); + } + + if (ZSTD_isError(decodedSize)) + return decodedSize; + if (dctx->fParams.checksumFlag) + xxh64_update(&dctx->xxhState, op, decodedSize); + op += decodedSize; + ip += cBlockSize; + remainingSize -= cBlockSize; + if (blockProperties.lastBlock) + break; + } + + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + U32 const checkCalc = (U32)xxh64_digest(&dctx->xxhState); + U32 checkRead; + if (remainingSize < 4) + return ERROR(checksum_wrong); + checkRead = ZSTD_readLE32(ip); + if (checkRead != checkCalc) + return ERROR(checksum_wrong); + ip += 4; + remainingSize -= 4; + } + + /* Allow caller to get size read */ + *srcPtr = ip; + *srcSizePtr = remainingSize; + return op - ostart; +} + +static const void *ZSTD_DDictDictContent(const ZSTD_DDict *ddict); +static size_t ZSTD_DDictDictSize(const ZSTD_DDict *ddict); + +static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize, + const ZSTD_DDict *ddict) +{ + void *const dststart = dst; + + if (ddict) { + if (dict) { + /* programmer error, these two cases should be mutually exclusive */ + return ERROR(GENERIC); + } + + dict = ZSTD_DDictDictContent(ddict); + dictSize = ZSTD_DDictDictSize(ddict); + } + + while (srcSize >= ZSTD_frameHeaderSize_prefix) { + U32 magicNumber; + + magicNumber = ZSTD_readLE32(src); + if (magicNumber != ZSTD_MAGICNUMBER) { + if ((magicNumber & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t skippableSize; + if (srcSize < ZSTD_skippableHeaderSize) + return ERROR(srcSize_wrong); + skippableSize = ZSTD_readLE32((const BYTE *)src + 4) + ZSTD_skippableHeaderSize; + if (srcSize < skippableSize) { + return ERROR(srcSize_wrong); + } + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } else { + return 
ERROR(prefix_unknown); + } + } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + ZSTD_refDDict(dctx, ddict); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + CHECK_F(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize)); + } + ZSTD_checkContinuity(dctx, dst); + + { + const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, &src, &srcSize); + if (ZSTD_isError(res)) + return res; + /* don't need to bounds check this, ZSTD_decompressFrame will have + * already */ + dst = (BYTE *)dst + res; + dstCapacity -= res; + } + } + + if (srcSize) + return ERROR(srcSize_wrong); /* input not entirely consumed */ + + return (BYTE *)dst - (BYTE *)dststart; +} + +size_t ZSTD_decompress_usingDict(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const void *dict, size_t dictSize) +{ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); +} + +size_t ZSTD_decompressDCtx(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + return ZSTD_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, NULL, 0); +} + +/*-************************************** +* Advanced Streaming Decompression API +* Bufferless and synchronous +****************************************/ +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx *dctx) { return dctx->expected; } + +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx *dctx) +{ + switch (dctx->stage) { + default: /* should not happen */ + case ZSTDds_getFrameHeaderSize: + case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; + case ZSTDds_decodeBlockHeader: return ZSTDnit_blockHeader; + case ZSTDds_decompressBlock: return ZSTDnit_block; + case ZSTDds_decompressLastBlock: return ZSTDnit_lastBlock; + case ZSTDds_checkChecksum: return ZSTDnit_checksum; + case ZSTDds_decodeSkippableHeader: + case ZSTDds_skipFrame: return ZSTDnit_skippableFrame; + } +} + +int ZSTD_isSkipFrame(ZSTD_DCtx *dctx) { return dctx->stage == ZSTDds_skipFrame; } /* for zbuff */ + +/** ZSTD_decompressContinue() : +* @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity) +* or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_decompressContinue(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + /* Sanity check */ + if (srcSize != dctx->expected) + return ERROR(srcSize_wrong); + if (dstCapacity) + ZSTD_checkContinuity(dctx, dst); + + switch (dctx->stage) { + case ZSTDds_getFrameHeaderSize: + if (srcSize != ZSTD_frameHeaderSize_prefix) + return ERROR(srcSize_wrong); /* impossible */ + if ((ZSTD_readLE32(src) & 0xFFFFFFF0U) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix); + dctx->expected = ZSTD_skippableHeaderSize - ZSTD_frameHeaderSize_prefix; /* magic number + skippable frame length */ + dctx->stage = ZSTDds_decodeSkippableHeader; + return 0; + } + dctx->headerSize = ZSTD_frameHeaderSize(src, ZSTD_frameHeaderSize_prefix); + if (ZSTD_isError(dctx->headerSize)) + return dctx->headerSize; + memcpy(dctx->headerBuffer, src, ZSTD_frameHeaderSize_prefix); + if (dctx->headerSize > ZSTD_frameHeaderSize_prefix) { + dctx->expected = dctx->headerSize - ZSTD_frameHeaderSize_prefix; + dctx->stage = ZSTDds_decodeFrameHeader; + return 0; + } + dctx->expected = 0; /* not necessary to copy more */ + + case ZSTDds_decodeFrameHeader: + memcpy(dctx->headerBuffer + 
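ZSTD_decompressContinue() above implements a strict bufferless contract: every call must receive exactly the byte count announced by ZSTD_nextSrcSizeToDecompress(), and returns 0 while it is still consuming headers. A minimal consumer, assuming a pull callback that can always supply the requested bytes (all names illustrative):

static size_t bufferless_loop(ZSTD_DCtx *dctx, void *dst, size_t dstCap,
			      const BYTE *(*pull)(size_t need))
{
	BYTE *op = (BYTE *)dst;

	CHECK_F(ZSTD_decompressBegin(dctx));
	while (1) {
		size_t const need = ZSTD_nextSrcSizeToDecompress(dctx);
		size_t gen;

		if (!need)
			break;					/* frame fully decoded */
		gen = ZSTD_decompressContinue(dctx, op, dstCap - (size_t)(op - (BYTE *)dst),
					      pull(need), need);
		if (ZSTD_isError(gen))
			return gen;
		op += gen;					/* 0 for header-only steps */
	}
	return (size_t)(op - (BYTE *)dst);
}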
ZSTD_frameHeaderSize_prefix, src, dctx->expected); + CHECK_F(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize)); + dctx->expected = ZSTD_blockHeaderSize; + dctx->stage = ZSTDds_decodeBlockHeader; + return 0; + + case ZSTDds_decodeBlockHeader: { + blockProperties_t bp; + size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp); + if (ZSTD_isError(cBlockSize)) + return cBlockSize; + dctx->expected = cBlockSize; + dctx->bType = bp.blockType; + dctx->rleSize = bp.origSize; + if (cBlockSize) { + dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock; + return 0; + } + /* empty block */ + if (bp.lastBlock) { + if (dctx->fParams.checksumFlag) { + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* end of frame */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->expected = 3; /* go directly to next header */ + dctx->stage = ZSTDds_decodeBlockHeader; + } + return 0; + } + case ZSTDds_decompressLastBlock: + case ZSTDds_decompressBlock: { + size_t rSize; + switch (dctx->bType) { + case bt_compressed: rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize); break; + case bt_raw: rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); break; + case bt_rle: rSize = ZSTD_setRleBlock(dst, dstCapacity, src, srcSize, dctx->rleSize); break; + case bt_reserved: /* should never happen */ + default: return ERROR(corruption_detected); + } + if (ZSTD_isError(rSize)) + return rSize; + if (dctx->fParams.checksumFlag) + xxh64_update(&dctx->xxhState, dst, rSize); + + if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* ends here */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->stage = ZSTDds_decodeBlockHeader; + dctx->expected = ZSTD_blockHeaderSize; + dctx->previousDstEnd = (char *)dst + rSize; + } + return rSize; + } + case ZSTDds_checkChecksum: { + U32 const h32 = (U32)xxh64_digest(&dctx->xxhState); + U32 const check32 = ZSTD_readLE32(src); /* srcSize == 4, guaranteed by dctx->expected */ + if (check32 != h32) + return ERROR(checksum_wrong); + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + case ZSTDds_decodeSkippableHeader: { + memcpy(dctx->headerBuffer + ZSTD_frameHeaderSize_prefix, src, dctx->expected); + dctx->expected = ZSTD_readLE32(dctx->headerBuffer + 4); + dctx->stage = ZSTDds_skipFrame; + return 0; + } + case ZSTDds_skipFrame: { + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + default: + return ERROR(GENERIC); /* impossible */ + } +} + +static size_t ZSTD_refDictContent(ZSTD_DCtx *dctx, const void *dict, size_t dictSize) +{ + dctx->dictEnd = dctx->previousDstEnd; + dctx->vBase = (const char *)dict - ((const char *)(dctx->previousDstEnd) - (const char *)(dctx->base)); + dctx->base = dict; + dctx->previousDstEnd = (const char *)dict + dictSize; + return 0; +} + +/* ZSTD_loadEntropy() : + * dict : must point at beginning of a valid zstd dictionary + * @return : size of entropy tables read */ +static size_t ZSTD_loadEntropy(ZSTD_entropyTables_t *entropy, const void *const dict, size_t const dictSize) +{ + const BYTE *dictPtr = (const BYTE *)dict; + const BYTE *const dictEnd = dictPtr + dictSize; + + if (dictSize <= 8) + return ERROR(dictionary_corrupted); + dictPtr += 8; /* skip header = magic + 
dictID */ + + { + size_t const hSize = HUF_readDTableX4_wksp(entropy->hufTable, dictPtr, dictEnd - dictPtr, entropy->workspace, sizeof(entropy->workspace)); + if (HUF_isError(hSize)) + return ERROR(dictionary_corrupted); + dictPtr += hSize; + } + + { + short offcodeNCount[MaxOff + 1]; + U32 offcodeMaxValue = MaxOff, offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(offcodeHeaderSize)) + return ERROR(dictionary_corrupted); + if (offcodeLog > OffFSELog) + return ERROR(dictionary_corrupted); + CHECK_E(FSE_buildDTable_wksp(entropy->OFTable, offcodeNCount, offcodeMaxValue, offcodeLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted); + dictPtr += offcodeHeaderSize; + } + + { + short matchlengthNCount[MaxML + 1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(matchlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (matchlengthLog > MLFSELog) + return ERROR(dictionary_corrupted); + CHECK_E(FSE_buildDTable_wksp(entropy->MLTable, matchlengthNCount, matchlengthMaxValue, matchlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted); + dictPtr += matchlengthHeaderSize; + } + + { + short litlengthNCount[MaxLL + 1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd - dictPtr); + if (FSE_isError(litlengthHeaderSize)) + return ERROR(dictionary_corrupted); + if (litlengthLog > LLFSELog) + return ERROR(dictionary_corrupted); + CHECK_E(FSE_buildDTable_wksp(entropy->LLTable, litlengthNCount, litlengthMaxValue, litlengthLog, entropy->workspace, sizeof(entropy->workspace)), dictionary_corrupted); + dictPtr += litlengthHeaderSize; + } + + if (dictPtr + 12 > dictEnd) + return ERROR(dictionary_corrupted); + { + int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr + 12)); + for (i = 0; i < 3; i++) { + U32 const rep = ZSTD_readLE32(dictPtr); + dictPtr += 4; + if (rep == 0 || rep >= dictContentSize) + return ERROR(dictionary_corrupted); + entropy->rep[i] = rep; + } + } + + return dictPtr - (const BYTE *)dict; +} + +static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx *dctx, const void *dict, size_t dictSize) +{ + if (dictSize < 8) + return ZSTD_refDictContent(dctx, dict, dictSize); + { + U32 const magic = ZSTD_readLE32(dict); + if (magic != ZSTD_DICT_MAGIC) { + return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ + } + } + dctx->dictID = ZSTD_readLE32((const char *)dict + 4); + + /* load entropy tables */ + { + size_t const eSize = ZSTD_loadEntropy(&dctx->entropy, dict, dictSize); + if (ZSTD_isError(eSize)) + return ERROR(dictionary_corrupted); + dict = (const char *)dict + eSize; + dictSize -= eSize; + } + dctx->litEntropy = dctx->fseEntropy = 1; + + /* reference dictionary content */ + return ZSTD_refDictContent(dctx, dict, dictSize); +} + +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx *dctx, const void *dict, size_t dictSize) +{ + CHECK_F(ZSTD_decompressBegin(dctx)); + if (dict && dictSize) + CHECK_E(ZSTD_decompress_insertDictionary(dctx, dict, dictSize), dictionary_corrupted); + return 0; +} + +/* ====== ZSTD_DDict ====== */ + +struct ZSTD_DDict_s { + void *dictBuffer; + const void *dictContent; + size_t dictSize; + 
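For reference, the dictionary layout that ZSTD_loadEntropy() above walks, summarized from its parsing order (all entropy-section sizes are variable):

/*
 *	offset 0 : U32 magic = ZSTD_DICT_MAGIC
 *	offset 4 : U32 dictID
 *	offset 8 : Huffman literals table (HUF_readDTableX4 format)
 *	     ... : FSE NCount for offsets, then match lengths, then literal lengths
 *	     ... : 3 x U32 starting repeat offsets (each 0 < rep < content size)
 *	    rest : raw content, referenced as decoder history
 */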
ZSTD_entropyTables_t entropy; + U32 dictID; + U32 entropyPresent; + ZSTD_customMem cMem; +}; /* typedef'd to ZSTD_DDict within "zstd.h" */ + +size_t ZSTD_DDictWorkspaceBound(void) { return ZSTD_ALIGN(sizeof(ZSTD_stack)) + ZSTD_ALIGN(sizeof(ZSTD_DDict)); } + +static const void *ZSTD_DDictDictContent(const ZSTD_DDict *ddict) { return ddict->dictContent; } + +static size_t ZSTD_DDictDictSize(const ZSTD_DDict *ddict) { return ddict->dictSize; } + +static void ZSTD_refDDict(ZSTD_DCtx *dstDCtx, const ZSTD_DDict *ddict) +{ + ZSTD_decompressBegin(dstDCtx); /* init */ + if (ddict) { /* support refDDict on NULL */ + dstDCtx->dictID = ddict->dictID; + dstDCtx->base = ddict->dictContent; + dstDCtx->vBase = ddict->dictContent; + dstDCtx->dictEnd = (const BYTE *)ddict->dictContent + ddict->dictSize; + dstDCtx->previousDstEnd = dstDCtx->dictEnd; + if (ddict->entropyPresent) { + dstDCtx->litEntropy = 1; + dstDCtx->fseEntropy = 1; + dstDCtx->LLTptr = ddict->entropy.LLTable; + dstDCtx->MLTptr = ddict->entropy.MLTable; + dstDCtx->OFTptr = ddict->entropy.OFTable; + dstDCtx->HUFptr = ddict->entropy.hufTable; + dstDCtx->entropy.rep[0] = ddict->entropy.rep[0]; + dstDCtx->entropy.rep[1] = ddict->entropy.rep[1]; + dstDCtx->entropy.rep[2] = ddict->entropy.rep[2]; + } else { + dstDCtx->litEntropy = 0; + dstDCtx->fseEntropy = 0; + } + } +} + +static size_t ZSTD_loadEntropy_inDDict(ZSTD_DDict *ddict) +{ + ddict->dictID = 0; + ddict->entropyPresent = 0; + if (ddict->dictSize < 8) + return 0; + { + U32 const magic = ZSTD_readLE32(ddict->dictContent); + if (magic != ZSTD_DICT_MAGIC) + return 0; /* pure content mode */ + } + ddict->dictID = ZSTD_readLE32((const char *)ddict->dictContent + 4); + + /* load entropy tables */ + CHECK_E(ZSTD_loadEntropy(&ddict->entropy, ddict->dictContent, ddict->dictSize), dictionary_corrupted); + ddict->entropyPresent = 1; + return 0; +} + +static ZSTD_DDict *ZSTD_createDDict_advanced(const void *dict, size_t dictSize, unsigned byReference, ZSTD_customMem customMem) +{ + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + { + ZSTD_DDict *const ddict = (ZSTD_DDict *)ZSTD_malloc(sizeof(ZSTD_DDict), customMem); + if (!ddict) + return NULL; + ddict->cMem = customMem; + + if ((byReference) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + } else { + void *const internalBuffer = ZSTD_malloc(dictSize, customMem); + if (!internalBuffer) { + ZSTD_freeDDict(ddict); + return NULL; + } + memcpy(internalBuffer, dict, dictSize); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; + } + ddict->dictSize = dictSize; + ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + /* parse dictionary content */ + { + size_t const errorCode = ZSTD_loadEntropy_inDDict(ddict); + if (ZSTD_isError(errorCode)) { + ZSTD_freeDDict(ddict); + return NULL; + } + } + + return ddict; + } +} + +/*! ZSTD_initDDict() : +* Create a digested dictionary, to start decompression without startup delay. +* `dict` content is referenced rather than copied (this port creates the DDict with byReference == 1).
+* Consequently, `dict` must remain accessible throughout the lifetime of the `ZSTD_DDict` */ +ZSTD_DDict *ZSTD_initDDict(const void *dict, size_t dictSize, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + return ZSTD_createDDict_advanced(dict, dictSize, 1, stackMem); +} + +size_t ZSTD_freeDDict(ZSTD_DDict *ddict) +{ + if (ddict == NULL) + return 0; /* support free on NULL */ + { + ZSTD_customMem const cMem = ddict->cMem; + ZSTD_free(ddict->dictBuffer, cMem); + ZSTD_free(ddict, cMem); + return 0; + } +} + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +unsigned ZSTD_getDictID_fromDict(const void *dict, size_t dictSize) +{ + if (dictSize < 8) + return 0; + if (ZSTD_readLE32(dict) != ZSTD_DICT_MAGIC) + return 0; + return ZSTD_readLE32((const char *)dict + 4); +} + +/*! ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. + * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict *ddict) +{ + if (ddict == NULL) + return 0; + return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); +} + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress the frame stored within `src`. + * If @return == 0, the dictID could not be decoded. + * This could be for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). + * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. + * When identifying the exact failure cause, it's possible to use ZSTD_getFrameParams(), which will provide a more precise error code. */ +unsigned ZSTD_getDictID_fromFrame(const void *src, size_t srcSize) +{ + ZSTD_frameParams zfp = {0, 0, 0, 0}; + size_t const hError = ZSTD_getFrameParams(&zfp, src, srcSize); + if (ZSTD_isError(hError)) + return 0; + return zfp.dictID; +} + +/*! ZSTD_decompress_usingDDict() : +* Decompression using a pre-digested Dictionary +* Use dictionary without significant overhead.
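A hedged usage sketch for the DDict path: digest the dictionary once, then reuse it across many frames. Because this port builds the DDict by reference, the caller keeps `dict` alive for as long as the DDict is used; ddict_demo and its buffers are invented for illustration.

static size_t ddict_demo(void *wksp, size_t wkspSize,
			 const void *dict, size_t dictSize,
			 ZSTD_DCtx *dctx, void *dst, size_t dstCap,
			 const void *src, size_t srcSize)
{
	ZSTD_DDict *const ddict = ZSTD_initDDict(dict, dictSize, wksp, wkspSize);

	if (!ddict)
		return ERROR(memory_allocation);
	/* `dict` is referenced, not copied: it must stay readable here */
	return ZSTD_decompress_usingDDict(dctx, dst, dstCap, src, srcSize, ddict);
}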
*/ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx *dctx, void *dst, size_t dstCapacity, const void *src, size_t srcSize, const ZSTD_DDict *ddict) +{ + /* pass content and size in case legacy frames are encountered */ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, NULL, 0, ddict); +} + +/*===================================== +* Streaming decompression +*====================================*/ + +typedef enum { zdss_init, zdss_loadHeader, zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; + +/* *** Resource management *** */ +struct ZSTD_DStream_s { + ZSTD_DCtx *dctx; + ZSTD_DDict *ddictLocal; + const ZSTD_DDict *ddict; + ZSTD_frameParams fParams; + ZSTD_dStreamStage stage; + char *inBuff; + size_t inBuffSize; + size_t inPos; + size_t maxWindowSize; + char *outBuff; + size_t outBuffSize; + size_t outStart; + size_t outEnd; + size_t blockSize; + BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; /* tmp buffer to store frame header */ + size_t lhSize; + ZSTD_customMem customMem; + void *legacyContext; + U32 previousLegacyVersion; + U32 legacyVersion; + U32 hostageByte; +}; /* typedef'd to ZSTD_DStream within "zstd.h" */ + +size_t ZSTD_DStreamWorkspaceBound(size_t maxWindowSize) +{ + size_t const blockSize = MIN(maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); + size_t const inBuffSize = blockSize; + size_t const outBuffSize = maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2; + return ZSTD_DCtxWorkspaceBound() + ZSTD_ALIGN(sizeof(ZSTD_DStream)) + ZSTD_ALIGN(inBuffSize) + ZSTD_ALIGN(outBuffSize); +} + +static ZSTD_DStream *ZSTD_createDStream_advanced(ZSTD_customMem customMem) +{ + ZSTD_DStream *zds; + + if (!customMem.customAlloc || !customMem.customFree) + return NULL; + + zds = (ZSTD_DStream *)ZSTD_malloc(sizeof(ZSTD_DStream), customMem); + if (zds == NULL) + return NULL; + memset(zds, 0, sizeof(ZSTD_DStream)); + memcpy(&zds->customMem, &customMem, sizeof(ZSTD_customMem)); + zds->dctx = ZSTD_createDCtx_advanced(customMem); + if (zds->dctx == NULL) { + ZSTD_freeDStream(zds); + return NULL; + } + zds->stage = zdss_init; + zds->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + return zds; +} + +ZSTD_DStream *ZSTD_initDStream(size_t maxWindowSize, void *workspace, size_t workspaceSize) +{ + ZSTD_customMem const stackMem = ZSTD_initStack(workspace, workspaceSize); + ZSTD_DStream *zds = ZSTD_createDStream_advanced(stackMem); + if (!zds) { + return NULL; + } + + zds->maxWindowSize = maxWindowSize; + zds->stage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; + ZSTD_freeDDict(zds->ddictLocal); + zds->ddictLocal = NULL; + zds->ddict = zds->ddictLocal; + zds->legacyVersion = 0; + zds->hostageByte = 0; + + { + size_t const blockSize = MIN(zds->maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); + size_t const neededOutSize = zds->maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2; + + zds->inBuff = (char *)ZSTD_malloc(blockSize, zds->customMem); + zds->inBuffSize = blockSize; + zds->outBuff = (char *)ZSTD_malloc(neededOutSize, zds->customMem); + zds->outBuffSize = neededOutSize; + if (zds->inBuff == NULL || zds->outBuff == NULL) { + ZSTD_freeDStream(zds); + return NULL; + } + } + return zds; +} + +ZSTD_DStream *ZSTD_initDStream_usingDDict(size_t maxWindowSize, const ZSTD_DDict *ddict, void *workspace, size_t workspaceSize) +{ + ZSTD_DStream *zds = ZSTD_initDStream(maxWindowSize, workspace, workspaceSize); + if (zds) { + zds->ddict = ddict; + } + return zds; +} + +size_t ZSTD_freeDStream(ZSTD_DStream *zds) +{ + if (zds == NULL) + return 0; /* support free on null */ 
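A sketch of the intended allocation pattern for the stream API above: size the workspace with ZSTD_DStreamWorkspaceBound() and hand the whole region to ZSTD_initDStream(), which carves the context, the struct, and both buffers out of it. The vmalloc() call stands in for whatever allocator the caller prefers; dstream_demo is illustrative.

/* assumes <linux/vmalloc.h>; the workspace must outlive the DStream */
static ZSTD_DStream *dstream_demo(size_t maxWindowSize)
{
	size_t const wkspSize = ZSTD_DStreamWorkspaceBound(maxWindowSize);
	void *const wksp = vmalloc(wkspSize);

	if (!wksp)
		return NULL;
	return ZSTD_initDStream(maxWindowSize, wksp, wkspSize);
}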
+ { + ZSTD_customMem const cMem = zds->customMem; + ZSTD_freeDCtx(zds->dctx); + zds->dctx = NULL; + ZSTD_freeDDict(zds->ddictLocal); + zds->ddictLocal = NULL; + ZSTD_free(zds->inBuff, cMem); + zds->inBuff = NULL; + ZSTD_free(zds->outBuff, cMem); + zds->outBuff = NULL; + ZSTD_free(zds, cMem); + return 0; + } +} + +/* *** Initialization *** */ + +size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX + ZSTD_blockHeaderSize; } +size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_ABSOLUTEMAX; } + +size_t ZSTD_resetDStream(ZSTD_DStream *zds) +{ + zds->stage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; + zds->legacyVersion = 0; + zds->hostageByte = 0; + return ZSTD_frameHeaderSize_prefix; +} + +/* ***** Decompression ***** */ + +ZSTD_STATIC size_t ZSTD_limitCopy(void *dst, size_t dstCapacity, const void *src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + memcpy(dst, src, length); + return length; +} + +size_t ZSTD_decompressStream(ZSTD_DStream *zds, ZSTD_outBuffer *output, ZSTD_inBuffer *input) +{ + const char *const istart = (const char *)(input->src) + input->pos; + const char *const iend = (const char *)(input->src) + input->size; + const char *ip = istart; + char *const ostart = (char *)(output->dst) + output->pos; + char *const oend = (char *)(output->dst) + output->size; + char *op = ostart; + U32 someMoreWork = 1; + + while (someMoreWork) { + switch (zds->stage) { + case zdss_init: + ZSTD_resetDStream(zds); /* transparent reset on starting decoding a new frame */ + /* fall-through */ + + case zdss_loadHeader: { + size_t const hSize = ZSTD_getFrameParams(&zds->fParams, zds->headerBuffer, zds->lhSize); + if (ZSTD_isError(hSize)) + return hSize; + if (hSize != 0) { /* need more input */ + size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ + if (toLoad > (size_t)(iend - ip)) { /* not enough input to load full header */ + memcpy(zds->headerBuffer + zds->lhSize, ip, iend - ip); + zds->lhSize += iend - ip; + input->pos = input->size; + return (MAX(ZSTD_frameHeaderSize_min, hSize) - zds->lhSize) + + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); + zds->lhSize = hSize; + ip += toLoad; + break; + } + + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize && zds->fParams.windowSize /* skippable frame if == 0 */ + && (U64)(size_t)(oend - op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend - istart); + if (cSize <= (size_t)(iend - istart)) { + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds->dctx, op, oend - op, istart, cSize, zds->ddict); + if (ZSTD_isError(decompressedSize)) + return decompressedSize; + ip = istart + cSize; + op += decompressedSize; + zds->dctx->expected = 0; + zds->stage = zdss_init; + someMoreWork = 0; + break; + } + } + + /* Consume header */ + ZSTD_refDDict(zds->dctx, zds->ddict); + { + size_t const h1Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); /* == ZSTD_frameHeaderSize_prefix */ + CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer, h1Size)); + { + size_t const h2Size = ZSTD_nextSrcSizeToDecompress(zds->dctx); + CHECK_F(ZSTD_decompressContinue(zds->dctx, NULL, 0, zds->headerBuffer + h1Size, h2Size)); + } + } + + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + if (zds->fParams.windowSize > zds->maxWindowSize) + return 
ERROR(frameParameter_windowTooLarge); + + /* Buffers are preallocated, but double check */ + { + size_t const blockSize = MIN(zds->maxWindowSize, ZSTD_BLOCKSIZE_ABSOLUTEMAX); + size_t const neededOutSize = zds->maxWindowSize + blockSize + WILDCOPY_OVERLENGTH * 2; + if (zds->inBuffSize < blockSize) { + return ERROR(GENERIC); + } + if (zds->outBuffSize < neededOutSize) { + return ERROR(GENERIC); + } + zds->blockSize = blockSize; + } + zds->stage = zdss_read; + } + /* pass-through */ + + case zdss_read: { + size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx); + if (neededInSize == 0) { /* end of frame */ + zds->stage = zdss_init; + someMoreWork = 0; + break; + } + if ((size_t)(iend - ip) >= neededInSize) { /* decode directly from src */ + const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx); + size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, zds->outBuff + zds->outStart, + (isSkipFrame ? 0 : zds->outBuffSize - zds->outStart), ip, neededInSize); + if (ZSTD_isError(decodedSize)) + return decodedSize; + ip += neededInSize; + if (!decodedSize && !isSkipFrame) + break; /* this was just a header */ + zds->outEnd = zds->outStart + decodedSize; + zds->stage = zdss_flush; + break; + } + if (ip == iend) { + someMoreWork = 0; + break; + } /* no more input */ + zds->stage = zdss_load; + /* pass-through */ + } + + case zdss_load: { + size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds->dctx); + size_t const toLoad = neededInSize - zds->inPos; /* should always be <= remaining space within inBuff */ + size_t loadedSize; + if (toLoad > zds->inBuffSize - zds->inPos) + return ERROR(corruption_detected); /* should never happen */ + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend - ip); + ip += loadedSize; + zds->inPos += loadedSize; + if (loadedSize < toLoad) { + someMoreWork = 0; + break; + } /* not enough input, wait for more */ + + /* decode loaded input */ + { + const int isSkipFrame = ZSTD_isSkipFrame(zds->dctx); + size_t const decodedSize = ZSTD_decompressContinue(zds->dctx, zds->outBuff + zds->outStart, zds->outBuffSize - zds->outStart, + zds->inBuff, neededInSize); + if (ZSTD_isError(decodedSize)) + return decodedSize; + zds->inPos = 0; /* input is consumed */ + if (!decodedSize && !isSkipFrame) { + zds->stage = zdss_read; + break; + } /* this was just a header */ + zds->outEnd = zds->outStart + decodedSize; + zds->stage = zdss_flush; + /* pass-through */ + } + } + + case zdss_flush: { + size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, oend - op, zds->outBuff + zds->outStart, toFlushSize); + op += flushedSize; + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->stage = zdss_read; + if (zds->outStart + zds->blockSize > zds->outBuffSize) + zds->outStart = zds->outEnd = 0; + break; + } + /* cannot complete flush */ + someMoreWork = 0; + break; + } + default: + return ERROR(GENERIC); /* impossible */ + } + } + + /* result */ + input->pos += (size_t)(ip - istart); + output->pos += (size_t)(op - ostart); + { + size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds->dctx); + if (!nextSrcSizeHint) { /* frame fully decoded */ + if (zds->outEnd == zds->outStart) { /* output fully flushed */ + if (zds->hostageByte) { + if (input->pos >= input->size) { + zds->stage = zdss_read; + return 1; + } /* can't release hostage (not present) */ + input->pos++; /* release hostage */ + } + return 0; + } + if (!zds->hostageByte) { /* output not fully flushed; keep 
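A minimal pump loop over ZSTD_decompressStream() above (pump is an illustrative name): a zero return means a frame completed and was fully flushed, while any other non-error value is a hint for the preferred size of the next input chunk.

static size_t pump(ZSTD_DStream *zds, ZSTD_outBuffer *out, ZSTD_inBuffer *in)
{
	while (in->pos < in->size && out->pos < out->size) {
		size_t const hint = ZSTD_decompressStream(zds, out, in);

		if (ZSTD_isError(hint))
			return hint;
		if (hint == 0)
			break;		/* frame done and flushed */
	}
	return 0;
}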
last byte as hostage; will be released when all output is flushed */ + input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */ + zds->hostageByte = 1; + } + return 1; + } + nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds->dctx) == ZSTDnit_block); /* preload header of next block */ + if (zds->inPos > nextSrcSizeHint) + return ERROR(GENERIC); /* should never happen */ + nextSrcSizeHint -= zds->inPos; /* already loaded*/ + return nextSrcSizeHint; + } +} + +EXPORT_SYMBOL(ZSTD_DCtxWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initDCtx); +EXPORT_SYMBOL(ZSTD_decompressDCtx); +EXPORT_SYMBOL(ZSTD_decompress_usingDict); + +EXPORT_SYMBOL(ZSTD_DDictWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initDDict); +EXPORT_SYMBOL(ZSTD_decompress_usingDDict); + +EXPORT_SYMBOL(ZSTD_DStreamWorkspaceBound); +EXPORT_SYMBOL(ZSTD_initDStream); +EXPORT_SYMBOL(ZSTD_initDStream_usingDDict); +EXPORT_SYMBOL(ZSTD_resetDStream); +EXPORT_SYMBOL(ZSTD_decompressStream); +EXPORT_SYMBOL(ZSTD_DStreamInSize); +EXPORT_SYMBOL(ZSTD_DStreamOutSize); + +EXPORT_SYMBOL(ZSTD_findFrameCompressedSize); +EXPORT_SYMBOL(ZSTD_getFrameContentSize); +EXPORT_SYMBOL(ZSTD_findDecompressedSize); + +EXPORT_SYMBOL(ZSTD_isFrame); +EXPORT_SYMBOL(ZSTD_getDictID_fromDict); +EXPORT_SYMBOL(ZSTD_getDictID_fromDDict); +EXPORT_SYMBOL(ZSTD_getDictID_fromFrame); + +EXPORT_SYMBOL(ZSTD_getFrameParams); +EXPORT_SYMBOL(ZSTD_decompressBegin); +EXPORT_SYMBOL(ZSTD_decompressBegin_usingDict); +EXPORT_SYMBOL(ZSTD_copyDCtx); +EXPORT_SYMBOL(ZSTD_nextSrcSizeToDecompress); +EXPORT_SYMBOL(ZSTD_decompressContinue); +EXPORT_SYMBOL(ZSTD_nextInputType); + +EXPORT_SYMBOL(ZSTD_decompressBlock); +EXPORT_SYMBOL(ZSTD_insertBlock); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Decompressor"); diff --git a/lib/zstd/entropy_common.c b/lib/zstd/entropy_common.c new file mode 100644 index 000000000000..2b0a643c32c4 --- /dev/null +++ b/lib/zstd/entropy_common.c @@ -0,0 +1,243 @@ +/* + * Common functions of New Generation Entropy library + * Copyright (C) 2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + */ + +/* ************************************* +* Dependencies +***************************************/ +#include "error_private.h" /* ERR_*, ERROR */ +#include "fse.h" +#include "huf.h" +#include "mem.h" + +/*=== Version ===*/ +unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; } + +/*=== Error Management ===*/ +unsigned FSE_isError(size_t code) { return ERR_isError(code); } + +unsigned HUF_isError(size_t code) { return ERR_isError(code); } + +/*-************************************************************** +* FSE NCount encoding-decoding +****************************************************************/ +size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSVPtr, unsigned *tableLogPtr, const void *headerBuffer, size_t hbSize) +{ + const BYTE *const istart = (const BYTE *)headerBuffer; + const BYTE *const iend = istart + hbSize; + const BYTE *ip = istart; + int nbBits; + int remaining; + int threshold; + U32 bitStream; + int bitCount; + unsigned charnum = 0; + int previous0 = 0; + + if (hbSize < 4) + return ERROR(srcSize_wrong); + bitStream = ZSTD_readLE32(ip); + nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */ + if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) + return ERROR(tableLog_tooLarge); + bitStream >>= 4; + bitCount = 4; + *tableLogPtr = nbBits; + remaining = (1 << nbBits) + 1; + threshold = 1 << nbBits; + nbBits++; + + while ((remaining > 1) & (charnum <= *maxSVPtr)) { + if (previous0) { + unsigned n0 = charnum; + while ((bitStream & 0xFFFF) == 0xFFFF) { + n0 += 24; + if (ip < iend - 5) { + ip += 2; + bitStream = ZSTD_readLE32(ip) >> bitCount; + } else { + bitStream >>= 16; + bitCount += 16; + } + } + while ((bitStream & 3) == 3) { + n0 += 3; + bitStream >>= 2; + bitCount += 2; + } + n0 += bitStream & 3; + bitCount += 2; + if (n0 > *maxSVPtr) + return ERROR(maxSymbolValue_tooSmall); + while (charnum < n0) + normalizedCounter[charnum++] = 0; + if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) { + ip += bitCount >> 3; + bitCount &= 7; + bitStream = ZSTD_readLE32(ip) >> bitCount; + } else { + bitStream >>= 2; + } + } + { + int const max = (2 * threshold - 1) - remaining; + int count; + + if ((bitStream & (threshold - 1)) < (U32)max) { + count = bitStream & (threshold - 1); + bitCount += nbBits - 1; + } else { + count = bitStream & (2 * threshold - 1); + if (count >= threshold) + count -= max; + bitCount += nbBits; + } + + count--; /* extra accuracy */ + remaining -= count < 0 ? 
-count : count; /* -1 means +1 */ + normalizedCounter[charnum++] = (short)count; + previous0 = !count; + while (remaining < threshold) { + nbBits--; + threshold >>= 1; + } + + if ((ip <= iend - 7) || (ip + (bitCount >> 3) <= iend - 4)) { + ip += bitCount >> 3; + bitCount &= 7; + } else { + bitCount -= (int)(8 * (iend - 4 - ip)); + ip = iend - 4; + } + bitStream = ZSTD_readLE32(ip) >> (bitCount & 31); + } + } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */ + if (remaining != 1) + return ERROR(corruption_detected); + if (bitCount > 32) + return ERROR(corruption_detected); + *maxSVPtr = charnum - 1; + + ip += (bitCount + 7) >> 3; + return ip - istart; +} + +/*! HUF_readStats() : + Read compact Huffman tree, saved by HUF_writeCTable(). + `huffWeight` is destination buffer. + `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32. + @return : size read from `src` , or an error Code . + Note : Needed by HUF_readCTable() and HUF_readDTableX?() . +*/ +size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 weightTotal; + const BYTE *ip = (const BYTE *)src; + size_t iSize; + size_t oSize; + + if (!srcSize) + return ERROR(srcSize_wrong); + iSize = ip[0]; + /* memset(huffWeight, 0, hwSize); */ /* is not necessary, even though some analyzer complain ... */ + + if (iSize >= 128) { /* special header */ + oSize = iSize - 127; + iSize = ((oSize + 1) / 2); + if (iSize + 1 > srcSize) + return ERROR(srcSize_wrong); + if (oSize >= hwSize) + return ERROR(corruption_detected); + ip += 1; + { + U32 n; + for (n = 0; n < oSize; n += 2) { + huffWeight[n] = ip[n / 2] >> 4; + huffWeight[n + 1] = ip[n / 2] & 15; + } + } + } else { /* header compressed with FSE (normal case) */ + if (iSize + 1 > srcSize) + return ERROR(srcSize_wrong); + oSize = FSE_decompress_wksp(huffWeight, hwSize - 1, ip + 1, iSize, 6, workspace, workspaceSize); /* max (hwSize-1) values decoded, as last one is implied */ + if (FSE_isError(oSize)) + return oSize; + } + + /* collect weight stats */ + memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); + weightTotal = 0; + { + U32 n; + for (n = 0; n < oSize; n++) { + if (huffWeight[n] >= HUF_TABLELOG_MAX) + return ERROR(corruption_detected); + rankStats[huffWeight[n]]++; + weightTotal += (1 << huffWeight[n]) >> 1; + } + } + if (weightTotal == 0) + return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ + { + U32 const tableLog = BIT_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) + return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { + U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; + U32 const verif = 1 << BIT_highbit32(rest); + U32 const lastWeight = BIT_highbit32(rest) + 1; + if (verif != rest) + return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; + } + } + + /* check tree construction validity */ + if ((rankStats[1] < 2) || (rankStats[1] & 1)) + return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */ + + /* results */ + *nbSymbolsPtr = (U32)(oSize + 1); + return iSize + 1; +} diff --git a/lib/zstd/error_private.h b/lib/zstd/error_private.h new file mode 100644 index 000000000000..1a60b31f706c --- /dev/null +++ b/lib/zstd/error_private.h @@ -0,0 +1,53 @@ +/** + * Copyright 
(c) 2016-present, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of https://github.com/facebook/zstd.
+ * An additional grant of patent rights can be found in the PATENTS file in the
+ * same directory.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+/* ****************************************
+* Dependencies
+******************************************/
+#include <linux/types.h> /* size_t */
+#include <linux/zstd.h>  /* enum list */
+
+/* ****************************************
+* Compiler-specific
+******************************************/
+#define ERR_STATIC static __attribute__((unused))
+
+/*-****************************************
+* Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+/*-****************************************
+* Error codes handling
+******************************************/
+#define ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code)
+{
+	if (!ERR_isError(code))
+		return (ERR_enum)0;
+	return (ERR_enum)(0 - code);
+}
+
+#endif /* ERROR_H_MODULE */
diff --git a/lib/zstd/fse.h b/lib/zstd/fse.h
new file mode 100644
index 000000000000..7460ab04b191
--- /dev/null
+++ b/lib/zstd/fse.h
@@ -0,0 +1,575 @@
+/*
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+#ifndef FSE_H
+#define FSE_H
+
+/*-*****************************************
+* Dependencies
+******************************************/
+#include <linux/types.h> /* size_t, ptrdiff_t */
+
+/*-*****************************************
+* FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#define FSE_PUBLIC_API
+
+/*------ Version ------*/
+#define FSE_VERSION_MAJOR 0
+#define FSE_VERSION_MINOR 9
+#define FSE_VERSION_RELEASE 0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR * 100 * 100 + FSE_VERSION_MINOR * 100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */
+
+/*-*****************************************
+* Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */
+
+/*-*****************************************
+* FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrence from source[] into table count[]
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+/*! FSE_optimalTableLog():
+   dynamically downsize 'tableLog' when conditions are met.
+   It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+   @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+   normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+   'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+   @return : tableLog,
+	     or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+   Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
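+   (as implemented in fse_compress.c, roughly ((maxSymbolValue + 1) * tableLog) / 8 + 3 bytes.)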
+ Typically useful for allocation purpose. */ +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_writeNCount(): + Compactly save 'normalizedCounter' into 'buffer'. + @return : size of the compressed table, + or an errorCode, which can be tested using FSE_isError(). */ +FSE_PUBLIC_API size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ + +/*! FSE_compress_usingCTable(): + Compress `src` using `ct` into `dst` which must be already allocated. + @return : size of compressed data (<= `dstCapacity`), + or 0 if compressed data could not fit into `dst`, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_compress_usingCTable(void *dst, size_t dstCapacity, const void *src, size_t srcSize, const FSE_CTable *ct); + +/*! +Tutorial : +---------- +The first step is to count all symbols. FSE_count() does this job very fast. +Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. +'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] +maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) +FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). + +The next step is to normalize the frequencies. +FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". +If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), +which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). + +The result of FSE_normalizeCount() will be saved into a table, +called 'normalizedCounter', which is a table of signed short. +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. +The return value is tableLog if everything proceeded as expected. +It is 0 if there is a single symbol within distribution. +If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). + +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. +For guaranteed success, buffer size must be at least FSE_headerBound(). +The result of the function is the number of bytes written into 'buffer'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). + +'normalizedCounter' can then be used to create the compression table 'CTable'. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). +You can then use FSE_buildCTable() to fill 'CTable'. +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). 
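+
+Putting the pieces together, a minimal sketch (illustration only : `src`, `srcSize`,
+`dst`, `dstCapacity`, a `wksp` of `1024` unsigned with its `wkspSize`, and a large
+enough `ct` are assumed to exist; this port exposes the _wksp variants; every call
+below can fail and should be checked with FSE_isError()) :
+	unsigned count[256];
+	unsigned maxSym = 255;
+	short norm[256];
+	unsigned tableLog;
+	size_t hSize, cSize;
+	FSE_count_wksp(count, &maxSym, src, srcSize, wksp);
+	tableLog = FSE_optimalTableLog(0, srcSize, maxSym);
+	FSE_normalizeCount(norm, tableLog, count, srcSize, maxSym);
+	hSize = FSE_writeNCount(dst, dstCapacity, norm, maxSym, tableLog);
+	FSE_buildCTable_wksp(ct, norm, maxSym, tableLog, wksp, wkspSize);
+	cSize = FSE_compress_usingCTable(dst + hSize, dstCapacity - hSize, src, srcSize, ct);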
+ +'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). +Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' +The function returns the size of compressed data (without header), necessarily <= `dstCapacity`. +If it returns '0', compressed data could not fit into 'dst'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +*/ + +/* *** DECOMPRESSION *** */ + +/*! FSE_readNCount(): + Read compactly saved 'normalizedCounter' from 'rBuffer'. + @return : size read from 'rBuffer', + or an errorCode, which can be tested using FSE_isError(). + maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ +FSE_PUBLIC_API size_t FSE_readNCount(short *normalizedCounter, unsigned *maxSymbolValuePtr, unsigned *tableLogPtr, const void *rBuffer, size_t rBuffSize); + +/*! Constructor and Destructor of FSE_DTable. + Note that its size depends on 'tableLog' */ +typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ + +/*! FSE_buildDTable(): + Builds 'dt', which must be already allocated, using FSE_createDTable(). + return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize); + +/*! FSE_decompress_usingDTable(): + Decompress compressed source `cSrc` of size `cSrcSize` using `dt` + into `dst` which must be already allocated. + @return : size of regenerated data (necessarily <= `dstCapacity`), + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt); + +/*! +Tutorial : +---------- +(Note : these functions only decompress FSE-compressed blocks. + If block is uncompressed, use memcpy() instead + If block is a single repeated byte, use memset() instead ) + +The first step is to obtain the normalized frequencies of symbols. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. +In practice, that means it's necessary to know 'maxSymbolValue' beforehand, +or size the table to handle worst case situations (typically 256). +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. +This is performed by the function FSE_buildDTable(). +The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable(). +`cSrcSize` must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). 
(ex: dst buffer too small) +*/ + +/* *** Dependency *** */ +#include "bitstream.h" + +/* ***************************************** +* Static allocation +*******************************************/ +/* FSE buffer bounds */ +#define FSE_NCOUNTBOUND 512 +#define FSE_BLOCKBOUND(size) (size + (size >> 7)) +#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */ +#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1 << (maxTableLog - 1)) + ((maxSymbolValue + 1) * 2)) +#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1 << maxTableLog)) + +/* ***************************************** +* FSE advanced API +*******************************************/ +/* FSE_count_wksp() : + * Same as FSE_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= `1024` unsigned + */ +size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace); + +/* FSE_countFast_wksp() : + * Same as FSE_countFast(), but using an externally provided scratch buffer. + * `workSpace` must be a table of minimum `1024` unsigned + */ +size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize, unsigned *workSpace); + +/*! FSE_count_simple + * Same as FSE_countFast(), but does not use any additional memory (not even on stack). + * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`). +*/ +size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize); + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); +/**< same as FSE_optimalTableLog(), which used `minus==2` */ + +size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits); +/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ + +size_t FSE_buildCTable_rle(FSE_CTable *ct, unsigned char symbolValue); +/**< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +/* FSE_buildCTable_wksp() : + * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). + * `wkspSize` must be >= `(1<= BIT_DStream_completed + +When it's done, verify decompression is fully completed, by checking both DStream and the relevant states. +Checking if DStream has reached its end is performed by : + BIT_endOfDStream(&DStream); +Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible. 
+ FSE_endOfDState(&DState); +*/ + +/* ***************************************** +* FSE unsafe API +*******************************************/ +static unsigned char FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD); +/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */ + +/* ***************************************** +* Implementation of inlined functions +*******************************************/ +typedef struct { + int deltaFindState; + U32 deltaNbBits; +} FSE_symbolCompressionTransform; /* total 8 bytes */ + +ZSTD_STATIC void FSE_initCState(FSE_CState_t *statePtr, const FSE_CTable *ct) +{ + const void *ptr = ct; + const U16 *u16ptr = (const U16 *)ptr; + const U32 tableLog = ZSTD_read16(ptr); + statePtr->value = (ptrdiff_t)1 << tableLog; + statePtr->stateTable = u16ptr + 2; + statePtr->symbolTT = ((const U32 *)ct + 1 + (tableLog ? (1 << (tableLog - 1)) : 1)); + statePtr->stateLog = tableLog; +} + +/*! FSE_initCState2() : +* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read) +* uses the smallest state value possible, saving the cost of this symbol */ +ZSTD_STATIC void FSE_initCState2(FSE_CState_t *statePtr, const FSE_CTable *ct, U32 symbol) +{ + FSE_initCState(statePtr, ct); + { + const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol]; + const U16 *stateTable = (const U16 *)(statePtr->stateTable); + U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1 << 15)) >> 16); + statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits; + statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } +} + +ZSTD_STATIC void FSE_encodeSymbol(BIT_CStream_t *bitC, FSE_CState_t *statePtr, U32 symbol) +{ + const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform *)(statePtr->symbolTT))[symbol]; + const U16 *const stateTable = (const U16 *)(statePtr->stateTable); + U32 nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); + BIT_addBits(bitC, statePtr->value, nbBitsOut); + statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; +} + +ZSTD_STATIC void FSE_flushCState(BIT_CStream_t *bitC, const FSE_CState_t *statePtr) +{ + BIT_addBits(bitC, statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); +} + +/* ====== Decompression ====== */ + +typedef struct { + U16 tableLog; + U16 fastMode; +} FSE_DTableHeader; /* sizeof U32 */ + +typedef struct { + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 */ + +ZSTD_STATIC void FSE_initDState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD, const FSE_DTable *dt) +{ + const void *ptr = dt; + const FSE_DTableHeader *const DTableH = (const FSE_DTableHeader *)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +ZSTD_STATIC BYTE FSE_peekSymbol(const FSE_DState_t *DStatePtr) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state]; + return DInfo.symbol; +} + +ZSTD_STATIC void FSE_updateState(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.newState + lowBits; +} + +ZSTD_STATIC BYTE FSE_decodeSymbol(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD) +{ 
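+	/* look up the cell for the current state : emit its symbol, then rebase the state with nbBits fresh bits from the stream */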
+ FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBits(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +/*! FSE_decodeSymbolFast() : + unsafe, only works if no symbol has a probability > 50% */ +ZSTD_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t *DStatePtr, BIT_DStream_t *bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t *)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBitsFast(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +ZSTD_STATIC unsigned FSE_endOfDState(const FSE_DState_t *DStatePtr) { return DStatePtr->state == 0; } + +/* ************************************************************** +* Tuning parameters +****************************************************************/ +/*!MEMORY_USAGE : +* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +* Increasing memory usage improves compression ratio +* Reduced memory usage can improve speed, due to cache effect +* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ +#ifndef FSE_MAX_MEMORY_USAGE +#define FSE_MAX_MEMORY_USAGE 14 +#endif +#ifndef FSE_DEFAULT_MEMORY_USAGE +#define FSE_DEFAULT_MEMORY_USAGE 13 +#endif + +/*!FSE_MAX_SYMBOL_VALUE : +* Maximum symbol value authorized. +* Required for proper stack allocation */ +#ifndef FSE_MAX_SYMBOL_VALUE +#define FSE_MAX_SYMBOL_VALUE 255 +#endif + +/* ************************************************************** +* template functions type & suffix +****************************************************************/ +#define FSE_FUNCTION_TYPE BYTE +#define FSE_FUNCTION_EXTENSION +#define FSE_DECODE_TYPE FSE_decode_t + +/* *************************************************************** +* Constants +*****************************************************************/ +#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE - 2) +#define FSE_MAX_TABLESIZE (1U << FSE_MAX_TABLELOG) +#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE - 1) +#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE - 2) +#define FSE_MIN_TABLELOG 5 + +#define FSE_TABLELOG_ABSOLUTE_MAX 15 +#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX +#error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported" +#endif + +#define FSE_TABLESTEP(tableSize) ((tableSize >> 1) + (tableSize >> 3) + 3) + +#endif /* FSE_H */ diff --git a/lib/zstd/fse_compress.c b/lib/zstd/fse_compress.c new file mode 100644 index 000000000000..ef3d1741d532 --- /dev/null +++ b/lib/zstd/fse_compress.c @@ -0,0 +1,795 @@ +/* + * FSE : Finite State Entropy encoder + * Copyright (C) 2013-2015, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X, Y) X##Y
+#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
+#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable *ct, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
+{
+	U32 const tableSize = 1 << tableLog;
+	U32 const tableMask = tableSize - 1;
+	void *const ptr = ct;
+	U16 *const tableU16 = ((U16 *)ptr) + 2;
+	void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableLog ? tableSize >> 1 : 1);
+	FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT);
+	U32 const step = FSE_TABLESTEP(tableSize);
+	U32 highThreshold = tableSize - 1;
+
+	U32 *cumul;
+	FSE_FUNCTION_TYPE *tableSymbol;
+	size_t spaceUsed32 = 0;
+
+	cumul = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += FSE_MAX_SYMBOL_VALUE + 2;
+	tableSymbol = (FSE_FUNCTION_TYPE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(sizeof(FSE_FUNCTION_TYPE) * ((size_t)1 << tableLog), sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	/* CTable header */
+	tableU16[-2] = (U16)tableLog;
+	tableU16[-1] = (U16)maxSymbolValue;
+
+	/* For explanations on how to distribute symbol values over the table :
+	 * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+	/* symbol start positions */
+	{
+		U32 u;
+		cumul[0] = 0;
+		for (u = 1; u <= maxSymbolValue + 1; u++) {
+			if (normalizedCounter[u - 1] == -1) { /* Low proba symbol */
+				cumul[u] = cumul[u - 1] + 1;
+				tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u - 1);
+			} else {
+				cumul[u] = cumul[u - 1] + normalizedCounter[u - 1];
+			}
+		}
+		cumul[maxSymbolValue + 1] = tableSize + 1;
+	}
+
+	/* Spread symbols */
+	{
+		U32 position = 0;
+		U32 symbol;
+		for (symbol = 0; symbol <= maxSymbolValue; symbol++) {
+			int nbOccurrences;
+			for (nbOccurrences = 0; nbOccurrences < normalizedCounter[symbol]; nbOccurrences++) {
+				tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+				position = (position + step) & tableMask;
+				while (position > highThreshold)
+					position = (position + step) & tableMask; /* Low proba area */
+			}
+		}
+
+		if (position != 0)
+			return ERROR(GENERIC); /* Must have gone through all positions */
+	}
+
+	/* Build table */
+	{
+		U32 u;
+		for (u = 0; u < tableSize; u++) {
+			FSE_FUNCTION_TYPE s = tableSymbol[u];	     /* note : static analyzer may not understand tableSymbol is properly initialized */
+			tableU16[cumul[s]++] = (U16)(tableSize + u); /* TableU16 : sorted by symbol order; gives next state value */
+		}
+	}
+
+	/* Build Symbol Transformation Table */
+	{
+		unsigned total = 0;
+		unsigned s;
+		for (s = 0; s <= maxSymbolValue; s++) {
+			switch (normalizedCounter[s]) {
+			case 0: break;
+
+			case -1:
+			case 1:
+				symbolTT[s].deltaNbBits = (tableLog << 16) - (1 << tableLog);
+				symbolTT[s].deltaFindState = total - 1;
+				total++;
+				break;
+			default: {
+				U32 const maxBitsOut = tableLog - BIT_highbit32(normalizedCounter[s] - 1);
+				U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+				symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+				symbolTT[s].deltaFindState = total - normalizedCounter[s];
+				total += normalizedCounter[s];
+			}
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*-**************************************************************
+* FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+	size_t const maxHeaderSize = (((maxSymbolValue + 1) * tableLog) >> 3) + 3;
+	return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ?
use default */ +} + +static size_t FSE_writeNCount_generic(void *header, size_t headerBufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, + unsigned writeIsSafe) +{ + BYTE *const ostart = (BYTE *)header; + BYTE *out = ostart; + BYTE *const oend = ostart + headerBufferSize; + int nbBits; + const int tableSize = 1 << tableLog; + int remaining; + int threshold; + U32 bitStream; + int bitCount; + unsigned charnum = 0; + int previous0 = 0; + + bitStream = 0; + bitCount = 0; + /* Table Size */ + bitStream += (tableLog - FSE_MIN_TABLELOG) << bitCount; + bitCount += 4; + + /* Init */ + remaining = tableSize + 1; /* +1 for extra accuracy */ + threshold = tableSize; + nbBits = tableLog + 1; + + while (remaining > 1) { /* stops at 1 */ + if (previous0) { + unsigned start = charnum; + while (!normalizedCounter[charnum]) + charnum++; + while (charnum >= start + 24) { + start += 24; + bitStream += 0xFFFFU << bitCount; + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += 2; + bitStream >>= 16; + } + while (charnum >= start + 3) { + start += 3; + bitStream += 3 << bitCount; + bitCount += 2; + } + bitStream += (charnum - start) << bitCount; + bitCount += 2; + if (bitCount > 16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } + } + { + int count = normalizedCounter[charnum++]; + int const max = (2 * threshold - 1) - remaining; + remaining -= count < 0 ? -count : count; + count++; /* +1 for extra accuracy */ + if (count >= threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ + bitStream += count << bitCount; + bitCount += nbBits; + bitCount -= (count < max); + previous0 = (count == 1); + if (remaining < 1) + return ERROR(GENERIC); + while (remaining < threshold) + nbBits--, threshold >>= 1; + } + if (bitCount > 16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } + } + + /* flush remaining bitStream */ + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream >> 8); + out += (bitCount + 7) / 8; + + if (charnum > maxSymbolValue + 1) + return ERROR(GENERIC); + + return (out - ostart); +} + +size_t FSE_writeNCount(void *buffer, size_t bufferSize, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog) +{ + if (tableLog > FSE_MAX_TABLELOG) + return ERROR(tableLog_tooLarge); /* Unsupported */ + if (tableLog < FSE_MIN_TABLELOG) + return ERROR(GENERIC); /* Unsupported */ + + if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog)) + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0); + + return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1); +} + +/*-************************************************************** +* Counting histogram +****************************************************************/ +/*! FSE_count_simple + This function counts byte values within `src`, and store the histogram into table `count`. + It doesn't use any additional memory. 
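+   For instance (illustration only, `src` and `srcSize` assumed) :
+	unsigned count[256];
+	unsigned maxSym = 255;
+	size_t const largestCount = FSE_count_simple(count, &maxSym, src, srcSize);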
+ But this function is unsafe : it doesn't check that all values within `src` can fit into `count`. + For this reason, prefer using a table `count` with 256 elements. + @return : count of most numerous element +*/ +size_t FSE_count_simple(unsigned *count, unsigned *maxSymbolValuePtr, const void *src, size_t srcSize) +{ + const BYTE *ip = (const BYTE *)src; + const BYTE *const end = ip + srcSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max = 0; + + memset(count, 0, (maxSymbolValue + 1) * sizeof(*count)); + if (srcSize == 0) { + *maxSymbolValuePtr = 0; + return 0; + } + + while (ip < end) + count[*ip++]++; + + while (!count[maxSymbolValue]) + maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + + { + U32 s; + for (s = 0; s <= maxSymbolValue; s++) + if (count[s] > max) + max = count[s]; + } + + return (size_t)max; +} + +/* FSE_count_parallel_wksp() : + * Same as FSE_count_parallel(), but using an externally provided scratch buffer. + * `workSpace` size must be a minimum of `1024 * sizeof(unsigned)`` */ +static size_t FSE_count_parallel_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned checkMax, + unsigned *const workSpace) +{ + const BYTE *ip = (const BYTE *)source; + const BYTE *const iend = ip + sourceSize; + unsigned maxSymbolValue = *maxSymbolValuePtr; + unsigned max = 0; + U32 *const Counting1 = workSpace; + U32 *const Counting2 = Counting1 + 256; + U32 *const Counting3 = Counting2 + 256; + U32 *const Counting4 = Counting3 + 256; + + memset(Counting1, 0, 4 * 256 * sizeof(unsigned)); + + /* safety checks */ + if (!sourceSize) { + memset(count, 0, maxSymbolValue + 1); + *maxSymbolValuePtr = 0; + return 0; + } + if (!maxSymbolValue) + maxSymbolValue = 255; /* 0 == default */ + + /* by stripes of 16 bytes */ + { + U32 cached = ZSTD_read32(ip); + ip += 4; + while (ip < iend - 15) { + U32 c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + c = cached; + cached = ZSTD_read32(ip); + ip += 4; + Counting1[(BYTE)c]++; + Counting2[(BYTE)(c >> 8)]++; + Counting3[(BYTE)(c >> 16)]++; + Counting4[c >> 24]++; + } + ip -= 4; + } + + /* finish last symbols */ + while (ip < iend) + Counting1[*ip++]++; + + if (checkMax) { /* verify stats will fit into destination table */ + U32 s; + for (s = 255; s > maxSymbolValue; s--) { + Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s]; + if (Counting1[s]) + return ERROR(maxSymbolValue_tooSmall); + } + } + + { + U32 s; + for (s = 0; s <= maxSymbolValue; s++) { + count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; + if (count[s] > max) + max = count[s]; + } + } + + while (!count[maxSymbolValue]) + maxSymbolValue--; + *maxSymbolValuePtr = maxSymbolValue; + return (size_t)max; +} + +/* FSE_countFast_wksp() : + * Same as FSE_countFast(), but using an externally provided scratch buffer. 
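+ * Internally the histogram is accumulated into four interleaved 256-entry u32 tables, hence :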
+ * `workSpace` size must be table of >= `1024` unsigned */ +size_t FSE_countFast_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace) +{ + if (sourceSize < 1500) + return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); + return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace); +} + +/* FSE_count_wksp() : + * Same as FSE_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= `1024` unsigned */ +size_t FSE_count_wksp(unsigned *count, unsigned *maxSymbolValuePtr, const void *source, size_t sourceSize, unsigned *workSpace) +{ + if (*maxSymbolValuePtr < 255) + return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 1, workSpace); + *maxSymbolValuePtr = 255; + return FSE_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace); +} + +/*-************************************************************** +* FSE Compression Code +****************************************************************/ +/*! FSE_sizeof_CTable() : + FSE_CTable is a variable size structure which contains : + `U16 tableLog;` + `U16 maxSymbolValue;` + `U16 nextStateNumber[1 << tableLog];` // This size is variable + `FSE_symbolCompressionTransform symbolTT[maxSymbolValue+1];` // This size is variable +Allocation is manual (C standard does not support variable-size structures). +*/ +size_t FSE_sizeof_CTable(unsigned maxSymbolValue, unsigned tableLog) +{ + if (tableLog > FSE_MAX_TABLELOG) + return ERROR(tableLog_tooLarge); + return FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue) * sizeof(U32); +} + +/* provides the minimum logSize to safely represent a distribution */ +static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) +{ + U32 minBitsSrc = BIT_highbit32((U32)(srcSize - 1)) + 1; + U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols; + return minBits; +} + +unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) +{ + U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + if (tableLog == 0) + tableLog = FSE_DEFAULT_TABLELOG; + if (maxBitsSrc < tableLog) + tableLog = maxBitsSrc; /* Accuracy can be reduced */ + if (minBits > tableLog) + tableLog = minBits; /* Need a minimum to safely represent all symbol values */ + if (tableLog < FSE_MIN_TABLELOG) + tableLog = FSE_MIN_TABLELOG; + if (tableLog > FSE_MAX_TABLELOG) + tableLog = FSE_MAX_TABLELOG; + return tableLog; +} + +unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue) +{ + return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2); +} + +/* Secondary normalization method. + To be used when primary method fails. 
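+   (It is selected by FSE_normalizeCount() below when fast rounding leaves a deficit too
+   large to take from the dominant symbol alone, i.e. more than half of its allocation.)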
*/ + +static size_t FSE_normalizeM2(short *norm, U32 tableLog, const unsigned *count, size_t total, U32 maxSymbolValue) +{ + short const NOT_YET_ASSIGNED = -2; + U32 s; + U32 distributed = 0; + U32 ToDistribute; + + /* Init */ + U32 const lowThreshold = (U32)(total >> tableLog); + U32 lowOne = (U32)((total * 3) >> (tableLog + 1)); + + for (s = 0; s <= maxSymbolValue; s++) { + if (count[s] == 0) { + norm[s] = 0; + continue; + } + if (count[s] <= lowThreshold) { + norm[s] = -1; + distributed++; + total -= count[s]; + continue; + } + if (count[s] <= lowOne) { + norm[s] = 1; + distributed++; + total -= count[s]; + continue; + } + + norm[s] = NOT_YET_ASSIGNED; + } + ToDistribute = (1 << tableLog) - distributed; + + if ((total / ToDistribute) > lowOne) { + /* risk of rounding to zero */ + lowOne = (U32)((total * 3) / (ToDistribute * 2)); + for (s = 0; s <= maxSymbolValue; s++) { + if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) { + norm[s] = 1; + distributed++; + total -= count[s]; + continue; + } + } + ToDistribute = (1 << tableLog) - distributed; + } + + if (distributed == maxSymbolValue + 1) { + /* all values are pretty poor; + probably incompressible data (should have already been detected); + find max, then give all remaining points to max */ + U32 maxV = 0, maxC = 0; + for (s = 0; s <= maxSymbolValue; s++) + if (count[s] > maxC) + maxV = s, maxC = count[s]; + norm[maxV] += (short)ToDistribute; + return 0; + } + + if (total == 0) { + /* all of the symbols were low enough for the lowOne or lowThreshold */ + for (s = 0; ToDistribute > 0; s = (s + 1) % (maxSymbolValue + 1)) + if (norm[s] > 0) + ToDistribute--, norm[s]++; + return 0; + } + + { + U64 const vStepLog = 62 - tableLog; + U64 const mid = (1ULL << (vStepLog - 1)) - 1; + U64 const rStep = div_u64((((U64)1 << vStepLog) * ToDistribute) + mid, (U32)total); /* scale on remaining */ + U64 tmpTotal = mid; + for (s = 0; s <= maxSymbolValue; s++) { + if (norm[s] == NOT_YET_ASSIGNED) { + U64 const end = tmpTotal + (count[s] * rStep); + U32 const sStart = (U32)(tmpTotal >> vStepLog); + U32 const sEnd = (U32)(end >> vStepLog); + U32 const weight = sEnd - sStart; + if (weight < 1) + return ERROR(GENERIC); + norm[s] = (short)weight; + tmpTotal = end; + } + } + } + + return 0; +} + +size_t FSE_normalizeCount(short *normalizedCounter, unsigned tableLog, const unsigned *count, size_t total, unsigned maxSymbolValue) +{ + /* Sanity checks */ + if (tableLog == 0) + tableLog = FSE_DEFAULT_TABLELOG; + if (tableLog < FSE_MIN_TABLELOG) + return ERROR(GENERIC); /* Unsupported size */ + if (tableLog > FSE_MAX_TABLELOG) + return ERROR(tableLog_tooLarge); /* Unsupported size */ + if (tableLog < FSE_minTableLog(total, maxSymbolValue)) + return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */ + + { + U32 const rtbTable[] = {0, 473195, 504333, 520860, 550000, 700000, 750000, 830000}; + U64 const scale = 62 - tableLog; + U64 const step = div_u64((U64)1 << 62, (U32)total); /* <== here, one division ! 
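+								(each per-symbol probability below then costs only a multiply and a shift)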
*/ + U64 const vStep = 1ULL << (scale - 20); + int stillToDistribute = 1 << tableLog; + unsigned s; + unsigned largest = 0; + short largestP = 0; + U32 lowThreshold = (U32)(total >> tableLog); + + for (s = 0; s <= maxSymbolValue; s++) { + if (count[s] == total) + return 0; /* rle special case */ + if (count[s] == 0) { + normalizedCounter[s] = 0; + continue; + } + if (count[s] <= lowThreshold) { + normalizedCounter[s] = -1; + stillToDistribute--; + } else { + short proba = (short)((count[s] * step) >> scale); + if (proba < 8) { + U64 restToBeat = vStep * rtbTable[proba]; + proba += (count[s] * step) - ((U64)proba << scale) > restToBeat; + } + if (proba > largestP) + largestP = proba, largest = s; + normalizedCounter[s] = proba; + stillToDistribute -= proba; + } + } + if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) { + /* corner case, need another normalization method */ + size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue); + if (FSE_isError(errorCode)) + return errorCode; + } else + normalizedCounter[largest] += (short)stillToDistribute; + } + + return tableLog; +} + +/* fake FSE_CTable, for raw (uncompressed) input */ +size_t FSE_buildCTable_raw(FSE_CTable *ct, unsigned nbBits) +{ + const unsigned tableSize = 1 << nbBits; + const unsigned tableMask = tableSize - 1; + const unsigned maxSymbolValue = tableMask; + void *const ptr = ct; + U16 *const tableU16 = ((U16 *)ptr) + 2; + void *const FSCT = ((U32 *)ptr) + 1 /* header */ + (tableSize >> 1); /* assumption : tableLog >= 1 */ + FSE_symbolCompressionTransform *const symbolTT = (FSE_symbolCompressionTransform *)(FSCT); + unsigned s; + + /* Sanity checks */ + if (nbBits < 1) + return ERROR(GENERIC); /* min size */ + + /* header */ + tableU16[-2] = (U16)nbBits; + tableU16[-1] = (U16)maxSymbolValue; + + /* Build table */ + for (s = 0; s < tableSize; s++) + tableU16[s] = (U16)(tableSize + s); + + /* Build Symbol Transformation Table */ + { + const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits); + for (s = 0; s <= maxSymbolValue; s++) { + symbolTT[s].deltaNbBits = deltaNbBits; + symbolTT[s].deltaFindState = s - 1; + } + } + + return 0; +} + +/* fake FSE_CTable, for rle input (always same symbol) */ +size_t FSE_buildCTable_rle(FSE_CTable *ct, BYTE symbolValue) +{ + void *ptr = ct; + U16 *tableU16 = ((U16 *)ptr) + 2; + void *FSCTptr = (U32 *)ptr + 2; + FSE_symbolCompressionTransform *symbolTT = (FSE_symbolCompressionTransform *)FSCTptr; + + /* header */ + tableU16[-2] = (U16)0; + tableU16[-1] = (U16)symbolValue; + + /* Build table */ + tableU16[0] = 0; + tableU16[1] = 0; /* just in case */ + + /* Build Symbol Transformation Table */ + symbolTT[symbolValue].deltaNbBits = 0; + symbolTT[symbolValue].deltaFindState = 0; + + return 0; +} + +static size_t FSE_compress_usingCTable_generic(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct, const unsigned fast) +{ + const BYTE *const istart = (const BYTE *)src; + const BYTE *const iend = istart + srcSize; + const BYTE *ip = iend; + + BIT_CStream_t bitC; + FSE_CState_t CState1, CState2; + + /* init */ + if (srcSize <= 2) + return 0; + { + size_t const initError = BIT_initCStream(&bitC, dst, dstSize); + if (FSE_isError(initError)) + return 0; /* not enough space available to write a bitstream */ + } + +#define FSE_FLUSHBITS(s) (fast ? 
BIT_flushBitsFast(s) : BIT_flushBits(s)) + + if (srcSize & 1) { + FSE_initCState2(&CState1, ct, *--ip); + FSE_initCState2(&CState2, ct, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + FSE_FLUSHBITS(&bitC); + } else { + FSE_initCState2(&CState2, ct, *--ip); + FSE_initCState2(&CState1, ct, *--ip); + } + + /* join to mod 4 */ + srcSize -= 2; + if ((sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) && (srcSize & 2)) { /* test bit 2 */ + FSE_encodeSymbol(&bitC, &CState2, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + FSE_FLUSHBITS(&bitC); + } + + /* 2 or 4 encoding per loop */ + while (ip > istart) { + + FSE_encodeSymbol(&bitC, &CState2, *--ip); + + if (sizeof(bitC.bitContainer) * 8 < FSE_MAX_TABLELOG * 2 + 7) /* this test must be static */ + FSE_FLUSHBITS(&bitC); + + FSE_encodeSymbol(&bitC, &CState1, *--ip); + + if (sizeof(bitC.bitContainer) * 8 > FSE_MAX_TABLELOG * 4 + 7) { /* this test must be static */ + FSE_encodeSymbol(&bitC, &CState2, *--ip); + FSE_encodeSymbol(&bitC, &CState1, *--ip); + } + + FSE_FLUSHBITS(&bitC); + } + + FSE_flushCState(&bitC, &CState2); + FSE_flushCState(&bitC, &CState1); + return BIT_closeCStream(&bitC); +} + +size_t FSE_compress_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const FSE_CTable *ct) +{ + unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize)); + + if (fast) + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1); + else + return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0); +} + +size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); } diff --git a/lib/zstd/fse_decompress.c b/lib/zstd/fse_decompress.c new file mode 100644 index 000000000000..a84300e5a013 --- /dev/null +++ b/lib/zstd/fse_decompress.c @@ -0,0 +1,332 @@ +/* + * FSE : Finite State Entropy decoder + * Copyright (C) 2013-2015, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. 
This program is dual-licensed; you may select
 * either version 2 of the GNU General Public License ("GPL") or BSD license
 * ("BSD").
 *
 * You can contact the author at :
 * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
 */

/* **************************************************************
* Compiler specifics
****************************************************************/
#define FORCE_INLINE static __always_inline

/* **************************************************************
* Includes
****************************************************************/
#include "bitstream.h"
#include "fse.h"
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/string.h> /* memcpy, memset */

/* **************************************************************
* Error Management
****************************************************************/
#define FSE_isError ERR_isError
#define FSE_STATIC_ASSERT(c)                                   \
	{                                                      \
		enum { FSE_static_assert = 1 / (int)(!!(c)) }; \
	} /* use only *after* variable declarations */

/* check and forward error code */
#define CHECK_F(f)                  \
	{                           \
		size_t const e = f; \
		if (FSE_isError(e)) \
			return e;   \
	}

/* **************************************************************
* Templates
****************************************************************/
/*
  designed to be included
  for type-specific functions (template emulation in C)
  Objective is to write these functions only once, for improved maintenance
*/

/* safety checks */
#ifndef FSE_FUNCTION_EXTENSION
#error "FSE_FUNCTION_EXTENSION must be defined"
#endif
#ifndef FSE_FUNCTION_TYPE
#error "FSE_FUNCTION_TYPE must be defined"
#endif

/* Function names */
#define FSE_CAT(X, Y) X##Y
#define FSE_FUNCTION_NAME(X, Y) FSE_CAT(X, Y)
#define FSE_TYPE_NAME(X, Y) FSE_CAT(X, Y)

/* Function templates */

size_t FSE_buildDTable_wksp(FSE_DTable *dt, const short *normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void *workspace, size_t workspaceSize)
{
	void *const tdPtr = dt + 1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
	FSE_DECODE_TYPE *const tableDecode = (FSE_DECODE_TYPE *)(tdPtr);
	U16 *symbolNext = (U16 *)workspace;

	U32 const maxSV1 = maxSymbolValue + 1;
	U32 const tableSize = 1 << tableLog;
	U32 highThreshold = tableSize - 1;

	/* Sanity Checks */
	if (workspaceSize < sizeof(U16) * (FSE_MAX_SYMBOL_VALUE + 1))
		return ERROR(tableLog_tooLarge);
	if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE)
		return ERROR(maxSymbolValue_tooLarge);
	if (tableLog > FSE_MAX_TABLELOG)
		return ERROR(tableLog_tooLarge);

	/* Init, lay down lowprob symbols */
	{
		FSE_DTableHeader DTableH;
		DTableH.tableLog = (U16)tableLog;
		DTableH.fastMode = 1;
		{
			S16 const largeLimit = (S16)(1 << (tableLog - 1));
			U32 s;
			for (s = 0; s < maxSV1; s++) {
				if (normalizedCounter[s] == -1) {
					tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
					symbolNext[s] = 1;
				} else {
					if (normalizedCounter[s] >= largeLimit)
						DTableH.fastMode = 0;
					symbolNext[s] = normalizedCounter[s];
				}
			}
		}
		memcpy(dt, &DTableH, sizeof(DTableH));
	}

	/* Spread symbols */
	{
		U32 const tableMask = tableSize - 1;
		U32 const step = FSE_TABLESTEP(tableSize);
		U32 s, position = 0;
		for (s = 0; s < maxSV1; s++) {
			int i;
			for (i = 0; i < normalizedCounter[s]; i++) {
				tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
				position = (position + step) & tableMask;
				while (position > highThreshold)
					position = (position + step) & tableMask; /* lowprob area */
			}
		}
		if
(position != 0) + return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } + + /* Build Decoding table */ + { + U32 u; + for (u = 0; u < tableSize; u++) { + FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol); + U16 nextState = symbolNext[symbol]++; + tableDecode[u].nbBits = (BYTE)(tableLog - BIT_highbit32((U32)nextState)); + tableDecode[u].newState = (U16)((nextState << tableDecode[u].nbBits) - tableSize); + } + } + + return 0; +} + +/*-******************************************************* +* Decompression (Byte symbols) +*********************************************************/ +size_t FSE_buildDTable_rle(FSE_DTable *dt, BYTE symbolValue) +{ + void *ptr = dt; + FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr; + void *dPtr = dt + 1; + FSE_decode_t *const cell = (FSE_decode_t *)dPtr; + + DTableH->tableLog = 0; + DTableH->fastMode = 0; + + cell->newState = 0; + cell->symbol = symbolValue; + cell->nbBits = 0; + + return 0; +} + +size_t FSE_buildDTable_raw(FSE_DTable *dt, unsigned nbBits) +{ + void *ptr = dt; + FSE_DTableHeader *const DTableH = (FSE_DTableHeader *)ptr; + void *dPtr = dt + 1; + FSE_decode_t *const dinfo = (FSE_decode_t *)dPtr; + const unsigned tableSize = 1 << nbBits; + const unsigned tableMask = tableSize - 1; + const unsigned maxSV1 = tableMask + 1; + unsigned s; + + /* Sanity checks */ + if (nbBits < 1) + return ERROR(GENERIC); /* min size */ + + /* Build Decoding Table */ + DTableH->tableLog = (U16)nbBits; + DTableH->fastMode = 1; + for (s = 0; s < maxSV1; s++) { + dinfo[s].newState = 0; + dinfo[s].symbol = (BYTE)s; + dinfo[s].nbBits = (BYTE)nbBits; + } + + return 0; +} + +FORCE_INLINE size_t FSE_decompress_usingDTable_generic(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt, + const unsigned fast) +{ + BYTE *const ostart = (BYTE *)dst; + BYTE *op = ostart; + BYTE *const omax = op + maxDstSize; + BYTE *const olimit = omax - 3; + + BIT_DStream_t bitD; + FSE_DState_t state1; + FSE_DState_t state2; + + /* Init */ + CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize)); + + FSE_initDState(&state1, &bitD, dt); + FSE_initDState(&state2, &bitD, dt); + +#define FSE_GETSYMBOL(statePtr) fast ? 
FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD) + + /* 4 symbols per loop */ + for (; (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) & (op < olimit); op += 4) { + op[0] = FSE_GETSYMBOL(&state1); + + if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */ + BIT_reloadDStream(&bitD); + + op[1] = FSE_GETSYMBOL(&state2); + + if (FSE_MAX_TABLELOG * 4 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */ + { + if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { + op += 2; + break; + } + } + + op[2] = FSE_GETSYMBOL(&state1); + + if (FSE_MAX_TABLELOG * 2 + 7 > sizeof(bitD.bitContainer) * 8) /* This test must be static */ + BIT_reloadDStream(&bitD); + + op[3] = FSE_GETSYMBOL(&state2); + } + + /* tail */ + /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */ + while (1) { + if (op > (omax - 2)) + return ERROR(dstSize_tooSmall); + *op++ = FSE_GETSYMBOL(&state1); + if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) { + *op++ = FSE_GETSYMBOL(&state2); + break; + } + + if (op > (omax - 2)) + return ERROR(dstSize_tooSmall); + *op++ = FSE_GETSYMBOL(&state2); + if (BIT_reloadDStream(&bitD) == BIT_DStream_overflow) { + *op++ = FSE_GETSYMBOL(&state1); + break; + } + } + + return op - ostart; +} + +size_t FSE_decompress_usingDTable(void *dst, size_t originalSize, const void *cSrc, size_t cSrcSize, const FSE_DTable *dt) +{ + const void *ptr = dt; + const FSE_DTableHeader *DTableH = (const FSE_DTableHeader *)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ + if (fastMode) + return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); + return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +} + +size_t FSE_decompress_wksp(void *dst, size_t dstCapacity, const void *cSrc, size_t cSrcSize, unsigned maxLog, void *workspace, size_t workspaceSize) +{ + const BYTE *const istart = (const BYTE *)cSrc; + const BYTE *ip = istart; + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + size_t NCountLength; + + FSE_DTable *dt; + short *counting; + size_t spaceUsed32 = 0; + + FSE_STATIC_ASSERT(sizeof(FSE_DTable) == sizeof(U32)); + + dt = (FSE_DTable *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += FSE_DTABLE_SIZE_U32(maxLog); + counting = (short *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(short) * (FSE_MAX_SYMBOL_VALUE + 1), sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* normal FSE decoding mode */ + NCountLength = FSE_readNCount(counting, &maxSymbolValue, &tableLog, istart, cSrcSize); + if (FSE_isError(NCountLength)) + return NCountLength; + // if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); /* too small input size; supposed to be already checked in NCountLength, only remaining + // case : NCountLength==cSrcSize */ + if (tableLog > maxLog) + return ERROR(tableLog_tooLarge); + ip += NCountLength; + cSrcSize -= NCountLength; + + CHECK_F(FSE_buildDTable_wksp(dt, counting, maxSymbolValue, tableLog, workspace, workspaceSize)); + + return FSE_decompress_usingDTable(dst, dstCapacity, ip, cSrcSize, dt); /* always return, even if it is an error code */ +} diff --git a/lib/zstd/huf.h b/lib/zstd/huf.h new file mode 100644 index 000000000000..2143da28d952 --- /dev/null +++ b/lib/zstd/huf.h @@ -0,0 +1,212 @@ +/* + 
* Huffman coder, part of New Generation Entropy library
 * header file
 * Copyright (C) 2013-2016, Yann Collet.
 *
 * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License version 2 as published by the
 * Free Software Foundation. This program is dual-licensed; you may select
 * either version 2 of the GNU General Public License ("GPL") or BSD license
 * ("BSD").
 *
 * You can contact the author at :
 * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
 */
#ifndef HUF_H_298734234
#define HUF_H_298734234

/* *** Dependencies *** */
#include <linux/types.h> /* size_t */

/* *** Tool functions *** */
#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */

/* Error Management */
unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */

/* *** Advanced function *** */

/** HUF_compress4X_wksp() :
* Same as HUF_compress2(), but uses externally allocated `workSpace`, which must be a table of >= 1024 unsigned */
size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */

/* *** Dependencies *** */
#include "mem.h" /* U32 */

/* *** Constants *** */
#define HUF_TABLELOG_MAX 12     /* max configured tableLog (for static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */
#define HUF_TABLELOG_DEFAULT 11 /* tableLog by default, when not specified */
#define HUF_SYMBOLVALUE_MAX 255

#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */
#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
#error "HUF_TABLELOG_MAX is too large !"
+#endif + +/* **************************************** +* Static allocation +******************************************/ +/* HUF buffer bounds */ +#define HUF_CTABLEBOUND 129 +#define HUF_BLOCKBOUND(size) (size + (size >> 8) + 8) /* only true if incompressible pre-filtered with fast heuristic */ +#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* static allocation of HUF's Compression Table */ +#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ + U32 name##hb[maxSymbolValue + 1]; \ + void *name##hv = &(name##hb); \ + HUF_CElt *name = (HUF_CElt *)(name##hv) /* no final ; */ + +/* static allocation of HUF's DTable */ +typedef U32 HUF_DTable; +#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1 << (maxTableLog))) +#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = {((U32)((maxTableLog)-1) * 0x01000001)} +#define HUF_CREATE_STATIC_DTABLEX4(DTable, maxTableLog) HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = {((U32)(maxTableLog)*0x01000001)} + +/* The workspace must have alignment at least 4 and be at least this large */ +#define HUF_COMPRESS_WORKSPACE_SIZE (6 << 10) +#define HUF_COMPRESS_WORKSPACE_SIZE_U32 (HUF_COMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +/* The workspace must have alignment at least 4 and be at least this large */ +#define HUF_DECOMPRESS_WORKSPACE_SIZE (3 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +/* **************************************** +* Advanced decompression functions +******************************************/ +size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize); /**< decodes RLE and uncompressed */ +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, + size_t workspaceSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, + size_t workspaceSize); /**< single-symbol decoder */ +size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, + size_t workspaceSize); /**< double-symbols decoder */ + +/* **************************************** +* HUF detailed API +******************************************/ +/*! +HUF_compress() does the following: +1. count symbol occurrence from source[] into table count[] using FSE_count() +2. (optional) refine tableLog using HUF_optimalTableLog() +3. build Huffman table from count using HUF_buildCTable() +4. save Huffman table to memory buffer using HUF_writeCTable_wksp() +5. encode the data stream using HUF_compress4X_usingCTable() + +The following API allows targeting specific sub-functions for advanced tasks. +For example, it's possible to compress several blocks using the same 'CTable', +or to save and regenerate 'CTable' using external methods. 
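+
+An illustrative sketch of steps 1-5 using the _wksp variants declared below
+(hypothetical caller code : error handling, the rle/incompressible special
+cases, and buffer setup are omitted; FSE_count_wksp() is declared in "fse.h") :
+
+    U32 wksp[HUF_COMPRESS_WORKSPACE_SIZE_U32];
+    U32 count[HUF_SYMBOLVALUE_MAX + 1];
+    HUF_CREATE_STATIC_CTABLE(ct, HUF_SYMBOLVALUE_MAX);
+    unsigned maxSV = HUF_SYMBOLVALUE_MAX;
+    unsigned huffLog = HUF_TABLELOG_DEFAULT;
+    size_t hSize, cSize;
+    FSE_count_wksp(count, &maxSV, (const BYTE *)src, srcSize, wksp);                            // step 1
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSV);                                     // step 2
+    huffLog = (unsigned)HUF_buildCTable_wksp(ct, count, maxSV, huffLog, wksp, sizeof(wksp));    // step 3
+    hSize = HUF_writeCTable_wksp(dst, dstSize, ct, maxSV, huffLog, wksp, sizeof(wksp));         // step 4
+    cSize = HUF_compress4X_usingCTable((BYTE *)dst + hSize, dstSize - hSize, src, srcSize, ct); // step 5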
+*/
+/* FSE_count() : find it within "fse.h" */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
+size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, unsigned maxSymbolValue, unsigned huffLog, void *workspace, size_t workspaceSize);
+size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+
+typedef enum {
+	HUF_repeat_none,  /**< Cannot use the previous table */
+	HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+	HUF_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+} HUF_repeat;
+/** HUF_compress4X_repeat() :
+* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+* If it uses hufTable it does not modify hufTable or repeat.
+* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+* If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned.
+ */
+size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+	Read compact Huffman tree, saved by HUF_writeCTable().
+	`huffWeight` is destination buffer.
+	@return : size read from `src` , or an error Code .
+	Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats_wksp(BYTE *huffWeight, size_t hwSize, U32 *rankStats, U32 *nbSymbolsPtr, U32 *tableLogPtr, const void *src, size_t srcSize,
+			  void *workspace, size_t workspaceSize);
+
+/** HUF_readCTable() :
+* Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable_wksp(HUF_CElt *CTable, unsigned maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+/*
+HUF_decompress() does the following:
+1. select the decompression algorithm (X2, X4) based on pre-computed heuristics
+2. build Huffman table from save, using HUF_readDTableXn()
+3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable
+*/
+
+/** HUF_selectDecoder() :
+* Tells which decoder is likely to decode faster,
+* based on a set of pre-determined metrics.
+* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 .
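+* A typical dispatch looks like this sketch (dctx and workspace setup omitted) :
+*     U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+*     return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, wksp, wkspSize)
+*                   : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, wksp, wkspSize);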
+* Assumption : 0 < cSrcSize < dstSize <= 128 KB */
+U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize);
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize);
+
+size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress4X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+/* single stream variants */
+
+size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			   size_t wkspSize); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable);
+/** HUF_compress1X_repeat() :
+* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+* If it uses hufTable it does not modify hufTable or repeat.
+* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+* If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void *workSpace,
+			     size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat,
+			     int preferRepeat); /**< `workSpace` must be a table of at least HUF_COMPRESS_WORKSPACE_SIZE_U32 unsigned */
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< single-symbol decoder */
+size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace,
+				   size_t workspaceSize); /**< double-symbols decoder */
+
+size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize,
+				    const HUF_DTable *DTable); /**< automatic selection of single or double symbol decoder, based on DTable */
+size_t HUF_decompress1X2_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+size_t HUF_decompress1X4_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable);
+
+#endif /* HUF_H_298734234 */
diff --git a/lib/zstd/huf_compress.c b/lib/zstd/huf_compress.c
new file mode 100644
index 000000000000..40055a7016e6
--- /dev/null
+++ b/lib/zstd/huf_compress.c
@@ -0,0 +1,770 @@
+/*
+ * Huffman encoder, part of New Generation Entropy library
+ * Copyright (C) 2013-2016, Yann Collet.
+ *
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include "bitstream.h"
+#include "fse.h" /* header compression */
+#include "huf.h"
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { HUF_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+#define CHECK_V_F(e, f)     \
+	size_t const e = f; \
+	if (ERR_isError(e)) \
+	return f
+#define CHECK_F(f)                        \
+	{                                 \
+		CHECK_V_F(_var_err__, f); \
+	}
+
+/* **************************************************************
+* Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+	return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
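+ *
+ * As a sketch of the carving done just below, `workspace` is consumed in
+ * U32 slots roughly as :
+ *     CTable : FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX) slots
+ *     count  : HUF_TABLELOG_MAX + 1 slots
+ *     norm   : ALIGN(sizeof(S16) * (HUF_TABLELOG_MAX + 1), sizeof(U32)) >> 2 slots
+ * which is why the stack footprint stays small.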
+ */ +#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6 +size_t HUF_compressWeights_wksp(void *dst, size_t dstSize, const void *weightTable, size_t wtSize, void *workspace, size_t workspaceSize) +{ + BYTE *const ostart = (BYTE *)dst; + BYTE *op = ostart; + BYTE *const oend = ostart + dstSize; + + U32 maxSymbolValue = HUF_TABLELOG_MAX; + U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; + + FSE_CTable *CTable; + U32 *count; + S16 *norm; + size_t spaceUsed32 = 0; + + HUF_STATIC_ASSERT(sizeof(FSE_CTable) == sizeof(U32)); + + CTable = (FSE_CTable *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX); + count = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + norm = (S16 *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(S16) * (HUF_TABLELOG_MAX + 1), sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* init conditions */ + if (wtSize <= 1) + return 0; /* Not compressible */ + + /* Scan input and build symbol stats */ + { + CHECK_V_F(maxCount, FSE_count_simple(count, &maxSymbolValue, weightTable, wtSize)); + if (maxCount == wtSize) + return 1; /* only a single symbol in src : rle */ + if (maxCount == 1) + return 0; /* each symbol present maximum once => not compressible */ + } + + tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue); + CHECK_F(FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue)); + + /* Write table description header */ + { + CHECK_V_F(hSize, FSE_writeNCount(op, oend - op, norm, maxSymbolValue, tableLog)); + op += hSize; + } + + /* Compress */ + CHECK_F(FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, workspace, workspaceSize)); + { + CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, weightTable, wtSize, CTable)); + if (cSize == 0) + return 0; /* not enough space for compressed data */ + op += cSize; + } + + return op - ostart; +} + +struct HUF_CElt_s { + U16 val; + BYTE nbBits; +}; /* typedef'd to HUF_CElt within "huf.h" */ + +/*! HUF_writeCTable_wksp() : + `CTable` : Huffman tree to save, using huf representation. 
+ @return : size of saved CTable */ +size_t HUF_writeCTable_wksp(void *dst, size_t maxDstSize, const HUF_CElt *CTable, U32 maxSymbolValue, U32 huffLog, void *workspace, size_t workspaceSize) +{ + BYTE *op = (BYTE *)dst; + U32 n; + + BYTE *bitsToWeight; + BYTE *huffWeight; + size_t spaceUsed32 = 0; + + bitsToWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_TABLELOG_MAX + 1, sizeof(U32)) >> 2; + huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* check conditions */ + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(maxSymbolValue_tooLarge); + + /* convert to weight */ + bitsToWeight[0] = 0; + for (n = 1; n < huffLog + 1; n++) + bitsToWeight[n] = (BYTE)(huffLog + 1 - n); + for (n = 0; n < maxSymbolValue; n++) + huffWeight[n] = bitsToWeight[CTable[n].nbBits]; + + /* attempt weights compression by FSE */ + { + CHECK_V_F(hSize, HUF_compressWeights_wksp(op + 1, maxDstSize - 1, huffWeight, maxSymbolValue, workspace, workspaceSize)); + if ((hSize > 1) & (hSize < maxSymbolValue / 2)) { /* FSE compressed */ + op[0] = (BYTE)hSize; + return hSize + 1; + } + } + + /* write raw values as 4-bits (max : 15) */ + if (maxSymbolValue > (256 - 128)) + return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */ + if (((maxSymbolValue + 1) / 2) + 1 > maxDstSize) + return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */ + op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue - 1)); + huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */ + for (n = 0; n < maxSymbolValue; n += 2) + op[(n / 2) + 1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n + 1]); + return ((maxSymbolValue + 1) / 2) + 1; +} + +size_t HUF_readCTable_wksp(HUF_CElt *CTable, U32 maxSymbolValue, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 *rankVal; + BYTE *huffWeight; + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t readSize; + size_t spaceUsed32 = 0; + + rankVal = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; + huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + /* get symbol weights */ + readSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize); + if (ERR_isError(readSize)) + return readSize; + + /* check result */ + if (tableLog > HUF_TABLELOG_MAX) + return ERROR(tableLog_tooLarge); + if (nbSymbols > maxSymbolValue + 1) + return ERROR(maxSymbolValue_tooSmall); + + /* Prepare base value per rank */ + { + U32 n, nextRankStart = 0; + for (n = 1; n <= tableLog; n++) { + U32 curr = nextRankStart; + nextRankStart += (rankVal[n] << (n - 1)); + rankVal[n] = curr; + } + } + + /* fill nbBits */ + { + U32 n; + for (n = 0; n < nbSymbols; n++) { + const U32 w = huffWeight[n]; + CTable[n].nbBits = (BYTE)(tableLog + 1 - w); + } + } + + /* fill val */ + { + U16 nbPerRank[HUF_TABLELOG_MAX + 2] = {0}; /* support w=0=>n=tableLog+1 */ + U16 valPerRank[HUF_TABLELOG_MAX + 2] = {0}; + { + U32 n; + for (n = 0; n < nbSymbols; n++) + 
nbPerRank[CTable[n].nbBits]++; + } + /* determine stating value per rank */ + valPerRank[tableLog + 1] = 0; /* for w==0 */ + { + U16 min = 0; + U32 n; + for (n = tableLog; n > 0; n--) { /* start at n=tablelog <-> w=1 */ + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } + } + /* assign value within rank, symbol order */ + { + U32 n; + for (n = 0; n <= maxSymbolValue; n++) + CTable[n].val = valPerRank[CTable[n].nbBits]++; + } + } + + return readSize; +} + +typedef struct nodeElt_s { + U32 count; + U16 parent; + BYTE byte; + BYTE nbBits; +} nodeElt; + +static U32 HUF_setMaxHeight(nodeElt *huffNode, U32 lastNonNull, U32 maxNbBits) +{ + const U32 largestBits = huffNode[lastNonNull].nbBits; + if (largestBits <= maxNbBits) + return largestBits; /* early exit : no elt > maxNbBits */ + + /* there are several too large elements (at least >= 2) */ + { + int totalCost = 0; + const U32 baseCost = 1 << (largestBits - maxNbBits); + U32 n = lastNonNull; + + while (huffNode[n].nbBits > maxNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); + huffNode[n].nbBits = (BYTE)maxNbBits; + n--; + } /* n stops at huffNode[n].nbBits <= maxNbBits */ + while (huffNode[n].nbBits == maxNbBits) + n--; /* n end at index of smallest symbol using < maxNbBits */ + + /* renorm totalCost */ + totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */ + + /* repay normalized cost */ + { + U32 const noSymbol = 0xF0F0F0F0; + U32 rankLast[HUF_TABLELOG_MAX + 2]; + int pos; + + /* Get pos of last (smallest) symbol per rank */ + memset(rankLast, 0xF0, sizeof(rankLast)); + { + U32 currNbBits = maxNbBits; + for (pos = n; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currNbBits) + continue; + currNbBits = huffNode[pos].nbBits; /* < maxNbBits */ + rankLast[maxNbBits - currNbBits] = pos; + } + } + + while (totalCost > 0) { + U32 nBitsToDecrease = BIT_highbit32(totalCost) + 1; + for (; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 highPos = rankLast[nBitsToDecrease]; + U32 lowPos = rankLast[nBitsToDecrease - 1]; + if (highPos == noSymbol) + continue; + if (lowPos == noSymbol) + break; + { + U32 const highTotal = huffNode[highPos].count; + U32 const lowTotal = 2 * huffNode[lowPos].count; + if (highTotal <= lowTotal) + break; + } + } + /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) 
*/ + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease <= HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) + nBitsToDecrease++; + totalCost -= 1 << (nBitsToDecrease - 1); + if (rankLast[nBitsToDecrease - 1] == noSymbol) + rankLast[nBitsToDecrease - 1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */ + huffNode[rankLast[nBitsToDecrease]].nbBits++; + if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits - nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ + + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ + if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 + (using maxNbBits) */ + while (huffNode[n].nbBits == maxNbBits) + n--; + huffNode[n + 1].nbBits--; + rankLast[1] = n + 1; + totalCost++; + continue; + } + huffNode[rankLast[1] + 1].nbBits--; + rankLast[1]++; + totalCost++; + } + } + } /* there are several too large elements (at least >= 2) */ + + return maxNbBits; +} + +typedef struct { + U32 base; + U32 curr; +} rankPos; + +static void HUF_sort(nodeElt *huffNode, const U32 *count, U32 maxSymbolValue) +{ + rankPos rank[32]; + U32 n; + + memset(rank, 0, sizeof(rank)); + for (n = 0; n <= maxSymbolValue; n++) { + U32 r = BIT_highbit32(count[n] + 1); + rank[r].base++; + } + for (n = 30; n > 0; n--) + rank[n - 1].base += rank[n].base; + for (n = 0; n < 32; n++) + rank[n].curr = rank[n].base; + for (n = 0; n <= maxSymbolValue; n++) { + U32 const c = count[n]; + U32 const r = BIT_highbit32(c + 1) + 1; + U32 pos = rank[r].curr++; + while ((pos > rank[r].base) && (c > huffNode[pos - 1].count)) + huffNode[pos] = huffNode[pos - 1], pos--; + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } +} + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of 1024 unsigned. 
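+ * A hypothetical call, with the scratch sized via the huffNodeTable typedef
+ * defined right below (exactly 1024 unsigned worth of nodeElt entries) :
+ *     huffNodeTable nodes;
+ *     size_t const maxBits = HUF_buildCTable_wksp(tree, count, maxSymbolValue, 0, nodes, sizeof(nodes));
+ *     // passing 0 selects HUF_TABLELOG_DEFAULT; on success maxBits is the largest code length used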
+ */ +#define STARTNODE (HUF_SYMBOLVALUE_MAX + 1) +typedef nodeElt huffNodeTable[2 * HUF_SYMBOLVALUE_MAX + 1 + 1]; +size_t HUF_buildCTable_wksp(HUF_CElt *tree, const U32 *count, U32 maxSymbolValue, U32 maxNbBits, void *workSpace, size_t wkspSize) +{ + nodeElt *const huffNode0 = (nodeElt *)workSpace; + nodeElt *const huffNode = huffNode0 + 1; + U32 n, nonNullRank; + int lowS, lowN; + U16 nodeNb = STARTNODE; + U32 nodeRoot; + + /* safety checks */ + if (wkspSize < sizeof(huffNodeTable)) + return ERROR(GENERIC); /* workSpace is not large enough */ + if (maxNbBits == 0) + maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) + return ERROR(GENERIC); + memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue); + + /* init for parents */ + nonNullRank = maxSymbolValue; + while (huffNode[nonNullRank].count == 0) + nonNullRank--; + lowS = nonNullRank; + nodeRoot = nodeNb + lowS - 1; + lowN = nodeNb; + huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS - 1].count; + huffNode[lowS].parent = huffNode[lowS - 1].parent = nodeNb; + nodeNb++; + lowS -= 2; + for (n = nodeNb; n <= nodeRoot; n++) + huffNode[n].count = (U32)(1U << 30); + huffNode0[0].count = (U32)(1U << 31); /* fake entry, strong barrier */ + + /* create parents */ + while (nodeNb <= nodeRoot) { + U32 n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + U32 n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++; + huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count; + huffNode[n1].parent = huffNode[n2].parent = nodeNb; + nodeNb++; + } + + /* distribute weights (unlimited tree height) */ + huffNode[nodeRoot].nbBits = 0; + for (n = nodeRoot - 1; n >= STARTNODE; n--) + huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1; + for (n = 0; n <= nonNullRank; n++) + huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1; + + /* enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, nonNullRank, maxNbBits); + + /* fill result into tree (val, nbBits) */ + { + U16 nbPerRank[HUF_TABLELOG_MAX + 1] = {0}; + U16 valPerRank[HUF_TABLELOG_MAX + 1] = {0}; + if (maxNbBits > HUF_TABLELOG_MAX) + return ERROR(GENERIC); /* check fit into table */ + for (n = 0; n <= nonNullRank; n++) + nbPerRank[huffNode[n].nbBits]++; + /* determine stating value per rank */ + { + U16 min = 0; + for (n = maxNbBits; n > 0; n--) { + valPerRank[n] = min; /* get starting value within each rank */ + min += nbPerRank[n]; + min >>= 1; + } + } + for (n = 0; n <= maxSymbolValue; n++) + tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */ + for (n = 0; n <= maxSymbolValue; n++) + tree[n].val = valPerRank[tree[n].nbBits]++; /* assign value within rank, symbol order */ + } + + return maxNbBits; +} + +static size_t HUF_estimateCompressedSize(HUF_CElt *CTable, const unsigned *count, unsigned maxSymbolValue) +{ + size_t nbBits = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + nbBits += CTable[s].nbBits * count[s]; + } + return nbBits >> 3; +} + +static int HUF_validateCTable(const HUF_CElt *CTable, const unsigned *count, unsigned maxSymbolValue) +{ + int bad = 0; + int s; + for (s = 0; s <= (int)maxSymbolValue; ++s) { + bad |= (count[s] != 0) & (CTable[s].nbBits == 0); + } + return !bad; +} + +static void HUF_encodeSymbol(BIT_CStream_t *bitCPtr, U32 symbol, const HUF_CElt *CTable) +{ + BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); +} + +size_t 
HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } + +#define HUF_FLUSHBITS(s) BIT_flushBits(s) + +#define HUF_FLUSHBITS_1(stream) \ + if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 2 + 7) \ + HUF_FLUSHBITS(stream) + +#define HUF_FLUSHBITS_2(stream) \ + if (sizeof((stream)->bitContainer) * 8 < HUF_TABLELOG_MAX * 4 + 7) \ + HUF_FLUSHBITS(stream) + +size_t HUF_compress1X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable) +{ + const BYTE *ip = (const BYTE *)src; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + BYTE *op = ostart; + size_t n; + BIT_CStream_t bitC; + + /* init */ + if (dstSize < 8) + return 0; /* not enough space to compress */ + { + size_t const initErr = BIT_initCStream(&bitC, op, oend - op); + if (HUF_isError(initErr)) + return 0; + } + + n = srcSize & ~3; /* join to mod 4 */ + switch (srcSize & 3) { + case 3: HUF_encodeSymbol(&bitC, ip[n + 2], CTable); HUF_FLUSHBITS_2(&bitC); + case 2: HUF_encodeSymbol(&bitC, ip[n + 1], CTable); HUF_FLUSHBITS_1(&bitC); + case 1: HUF_encodeSymbol(&bitC, ip[n + 0], CTable); HUF_FLUSHBITS(&bitC); + case 0: + default:; + } + + for (; n > 0; n -= 4) { /* note : n&3==0 at this stage */ + HUF_encodeSymbol(&bitC, ip[n - 1], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n - 2], CTable); + HUF_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n - 3], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n - 4], CTable); + HUF_FLUSHBITS(&bitC); + } + + return BIT_closeCStream(&bitC); +} + +size_t HUF_compress4X_usingCTable(void *dst, size_t dstSize, const void *src, size_t srcSize, const HUF_CElt *CTable) +{ + size_t const segmentSize = (srcSize + 3) / 4; /* first 3 segments */ + const BYTE *ip = (const BYTE *)src; + const BYTE *const iend = ip + srcSize; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + BYTE *op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) + return 0; /* minimum space to compress successfully */ + if (srcSize < 12) + return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable)); + if (cSize == 0) + return 0; + ZSTD_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable)); + if (cSize == 0) + return 0; + ZSTD_writeLE16(ostart + 2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, segmentSize, CTable)); + if (cSize == 0) + return 0; + ZSTD_writeLE16(ostart + 4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { + CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend - op, ip, iend - ip, CTable)); + if (cSize == 0) + return 0; + op += cSize; + } + + return op - ostart; +} + +static size_t HUF_compressCTable_internal(BYTE *const ostart, BYTE *op, BYTE *const oend, const void *src, size_t srcSize, unsigned singleStream, + const HUF_CElt *CTable) +{ + size_t const cSize = + singleStream ? 
HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) : HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable); + if (HUF_isError(cSize)) { + return cSize; + } + if (cSize == 0) { + return 0; + } /* uncompressible */ + op += cSize; + /* check compressibility */ + if ((size_t)(op - ostart) >= srcSize - 1) { + return 0; + } + return op - ostart; +} + +/* `workSpace` must a table of at least 1024 unsigned */ +static size_t HUF_compress_internal(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, + unsigned singleStream, void *workSpace, size_t wkspSize, HUF_CElt *oldHufTable, HUF_repeat *repeat, int preferRepeat) +{ + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + BYTE *op = ostart; + + U32 *count; + size_t const countSize = sizeof(U32) * (HUF_SYMBOLVALUE_MAX + 1); + HUF_CElt *CTable; + size_t const CTableSize = sizeof(HUF_CElt) * (HUF_SYMBOLVALUE_MAX + 1); + + /* checks & inits */ + if (wkspSize < sizeof(huffNodeTable) + countSize + CTableSize) + return ERROR(GENERIC); + if (!srcSize) + return 0; /* Uncompressed (note : 1 means rle, so first byte must be correct) */ + if (!dstSize) + return 0; /* cannot fit within dst budget */ + if (srcSize > HUF_BLOCKSIZE_MAX) + return ERROR(srcSize_wrong); /* curr block size limit */ + if (huffLog > HUF_TABLELOG_MAX) + return ERROR(tableLog_tooLarge); + if (!maxSymbolValue) + maxSymbolValue = HUF_SYMBOLVALUE_MAX; + if (!huffLog) + huffLog = HUF_TABLELOG_DEFAULT; + + count = (U32 *)workSpace; + workSpace = (BYTE *)workSpace + countSize; + wkspSize -= countSize; + CTable = (HUF_CElt *)workSpace; + workSpace = (BYTE *)workSpace + CTableSize; + wkspSize -= CTableSize; + + /* Heuristic : If we don't need to check the validity of the old table use the old table for small inputs */ + if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + } + + /* Scan input and build symbol stats */ + { + CHECK_V_F(largest, FSE_count_wksp(count, &maxSymbolValue, (const BYTE *)src, srcSize, (U32 *)workSpace)); + if (largest == srcSize) { + *ostart = ((const BYTE *)src)[0]; + return 1; + } /* single symbol, rle */ + if (largest <= (srcSize >> 7) + 1) + return 0; /* Fast heuristic : not compressible enough */ + } + + /* Check validity of previous table */ + if (repeat && *repeat == HUF_repeat_check && !HUF_validateCTable(oldHufTable, count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + } + + /* Build Huffman Tree */ + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { + CHECK_V_F(maxBits, HUF_buildCTable_wksp(CTable, count, maxSymbolValue, huffLog, workSpace, wkspSize)); + huffLog = (U32)maxBits; + /* Zero the unused symbols so we can check it for validity */ + memset(CTable + maxSymbolValue + 1, 0, CTableSize - (maxSymbolValue + 1) * sizeof(HUF_CElt)); + } + + /* Write table description header */ + { + CHECK_V_F(hSize, HUF_writeCTable_wksp(op, dstSize, CTable, maxSymbolValue, huffLog, workSpace, wkspSize)); + /* Check if using the previous table will be beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue); + size_t const newSize = 
HUF_estimateCompressedSize(CTable, count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + } + } + /* Use the new table */ + if (hSize + 12ul >= srcSize) { + return 0; + } + op += hSize; + if (repeat) { + *repeat = HUF_repeat_none; + } + if (oldHufTable) { + memcpy(oldHufTable, CTable, CTableSize); + } /* Save the new table */ + } + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable); +} + +size_t HUF_compress1X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0); +} + +size_t HUF_compress1X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat, int preferRepeat) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat, + preferRepeat); +} + +size_t HUF_compress4X_wksp(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0); +} + +size_t HUF_compress4X_repeat(void *dst, size_t dstSize, const void *src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void *workSpace, + size_t wkspSize, HUF_CElt *hufTable, HUF_repeat *repeat, int preferRepeat) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat, + preferRepeat); +} diff --git a/lib/zstd/huf_decompress.c b/lib/zstd/huf_decompress.c new file mode 100644 index 000000000000..6526482047dc --- /dev/null +++ b/lib/zstd/huf_decompress.c @@ -0,0 +1,960 @@ +/* + * Huffman decoder, part of New Generation Entropy library + * Copyright (C) 2013-2016, Yann Collet. + * + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License version 2 as published by the
+ * Free Software Foundation. This program is dual-licensed; you may select
+ * either version 2 of the GNU General Public License ("GPL") or BSD license
+ * ("BSD").
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#define FORCE_INLINE static __always_inline
+
+/* **************************************************************
+* Dependencies
+****************************************************************/
+#include "bitstream.h" /* BIT_* */
+#include "fse.h"       /* header compression */
+#include "huf.h"
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h> /* memcpy, memset */
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_STATIC_ASSERT(c)                                   \
+	{                                                      \
+		enum { HUF_static_assert = 1 / (int)(!!(c)) }; \
+	} /* use only *after* variable declarations */
+
+/*-***************************/
+/*  generic DTableDesc       */
+/*-***************************/
+
+typedef struct {
+	BYTE maxTableLog;
+	BYTE tableType;
+	BYTE tableLog;
+	BYTE reserved;
+} DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable *table)
+{
+	DTableDesc dtd;
+	memcpy(&dtd, table, sizeof(dtd));
+	return dtd;
+}
+
+/*-***************************/
+/*  single-symbol decoding   */
+/*-***************************/
+
+typedef struct {
+	BYTE byte;
+	BYTE nbBits;
+} HUF_DEltX2; /* single-symbol decoding */
+
+size_t HUF_readDTableX2_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize)
+{
+	U32 tableLog = 0;
+	U32 nbSymbols = 0;
+	size_t iSize;
+	void *const dtPtr = DTable + 1;
+	HUF_DEltX2 *const dt = (HUF_DEltX2 *)dtPtr;
+
+	U32 *rankVal;
+	BYTE *huffWeight;
+	size_t spaceUsed32 = 0;
+
+	rankVal = (U32 *)workspace + spaceUsed32;
+	spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+	huffWeight = (BYTE *)((U32 *)workspace + spaceUsed32);
+	spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+	if ((spaceUsed32 << 2) > workspaceSize)
+		return ERROR(tableLog_tooLarge);
+	workspace = (U32 *)workspace + spaceUsed32;
+	workspaceSize -= (spaceUsed32 << 2);
+
+	HUF_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+	/* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ...
*/ + + iSize = HUF_readStats_wksp(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize); + if (HUF_isError(iSize)) + return iSize; + + /* Table header */ + { + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (tableLog > (U32)(dtd.maxTableLog + 1)) + return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ + dtd.tableType = 0; + dtd.tableLog = (BYTE)tableLog; + memcpy(DTable, &dtd, sizeof(dtd)); + } + + /* Calculate starting value for each rank */ + { + U32 n, nextRankStart = 0; + for (n = 1; n < tableLog + 1; n++) { + U32 const curr = nextRankStart; + nextRankStart += (rankVal[n] << (n - 1)); + rankVal[n] = curr; + } + } + + /* fill DTable */ + { + U32 n; + for (n = 0; n < nbSymbols; n++) { + U32 const w = huffWeight[n]; + U32 const length = (1 << w) >> 1; + U32 u; + HUF_DEltX2 D; + D.byte = (BYTE)n; + D.nbBits = (BYTE)(tableLog + 1 - w); + for (u = rankVal[w]; u < rankVal[w] + length; u++) + dt[u] = D; + rankVal[w] += length; + } + } + + return iSize; +} + +static BYTE HUF_decodeSymbolX2(BIT_DStream_t *Dstream, const HUF_DEltX2 *dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + BYTE const c = dt[val].byte; + BIT_skipBits(Dstream, dt[val].nbBits); + return c; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) *ptr++ = HUF_decodeSymbolX2(DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \ + HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (ZSTD_64bits()) \ + HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) + +FORCE_INLINE size_t HUF_decodeStreamX2(BYTE *p, BIT_DStream_t *const bitDPtr, BYTE *const pEnd, const HUF_DEltX2 *const dt, const U32 dtLog) +{ + BYTE *const pStart = p; + + /* up to 4 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p <= pEnd - 4)) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + + /* closer to the end */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) && (p < pEnd)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + /* no more data to retrieve from bitstream, hence no need to reload */ + while (p < pEnd) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + return pEnd - pStart; +} + +static size_t HUF_decompress1X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + BYTE *op = (BYTE *)dst; + BYTE *const oend = op + dstSize; + const void *dtPtr = DTable + 1; + const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr; + BIT_DStream_t bitD; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + { + size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); + if (HUF_isError(errorCode)) + return errorCode; + } + + HUF_decodeStreamX2(op, &bitD, oend, dt, dtLog); + + /* check */ + if (!BIT_endOfDStream(&bitD)) + return ERROR(corruption_detected); + + return dstSize; +} + +size_t HUF_decompress1X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) + return ERROR(GENERIC); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t 
cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx); +} + +static size_t HUF_decompress4X2_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + /* Check */ + if (cSrcSize < 10) + return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { + const BYTE *const istart = (const BYTE *)cSrc; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + const void *const dtPtr = DTable + 1; + const HUF_DEltX2 *const dt = (const HUF_DEltX2 *)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = ZSTD_readLE16(istart); + size_t const length2 = ZSTD_readLE16(istart + 2); + size_t const length3 = ZSTD_readLE16(istart + 4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE *const istart1 = istart + 6; /* jumpTable */ + const BYTE *const istart2 = istart1 + length1; + const BYTE *const istart3 = istart2 + length2; + const BYTE *const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize + 3) / 4; + BYTE *const opStart2 = ostart + segmentSize; + BYTE *const opStart3 = opStart2 + segmentSize; + BYTE *const opStart4 = opStart3 + segmentSize; + BYTE *op1 = ostart; + BYTE *op2 = opStart2; + BYTE *op3 = opStart3; + BYTE *op4 = opStart4; + U32 endSignal; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) + return ERROR(corruption_detected); /* overflow */ + { + size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4); + if (HUF_isError(errorCode)) + return errorCode; + } + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for (; (endSignal == BIT_DStream_unfinished) && (op4 < (oend - 7));) { + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) + return ERROR(corruption_detected); + if (op2 > opStart3) + return 
ERROR(corruption_detected); + if (op3 > opStart4) + return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + endSignal = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endSignal) + return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; + } +} + +size_t HUF_decompress4X2_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) + return ERROR(GENERIC); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx); +} + +/* *************************/ +/* double-symbols decoding */ +/* *************************/ +typedef struct { + U16 sequence; + BYTE nbBits; + BYTE length; +} HUF_DEltX4; /* double-symbols decoding */ + +typedef struct { + BYTE symbol; + BYTE weight; +} sortedSymbol_t; + +/* HUF_fillDTableX4Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX4Level2(HUF_DEltX4 *DTable, U32 sizeLog, const U32 consumed, const U32 *rankValOrigin, const int minWeight, + const sortedSymbol_t *sortedSymbols, const U32 sortedListSize, U32 nbBitsBaseline, U16 baseSeq) +{ + HUF_DEltX4 DElt; + U32 rankVal[HUF_TABLELOG_MAX + 1]; + + /* get pre-calculated rankVal */ + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill skipped values */ + if (minWeight > 1) { + U32 i, skipSize = rankVal[minWeight]; + ZSTD_writeLE16(&(DElt.sequence), baseSeq); + DElt.nbBits = (BYTE)(consumed); + DElt.length = 1; + for (i = 0; i < skipSize; i++) + DTable[i] = DElt; + } + + /* fill DTable */ + { + U32 s; + for (s = 0; s < sortedListSize; s++) { /* note : sortedSymbols already skipped */ + const U32 symbol = sortedSymbols[s].symbol; + const U32 weight = sortedSymbols[s].weight; + const U32 nbBits = nbBitsBaseline - weight; + const U32 length = 1 << (sizeLog - nbBits); + const U32 start = rankVal[weight]; + U32 i = start; + const U32 end = start + length; + + ZSTD_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); + DElt.nbBits = (BYTE)(nbBits + consumed); + DElt.length = 2; + do { + DTable[i++] = DElt; + } while (i < end); /* since length >= 1 */ + + rankVal[weight] += length; + } + } +} + +typedef U32 rankVal_t[HUF_TABLELOG_MAX][HUF_TABLELOG_MAX + 1]; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; + +static void HUF_fillDTableX4(HUF_DEltX4 *DTable, const U32 targetLog, const sortedSymbol_t *sortedList, const U32 sortedListSize, const U32 *rankStart, + rankVal_t rankValOrigin, const U32 maxWeight, const U32 nbBitsBaseline) +{ + U32 rankVal[HUF_TABLELOG_MAX + 1]; + const int scaleLog = 
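/* Illustrative note, not part of the original patch: an X4 cell packs up to
 * two decoded symbols. `sequence` stores them little-endian (the first
 * symbol in the low byte, the second written as baseSeq + (symbol << 8) in
 * HUF_fillDTableX4Level2() above), `length` says how many of the two bytes
 * are valid, and `nbBits` is the combined bit cost of both codes. On a
 * little-endian target, a cell for 'a' (3 bits) followed by 'b' (4 bits)
 * would look like:
 *
 *   HUF_DEltX4 cell = { .sequence = 'a' | ('b' << 8), .nbBits = 7, .length = 2 };
 */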
nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + U32 s; + + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill DTable */ + for (s = 0; s < sortedListSize; s++) { + const U16 symbol = sortedList[s].symbol; + const U32 weight = sortedList[s].weight; + const U32 nbBits = nbBitsBaseline - weight; + const U32 start = rankVal[weight]; + const U32 length = 1 << (targetLog - nbBits); + + if (targetLog - nbBits >= minBits) { /* enough room for a second symbol */ + U32 sortedRank; + int minWeight = nbBits + scaleLog; + if (minWeight < 1) + minWeight = 1; + sortedRank = rankStart[minWeight]; + HUF_fillDTableX4Level2(DTable + start, targetLog - nbBits, nbBits, rankValOrigin[nbBits], minWeight, sortedList + sortedRank, + sortedListSize - sortedRank, nbBitsBaseline, symbol); + } else { + HUF_DEltX4 DElt; + ZSTD_writeLE16(&(DElt.sequence), symbol); + DElt.nbBits = (BYTE)(nbBits); + DElt.length = 1; + { + U32 const end = start + length; + U32 u; + for (u = start; u < end; u++) + DTable[u] = DElt; + } + } + rankVal[weight] += length; + } +} + +size_t HUF_readDTableX4_wksp(HUF_DTable *DTable, const void *src, size_t srcSize, void *workspace, size_t workspaceSize) +{ + U32 tableLog, maxW, sizeOfSort, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog; + size_t iSize; + void *dtPtr = DTable + 1; /* force compiler to avoid strict-aliasing */ + HUF_DEltX4 *const dt = (HUF_DEltX4 *)dtPtr; + U32 *rankStart; + + rankValCol_t *rankVal; + U32 *rankStats; + U32 *rankStart0; + sortedSymbol_t *sortedSymbol; + BYTE *weightList; + size_t spaceUsed32 = 0; + + HUF_STATIC_ASSERT((sizeof(rankValCol_t) & 3) == 0); + + rankVal = (rankValCol_t *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; + rankStats = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + rankStart0 = (U32 *)workspace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 2; + sortedSymbol = (sortedSymbol_t *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; + weightList = (BYTE *)((U32 *)workspace + spaceUsed32); + spaceUsed32 += ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > workspaceSize) + return ERROR(tableLog_tooLarge); + workspace = (U32 *)workspace + spaceUsed32; + workspaceSize -= (spaceUsed32 << 2); + + rankStart = rankStart0 + 1; + memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + + HUF_STATIC_ASSERT(sizeof(HUF_DEltX4) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ + if (maxTableLog > HUF_TABLELOG_MAX) + return ERROR(tableLog_tooLarge); + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + + iSize = HUF_readStats_wksp(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize, workspace, workspaceSize); + if (HUF_isError(iSize)) + return iSize; + + /* check result */ + if (tableLog > maxTableLog) + return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + + /* find maxWeight */ + for (maxW = tableLog; rankStats[maxW] == 0; maxW--) { + } /* necessarily finds a solution before 0 */ + + /* Get start index of each weight */ + { + U32 w, nextRankStart = 0; + for (w = 1; w < maxW + 1; w++) { + U32 curr = nextRankStart; + nextRankStart += rankStats[w]; + rankStart[w] = curr; + } + rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ + sizeOfSort = nextRankStart; + } + + /* sort symbols by weight */ + { + U32 s; + for (s = 0; s < nbSymbols; s++) { + U32 const w = weightList[s]; + U32 const r = rankStart[w]++; + sortedSymbol[r].symbol = (BYTE)s; + sortedSymbol[r].weight = (BYTE)w; + } + rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ + } + + /* Build rankVal */ + { + U32 *const rankVal0 = rankVal[0]; + { + int const rescale = (maxTableLog - tableLog) - 1; /* tableLog <= maxTableLog */ + U32 nextRankVal = 0; + U32 w; + for (w = 1; w < maxW + 1; w++) { + U32 curr = nextRankVal; + nextRankVal += rankStats[w] << (w + rescale); + rankVal0[w] = curr; + } + } + { + U32 const minBits = tableLog + 1 - maxW; + U32 consumed; + for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { + U32 *const rankValPtr = rankVal[consumed]; + U32 w; + for (w = 1; w < maxW + 1; w++) { + rankValPtr[w] = rankVal0[w] >> consumed; + } + } + } + } + + HUF_fillDTableX4(dt, maxTableLog, sortedSymbol, sizeOfSort, rankStart0, rankVal, maxW, tableLog + 1); + + dtd.tableLog = (BYTE)maxTableLog; + dtd.tableType = 1; + memcpy(DTable, &dtd, sizeof(dtd)); + return iSize; +} + +static U32 HUF_decodeSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt + val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +static U32 HUF_decodeLastSymbolX4(void *op, BIT_DStream_t *DStream, const HUF_DEltX4 *dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt + val, 1); + if (dt[val].length == 1) + BIT_skipBits(DStream, dt[val].nbBits); + else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer) * 8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer) * 8)) + /* ugly hack; works only because it's the last symbol. 
Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer) * 8); + } + } + return 1; +} + +#define HUF_DECODE_SYMBOLX4_0(ptr, DStreamPtr) ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_1(ptr, DStreamPtr) \ + if (ZSTD_64bits() || (HUF_TABLELOG_MAX <= 12)) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX4_2(ptr, DStreamPtr) \ + if (ZSTD_64bits()) \ + ptr += HUF_decodeSymbolX4(ptr, DStreamPtr, dt, dtLog) + +FORCE_INLINE size_t HUF_decodeStreamX4(BYTE *p, BIT_DStream_t *bitDPtr, BYTE *const pEnd, const HUF_DEltX4 *const dt, const U32 dtLog) +{ + BYTE *const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd - (sizeof(bitDPtr->bitContainer) - 1))) { + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_1(p, bitDPtr); + HUF_DECODE_SYMBOLX4_2(p, bitDPtr); + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd - 2)) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); + + while (p <= pEnd - 2) + HUF_DECODE_SYMBOLX4_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX4(p, bitDPtr, dt, dtLog); + + return p - pStart; +} + +static size_t HUF_decompress1X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + { + size_t const errorCode = BIT_initDStream(&bitD, cSrc, cSrcSize); + if (HUF_isError(errorCode)) + return errorCode; + } + + /* decode */ + { + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + const void *const dtPtr = DTable + 1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX4(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) + return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + +size_t HUF_decompress1X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) + return ERROR(GENERIC); + return HUF_decompress1X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress1X4_DCtx_wksp(HUF_DTable *DCtx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t const hSize = HUF_readDTableX4_wksp(DCtx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress1X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx); +} + +static size_t HUF_decompress4X4_usingDTable_internal(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + if (cSrcSize < 10) + return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { + const BYTE *const istart = (const BYTE *)cSrc; + BYTE *const ostart = (BYTE *)dst; + BYTE *const oend = ostart + dstSize; + const void *const dtPtr = DTable + 1; + const HUF_DEltX4 *const dt = (const HUF_DEltX4 *)dtPtr; + + /* Init */ + BIT_DStream_t 
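/* Illustrative sketch, not part of the original patch: one X4 decode step,
 * as implemented by HUF_decodeSymbolX4() above. It always copies both
 * candidate bytes but only advances by the number that are valid, which is
 * why the main loop stops at pEnd - 2 and the tail uses the 1-byte
 * HUF_decodeLastSymbolX4():
 *
 *   size_t val = BIT_lookBitsFast(&bitD, dtLog); // peek dtLog bits
 *   memcpy(op, &dt[val], 2);                     // copy up to 2 symbols
 *   BIT_skipBits(&bitD, dt[val].nbBits);         // consume their codes
 *   op += dt[val].length;                        // keep 1 or 2 bytes
 */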
bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = ZSTD_readLE16(istart); + size_t const length2 = ZSTD_readLE16(istart + 2); + size_t const length3 = ZSTD_readLE16(istart + 4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE *const istart1 = istart + 6; /* jumpTable */ + const BYTE *const istart2 = istart1 + length1; + const BYTE *const istart3 = istart2 + length2; + const BYTE *const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize + 3) / 4; + BYTE *const opStart2 = ostart + segmentSize; + BYTE *const opStart3 = opStart2 + segmentSize; + BYTE *const opStart4 = opStart3 + segmentSize; + BYTE *op1 = ostart; + BYTE *op2 = opStart2; + BYTE *op3 = opStart3; + BYTE *op4 = opStart4; + U32 endSignal; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) + return ERROR(corruption_detected); /* overflow */ + { + size_t const errorCode = BIT_initDStream(&bitD1, istart1, length1); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD2, istart2, length2); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD3, istart3, length3); + if (HUF_isError(errorCode)) + return errorCode; + } + { + size_t const errorCode = BIT_initDStream(&bitD4, istart4, length4); + if (HUF_isError(errorCode)) + return errorCode; + } + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + for (; (endSignal == BIT_DStream_unfinished) & (op4 < (oend - (sizeof(bitD4.bitContainer) - 1)));) { + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_1(op1, &bitD1); + HUF_DECODE_SYMBOLX4_1(op2, &bitD2); + HUF_DECODE_SYMBOLX4_1(op3, &bitD3); + HUF_DECODE_SYMBOLX4_1(op4, &bitD4); + HUF_DECODE_SYMBOLX4_2(op1, &bitD1); + HUF_DECODE_SYMBOLX4_2(op2, &bitD2); + HUF_DECODE_SYMBOLX4_2(op3, &bitD3); + HUF_DECODE_SYMBOLX4_2(op4, &bitD4); + HUF_DECODE_SYMBOLX4_0(op1, &bitD1); + HUF_DECODE_SYMBOLX4_0(op2, &bitD2); + HUF_DECODE_SYMBOLX4_0(op3, &bitD3); + HUF_DECODE_SYMBOLX4_0(op4, &bitD4); + + endSignal = BIT_reloadDStream(&bitD1) | BIT_reloadDStream(&bitD2) | BIT_reloadDStream(&bitD3) | BIT_reloadDStream(&bitD4); + } + + /* check corruption */ + if (op1 > opStart2) + return ERROR(corruption_detected); + if (op2 > opStart3) + return ERROR(corruption_detected); + if (op3 > opStart4) + return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX4(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX4(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX4(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX4(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { + U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) + return ERROR(corruption_detected); + } + + /* decoded size */ + return dstSize; + } +} + +size_t HUF_decompress4X4_usingDTable(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) + return ERROR(GENERIC); + return 
HUF_decompress4X4_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + const BYTE *ip = (const BYTE *)cSrc; + + size_t hSize = HUF_readDTableX4_wksp(dctx, cSrc, cSrcSize, workspace, workspaceSize); + if (HUF_isError(hSize)) + return hSize; + if (hSize >= cSrcSize) + return ERROR(srcSize_wrong); + ip += hSize; + cSrcSize -= hSize; + + return HUF_decompress4X4_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx); +} + +/* ********************************/ +/* Generic decompression selector */ +/* ********************************/ + +size_t HUF_decompress1X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); + return dtd.tableType ? HUF_decompress1X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) + : HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable); +} + +size_t HUF_decompress4X_usingDTable(void *dst, size_t maxDstSize, const void *cSrc, size_t cSrcSize, const HUF_DTable *DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); + return dtd.tableType ? HUF_decompress4X4_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable) + : HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable); +} + +typedef struct { + U32 tableTime; + U32 decode256Time; +} algo_time_t; +static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = { + /* single, double, quad */ + {{0, 0}, {1, 1}, {2, 2}}, /* Q==0 : impossible */ + {{0, 0}, {1, 1}, {2, 2}}, /* Q==1 : impossible */ + {{38, 130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ + {{448, 128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ + {{556, 128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ + {{714, 128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ + {{883, 128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ + {{897, 128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ + {{926, 128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ + {{947, 128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ + {{1107, 128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ + {{1177, 128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ + {{1242, 128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ + {{1349, 128}, {2644, 106}, {5260, 106}}, /* Q ==13 : 81-87% */ + {{1455, 128}, {2422, 124}, {4174, 124}}, /* Q ==14 : 87-93% */ + {{722, 128}, {1891, 145}, {1936, 146}}, /* Q ==15 : 93-99% */ +}; + +/** HUF_selectDecoder() : +* Tells which decoder is likely to decode faster, +* based on a set of pre-determined metrics. +* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . 
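* Worked example (illustrative numbers only, not part of the original patch):
* with dstSize = 64 KB and cSrcSize = 32 KB, Q = 32768 * 16 / 65536 = 8 and
* D256 = 65536 >> 8 = 256, giving
*   DTime0 = 926 + 128 * 256 = 33694
*   DTime1 = 1613 + 75 * 256 = 20813, plus 20813 >> 3 -> 23414
* so the double-symbol decoder (4X4) is selected.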
+* Assumption : 0 < cSrcSize < dstSize <= 128 KB */ +U32 HUF_selectDecoder(size_t dstSize, size_t cSrcSize) +{ + /* decoder timing evaluation */ + U32 const Q = (U32)(cSrcSize * 16 / dstSize); /* Q < 16 since dstSize > cSrcSize */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */ + + return DTime1 < DTime0; +} + +typedef size_t (*decompressionAlgo)(void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize); + +size_t HUF_decompress4X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + /* validation checks */ + if (dstSize == 0) + return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) + return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { + memcpy(dst, cSrc, dstSize); + return dstSize; + } /* not compressed */ + if (cSrcSize == 1) { + memset(dst, *(const BYTE *)cSrc, dstSize); + return dstSize; + } /* RLE */ + + { + U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize) + : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize); + } +} + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + /* validation checks */ + if (dstSize == 0) + return ERROR(dstSize_tooSmall); + if ((cSrcSize >= dstSize) || (cSrcSize <= 1)) + return ERROR(corruption_detected); /* invalid */ + + { + U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? HUF_decompress4X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize) + : HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize); + } +} + +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable *dctx, void *dst, size_t dstSize, const void *cSrc, size_t cSrcSize, void *workspace, size_t workspaceSize) +{ + /* validation checks */ + if (dstSize == 0) + return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) + return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { + memcpy(dst, cSrc, dstSize); + return dstSize; + } /* not compressed */ + if (cSrcSize == 1) { + memset(dst, *(const BYTE *)cSrc, dstSize); + return dstSize; + } /* RLE */ + + { + U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); + return algoNb ? HUF_decompress1X4_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize) + : HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workspace, workspaceSize); + } +} diff --git a/lib/zstd/mem.h b/lib/zstd/mem.h new file mode 100644 index 000000000000..3a0f34c8706c --- /dev/null +++ b/lib/zstd/mem.h @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. 
+ * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +#ifndef MEM_H_MODULE +#define MEM_H_MODULE + +/*-**************************************** +* Dependencies +******************************************/ +#include <asm/unaligned.h> +#include <linux/string.h> /* memcpy */ +#include <linux/types.h> /* size_t, ptrdiff_t */ + +/*-**************************************** +* Compiler specifics +******************************************/ +#define ZSTD_STATIC static __inline __attribute__((unused)) + +/*-************************************************************** +* Basic Types +*****************************************************************/ +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef int16_t S16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef int64_t S64; +typedef ptrdiff_t iPtrDiff; +typedef uintptr_t uPtrDiff; + +/*-************************************************************** +* Memory I/O +*****************************************************************/ +ZSTD_STATIC unsigned ZSTD_32bits(void) { return sizeof(size_t) == 4; } +ZSTD_STATIC unsigned ZSTD_64bits(void) { return sizeof(size_t) == 8; } + +#if defined(__LITTLE_ENDIAN) +#define ZSTD_LITTLE_ENDIAN 1 +#else +#define ZSTD_LITTLE_ENDIAN 0 +#endif + +ZSTD_STATIC unsigned ZSTD_isLittleEndian(void) { return ZSTD_LITTLE_ENDIAN; } + +ZSTD_STATIC U16 ZSTD_read16(const void *memPtr) { return get_unaligned((const U16 *)memPtr); } + +ZSTD_STATIC U32 ZSTD_read32(const void *memPtr) { return get_unaligned((const U32 *)memPtr); } + +ZSTD_STATIC U64 ZSTD_read64(const void *memPtr) { return get_unaligned((const U64 *)memPtr); } + +ZSTD_STATIC size_t ZSTD_readST(const void *memPtr) { return get_unaligned((const size_t *)memPtr); } + +ZSTD_STATIC void ZSTD_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); } + +ZSTD_STATIC void ZSTD_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); } + +ZSTD_STATIC void ZSTD_write64(void *memPtr, U64 value) { put_unaligned(value, (U64 *)memPtr); } + +/*=== Little endian r/w ===*/ + +ZSTD_STATIC U16 ZSTD_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } + +ZSTD_STATIC void ZSTD_writeLE16(void *memPtr, U16 val) { put_unaligned_le16(val, memPtr); } + +ZSTD_STATIC U32 ZSTD_readLE24(const void *memPtr) { return ZSTD_readLE16(memPtr) + (((const BYTE *)memPtr)[2] << 16); } + +ZSTD_STATIC void ZSTD_writeLE24(void *memPtr, U32 val) +{ + ZSTD_writeLE16(memPtr, (U16)val); + ((BYTE *)memPtr)[2] = (BYTE)(val >> 16); +} + +ZSTD_STATIC U32 ZSTD_readLE32(const void *memPtr) { return get_unaligned_le32(memPtr); } + +ZSTD_STATIC void ZSTD_writeLE32(void *memPtr, U32 val32) { put_unaligned_le32(val32, memPtr); } + +ZSTD_STATIC U64 ZSTD_readLE64(const void *memPtr) { return get_unaligned_le64(memPtr); } + +ZSTD_STATIC void ZSTD_writeLE64(void *memPtr, U64 val64) { put_unaligned_le64(val64, memPtr); } + +ZSTD_STATIC size_t ZSTD_readLEST(const void *memPtr) +{ + if (ZSTD_32bits()) + return (size_t)ZSTD_readLE32(memPtr); + else + return (size_t)ZSTD_readLE64(memPtr); +} + +ZSTD_STATIC void ZSTD_writeLEST(void *memPtr, size_t val) +{ + if (ZSTD_32bits()) + ZSTD_writeLE32(memPtr, (U32)val); + else + ZSTD_writeLE64(memPtr, (U64)val); +} + +/*=== Big endian r/w ===*/ + +ZSTD_STATIC U32 ZSTD_readBE32(const
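/* Illustrative example, not part of the original patch: the LE24 helpers
 * above combine a 16-bit little-endian access with an explicit third byte,
 * so for bytes { 0x01, 0x02, 0x03 } in memory:
 *
 *   ZSTD_readLE24(p) == 0x0201 + (0x03 << 16) == 0x030201
 *
 * and ZSTD_writeLE24(p, 0x030201) stores those same three bytes back. */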
void *memPtr) { return get_unaligned_be32(memPtr); } + +ZSTD_STATIC void ZSTD_writeBE32(void *memPtr, U32 val32) { put_unaligned_be32(val32, memPtr); } + +ZSTD_STATIC U64 ZSTD_readBE64(const void *memPtr) { return get_unaligned_be64(memPtr); } + +ZSTD_STATIC void ZSTD_writeBE64(void *memPtr, U64 val64) { put_unaligned_be64(val64, memPtr); } + +ZSTD_STATIC size_t ZSTD_readBEST(const void *memPtr) +{ + if (ZSTD_32bits()) + return (size_t)ZSTD_readBE32(memPtr); + else + return (size_t)ZSTD_readBE64(memPtr); +} + +ZSTD_STATIC void ZSTD_writeBEST(void *memPtr, size_t val) +{ + if (ZSTD_32bits()) + ZSTD_writeBE32(memPtr, (U32)val); + else + ZSTD_writeBE64(memPtr, (U64)val); +} + +/* function safe only for comparisons */ +ZSTD_STATIC U32 ZSTD_readMINMATCH(const void *memPtr, U32 length) +{ + switch (length) { + default: + case 4: return ZSTD_read32(memPtr); + case 3: + if (ZSTD_isLittleEndian()) + return ZSTD_read32(memPtr) << 8; + else + return ZSTD_read32(memPtr) >> 8; + } +} + +#endif /* MEM_H_MODULE */ diff --git a/lib/zstd/zstd_common.c b/lib/zstd/zstd_common.c new file mode 100644 index 000000000000..a282624ee155 --- /dev/null +++ b/lib/zstd/zstd_common.c @@ -0,0 +1,75 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +/*-************************************* +* Dependencies +***************************************/ +#include "error_private.h" +#include "zstd_internal.h" /* declaration of ZSTD_isError, ZSTD_getErrorName, ZSTD_getErrorCode, ZSTD_getErrorString, ZSTD_versionNumber */ +#include <linux/kernel.h> + +/*=************************************************************** +* Custom allocator +****************************************************************/ + +#define stack_push(stack, size) \ + ({ \ + void *const ptr = ZSTD_PTR_ALIGN((stack)->ptr); \ + (stack)->ptr = (char *)ptr + (size); \ + (stack)->ptr <= (stack)->end ?
ptr : NULL; \ }) + +ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize) +{ + ZSTD_customMem stackMem = {ZSTD_stackAlloc, ZSTD_stackFree, workspace}; + ZSTD_stack *stack = (ZSTD_stack *)workspace; + /* Verify preconditions */ + if (!workspace || workspaceSize < sizeof(ZSTD_stack) || workspace != ZSTD_PTR_ALIGN(workspace)) { + ZSTD_customMem error = {NULL, NULL, NULL}; + return error; + } + /* Initialize the stack */ + stack->ptr = workspace; + stack->end = (char *)workspace + workspaceSize; + stack_push(stack, sizeof(ZSTD_stack)); + return stackMem; +} + +void *ZSTD_stackAllocAll(void *opaque, size_t *size) +{ + ZSTD_stack *stack = (ZSTD_stack *)opaque; + *size = (BYTE const *)stack->end - (BYTE *)ZSTD_PTR_ALIGN(stack->ptr); + return stack_push(stack, *size); +} + +void *ZSTD_stackAlloc(void *opaque, size_t size) +{ + ZSTD_stack *stack = (ZSTD_stack *)opaque; + return stack_push(stack, size); +} +void ZSTD_stackFree(void *opaque, void *address) +{ + (void)opaque; + (void)address; +} + +void *ZSTD_malloc(size_t size, ZSTD_customMem customMem) { return customMem.customAlloc(customMem.opaque, size); } + +void ZSTD_free(void *ptr, ZSTD_customMem customMem) +{ + if (ptr != NULL) + customMem.customFree(customMem.opaque, ptr); +} diff --git a/lib/zstd/zstd_internal.h b/lib/zstd/zstd_internal.h new file mode 100644 index 000000000000..1a79fab9e13a --- /dev/null +++ b/lib/zstd/zstd_internal.h @@ -0,0 +1,263 @@ +/** + * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +#ifndef ZSTD_CCOMMON_H_MODULE +#define ZSTD_CCOMMON_H_MODULE + +/*-******************************************************* +* Compiler specifics +*********************************************************/ +#define FORCE_INLINE static __always_inline +#define FORCE_NOINLINE static noinline + +/*-************************************* +* Dependencies +***************************************/ +#include "error_private.h" +#include "mem.h" +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/xxhash.h> +#include <linux/zstd.h> + +/*-************************************* +* shared macros +***************************************/ +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ?
(a) : (b)) +#define CHECK_F(f) \ + { \ + size_t const errcod = f; \ + if (ERR_isError(errcod)) \ + return errcod; \ + } /* check and Forward error code */ +#define CHECK_E(f, e) \ + { \ + size_t const errcod = f; \ + if (ERR_isError(errcod)) \ + return ERROR(e); \ + } /* check and send Error code */ +#define ZSTD_STATIC_ASSERT(c) \ + { \ + enum { ZSTD_static_assert = 1 / (int)(!!(c)) }; \ + } + +/*-************************************* +* Common constants +***************************************/ +#define ZSTD_OPT_NUM (1 << 12) +#define ZSTD_DICT_MAGIC 0xEC30A437 /* v0.7+ */ + +#define ZSTD_REP_NUM 3 /* number of repcodes */ +#define ZSTD_REP_CHECK (ZSTD_REP_NUM) /* number of repcodes to check by the optimal parser */ +#define ZSTD_REP_MOVE (ZSTD_REP_NUM - 1) +#define ZSTD_REP_MOVE_OPT (ZSTD_REP_NUM) +static const U32 repStartValue[ZSTD_REP_NUM] = {1, 4, 8}; + +#define KB *(1 << 10) +#define MB *(1 << 20) +#define GB *(1U << 30) + +#define BIT7 128 +#define BIT6 64 +#define BIT5 32 +#define BIT4 16 +#define BIT1 2 +#define BIT0 1 + +#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10 +static const size_t ZSTD_fcs_fieldSize[4] = {0, 2, 4, 8}; +static const size_t ZSTD_did_fieldSize[4] = {0, 1, 2, 4}; + +#define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ +static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; +typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + +#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ + +#define HufLog 12 +typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + +#define LONGNBSEQ 0x7F00 + +#define MINMATCH 3 +#define EQUAL_READ32 4 + +#define Litbits 8 +#define MaxLit ((1 << Litbits) - 1) +#define MaxML 52 +#define MaxLL 35 +#define MaxOff 28 +#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */ +#define MLFSELog 9 +#define LLFSELog 9 +#define OffFSELog 8 + +static const U32 LL_bits[MaxLL + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; +static const S16 LL_defaultNorm[MaxLL + 1] = {4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, -1, -1, -1, -1}; +#define LL_DEFAULTNORMLOG 6 /* for static allocation */ +static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG; + +static const U32 ML_bits[MaxML + 1] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; +static const S16 ML_defaultNorm[MaxML + 1] = {1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1}; +#define ML_DEFAULTNORMLOG 6 /* for static allocation */ +static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG; + +static const S16 OF_defaultNorm[MaxOff + 1] = {1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1}; +#define OF_DEFAULTNORMLOG 5 /* for static allocation */ +static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG; + +/*-******************************************* +* Shared functions to include for inlining +*********************************************/ +ZSTD_STATIC void ZSTD_copy8(void *dst, const void *src) { + memcpy(dst, src, 8); +} +/*! 
ZSTD_wildcopy() : +* custom version of memcpy(), can copy up to 7 bytes too many (8 bytes if length==0) */ +#define WILDCOPY_OVERLENGTH 8 +ZSTD_STATIC void ZSTD_wildcopy(void *dst, const void *src, ptrdiff_t length) +{ + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + length; + /* Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81388. + * Avoid the bad case where the loop only runs once by handling the + * special case separately. This doesn't trigger the bug because it + * doesn't involve pointer/integer overflow. + */ + if (length <= 8) + return ZSTD_copy8(dst, src); + do { + ZSTD_copy8(op, ip); + op += 8; + ip += 8; + } while (op < oend); +} + +/*-******************************************* +* Private interfaces +*********************************************/ +typedef struct ZSTD_stats_s ZSTD_stats_t; + +typedef struct { + U32 off; + U32 len; +} ZSTD_match_t; + +typedef struct { + U32 price; + U32 off; + U32 mlen; + U32 litlen; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_optimal_t; + +typedef struct seqDef_s { + U32 offset; + U16 litLength; + U16 matchLength; +} seqDef; + +typedef struct { + seqDef *sequencesStart; + seqDef *sequences; + BYTE *litStart; + BYTE *lit; + BYTE *llCode; + BYTE *mlCode; + BYTE *ofCode; + U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ + U32 longLengthPos; + /* opt */ + ZSTD_optimal_t *priceTable; + ZSTD_match_t *matchTable; + U32 *matchLengthFreq; + U32 *litLengthFreq; + U32 *litFreq; + U32 *offCodeFreq; + U32 matchLengthSum; + U32 matchSum; + U32 litLengthSum; + U32 litSum; + U32 offCodeSum; + U32 log2matchLengthSum; + U32 log2matchSum; + U32 log2litLengthSum; + U32 log2litSum; + U32 log2offCodeSum; + U32 factor; + U32 staticPrices; + U32 cachedPrice; + U32 cachedLitLength; + const BYTE *cachedLiterals; +} seqStore_t; + +const seqStore_t *ZSTD_getSeqStore(const ZSTD_CCtx *ctx); +void ZSTD_seqToCodes(const seqStore_t *seqStorePtr); +int ZSTD_isSkipFrame(ZSTD_DCtx *dctx); + +/*= Custom memory allocation functions */ +typedef void *(*ZSTD_allocFunction)(void *opaque, size_t size); +typedef void (*ZSTD_freeFunction)(void *opaque, void *address); +typedef struct { + ZSTD_allocFunction customAlloc; + ZSTD_freeFunction customFree; + void *opaque; +} ZSTD_customMem; + +void *ZSTD_malloc(size_t size, ZSTD_customMem customMem); +void ZSTD_free(void *ptr, ZSTD_customMem customMem); + +/*====== stack allocation ======*/ + +typedef struct { + void *ptr; + const void *end; +} ZSTD_stack; + +#define ZSTD_ALIGN(x) ALIGN(x, sizeof(size_t)) +#define ZSTD_PTR_ALIGN(p) PTR_ALIGN(p, sizeof(size_t)) + +ZSTD_customMem ZSTD_initStack(void *workspace, size_t workspaceSize); + +void *ZSTD_stackAllocAll(void *opaque, size_t *size); +void *ZSTD_stackAlloc(void *opaque, size_t size); +void ZSTD_stackFree(void *opaque, void *address); + +/*====== common function ======*/ + +ZSTD_STATIC U32 ZSTD_highbit32(U32 val) { return 31 - __builtin_clz(val); } + +/* hidden functions */ + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! 
*/ +void ZSTD_invalidateRepCodes(ZSTD_CCtx *cctx); + +size_t ZSTD_freeCCtx(ZSTD_CCtx *cctx); +size_t ZSTD_freeDCtx(ZSTD_DCtx *dctx); +size_t ZSTD_freeCDict(ZSTD_CDict *cdict); +size_t ZSTD_freeDDict(ZSTD_DDict *cdict); +size_t ZSTD_freeCStream(ZSTD_CStream *zcs); +size_t ZSTD_freeDStream(ZSTD_DStream *zds); + +#endif /* ZSTD_CCOMMON_H_MODULE */ diff --git a/lib/zstd/zstd_opt.h b/lib/zstd/zstd_opt.h new file mode 100644 index 000000000000..55e1b4cba808 --- /dev/null +++ b/lib/zstd/zstd_opt.h @@ -0,0 +1,1014 @@ +/** + * Copyright (c) 2016-present, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of https://github.com/facebook/zstd. + * An additional grant of patent rights can be found in the PATENTS file in the + * same directory. + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. This program is dual-licensed; you may select + * either version 2 of the GNU General Public License ("GPL") or BSD license + * ("BSD"). + */ + +/* Note : this file is intended to be included within zstd_compress.c */ + +#ifndef ZSTD_OPT_H_91842398743 +#define ZSTD_OPT_H_91842398743 + +#define ZSTD_LITFREQ_ADD 2 +#define ZSTD_FREQ_DIV 4 +#define ZSTD_MAX_PRICE (1 << 30) + +/*-************************************* +* Price functions for optimal parser +***************************************/ +FORCE_INLINE void ZSTD_setLog2Prices(seqStore_t *ssPtr) +{ + ssPtr->log2matchLengthSum = ZSTD_highbit32(ssPtr->matchLengthSum + 1); + ssPtr->log2litLengthSum = ZSTD_highbit32(ssPtr->litLengthSum + 1); + ssPtr->log2litSum = ZSTD_highbit32(ssPtr->litSum + 1); + ssPtr->log2offCodeSum = ZSTD_highbit32(ssPtr->offCodeSum + 1); + ssPtr->factor = 1 + ((ssPtr->litSum >> 5) / ssPtr->litLengthSum) + ((ssPtr->litSum << 1) / (ssPtr->litSum + ssPtr->matchSum)); +} + +ZSTD_STATIC void ZSTD_rescaleFreqs(seqStore_t *ssPtr, const BYTE *src, size_t srcSize) +{ + unsigned u; + + ssPtr->cachedLiterals = NULL; + ssPtr->cachedPrice = ssPtr->cachedLitLength = 0; + ssPtr->staticPrices = 0; + + if (ssPtr->litLengthSum == 0) { + if (srcSize <= 1024) + ssPtr->staticPrices = 1; + + for (u = 0; u <= MaxLit; u++) + ssPtr->litFreq[u] = 0; + for (u = 0; u < srcSize; u++) + ssPtr->litFreq[src[u]]++; + + ssPtr->litSum = 0; + ssPtr->litLengthSum = MaxLL + 1; + ssPtr->matchLengthSum = MaxML + 1; + ssPtr->offCodeSum = (MaxOff + 1); + ssPtr->matchSum = (ZSTD_LITFREQ_ADD << Litbits); + + for (u = 0; u <= MaxLit; u++) { + ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u] >> ZSTD_FREQ_DIV); + ssPtr->litSum += ssPtr->litFreq[u]; + } + for (u = 0; u <= MaxLL; u++) + ssPtr->litLengthFreq[u] = 1; + for (u = 0; u <= MaxML; u++) + ssPtr->matchLengthFreq[u] = 1; + for (u = 0; u <= MaxOff; u++) + ssPtr->offCodeFreq[u] = 1; + } else { + ssPtr->matchLengthSum = 0; + ssPtr->litLengthSum = 0; + ssPtr->offCodeSum = 0; + ssPtr->matchSum = 0; + ssPtr->litSum = 0; + + for (u = 0; u <= MaxLit; u++) { + ssPtr->litFreq[u] = 1 + (ssPtr->litFreq[u] >> (ZSTD_FREQ_DIV + 1)); + ssPtr->litSum += ssPtr->litFreq[u]; + } + for (u = 0; u <= MaxLL; u++) { + ssPtr->litLengthFreq[u] = 1 + (ssPtr->litLengthFreq[u] >> (ZSTD_FREQ_DIV + 1)); + ssPtr->litLengthSum += ssPtr->litLengthFreq[u]; + } + for (u = 0; u <= MaxML; u++) { + ssPtr->matchLengthFreq[u] = 1 + (ssPtr->matchLengthFreq[u] >> ZSTD_FREQ_DIV); + ssPtr->matchLengthSum 
+= ssPtr->matchLengthFreq[u]; + ssPtr->matchSum += ssPtr->matchLengthFreq[u] * (u + 3); + } + ssPtr->matchSum *= ZSTD_LITFREQ_ADD; + for (u = 0; u <= MaxOff; u++) { + ssPtr->offCodeFreq[u] = 1 + (ssPtr->offCodeFreq[u] >> ZSTD_FREQ_DIV); + ssPtr->offCodeSum += ssPtr->offCodeFreq[u]; + } + } + + ZSTD_setLog2Prices(ssPtr); +} + +FORCE_INLINE U32 ZSTD_getLiteralPrice(seqStore_t *ssPtr, U32 litLength, const BYTE *literals) +{ + U32 price, u; + + if (ssPtr->staticPrices) + return ZSTD_highbit32((U32)litLength + 1) + (litLength * 6); + + if (litLength == 0) + return ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[0] + 1); + + /* literals */ + if (ssPtr->cachedLiterals == literals) { + U32 const additional = litLength - ssPtr->cachedLitLength; + const BYTE *literals2 = ssPtr->cachedLiterals + ssPtr->cachedLitLength; + price = ssPtr->cachedPrice + additional * ssPtr->log2litSum; + for (u = 0; u < additional; u++) + price -= ZSTD_highbit32(ssPtr->litFreq[literals2[u]] + 1); + ssPtr->cachedPrice = price; + ssPtr->cachedLitLength = litLength; + } else { + price = litLength * ssPtr->log2litSum; + for (u = 0; u < litLength; u++) + price -= ZSTD_highbit32(ssPtr->litFreq[literals[u]] + 1); + + if (litLength >= 12) { + ssPtr->cachedLiterals = literals; + ssPtr->cachedPrice = price; + ssPtr->cachedLitLength = litLength; + } + } + + /* literal Length */ + { + const BYTE LL_deltaCode = 19; + const BYTE llCode = (litLength > 63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; + price += LL_bits[llCode] + ssPtr->log2litLengthSum - ZSTD_highbit32(ssPtr->litLengthFreq[llCode] + 1); + } + + return price; +} + +FORCE_INLINE U32 ZSTD_getPrice(seqStore_t *seqStorePtr, U32 litLength, const BYTE *literals, U32 offset, U32 matchLength, const int ultra) +{ + /* offset */ + U32 price; + BYTE const offCode = (BYTE)ZSTD_highbit32(offset + 1); + + if (seqStorePtr->staticPrices) + return ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + ZSTD_highbit32((U32)matchLength + 1) + 16 + offCode; + + price = offCode + seqStorePtr->log2offCodeSum - ZSTD_highbit32(seqStorePtr->offCodeFreq[offCode] + 1); + if (!ultra && offCode >= 20) + price += (offCode - 19) * 2; + + /* match Length */ + { + const BYTE ML_deltaCode = 36; + const BYTE mlCode = (matchLength > 127) ? (BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength]; + price += ML_bits[mlCode] + seqStorePtr->log2matchLengthSum - ZSTD_highbit32(seqStorePtr->matchLengthFreq[mlCode] + 1); + } + + return price + ZSTD_getLiteralPrice(seqStorePtr, litLength, literals) + seqStorePtr->factor; +} + +ZSTD_STATIC void ZSTD_updatePrice(seqStore_t *seqStorePtr, U32 litLength, const BYTE *literals, U32 offset, U32 matchLength) +{ + U32 u; + + /* literals */ + seqStorePtr->litSum += litLength * ZSTD_LITFREQ_ADD; + for (u = 0; u < litLength; u++) + seqStorePtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + + /* literal Length */ + { + const BYTE LL_deltaCode = 19; + const BYTE llCode = (litLength > 63) ? (BYTE)ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; + seqStorePtr->litLengthFreq[llCode]++; + seqStorePtr->litLengthSum++; + } + + /* match offset */ + { + BYTE const offCode = (BYTE)ZSTD_highbit32(offset + 1); + seqStorePtr->offCodeSum++; + seqStorePtr->offCodeFreq[offCode]++; + } + + /* match Length */ + { + const BYTE ML_deltaCode = 36; + const BYTE mlCode = (matchLength > 127) ? 
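/* Illustrative arithmetic, not part of the original patch: lengths beyond
 * the directly indexed range are coded by their highest set bit plus a
 * fixed delta. E.g. matchLength = 200: ZSTD_highbit32(200) = 7, so
 * mlCode = 7 + 36 = 43 <= MaxML (52); litLength = 100 likewise gives
 * llCode = ZSTD_highbit32(100) + 19 = 6 + 19 = 25 <= MaxLL (35). */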
(BYTE)ZSTD_highbit32(matchLength) + ML_deltaCode : ML_Code[matchLength]; + seqStorePtr->matchLengthFreq[mlCode]++; + seqStorePtr->matchLengthSum++; + } + + ZSTD_setLog2Prices(seqStorePtr); +} + +#define SET_PRICE(pos, mlen_, offset_, litlen_, price_) \ + { \ + while (last_pos < pos) { \ + opt[last_pos + 1].price = ZSTD_MAX_PRICE; \ + last_pos++; \ + } \ + opt[pos].mlen = mlen_; \ + opt[pos].off = offset_; \ + opt[pos].litlen = litlen_; \ + opt[pos].price = price_; \ + } + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +FORCE_INLINE +U32 ZSTD_insertAndFindFirstIndexHash3(ZSTD_CCtx *zc, const BYTE *ip) +{ + U32 *const hashTable3 = zc->hashTable3; + U32 const hashLog3 = zc->hashLog3; + const BYTE *const base = zc->base; + U32 idx = zc->nextToUpdate3; + const U32 target = zc->nextToUpdate3 = (U32)(ip - base); + const size_t hash3 = ZSTD_hash3Ptr(ip, hashLog3); + + while (idx < target) { + hashTable3[ZSTD_hash3Ptr(base + idx, hashLog3)] = idx; + idx++; + } + + return hashTable3[hash3]; +} + +/*-************************************* +* Binary Tree search +***************************************/ +static U32 ZSTD_insertBtAndGetAllMatches(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, U32 nbCompares, const U32 mls, U32 extDict, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + const BYTE *const base = zc->base; + const U32 curr = (U32)(ip - base); + const U32 hashLog = zc->params.cParams.hashLog; + const size_t h = ZSTD_hashPtr(ip, hashLog, mls); + U32 *const hashTable = zc->hashTable; + U32 matchIndex = hashTable[h]; + U32 *const bt = zc->chainTable; + const U32 btLog = zc->params.cParams.chainLog - 1; + const U32 btMask = (1U << btLog) - 1; + size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE *const dictBase = zc->dictBase; + const U32 dictLimit = zc->dictLimit; + const BYTE *const dictEnd = dictBase + dictLimit; + const BYTE *const prefixStart = base + dictLimit; + const U32 btLow = btMask >= curr ? 0 : curr - btMask; + const U32 windowLow = zc->lowLimit; + U32 *smallerPtr = bt + 2 * (curr & btMask); + U32 *largerPtr = bt + 2 * (curr & btMask) + 1; + U32 matchEndIdx = curr + 8; + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + + const U32 minMatch = (mls == 3) ? 
3 : 4; + size_t bestLength = minMatchLen - 1; + + if (minMatch == 3) { /* HC3 match finder */ + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(zc, ip); + if (matchIndex3 > windowLow && (curr - matchIndex3 < (1 << 18))) { + const BYTE *match; + size_t currMl = 0; + if ((!extDict) || matchIndex3 >= dictLimit) { + match = base + matchIndex3; + if (match[bestLength] == ip[bestLength]) + currMl = ZSTD_count(ip, match, iLimit); + } else { + match = dictBase + matchIndex3; + if (ZSTD_readMINMATCH(match, MINMATCH) == + ZSTD_readMINMATCH(ip, MINMATCH)) /* assumption : matchIndex3 <= dictLimit-4 (by table construction) */ + currMl = ZSTD_count_2segments(ip + MINMATCH, match + MINMATCH, iLimit, dictEnd, prefixStart) + MINMATCH; + } + + /* save best solution */ + if (currMl > bestLength) { + bestLength = currMl; + matches[mnum].off = ZSTD_REP_MOVE_OPT + curr - matchIndex3; + matches[mnum].len = (U32)currMl; + mnum++; + if (currMl > ZSTD_OPT_NUM) + goto update; + if (ip + currMl == iLimit) + goto update; /* best possible, and avoid read overflow*/ + } + } + } + + hashTable[h] = curr; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32 *nextPtr = bt + 2 * (matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE *match; + + if ((!extDict) || (matchIndex + matchLength >= dictLimit)) { + match = base + matchIndex; + if (match[matchLength] == ip[matchLength]) { + matchLength += ZSTD_count(ip + matchLength + 1, match + matchLength + 1, iLimit) + 1; + } + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip + matchLength, match + matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex + matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = ZSTD_REP_MOVE_OPT + curr - matchIndex; + matches[mnum].len = (U32)matchLength; + mnum++; + if (matchLength > ZSTD_OPT_NUM) + break; + if (ip + matchLength == iLimit) /* equal : no way to know if inf or sup */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than curr */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { + smallerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + smallerPtr = nextPtr + 1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to curr) */ + } else { + /* match is larger than curr */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { + largerPtr = &dummy32; + break; + } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } + } + + *smallerPtr = *largerPtr = 0; + +update: + zc->nextToUpdate = (matchEndIdx > curr + 8) ? 
matchEndIdx - 8 : curr + 1; + return mnum; +} + +/** Tree updater, providing best match */ +static U32 ZSTD_BtGetAllMatches(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, const U32 maxNbAttempts, const U32 mls, ZSTD_match_t *matches, + const U32 minMatchLen) +{ + if (ip < zc->base + zc->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree(zc, ip, iLimit, maxNbAttempts, mls); + return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 0, matches, minMatchLen); +} + +static U32 ZSTD_BtGetAllMatches_selectMLS(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *ip, const BYTE *const iHighLimit, const U32 maxNbAttempts, const U32 matchLengthSearch, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + switch (matchLengthSearch) { + case 3: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen); + default: + case 4: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen); + case 5: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen); + case 7: + case 6: return ZSTD_BtGetAllMatches(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen); + } +} + +/** Tree updater, providing best match */ +static U32 ZSTD_BtGetAllMatches_extDict(ZSTD_CCtx *zc, const BYTE *const ip, const BYTE *const iLimit, const U32 maxNbAttempts, const U32 mls, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + if (ip < zc->base + zc->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_extDict(zc, ip, iLimit, maxNbAttempts, mls); + return ZSTD_insertBtAndGetAllMatches(zc, ip, iLimit, maxNbAttempts, mls, 1, matches, minMatchLen); +} + +static U32 ZSTD_BtGetAllMatches_selectMLS_extDict(ZSTD_CCtx *zc, /* Index table will be updated */ + const BYTE *ip, const BYTE *const iHighLimit, const U32 maxNbAttempts, const U32 matchLengthSearch, + ZSTD_match_t *matches, const U32 minMatchLen) +{ + switch (matchLengthSearch) { + case 3: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 3, matches, minMatchLen); + default: + case 4: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 4, matches, minMatchLen); + case 5: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 5, matches, minMatchLen); + case 7: + case 6: return ZSTD_BtGetAllMatches_extDict(zc, ip, iHighLimit, maxNbAttempts, 6, matches, minMatchLen); + } +} + +/*-******************************* +* Optimal parser +*********************************/ +FORCE_INLINE +void ZSTD_compressBlock_opt_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const int ultra) +{ + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + const BYTE *const base = ctx->base; + const BYTE *const prefixStart = base + ctx->dictLimit; + + const U32 maxSearches = 1U << ctx->params.cParams.searchLog; + const U32 sufficient_len = ctx->params.cParams.targetLength; + const U32 mls = ctx->params.cParams.searchLength; + const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 
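/* Descriptive note, not part of the original patch: the loop below is a
 * forward dynamic program. opt[pos].price is the cheapest cost found so far
 * to encode everything up to `pos` bytes past the current position;
 * SET_PRICE() relaxes an entry whenever a literal run, repcode, or match
 * reaches `pos` more cheaply, and the winning chain of (mlen, off) choices
 * is unwound backwards starting at the _storeSequence label. */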
3 : 4; + + ZSTD_optimal_t *opt = seqStorePtr->priceTable; + ZSTD_match_t *matches = seqStorePtr->matchTable; + const BYTE *inr; + U32 offset, rep[ZSTD_REP_NUM]; + + /* init */ + ctx->nextToUpdate3 = ctx->nextToUpdate; + ZSTD_rescaleFreqs(seqStorePtr, (const BYTE *)src, srcSize); + ip += (ip == prefixStart); + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + rep[i] = ctx->rep[i]; + } + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, match_num, last_pos, litlen, price; + U32 u, mlen, best_mlen, best_off, litLength; + memset(opt, 0, sizeof(ZSTD_optimal_t)); + last_pos = 0; + litlen = (U32)(ip - anchor); + + /* check repCode */ + { + U32 i, last_i = ZSTD_REP_CHECK + (ip == anchor); + for (i = (ip == anchor); i < last_i; i++) { + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i]; + if ((repCur > 0) && (repCur < (S32)(ip - prefixStart)) && + (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repCur, minMatch))) { + mlen = (U32)ZSTD_count(ip + minMatch, ip + minMatch - repCur, iend) + minMatch; + if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + best_off = i - (ip == anchor); + do { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */ + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, ip, iend, maxSearches, mls, matches, minMatch); + + if (!last_pos && !match_num) { + ip++; + continue; + } + + if (match_num && (matches[match_num - 1].len > sufficient_len || matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + + /* set prices using matches at position = 0 */ + best_mlen = (last_pos) ? last_pos : minMatch; + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? 
matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + while (mlen <= best_mlen) { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, matches[u].off, litlen, price); /* note : macro modifies last_pos */ + mlen++; + } + } + + if (last_pos < minMatch) { + ip++; + continue; + } + + /* initialize opt[0] */ + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + opt[0].rep[i] = rep[i]; + } + opt[0].mlen = 1; + opt[0].litlen = litlen; + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + inr = ip + cur; + + if (opt[cur - 1].mlen == 1) { + litlen = opt[cur - 1].litlen + 1; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - litlen); + } else + price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor); + } else { + litlen = 1; + price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - 1); + } + + if (cur > last_pos || price <= opt[cur].price) + SET_PRICE(cur, 1, 0, litlen, price); + + if (cur == last_pos) + break; + + if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */ + continue; + + mlen = opt[cur].mlen; + if (opt[cur].off > ZSTD_REP_MOVE_OPT) { + opt[cur].rep[2] = opt[cur - mlen].rep[1]; + opt[cur].rep[1] = opt[cur - mlen].rep[0]; + opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT; + } else { + opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur - mlen].rep[1] : opt[cur - mlen].rep[2]; + opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur - mlen].rep[0] : opt[cur - mlen].rep[1]; + opt[cur].rep[0] = + ((opt[cur].off == ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur - mlen].rep[0] - 1) : (opt[cur - mlen].rep[opt[cur].off]); + } + + best_mlen = minMatch; + { + U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1); + for (i = (opt[cur].mlen != 1); i < last_i; i++) { /* check rep */ + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? 
(opt[cur].rep[0] - 1) : opt[cur].rep[i]; + if ((repCur > 0) && (repCur < (S32)(inr - prefixStart)) && + (ZSTD_readMINMATCH(inr, minMatch) == ZSTD_readMINMATCH(inr - repCur, minMatch))) { + mlen = (U32)ZSTD_count(inr + minMatch, inr + minMatch - repCur, iend) + minMatch; + + if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + last_pos = cur + 1; + goto _storeSequence; + } + + best_off = i - (opt[cur].mlen != 1); + if (mlen > best_mlen) + best_mlen = mlen; + + do { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr - litlen, + best_off, mlen - MINMATCH, ultra); + } else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || price <= opt[cur + mlen].price) + SET_PRICE(cur + mlen, mlen, i, litlen, price); + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS(ctx, inr, iend, maxSearches, mls, matches, best_mlen); + + if (match_num > 0 && (matches[match_num - 1].len > sufficient_len || cur + matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + last_pos = cur + 1; + goto _storeSequence; + } + + /* set prices using matches at position = cur */ + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + + while (mlen <= best_mlen) { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip + cur - litlen, + matches[u].off - 1, mlen - MINMATCH, ultra); + else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off - 1, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || (price < opt[cur + mlen].price)) + SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price); + + mlen++; + } + } + } + + best_mlen = opt[last_pos].mlen; + best_off = opt[last_pos].off; + cur = last_pos - best_mlen; + + /* store sequence */ +_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ + opt[0].mlen = 1; + + while (1) { + mlen = opt[cur].mlen; + offset = opt[cur].off; + opt[cur].mlen = best_mlen; + opt[cur].off = best_off; + best_mlen = mlen; + best_off = offset; + if (mlen > cur) + break; + cur -= mlen; + } + + for (u = 0; u <= last_pos;) { + u += opt[u].mlen; + } + + for (cur = 0; cur < last_pos;) { + mlen = opt[cur].mlen; + if (mlen == 1) { + ip++; + cur++; + continue; + } + offset = opt[cur].off; + cur += mlen; + litLength = (U32)(ip - anchor); + + if (offset > ZSTD_REP_MOVE_OPT) { + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = offset - ZSTD_REP_MOVE_OPT; + offset--; + } else { + if (offset != 0) { + best_off = (offset == ZSTD_REP_MOVE_OPT) ? 
(rep[0] - 1) : (rep[offset]); + if (offset != 1) + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = best_off; + } + if (litLength == 0) + offset--; + } + + ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + anchor = ip = ip + mlen; + } + } /* for (cur=0; cur < last_pos; ) */ + + /* Save reps for next block */ + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + ctx->repToConfirm[i] = rep[i]; + } + + /* Last Literals */ + { + size_t const lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +FORCE_INLINE +void ZSTD_compressBlock_opt_extDict_generic(ZSTD_CCtx *ctx, const void *src, size_t srcSize, const int ultra) +{ + seqStore_t *seqStorePtr = &(ctx->seqStore); + const BYTE *const istart = (const BYTE *)src; + const BYTE *ip = istart; + const BYTE *anchor = istart; + const BYTE *const iend = istart + srcSize; + const BYTE *const ilimit = iend - 8; + const BYTE *const base = ctx->base; + const U32 lowestIndex = ctx->lowLimit; + const U32 dictLimit = ctx->dictLimit; + const BYTE *const prefixStart = base + dictLimit; + const BYTE *const dictBase = ctx->dictBase; + const BYTE *const dictEnd = dictBase + dictLimit; + + const U32 maxSearches = 1U << ctx->params.cParams.searchLog; + const U32 sufficient_len = ctx->params.cParams.targetLength; + const U32 mls = ctx->params.cParams.searchLength; + const U32 minMatch = (ctx->params.cParams.searchLength == 3) ? 3 : 4; + + ZSTD_optimal_t *opt = seqStorePtr->priceTable; + ZSTD_match_t *matches = seqStorePtr->matchTable; + const BYTE *inr; + + /* init */ + U32 offset, rep[ZSTD_REP_NUM]; + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + rep[i] = ctx->rep[i]; + } + + ctx->nextToUpdate3 = ctx->nextToUpdate; + ZSTD_rescaleFreqs(seqStorePtr, (const BYTE *)src, srcSize); + ip += (ip == prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, match_num, last_pos, litlen, price; + U32 u, mlen, best_mlen, best_off, litLength; + U32 curr = (U32)(ip - base); + memset(opt, 0, sizeof(ZSTD_optimal_t)); + last_pos = 0; + opt[0].litlen = (U32)(ip - anchor); + + /* check repCode */ + { + U32 i, last_i = ZSTD_REP_CHECK + (ip == anchor); + for (i = (ip == anchor); i < last_i; i++) { + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (rep[0] - 1) : rep[i]; + const U32 repIndex = (U32)(curr - repCur); + const BYTE *const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if ((repCur > 0 && repCur <= (S32)curr) && + (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) { + /* repcode detected we should take it */ + const BYTE *const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + mlen = (U32)ZSTD_count_2segments(ip + minMatch, repMatch + minMatch, iend, repEnd, prefixStart) + minMatch; + + if (mlen > sufficient_len || mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + + best_off = i - (ip == anchor); + litlen = opt[0].litlen; + do { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, i, litlen, price); /* note : macro modifies last_pos */ + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, ip, iend, maxSearches, mls, matches, minMatch); /* first search (depth 0) */ + + if (!last_pos && !match_num) { + ip++; + continue; + } + + { + U32 i; + for (i = 0; i < ZSTD_REP_NUM; i++) + opt[0].rep[i] = rep[i]; + } + opt[0].mlen = 1; + + if (match_num && (matches[match_num - 1].len > sufficient_len || matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + cur = 0; + last_pos = 1; + goto _storeSequence; + } + + best_mlen = (last_pos) ? last_pos : minMatch; + + /* set prices using matches at position = 0 */ + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + litlen = opt[0].litlen; + while (mlen <= best_mlen) { + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + if (mlen > last_pos || price < opt[mlen].price) + SET_PRICE(mlen, mlen, matches[u].off, litlen, price); + mlen++; + } + } + + if (last_pos < minMatch) { + ip++; + continue; + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + inr = ip + cur; + + if (opt[cur - 1].mlen == 1) { + litlen = opt[cur - 1].litlen + 1; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - litlen); + } else + price = ZSTD_getLiteralPrice(seqStorePtr, litlen, anchor); + } else { + litlen = 1; + price = opt[cur - 1].price + ZSTD_getLiteralPrice(seqStorePtr, litlen, inr - 1); + } + + if (cur > last_pos || price <= opt[cur].price) + SET_PRICE(cur, 1, 0, litlen, price); + + if (cur == last_pos) + break; + + if (inr > ilimit) /* last match must start at a minimum distance of 8 from oend */ + continue; + + mlen = opt[cur].mlen; + if (opt[cur].off > ZSTD_REP_MOVE_OPT) { + opt[cur].rep[2] = opt[cur - mlen].rep[1]; + opt[cur].rep[1] = opt[cur - mlen].rep[0]; + opt[cur].rep[0] = opt[cur].off - ZSTD_REP_MOVE_OPT; + } else { + opt[cur].rep[2] = (opt[cur].off > 1) ? opt[cur - mlen].rep[1] : opt[cur - mlen].rep[2]; + opt[cur].rep[1] = (opt[cur].off > 0) ? opt[cur - mlen].rep[0] : opt[cur - mlen].rep[1]; + opt[cur].rep[0] = + ((opt[cur].off == ZSTD_REP_MOVE_OPT) && (mlen != 1)) ? (opt[cur - mlen].rep[0] - 1) : (opt[cur - mlen].rep[opt[cur].off]); + } + + best_mlen = minMatch; + { + U32 i, last_i = ZSTD_REP_CHECK + (mlen != 1); + for (i = (mlen != 1); i < last_i; i++) { + const S32 repCur = (i == ZSTD_REP_MOVE_OPT) ? (opt[cur].rep[0] - 1) : opt[cur].rep[i]; + const U32 repIndex = (U32)(curr + cur - repCur); + const BYTE *const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE *const repMatch = repBase + repIndex; + if ((repCur > 0 && repCur <= (S32)(curr + cur)) && + (((U32)((dictLimit - 1) - repIndex) >= 3) & (repIndex > lowestIndex)) /* intentional overflow */ + && (ZSTD_readMINMATCH(inr, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch))) { + /* repcode detected */ + const BYTE *const repEnd = repIndex < dictLimit ? dictEnd : iend; + mlen = (U32)ZSTD_count_2segments(inr + minMatch, repMatch + minMatch, iend, repEnd, prefixStart) + minMatch; + + if (mlen > sufficient_len || cur + mlen >= ZSTD_OPT_NUM) { + best_mlen = mlen; + best_off = i; + last_pos = cur + 1; + goto _storeSequence; + } + + best_off = i - (opt[cur].mlen != 1); + if (mlen > best_mlen) + best_mlen = mlen; + + do { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) { + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, inr - litlen, + best_off, mlen - MINMATCH, ultra); + } else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, best_off, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, best_off, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || price <= opt[cur + mlen].price) + SET_PRICE(cur + mlen, mlen, i, litlen, price); + mlen--; + } while (mlen >= minMatch); + } + } + } + + match_num = ZSTD_BtGetAllMatches_selectMLS_extDict(ctx, inr, iend, maxSearches, mls, matches, minMatch); + + if (match_num > 0 && (matches[match_num - 1].len > sufficient_len || cur + matches[match_num - 1].len >= ZSTD_OPT_NUM)) { + best_mlen = matches[match_num - 1].len; + best_off = matches[match_num - 1].off; + last_pos = cur + 1; + goto _storeSequence; + } + + /* set prices using matches at position = cur */ + for (u = 0; u < match_num; u++) { + mlen = (u > 0) ? matches[u - 1].len + 1 : best_mlen; + best_mlen = matches[u].len; + + while (mlen <= best_mlen) { + if (opt[cur].mlen == 1) { + litlen = opt[cur].litlen; + if (cur > litlen) + price = opt[cur - litlen].price + ZSTD_getPrice(seqStorePtr, litlen, ip + cur - litlen, + matches[u].off - 1, mlen - MINMATCH, ultra); + else + price = ZSTD_getPrice(seqStorePtr, litlen, anchor, matches[u].off - 1, mlen - MINMATCH, ultra); + } else { + litlen = 0; + price = opt[cur].price + ZSTD_getPrice(seqStorePtr, 0, NULL, matches[u].off - 1, mlen - MINMATCH, ultra); + } + + if (cur + mlen > last_pos || (price < opt[cur + mlen].price)) + SET_PRICE(cur + mlen, mlen, matches[u].off, litlen, price); + + mlen++; + } + } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + best_mlen = opt[last_pos].mlen; + best_off = opt[last_pos].off; + cur = last_pos - best_mlen; + + /* store sequence */ +_storeSequence: /* cur, last_pos, best_mlen, best_off have to be set */ + opt[0].mlen = 1; + + while (1) { + mlen = opt[cur].mlen; + offset = opt[cur].off; + opt[cur].mlen = best_mlen; + opt[cur].off = best_off; + best_mlen = mlen; + best_off = offset; + if (mlen > cur) + break; + cur -= mlen; + } + + for (u = 0; u <= last_pos;) { + u += opt[u].mlen; + } + + for (cur = 0; cur < last_pos;) { + mlen = opt[cur].mlen; + if (mlen == 1) { + ip++; + cur++; + continue; + } + offset = opt[cur].off; + cur += mlen; + litLength = (U32)(ip - anchor); + + if (offset > ZSTD_REP_MOVE_OPT) { + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = offset - ZSTD_REP_MOVE_OPT; + offset--; + } else { + if (offset != 0) { + best_off = (offset == ZSTD_REP_MOVE_OPT) ? 
(rep[0] - 1) : (rep[offset]); + if (offset != 1) + rep[2] = rep[1]; + rep[1] = rep[0]; + rep[0] = best_off; + } + + if (litLength == 0) + offset--; + } + + ZSTD_updatePrice(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + ZSTD_storeSeq(seqStorePtr, litLength, anchor, offset, mlen - MINMATCH); + anchor = ip = ip + mlen; + } + } /* for (cur=0; cur < last_pos; ) */ + + /* Save reps for next block */ + { + int i; + for (i = 0; i < ZSTD_REP_NUM; i++) + ctx->repToConfirm[i] = rep[i]; + } + + /* Last Literals */ + { + size_t lastLLSize = iend - anchor; + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } +} + +#endif /* ZSTD_OPT_H_91842398743 */ -- cgit v1.2.3 From 5c1aab1dd5445ed8bdcdbb575abc1b0d7ee5b2e7 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Wed, 9 Aug 2017 19:39:02 -0700 Subject: btrfs: Add zstd support Add zstd compression and decompression support to BtrFS. zstd at its fastest level compresses almost as well as zlib, while offering much faster compression and decompression, approaching lzo speeds. I benchmarked btrfs with zstd compression against no compression, lzo compression, and zlib compression. I benchmarked two scenarios. Copying a set of files to btrfs, and then reading the files. Copying a tarball to btrfs, extracting it to btrfs, and then reading the extracted files. After every operation, I call `sync` and include the sync time. Between every pair of operations I unmount and remount the filesystem to avoid caching. The benchmark files can be found in the upstream zstd source repository under `contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}` [1] [2]. I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and a SSD. The first compression benchmark is copying 10 copies of the unzipped Silesia corpus [3] into a BtrFS filesystem mounted with `-o compress-force=Method`. The decompression benchmark times how long it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is measured by comparing the output of `df` and `du`. See the benchmark file [1] for details. I benchmarked multiple zstd compression levels, although the patch uses zstd level 1. | Method | Ratio | Compression MB/s | Decompression speed | |---------|-------|------------------|---------------------| | None | 0.99 | 504 | 686 | | lzo | 1.66 | 398 | 442 | | zlib | 2.58 | 65 | 241 | | zstd 1 | 2.57 | 260 | 383 | | zstd 3 | 2.71 | 174 | 408 | | zstd 6 | 2.87 | 70 | 398 | | zstd 9 | 2.92 | 43 | 406 | | zstd 12 | 2.93 | 21 | 408 | | zstd 15 | 3.01 | 11 | 354 | The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it measures the compression ratio, extracts the tar, and deletes the tar. Then it measures the compression ratio again, and `tar`s the extracted files into `/dev/null`. See the benchmark file [2] for details. 
| Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) | |--------|-----------|---------------|----------|------------|----------| | None | 0.97 | 0.78 | 0.981 | 5.501 | 8.807 | | lzo | 2.06 | 1.38 | 1.631 | 8.458 | 8.585 | | zlib | 3.40 | 1.86 | 7.750 | 21.544 | 11.744 | | zstd 1 | 3.57 | 1.85 | 2.579 | 11.479 | 9.389 | [1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh [3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia [4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz zstd source repository: https://github.com/facebook/zstd Signed-off-by: Nick Terrell Signed-off-by: Chris Mason --- fs/btrfs/Kconfig | 2 + fs/btrfs/Makefile | 2 +- fs/btrfs/compression.c | 1 + fs/btrfs/compression.h | 6 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 + fs/btrfs/ioctl.c | 6 +- fs/btrfs/props.c | 6 + fs/btrfs/super.c | 12 +- fs/btrfs/sysfs.c | 2 + fs/btrfs/zstd.c | 432 +++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 8 +- 12 files changed, 468 insertions(+), 12 deletions(-) create mode 100644 fs/btrfs/zstd.c (limited to 'include') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 80e9c18ea64f..a26c63b4ad68 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -6,6 +6,8 @@ config BTRFS_FS select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 128ce17a80b0..962a95aefb81 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o hash.o free-space-tree.o diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d2ef9ac2a630..4ff42d18a64d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -704,6 +704,7 @@ static struct { static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, + &btrfs_zstd_compress, }; void __init btrfs_init_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 87f6d3332163..2269e00854d8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -99,8 +99,9 @@ enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_TYPES = 2, - BTRFS_COMPRESS_LAST = 3, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_COMPRESS_TYPES = 3, + BTRFS_COMPRESS_LAST = 4, }; struct btrfs_compress_op { @@ -128,5 +129,6 @@ struct btrfs_compress_op { extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zstd_compress; #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f3eb7b17cac..845d77c097d6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -270,6 +270,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..04632f4de933 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2828,6 +2828,8 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa1b78cf25f6..b9963d94d727 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -327,8 +327,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (fs_info->compress_type == BTRFS_COMPRESS_LZO) comp = "lzo"; - else + else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB) comp = "zlib"; + else + comp = "zstd"; ret = btrfs_set_prop(inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) @@ -1466,6 +1468,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } ret = defrag_count; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 4b23ae5d0e5c..20631e9273a0 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -390,6 +390,8 @@ static int prop_compression_validate(const char *value, size_t len) return 0; else if (!strncmp("zlib", value, len)) return 0; + else if (!strncmp("zstd", value, len)) + return 0; return -EINVAL; } @@ -412,6 +414,8 @@ static int prop_compression_apply(struct inode *inode, type = BTRFS_COMPRESS_LZO; else if (!strncmp("zlib", value, len)) type = BTRFS_COMPRESS_ZLIB; + else if (!strncmp("zstd", value, len)) + type = BTRFS_COMPRESS_ZSTD; else return -EINVAL; @@ -429,6 +433,8 @@ static const char *prop_compression_extract(struct inode *inode) return "zlib"; case BTRFS_COMPRESS_LZO: return "lzo"; + case BTRFS_COMPRESS_ZSTD: + return "zstd"; } return NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6104b5..c370deadb790 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -513,6 +513,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; + } else if (strcmp(args[0].from, "zstd") == 0) { + compress_type = "zstd"; + info->compress_type = BTRFS_COMPRESS_ZSTD; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + btrfs_set_fs_incompat(info, COMPRESS_ZSTD); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); @@ -1227,8 +1235,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; - else + else if (info->compress_type == BTRFS_COMPRESS_LZO) compress_type = "lzo"; + else + compress_type = "zstd"; if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, 
",compress-force=%s", compress_type); else diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c2d5f3580b4c..2b6d37c09a81 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -200,6 +200,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); @@ -212,6 +213,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(compress_zstd), BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c new file mode 100644 index 000000000000..607ce47b483a --- /dev/null +++ b/fs/btrfs/zstd.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +#define ZSTD_BTRFS_MAX_WINDOWLOG 17 +#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_DEFAULT_LEVEL 3 + +static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +{ + ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, + src_len, 0); + + if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) + params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; + WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); + return params; +} + +struct workspace { + void *mem; + size_t size; + char *buf; + struct list_head list; +}; + +static void zstd_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->mem); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zstd_alloc_workspace(void) +{ + ZSTD_parameters params = + zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->size = max_t(size_t, + ZSTD_CStreamWorkspaceBound(params.cParams), + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!workspace->mem || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zstd_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zstd_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, + struct page **pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_CStream *stream; + int 
ret = 0; + int nr_pages = 0; + struct page *in_page = NULL; /* The current page to read */ + struct page *out_page = NULL; /* The current page to write to */ + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long len = *total_out; + const unsigned long nr_dest_pages = *out_pages; + unsigned long max_out = nr_dest_pages * PAGE_SIZE; + ZSTD_parameters params = zstd_get_btrfs_parameters(len); + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + /* Initialize the stream */ + stream = ZSTD_initCStream(params, len, workspace->mem, + workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initCStream failed\n"); + ret = -EIO; + goto out; + } + + /* map in the first page of input data */ + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + + + /* Allocate and map in the output buffer */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + + while (1) { + size_t ret2; + + ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_compressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + + /* Check to see if we are making it bigger */ + if (tot_in + in_buf.pos > 8192 && + tot_in + in_buf.pos < + tot_out + out_buf.pos) { + ret = -E2BIG; + goto out; + } + + /* We've reached the end of our output range */ + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + /* Check if we need more output space */ + if (out_buf.pos == out_buf.size) { + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + /* We've reached the end of the input */ + if (in_buf.pos >= len) { + tot_in += in_buf.pos; + break; + } + + /* Check if we need more input */ + if (in_buf.pos == in_buf.size) { + tot_in += PAGE_SIZE; + kunmap(in_page); + put_page(in_page); + + start += PAGE_SIZE; + len -= PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + } + } + while (1) { + size_t ret2; + + ret2 = ZSTD_endStream(stream, &out_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_endStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + if (ret2 == 0) { + tot_out += out_buf.pos; + break; + } + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + if (tot_out >= tot_in) { + ret = -E2BIG; + goto out; + } + + ret 
= 0; + *total_in = tot_in; + *total_out = tot_out; +out: + *out_pages = nr_pages; + /* Cleanup */ + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + if (out_page) + kunmap(out_page); + return ret; +} + +static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; + size_t srclen = cb->compressed_len; + ZSTD_DStream *stream; + int ret = 0; + unsigned long page_in_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long buf_start; + unsigned long total_out = 0; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_debug("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto done; + } + + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + while (1) { + size_t ret2; + + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto done; + } + buf_start = total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, + total_out, disk_start, orig_bio); + if (ret == 0) + break; + + if (in_buf.pos >= srclen) + break; + + /* Check if we've hit the end of a frame */ + if (ret2 == 0) + break; + + if (in_buf.pos == in_buf.size) { + kunmap(pages_in[page_in_index++]); + if (page_in_index >= total_pages_in) { + in_buf.src = NULL; + ret = -EIO; + goto done; + } + srclen -= PAGE_SIZE; + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + } + } + ret = 0; + zero_fill_bio(orig_bio); +done: + if (in_buf.src) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_DStream *stream; + int ret = 0; + size_t ret2; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + char *kaddr; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto finish; + } + + destlen = min_t(size_t, destlen, PAGE_SIZE); + + in_buf.src = data_in; + in_buf.pos = 0; + in_buf.size = srclen; + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen && in_buf.pos < in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: ZSTD_decompressStream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = 
total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + out_buf.size - buf_offset); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + kunmap_atomic(kaddr); + + pg_offset += bytes; + } + ret = 0; +finish: + if (pg_offset < destlen) { + kaddr = kmap_atomic(dest_page); + memset(kaddr + pg_offset, 0, destlen - pg_offset); + kunmap_atomic(kaddr); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zstd_compress = { + .alloc_workspace = zstd_alloc_workspace, + .free_workspace = zstd_free_workspace, + .compress_pages = zstd_compress_pages, + .decompress_bio = zstd_decompress_bio, + .decompress = zstd_decompress, +}; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9aa74f317747..378230c163d5 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -255,13 +255,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD (1ULL << 4) /* * older kernels tried to do bigger metadata blocks, but the -- cgit v1.2.3 From e01d1913b0d0817191418381a6fcebaa01abde2a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 9 Aug 2017 16:33:40 +0800 Subject: iommu: Add is_attach_deferred call-back to iommu-ops This new call-back will be used to check if the domain attach need be deferred for now. If yes, the domain attach/detach will return directly. 
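As an illustration, a driver-side implementation could look like the sketch below (my_iommu_is_attach_deferred, my_get_dev_data and the defer_attach flag are hypothetical names, not part of this patch; the callback only has to return true for as long as the attach should stay deferred):

	static bool my_iommu_is_attach_deferred(struct iommu_domain *domain,
						struct device *dev)
	{
		/* hypothetical lookup of per-device driver data */
		struct my_dev_data *data = my_get_dev_data(dev);

		/* defer the attach until the device issues its first DMA */
		return data && data->defer_attach;
	}

	static const struct iommu_ops my_iommu_ops = {
		/* ... other callbacks ... */
		.is_attach_deferred = my_iommu_is_attach_deferred,
	};

With such a callback in place, __iommu_attach_device() and __iommu_detach_device() below return early, and the real attach can happen later once the deferral condition has cleared.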
Signed-off-by: Baoquan He Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 8 ++++++++ include/linux/iommu.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3f6ea160afed..86581b115b92 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1283,6 +1283,10 @@ static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev) { int ret; + if ((domain->ops->is_attach_deferred != NULL) && + domain->ops->is_attach_deferred(domain, dev)) + return 0; + if (unlikely(domain->ops->attach_dev == NULL)) return -ENODEV; @@ -1324,6 +1328,10 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); static void __iommu_detach_device(struct iommu_domain *domain, struct device *dev) { + if ((domain->ops->is_attach_deferred != NULL) && + domain->ops->is_attach_deferred(domain, dev)) + return; + if (unlikely(domain->ops->detach_dev == NULL)) return; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2cb54adc4a33..63983c9e6c3a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -225,6 +225,7 @@ struct iommu_ops { u32 (*domain_get_windows)(struct iommu_domain *domain); int (*of_xlate)(struct device *dev, struct of_phandle_args *args); + bool (*is_attach_deferred)(struct iommu_domain *domain, struct device *dev); unsigned long pgsize_bitmap; }; -- cgit v1.2.3 From 42f87e71c3df12d8f29ec1bb7b47772ffaeaf1ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Aug 2017 14:44:28 +0200 Subject: iommu/iova: Add flush-queue data structures This patch adds the basic data-structures to implement flush-queues in the generic IOVA code. It also adds the initialization and destroy routines for these data structures. The initialization routine is designed so that the use of this feature is optional for the users of IOVA code. 
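As a usage sketch (my_iotlb_flush_all and my_entry_dtor are hypothetical driver functions, not part of this patch), an IOMMU driver would opt in right after setting up its IOVA domain:

	static void my_iotlb_flush_all(struct iova_domain *iovad)
	{
		/* issue a domain-wide IOMMU TLB flush */
	}

	static void my_entry_dtor(unsigned long data)
	{
		/* free driver-private data carried in a flush-queue entry */
	}

	init_iova_domain(iovad, granule, start_pfn, pfn_32bit);
	if (init_iova_flush_queue(iovad, my_iotlb_flush_all, my_entry_dtor))
		/* allocation failed: keep flushing synchronously */;

A driver that never calls init_iova_flush_queue() keeps the current behaviour, since iovad->fq then stays NULL.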
Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/iova.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 246f14c83944..b9f6ce02a1e1 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -50,10 +50,48 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule, iovad->granule = granule; iovad->start_pfn = start_pfn; iovad->dma_32bit_pfn = pfn_32bit + 1; + iovad->flush_cb = NULL; + iovad->fq = NULL; init_iova_rcaches(iovad); } EXPORT_SYMBOL_GPL(init_iova_domain); +static void free_iova_flush_queue(struct iova_domain *iovad) +{ + if (!iovad->fq) + return; + + free_percpu(iovad->fq); + + iovad->fq = NULL; + iovad->flush_cb = NULL; + iovad->entry_dtor = NULL; +} + +int init_iova_flush_queue(struct iova_domain *iovad, + iova_flush_cb flush_cb, iova_entry_dtor entry_dtor) +{ + int cpu; + + iovad->fq = alloc_percpu(struct iova_fq); + if (!iovad->fq) + return -ENOMEM; + + iovad->flush_cb = flush_cb; + iovad->entry_dtor = entry_dtor; + + for_each_possible_cpu(cpu) { + struct iova_fq *fq; + + fq = per_cpu_ptr(iovad->fq, cpu); + fq->head = 0; + fq->tail = 0; + } + + return 0; +} +EXPORT_SYMBOL_GPL(init_iova_flush_queue); + static struct rb_node * __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn) { @@ -433,6 +471,7 @@ void put_iova_domain(struct iova_domain *iovad) struct rb_node *node; unsigned long flags; + free_iova_flush_queue(iovad); free_iova_rcaches(iovad); spin_lock_irqsave(&iovad->iova_rbtree_lock, flags); node = rb_first(&iovad->rbroot); diff --git a/include/linux/iova.h b/include/linux/iova.h index e0a892ae45c0..8aa10896150e 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -36,6 +36,30 @@ struct iova_rcache { struct iova_cpu_rcache __percpu *cpu_rcaches; }; +struct iova_domain; + +/* Call-Back from IOVA code into IOMMU drivers */ +typedef void (* iova_flush_cb)(struct iova_domain *domain); + +/* Destructor for per-entry data */ +typedef void (* iova_entry_dtor)(unsigned long data); + +/* Number of entries per Flush Queue */ +#define IOVA_FQ_SIZE 256 + +/* Flush Queue entry for deferred flushing */ +struct iova_fq_entry { + unsigned long iova_pfn; + unsigned long pages; + unsigned long data; +}; + +/* Per-CPU Flush Queue structure */ +struct iova_fq { + struct iova_fq_entry entries[IOVA_FQ_SIZE]; + unsigned head, tail; +}; + /* holds all the iova translations for a domain */ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ @@ -45,6 +69,14 @@ struct iova_domain { unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; struct iova_rcache rcaches[IOVA_RANGE_CACHE_MAX_SIZE]; /* IOVA range caches */ + + iova_flush_cb flush_cb; /* Call-Back function to flush IOMMU + TLBs */ + + iova_entry_dtor entry_dtor; /* IOMMU driver specific destructor for + iova entry */ + + struct iova_fq __percpu *fq; /* Flush Queue */ }; static inline unsigned long iova_size(struct iova *iova) @@ -102,6 +134,8 @@ struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, unsigned long start_pfn, unsigned long pfn_32bit); +int init_iova_flush_queue(struct iova_domain *iovad, + iova_flush_cb flush_cb, iova_entry_dtor entry_dtor); struct iova *find_iova(struct
iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, @@ -174,6 +208,13 @@ static inline void init_iova_domain(struct iova_domain *iovad, { } +static inline int init_iova_flush_queue(struct iova_domain *iovad, + iova_flush_cb flush_cb, + iova_entry_dtor entry_dtor) +{ + return -ENODEV; +} + static inline struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) { -- cgit v1.2.3 From 1928210107edd4fa786199fef6b875d3af3bef88 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Aug 2017 15:49:44 +0200 Subject: iommu/iova: Implement Flush-Queue ring buffer Add a function to add entries to the Flush-Queue ring buffer. If the buffer is full, call the flush-callback and free the entries. Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/iova.h | 9 ++++++ 2 files changed, 89 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index b9f6ce02a1e1..e5c9a7ae6088 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -32,6 +32,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad, unsigned long limit_pfn); static void init_iova_rcaches(struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); +static void fq_destroy_all_entries(struct iova_domain *iovad); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, @@ -61,6 +62,7 @@ static void free_iova_flush_queue(struct iova_domain *iovad) if (!iovad->fq) return; + fq_destroy_all_entries(iovad); free_percpu(iovad->fq); iovad->fq = NULL; @@ -461,6 +463,84 @@ free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size) } EXPORT_SYMBOL_GPL(free_iova_fast); +#define fq_ring_for_each(i, fq) \ + for ((i) = (fq)->head; (i) != (fq)->tail; (i) = ((i) + 1) % IOVA_FQ_SIZE) + +static inline bool fq_full(struct iova_fq *fq) +{ + return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head); +} + +static inline unsigned fq_ring_add(struct iova_fq *fq) +{ + unsigned idx = fq->tail; + + fq->tail = (idx + 1) % IOVA_FQ_SIZE; + + return idx; +} + +static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq) +{ + unsigned idx; + + fq_ring_for_each(idx, fq) { + + if (iovad->entry_dtor) + iovad->entry_dtor(fq->entries[idx].data); + + free_iova_fast(iovad, + fq->entries[idx].iova_pfn, + fq->entries[idx].pages); + } + + fq->head = 0; + fq->tail = 0; +} + +static void fq_destroy_all_entries(struct iova_domain *iovad) +{ + int cpu; + + /* + * This code runs when the iova_domain is being destroyed, so don't + * bother to free iovas, just call the entry_dtor on all remaining + * entries. + */ + if (!iovad->entry_dtor) + return; + + for_each_possible_cpu(cpu) { + struct iova_fq *fq = per_cpu_ptr(iovad->fq, cpu); + int idx; + + fq_ring_for_each(idx, fq) + iovad->entry_dtor(fq->entries[idx].data); + } +} + +void queue_iova(struct iova_domain *iovad, + unsigned long pfn, unsigned long pages, + unsigned long data) +{ + struct iova_fq *fq = get_cpu_ptr(iovad->fq); + unsigned idx; + + if (fq_full(fq)) { + iovad->flush_cb(iovad); + fq_ring_free(iovad, fq); + } + + idx = fq_ring_add(fq); + + fq->entries[idx].iova_pfn = pfn; + fq->entries[idx].pages = pages; + fq->entries[idx].data = data; + + put_cpu_ptr(iovad->fq); +} +EXPORT_SYMBOL_GPL(queue_iova); + /** * put_iova_domain - destroys the iova domain * @iovad: - iova domain in question.
diff --git a/include/linux/iova.h b/include/linux/iova.h index 8aa10896150e..1ae85248ec50 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -127,6 +127,9 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, bool size_aligned); void free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size); +void queue_iova(struct iova_domain *iovad, + unsigned long pfn, unsigned long pages, + unsigned long data); unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn); struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, @@ -182,6 +185,12 @@ static inline void free_iova_fast(struct iova_domain *iovad, { } +static inline void queue_iova(struct iova_domain *iovad, + unsigned long pfn, unsigned long pages, + unsigned long data) +{ +} + static inline unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn) -- cgit v1.2.3 From fb418dab8a4f01dde0c025d15145c589ec02796b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Aug 2017 16:14:59 +0200 Subject: iommu/iova: Add flush counters to Flush-Queue implementation There are two counters: * fq_flush_start_cnt - Increased when a TLB flush is started. * fq_flush_finish_cnt - Increased when a TLB flush is finished. The fq_flush_start_cnt is assigned to every Flush-Queue entry on its creation. When freeing entries from the Flush-Queue, the value in the entry is compared to the fq_flush_finish_cnt. The entry can only be freed when its value is less than the value of fq_flush_finish_cnt. The reason for these counters is to take advantage of IOMMU TLB flushes that happened on other CPUs. These already flushed the TLB for Flush-Queue entries on other CPUs so that they can already be freed without flushing the TLB again. This makes it less likely that the Flush-Queue is full and saves IOMMU TLB flushes. Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 27 ++++++++++++++++++++++++--- include/linux/iova.h | 8 ++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index e5c9a7ae6088..47b144e417ad 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -75,6 +75,9 @@ int init_iova_flush_queue(struct iova_domain *iovad, { int cpu; + atomic64_set(&iovad->fq_flush_start_cnt, 0); + atomic64_set(&iovad->fq_flush_finish_cnt, 0); + iovad->fq = alloc_percpu(struct iova_fq); if (!iovad->fq) return -ENOMEM; @@ -482,20 +485,30 @@ static inline unsigned fq_ring_add(struct iova_fq *fq) static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq) { + u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt); unsigned idx; fq_ring_for_each(idx, fq) { + if (fq->entries[idx].counter >= counter) + break; + if (iovad->entry_dtor) iovad->entry_dtor(fq->entries[idx].data); free_iova_fast(iovad, fq->entries[idx].iova_pfn, fq->entries[idx].pages); + + fq->head = (fq->head + 1) % IOVA_FQ_SIZE; + } +} - fq->head = 0; - fq->tail = 0; +static void iova_domain_flush(struct iova_domain *iovad) +{ + atomic64_inc(&iovad->fq_flush_start_cnt); + iovad->flush_cb(iovad); + atomic64_inc(&iovad->fq_flush_finish_cnt); } static void fq_destroy_all_entries(struct iova_domain *iovad) @@ -526,8 +539,15 @@ void queue_iova(struct iova_domain *iovad, struct iova_fq *fq = get_cpu_ptr(iovad->fq); unsigned idx; + /* + * First remove all entries from the flush queue that have already been + * flushed out on another CPU.
This makes the fq_full() check below less + * likely to be true. + */ + fq_ring_free(iovad, fq); + if (fq_full(fq)) { - iovad->flush_cb(iovad); + iova_domain_flush(iovad); fq_ring_free(iovad, fq); } @@ -536,6 +556,7 @@ void queue_iova(struct iova_domain *iovad, fq->entries[idx].iova_pfn = pfn; fq->entries[idx].pages = pages; fq->entries[idx].data = data; + fq->entries[idx].counter = atomic64_read(&iovad->fq_flush_start_cnt); put_cpu_ptr(iovad->fq); } diff --git a/include/linux/iova.h b/include/linux/iova.h index 1ae85248ec50..985b8008999e 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -14,6 +14,7 @@ #include #include #include +#include #include /* iova structure */ @@ -52,6 +53,7 @@ struct iova_fq_entry { unsigned long iova_pfn; unsigned long pages; unsigned long data; + u64 counter; /* Flush counter when this entry was added */ }; /* Per-CPU Flush Queue structure */ @@ -77,6 +79,12 @@ struct iova_domain { iova entry */ struct iova_fq __percpu *fq; /* Flush Queue */ + + atomic64_t fq_flush_start_cnt; /* Number of TLB flushes that + have been started */ + + atomic64_t fq_flush_finish_cnt; /* Number of TLB flushes that + have been finished */ }; static inline unsigned long iova_size(struct iova *iova) -- cgit v1.2.3 From 8109c2a2f8463852dddd6a1c3fcf262047c0c124 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Aug 2017 16:31:17 +0200 Subject: iommu/iova: Add locking to Flush-Queues The lock is taken from the same CPU most of the time. But having it makes it possible to flush the queue from another CPU as well, if necessary. This will be used by a timer to regularly flush any pending IOVAs from the Flush-Queues. Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 11 +++++++++++ include/linux/iova.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 47b144e417ad..749d39533e0b 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -91,6 +91,8 @@ int init_iova_flush_queue(struct iova_domain *iovad, fq = per_cpu_ptr(iovad->fq, cpu); fq->head = 0; fq->tail = 0; + + spin_lock_init(&fq->lock); } return 0; @@ -471,6 +473,7 @@ EXPORT_SYMBOL_GPL(free_iova_fast); static inline bool fq_full(struct iova_fq *fq) { + assert_spin_locked(&fq->lock); return (((fq->tail + 1) % IOVA_FQ_SIZE) == fq->head); } @@ -478,6 +481,8 @@ static inline unsigned fq_ring_add(struct iova_fq *fq) { unsigned idx = fq->tail; + assert_spin_locked(&fq->lock); + fq->tail = (idx + 1) % IOVA_FQ_SIZE; return idx; @@ -488,6 +493,8 @@ static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq) { u64 counter = atomic64_read(&iovad->fq_flush_finish_cnt); unsigned idx; + assert_spin_locked(&fq->lock); + fq_ring_for_each(idx, fq) { if (fq->entries[idx].counter >= counter) @@ -537,8 +544,11 @@ void queue_iova(struct iova_domain *iovad, unsigned long data) { struct iova_fq *fq = get_cpu_ptr(iovad->fq); + unsigned long flags; unsigned idx; + spin_lock_irqsave(&fq->lock, flags); + /* * First remove all entries from the flush queue that have already been * flushed out on another CPU.
This makes the fq_full() check below less @@ -558,6 +568,7 @@ void queue_iova(struct iova_domain *iovad, fq->entries[idx].data = data; fq->entries[idx].counter = atomic64_read(&iovad->fq_flush_start_cnt); + spin_unlock_irqrestore(&fq->lock, flags); put_cpu_ptr(iovad->fq); } EXPORT_SYMBOL_GPL(queue_iova); diff --git a/include/linux/iova.h b/include/linux/iova.h index 985b8008999e..913a690cd4b0 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -60,6 +60,7 @@ struct iova_fq_entry { struct iova_fq { struct iova_fq_entry entries[IOVA_FQ_SIZE]; unsigned head, tail; + spinlock_t lock; }; /* holds all the iova translations for a domain */ -- cgit v1.2.3 From 9a005a800ae817c2c90ef117d7cd77614d866777 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Aug 2017 16:58:18 +0200 Subject: iommu/iova: Add flush timer Add a timer to flush entries from the Flush-Queues every 10ms. This makes sure that no stale TLB entries remain for too long after an IOVA has been unmapped. Signed-off-by: Joerg Roedel --- drivers/iommu/iova.c | 32 ++++++++++++++++++++++++++++++++ include/linux/iova.h | 8 ++++++++ 2 files changed, 40 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 749d39533e0b..33edfa794ae9 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -33,6 +33,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad, static void init_iova_rcaches(struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); static void fq_destroy_all_entries(struct iova_domain *iovad); +static void fq_flush_timeout(unsigned long data); void init_iova_domain(struct iova_domain *iovad, unsigned long granule, @@ -62,7 +63,11 @@ static void free_iova_flush_queue(struct iova_domain *iovad) if (!iovad->fq) return; + if (timer_pending(&iovad->fq_timer)) + del_timer(&iovad->fq_timer); + fq_destroy_all_entries(iovad); + free_percpu(iovad->fq); iovad->fq = NULL; @@ -95,6 +100,9 @@ int init_iova_flush_queue(struct iova_domain *iovad, spin_lock_init(&fq->lock); } + setup_timer(&iovad->fq_timer, fq_flush_timeout, (unsigned long)iovad); + atomic_set(&iovad->fq_timer_on, 0); + return 0; } EXPORT_SYMBOL_GPL(init_iova_flush_queue); @@ -539,6 +547,25 @@ static void fq_destroy_all_entries(struct iova_domain *iovad) } } +static void fq_flush_timeout(unsigned long data) +{ + struct iova_domain *iovad = (struct iova_domain *)data; + int cpu; + + atomic_set(&iovad->fq_timer_on, 0); + iova_domain_flush(iovad); + + for_each_possible_cpu(cpu) { + unsigned long flags; + struct iova_fq *fq; + + fq = per_cpu_ptr(iovad->fq, cpu); + spin_lock_irqsave(&fq->lock, flags); + fq_ring_free(iovad, fq); + spin_unlock_irqrestore(&fq->lock, flags); + } +} + void queue_iova(struct iova_domain *iovad, unsigned long pfn, unsigned long pages, unsigned long data) @@ -569,6 +596,11 @@ void queue_iova(struct iova_domain *iovad, fq->entries[idx].counter = atomic64_read(&iovad->fq_flush_start_cnt); spin_unlock_irqrestore(&fq->lock, flags); + + if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0) + mod_timer(&iovad->fq_timer, + jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); + put_cpu_ptr(iovad->fq); } EXPORT_SYMBOL_GPL(queue_iova); diff --git a/include/linux/iova.h b/include/linux/iova.h index 913a690cd4b0..d179b9bf7814 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -48,6 +48,9 @@ typedef void (* iova_entry_dtor)(unsigned long data); /* Number of entries per Flush Queue */ #define IOVA_FQ_SIZE 256 +/* Timeout (in ms) after which entries are flushed 
from the Flush-Queue */ +#define IOVA_FQ_TIMEOUT 10 + /* Flush Queue entry for deferred flushing */ struct iova_fq_entry { unsigned long iova_pfn; @@ -86,6 +89,11 @@ struct iova_domain { atomic64_t fq_flush_finish_cnt; /* Number of TLB flushes that have been finished */ + + struct timer_list fq_timer; /* Timer to regularly empty the + flush-queues */ + atomic_t fq_timer_on; /* 1 when timer is active, 0 + when not */ }; static inline unsigned long iova_size(struct iova *iova) -- cgit v1.2.3 From 48ac3c18cc62d4a23d5dc5c59f8720589d0de14b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 14 Jul 2017 12:23:09 +0100 Subject: fork: allow arch-override of VMAP stack alignment In some cases, an architecture might wish its stacks to be aligned to a boundary larger than THREAD_SIZE. For example, using an alignment of double THREAD_SIZE can allow for stack overflows smaller than THREAD_SIZE to be detected by checking a single bit of the stack pointer. This patch allows architectures to override the alignment of VMAP'd stacks, by defining THREAD_ALIGN. Where not defined, this defaults to THREAD_SIZE, as is the case today. Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: linux-kernel@vger.kernel.org --- include/linux/thread_info.h | 4 ++++ kernel/fork.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 250a27614328..905d769d8ddc 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,6 +38,10 @@ enum { #ifdef __KERNEL__ +#ifndef THREAD_ALIGN +#define THREAD_ALIGN THREAD_SIZE +#endif + #ifdef CONFIG_DEBUG_STACK_USAGE # define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ __GFP_ZERO) diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..f12882a2323b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -88,6 +88,7 @@ #include #include #include +#include #include #include @@ -217,7 +218,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, -- cgit v1.2.3 From b8c502b81e3f899c6488967dec61eed0f5907db3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 7 Aug 2017 23:12:46 +0800 Subject: f2fs: fix potential overflow when adjusting GC cycle When comparing signed and unsigned variables, the compiler converts the signed value to an unsigned one; because of this, {in,de}crease_sleep_time may return an overflowed result.
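The pitfall is easy to reproduce in a small sketch (illustrative values only; on 32-bit kernels, long and unsigned int have the same rank, so the old code hit the same conversion):

	long wait = 100;		/* old signed type of the wait time */
	unsigned long min_time = 30000;	/* unsigned, like gc_th->min_sleep_time */

	wait -= min_time;		/* wait is now -29900 */
	if (wait <= min_time)		/* wait converts to unsigned: huge value, test is false */
		wait = min_time;
	/* wait stays negative and is later interpreted as a huge sleep time */

The fix below therefore keeps the wait time unsigned and performs the range checks in a wider signed type (long long) before adjusting the value.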
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- fs/f2fs/gc.h | 23 +++++++++++++++-------- include/trace/events/f2fs.h | 6 +++--- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8da7c14a9d29..e60480f71bb5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,7 +28,7 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 57a9000ce3af..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -69,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 6f77a2755abb..db5c98256147 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -543,14 +543,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -562,7 +562,7 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, -- cgit v1.2.3 From b352baf15b66c5799018104d38f9eb77c7445a34 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 15 Aug 2017 12:02:16 -0700 Subject: PCI: Move enum pci_interrupt_pin to linux/pci.h We currently have a definition of enum pci_interrupt_pin in a header specific to PCI endpoints - linux/pci-epf.h. In order to allow for use of this enum from PCI host code in a future commit, move its definition to linux/pci.h & include that from linux/pci-epf.h. Additionally we add a PCI_NUM_INTX macro which indicates the number of PCI INTx interrupts, and will be used alongside enum pci_interrupt_pin in further patches. 
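As a rough usage sketch (not from the patch), the 1-based enum values pair with PCI_NUM_INTX when indexing a per-pin table; intx_pin_to_index() below is a hypothetical helper, and the enum and macro are reproduced from the patch only to keep the example self-contained:

/* Pin values are 1-based (PCI_INTERRUPT_INTA == 1), so a per-pin table
 * of PCI_NUM_INTX entries is indexed with pin - 1.
 */
enum pci_interrupt_pin {
        PCI_INTERRUPT_UNKNOWN,
        PCI_INTERRUPT_INTA,
        PCI_INTERRUPT_INTB,
        PCI_INTERRUPT_INTC,
        PCI_INTERRUPT_INTD,
};

#define PCI_NUM_INTX 4

static int intx_pin_to_index(enum pci_interrupt_pin pin)
{
        if (pin < PCI_INTERRUPT_INTA || pin > PCI_INTERRUPT_INTD)
                return -1;      /* unknown or unassigned pin */
        return pin - 1;         /* INTA..INTD map to indices 0..3 */
}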
Signed-off-by: Paul Burton [bhelgaas: move enum pci_interrupt_pin outside #ifdef CONFIG_PCI] Signed-off-by: Bjorn Helgaas --- include/linux/pci-epf.h | 9 +-------- include/linux/pci.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 0d529cb90143..bc8750688348 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -14,17 +14,10 @@ #include #include +#include struct pci_epf; -enum pci_interrupt_pin { - PCI_INTERRUPT_UNKNOWN, - PCI_INTERRUPT_INTA, - PCI_INTERRUPT_INTB, - PCI_INTERRUPT_INTC, - PCI_INTERRUPT_INTD, -}; - enum pci_barno { BAR_0, BAR_1, diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..bb9c367c85f0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -102,6 +102,28 @@ enum { DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES, }; +/** + * enum pci_interrupt_pin - PCI INTx interrupt values + * @PCI_INTERRUPT_UNKNOWN: Unknown or unassigned interrupt + * @PCI_INTERRUPT_INTA: PCI INTA pin + * @PCI_INTERRUPT_INTB: PCI INTB pin + * @PCI_INTERRUPT_INTC: PCI INTC pin + * @PCI_INTERRUPT_INTD: PCI INTD pin + * + * Corresponds to values for legacy PCI INTx interrupts, as can be found in the + * PCI_INTERRUPT_PIN register. + */ +enum pci_interrupt_pin { + PCI_INTERRUPT_UNKNOWN, + PCI_INTERRUPT_INTA, + PCI_INTERRUPT_INTB, + PCI_INTERRUPT_INTC, + PCI_INTERRUPT_INTD, +}; + +/* The number of legacy PCI INTx interrupts */ +#define PCI_NUM_INTX 4 + /* * pci_power_t values must match the bits in the Capabilities PME_Support * and Control/Status PowerState fields in the Power Management capability. -- cgit v1.2.3 From dc503a8ad98474ea0073a1c5c4d9f18cb8dd0dbf Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 15 Aug 2017 20:34:35 +0100 Subject: bpf/verifier: track liveness for pruning State of a register doesn't matter if it wasn't read in reaching an exit; a write screens off all reads downstream of it from all explored_states upstream of it. This allows us to prune many more branches; here are some processed insn counts for some Cilium programs:

Program                    before   after
bpf_lb_opt_-DLB_L3.o         6515    3361
bpf_lb_opt_-DLB_L4.o         8976    5176
bpf_lb_opt_-DUNKNOWN.o       2960    1137
bpf_lxc_opt_-DDROP_ALL.o    95412   48537
bpf_lxc_opt_-DUNKNOWN.o    141706   78718
bpf_netdev.o                24251   17995
bpf_overlay.o               10999    9385

The runtime is also improved; here are 'time' results in ms:

Program                    before   after
bpf_lb_opt_-DLB_L3.o           24       6
bpf_lb_opt_-DLB_L4.o           26      11
bpf_lb_opt_-DUNKNOWN.o         11       2
bpf_lxc_opt_-DDROP_ALL.o     1288     139
bpf_lxc_opt_-DUNKNOWN.o      1768     234
bpf_netdev.o                   62      31
bpf_overlay.o                  15      13

Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 11 ++- kernel/bpf/verifier.c | 189 +++++++++++++++++++++++++++++++++---------- 2 files changed, 156 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c61c3033522e..91d07efed2ba 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -21,6 +21,12 @@ */ #define BPF_MAX_VAR_SIZ INT_MAX +enum bpf_reg_liveness { + REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ + REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ + REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ +}; + struct bpf_reg_state { enum bpf_reg_type type; union { @@ -40,7 +46,7 @@ struct bpf_reg_state { * came from, when one is tested for != NULL.
*/ u32 id; - /* These five fields must be last. See states_equal() */ + /* Ordering of fields matters. See states_equal() */ /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, this represents the variable part of the offset @@ -57,6 +63,8 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ + /* This field must be last, for states_equal() reasons. */ + enum bpf_reg_liveness live; }; enum bpf_stack_slot_type { @@ -74,6 +82,7 @@ struct bpf_verifier_state { struct bpf_reg_state regs[MAX_BPF_REG]; u8 stack_slot_type[MAX_BPF_STACK]; struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; + struct bpf_verifier_state *parent; }; /* linked list of verifier states used to prune search */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ecc590e01a1d..7dd96d064be1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -629,8 +629,10 @@ static void init_reg_state(struct bpf_reg_state *regs) { int i; - for (i = 0; i < MAX_BPF_REG; i++) + for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(regs, i); + regs[i].live = REG_LIVE_NONE; + } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; @@ -647,9 +649,26 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, +static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +{ + struct bpf_verifier_state *parent = state->parent; + + while (parent) { + /* if read wasn't screened by an earlier write ... */ + if (state->regs[regno].live & REG_LIVE_WRITTEN) + break; + /* ... then we depend on parent's value */ + parent->regs[regno].live |= REG_LIVE_READ; + state = parent; + parent = state->parent; + } +} + +static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { + struct bpf_reg_state *regs = env->cur_state.regs; + if (regno >= MAX_BPF_REG) { verbose("R%d is invalid\n", regno); return -EINVAL; @@ -661,12 +680,14 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, verbose("R%d !read_ok\n", regno); return -EACCES; } + mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose("frame pointer is read only\n"); return -EACCES; } + regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) mark_reg_unknown(regs, regno); } @@ -695,7 +716,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) static int check_stack_write(struct bpf_verifier_state *state, int off, int size, int value_regno) { - int i; + int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ @@ -710,15 +731,14 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, } /* save register state */ - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - state->regs[value_regno]; + state->spilled_regs[spi] = state->regs[value_regno]; + state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = - (struct bpf_reg_state) {}; + state->spilled_regs[spi] = (struct bpf_reg_state) {}; for 
(i = 0; i < size; i++) state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; @@ -726,11 +746,26 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, return 0; } +static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +{ + struct bpf_verifier_state *parent = state->parent; + + while (parent) { + /* if read wasn't screened by an earlier write ... */ + if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + break; + /* ... then we depend on parent's value */ + parent->spilled_regs[slot].live |= REG_LIVE_READ; + state = parent; + parent = state->parent; + } +} + static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; - int i; + int i, spi; slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; @@ -746,10 +781,13 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } } - if (value_regno >= 0) + spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + + if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = - state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; + state->regs[value_regno] = state->spilled_regs[spi]; + mark_stack_slot_read(state, spi); + } return 0; } else { for (i = 0; i < size; i++) { @@ -1167,7 +1205,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; int err; if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || @@ -1177,12 +1214,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -1297,10 +1334,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_DONTCARE) return 0; - if (type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); - return -EACCES; - } + err = check_reg_arg(env, regno, SRC_OP); + if (err) + return err; if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { @@ -1639,10 +1675,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) } /* reset caller saved regs */ - for (i = 0; i < CALLER_SAVED_REGS; i++) + for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } - /* update return register */ + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(regs, BPF_REG_0); @@ -2250,7 +2288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check src operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -2261,7 +2299,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); + err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; @@ -2274,7 +2312,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check src operand */ - err = check_reg_arg(regs, 
insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } else { @@ -2285,7 +2323,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); + err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; @@ -2328,7 +2366,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } else { @@ -2339,7 +2377,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -2360,7 +2398,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -2717,7 +2755,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; @@ -2734,7 +2772,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -2851,7 +2889,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - err = check_reg_arg(regs, insn->dst_reg, DST_OP); + err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) return err; @@ -2917,7 +2955,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check whether implicit source operand (register R6) is readable */ - err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + err = check_reg_arg(env, BPF_REG_6, SRC_OP); if (err) return err; @@ -2928,17 +2966,20 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (mode == BPF_IND) { /* check explicit source operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; } /* reset caller saved regs to unreadable */ - for (i = 0; i < CALLER_SAVED_REGS; i++) + for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } /* mark destination R0 register as readable, since it contains - * the value fetched from the packet + * the value fetched from the packet. + * Already marked as written above. */ mark_reg_unknown(regs, BPF_REG_0); return 0; @@ -3194,7 +3235,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, bool varlen_map_access, struct idpair *idmap) { - if (memcmp(rold, rcur, sizeof(*rold)) == 0) + if (!(rold->live & REG_LIVE_READ)) + /* explored state didn't use this */ + return true; + + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) return true; if (rold->type == NOT_INIT) @@ -3372,10 +3417,56 @@ out_free: return ret; } +static bool do_propagate_liveness(const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent) +{ + bool touched = false; /* any changes made? 
*/ + int i; + + if (!parent) + return touched; + /* Propagate read liveness of registers... */ + BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); + /* We don't need to worry about FP liveness because it's read-only */ + for (i = 0; i < BPF_REG_FP; i++) { + if (parent->regs[i].live & REG_LIVE_READ) + continue; + if (state->regs[i].live == REG_LIVE_READ) { + parent->regs[i].live |= REG_LIVE_READ; + touched = true; + } + } + /* ... and stack slots */ + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { + if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + continue; + if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + continue; + if (parent->spilled_regs[i].live & REG_LIVE_READ) + continue; + if (state->spilled_regs[i].live == REG_LIVE_READ) { + parent->regs[i].live |= REG_LIVE_READ; + touched = true; + } + } + return touched; +} + +static void propagate_liveness(const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent) +{ + while (do_propagate_liveness(state, parent)) { + /* Something changed, so we need to feed those changes onward */ + state = parent; + parent = state->parent; + } +} + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; + int i; sl = env->explored_states[insn_idx]; if (!sl) @@ -3385,11 +3476,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) + if (states_equal(env, &sl->state, &env->cur_state)) { /* reached equivalent register/stack state, - * prune the search + * prune the search. + * Registers read by the continuation are read by us. */ + propagate_liveness(&sl->state, &env->cur_state); return 1; + } sl = sl->next; } @@ -3407,6 +3501,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; + /* connect new state to parentage chain */ + env->cur_state.parent = &new_sl->state; + /* clear liveness marks in current state */ + for (i = 0; i < BPF_REG_FP; i++) + env->cur_state.regs[i].live = REG_LIVE_NONE; + for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) + if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) + env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; return 0; } @@ -3430,6 +3532,7 @@ static int do_check(struct bpf_verifier_env *env) bool do_print_state = false; init_reg_state(regs); + state->parent = NULL; insn_idx = 0; env->varlen_map_value_access = false; for (;;) { @@ -3500,11 +3603,11 @@ static int do_check(struct bpf_verifier_env *env) /* check for reserved fields is already done */ /* check src operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; - err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; @@ -3554,11 +3657,11 @@ static int do_check(struct bpf_verifier_env *env) } /* check src1 operand */ - err = check_reg_arg(regs, insn->src_reg, SRC_OP); + err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -3589,7 +3692,7 @@ static int do_check(struct bpf_verifier_env *env) return 
-EINVAL; } /* check src operand */ - err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; @@ -3643,7 +3746,7 @@ static int do_check(struct bpf_verifier_env *env) * of bpf_exit, which means that program wrote * something into it earlier */ - err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + err = check_reg_arg(env, BPF_REG_0, SRC_OP); if (err) return err; -- cgit v1.2.3 From fe4007999599c02598c17b643e8de43e487d48e8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Aug 2017 09:09:49 +0200 Subject: ipv6: fib: Provide offload indication using nexthop flags IPv6 routes currently lack nexthop flags as in IPv4. This has several implications. In the forwarding path, it requires us to check the carrier state of the nexthop device and potentially ignore a linkdown route, instead of checking for RTNH_F_LINKDOWN. It also requires capable drivers to use the user facing IPv6-specific route flags to provide offload indication, instead of using the nexthop flags as in IPv4. Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to provide offload indication instead of the RTF_OFFLOAD flag, which is removed while it's still not part of any official kernel release. In the near future we would like to use the field for the RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might not be ready in time for the current cycle. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 8 ++++---- include/net/ip6_fib.h | 2 ++ include/uapi/linux/ipv6_route.h | 1 - net/ipv6/route.c | 7 +------ 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 16676fffbf70..4895d5b8942b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2397,7 +2397,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, - list)->rt->rt6i_flags |= RTF_OFFLOAD; + list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; return; } @@ -2407,9 +2407,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); if (nh && nh->offloaded) - mlxsw_sp_rt6->rt->rt6i_flags |= RTF_OFFLOAD; + mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; else - mlxsw_sp_rt6->rt->rt6i_flags &= ~RTF_OFFLOAD; + mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -2424,7 +2424,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { struct rt6_info *rt = mlxsw_sp_rt6->rt; - rt->rt6i_flags &= ~RTF_OFFLOAD; + rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; } } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1d790ea40ea7..71c1646298ae 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -120,6 +120,8 @@ struct rt6_info { atomic_t rt6i_ref; + unsigned int rt6i_nh_flags; + /* These are in a separate cache line. 
*/ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index 33e2a5732bd1..d496c02e14bc 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,7 +35,6 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 -#define RTF_OFFLOAD 0x20000000 /* offloaded route */ #define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 035762fed07d..6793135d49db 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1820,11 +1820,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - if (cfg->fc_flags & RTF_OFFLOAD) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD"); - goto out; - } - if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -3335,7 +3330,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } - if (rt->rt6i_flags & RTF_OFFLOAD) + if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; /* not needed for multipath encoding b/c it has a rtnexthop struct */ -- cgit v1.2.3 From 12d94a804946af291e24b80fc53ec86264765781 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 04:09:51 -0700 Subject: ipv6: fix NULL dereference in ip6_route_dev_notify() Based on a syzkaller report [1], I found that a per cpu allocation failure in snmp6_alloc_dev() would then lead to NULL dereference in ip6_route_dev_notify(). It seems this is a very old bug, thus no Fixes tag in this submission. Let's add in6_dev_put_clear() helper, as we will probably use it elsewhere (once available/present in net-next) [1] kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 17294 Comm: syz-executor6 Not tainted 4.13.0-rc2+ #10 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff88019f456680 task.stack: ffff8801c6e58000 RIP: 0010:__read_once_size include/linux/compiler.h:250 [inline] RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline] RIP: 0010:refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: 0018:ffff8801c6e5f1b0 EFLAGS: 00010202 RAX: 0000000000000037 RBX: dffffc0000000000 RCX: ffffc90005d25000 RDX: ffff8801c6e5f218 RSI: ffffffff82342bbf RDI: 0000000000000001 RBP: ffff8801c6e5f240 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff10038dcbe37 R13: 0000000000000006 R14: 0000000000000001 R15: 00000000000001b8 FS: 00007f21e0429700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001ddbc22000 CR3: 00000001d632b000 CR4: 00000000001426e0 DR0: 0000000020000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Call Trace: refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211 in6_dev_put include/net/addrconf.h:335 [inline] ip6_route_dev_notify+0x1c9/0x4a0 net/ipv6/route.c:3732 notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93 __raw_notifier_call_chain kernel/notifier.c:394 [inline] raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401 call_netdevice_notifiers_info+0x51/0x90 net/core/dev.c:1678 call_netdevice_notifiers net/core/dev.c:1694 [inline] 
rollback_registered_many+0x91c/0xe80 net/core/dev.c:7107 rollback_registered+0x1be/0x3c0 net/core/dev.c:7149 register_netdevice+0xbcd/0xee0 net/core/dev.c:7587 register_netdev+0x1a/0x30 net/core/dev.c:7669 loopback_net_init+0x76/0x160 drivers/net/loopback.c:214 ops_init+0x10a/0x570 net/core/net_namespace.c:118 setup_net+0x313/0x710 net/core/net_namespace.c:294 copy_net_ns+0x27c/0x580 net/core/net_namespace.c:418 create_new_namespaces+0x425/0x880 kernel/nsproxy.c:107 unshare_nsproxy_namespaces+0xae/0x1e0 kernel/nsproxy.c:206 SYSC_unshare kernel/fork.c:2347 [inline] SyS_unshare+0x653/0xfa0 kernel/fork.c:2297 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512c9 RSP: 002b:00007f21e0428c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 0000000000718150 RCX: 00000000004512c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000062020200 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b973d R13: 00000000ffffffff R14: 000000002001d000 R15: 00000000000002dd Code: 50 2b 34 82 c7 00 f1 f1 f1 f1 c7 40 04 04 f2 f2 f2 c7 40 08 f3 f3 f3 f3 e8 a1 43 39 ff 4c 89 f8 48 8b 95 70 ff ff ff 48 c1 e8 03 <0f> b6 0c 18 4c 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 RIP: __read_once_size include/linux/compiler.h:250 [inline] RSP: ffff8801c6e5f1b0 RIP: atomic_read arch/x86/include/asm/atomic.h:26 [inline] RSP: ffff8801c6e5f1b0 RIP: refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP: ffff8801c6e5f1b0 ---[ end trace e441d046c6410d31 ]--- Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller --- include/net/addrconf.h | 10 ++++++++++ net/ipv6/route.c | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 6df79e96a780..f44ff2476758 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -336,6 +336,16 @@ static inline void in6_dev_put(struct inet6_dev *idev) in6_dev_finish_destroy(idev); } +static inline void in6_dev_put_clear(struct inet6_dev **pidev) +{ + struct inet6_dev *idev = *pidev; + + if (idev) { + in6_dev_put(idev); + *pidev = NULL; + } +} + static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 99d4727f2b18..94d6a13d47f0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3721,10 +3721,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); - in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } -- cgit v1.2.3 From b3dc8f772fab5b2d284b780830fd56494491e493 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 15 Aug 2017 04:28:54 -0700 Subject: net: Fix a typo in comment about sock flags. Signed-off-by: Tonghao Zhang Signed-off-by: David S. 
Miller --- include/linux/net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index dda2cc939a53..ebeb48c92005 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -37,7 +37,7 @@ struct net; /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. - * Eventually all flags will be in sk->sk_wq_flags. + * Eventually all flags will be in sk->sk_wq->flags. */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 -- cgit v1.2.3 From 808ae8f3c7fefef3aece08820c108b68cdb06e1e Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 22 Mar 2017 19:17:49 +0900 Subject: extcon: Remove deprecated extcon_set/get_cable_state_() The commit 575c2b867ee0 ("extcon: Rename the extcon_set/get_state() to maintain the function naming pattern") renamed the extcon functions as follows: - extcon_get_cable_state_() -> extcon_get_state() - extcon_set_cable_state_() -> extcon_set_state_sync() but kept the old API to avoid build errors. This patch removes the deprecated extcon API. Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 7e206a9f88db..3ba02eecba2e 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -422,15 +422,4 @@ static inline int extcon_unregister_interest(struct extcon_specific_cable_nb { return -EINVAL; } - -static inline int extcon_get_cable_state_(struct extcon_dev *edev, unsigned int id) -{ - return extcon_get_state(edev, id); -} - -static inline int extcon_set_cable_state_(struct extcon_dev *edev, unsigned int id, - bool cable_state) -{ - return extcon_set_state_sync(edev, id, cable_state); -} #endif /* __LINUX_EXTCON_H__ */ -- cgit v1.2.3 From 6ab6094f370df082fdf2e117381dd323ce948f68 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 3 Apr 2017 19:45:48 +0900 Subject: extcon: Correct description to improve the readability The extcon files explain the detailed operation of the functions and the meaning of the extcon structures, but the same arguments were described differently from one function to another. This patch reworks the descriptions of both functions and structures to improve readability and to explain the role of each function more clearly. Also, this patch fixes the mismatched license info (GPL v2) and removes the inactive author information. Signed-off-by: Chanwoo Choi --- drivers/extcon/devres.c | 50 +++++---- drivers/extcon/extcon.c | 274 +++++++++++++++++++++++------------------------- include/linux/extcon.h | 72 ++++++------- 3 files changed, 190 insertions(+), 206 deletions(-) (limited to 'include') diff --git a/drivers/extcon/devres.c b/drivers/extcon/devres.c index 186fd735eb28..f599aeddf8e5 100644 --- a/drivers/extcon/devres.c +++ b/drivers/extcon/devres.c @@ -1,5 +1,5 @@ /* - * drivers/extcon/devres.c - EXTCON device's resource management + * drivers/extcon/devres.c - EXTCON device's resource management * * Copyright (C) 2016 Samsung Electronics * Author: Chanwoo Choi @@ -59,10 +59,9 @@ static void devm_extcon_dev_notifier_all_unreg(struct device *dev, void *res) /** * devm_extcon_dev_allocate - Allocate managed extcon device - * @dev: device owning the extcon device being created - * @supported_cable: Array of supported extcon ending with EXTCON_NONE. - * If supported_cable is NULL, cable name related APIs - * are disabled.
+ * @dev: the device owning the extcon device being created + * @supported_cable: the array of the supported external connectors + * ending with EXTCON_NONE. * * This function manages automatically the memory of extcon device using device * resource management and simplify the control of freeing the memory of extcon @@ -97,8 +96,8 @@ EXPORT_SYMBOL_GPL(devm_extcon_dev_allocate); /** * devm_extcon_dev_free() - Resource-managed extcon_dev_unregister() - * @dev: device the extcon belongs to - * @edev: the extcon device to unregister + * @dev: the device owning the extcon device being created + * @edev: the extcon device to be freed * * Free the memory that is allocated with devm_extcon_dev_allocate() * function. @@ -112,10 +111,9 @@ EXPORT_SYMBOL_GPL(devm_extcon_dev_free); /** * devm_extcon_dev_register() - Resource-managed extcon_dev_register() - * @dev: device to allocate extcon device - * @edev: the new extcon device to register + * @dev: the device owning the extcon device being created + * @edev: the extcon device to be registered * - * Managed extcon_dev_register() function. If extcon device is attached with * this function, that extcon device is automatically unregistered on driver * detach. Internally this function calls extcon_dev_register() function. * To get more information, refer that function. @@ -149,8 +147,8 @@ EXPORT_SYMBOL_GPL(devm_extcon_dev_register); /** * devm_extcon_dev_unregister() - Resource-managed extcon_dev_unregister() - * @dev: device the extcon belongs to - * @edev: the extcon device to unregister + * @dev: the device owning the extcon device being created + * @edev: the extcon device to unregistered * * Unregister extcon device that is registered with devm_extcon_dev_register() * function. @@ -164,10 +162,10 @@ EXPORT_SYMBOL_GPL(devm_extcon_dev_unregister); /** * devm_extcon_register_notifier() - Resource-managed extcon_register_notifier() - * @dev: device to allocate extcon device - * @edev: the extcon device that has the external connecotr. - * @id: the unique id of each external connector in extcon enumeration. - * @nb: a notifier block to be registered. + * @dev: the device owning the extcon device being created + * @edev: the extcon device + * @id: the unique id among the extcon enumeration + * @nb: a notifier block to be registered * * This function manages automatically the notifier of extcon device using * device resource management and simplify the control of unregistering @@ -208,10 +206,10 @@ EXPORT_SYMBOL(devm_extcon_register_notifier); /** * devm_extcon_unregister_notifier() - Resource-managed extcon_unregister_notifier() - * @dev: device to allocate extcon device - * @edev: the extcon device that has the external connecotr. - * @id: the unique id of each external connector in extcon enumeration. - * @nb: a notifier block to be registered. + * @dev: the device owning the extcon device being created + * @edev: the extcon device + * @id: the unique id among the extcon enumeration + * @nb: a notifier block to be registered */ void devm_extcon_unregister_notifier(struct device *dev, struct extcon_dev *edev, unsigned int id, @@ -225,9 +223,9 @@ EXPORT_SYMBOL(devm_extcon_unregister_notifier); /** * devm_extcon_register_notifier_all() * - Resource-managed extcon_register_notifier_all() - * @dev: device to allocate extcon device - * @edev: the extcon device that has the external connecotr. - * @nb: a notifier block to be registered. 
+ * @dev: the device owning the extcon device being created + * @edev: the extcon device + * @nb: a notifier block to be registered * * This function manages automatically the notifier of extcon device using * device resource management and simplify the control of unregistering @@ -263,9 +261,9 @@ EXPORT_SYMBOL(devm_extcon_register_notifier_all); /** * devm_extcon_unregister_notifier_all() * - Resource-managed extcon_unregister_notifier_all() - * @dev: device to allocate extcon device - * @edev: the extcon device that has the external connecotr. - * @nb: a notifier block to be registered. + * @dev: the device owning the extcon device being created + * @edev: the extcon device + * @nb: a notifier block to be registered */ void devm_extcon_unregister_notifier_all(struct device *dev, struct extcon_dev *edev, diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c index b93932f79584..35e9fb885486 100644 --- a/drivers/extcon/extcon.c +++ b/drivers/extcon/extcon.c @@ -1,7 +1,5 @@ /* - * drivers/extcon/extcon.c - External Connector (extcon) framework. - * - * External connector (extcon) class driver + * drivers/extcon/extcon.c - External Connector (extcon) framework. * * Copyright (C) 2015 Samsung Electronics * Author: Chanwoo Choi @@ -199,13 +197,13 @@ struct __extcon_info { }; /** - * struct extcon_cable - An internal data for each cable of extcon device. - * @edev: The extcon device - * @cable_index: Index of this cable in the edev - * @attr_g: Attribute group for the cable + * struct extcon_cable - An internal data for an external connector. + * @edev: the extcon device + * @cable_index: the index of this cable in the edev + * @attr_g: the attribute group for the cable * @attr_name: "name" sysfs entry * @attr_state: "state" sysfs entry - * @attrs: Array pointing to attr_name and attr_state for attr_g + * @attrs: the array pointing to attr_name and attr_state for attr_g */ struct extcon_cable { struct extcon_dev *edev; @@ -233,15 +231,6 @@ static struct class *extcon_class; static LIST_HEAD(extcon_dev_list); static DEFINE_MUTEX(extcon_dev_list_lock); -/** - * check_mutually_exclusive - Check if new_state violates mutually_exclusive - * condition. - * @edev: the extcon device - * @new_state: new cable attach status for @edev - * - * Returns 0 if nothing violates. Returns the index + 1 for the first - * violated condition. - */ static int check_mutually_exclusive(struct extcon_dev *edev, u32 new_state) { int i = 0; @@ -416,11 +405,13 @@ static ssize_t cable_state_show(struct device *dev, } /** - * extcon_sync() - Synchronize the states for both the attached/detached - * @edev: the extcon device that has the cable. + * extcon_sync() - Synchronize the state for an external connector. + * @edev: the extcon device + * + * Note that this function send a notification in order to synchronize + * the state and property of an external connector. * - * This function send a notification to synchronize the all states of a - * specific external connector + * Returns 0 if success or error number if fail. */ int extcon_sync(struct extcon_dev *edev, unsigned int id) { @@ -496,9 +487,11 @@ int extcon_sync(struct extcon_dev *edev, unsigned int id) EXPORT_SYMBOL_GPL(extcon_sync); /** - * extcon_get_state() - Get the state of a external connector. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector in extcon enumeration. + * extcon_get_state() - Get the state of an external connector. 
+ * @edev: the extcon device + * @id: the unique id indicating an external connector + * + * Returns 0 if success or error number if fail. */ int extcon_get_state(struct extcon_dev *edev, const unsigned int id) { @@ -521,20 +514,19 @@ int extcon_get_state(struct extcon_dev *edev, const unsigned int id) EXPORT_SYMBOL_GPL(extcon_get_state); /** - * extcon_set_state() - Set the state of a external connector. - * without a notification. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector - * in extcon enumeration. - * @state: the new cable status. The default semantics is - * true: attached / false: detached. + * extcon_set_state() - Set the state of an external connector. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @state: the new state of an external connector. + * the default semantics is true: attached / false: detached. + * + * Note that this function set the state of an external connector without + * a notification. To synchronize the state of an external connector, + * have to use extcon_set_state_sync() and extcon_sync(). * - * This function only set the state of a external connector without - * a notification. To synchronize the data of a external connector, - * use extcon_set_state_sync() and extcon_sync(). + * Returns 0 if success or error number if fail. */ -int extcon_set_state(struct extcon_dev *edev, unsigned int id, - bool cable_state) +int extcon_set_state(struct extcon_dev *edev, unsigned int id, bool state) { unsigned long flags; int index, ret = 0; @@ -549,11 +541,11 @@ int extcon_set_state(struct extcon_dev *edev, unsigned int id, spin_lock_irqsave(&edev->lock, flags); /* Check whether the external connector's state is changed. */ - if (!is_extcon_changed(edev, index, cable_state)) + if (!is_extcon_changed(edev, index, state)) goto out; if (check_mutually_exclusive(edev, - (edev->state & ~BIT(index)) | (cable_state & BIT(index)))) { + (edev->state & ~BIT(index)) | (state & BIT(index)))) { ret = -EPERM; goto out; } @@ -562,11 +554,11 @@ int extcon_set_state(struct extcon_dev *edev, unsigned int id, * Initialize the value of extcon property before setting * the detached state for an external connector. */ - if (!cable_state) + if (!state) init_property(edev, id, index); - /* Update the state for a external connector. */ - if (cable_state) + /* Update the state for an external connector. */ + if (state) edev->state |= BIT(index); else edev->state &= ~(BIT(index)); @@ -578,19 +570,18 @@ out: EXPORT_SYMBOL_GPL(extcon_set_state); /** - * extcon_set_state_sync() - Set the state of a external connector - * with a notification. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector - * in extcon enumeration. - * @state: the new cable status. The default semantics is - * true: attached / false: detached. + * extcon_set_state_sync() - Set the state of an external connector with sync. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @state: the new state of external connector. + * the default semantics is true: attached / false: detached. + * + * Note that this function set the state of external connector + * and synchronize the state by sending a notification. * - * This function set the state of external connector and synchronize the data - * by usning a notification. + * Returns 0 if success or error number if fail. 
*/ -int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, - bool cable_state) +int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, bool state) { int ret, index; unsigned long flags; @@ -601,12 +592,12 @@ int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, /* Check whether the external connector's state is changed. */ spin_lock_irqsave(&edev->lock, flags); - ret = is_extcon_changed(edev, index, cable_state); + ret = is_extcon_changed(edev, index, state); spin_unlock_irqrestore(&edev->lock, flags); if (!ret) return 0; - ret = extcon_set_state(edev, id, cable_state); + ret = extcon_set_state(edev, id, state); if (ret < 0) return ret; @@ -615,19 +606,18 @@ int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_set_state_sync); /** - * extcon_get_property() - Get the property value of a specific cable. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector - * in extcon enumeration. - * @prop: the property id among enum extcon_property. - * @prop_val: the pointer which store the value of property. + * extcon_get_property() - Get the property value of an external connector. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @prop: the property id indicating an extcon property + * @prop_val: the pointer which store the value of extcon property * - * When getting the property value of external connector, the external connector - * should be attached. If detached state, function just return 0 without - * property value. Also, the each property should be included in the list of - * supported properties according to the type of external connectors. + * Note that when getting the property value of external connector, + * the external connector should be attached. If detached state, function + * return 0 without property value. Also, the each property should be + * included in the list of supported properties according to extcon type. * - * Returns 0 if success or error number if fail + * Returns 0 if success or error number if fail. */ int extcon_get_property(struct extcon_dev *edev, unsigned int id, unsigned int prop, @@ -697,17 +687,16 @@ int extcon_get_property(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_get_property); /** - * extcon_set_property() - Set the property value of a specific cable. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector - * in extcon enumeration. - * @prop: the property id among enum extcon_property. - * @prop_val: the pointer including the new value of property. + * extcon_set_property() - Set the property value of an external connector. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @prop: the property id indicating an extcon property + * @prop_val: the pointer including the new value of extcon property * - * The each property should be included in the list of supported properties - * according to the type of external connectors. + * Note that each property should be included in the list of supported + * properties according to the extcon type. * - * Returns 0 if success or error number if fail + * Returns 0 if success or error number if fail. 
*/ int extcon_set_property(struct extcon_dev *edev, unsigned int id, unsigned int prop, @@ -765,15 +754,14 @@ int extcon_set_property(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_set_property); /** - * extcon_set_property_sync() - Set the property value of a specific cable - with a notification. - * @prop_val: the pointer including the new value of property. + * extcon_set_property_sync() - Set property of an external connector with sync. + * @prop_val: the pointer including the new value of extcon property * - * When setting the property value of external connector, the external connector - * should be attached. The each property should be included in the list of - * supported properties according to the type of external connectors. + * Note that when setting the property value of external connector, + * the external connector should be attached. The each property should + * be included in the list of supported properties according to extcon type. * - * Returns 0 if success or error number if fail + * Returns 0 if success or error number if fail. */ int extcon_set_property_sync(struct extcon_dev *edev, unsigned int id, unsigned int prop, @@ -790,12 +778,11 @@ int extcon_set_property_sync(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_set_property_sync); /** - * extcon_get_property_capability() - Get the capability of property - * of an external connector. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector - * in extcon enumeration. - * @prop: the property id among enum extcon_property. + * extcon_get_property_capability() - Get the capability of the property + * for an external connector. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @prop: the property id indicating an extcon property * * Returns 1 if the property is available or 0 if not available. */ @@ -821,18 +808,17 @@ int extcon_get_property_capability(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_get_property_capability); /** - * extcon_set_property_capability() - Set the capability of a property - * of an external connector. - * @edev: the extcon device that has the cable. - * @id: the unique id of each external connector - * in extcon enumeration. - * @prop: the property id among enum extcon_property. + * extcon_set_property_capability() - Set the capability of the property + * for an external connector. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @prop: the property id indicating an extcon property * - * This function set the capability of a property for an external connector - * to mark the bit in capability bitmap which mean the available state of - * a property. + * Note that this function set the capability of the property + * for an external connector in order to mark the bit in capability + * bitmap which mean the available state of the property. * - * Returns 0 if success or error number if fail + * Returns 0 if success or error number if fail. */ int extcon_set_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop) @@ -880,8 +866,10 @@ int extcon_set_property_capability(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_set_property_capability); /** - * extcon_get_extcon_dev() - Get the extcon device instance from the name - * @extcon_name: The extcon name provided with extcon_dev_register() + * extcon_get_extcon_dev() - Get the extcon device instance from the name. 
+ * @extcon_name: the extcon name provided with extcon_dev_register() + * + * Return the pointer of extcon device if success or ERR_PTR(err) if fail. */ struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) { @@ -903,15 +891,17 @@ out: EXPORT_SYMBOL_GPL(extcon_get_extcon_dev); /** - * extcon_register_notifier() - Register a notifiee to get notified by - * any attach status changes from the extcon. - * @edev: the extcon device that has the external connecotr. - * @id: the unique id of each external connector in extcon enumeration. - * @nb: a notifier block to be registered. + * extcon_register_notifier() - Register a notifier block to get notified by + * any state changes from the extcon. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @nb: a notifier block to be registered * * Note that the second parameter given to the callback of nb (val) is - * "old_state", not the current state. The current state can be retrieved - * by looking at the third pameter (edev pointer)'s state value. + * the current state of an external connector and the third pameter + * is the pointer of extcon device. + * + * Returns 0 if success or error number if fail. */ int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb) @@ -935,10 +925,12 @@ int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_register_notifier); /** - * extcon_unregister_notifier() - Unregister a notifiee from the extcon device. - * @edev: the extcon device that has the external connecotr. - * @id: the unique id of each external connector in extcon enumeration. - * @nb: a notifier block to be registered. + * extcon_unregister_notifier() - Unregister a notifier block from the extcon. + * @edev: the extcon device + * @id: the unique id indicating an external connector + * @nb: a notifier block to be registered + * + * Returns 0 if success or error number if fail. */ int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb) @@ -962,16 +954,16 @@ int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id, EXPORT_SYMBOL_GPL(extcon_unregister_notifier); /** - * extcon_register_notifier_all() - Register a notifier block for all connectors - * @edev: the extcon device that has the external connector. - * @nb: a notifier block to be registered. + * extcon_register_notifier_all() - Register a notifier block for all connectors. + * @edev: the extcon device + * @nb: a notifier block to be registered * - * This function registers a notifier block in order to receive the state - * change of all supported external connectors from extcon device. + * Note that this function registers a notifier block in order to receive + * the state change of all supported external connectors from extcon device. * And the second parameter given to the callback of nb (val) is - * the current state and third parameter is the edev pointer. + * the current state and the third pameter is the pointer of extcon device. * - * Returns 0 if success or error number if fail + * Returns 0 if success or error number if fail. */ int extcon_register_notifier_all(struct extcon_dev *edev, struct notifier_block *nb) @@ -992,10 +984,10 @@ EXPORT_SYMBOL_GPL(extcon_register_notifier_all); /** * extcon_unregister_notifier_all() - Unregister a notifier block from extcon. - * @edev: the extcon device that has the external connecotr. - * @nb: a notifier block to be registered. 
+ * @edev: the extcon device + * @nb: a notifier block to be registered * - * Returns 0 if success or error number if fail + * Returns 0 if success or error number if fail. */ int extcon_unregister_notifier_all(struct extcon_dev *edev, struct notifier_block *nb) @@ -1044,15 +1036,14 @@ static void dummy_sysfs_dev_release(struct device *dev) /* * extcon_dev_allocate() - Allocate the memory of extcon device. - * @supported_cable: Array of supported extcon ending with EXTCON_NONE. - * If supported_cable is NULL, cable name related APIs - * are disabled. + * @supported_cable: the array of the supported external connectors + * ending with EXTCON_NONE. * - * This function allocates the memory for extcon device without allocating - * memory in each extcon provider driver and initialize default setting for - * extcon device. + * Note that this function allocates the memory for extcon device + * and initialize default setting for the extcon device. * - * Return the pointer of extcon device if success or ERR_PTR(err) if fail + * Returns the pointer memory of allocated extcon_dev if success + * or ERR_PTR(err) if fail. */ struct extcon_dev *extcon_dev_allocate(const unsigned int *supported_cable) { @@ -1073,7 +1064,7 @@ struct extcon_dev *extcon_dev_allocate(const unsigned int *supported_cable) /* * extcon_dev_free() - Free the memory of extcon device. - * @edev: the extcon device to free + * @edev: the extcon device */ void extcon_dev_free(struct extcon_dev *edev) { @@ -1082,13 +1073,18 @@ void extcon_dev_free(struct extcon_dev *edev) EXPORT_SYMBOL_GPL(extcon_dev_free); /** - * extcon_dev_register() - Register a new extcon device - * @edev : the new extcon device (should be allocated before calling) + * extcon_dev_register() - Register an new extcon device + * @edev: the extcon device to be registered * * Among the members of edev struct, please set the "user initializing data" - * in any case and set the "optional callbacks" if required. However, please * do not set the values of "internal data", which are initialized by * this function. + * + * Note that before calling this funciton, have to allocate the memory + * of an extcon device by using the extcon_dev_allocate(). And the extcon + * dev should include the supported_cable information. + * + * Returns 0 if success or error number if fail. */ int extcon_dev_register(struct extcon_dev *edev) { @@ -1295,7 +1291,7 @@ EXPORT_SYMBOL_GPL(extcon_dev_register); /** * extcon_dev_unregister() - Unregister the extcon device. - * @edev: the extcon device instance to be unregistered. + * @edev: the extcon device to be unregistered. * * Note that this does not call kfree(edev) because edev was not allocated * by this class. @@ -1341,11 +1337,11 @@ EXPORT_SYMBOL_GPL(extcon_dev_unregister); #ifdef CONFIG_OF /* - * extcon_get_edev_by_phandle - Get the extcon device from devicetree - * @dev - instance to the given device - * @index - index into list of extcon_dev + * extcon_get_edev_by_phandle - Get the extcon device from devicetree. + * @dev : the instance to the given device + * @index : the index into list of extcon_dev * - * return the instance of extcon device + * Return the pointer of extcon device if success or ERR_PTR(err) if fail. 
*/ struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index) { @@ -1410,8 +1406,6 @@ static void __exit extcon_class_exit(void) module_exit(extcon_class_exit); MODULE_AUTHOR("Chanwoo Choi "); -MODULE_AUTHOR("Mike Lockwood "); -MODULE_AUTHOR("Donggeun Kim "); MODULE_AUTHOR("MyungJoo Ham "); -MODULE_DESCRIPTION("External connector (extcon) class driver"); -MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("External Connector (extcon) framework"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 3ba02eecba2e..e03204d5a366 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -1,5 +1,5 @@ /* - * External connector (extcon) class driver + * External Connector (extcon) framework * * Copyright (C) 2015 Samsung Electronics * Author: Chanwoo Choi @@ -20,8 +20,7 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * -*/ + */ #ifndef __LINUX_EXTCON_H__ #define __LINUX_EXTCON_H__ @@ -93,7 +92,7 @@ #define EXTCON_NUM 63 /* - * Define the property of supported external connectors. + * Define the properties of supported external connectors. * * When adding the new extcon property, they *must* have * the type/value/default information. Also, you *have to* @@ -176,44 +175,42 @@ struct extcon_dev; #if IS_ENABLED(CONFIG_EXTCON) -/* - * Following APIs are for notifiers or configurations. - * Notifiers are the external port and connection devices. - */ +/* Following APIs register/unregister the extcon device. */ extern int extcon_dev_register(struct extcon_dev *edev); extern void extcon_dev_unregister(struct extcon_dev *edev); extern int devm_extcon_dev_register(struct device *dev, struct extcon_dev *edev); extern void devm_extcon_dev_unregister(struct device *dev, struct extcon_dev *edev); -extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name); -/* - * Following APIs control the memory of extcon device. - */ +/* Following APIs allocate/free the memory of the extcon device. */ extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable); extern void extcon_dev_free(struct extcon_dev *edev); extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, const unsigned int *cable); extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); +/* Synchronize the state and property value for each external connector. */ +extern int extcon_sync(struct extcon_dev *edev, unsigned int id); + /* - * get/set_state access each bit of the 32b encoded state value. - * They are used to access the status of each cable based on the cable id. + * Following APIs get/set the connected state of each external connector. + * The 'id' argument indicates the defined external connector. */ extern int extcon_get_state(struct extcon_dev *edev, unsigned int id); extern int extcon_set_state(struct extcon_dev *edev, unsigned int id, - bool cable_state); + bool state); extern int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, - bool cable_state); -/* - * Synchronize the state and property data for a specific external connector. - */ -extern int extcon_sync(struct extcon_dev *edev, unsigned int id); + bool state); /* - * get/set_property access the property value of each external connector. - * They are used to access the property of each cable based on the property id. + * Following APIs get/set the property of each external connector. 
+ * The 'id' argument indicates the defined external connector + * and the 'prop' indicates the extcon property. + * + * And extcon_get/set_property_capability() set the capability of the property + * for each external connector, based on the id and property. */ extern int extcon_get_property(struct extcon_dev *edev, unsigned int id, unsigned int prop, @@ -224,19 +221,15 @@ extern int extcon_set_property(struct extcon_dev *edev, unsigned int id, extern int extcon_set_property_sync(struct extcon_dev *edev, unsigned int id, unsigned int prop, union extcon_property_value prop_val); - -/* - * get/set_property_capability set the capability of the property for each - * external connector. They are used to set the capability of the property - * of each external connector based on the id and property. - */ extern int extcon_get_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop); extern int extcon_set_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop); /* - * Following APIs are to monitor the status change of the external connectors. + * Following APIs register the notifier block in order to detect + * the change of both state and property value for each external connector. + * + * extcon_register_notifier(*edev, id, *nb) : Register a notifier block * for specific external connector of the extcon. * extcon_register_notifier_all(*edev, *nb) : Register a notifier block @@ -265,16 +258,15 @@ extern void devm_extcon_unregister_notifier_all(struct device *dev, struct notifier_block *nb); /* - * Following API get the extcon device from devicetree. - * This function use phandle of devicetree to get extcon device directly. + * Following APIs get the extcon_dev from devicetree or through the extcon name. */ +extern struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name); extern struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index); -/* Following API to get information of extcon device */ +/* Following API gets the name of the extcon device. 
 */ extern const char *extcon_get_edev_name(struct extcon_dev *edev); - #else /* CONFIG_EXTCON */ static inline int extcon_dev_register(struct extcon_dev *edev) { @@ -314,13 +306,13 @@ static inline int extcon_get_state(struct extcon_dev *edev, unsigned int id) } static inline int extcon_set_state(struct extcon_dev *edev, unsigned int id, - bool cable_state) + bool state) { return 0; } static inline int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, - bool cable_state) + bool state) { return 0; } @@ -362,11 +354,6 @@ static inline int extcon_set_property_capability(struct extcon_dev *edev, return 0; } -static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) -{ - return NULL; -} - static inline int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb) @@ -392,6 +379,11 @@ static inline void devm_extcon_unregister_notifier(struct device *dev, struct extcon_dev *edev, unsigned int id, struct notifier_block *nb) { } +static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) +{ + return NULL; +} + static inline struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, int index) { -- cgit v1.2.3 From ab8a8fbe939de5e2651da6761f83a2b9641b2daa Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 14 Jul 2017 01:00:55 +0900 Subject: extcon: Use tab instead of space for indentation The extcon header file defines functions with mismatched indentation, and in some cases spaces are used instead of tabs. This patch cleans up the indentation to improve readability. It also changes the return value of the extcon_get_extcon_dev() stub so that it matches extcon_get_edev_by_phandle(). Signed-off-by: Chanwoo Choi --- include/linux/extcon.h | 51 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/extcon.h b/include/linux/extcon.h index e03204d5a366..744d60ca80c3 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -179,15 +179,15 @@ struct extcon_dev; extern int extcon_dev_register(struct extcon_dev *edev); extern void extcon_dev_unregister(struct extcon_dev *edev); extern int devm_extcon_dev_register(struct device *dev, - struct extcon_dev *edev); + struct extcon_dev *edev); extern void devm_extcon_dev_unregister(struct device *dev, - struct extcon_dev *edev); + struct extcon_dev *edev); /* Following APIs allocate/free the memory of the extcon device. */ extern struct extcon_dev *extcon_dev_allocate(const unsigned int *cable); extern void extcon_dev_free(struct extcon_dev *edev); extern struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const unsigned int *cable); + const unsigned int *cable); extern void devm_extcon_dev_free(struct device *dev, struct extcon_dev *edev); /* Synchronize the state and property value for each external connector. */ @@ -199,7 +199,7 @@ extern int extcon_sync(struct extcon_dev *edev, unsigned int id); */ extern int extcon_get_state(struct extcon_dev *edev, unsigned int id); extern int extcon_set_state(struct extcon_dev *edev, unsigned int id, - bool state); + bool state); extern int extcon_set_state_sync(struct extcon_dev *edev, unsigned int id, bool state); @@ -236,9 +236,9 @@ extern int extcon_set_property_capability(struct extcon_dev *edev, * for all supported external connectors of the extcon. 
*/ extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, - struct notifier_block *nb); + struct notifier_block *nb); extern int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id, - struct notifier_block *nb); + struct notifier_block *nb); extern int devm_extcon_register_notifier(struct device *dev, struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); @@ -276,13 +276,13 @@ static inline int extcon_dev_register(struct extcon_dev *edev) static inline void extcon_dev_unregister(struct extcon_dev *edev) { } static inline int devm_extcon_dev_register(struct device *dev, - struct extcon_dev *edev) + struct extcon_dev *edev) { return -EINVAL; } static inline void devm_extcon_dev_unregister(struct device *dev, - struct extcon_dev *edev) { } + struct extcon_dev *edev) { } static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable) { @@ -292,7 +292,7 @@ static inline struct extcon_dev *extcon_dev_allocate(const unsigned int *cable) static inline void extcon_dev_free(struct extcon_dev *edev) { } static inline struct extcon_dev *devm_extcon_dev_allocate(struct device *dev, - const unsigned int *cable) + const unsigned int *cable) { return ERR_PTR(-ENOSYS); } @@ -323,47 +323,45 @@ static inline int extcon_sync(struct extcon_dev *edev, unsigned int id) } static inline int extcon_get_property(struct extcon_dev *edev, unsigned int id, - unsigned int prop, - union extcon_property_value *prop_val) + unsigned int prop, + union extcon_property_value *prop_val) { return 0; } static inline int extcon_set_property(struct extcon_dev *edev, unsigned int id, - unsigned int prop, - union extcon_property_value prop_val) + unsigned int prop, + union extcon_property_value prop_val) { return 0; } static inline int extcon_set_property_sync(struct extcon_dev *edev, - unsigned int id, unsigned int prop, - union extcon_property_value prop_val) + unsigned int id, unsigned int prop, + union extcon_property_value prop_val) { return 0; } static inline int extcon_get_property_capability(struct extcon_dev *edev, - unsigned int id, unsigned int prop) + unsigned int id, unsigned int prop) { return 0; } static inline int extcon_set_property_capability(struct extcon_dev *edev, - unsigned int id, unsigned int prop) + unsigned int id, unsigned int prop) { return 0; } static inline int extcon_register_notifier(struct extcon_dev *edev, - unsigned int id, - struct notifier_block *nb) + unsigned int id, struct notifier_block *nb) { return 0; } static inline int extcon_unregister_notifier(struct extcon_dev *edev, - unsigned int id, - struct notifier_block *nb) + unsigned int id, struct notifier_block *nb) { return 0; } @@ -381,11 +379,11 @@ static inline void devm_extcon_unregister_notifier(struct device *dev, static inline struct extcon_dev *extcon_get_extcon_dev(const char *extcon_name) { - return NULL; + return ERR_PTR(-ENODEV); } static inline struct extcon_dev *extcon_get_edev_by_phandle(struct device *dev, - int index) + int index) { return ERR_PTR(-ENODEV); } @@ -403,14 +401,13 @@ struct extcon_specific_cable_nb { }; static inline int extcon_register_interest(struct extcon_specific_cable_nb *obj, - const char *extcon_name, const char *cable_name, - struct notifier_block *nb) + const char *extcon_name, const char *cable_name, + struct notifier_block *nb) { return -EINVAL; } -static inline int extcon_unregister_interest(struct extcon_specific_cable_nb - *obj) +static inline int extcon_unregister_interest(struct extcon_specific_cable_nb *obj) { return -EINVAL; 
} -- cgit v1.2.3 From 714c53aa2e2d6d60ca1e7d18980767c8b715c288 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Jul 2017 16:58:42 +0200 Subject: clk: renesas: Add r8a77995 CPG Core Clock Definitions Add all R-Car D3 Clock Pulse Generator Core Clock Outputs, as listed in Table 8.2f ("List of Clocks [R-Car D3]") of the R-Car Series, 3rd Generation Hardware User's Manual (Rev. 0.55, Jun. 30, 2017). Note that internal CPG clocks (S0, S1, S2, S3, S1C, S3C, SDSRC, and SSPSRC) are not included, as they are used as internal clock sources only, and never referenced from DT. Signed-off-by: Geert Uytterhoeven Acked-by: Stephen Boyd --- include/dt-bindings/clock/r8a77995-cpg-mssr.h | 57 +++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 include/dt-bindings/clock/r8a77995-cpg-mssr.h (limited to 'include') diff --git a/include/dt-bindings/clock/r8a77995-cpg-mssr.h b/include/dt-bindings/clock/r8a77995-cpg-mssr.h new file mode 100644 index 000000000000..4e8ae3dee590 --- /dev/null +++ b/include/dt-bindings/clock/r8a77995-cpg-mssr.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2017 Glider bvba + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#ifndef __DT_BINDINGS_CLOCK_R8A77995_CPG_MSSR_H__ +#define __DT_BINDINGS_CLOCK_R8A77995_CPG_MSSR_H__ + +#include + +/* r8a77995 CPG Core Clocks */ +#define R8A77995_CLK_Z2 0 +#define R8A77995_CLK_ZG 1 +#define R8A77995_CLK_ZTR 2 +#define R8A77995_CLK_ZT 3 +#define R8A77995_CLK_ZX 4 +#define R8A77995_CLK_S0D1 5 +#define R8A77995_CLK_S1D1 6 +#define R8A77995_CLK_S1D2 7 +#define R8A77995_CLK_S1D4 8 +#define R8A77995_CLK_S2D1 9 +#define R8A77995_CLK_S2D2 10 +#define R8A77995_CLK_S2D4 11 +#define R8A77995_CLK_S3D1 12 +#define R8A77995_CLK_S3D2 13 +#define R8A77995_CLK_S3D4 14 +#define R8A77995_CLK_S1D4C 15 +#define R8A77995_CLK_S3D1C 16 +#define R8A77995_CLK_S3D2C 17 +#define R8A77995_CLK_S3D4C 18 +#define R8A77995_CLK_LB 19 +#define R8A77995_CLK_CL 20 +#define R8A77995_CLK_ZB3 21 +#define R8A77995_CLK_ZB3D2 22 +#define R8A77995_CLK_CR 23 +#define R8A77995_CLK_CRD2 24 +#define R8A77995_CLK_SD0H 25 +#define R8A77995_CLK_SD0 26 +#define R8A77995_CLK_SSP2 27 +#define R8A77995_CLK_SSP1 28 +#define R8A77995_CLK_RPC 29 +#define R8A77995_CLK_RPCD2 30 +#define R8A77995_CLK_ZA2 31 +#define R8A77995_CLK_ZA8 32 +#define R8A77995_CLK_Z2D 33 +#define R8A77995_CLK_CANFD 34 +#define R8A77995_CLK_MSO 35 +#define R8A77995_CLK_R 36 +#define R8A77995_CLK_OSC 37 +#define R8A77995_CLK_LV0 38 +#define R8A77995_CLK_LV1 39 +#define R8A77995_CLK_CP 40 + +#endif /* __DT_BINDINGS_CLOCK_R8A77995_CPG_MSSR_H__ */ -- cgit v1.2.3 From 9a35b63728ceb8602c111260044451dd64952500 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 28 Jun 2017 21:56:54 -0600 Subject: btrfs: constify tracepoint arguments Tracepoint arguments are all read-only. If we mark the arguments as const, we're able to keep or convert those arguments to const where appropriate. 
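For example (an illustrative sketch, not taken from the patch itself; the helper name is hypothetical), a read-only helper can then pass its pointer straight into a tracepoint without casting the qualifier away:

static void note_commit(const struct btrfs_root *root)
{
	/*
	 * trace_btrfs_transaction_commit() is generated from the
	 * TP_PROTO(const struct btrfs_root *root) prototype below, so a
	 * const pointer is accepted as-is; before this change the caller
	 * needed a non-const pointer or a cast.
	 */
	trace_btrfs_transaction_commit(root);
}
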
Signed-off-by: Jeff Mahoney Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 6 +- fs/btrfs/async-thread.h | 6 +- fs/btrfs/btrfs_inode.h | 4 +- include/trace/events/btrfs.h | 242 +++++++++++++++++++++++-------------------- 4 files changed, 136 insertions(+), 122 deletions(-) (limited to 'include') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ff0b0be92d61..e00c8a9fd5bb 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -75,18 +75,18 @@ void btrfs_##name(struct work_struct *arg) \ } struct btrfs_fs_info * -btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) { return wq->fs_info; } struct btrfs_fs_info * -btrfs_work_owner(struct btrfs_work *work) +btrfs_work_owner(const struct btrfs_work *work) { return work->wq->fs_info; } -bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq) +bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* * We could compare wq->normal->pending with num_online_cpus() diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f9597355c9d..fc957e00cef1 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -82,7 +82,7 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); -struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); -struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); -bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); +struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d87ac27a5f2b..31c259c8462d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -207,7 +207,7 @@ struct btrfs_inode { extern unsigned char btrfs_filetype_table[]; -static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); } @@ -231,7 +231,7 @@ static inline void btrfs_insert_inode_hash(struct inode *inode) __insert_inode_hash(inode, h); } -static inline u64 btrfs_ino(struct btrfs_inode *inode) +static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index cd99a3658156..42560feb9920 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -92,7 +92,7 @@ struct btrfs_qgroup; TRACE_EVENT(btrfs_transaction_commit, - TP_PROTO(struct btrfs_root *root), + TP_PROTO(const struct btrfs_root *root), TP_ARGS(root), @@ -113,7 +113,7 @@ TRACE_EVENT(btrfs_transaction_commit, DECLARE_EVENT_CLASS(btrfs__inode, - TP_PROTO(struct inode *inode), + TP_PROTO(const struct inode *inode), TP_ARGS(inode), @@ -151,21 +151,21 @@ DECLARE_EVENT_CLASS(btrfs__inode, DEFINE_EVENT(btrfs__inode, btrfs_inode_new, - TP_PROTO(struct inode *inode), + TP_PROTO(const struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(btrfs__inode, btrfs_inode_request, - TP_PROTO(struct inode *inode), + TP_PROTO(const struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, - TP_PROTO(struct 
inode *inode), + TP_PROTO(const struct inode *inode), TP_ARGS(inode) ); @@ -192,8 +192,8 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, TRACE_EVENT_CONDITION(btrfs_get_extent, - TP_PROTO(struct btrfs_root *root, struct btrfs_inode *inode, - struct extent_map *map), + TP_PROTO(const struct btrfs_root *root, const struct btrfs_inode *inode, + const struct extent_map *map), TP_ARGS(root, inode, map), @@ -388,7 +388,8 @@ DEFINE_EVENT( DECLARE_EVENT_CLASS(btrfs__ordered_extent, - TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + TP_PROTO(const struct inode *inode, + const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered), @@ -440,36 +441,40 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, - TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + TP_PROTO(const struct inode *inode, + const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, - TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + TP_PROTO(const struct inode *inode, + const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, - TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + TP_PROTO(const struct inode *inode, + const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, - TP_PROTO(struct inode *inode, struct btrfs_ordered_extent *ordered), + TP_PROTO(const struct inode *inode, + const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) ); DECLARE_EVENT_CLASS(btrfs__writepage, - TP_PROTO(struct page *page, struct inode *inode, - struct writeback_control *wbc), + TP_PROTO(const struct page *page, const struct inode *inode, + const struct writeback_control *wbc), TP_ARGS(page, inode, wbc), @@ -517,15 +522,15 @@ DECLARE_EVENT_CLASS(btrfs__writepage, DEFINE_EVENT(btrfs__writepage, __extent_writepage, - TP_PROTO(struct page *page, struct inode *inode, - struct writeback_control *wbc), + TP_PROTO(const struct page *page, const struct inode *inode, + const struct writeback_control *wbc), TP_ARGS(page, inode, wbc) ); TRACE_EVENT(btrfs_writepage_end_io_hook, - TP_PROTO(struct page *page, u64 start, u64 end, int uptodate), + TP_PROTO(const struct page *page, u64 start, u64 end, int uptodate), TP_ARGS(page, start, end, uptodate), @@ -558,7 +563,7 @@ TRACE_EVENT(btrfs_writepage_end_io_hook, TRACE_EVENT(btrfs_sync_file, - TP_PROTO(struct file *file, int datasync), + TP_PROTO(const struct file *file, int datasync), TP_ARGS(file, datasync), @@ -570,8 +575,8 @@ TRACE_EVENT(btrfs_sync_file, ), TP_fast_assign( - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = d_inode(dentry); + const struct dentry *dentry = file->f_path.dentry; + const struct inode *inode = d_inode(dentry); TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb)); __entry->ino = inode->i_ino; @@ -589,7 +594,7 @@ TRACE_EVENT(btrfs_sync_file, TRACE_EVENT(btrfs_sync_fs, - TP_PROTO(struct btrfs_fs_info *fs_info, int wait), + TP_PROTO(const struct btrfs_fs_info *fs_info, int wait), TP_ARGS(fs_info, wait), @@ -606,8 +611,8 @@ TRACE_EVENT(btrfs_sync_fs, TRACE_EVENT(btrfs_add_block_group, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, int create), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_block_group_cache 
*block_group, int create), TP_ARGS(fs_info, block_group, create), @@ -654,9 +659,9 @@ TRACE_EVENT(btrfs_add_block_group, DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_tree_ref *full_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_tree_ref *full_ref, int action), TP_ARGS(fs_info, ref, full_ref, action), @@ -697,9 +702,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, DEFINE_EVENT(btrfs_delayed_tree_ref, add_delayed_tree_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_tree_ref *full_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_tree_ref *full_ref, int action), TP_ARGS(fs_info, ref, full_ref, action) @@ -707,9 +712,9 @@ DEFINE_EVENT(btrfs_delayed_tree_ref, add_delayed_tree_ref, DEFINE_EVENT(btrfs_delayed_tree_ref, run_delayed_tree_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_tree_ref *full_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_tree_ref *full_ref, int action), TP_ARGS(fs_info, ref, full_ref, action) @@ -717,9 +722,9 @@ DEFINE_EVENT(btrfs_delayed_tree_ref, run_delayed_tree_ref, DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_data_ref *full_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_data_ref *full_ref, int action), TP_ARGS(fs_info, ref, full_ref, action), @@ -764,9 +769,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, DEFINE_EVENT(btrfs_delayed_data_ref, add_delayed_data_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_data_ref *full_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_data_ref *full_ref, int action), TP_ARGS(fs_info, ref, full_ref, action) @@ -774,9 +779,9 @@ DEFINE_EVENT(btrfs_delayed_data_ref, add_delayed_data_ref, DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_data_ref *full_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_data_ref *full_ref, int action), TP_ARGS(fs_info, ref, full_ref, action) @@ -784,9 +789,9 @@ DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head *head_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_ref_head *head_ref, int action), TP_ARGS(fs_info, ref, head_ref, action), @@ -814,9 +819,9 @@ DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head *head_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_ref_head *head_ref, int action), 
TP_ARGS(fs_info, ref, head_ref, action) @@ -824,9 +829,9 @@ DEFINE_EVENT(btrfs_delayed_ref_head, add_delayed_ref_head, DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *ref, - struct btrfs_delayed_ref_head *head_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_delayed_ref_node *ref, + const struct btrfs_delayed_ref_head *head_ref, int action), TP_ARGS(fs_info, ref, head_ref, action) @@ -846,8 +851,8 @@ DEFINE_EVENT(btrfs_delayed_ref_head, run_delayed_ref_head, DECLARE_EVENT_CLASS(btrfs__chunk, - TP_PROTO(struct btrfs_fs_info *fs_info, struct map_lookup *map, - u64 offset, u64 size), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct map_lookup *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size), @@ -880,24 +885,24 @@ DECLARE_EVENT_CLASS(btrfs__chunk, DEFINE_EVENT(btrfs__chunk, btrfs_chunk_alloc, - TP_PROTO(struct btrfs_fs_info *fs_info, struct map_lookup *map, - u64 offset, u64 size), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct map_lookup *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size) ); DEFINE_EVENT(btrfs__chunk, btrfs_chunk_free, - TP_PROTO(struct btrfs_fs_info *fs_info, struct map_lookup *map, - u64 offset, u64 size), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct map_lookup *map, u64 offset, u64 size), TP_ARGS(fs_info, map, offset, size) ); TRACE_EVENT(btrfs_cow_block, - TP_PROTO(struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow), + TP_PROTO(const struct btrfs_root *root, const struct extent_buffer *buf, + const struct extent_buffer *cow), TP_ARGS(root, buf, cow), @@ -931,7 +936,7 @@ TRACE_EVENT(btrfs_cow_block, TRACE_EVENT(btrfs_space_reservation, - TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val, + TP_PROTO(const struct btrfs_fs_info *fs_info, char *type, u64 val, u64 bytes, int reserve), TP_ARGS(fs_info, type, val, bytes, reserve), @@ -963,7 +968,7 @@ TRACE_EVENT(btrfs_space_reservation, TRACE_EVENT(btrfs_trigger_flush, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, int flush, char *reason), TP_ARGS(fs_info, flags, bytes, flush, reason), @@ -1004,7 +1009,7 @@ TRACE_EVENT(btrfs_trigger_flush, TRACE_EVENT(btrfs_flush_space, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes, + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes, u64 orig_bytes, int state, int ret), TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret), @@ -1039,7 +1044,7 @@ TRACE_EVENT(btrfs_flush_space, DECLARE_EVENT_CLASS(btrfs__reserved_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 start, u64 len), + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len), TP_ARGS(fs_info, start, len), @@ -1061,22 +1066,22 @@ DECLARE_EVENT_CLASS(btrfs__reserved_extent, DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_alloc, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 start, u64 len), + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len), TP_ARGS(fs_info, start, len) ); DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 start, u64 len), + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 start, u64 len), TP_ARGS(fs_info, start, len) ); TRACE_EVENT(find_free_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 empty_size, - u64 data), + TP_PROTO(const struct 
btrfs_fs_info *fs_info, u64 num_bytes, + u64 empty_size, u64 data), TP_ARGS(fs_info, num_bytes, empty_size, data), @@ -1101,8 +1106,8 @@ TRACE_EVENT(find_free_extent, DECLARE_EVENT_CLASS(btrfs__reserve_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_block_group_cache *block_group, u64 start, u64 len), TP_ARGS(fs_info, block_group, start, len), @@ -1132,8 +1137,8 @@ DECLARE_EVENT_CLASS(btrfs__reserve_extent, DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_block_group_cache *block_group, u64 start, u64 len), TP_ARGS(fs_info, block_group, start, len) @@ -1141,8 +1146,8 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent, DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_block_group_cache *block_group, u64 start, u64 len), TP_ARGS(fs_info, block_group, start, len) @@ -1150,7 +1155,7 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster, TRACE_EVENT(btrfs_find_cluster, - TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start, + TP_PROTO(const struct btrfs_block_group_cache *block_group, u64 start, u64 bytes, u64 empty_size, u64 min_bytes), TP_ARGS(block_group, start, bytes, empty_size, min_bytes), @@ -1183,7 +1188,7 @@ TRACE_EVENT(btrfs_find_cluster, TRACE_EVENT(btrfs_failed_cluster_setup, - TP_PROTO(struct btrfs_block_group_cache *block_group), + TP_PROTO(const struct btrfs_block_group_cache *block_group), TP_ARGS(block_group), @@ -1200,8 +1205,9 @@ TRACE_EVENT(btrfs_failed_cluster_setup, TRACE_EVENT(btrfs_setup_cluster, - TP_PROTO(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, u64 size, int bitmap), + TP_PROTO(const struct btrfs_block_group_cache *block_group, + const struct btrfs_free_cluster *cluster, + u64 size, int bitmap), TP_ARGS(block_group, cluster, size, bitmap), @@ -1235,12 +1241,13 @@ TRACE_EVENT(btrfs_setup_cluster, struct extent_state; TRACE_EVENT(alloc_extent_state, - TP_PROTO(struct extent_state *state, gfp_t mask, unsigned long IP), + TP_PROTO(const struct extent_state *state, + gfp_t mask, unsigned long IP), TP_ARGS(state, mask, IP), TP_STRUCT__entry( - __field(struct extent_state *, state) + __field(const struct extent_state *, state) __field(gfp_t, mask) __field(unsigned long, ip) ), @@ -1252,17 +1259,17 @@ TRACE_EVENT(alloc_extent_state, ), TP_printk("state=%p mask=%s caller=%pS", __entry->state, - show_gfp_flags(__entry->mask), (void *)__entry->ip) + show_gfp_flags(__entry->mask), (const void *)__entry->ip) ); TRACE_EVENT(free_extent_state, - TP_PROTO(struct extent_state *state, unsigned long IP), + TP_PROTO(const struct extent_state *state, unsigned long IP), TP_ARGS(state, IP), TP_STRUCT__entry( - __field(struct extent_state *, state) + __field(const struct extent_state *, state) __field(unsigned long, ip) ), @@ -1272,22 +1279,22 @@ TRACE_EVENT(free_extent_state, ), TP_printk("state=%p caller=%pS", __entry->state, - (void *)__entry->ip) + (const void *)__entry->ip) ); DECLARE_EVENT_CLASS(btrfs__work, - TP_PROTO(struct btrfs_work *work), + TP_PROTO(const struct btrfs_work *work), TP_ARGS(work), TP_STRUCT__entry_btrfs( - 
__field( void *, work ) - __field( void *, wq ) - __field( void *, func ) - __field( void *, ordered_func ) - __field( void *, ordered_free ) - __field( void *, normal_work ) + __field( const void *, work ) + __field( const void *, wq ) + __field( const void *, func ) + __field( const void *, ordered_func ) + __field( const void *, ordered_free ) + __field( const void *, normal_work ) ), TP_fast_assign_btrfs(btrfs_work_owner(work), @@ -1312,12 +1319,12 @@ DECLARE_EVENT_CLASS(btrfs__work, */ DECLARE_EVENT_CLASS(btrfs__work__done, - TP_PROTO(struct btrfs_fs_info *fs_info, void *wtag), + TP_PROTO(const struct btrfs_fs_info *fs_info, const void *wtag), TP_ARGS(fs_info, wtag), TP_STRUCT__entry_btrfs( - __field( void *, wtag ) + __field( const void *, wtag ) ), TP_fast_assign_btrfs(fs_info, @@ -1329,40 +1336,41 @@ DECLARE_EVENT_CLASS(btrfs__work__done, DEFINE_EVENT(btrfs__work, btrfs_work_queued, - TP_PROTO(struct btrfs_work *work), + TP_PROTO(const struct btrfs_work *work), TP_ARGS(work) ); DEFINE_EVENT(btrfs__work, btrfs_work_sched, - TP_PROTO(struct btrfs_work *work), + TP_PROTO(const struct btrfs_work *work), TP_ARGS(work) ); DEFINE_EVENT(btrfs__work__done, btrfs_all_work_done, - TP_PROTO(struct btrfs_fs_info *fs_info, void *wtag), + TP_PROTO(const struct btrfs_fs_info *fs_info, const void *wtag), TP_ARGS(fs_info, wtag) ); DEFINE_EVENT(btrfs__work, btrfs_ordered_sched, - TP_PROTO(struct btrfs_work *work), + TP_PROTO(const struct btrfs_work *work), TP_ARGS(work) ); DECLARE_EVENT_CLASS(btrfs__workqueue, - TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high), + TP_PROTO(const struct __btrfs_workqueue *wq, + const char *name, int high), TP_ARGS(wq, name, high), TP_STRUCT__entry_btrfs( - __field( void *, wq ) + __field( const void *, wq ) __string( name, name ) __field( int , high ) ), @@ -1381,19 +1389,20 @@ DECLARE_EVENT_CLASS(btrfs__workqueue, DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc, - TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high), + TP_PROTO(const struct __btrfs_workqueue *wq, + const char *name, int high), TP_ARGS(wq, name, high) ); DECLARE_EVENT_CLASS(btrfs__workqueue_done, - TP_PROTO(struct __btrfs_workqueue *wq), + TP_PROTO(const struct __btrfs_workqueue *wq), TP_ARGS(wq), TP_STRUCT__entry_btrfs( - __field( void *, wq ) + __field( const void *, wq ) ), TP_fast_assign_btrfs(btrfs_workqueue_owner(wq), @@ -1405,7 +1414,7 @@ DECLARE_EVENT_CLASS(btrfs__workqueue_done, DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, - TP_PROTO(struct __btrfs_workqueue *wq), + TP_PROTO(const struct __btrfs_workqueue *wq), TP_ARGS(wq) ); @@ -1417,7 +1426,8 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, - TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + TP_PROTO(const struct inode *inode, u64 start, u64 len, + u64 reserved, int op), TP_ARGS(inode, start, len, reserved, op), @@ -1449,21 +1459,24 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data, - TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + TP_PROTO(const struct inode *inode, u64 start, u64 len, + u64 reserved, int op), TP_ARGS(inode, start, len, reserved, op) ); DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, - TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + TP_PROTO(const struct inode *inode, u64 start, u64 len, + u64 reserved, int op), TP_ARGS(inode, start, len, reserved, op) ); 
DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved), + TP_PROTO(const struct btrfs_fs_info *fs_info, + u64 ref_root, u64 reserved), TP_ARGS(fs_info, ref_root, reserved), @@ -1483,14 +1496,15 @@ DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref, DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 ref_root, u64 reserved), + TP_PROTO(const struct btrfs_fs_info *fs_info, + u64 ref_root, u64 reserved), TP_ARGS(fs_info, ref_root, reserved) ); DECLARE_EVENT_CLASS(btrfs_qgroup_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_extent_record *rec), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_qgroup_extent_record *rec), TP_ARGS(fs_info, rec), @@ -1511,23 +1525,23 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent, DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_extent_record *rec), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_qgroup_extent_record *rec), TP_ARGS(fs_info, rec) ); DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_extent_record *rec), + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_qgroup_extent_record *rec), TP_ARGS(fs_info, rec) ); TRACE_EVENT(btrfs_qgroup_account_extent, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 bytenr, + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots), @@ -1556,7 +1570,7 @@ TRACE_EVENT(btrfs_qgroup_account_extent, TRACE_EVENT(qgroup_update_counters, - TP_PROTO(struct btrfs_fs_info *fs_info, u64 qgid, + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 qgid, u64 cur_old_count, u64 cur_new_count), TP_ARGS(fs_info, qgid, cur_old_count, cur_new_count), -- cgit v1.2.3 From 00142756e1f8015d2f8ce96532d156689db7e448 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 12 Jul 2017 16:20:08 -0600 Subject: btrfs: backref, add tracepoints for prelim_ref insertion and merging This patch adds a tracepoint event for prelim_ref insertion and merging. For each, the ref being inserted or merged and the count of tree nodes is issued. Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 118 ++++++++++++++++++++++--------------------- fs/btrfs/backref.h | 12 +++++ fs/btrfs/super.c | 1 + include/trace/events/btrfs.h | 58 +++++++++++++++++++++ 4 files changed, 131 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 297f33850425..4cda81964dd4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -18,6 +18,7 @@ #include #include +#include #include "ctree.h" #include "disk-io.h" #include "backref.h" @@ -120,20 +121,6 @@ static int find_extent_in_eb(const struct extent_buffer *eb, return 0; } -/* - * this structure records all encountered refs on the way up to the root - */ -struct prelim_ref { - struct rb_node rbnode; - u64 root_id; - struct btrfs_key key_for_search; - int level; - int count; - struct extent_inode_elem *inode_list; - u64 parent; - u64 wanted_disk_byte; -}; - struct preftree { struct rb_root root; unsigned int count; @@ -212,7 +199,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1, * * Callers should assumed that newref has been freed after calling. 
*/ -static void prelim_ref_insert(struct preftree *preftree, +static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, struct prelim_ref *newref) { struct rb_root *root; @@ -243,6 +231,8 @@ static void prelim_ref_insert(struct preftree *preftree, ref->inode_list = newref->inode_list; else eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); ref->count += newref->count; free_pref(newref); return; @@ -250,6 +240,7 @@ static void prelim_ref_insert(struct preftree *preftree, } preftree->count++; + trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); rb_insert_color(&newref->rbnode, root); } @@ -308,7 +299,8 @@ static void prelim_release(struct preftree *preftree) * additional information that's available but not required to find the parent * block might help in merging entries to gain some speed. */ -static int add_prelim_ref(struct preftree *preftree, u64 root_id, +static int add_prelim_ref(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, u64 root_id, const struct btrfs_key *key, int level, u64 parent, u64 wanted_disk_byte, int count, gfp_t gfp_mask) { @@ -355,21 +347,23 @@ static int add_prelim_ref(struct preftree *preftree, u64 root_id, ref->count = count; ref->parent = parent; ref->wanted_disk_byte = wanted_disk_byte; - prelim_ref_insert(preftree, ref); + prelim_ref_insert(fs_info, preftree, ref); return 0; } /* direct refs use root == 0, key == NULL */ -static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent, +static int add_direct_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, int level, u64 parent, u64 wanted_disk_byte, int count, gfp_t gfp_mask) { - return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent, - wanted_disk_byte, count, gfp_mask); + return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level, + parent, wanted_disk_byte, count, gfp_mask); } /* indirect refs use parent == 0 */ -static int add_indirect_ref(struct preftrees *preftrees, u64 root_id, +static int add_indirect_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, u64 root_id, const struct btrfs_key *key, int level, u64 wanted_disk_byte, int count, gfp_t gfp_mask) { @@ -377,7 +371,7 @@ static int add_indirect_ref(struct preftrees *preftrees, u64 root_id, if (!key) tree = &preftrees->indirect_missing_keys; - return add_prelim_ref(tree, root_id, key, level, 0, + return add_prelim_ref(fs_info, tree, root_id, key, level, 0, wanted_disk_byte, count, gfp_mask); } @@ -631,7 +625,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * and return directly. 
*/ if (err == -ENOENT) { - prelim_ref_insert(&preftrees->direct, ref); + prelim_ref_insert(fs_info, &preftrees->direct, ref); continue; } else if (err) { free_pref(ref); @@ -659,11 +653,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(&preftrees->direct, new_ref); + prelim_ref_insert(fs_info, &preftrees->direct, new_ref); } /* Now it's a direct ref, put it in the the direct tree */ - prelim_ref_insert(&preftrees->direct, ref); + prelim_ref_insert(fs_info, &preftrees->direct, ref); ulist_reinit(parents); } @@ -707,7 +701,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - prelim_ref_insert(&preftrees->indirect, ref); + prelim_ref_insert(fs_info, &preftrees->indirect, ref); } return 0; } @@ -716,7 +710,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, * add all currently queued delayed refs from this head whose seq nr is * smaller or equal that seq to the list */ -static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, +static int add_delayed_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head, u64 seq, struct preftrees *preftrees, u64 *total_refs, u64 inum) { @@ -759,8 +754,9 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_indirect_ref(preftrees, ref->root, &tmp_op_key, - ref->level + 1, node->bytenr, + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &tmp_op_key, ref->level + 1, + node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -771,9 +767,9 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_direct_ref(preftrees, ref->level + 1, - ref->parent, node->bytenr, - node->ref_mod * sgn, + ret = add_direct_ref(fs_info, preftrees, + ref->level + 1, ref->parent, + node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; } @@ -795,8 +791,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, break; } - ret = add_indirect_ref(preftrees, ref->root, &key, 0, - node->bytenr, + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &key, 0, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -807,8 +803,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ref = btrfs_delayed_node_to_data_ref(node); - ret = add_direct_ref(preftrees, 0, ref->parent, - node->bytenr, + ret = add_direct_ref(fs_info, preftrees, 0, + ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -826,7 +822,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, /* * add all inline backrefs for bytenr to the list */ -static int add_inline_refs(struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, int *info_level, struct preftrees *preftrees, u64 *total_refs, u64 inum) { @@ -883,7 +880,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(preftrees, *info_level + 1, offset, + ret = add_direct_ref(fs_info, preftrees, + *info_level + 1, offset, bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { @@ -893,14 +891,14 @@ 
static int add_inline_refs(struct btrfs_path *path, u64 bytenr, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(preftrees, 0, offset, + ret = add_direct_ref(fs_info, preftrees, 0, offset, bytenr, count, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(preftrees, offset, NULL, - *info_level + 1, bytenr, 1, - GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, offset, + NULL, *info_level + 1, + bytenr, 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -921,8 +919,9 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(preftrees, root, &key, 0, bytenr, - count, GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, bytenr, count, + GFP_NOFS); break; } default: @@ -973,9 +972,9 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: /* SHARED DIRECT METADATA backref */ - ret = add_direct_ref(preftrees, info_level + 1, - key.offset, bytenr, 1, - GFP_NOFS); + ret = add_direct_ref(fs_info, preftrees, + info_level + 1, key.offset, + bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -985,15 +984,16 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, sdref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(preftrees, 0, key.offset, bytenr, - count, GFP_NOFS); + ret = add_direct_ref(fs_info, preftrees, 0, + key.offset, bytenr, count, + GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ - ret = add_indirect_ref(preftrees, key.offset, NULL, - info_level + 1, bytenr, 1, - GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, key.offset, + NULL, info_level + 1, bytenr, + 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ @@ -1015,8 +1015,9 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(preftrees, root, &key, 0, bytenr, - count, GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, bytenr, count, + GFP_NOFS); break; } default: @@ -1129,8 +1130,8 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(head, time_seq, &preftrees, - &total_refs, inum); + ret = add_delayed_refs(fs_info, head, time_seq, + &preftrees, &total_refs, inum); mutex_unlock(&head->mutex); if (ret) goto out; @@ -1150,8 +1151,9 @@ again: if (key.objectid == bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(path, bytenr, &info_level, - &preftrees, &total_refs, inum); + ret = add_inline_refs(fs_info, path, bytenr, + &info_level, &preftrees, + &total_refs, inum); if (ret) goto out; ret = add_keyed_refs(fs_info, path, bytenr, info_level, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index f9428aaaa77a..e410335841aa 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -72,4 +72,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); void btrfs_prelim_ref_exit(void); + +struct prelim_ref { + struct rb_node rbnode; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 
wanted_disk_byte; +}; + #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6104b5..58650f2e0f17 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,6 +61,7 @@ #include "tests/btrfs-tests.h" #include "qgroup.h" +#include "backref.h" #define CREATE_TRACE_POINTS #include diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 42560feb9920..90d25085762f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -26,6 +26,7 @@ struct btrfs_work; struct __btrfs_workqueue; struct btrfs_qgroup_extent_record; struct btrfs_qgroup; +struct prelim_ref; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1636,6 +1637,63 @@ TRACE_EVENT(qgroup_meta_reserve, show_root_type(__entry->refroot), __entry->diff) ); +DECLARE_EVENT_CLASS(btrfs__prelim_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct prelim_ref *oldref, + const struct prelim_ref *newref, u64 tree_size), + TP_ARGS(fs_info, newref, oldref, tree_size), + + TP_STRUCT__entry_btrfs( + __field( u64, root_id ) + __field( u64, objectid ) + __field( u8, type ) + __field( u64, offset ) + __field( int, level ) + __field( int, old_count ) + __field( u64, parent ) + __field( u64, bytenr ) + __field( int, mod_count ) + __field( u64, tree_size ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->root_id = oldref->root_id; + __entry->objectid = oldref->key_for_search.objectid; + __entry->type = oldref->key_for_search.type; + __entry->offset = oldref->key_for_search.offset; + __entry->level = oldref->level; + __entry->old_count = oldref->count; + __entry->parent = oldref->parent; + __entry->bytenr = oldref->wanted_disk_byte; + __entry->mod_count = newref ? newref->count : 0; + __entry->tree_size = tree_size; + ), + + TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu", + (unsigned long long)__entry->root_id, + (unsigned long long)__entry->objectid, __entry->type, + (unsigned long long)__entry->offset, __entry->level, + __entry->old_count, __entry->mod_count, + __entry->old_count + __entry->mod_count, + (unsigned long long)__entry->parent, + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->tree_size) +); + +DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct prelim_ref *oldref, + const struct prelim_ref *newref, u64 tree_size), + TP_ARGS(fs_info, oldref, newref, tree_size) +); + +DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct prelim_ref *oldref, + const struct prelim_ref *newref, u64 tree_size), + TP_ARGS(fs_info, oldref, newref, tree_size) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 7bdd6277e0dc2beb4f5db5ea4ff7670ecf0b5879 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 11 Jul 2017 13:25:13 +0300 Subject: btrfs: Remove redundant argument of flush_space All callers of flush_space pass the same number for the orig/num_bytes arguments. Let's remove one of the numbers and also modify the tracepoint to show only a single number: the bytes requested. It seems the last point where the two parameters were treated differently was before the ticketed enospc rework. 
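The effect at the call sites (a sketch of the pattern visible in the hunks below) is that the duplicated value is simply passed once:

	/* Before: the same value passed as both num_bytes and orig_bytes. */
	ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, flush_state);

	/* After: a single bytes-requested argument. */
	ret = flush_space(fs_info, space_info, to_reclaim, flush_state);
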
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 16 +++++++--------- include/trace/events/btrfs.h | 13 +++++-------- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6d04563585e6..288b38ae8791 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4906,7 +4906,7 @@ struct reserve_ticket { static int flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - u64 orig_bytes, int state) + int state) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -4931,7 +4931,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4949,15 +4949,15 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info, - orig_bytes, 0); + num_bytes, 0); break; default: ret = -ENOSPC; break; } - trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, - orig_bytes, state, ret); + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret); return ret; } @@ -5063,8 +5063,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct reserve_ticket *ticket; int ret; - ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, - flush_state); + ret = flush_space(fs_info, space_info, to_reclaim, flush_state); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -5120,8 +5119,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); do { - flush_space(fs_info, space_info, to_reclaim, to_reclaim, - flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 90d25085762f..1e4908dcd065 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1011,15 +1011,14 @@ TRACE_EVENT(btrfs_trigger_flush, TRACE_EVENT(btrfs_flush_space, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes, - u64 orig_bytes, int state, int ret), + int state, int ret), - TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret), + TP_ARGS(fs_info, flags, num_bytes, state, ret), TP_STRUCT__entry( __array( u8, fsid, BTRFS_UUID_SIZE ) __field( u64, flags ) __field( u64, num_bytes ) - __field( u64, orig_bytes ) __field( int, state ) __field( int, ret ) ), @@ -1028,19 +1027,17 @@ TRACE_EVENT(btrfs_flush_space, memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); __entry->flags = flags; __entry->num_bytes = num_bytes; - __entry->orig_bytes = orig_bytes; __entry->state = state; __entry->ret = ret; ), - TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu " - "orig_bytes=%llu ret=%d", __entry->fsid, __entry->state, + TP_printk("%pU: state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d", + __entry->fsid, __entry->state, show_flush_state(__entry->state), (unsigned long long)__entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - (unsigned long long)__entry->num_bytes, - (unsigned long long)__entry->orig_bytes, __entry->ret) + (unsigned long 
long)__entry->num_bytes, __entry->ret) ); DECLARE_EVENT_CLASS(btrfs__reserved_extent, -- cgit v1.2.3 From b19af510e67e6ca696b8721f45c148119437307c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 14 Aug 2017 16:53:16 +0200 Subject: genirq/irq_sim: Add a simple interrupt simulator framework Implement a simple, irq_work-based framework for simulating interrupts. Currently the API exposes routines for initializing and deinitializing the simulator object, enqueueing the interrupts and retrieving the allocated interrupt numbers based on the offset of the dummy interrupt in the simulator struct. Signed-off-by: Bartosz Golaszewski Reviewed-by: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Jonathan Corbet Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-doc@vger.kernel.org Cc: linux-gpio@vger.kernel.org Cc: Bamvor Jian Zhang Cc: Jonathan Cameron Link: http://lkml.kernel.org/r/20170814145318.6495-2-brgl@bgdev.pl Signed-off-by: Thomas Gleixner --- include/linux/irq_sim.h | 41 ++++++++++++++++ kernel/irq/Kconfig | 5 ++ kernel/irq/Makefile | 1 + kernel/irq/irq_sim.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 include/linux/irq_sim.h create mode 100644 kernel/irq/irq_sim.c (limited to 'include') diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h new file mode 100644 index 000000000000..39ce57bfa995 --- /dev/null +++ b/include/linux/irq_sim.h @@ -0,0 +1,41 @@ +#ifndef _LINUX_IRQ_SIM_H +#define _LINUX_IRQ_SIM_H +/* + * Copyright (C) 2017 Bartosz Golaszewski + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/irq_work.h> + +/* + * Provides a framework for allocating simulated interrupts which can be + * requested like normal irqs and enqueued from process context.
+ */ + +struct irq_sim_work_ctx { + struct irq_work work; + int irq; +}; + +struct irq_sim_irq_ctx { + int irqnum; + bool enabled; +}; + +struct irq_sim { + struct irq_sim_work_ctx work_ctx; + int irq_base; + unsigned int irq_count; + struct irq_sim_irq_ctx *irqs; +}; + +int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs); +void irq_sim_fini(struct irq_sim *sim); +void irq_sim_fire(struct irq_sim *sim, unsigned int offset); +int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset); + +#endif /* _LINUX_IRQ_SIM_H */ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 27c4e774071c..1d06af787932 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -63,6 +63,11 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +# Support for simulated interrupts +config IRQ_SIM + bool + select IRQ_WORK + # Support for hierarchical irq domains config IRQ_DOMAIN_HIERARCHY bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index e4aef7351f2b..1970cafe8f2a 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_IRQ_TIMINGS) += timings.o obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o +obj-$(CONFIG_IRQ_SIM) += irq_sim.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c new file mode 100644 index 000000000000..31a2c12a79ae --- /dev/null +++ b/kernel/irq/irq_sim.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2017 Bartosz Golaszewski + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include + +static void irq_sim_irqmask(struct irq_data *data) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + + irq_ctx->enabled = false; +} + +static void irq_sim_irqunmask(struct irq_data *data) +{ + struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); + + irq_ctx->enabled = true; +} + +static struct irq_chip irq_sim_irqchip = { + .name = "irq_sim", + .irq_mask = irq_sim_irqmask, + .irq_unmask = irq_sim_irqunmask, +}; + +static void irq_sim_handle_irq(struct irq_work *work) +{ + struct irq_sim_work_ctx *work_ctx; + + work_ctx = container_of(work, struct irq_sim_work_ctx, work); + handle_simple_irq(irq_to_desc(work_ctx->irq)); +} + +/** + * irq_sim_init - Initialize the interrupt simulator: allocate a range of + * dummy interrupts. + * + * @sim: The interrupt simulator object to initialize. + * @num_irqs: Number of interrupts to allocate + * + * Returns 0 on success and a negative error number on failure. 
+ */ +int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) +{ + int i; + + sim->irqs = kmalloc_array(num_irqs, sizeof(*sim->irqs), GFP_KERNEL); + if (!sim->irqs) + return -ENOMEM; + + sim->irq_base = irq_alloc_descs(-1, 0, num_irqs, 0); + if (sim->irq_base < 0) { + kfree(sim->irqs); + return sim->irq_base; + } + + for (i = 0; i < num_irqs; i++) { + sim->irqs[i].irqnum = sim->irq_base + i; + sim->irqs[i].enabled = false; + irq_set_chip(sim->irq_base + i, &irq_sim_irqchip); + irq_set_chip_data(sim->irq_base + i, &sim->irqs[i]); + irq_set_handler(sim->irq_base + i, &handle_simple_irq); + irq_modify_status(sim->irq_base + i, + IRQ_NOREQUEST | IRQ_NOAUTOEN, IRQ_NOPROBE); + } + + init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); + sim->irq_count = num_irqs; + + return 0; +} +EXPORT_SYMBOL_GPL(irq_sim_init); + +/** + * irq_sim_fini - Deinitialize the interrupt simulator: free the interrupt + * descriptors and allocated memory. + * + * @sim: The interrupt simulator to tear down. + */ +void irq_sim_fini(struct irq_sim *sim) +{ + irq_work_sync(&sim->work_ctx.work); + irq_free_descs(sim->irq_base, sim->irq_count); + kfree(sim->irqs); +} +EXPORT_SYMBOL_GPL(irq_sim_fini); + +/** + * irq_sim_fire - Enqueue an interrupt. + * + * @sim: The interrupt simulator object. + * @offset: Offset of the simulated interrupt which should be fired. + */ +void irq_sim_fire(struct irq_sim *sim, unsigned int offset) +{ + if (sim->irqs[offset].enabled) { + sim->work_ctx.irq = irq_sim_irqnum(sim, offset); + irq_work_queue(&sim->work_ctx.work); + } +} +EXPORT_SYMBOL_GPL(irq_sim_fire); + +/** + * irq_sim_irqnum - Get the allocated number of a dummy interrupt. + * + * @sim: The interrupt simulator object. + * @offset: Offset of the simulated interrupt for which to retrieve + * the number. + */ +int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset) +{ + return sim->irqs[offset].irqnum; +} +EXPORT_SYMBOL_GPL(irq_sim_irqnum); -- cgit v1.2.3 From 44e72c7ebf294043cfe276f7328b8c0e6a3e50e9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 14 Aug 2017 16:53:17 +0200 Subject: genirq/irq_sim: Add a devres variant of irq_sim_init() Add a resource managed version of irq_sim_init(). This can be conveniently used in device drivers. 
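A minimal usage sketch (the driver and its private structure are hypothetical; only the irq_sim calls come from this series):

	struct foo {
		struct irq_sim sim;
		int irq;
	};

	static int foo_probe(struct device *dev, struct foo *priv)
	{
		/* Torn down automatically by devres on driver unbind. */
		int rv = devm_irq_sim_init(dev, &priv->sim, 4);

		if (rv)
			return rv;

		/* Hand the mapped number of dummy irq 0 to consumers. */
		priv->irq = irq_sim_irqnum(&priv->sim, 0);
		return 0;
	}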
Signed-off-by: Bartosz Golaszewski Acked-by: Jonathan Cameron Cc: Lars-Peter Clausen Cc: Jonathan Corbet Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-doc@vger.kernel.org Cc: linux-gpio@vger.kernel.org Cc: Bamvor Jian Zhang Cc: Jonathan Cameron Link: http://lkml.kernel.org/r/20170814145318.6495-3-brgl@bgdev.pl Signed-off-by: Thomas Gleixner --- Documentation/driver-model/devres.txt | 1 + include/linux/irq_sim.h | 3 +++ kernel/irq/irq_sim.c | 43 +++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 30e04f7a690d..69f08c0f23a8 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -312,6 +312,7 @@ IRQ devm_irq_alloc_descs_from() devm_irq_alloc_generic_chip() devm_irq_setup_generic_chip() + devm_irq_sim_init() LED devm_led_classdev_register() diff --git a/include/linux/irq_sim.h b/include/linux/irq_sim.h index 39ce57bfa995..0380d899b955 100644 --- a/include/linux/irq_sim.h +++ b/include/linux/irq_sim.h @@ -10,6 +10,7 @@ */ #include <linux/irq_work.h> +#include <linux/device.h> /* * Provides a framework for allocating simulated interrupts which can be @@ -34,6 +35,8 @@ struct irq_sim { }; int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs); +int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, + unsigned int num_irqs); void irq_sim_fini(struct irq_sim *sim); void irq_sim_fire(struct irq_sim *sim, unsigned int offset); int irq_sim_irqnum(struct irq_sim *sim, unsigned int offset); diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 31a2c12a79ae..24caabf1a0f7 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -10,6 +10,10 @@ #include #include +struct irq_sim_devres { + struct irq_sim *sim; +}; + static void irq_sim_irqmask(struct irq_data *data) { struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data); @@ -92,6 +96,45 @@ void irq_sim_fini(struct irq_sim *sim) } EXPORT_SYMBOL_GPL(irq_sim_fini); +static void devm_irq_sim_release(struct device *dev, void *res) +{ + struct irq_sim_devres *this = res; + + irq_sim_fini(this->sim); +} + +/** + * devm_irq_sim_init - Initialize the interrupt simulator for a managed device. + * + * @dev: Device to initialize the simulator object for. + * @sim: The interrupt simulator object to initialize. + * @num_irqs: Number of interrupts to allocate + * + * Returns 0 on success and a negative error number on failure. + */ +int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, + unsigned int num_irqs) +{ + struct irq_sim_devres *dr; + int rv; + + dr = devres_alloc(devm_irq_sim_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rv = irq_sim_init(sim, num_irqs); + if (rv) { + devres_free(dr); + return rv; + } + + dr->sim = sim; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL_GPL(devm_irq_sim_init); + /** * irq_sim_fire - Enqueue an interrupt. * -- cgit v1.2.3 From 6f3d791f300618caf82a2be0c27456edd76d5164 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 11 Aug 2017 10:03:59 -0700 Subject: Drivers: hv: vmbus: Fix rescind handling issues This patch handles the following issues, observed when a channel offer message and a rescind message race for the same offer: 1. Since the host does not respond to messages on a rescinded channel, in the current code, we could be indefinitely blocked on the vmbus_open() call. 2.
When a rescinded channel is being closed, if there is a pending interrupt on the channel, we could end up freeing the channel that the interrupt handler would run on. Signed-off-by: K. Y. Srinivasan Reviewed-by: Dexuan Cui Tested-by: Dexuan Cui Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 14 ++++++++++++++ drivers/hv/channel_mgmt.c | 29 ++++++++++++++++++++++++++--- drivers/hv/vmbus_drv.c | 3 +++ include/linux/hyperv.h | 2 ++ 4 files changed, 45 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index e57cc40cb768..63ac1c6a825f 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -177,6 +177,11 @@ int vmbus_open(struct vmbus_channel *newchannel, u32 send_ringbuffer_size, &vmbus_connection.chn_msg_list); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + if (newchannel->rescind) { + err = -ENODEV; + goto error_free_gpadl; + } + ret = vmbus_post_msg(open_msg, sizeof(struct vmbus_channel_open_channel), true); @@ -421,6 +426,11 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + if (channel->rescind) { + ret = -ENODEV; + goto cleanup; + } + ret = vmbus_post_msg(gpadlmsg, msginfo->msgsize - sizeof(*msginfo), true); if (ret != 0) @@ -494,6 +504,10 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle) list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + if (channel->rescind) + goto post_msg_err; + ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_gpadl_teardown), true); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 4bbb8dea4727..968af173c4c1 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -451,6 +451,12 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) /* Make sure this is a new offer */ mutex_lock(&vmbus_connection.channel_mutex); + /* + * Now that we have acquired the channel_mutex, + * we can release the potentially racing rescind thread. + */ + atomic_dec(&vmbus_connection.offer_in_progress); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { if (!uuid_le_cmp(channel->offermsg.offer.if_type, newchannel->offermsg.offer.if_type) && @@ -481,7 +487,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) channel->num_sc++; spin_unlock_irqrestore(&channel->lock, flags); } else { - atomic_dec(&vmbus_connection.offer_in_progress); goto err_free_chan; } } @@ -510,7 +515,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) if (!fnew) { if (channel->sc_creation_callback != NULL) channel->sc_creation_callback(newchannel); - atomic_dec(&vmbus_connection.offer_in_progress); return; } @@ -541,7 +545,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) goto err_deq_chan; } - atomic_dec(&vmbus_connection.offer_in_progress); + newchannel->probe_done = true; return; err_deq_chan: @@ -882,8 +886,27 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) channel->rescind = true; spin_unlock_irqrestore(&channel->lock, flags); + /* + * Now that we have posted the rescind state, perform + * rescind related cleanup. + */ vmbus_rescind_cleanup(channel); + /* + * Now wait for offer handling to complete. + */ + while (READ_ONCE(channel->probe_done) == false) { + /* + * We wait here until any channel offer is currently + * being processed. 
+ */ + msleep(1); + } + + /* + * At this point, the rescind handling can proceed safely. + */ + if (channel->device_obj) { if (channel->chn_rescind_callback) { channel->chn_rescind_callback(channel); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index ed84e96715a0..43160a2eafe0 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -940,6 +940,9 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (channel->offermsg.child_relid != relid) continue; + if (channel->rescind) + continue; + switch (channel->callback_mode) { case HV_CALL_ISR: vmbus_channel_isr(channel); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 27db4e650b8c..07650d0232cc 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -879,6 +879,8 @@ struct vmbus_channel { */ enum hv_numa_policy affinity_policy; + bool probe_done; + }; static inline bool is_hvsock_channel(const struct vmbus_channel *c) -- cgit v1.2.3 From 0d58e6c1b19b30623b5f0a053818bd2c32d61166 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Tue, 15 Aug 2017 12:02:17 -0700 Subject: PCI: Add pci_irqd_intx_xlate() Legacy PCI INTx interrupts are represented in the PCI_INTERRUPT_PIN register using the range 1-4, which matches our enum pci_interrupt_pin. This is however not ideal for an IRQ domain, where with 4 interrupts we would ideally have a domain of size 4 & hwirq numbers in the range 0-3. Different PCI host controller drivers have handled this in different ways. Of those under drivers/pci/ which register an INTx IRQ domain, we have: - pcie-altera uses the range 1-4 in device trees and an IRQ domain of size 5 to cover that range, with entry 0 wasted. - pcie-xilinx & pcie-xilinx-nwl use the range 1-4 in device trees but register an IRQ domain of size 4, which doesn't cover the hwirq=4/INTD case leading to that interrupt being broken. - pci-ftpci100 & pci-aardvark use the range 0-3 in both device trees & as hwirq numbering in the driver & IRQ domain. In order to introduce some level of consistency in at least the hwirq numbering used by the drivers & IRQ domains, this patch introduces a new pci_irqd_intx_xlate() helper function which drivers using the 1-4 range in device trees can assign as the xlate callback for their INTx IRQ domain. This translates the 1-4 range into a 0-3 range, allowing us to use an IRQ domain of size 4 & avoid a wasted entry. Further patches will make use of this in drivers to allow them to use an IRQ domain of size 4 for legacy INTx interrupts without breaking INTD. 
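A sketch of the intended use in a host controller driver (the ops structure, map callback and port structure are hypothetical; only pci_irqd_intx_xlate() is added by this patch):

	static const struct irq_domain_ops intx_domain_ops = {
		.map = foo_pcie_intx_map,	/* hypothetical per-driver callback */
		.xlate = pci_irqd_intx_xlate,	/* DT INTA..INTD (1-4) -> hwirq 0-3 */
	};

	/* In probe: a domain of size 4, with no wasted entry at index 0. */
	port->intx_domain = irq_domain_add_linear(node, 4, &intx_domain_ops,
						  port);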
Signed-off-by: Paul Burton Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index bb9c367c85f0..bbc2a991b63f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1416,6 +1416,38 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, NULL); } +/** + * pci_irqd_intx_xlate() - Translate PCI INTx value to an IRQ domain hwirq + * @d: the INTx IRQ domain + * @node: the DT node for the device whose interrupt we're translating + * @intspec: the interrupt specifier data from the DT + * @intsize: the number of entries in @intspec + * @out_hwirq: pointer at which to write the hwirq number + * @out_type: pointer at which to write the interrupt type + * + * Translate a PCI INTx interrupt number from device tree in the range 1-4, as + * stored in the standard PCI_INTERRUPT_PIN register, to a value in the range + * 0-3 suitable for use in a 4 entry IRQ domain. That is, subtract one from the + * INTx value to obtain the hwirq number. + * + * Returns 0 on success, or -EINVAL if the interrupt specifier is out of range. + */ +static inline int pci_irqd_intx_xlate(struct irq_domain *d, + struct device_node *node, + const u32 *intspec, + unsigned int intsize, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + const u32 intx = intspec[0]; + + if (intx < PCI_INTERRUPT_INTA || intx > PCI_INTERRUPT_INTD) + return -EINVAL; + + *out_hwirq = intx - PCI_INTERRUPT_INTA; + return 0; +} + #ifdef CONFIG_PCIEPORTBUS extern bool pcie_ports_disabled; extern bool pcie_ports_auto; -- cgit v1.2.3 From b005fd189cec9407b700599e1e80e0552446ee79 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:31:58 -0700 Subject: bpf: introduce new program type for skbs on sockets A class of programs, run from strparser and soon from a new map type called sock map, are used with skb as the context but on established sockets. By creating a specific program type for these we can use bpf helpers that expect full sockets and get the verifier to ensure these helpers are not used out of context. The new type is BPF_PROG_TYPE_SK_SKB. This patch introduces the infrastructure and type. Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + net/core/filter.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b1e1035ca24b..4b72db30dacf 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -11,6 +11,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 91da8371a2d0..2e796e384aeb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, }; enum bpf_attach_type { diff --git a/net/core/filter.c b/net/core/filter.c index e0688a855c47..46321033ae0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3234,6 +3234,20 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id) { @@ -3525,6 +3539,22 @@ static bool sock_ops_is_valid_access(int off, int size, return __is_valid_sock_ops_access(off, size); } +static bool sk_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -3994,6 +4024,12 @@ const struct bpf_verifier_ops sock_ops_prog_ops = { .convert_ctx_access = sock_ops_convert_ctx_access, }; +const struct bpf_verifier_ops sk_skb_prog_ops = { + .get_func_proto = sk_skb_func_proto, + .is_valid_access = sk_skb_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From a6f6df69c48b86cd84f36c70593eb4968fceb34a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:32:22 -0700 Subject: bpf: export bpf_prog_inc_not_zero bpf_prog_inc_not_zero will be used by upcoming sockmap patches. This patch simply exports it so we can pull it in. Signed-off-by: John Fastabend Signed-off-by: David S.
Miller --- include/linux/bpf.h | 7 +++++++ kernel/bpf/syscall.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39229c455cba..d6e1de8ce0fc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -252,6 +252,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); +struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); @@ -344,6 +345,12 @@ static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) return ERR_PTR(-EOPNOTSUPP); } +static inline struct bpf_prog *__must_check +bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) { return 0; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fbe09a0cccf4..17e29f596de1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -911,7 +911,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) EXPORT_SYMBOL_GPL(bpf_prog_inc); /* prog_idr_lock should have been held */ -static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; @@ -927,6 +927,7 @@ static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.2.3 From 174a79ff9515f400b9a6115643dafd62a635b7e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:32:47 -0700 Subject: bpf: sockmap with sk redirect support Recently we added a new map type called dev map used to forward XDP packets between ports (6093ec2dc313). This patches introduces a similar notion for sockets. A sockmap allows users to add participating sockets to a map. When sockets are added to the map enough context is stored with the map entry to use the entry with a new helper bpf_sk_redirect_map(map, key, flags) This helper (analogous to bpf_redirect_map in XDP) is given the map and an entry in the map. When called from a sockmap program, discussed below, the skb will be sent on the socket using skb_send_sock(). With the above we need a bpf program to call the helper from that will then implement the send logic. The initial site implemented in this series is the recv_sock hook. For this to work we implemented a map attach command to add attributes to a map. In sockmap we add two programs a parse program and a verdict program. The parse program uses strparser to build messages and pass them to the verdict program. The parse programs use the normal strparser semantics. The verdict program is of type SK_SKB. The verdict program returns a verdict SK_DROP, or SK_REDIRECT for now. Additional actions may be added later. When SK_REDIRECT is returned, expected when bpf program uses bpf_sk_redirect_map(), the sockmap logic will consult per cpu variables set by the helper routine and pull the sock entry out of the sock map. This pattern follows the existing redirect logic in cls and xdp programs. 
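As a concrete sketch, a verdict program might look as follows (section names and the map definition follow the usual sample conventions and are illustrative, not part of this patch):

	struct bpf_map_def SEC("maps") sock_map = {
		.type = BPF_MAP_TYPE_SOCKMAP,
		.key_size = sizeof(int),
		.value_size = sizeof(int),
		.max_entries = 20,
	};

	SEC("sk_skb")
	int verdict_prog(struct __sk_buff *skb)
	{
		/* Send every message to the sock at key 0; empty slots drop. */
		return bpf_sk_redirect_map(&sock_map, 0, 0);
	}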
This gives the flow, recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock \ -> kfree_skb As an example use case a message based load balancer may use specific logic in the verdict program to select the sock to send on. Sample programs are provided in future patches that hopefully illustrate the user interfaces. Also selftests are in follow-on patches. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 +- include/linux/bpf_types.h | 1 + include/linux/filter.h | 2 + include/uapi/linux/bpf.h | 33 +- kernel/bpf/Makefile | 2 +- kernel/bpf/sockmap.c | 792 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 51 ++- kernel/bpf/verifier.c | 14 + net/core/filter.c | 43 +++ 9 files changed, 940 insertions(+), 5 deletions(-) create mode 100644 kernel/bpf/sockmap.c (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d6e1de8ce0fc..a4145e9c74b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include struct perf_event; +struct bpf_prog; struct bpf_map; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -37,6 +38,8 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); + int (*map_attach)(struct bpf_map *map, + struct bpf_prog *p1, struct bpf_prog *p2); }; struct bpf_map { @@ -138,8 +141,6 @@ enum bpf_reg_type { PTR_TO_PACKET_END, /* skb->data + headlen */ }; -struct bpf_prog; - /* The information passed from prog-specific *_is_valid_access * back to the verifier. */ @@ -312,6 +313,7 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); @@ -391,6 +393,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; +extern const struct bpf_func_proto bpf_sock_map_update_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 4b72db30dacf..fa805074d168 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -38,4 +38,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index d19ed3c15e1e..7015116331af 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -727,6 +727,8 @@ void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_redirect(u32 ifindex); +struct sock *do_sk_redirect_map(void); + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e796e384aeb..7f774769e3f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, }; enum bpf_prog_type { @@ -135,11 +136,15 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, + BPF_CGROUP_SMAP_INGRESS, __MAX_BPF_ATTACH_TYPE }; #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */ +#define BPF_SOCKMAP_STRPARSER (1U << 0) + /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -211,6 +216,7 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -557,6 +563,23 @@ union bpf_attr { * @mode: operation mode (enum bpf_adj_room_mode) * @flags: reserved for future use * Return: 0 on success or negative error code + * + * int bpf_sk_redirect_map(map, key, flags) + * Redirect skb to a sock in map using key as a lookup key for the + * sock in map. 
+ * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_REDIRECT + * + * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * @skops: pointer to bpf_sock_ops + * @map: pointer to sockmap to update + * @key: key to insert/update sock in map + * @flags: same flags as map update elem + * @map_flags: sock map specific flags + * bit 1: Enable strparser + * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -610,7 +633,9 @@ union bpf_attr { FN(set_hash), \ FN(setsockopt), \ FN(skb_adjust_room), \ - FN(redirect_map), + FN(redirect_map), \ + FN(sk_redirect_map), \ + FN(sock_map_update), \ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -747,6 +772,12 @@ struct xdp_md { __u32 data_end; }; +enum sk_action { + SK_ABORTED = 0, + SK_DROP, + SK_REDIRECT, +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 2f0bcda40e90..aa24287db888 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) -obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c new file mode 100644 index 000000000000..792f0addfafa --- /dev/null +++ b/kernel/bpf/sockmap.c @@ -0,0 +1,792 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* A BPF sock_map is used to store sock objects. This is primarly used + * for doing socket redirect with BPF helper routines. + * + * A sock map may have two BPF programs attached to it, a program used + * to parse packets and a program to provide a verdict and redirect + * decision on the packet. If no BPF parse program is provided it is + * assumed that every skb is a "message" (skb->len). Otherwise the + * parse program is attached to strparser and used to build messages + * that may span multiple skbs. The verdict program will either select + * a socket to send/receive the skb on or provide the drop code indicating + * the skb should be dropped. More actions may be added later as needed. + * The default program will drop packets. + * + * For reference this program is similar to devmap used in XDP context + * reviewing these together may be useful. For an example please review + * ./samples/bpf/sockmap/. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_prog *bpf_parse; + struct bpf_prog *bpf_verdict; + refcount_t refcnt; +}; + +enum smap_psock_state { + SMAP_TX_RUNNING, +}; + +struct smap_psock { + struct rcu_head rcu; + + /* datapath variables */ + struct sk_buff_head rxqueue; + bool strp_enabled; + + /* datapath error path cache across tx work invocations */ + int save_rem; + int save_off; + struct sk_buff *save_skb; + + struct strparser strp; + struct bpf_prog *bpf_parse; + struct bpf_prog *bpf_verdict; + struct bpf_stab *stab; + + /* Back reference used when sock callback trigger sockmap operations */ + int key; + struct sock *sock; + unsigned long state; + + struct work_struct tx_work; + struct work_struct gc_work; + + void (*save_data_ready)(struct sock *sk); + void (*save_write_space)(struct sock *sk); + void (*save_state_change)(struct sock *sk); +}; + +static inline struct smap_psock *smap_psock_sk(const struct sock *sk) +{ + return (struct smap_psock *)rcu_dereference_sk_user_data(sk); +} + +static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); + int rc; + + if (unlikely(!prog)) + return SK_DROP; + + skb_orphan(skb); + skb->sk = psock->sock; + bpf_compute_data_end(skb); + rc = (*prog->bpf_func)(skb, prog->insnsi); + skb->sk = NULL; + + return rc; +} + +static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sock; + int rc; + + /* Because we use per cpu values to feed input from sock redirect + * in BPF program to do_sk_redirect_map() call we need to ensure we + * are not preempted. RCU read lock is not sufficient in this case + * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. + */ + preempt_disable(); + rc = smap_verdict_func(psock, skb); + switch (rc) { + case SK_REDIRECT: + sock = do_sk_redirect_map(); + preempt_enable(); + if (likely(sock)) { + struct smap_psock *peer = smap_psock_sk(sock); + + if (likely(peer && + test_bit(SMAP_TX_RUNNING, &peer->state) && + sk_stream_memory_free(peer->sock))) { + peer->sock->sk_wmem_queued += skb->truesize; + sk_mem_charge(peer->sock, skb->truesize); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } + } + /* Fall through and free skb otherwise */ + case SK_DROP: + default: + preempt_enable(); + kfree_skb(skb); + } +} + +static void smap_report_sk_error(struct smap_psock *psock, int err) +{ + struct sock *sk = psock->sock; + + sk->sk_err = err; + sk->sk_error_report(sk); +} + +static void smap_release_sock(struct sock *sock); + +/* Called with lock_sock(sk) held */ +static void smap_state_change(struct sock *sk) +{ + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + + /* Allowing transitions into an established syn_recv states allows + * for early binding sockets to a smap object before the connection + * is established. + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + break; + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_LISTEN: + break; + case TCP_CLOSE: + /* Only release if the map entry is in fact the sock in + * question. There is a case where the operator deletes + * the sock from the map, but the TCP sock is closed before + * the psock is detached. Use cmpxchg to verify correct + * sock is removed. 
+ */ + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + break; + osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL); + if (osk == sk) + smap_release_sock(sk); + break; + default: + smap_report_sk_error(psock, EPIPE); + break; + } + rcu_read_unlock(); +} + +static void smap_read_sock_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + smap_do_verdict(psock, skb); + rcu_read_unlock(); +} + +/* Called with lock held on socket */ +static void smap_data_ready(struct sock *sk) +{ + struct smap_psock *psock; + + write_lock_bh(&sk->sk_callback_lock); + psock = smap_psock_sk(sk); + if (likely(psock)) + strp_data_ready(&psock->strp); + write_unlock_bh(&sk->sk_callback_lock); +} + +static void smap_tx_work(struct work_struct *w) +{ + struct smap_psock *psock; + struct sk_buff *skb; + int rem, off, n; + + psock = container_of(w, struct smap_psock, tx_work); + + /* lock sock to avoid losing sk_socket at some point during loop */ + lock_sock(psock->sock); + if (psock->save_skb) { + skb = psock->save_skb; + rem = psock->save_rem; + off = psock->save_off; + psock->save_skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->rxqueue))) { + rem = skb->len; + off = 0; +start: + do { + if (likely(psock->sock->sk_socket)) + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + else + n = -EINVAL; + if (n <= 0) { + if (n == -EAGAIN) { + /* Retry when space is available */ + psock->save_skb = skb; + psock->save_rem = rem; + psock->save_off = off; + goto out; + } + /* Hard errors break pipe and stop xmit */ + smap_report_sk_error(psock, n ? -n : EPIPE); + clear_bit(SMAP_TX_RUNNING, &psock->state); + sk_mem_uncharge(psock->sock, skb->truesize); + psock->sock->sk_wmem_queued -= skb->truesize; + kfree_skb(skb); + goto out; + } + rem -= n; + off += n; + } while (rem); + sk_mem_uncharge(psock->sock, skb->truesize); + psock->sock->sk_wmem_queued -= skb->truesize; + kfree_skb(skb); + } +out: + release_sock(psock->sock); +} + +static void smap_write_space(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) + schedule_work(&psock->tx_work); + rcu_read_unlock(); +} + +static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + if (!psock->strp_enabled) + goto out; + sk->sk_data_ready = psock->save_data_ready; + sk->sk_write_space = psock->save_write_space; + sk->sk_state_change = psock->save_state_change; + psock->save_data_ready = NULL; + psock->save_write_space = NULL; + psock->save_state_change = NULL; + strp_stop(&psock->strp); + psock->strp_enabled = false; +out: + write_unlock_bh(&sk->sk_callback_lock); +} + +static void smap_destroy_psock(struct rcu_head *rcu) +{ + struct smap_psock *psock = container_of(rcu, + struct smap_psock, rcu); + + /* Now that a grace period has passed there is no longer + * any reference to this sock in the sockmap so we can + * destroy the psock, strparser, and bpf programs. 
But, + * because we use workqueue sync operations we can not + * do it in rcu context + */ + schedule_work(&psock->gc_work); +} + +static void smap_release_sock(struct sock *sock) +{ + struct smap_psock *psock = smap_psock_sk(sock); + + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); +} + +static int smap_parse_func_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + struct bpf_prog *prog; + int rc; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + prog = READ_ONCE(psock->bpf_parse); + + if (unlikely(!prog)) { + rcu_read_unlock(); + return skb->len; + } + + /* Attach socket for bpf program to use if needed we can do this + * because strparser clones the skb before handing it to a upper + * layer, meaning skb_orphan has been called. We NULL sk on the + * way out to ensure we don't trigger a BUG_ON in skb/sk operations + * later and because we are not charging the memory of this skb to + * any socket yet. + */ + skb->sk = psock->sock; + bpf_compute_data_end(skb); + rc = (*prog->bpf_func)(skb, prog->insnsi); + skb->sk = NULL; + rcu_read_unlock(); + return rc; +} + + +static int smap_read_sock_done(struct strparser *strp, int err) +{ + return err; +} + +static int smap_init_sock(struct smap_psock *psock, + struct sock *sk) +{ + struct strp_callbacks cb; + + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = smap_read_sock_strparser; + cb.parse_msg = smap_parse_func_strparser; + cb.read_sock_done = smap_read_sock_done; + return strp_init(&psock->strp, sk, &cb); +} + +static void smap_init_progs(struct smap_psock *psock, + struct bpf_stab *stab, + struct bpf_prog *verdict, + struct bpf_prog *parse) +{ + struct bpf_prog *orig_parse, *orig_verdict; + + orig_parse = xchg(&psock->bpf_parse, parse); + orig_verdict = xchg(&psock->bpf_verdict, verdict); + + if (orig_verdict) + bpf_prog_put(orig_verdict); + if (orig_parse) + bpf_prog_put(orig_parse); +} + +static void smap_start_sock(struct smap_psock *psock, struct sock *sk) +{ + if (sk->sk_data_ready == smap_data_ready) + return; + psock->save_data_ready = sk->sk_data_ready; + psock->save_write_space = sk->sk_write_space; + psock->save_state_change = sk->sk_state_change; + sk->sk_data_ready = smap_data_ready; + sk->sk_write_space = smap_write_space; + sk->sk_state_change = smap_state_change; + psock->strp_enabled = true; +} + +static void sock_map_remove_complete(struct bpf_stab *stab) +{ + bpf_map_area_free(stab->sock_map); + kfree(stab); +} + +static void smap_gc_work(struct work_struct *w) +{ + struct smap_psock *psock; + + psock = container_of(w, struct smap_psock, gc_work); + + /* no callback lock needed because we already detached sockmap ops */ + if (psock->strp_enabled) + strp_done(&psock->strp); + + cancel_work_sync(&psock->tx_work); + __skb_queue_purge(&psock->rxqueue); + + /* At this point all strparser and xmit work must be complete */ + if (psock->bpf_parse) + bpf_prog_put(psock->bpf_parse); + if (psock->bpf_verdict) + bpf_prog_put(psock->bpf_verdict); + + if (refcount_dec_and_test(&psock->stab->refcnt)) + sock_map_remove_complete(psock->stab); + + sock_put(psock->sock); + kfree(psock); +} + +static struct smap_psock *smap_init_psock(struct sock *sock, + struct bpf_stab *stab) +{ + struct smap_psock *psock; + + psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN); + if (!psock) + return ERR_PTR(-ENOMEM); + + psock->sock = sock; + 
skb_queue_head_init(&psock->rxqueue); + INIT_WORK(&psock->tx_work, smap_tx_work); + INIT_WORK(&psock->gc_work, smap_gc_work); + + rcu_assign_sk_user_data(sock, psock); + sock_hold(sock); + return psock; +} + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + int err = -EINVAL; + u64 cost; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + stab->map.map_type = attr->map_type; + stab->map.key_size = attr->key_size; + stab->map.value_size = attr->value_size; + stab->map.max_entries = attr->max_entries; + stab->map.map_flags = attr->map_flags; + + /* make sure page count doesn't overflow */ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_stab; + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *)); + if (!stab->sock_map) + goto free_stab; + + refcount_set(&stab->refcnt, 1); + return &stab->map; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock *sock; + + sock = xchg(&stab->sock_map[i], NULL); + if (!sock) + continue; + + smap_release_sock(sock); + } + rcu_read_unlock(); + + if (stab->bpf_verdict) + bpf_prog_put(stab->bpf_verdict); + if (stab->bpf_parse) + bpf_prog_put(stab->bpf_parse); + + if (refcount_dec_and_test(&stab->refcnt)) + sock_map_remove_complete(stab); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (i >= stab->map.max_entries) { + *next = 0; + return 0; + } + + if (i == stab->map.max_entries - 1) + return -ENOENT; + + *next = i + 1; + return 0; +} + +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(stab->sock_map[key]); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int k = *(u32 *)key; + struct sock *sock; + + if (k >= map->max_entries) + return -EINVAL; + + sock = xchg(&stab->sock_map[k], NULL); + if (!sock) + return -EINVAL; + + smap_release_sock(sock); + return 0; +} + +/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are + * done inside rcu critical sections. This ensures on updates that the psock + * will not be released via smap_release_sock() until concurrent updates/deletes + * complete. All operations operate on sock_map using cmpxchg and xchg + * operations to ensure we do not get stale references. Any reads into the + * map must be done with READ_ONCE() because of this. + * + * A psock is destroyed via call_rcu and after any worker threads are cancelled + * and syncd so we are certain all references from the update/lookup/delete + * operations as well as references in the data path are no longer in use. + * + * A psock object holds a refcnt on the sockmap it is attached to and this is + * not decremented until after a RCU grace period and garbage collection occurs. + * This ensures the map is not free'd until psocks linked to it are removed. The + * map link is used when the independent sock events trigger map deletion. + * + * Psocks may only participate in one sockmap at a time. Users that try to + * join a single sock to multiple maps will get an error. + * + * Last, but not least, it is possible the socket is closed while running + * an update on an existing psock. This will release the psock, but again + * not until the update has completed due to rcu grace period rules. + */ +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags, u64 map_flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *verdict, *parse; + struct smap_psock *psock = NULL; + struct sock *old_sock, *sock; + u32 i = *(u32 *)key; + bool update = false; + int err = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER)) + return -EINVAL; + + verdict = parse = NULL; + sock = READ_ONCE(stab->sock_map[i]); + + if (flags == BPF_EXIST || flags == BPF_ANY) { + if (!sock && flags == BPF_EXIST) { + return -ENOENT; + } else if (sock && sock != skops->sk) { + return -EINVAL; + } else if (sock) { + psock = smap_psock_sk(sock); + if (unlikely(!psock)) + return -EBUSY; + update = true; + } + } else if (sock && BPF_NOEXIST) { + return -EEXIST; + } + + /* reserve BPF programs early so can abort easily on failures */ + if (map_flags & BPF_SOCKMAP_STRPARSER) { + verdict = READ_ONCE(stab->bpf_verdict); + parse = READ_ONCE(stab->bpf_parse); + + if (!verdict || !parse) + return -ENOENT; + + /* bpf prog refcnt may be zero if a concurrent attach operation + * removes the program after the above READ_ONCE() but before + * we increment the refcnt. 
If this is the case abort with an + * error. + */ + verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + if (IS_ERR(verdict)) + return PTR_ERR(verdict); + + parse = bpf_prog_inc_not_zero(stab->bpf_parse); + if (IS_ERR(parse)) { + bpf_prog_put(verdict); + return PTR_ERR(parse); + } + } + + if (!psock) { + sock = skops->sk; + if (rcu_dereference_sk_user_data(sock)) + return -EEXIST; + psock = smap_init_psock(sock, stab); + if (IS_ERR(psock)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(psock); + } + psock->key = i; + psock->stab = stab; + refcount_inc(&stab->refcnt); + set_bit(SMAP_TX_RUNNING, &psock->state); + } + + if (map_flags & BPF_SOCKMAP_STRPARSER) { + write_lock_bh(&sock->sk_callback_lock); + if (psock->strp_enabled) + goto start_done; + err = smap_init_sock(psock, sock); + if (err) + goto out; + smap_init_progs(psock, stab, verdict, parse); + smap_start_sock(psock, sock); +start_done: + write_unlock_bh(&sock->sk_callback_lock); + } else if (update) { + smap_stop_sock(psock, sock); + } + + if (!update) { + old_sock = xchg(&stab->sock_map[i], skops->sk); + if (old_sock) + smap_release_sock(old_sock); + } + + return 0; +out: + write_unlock_bh(&sock->sk_callback_lock); + if (!update) + smap_release_sock(sock); + return err; +} + +static int sock_map_attach_prog(struct bpf_map *map, + struct bpf_prog *parse, + struct bpf_prog *verdict) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *_parse, *_verdict; + + _parse = xchg(&stab->bpf_parse, parse); + _verdict = xchg(&stab->bpf_verdict, verdict); + + if (_parse) + bpf_prog_put(_parse); + if (_verdict) + bpf_prog_put(_verdict); + + return 0; +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int sock_map_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_map_ctx_update_elem(&skops, map, key, + flags, BPF_SOCKMAP_STRPARSER); + fput(socket->file); + return err; +} + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_attach = sock_map_attach_prog, +}; + +BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags); +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 17e29f596de1..d2f2bdf71ffa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1087,7 +1087,50 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 + +static int 
sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +{ + struct bpf_prog *prog1, *prog2; + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!map->ops->map_attach) { + fdput(f); + return -EOPNOTSUPP; + } + + prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog1)) { + fdput(f); + return PTR_ERR(prog1); + } + + prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); + if (IS_ERR(prog2)) { + fdput(f); + bpf_prog_put(prog1); + return PTR_ERR(prog2); + } + + err = map->ops->map_attach(map, prog1, prog2); + if (err) { + fdput(f); + bpf_prog_put(prog1); + bpf_prog_put(prog2); + return PTR_ERR(map); + } + + fdput(f); + return err; +} static int bpf_prog_attach(const union bpf_attr *attr) { @@ -1116,10 +1159,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_SMAP_INGRESS: + ptype = BPF_PROG_TYPE_SK_SKB; + break; default: return -EINVAL; } + if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) + return sockmap_get_from_fd(attr, ptype); + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7dd96d064be1..a71bc0996572 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1522,6 +1522,12 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; + case BPF_MAP_TYPE_SOCKMAP: + if (func_id != BPF_FUNC_sk_redirect_map && + func_id != BPF_FUNC_sock_map_update && + func_id != BPF_FUNC_map_delete_elem) + goto error; + break; default: break; } @@ -1550,6 +1556,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_DEVMAP) goto error; break; + case BPF_FUNC_sk_redirect_map: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + goto error; + break; + case BPF_FUNC_sock_map_update: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 46321033ae0e..8e136578488c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1858,6 +1858,45 @@ static const struct bpf_func_proto bpf_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return SK_ABORTED; + + ri->ifindex = key; + ri->flags = flags; + ri->map = map; + + return SK_REDIRECT; +} + +struct sock *do_sk_redirect_map(void) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct sock *sk = NULL; + + if (ri->map) { + sk = __sock_map_lookup_elem(ri->map, ri->ifindex); + + ri->ifindex = 0; + ri->map = NULL; + /* we do not clear flags for future lookup */ + } + + return sk; +} + +static const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3229,6 +3268,8 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_sock_map_update: + return 
&bpf_sock_map_update_proto; default: return bpf_base_func_proto(func_id); } @@ -3243,6 +3284,8 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_sk_redirect_map: + return &bpf_sk_redirect_map_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 8a31db5615667956c513d205cfb06885c3ec6d0b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:33:09 -0700 Subject: bpf: add access to sock fields and pkt data from sk_skb programs Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 +++ kernel/bpf/verifier.c | 1 + net/core/filter.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f774769e3f5..5ecbe812a2cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -712,6 +712,15 @@ struct __sk_buff { __u32 data; __u32 data_end; __u32 napi_id; + + /* accessed by BPF_PROG_TYPE_sk_skb types */ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; struct bpf_tunnel_key { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a71bc0996572..958ba84a9995 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -886,6 +886,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SK_SKB: if (meta) return meta->pkt_access; diff --git a/net/core/filter.c b/net/core/filter.c index 8e136578488c..e9f8dcef6c57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3278,8 +3278,16 @@ static const struct bpf_func_proto * static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -3343,6 +3351,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): + case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) @@ -3371,6 +3383,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): return 
false; } @@ -3392,6 +3405,7 @@ static bool lwt_is_valid_access(int off, int size, { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -3505,6 +3519,8 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; } return bpf_skb_is_valid_access(off, size, type, info); @@ -3582,11 +3598,63 @@ static bool sock_ops_is_valid_access(int off, int size, return __is_valid_sock_ops_access(off, size); } +static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return SK_DROP; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, SK_DROP); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + break; + default: + return false; + } + } + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3783,6 +3851,106 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; + case offsetof(struct __sk_buff, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_family, + 2, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_daddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_rcv_saddr, + 4, target_size)); + break; + case offsetof(struct 
__sk_buff, remote_ip6[0]) ... + offsetof(struct __sk_buff, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, remote_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, local_ip6[0]) ... + offsetof(struct __sk_buff, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, local_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_dport, + 2, target_size)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct __sk_buff, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_num, 2, target_size)); + break; } return insn - insn_buf; @@ -4071,6 +4239,7 @@ const struct bpf_verifier_ops sk_skb_prog_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_prologue = sk_skb_prologue, }; int sk_detach_filter(struct sock *sk) -- cgit v1.2.3 From 729749bb8da186e68d97d1b0439f0b1e0059c41d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 13 Aug 2017 10:03:59 -0400 Subject: SUNRPC: Don't hold the transport lock across socket copy operations Instead add a mechanism to ensure that the request doesn't disappear from underneath us while copying from the socket. We do this by preventing xprt_release() from freeing the XDR buffers until the flag RPC_TASK_MSG_RECV has been cleared from the request. 
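In rough outline, the converted receive paths all follow the same pattern. This is a simplified composite sketch of the xs_*_data_read_skb() changes in this commit, not verbatim kernel code; the placeholder copy_reply_to_xdr() stands in for csum_partial_copy_to_xdr() or xs_local_copy_to_xdr(), and error handling is elided:

	spin_lock_bh(&xprt->transport_lock);
	req = xprt_lookup_rqst(xprt, xid);
	if (req)
		xprt_pin_rqst(req);		/* sets RPC_TASK_MSG_RECV */
	spin_unlock_bh(&xprt->transport_lock);
	if (!req)
		return;

	/* copy from the socket without holding the transport lock */
	copy_reply_to_xdr(&req->rq_private_buf, skb);

	spin_lock_bh(&xprt->transport_lock);
	xprt_complete_rqst(req->rq_task, copied);
	xprt_unpin_rqst(req);		/* may wake a waiting xprt_release() */
	spin_unlock_bh(&xprt->transport_lock);

While the pin is held, xprt_release() parks in xprt_wait_on_pinned_rqst() rather than freeing the XDR buffers out from under the copy.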
Signed-off-by: Trond Myklebust
Reviewed-by: Chuck Lever
---
 include/linux/sunrpc/sched.h |  2 ++
 include/linux/sunrpc/xprt.h  |  2 ++
 net/sunrpc/xprt.c            | 43 +++++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprtsock.c        | 23 ++++++++++++++++++-----
 4 files changed, 65 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 50a99a117da7..c1768f9d993b 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -139,6 +139,8 @@ struct rpc_task_setup {
 #define RPC_TASK_RUNNING	0
 #define RPC_TASK_QUEUED		1
 #define RPC_TASK_ACTIVE		2
+#define RPC_TASK_MSG_RECV	3
+#define RPC_TASK_MSG_RECV_WAIT	4

 #define RPC_IS_RUNNING(t)	test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
 #define rpc_set_running(t)	set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index eab1c749e192..65b9e0224753 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -372,6 +372,8 @@ void xprt_write_space(struct rpc_xprt *xprt);
 void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result);
 struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid);
 void xprt_complete_rqst(struct rpc_task *task, int copied);
+void xprt_pin_rqst(struct rpc_rqst *req);
+void xprt_unpin_rqst(struct rpc_rqst *req);
 void xprt_release_rqst_cong(struct rpc_task *task);
 void xprt_disconnect_done(struct rpc_xprt *xprt);
 void xprt_force_disconnect(struct rpc_xprt *xprt);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 4654a9934269..3eb9ec16eec4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -844,6 +844,48 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
 }
 EXPORT_SYMBOL_GPL(xprt_lookup_rqst);

+/**
+ * xprt_pin_rqst - Pin a request on the transport receive list
+ * @req: Request to pin
+ *
+ * Caller must ensure this is atomic with the call to xprt_lookup_rqst()
+ * so should be holding the xprt transport lock.
+ */
+void xprt_pin_rqst(struct rpc_rqst *req)
+{
+	set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate);
+}
+
+/**
+ * xprt_unpin_rqst - Unpin a request on the transport receive list
+ * @req: Request to unpin
+ *
+ * Caller should be holding the xprt transport lock.
+ */ +void xprt_unpin_rqst(struct rpc_rqst *req) +{ + struct rpc_task *task = req->rq_task; + + clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); + if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) + wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); +} + +static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) +__must_hold(&req->rq_xprt->transport_lock) +{ + struct rpc_task *task = req->rq_task; + + if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { + spin_unlock_bh(&req->rq_xprt->transport_lock); + set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, + TASK_UNINTERRUPTIBLE); + clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + spin_lock_bh(&req->rq_xprt->transport_lock); + } +} + static void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; @@ -1295,6 +1337,7 @@ void xprt_release(struct rpc_task *task) list_del(&req->rq_list); xprt->last_used = jiffies; xprt_schedule_autodisconnect(xprt); + xprt_wait_on_pinned_rqst(req); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4f154d388748..04dbc7027712 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -973,6 +973,8 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock_bh(&xprt->transport_lock); task = rovr->rq_task; copied = rovr->rq_private_buf.buflen; @@ -981,11 +983,14 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { dprintk("RPC: sk_buff copy failed\n"); - goto out_unlock; + spin_lock_bh(&xprt->transport_lock); + goto out_unpin; } + spin_lock_bh(&xprt->transport_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: spin_unlock_bh(&xprt->transport_lock); } @@ -1054,6 +1059,8 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock_bh(&xprt->transport_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1062,14 +1069,17 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. 
 */
 	if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
 		__UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
-		goto out_unlock;
+		spin_lock_bh(&xprt->transport_lock);
+		goto out_unpin;
 	}

 	__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);

+	spin_lock_bh(&xprt->transport_lock);
 	xprt_adjust_cwnd(xprt, task, copied);
 	xprt_complete_rqst(task, copied);
-
+out_unpin:
+	xprt_unpin_rqst(rovr);
 out_unlock:
 	spin_unlock_bh(&xprt->transport_lock);
 }
@@ -1351,12 +1361,15 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
 		spin_unlock_bh(&xprt->transport_lock);
 		return -1;
 	}
+	xprt_pin_rqst(req);
+	spin_unlock_bh(&xprt->transport_lock);

 	xs_tcp_read_common(xprt, desc, req);

+	spin_lock_bh(&xprt->transport_lock);
 	if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
 		xprt_complete_rqst(req->rq_task, transport->tcp_copied);
-
+	xprt_unpin_rqst(req);
 	spin_unlock_bh(&xprt->transport_lock);
 	return 0;
 }
-- cgit v1.2.3

From e543002f77f463501d47fab43acf7ba881e9dcaf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Tue, 15 Aug 2017 21:11:03 +0200
Subject: qdisc: add tracepoint qdisc:qdisc_dequeue for dequeued SKBs

The main purpose of this tracepoint is to monitor bulk dequeue in the
network qdisc layer, as it cannot be deduced from the existing qdisc
stats. The txq_state can be used for determining the reason for zero
packet dequeues, see enum netdev_queue_state_t.

Note that not all packets necessarily activate this tracepoint, as
qdiscs with the TCQ_F_CAN_BYPASS flag can invoke sch_direct_xmit()
directly when qdisc_qlen is zero.

Remember that perf record supports filters like:

 perf record -e qdisc:qdisc_dequeue \
  --filter 'ifindex == 4 && (packets > 1 || txq_state > 0)'

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
---
 include/trace/events/qdisc.h | 50 ++++++++++++++++++++++++++++++++++++++++
 net/core/net-traces.c        |  1 +
 net/sched/sch_generic.c      |  8 +++++--
 3 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/qdisc.h

(limited to 'include')

diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
new file mode 100644
index 000000000000..60d0d8bd336d
--- /dev/null
+++ b/include/trace/events/qdisc.h
@@ -0,0 +1,50 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM qdisc
+
+#if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_QDISC_H_
+
+#include
+#include
+#include
+#include
+
+TRACE_EVENT(qdisc_dequeue,
+
+	TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq,
+		 int packets, struct sk_buff *skb),
+
+	TP_ARGS(qdisc, txq, packets, skb),
+
+	TP_STRUCT__entry(
+		__field(	struct Qdisc *,		qdisc	)
+		__field(const	struct netdev_queue *,	txq	)
+		__field(	int,			packets	)
+		__field(	void *,			skbaddr	)
+		__field(	int,			ifindex	)
+		__field(	u32,			handle	)
+		__field(	u32,			parent	)
+		__field(	unsigned long,		txq_state)
+	),
+
+	/* skb==NULL indicates packets dequeued was 0, even when packets==1 */
+	TP_fast_assign(
+		__entry->qdisc		= qdisc;
+		__entry->txq		= txq;
+		__entry->packets	= skb ? packets : 0;
+		__entry->skbaddr	= skb;
+		__entry->ifindex	= txq->dev ?
txq->dev->ifindex : 0; + __entry->handle = qdisc->handle; + __entry->parent = qdisc->parent; + __entry->txq_state = txq->state; + ), + + TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", + __entry->ifindex, __entry->handle, __entry->parent, + __entry->txq_state, __entry->packets, __entry->skbaddr ) +); + +#endif /* _TRACE_QDISC_H_ */ + +/* This part must be outside protection */ +#include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 92da5e4ceb4f..4f1468ccd056 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -32,6 +32,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 57ba406f1437..c6b89a34e8d2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -29,6 +29,7 @@ #include #include #include +#include /* Qdisc to use by default */ const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops; @@ -126,7 +127,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, q->q.qlen--; } else skb = NULL; - return skb; + goto trace; } *validate = true; skb = q->skb_bad_txq; @@ -139,7 +140,8 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, q->q.qlen--; goto bulk; } - return NULL; + skb = NULL; + goto trace; } if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) @@ -151,6 +153,8 @@ bulk: else try_bulk_dequeue_skb_slow(q, skb, packets); } +trace: + trace_qdisc_dequeue(q, txq, *packets, skb); return skb; } -- cgit v1.2.3 From 6bdc9c4c31c81688e19cb186d49be01bbb6a1618 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 16 Aug 2017 15:02:32 -0700 Subject: bpf: sock_map fixes for !CONFIG_BPF_SYSCALL and !STREAM_PARSER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolve issues with !CONFIG_BPF_SYSCALL and !STREAM_PARSER net/core/filter.c: In function ‘do_sk_redirect_map’: net/core/filter.c:1881:3: error: implicit declaration of function ‘__sock_map_lookup_elem’ [-Werror=implicit-function-declaration] sk = __sock_map_lookup_elem(ri->map, ri->ifindex); ^ net/core/filter.c:1881:6: warning: assignment makes pointer from integer without a cast [enabled by default] sk = __sock_map_lookup_elem(ri->map, ri->ifindex); Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Eric Dumazet Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 +++++++++- include/linux/bpf_types.h | 2 ++ kernel/bpf/Makefile | 5 ++++- kernel/bpf/core.c | 1 + 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a4145e9c74b5..1cc6c5ff61ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -313,7 +313,6 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); @@ -377,6 +376,15 @@ static inline void __dev_map_flush(struct bpf_map *map) } #endif /* CONFIG_BPF_SYSCALL */ +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +#else +static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + return NULL; +} +#endif + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa805074d168..6f1a567667b8 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -38,5 +38,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +#ifdef CONFIG_STREAM_PARSER BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif +#endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index aa24287db888..897daa005b23 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,10 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) -obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o +obj-$(CONFIG_BPF_SYSCALL) += devmap.o +ifeq ($(CONFIG_STREAM_PARSER),y) +obj-$(CONFIG_BPF_SYSCALL) += sockmap.o +endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c69e7f5bfde7..917cc04a0a94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1438,6 +1438,7 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; +const struct bpf_func_proto bpf_sock_map_update_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { -- cgit v1.2.3 From 9a603b8e1136f2b55f780fefbcbf84d31844ff2b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 16 Aug 2017 08:56:24 -0700 Subject: vmbus: remove unused vmbus_sendpacket_multipagebuffer This function is not used anywhere in current code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller
---
 drivers/hv/channel.c   | 56 --------------------------------------------------
 include/linux/hyperv.h |  6 ------
 2 files changed, 62 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index e57cc40cb768..756a1e841142 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -814,62 +814,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc);

-/*
- * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
- * using a GPADL Direct packet type.
- */
-int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
-				     struct hv_multipage_buffer *multi_pagebuffer,
-				     void *buffer, u32 bufferlen, u64 requestid)
-{
-	struct vmbus_channel_packet_multipage_buffer desc;
-	u32 descsize;
-	u32 packetlen;
-	u32 packetlen_aligned;
-	struct kvec bufferlist[3];
-	u64 aligned_data = 0;
-	u32 pfncount = NUM_PAGES_SPANNED(multi_pagebuffer->offset,
-					 multi_pagebuffer->len);
-
-	if (pfncount > MAX_MULTIPAGE_BUFFER_COUNT)
-		return -EINVAL;
-
-	/*
-	 * Adjust the size down since vmbus_channel_packet_multipage_buffer is
-	 * the largest size we support
-	 */
-	descsize = sizeof(struct vmbus_channel_packet_multipage_buffer) -
-		   ((MAX_MULTIPAGE_BUFFER_COUNT - pfncount) *
-		    sizeof(u64));
-	packetlen = descsize + bufferlen;
-	packetlen_aligned = ALIGN(packetlen, sizeof(u64));
-
-
-	/* Setup the descriptor */
-	desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
-	desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
-	desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */
-	desc.length8 = (u16)(packetlen_aligned >> 3);
-	desc.transactionid = requestid;
-	desc.rangecount = 1;
-
-	desc.range.len = multi_pagebuffer->len;
-	desc.range.offset = multi_pagebuffer->offset;
-
-	memcpy(desc.range.pfn_array, multi_pagebuffer->pfn_array,
-	       pfncount * sizeof(u64));
-
-	bufferlist[0].iov_base = &desc;
-	bufferlist[0].iov_len = descsize;
-	bufferlist[1].iov_base = buffer;
-	bufferlist[1].iov_len = bufferlen;
-	bufferlist[2].iov_base = &aligned_data;
-	bufferlist[2].iov_len = (packetlen_aligned - packetlen);
-
-	return hv_ringbuffer_write(channel, bufferlist, 3);
-}
-EXPORT_SYMBOL_GPL(vmbus_sendpacket_multipagebuffer);
-
 /**
  * vmbus_recvpacket() - Retrieve the user packet on the specified channel
  * @channel: Pointer to vmbus_channel structure.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index b7d7bbec74e0..39a080ce17da 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1052,12 +1052,6 @@ extern int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
 					   u64 requestid,
 					   u32 flags);

-extern int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
-					    struct hv_multipage_buffer *mpb,
-					    void *buffer,
-					    u32 bufferlen,
-					    u64 requestid);
-
 extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
 				     struct vmbus_packet_mpb_array *mpb,
 				     u32 desc_size,
-- cgit v1.2.3

From 5a668d8cddbe8bf14379ce110c49ca088a1e9fae Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Wed, 16 Aug 2017 08:56:25 -0700
Subject: vmbus: remove unused vmbus_sendpacket_pagebuffer_ctl

The function vmbus_sendpacket_pagebuffer_ctl was never used directly;
its only caller was vmbus_sendpacket_pagebuffer, so fold it into that
function.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. 
Miller --- drivers/hv/channel.c | 30 ++++++------------------------ drivers/net/hyperv/netvsc.c | 10 ++++------ include/linux/hyperv.h | 8 -------- 3 files changed, 10 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 756a1e841142..9223fe8823e0 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -702,16 +702,16 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, EXPORT_SYMBOL(vmbus_sendpacket); /* - * vmbus_sendpacket_pagebuffer_ctl - Send a range of single-page buffer + * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer * packets using a GPADL Direct packet type. This interface allows you * to control notifying the host. This will be useful for sending * batched data. Also the sender can control the send flags * explicitly. */ -int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid, u32 flags) +int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, + struct hv_page_buffer pagebuffers[], + u32 pagecount, void *buffer, u32 bufferlen, + u64 requestid) { int i; struct vmbus_channel_packet_page_buffer desc; @@ -736,7 +736,7 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, /* Setup the descriptor */ desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = flags; + desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ desc.length8 = (u16)(packetlen_aligned >> 3); desc.transactionid = requestid; @@ -757,24 +757,6 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, return hv_ringbuffer_write(channel, bufferlist, 3); } -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer_ctl); - -/* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. 
- */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - u32 flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - - return vmbus_sendpacket_pagebuffer_ctl(channel, pagebuffers, pagecount, - buffer, bufferlen, - requestid, flags); - -} EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); /* diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0530e7d729e1..6031102cbba3 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -775,12 +775,10 @@ static inline int netvsc_send_pkt( if (packet->cp_partial) pb += packet->rmsg_pgcnt; - ret = vmbus_sendpacket_pagebuffer_ctl(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, - sizeof(struct nvsp_message), - req_id, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket_pagebuffer(out_channel, + pb, packet->page_buf_cnt, + &nvmsg, sizeof(nvmsg), + req_id); } else { ret = vmbus_sendpacket_ctl(out_channel, &nvmsg, sizeof(struct nvsp_message), diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 39a080ce17da..9692592d43a3 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1044,14 +1044,6 @@ extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, u32 bufferlen, u64 requestid); -extern int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid, - u32 flags); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, -- cgit v1.2.3 From 5dd0fb9b9ffc0ef9b312d05604f4ad0fffc50505 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 16 Aug 2017 08:56:26 -0700 Subject: vmbus: remove unused vmbus_sendpacket_ctl The only usage of vmbus_sendpacket_ctl was by vmbus_sendpacket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/hv/channel.c | 43 +++++++++++++++++-------------------------- drivers/net/hyperv/netvsc.c | 9 ++++----- include/linux/hyperv.h | 7 ------- 3 files changed, 21 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 9223fe8823e0..d9e9676e2b40 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -647,9 +647,23 @@ void vmbus_close(struct vmbus_channel *channel) } EXPORT_SYMBOL_GPL(vmbus_close); -int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, - u32 bufferlen, u64 requestid, - enum vmbus_packet_type type, u32 flags) +/** + * vmbus_sendpacket() - Send the specified buffer on the given channel + * @channel: Pointer to vmbus_channel structure. + * @buffer: Pointer to the buffer you want to receive the data into. + * @bufferlen: Maximum size of what the the buffer will hold + * @requestid: Identifier of the request + * @type: Type of packet that is being send e.g. negotiate, time + * packet etc. + * + * Sends data in @buffer directly to hyper-v via the vmbus + * This will send the data unparsed to hyper-v. + * + * Mainly used by Hyper-V drivers. 
+ */ +int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, + u32 bufferlen, u64 requestid, + enum vmbus_packet_type type, u32 flags) { struct vmpacket_descriptor desc; u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen; @@ -676,29 +690,6 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer, return hv_ringbuffer_write(channel, bufferlist, num_vecs); } -EXPORT_SYMBOL(vmbus_sendpacket_ctl); - -/** - * vmbus_sendpacket() - Send the specified buffer on the given channel - * @channel: Pointer to vmbus_channel structure. - * @buffer: Pointer to the buffer you want to receive the data into. - * @bufferlen: Maximum size of what the the buffer will hold - * @requestid: Identifier of the request - * @type: Type of packet that is being send e.g. negotiate, time - * packet etc. - * - * Sends data in @buffer directly to hyper-v via the vmbus - * This will send the data unparsed to hyper-v. - * - * Mainly used by Hyper-V drivers. - */ -int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, - u32 bufferlen, u64 requestid, - enum vmbus_packet_type type, u32 flags) -{ - return vmbus_sendpacket_ctl(channel, buffer, bufferlen, requestid, - type, flags); -} EXPORT_SYMBOL(vmbus_sendpacket); /* diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 6031102cbba3..0062b802676f 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -780,11 +780,10 @@ static inline int netvsc_send_pkt( &nvmsg, sizeof(nvmsg), req_id); } else { - ret = vmbus_sendpacket_ctl(out_channel, &nvmsg, - sizeof(struct nvsp_message), - req_id, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket(out_channel, + &nvmsg, sizeof(nvmsg), + req_id, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); } if (ret == 0) { diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 9692592d43a3..a5f961c4149e 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1030,13 +1030,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_ctl(struct vmbus_channel *channel, - void *buffer, - u32 bufferLen, - u64 requestid, - enum vmbus_packet_type type, - u32 flags); - extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, struct hv_page_buffer pagebuffers[], u32 pagecount, -- cgit v1.2.3 From 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 10:36:47 -0700 Subject: ptr_ring: use kmalloc_array() As found by syzkaller, malicious users can set whatever tx_queue_len on a tun device and eventually crash the kernel. Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small ring buffer is not fast anyway. Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Michael S. Tsirkin Cc: Jason Wang Signed-off-by: David S. 
Miller --- include/linux/ptr_ring.h | 9 +++++---- include/linux/skb_array.h | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index d8c97ec8a8e6..37b4bb2545b3 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -436,9 +436,9 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, __PTR_RING_PEEK_CALL_v; \ }) -static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) +static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) { - return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); + return kcalloc(size, sizeof(void *), gfp); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) @@ -582,7 +582,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ -static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings, +static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, + unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { @@ -590,7 +591,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings, void ***queues; int i; - queues = kmalloc(nrings * sizeof *queues, gfp); + queues = kmalloc_array(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 35226cd4efb0..8621ffdeecbf 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -193,7 +193,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) } static inline int skb_array_resize_multiple(struct skb_array **rings, - int nrings, int size, gfp_t gfp) + int nrings, unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple((struct ptr_ring **)rings, -- cgit v1.2.3 From c780a049f9bf442314335372c9abc4548bfe3e44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 11:09:12 -0700 Subject: ipv4: better IP_MAX_MTU enforcement While working on yet another syzkaller report, I found that our IP_MAX_MTU enforcements were not properly done. gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and final result can be bigger than IP_MAX_MTU :/ This is a problem because device mtu can be changed on other cpus or threads. While this patch does not fix the issue I am working on, it is probably worth addressing it. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller
---
 include/net/ip.h | 4 ++--
 net/ipv4/route.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 821cedcc8e73..0cf7f5a65fe6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -352,7 +352,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	    !forwarding)
 		return dst_mtu(dst);

-	return min(dst->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
 }

 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
@@ -364,7 +364,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
 	}

-	return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }

 u32 ip_idents_reserve(u32 hash, int segs);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7effa62beed3..fe877a4a72b1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	if (mtu)
 		return mtu;

-	mtu = dst->dev->mtu;
+	mtu = READ_ONCE(dst->dev->mtu);

 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
 		if (rt->rt_uses_gateway && mtu > 576)
-- cgit v1.2.3

From 70e42fd02c46e2aa9ab07b766d418637e3a51de7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 9 Aug 2017 12:00:22 +0900
Subject: scsi: sd_zbc: Write unlock zone from sd_uninit_cmnd()

Releasing a zone write lock only when the write command that acquired
the lock completes can cause deadlocks due to potential command
reordering if the lock-owning request is requeued and not executed.
This problem exists only with the scsi-mq path as, unlike the legacy
path, requests are moved out of the dispatch queue before being
prepared and so before locking a zone for a write command.

Since sd_uninit_cmnd() is now always called when a request is requeued,
call sd_zbc_write_unlock_zone() from that function for write requests
that acquired a zone lock instead of from sd_done(). Acquisition of a
zone lock by a write command is indicated using the new command flag
SCMD_ZONE_WRITE_LOCK.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 3 +++ drivers/scsi/sd_zbc.c | 9 +++++---- include/scsi/scsi_cmnd.h | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index bea36adeee17..e2647f2d4430 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1277,6 +1277,9 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; + if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK) + sd_zbc_write_unlock_zone(SCpnt); + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) __free_page(rq->special_vec.bv_page); diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 96855df9f49d..8aa54779aac1 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -294,6 +294,9 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd) test_and_set_bit(zno, sdkp->zones_wlock)) return BLKPREP_DEFER; + WARN_ON_ONCE(cmd->flags & SCMD_ZONE_WRITE_LOCK); + cmd->flags |= SCMD_ZONE_WRITE_LOCK; + return BLKPREP_OK; } @@ -302,9 +305,10 @@ void sd_zbc_write_unlock_zone(struct scsi_cmnd *cmd) struct request *rq = cmd->request; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); - if (sdkp->zones_wlock) { + if (sdkp->zones_wlock && cmd->flags & SCMD_ZONE_WRITE_LOCK) { unsigned int zno = sd_zbc_zone_no(sdkp, blk_rq_pos(rq)); WARN_ON_ONCE(!test_bit(zno, sdkp->zones_wlock)); + cmd->flags &= ~SCMD_ZONE_WRITE_LOCK; clear_bit_unlock(zno, sdkp->zones_wlock); smp_mb__after_atomic(); } @@ -335,9 +339,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, case REQ_OP_WRITE_ZEROES: case REQ_OP_WRITE_SAME: - /* Unlock the zone */ - sd_zbc_write_unlock_zone(cmd); - if (result && sshdr->sense_key == ILLEGAL_REQUEST && sshdr->asc == 0x21) diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index a1266d318c85..6af198d8120b 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -57,6 +57,7 @@ struct scsi_pointer { /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_UNCHECKED_ISA_DMA (1 << 1) +#define SCMD_ZONE_WRITE_LOCK (1 << 2) struct scsi_cmnd { struct scsi_request req; -- cgit v1.2.3 From ce0fa3e56ad20f04d8252353dcd24e924abdafca Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 16 Aug 2017 10:18:03 -0700 Subject: x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages Speculative processor accesses may reference any memory that has a valid page table entry. While a speculative access won't generate a machine check, it will log the error in a machine check bank. That could cause escalation of a subsequent error since the overflow bit will be then set in the machine check bank status register. Code has to be double-plus-tricky to avoid mentioning the 1:1 virtual address of the page we want to map out otherwise we may trigger the very problem we are trying to avoid. We use a non-canonical address that passes through the usual Linux table walking code to get to the same "pte". Thanks to Dave Hansen for reviewing several iterations of this. Also see: http://marc.info/?l=linux-mm&m=149860136413338&w=2 Signed-off-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Elliott, Robert (Persistent Memory) Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170816171803.28342-1-tony.luck@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_64.h | 4 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 43 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm_inline.h | 6 ++++++ mm/memory-failure.c | 2 ++ 4 files changed, 55 insertions(+) (limited to 'include') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf..b50df06ad251 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6dde0497efc7..3b413065c613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m) return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the posion page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ + +/* + * Build time check to see if we have a spare virtual bit. Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e030a68ead7e..25438b2b6f22 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) +#ifdef arch_unmap_kpfn +extern void arch_unmap_kpfn(unsigned long pfn); +#else +static __always_inline void arch_unmap_kpfn(unsigned long pfn) { } +#endif + #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1cd3b3569af8..88366626c0b7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + arch_unmap_kpfn(pfn); + orig_head = hpage = compound_head(p); num_poisoned_pages_inc(); -- cgit v1.2.3 From 7a46ec0e2f4850407de5e1d19a44edee6efa58ec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Aug 2017 09:19:24 -0700 Subject: locking/refcounts, x86/asm: Implement fast refcount overflow protection This implements refcount_t overflow protection on x86 without a noticeable performance impact, though without the fuller checking of REFCOUNT_FULL. This is done by duplicating the existing atomic_t refcount implementation but with normally a single instruction added to detect if the refcount has gone negative (e.g. wrapped past INT_MAX or below zero). When detected, the handler saturates the refcount_t to INT_MIN / 2. With this overflow protection, the erroneous reference release that would follow a wrap back to zero is blocked from happening, avoiding the class of refcount-overflow use-after-free vulnerabilities entirely. Only the overflow case of refcounting can be perfectly protected, since it can be detected and stopped before the reference is freed and left to be abused by an attacker. There isn't a way to block early decrements, and while REFCOUNT_FULL stops increment-from-zero cases (which would be the state _after_ an early decrement and stops potential double-free conditions), this fast implementation does not, since it would require the more expensive cmpxchg loops. Since the overflow case is much more common (e.g. missing a "put" during an error path), this protection provides real-world protection. For example, the two public refcount overflow use-after-free exploits published in 2016 would have been rendered unexploitable: http://perception-point.io/2016/01/14/analysis-and-exploitation-of-a-linux-kernel-vulnerability-cve-2016-0728/ http://cyseclabs.com/page?n=02012016 This implementation does, however, notice an unchecked decrement to zero (i.e. caller used refcount_dec() instead of refcount_dec_and_test() and it resulted in a zero). Decrements under zero are noticed (since they will have resulted in a negative value), though this only indicates that a use-after-free may have already happened. Such notifications are likely avoidable by an attacker that has already exploited a use-after-free vulnerability, but it's better to have them reported than allow such conditions to remain universally silent. On first overflow detection, the refcount value is reset to INT_MIN / 2 (which serves as a saturation value) and a report and stack trace are produced. When operations detect only negative value results (such as changing an already saturated value), saturation still happens but no notification is performed (since the value was already saturated). 
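To make the saturation semantics concrete, here is a minimal, portable C model of the checked increment. This is only an illustration of the behaviour described above, not the x86 assembly this patch adds; the wrap is computed in unsigned arithmetic so the model itself stays well-defined:

	#include <limits.h>

	static int model_refcount_inc(int refs)
	{
		/* the "lock incl" step, with two's-complement wrap */
		refs = (int)((unsigned int)refs + 1u);

		/* the "js" check: a set sign bit means we wrapped past
		 * INT_MAX, went below zero, or started from a value that
		 * was already saturated */
		if (refs < 0)
			refs = INT_MIN / 2; /* saturate; the real handler also reports */

		return refs;
	}

Because INT_MIN / 2 is itself negative, every later operation on a saturated counter trips the same check, which is what closes the overflow-only race window discussed next.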
On the matter of races, since the entire range beyond INT_MAX but before 0 is negative, every operation at INT_MIN / 2 will trap, leaving no overflow-only race condition.

As for performance, this implementation adds a single "js" instruction to the regular execution flow of a copy of the standard atomic_t refcount operations. (The non-"and_test" refcount_dec() function, which is uncommon in regular refcount design patterns, has an additional "jz" instruction to detect reaching exactly zero.) Since this is a forward jump, it is by default the non-predicted path, which will be reinforced by dynamic branch prediction. The result is this protection having virtually no measurable change in performance over standard atomic_t operations. The error path, located in .text.unlikely, saves the refcount location and then uses UD0 to fire a refcount exception handler, which resets the refcount, handles reporting, and returns to regular execution. This keeps the changes to .text size minimal, avoiding return jumps and open-coded calls to the error reporting routine.

Example assembly comparison:

 refcount_inc() before:

  .text:
  ffffffff81546149: f0 ff 45 f4            lock incl -0xc(%rbp)

 refcount_inc() after:

  .text:
  ffffffff81546149: f0 ff 45 f4            lock incl -0xc(%rbp)
  ffffffff8154614d: 0f 88 80 d5 17 00      js     ffffffff816c36d3
  ...
  .text.unlikely:
  ffffffff816c36d3: 48 8d 4d f4            lea    -0xc(%rbp),%rcx
  ffffffff816c36d7: 0f ff                  (bad)

These are the cycle counts comparing a loop of refcount_inc() from 1 to INT_MAX and back down to 0 (via refcount_dec_and_test()), between unprotected refcount_t (atomic_t), fully protected REFCOUNT_FULL (refcount_t-full), and this overflow-protected refcount (refcount_t-fast):

 2147483646 refcount_inc()s and 2147483647 refcount_dec_and_test()s:

                  cycles          protections
 atomic_t         82249267387     none
 refcount_t-fast  82211446892     overflow, untested dec-to-zero
 refcount_t-full  144814735193    overflow, untested dec-to-zero, inc-from-zero

This code is a modified version of the x86 PAX_REFCOUNT atomic_t overflow defense from the last public patch of PaX/grsecurity, based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Thanks to PaX Team for various suggestions for improvement for repurposing this code to be a refcount-only protection.

Signed-off-by: Kees Cook
Reviewed-by: Josh Poimboeuf
Cc: Alexey Dobriyan
Cc: Andrew Morton
Cc: Arnd Bergmann
Cc: Christoph Hellwig
Cc: David S. Miller
Cc: Davidlohr Bueso
Cc: Elena Reshetova
Cc: Eric Biggers
Cc: Eric W. Biederman
Cc: Greg KH
Cc: Hans Liljestrand
Cc: James Bottomley
Cc: Jann Horn
Cc: Linus Torvalds
Cc: Manfred Spraul
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Serge E. 
Hallyn Cc: Thomas Gleixner Cc: arozansk@redhat.com Cc: axboe@kernel.dk Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/20170815161924.GA133115@beast Signed-off-by: Ingo Molnar --- arch/Kconfig | 12 +++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/asm.h | 6 +++ arch/x86/include/asm/refcount.h | 109 ++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/extable.c | 42 ++++++++++++++++ include/linux/kernel.h | 7 +++ include/linux/refcount.h | 4 ++ kernel/panic.c | 12 +++++ 8 files changed, 193 insertions(+) create mode 100644 arch/x86/include/asm/refcount.h (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index 21d0089117fe..2520ca5b42eb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -931,6 +931,18 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_REFCOUNT + bool + help + An architecture selects this when it has implemented refcount_t + using open coded assembly primitives that provide an optimized + refcount_t implementation, possibly at the expense of some full + refcount state checks of CONFIG_REFCOUNT_FULL=y. + + The refcount overflow check behavior, however, must be retained. + Catching overflows is the primary security concern for protecting + against bugs in reference counts. + config REFCOUNT_FULL bool "Perform full reference count validation at the expense of speed" help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..73574c91e857 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,6 +55,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. 
+ */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. */ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..761fc88cd820 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). 
+ * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bd6d96cf80b1..6607225d0ea4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -277,6 +277,13 @@ extern int oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err); +#else +static inline void refcount_error_report(struct pt_regs *regs, const char *err) +{ } +#endif + /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 591792c8e5b0..48b7c9c68c4d 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -53,6 +53,9 @@ extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r); extern __must_check bool refcount_dec_and_test(refcount_t *r); extern void refcount_dec(refcount_t *r); #else +# ifdef CONFIG_ARCH_HAS_REFCOUNT +# include +# else static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) { return atomic_add_unless(&r->refs, i, 0); @@ -87,6 +90,7 @@ static inline void refcount_dec(refcount_t *r) { atomic_dec(&r->refs); } +# endif /* !CONFIG_ARCH_HAS_REFCOUNT */ #endif /* CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); diff --git a/kernel/panic.c b/kernel/panic.c index a58932b41700..bdd18afa19a4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -26,6 +26,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err) +{ + WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", + err, (void *)instruction_pointer(regs), + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid()), + from_kuid_munged(&init_user_ns, current_euid())); +} +#endif + core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); -- cgit v1.2.3 From 02e43c2dcd3b3cf7244f6dda65a07e8dacadaf8d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 16 Aug 2017 21:46:51 +0800 Subject: efi: Introduce efi_early_memdesc_ptr to get pointer to memmap descriptor The existing map iteration helper for_each_efi_memory_desc_in_map can only be used after the kernel initializes the EFI subsystem to set up struct efi_memory_map. Before that we also need iterate map descriptors which are stored in several intermediate structures, like struct efi_boot_memmap for arch independent usage and struct efi_info for x86 arch only. 
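The helper introduced below replaces open-coded pointer arithmetic like the following sketch (compare setup_e820() in the diff that follows; m is the memmap base address and i the descriptor index):

	/* before: each call site open-codes the descriptor stride */
	d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));

	/* after: the stride arithmetic is hidden behind the helper */
	d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);

The stride must be the firmware-reported descriptor size rather than sizeof(efi_memory_desc_t), since firmware may use an extended descriptor layout.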
Introduce efi_early_memdesc_ptr() to get a pointer to a map descriptor, and replace several places where that primitive is open-coded. Signed-off-by: Baoquan He [ Various improvements to the text. ] Acked-by: Matt Fleming Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: ard.biesheuvel@linaro.org Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: keescook@chromium.org Cc: linux-efi@vger.kernel.org Cc: n-horiguchi@ah.jp.nec.com Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/20170816134651.GF21273@x1 Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 2 +- drivers/firmware/efi/libstub/efi-stub-helper.c | 4 ++-- include/linux/efi.h | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include')
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c3e869eaef0c..e007887a33b0 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params, m |= (u64)efi->efi_memmap_hi << 32; #endif - d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); + d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE:
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index b0184360efc6..50a9cab5a834 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -205,7 +205,7 @@ again: unsigned long m = (unsigned long)map; u64 start, end; - desc = (efi_memory_desc_t *)(m + (i * desc_size)); + desc = efi_early_memdesc_ptr(m, desc_size, i); if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; @@ -298,7 +298,7 @@ efi_status_t efi_low_alloc(efi_system_table_t *sys_table_arg, unsigned long m = (unsigned long)map; u64 start, end; - desc = (efi_memory_desc_t *)(m + (i * desc_size)); + desc = efi_early_memdesc_ptr(m, desc_size, i); if (desc->type != EFI_CONVENTIONAL_MEMORY) continue;
diff --git a/include/linux/efi.h b/include/linux/efi.h index 8269bcb8ccf7..a686ca9a7e5c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1020,6 +1020,28 @@ extern int efi_memattr_init(void); extern int efi_memattr_apply_permissions(struct mm_struct *mm, efi_memattr_perm_setter fn); +/* + * efi_early_memdesc_ptr - get the n-th EFI memmap descriptor + * @map: the start of efi memmap + * @desc_size: the size of space for each EFI memmap descriptor + * @n: the index of efi memmap descriptor + * + * EFI boot service provides the GetMemoryMap() function to get a copy of the + * current memory map which is an array of memory descriptors, each of + * which describes a contiguous block of memory. It also gets the size of the + * map, and the size of each descriptor, etc. + * + * Note that per section 6.2 of UEFI Spec 2.6 Errata A, the returned size of + * each descriptor might not be equal to sizeof(efi_memory_desc_t), + * since efi_memory_desc_t may be extended in the future. Thus the OS + * MUST use the returned size of the descriptor to find the start of each + * efi_memory_desc_t in the memory map array. This should only be used + * during bootup since for_each_efi_memory_desc_xxx() is available after the + * kernel initializes the EFI subsystem to set up struct efi_memory_map.
+ */ +#define efi_early_memdesc_ptr(map, desc_size, n) \ + (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))) + /* Iterate through an efi_memory_map */ #define for_each_efi_memory_desc_in_map(m, md) \ for ((md) = (m)->map; \ -- cgit v1.2.3
From ea3f2c0fdfbb180f142cdbc0d1f055c7b9a5e63e Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 17 Aug 2017 17:57:41 +0900 Subject: locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS 'complete' is an adjective and LOCKDEP_COMPLETE sounds like 'lockdep is complete', so pick a better name that uses a noun. Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@lge.com Link: http://lkml.kernel.org/r/1502960261-16206-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 8 ++++---- lib/Kconfig.debug | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include')
diff --git a/include/linux/completion.h b/include/linux/completion.h index 9bcebf509b4d..791f053f28b7 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -9,7 +9,7 @@ */ #include -#ifdef CONFIG_LOCKDEP_COMPLETE +#ifdef CONFIG_LOCKDEP_COMPLETIONS #include #endif @@ -28,12 +28,12 @@ struct completion { unsigned int done; wait_queue_head_t wait; -#ifdef CONFIG_LOCKDEP_COMPLETE +#ifdef CONFIG_LOCKDEP_COMPLETIONS struct lockdep_map_cross map; #endif }; -#ifdef CONFIG_LOCKDEP_COMPLETE +#ifdef CONFIG_LOCKDEP_COMPLETIONS static inline void complete_acquire(struct completion *x) { lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); @@ -64,7 +64,7 @@ static inline void complete_release(struct completion *x) {} static inline void complete_release_commit(struct completion *x) {} #endif -#ifdef CONFIG_LOCKDEP_COMPLETE +#ifdef CONFIG_LOCKDEP_COMPLETIONS #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8fb8a206db12..a0e60d56303e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1082,7 +1082,7 @@ config PROVE_LOCKING select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC select LOCKDEP_CROSSRELEASE - select LOCKDEP_COMPLETE + select LOCKDEP_COMPLETIONS select TRACE_IRQFLAGS default n help @@ -1162,7 +1162,7 @@ config LOCKDEP_CROSSRELEASE such as page locks or completions can use the lock correctness detector, lockdep. -config LOCKDEP_COMPLETE +config LOCKDEP_COMPLETIONS bool "Lock debugging: allow completions to use deadlock detector" help A deadlock caused by wait_for_completion() and complete() can be -- cgit v1.2.3
From 52fa5bc5cbba089f09bc2c372e3432f3f3e48051 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 17 Aug 2017 17:46:12 +0800 Subject: locking/lockdep: Explicitly initialize wq_barrier::done::map With the new lockdep crossrelease feature, which checks completion usage, a false positive is reported in the workqueue code: > Worker A : acquired of wfc.work -> wait for cpu_hotplug_lock to be released > Task B : acquired of cpu_hotplug_lock -> wait for lock#3 to be released > Task C : acquired of lock#3 -> wait for completion of barr->done > (Task C is in lru_add_drain_all_cpuslocked()) > Worker D : wait for wfc.work to be released -> will complete barr->done Such a deadlock cannot happen because Task C's barr->done and Worker D's barr->done cannot be the same instance.
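The conflation has a simple mechanical cause, detailed in the next paragraph: init_completion() hides one static lock class key per call site. Roughly (a sketch of the pattern, not the exact kernel text):

	/*
	 * Sketch of init_completion() under crossrelease: the static key
	 * below is created once per call site, so every wq_barrier::done
	 * initialized from insert_wq_barrier() shares one lockdep class.
	 */
	#define init_completion(x)						\
	do {									\
		static struct lock_class_key __key;				\
		lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map,	\
					   "(complete)" #x, &__key, 0);		\
		__init_completion(x);						\
	} while (0)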
The reason for this false positive is that we initialize every wq_barrier::done at insert_wq_barrier() via init_completion(), which makes them all belong to the same lock class, and therefore impossible circles are reported. To fix this, explicitly initialize the lockdep map for wq_barrier::done in insert_wq_barrier(), so that the lock class key of wq_barrier::done is a subkey of the corresponding work_struct. As a result we won't build a dependency between a wq_barrier and an unrelated work item, and we can distinguish wq barriers based on their associated work items, so the false positive above is avoided. Also define an empty lockdep_init_map_crosslock() for !CROSSRELEASE to keep the code simple and free of unnecessary #ifdefs. Reported-by: Ingo Molnar Signed-off-by: Boqun Feng Cc: Byungchul Park Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170817094622.12915-1-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 1 + kernel/workqueue.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include')
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 651cc61af041..fc827cab6d6e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -583,6 +583,7 @@ extern void crossrelease_hist_end(enum xhlock_context_t c); extern void lockdep_init_task(struct task_struct *task); extern void lockdep_free_task(struct task_struct *task); #else +#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e86733a8b344..f128b3becfe1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2476,7 +2476,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, */ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - init_completion(&barr->done); + + /* + * Explicitly init the crosslock for wq_barrier::done, make its lock + * key a subkey of the corresponding work. As a result we won't + * build a dependency between wq_barrier::done and unrelated work. + */ + lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, + "(complete)wq_barr::done", + target->lockdep_map.key, 1); + __init_completion(&barr->done); barr->task = current; /* -- cgit v1.2.3
From 7e42776d5ed1fe9a941ed8876c5d15cd7cf5d89f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:05:00 -0700 Subject: rcu: Drive TASKS_RCU directly off of PREEMPT TASKS_RCU is actually used only when PREEMPT; otherwise RCU-sched is used instead. This commit therefore makes synchronize_rcu_tasks() and call_rcu_tasks() available always, but mapped to synchronize_sched() and call_rcu_sched(), respectively, when !PREEMPT. This approach also allows some #ifdefs to be removed from rcutorture. Reported-by: Ingo Molnar Signed-off-by: Paul E.
McKenney Reviewed-by: Masami Hiramatsu Acked-by: Ingo Molnar --- include/linux/rcupdate.h | 6 ++++-- kernel/rcu/Kconfig | 3 +-- kernel/rcu/rcutorture.c | 17 +---------------- .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 2 +- 4 files changed, 7 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..c3f380befdd7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -58,8 +58,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void call_rcu_bh(struct rcu_head *head, rcu_callback_t func); void call_rcu_sched(struct rcu_head *head, rcu_callback_t func); void synchronize_sched(void); -void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); -void synchronize_rcu_tasks(void); void rcu_barrier_tasks(void); #ifdef CONFIG_PREEMPT_RCU @@ -176,10 +174,14 @@ extern struct srcu_struct tasks_rcu_exit_srcu; rcu_all_qs(); \ rcu_note_voluntary_context_switch_lite(t); \ } while (0) +void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); +void synchronize_rcu_tasks(void); #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define call_rcu_tasks call_rcu_sched +#define synchronize_rcu_tasks synchronize_sched #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - bool - default n + def_bool PREEMPT select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..b284c861a511 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -696,8 +696,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks torture testing. */ @@ -735,24 +733,11 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1749,7 +1734,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, RCUTORTURE_TASKS_OPS + &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 9ad3f89c8dc7..af6fca03602f 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -69,11 +69,11 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE CONFIG_PREEMPT_RCU CONFIG_TREE_RCU CONFIG_TINY_RCU +CONFIG_TASKS_RCU These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP. CONFIG_SRCU -CONFIG_TASKS_RCU Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable. 
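One consequence of the new mapping, sketched below with a hypothetical object and callback (not part of the patch): callers may use the tasks-RCU API unconditionally, and on !PREEMPT kernels it simply becomes the sched variant.

	/* Hypothetical caller: builds with or without CONFIG_PREEMPT. */
	struct my_obj {
		struct rcu_head rh;
		int payload;
	};

	static void my_obj_free_cb(struct rcu_head *head)
	{
		kfree(container_of(head, struct my_obj, rh));
	}

	static void my_obj_release(struct my_obj *obj)
	{
		/* Maps to call_rcu_sched() when !PREEMPT. */
		call_rcu_tasks(&obj->rh, my_obj_free_cb);
	}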
-- cgit v1.2.3 From ccdd29ffffa7246cb359b9408772858a15fc4ea5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 May 2017 08:51:48 -0700 Subject: rcu: Create reasonable API for do_exit() TASKS_RCU processing Currently, the exit-time support for TASKS_RCU is open-coded in do_exit(). This commit creates exit_tasks_rcu_start() and exit_tasks_rcu_finish() APIs for do_exit() use. This has the benefit of confining the use of the tasks_rcu_exit_srcu variable to one file, allowing it to become static. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 7 ++++--- include/linux/sched.h | 5 +++-- kernel/exit.c | 7 ++----- kernel/rcu/update.c | 18 +++++++++++++++++- 4 files changed, 26 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c3f380befdd7..ce9d21923d75 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -162,8 +162,6 @@ static inline void rcu_init_nohz(void) { } * macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU -#define TASKS_RCU(x) x -extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch_lite(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ @@ -176,12 +174,15 @@ extern struct srcu_struct tasks_rcu_exit_srcu; } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void exit_tasks_rcu_start(void); +void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU */ -#define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() #define call_rcu_tasks call_rcu_sched #define synchronize_rcu_tasks synchronize_sched +static inline void exit_tasks_rcu_start(void) { } +static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..e4c38809a09e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -589,9 +589,10 @@ struct task_struct { #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; - bool rcu_tasks_holdout; - struct list_head rcu_tasks_holdout_list; + u8 rcu_tasks_holdout; + u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; + struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ struct sched_info sched_info; diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..d297c525f188 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); kcov_task_exit(tsk); @@ -881,9 +880,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); - TASKS_RCU(preempt_disable()); - TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); - TASKS_RCU(preempt_enable()); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); @@ -918,7 +915,7 @@ void __noreturn do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); - TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + exit_tasks_rcu_finish(); do_task_dead(); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static 
DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void) mutex_unlock(&rcu_tasks_kthread_mutex); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_finish(void) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + #endif /* #ifdef CONFIG_TASKS_RCU */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3
From 352eee1242ef62e18a475ef9278697dbd865969b Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Tue, 20 Jun 2017 14:45:46 -0700 Subject: swait: Add idle variants which don't contribute to load average There are cases where folks use an interruptible swait with kthreads. This is rather confusing, given you'd expect interruptible waits to be -- interruptible, but kthreads are not interruptible! The reason for this practice, though, is to avoid having these kthreads contribute to the system load average. When systems are idle some kthreads may spend a lot of time blocking if using swait_event_timeout(). This would contribute to the system load average. On systems without preemption this would mean the load average of an idle system is bumped to 2 instead of 0. On systems with PREEMPT=y this would mean the load average of an idle system is bumped to 3 instead of 0. This adds a proper API using TASK_IDLE to make such goals explicit and avoid confusion. Suggested-by: "Eric W. Biederman" Acked-by: "Eric W. Biederman" Tested-by: Paul E. McKenney Signed-off-by: Luis R. Rodriguez Signed-off-by: Paul E. McKenney --- include/linux/swait.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include')
diff --git a/include/linux/swait.h b/include/linux/swait.h index c1f9c62a8a50..4a4e180d0a35 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -169,4 +169,59 @@ do { \ __ret; \ }) +#define __swait_event_idle(wq, condition) \ + (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) + +/** + * swait_event_idle - wait without system load contribution + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the @condition evaluates to + * true. The @condition is checked each time the waitqueue @wq is woken up. + * + * This function is mostly used when a kthread or workqueue waits for some + * condition and doesn't want to contribute to system load. Signals are + * ignored.
+ */ +#define swait_event_idle(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event_idle(wq, condition); \ +} while (0) + +#define __swait_event_idle_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_IDLE, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * swait_event_idle_timeout - wait up to timeout without load contribution + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout at which we'll give up in jiffies + * + * The process is put to sleep (TASK_IDLE) until the @condition evaluates to + * true. The @condition is checked each time the waitqueue @wq is woken up. + * + * This function is mostly used when a kthread or workqueue waits for some + * condition and doesn't want to contribute to system load. Signals are + * ignored. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. + */ +#define swait_event_idle_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_idle_timeout(wq, \ + condition, timeout); \ + __ret; \ +}) + #endif /* _LINUX_SWAIT_H */ -- cgit v1.2.3 From 22e4ebb975822833b083533035233d128b30e98f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 28 Jul 2017 16:40:40 -0400 Subject: membarrier: Provide expedited private command Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built from all runqueues for which current thread's mm is the same as the thread calling sys_membarrier. It executes faster than the non-expedited variant (no blocking). It also works on NOHZ_FULL configurations. Scheduler-wise, it requires a memory barrier before and after context switching between processes (which have different mm). The memory barrier before context switch is already present. For the barrier after context switch: * Our TSO archs can do RELEASE without being a full barrier. Look at x86 spin_unlock() being a regular STORE for example. But for those archs, all atomics imply smp_mb and all of them have atomic ops in switch_mm() for mm_cpumask(), and on x86 the CR3 load acts as a full barrier. * From all weakly ordered machines, only ARM64 and PPC can do RELEASE, the rest does indeed do smp_mb(), so there the spin_unlock() is a full barrier and we're good. * ARM64 has a very heavy barrier in switch_to(), which suffices. * PPC just removed its barrier from switch_to(), but appears to be talking about adding something to switch_mm(). So add a smp_mb__after_unlock_lock() for now, until this is settled on the PPC side. Changes since v3: - Properly document the memory barriers provided by each architecture. Changes since v2: - Address comments from Peter Zijlstra, - Add smp_mb__after_unlock_lock() after finish_lock_switch() in finish_task_switch() to add the memory barrier we need after storing to rq->curr. This is much simpler than the previous approach relying on atomic_dec_and_test() in mmdrop(), which actually added a memory barrier in the common case of switching between userspace processes. - Return -EINVAL when MEMBARRIER_CMD_SHARED is used on a nohz_full kernel, rather than having the whole membarrier system call returning -ENOSYS. Indeed, CMD_PRIVATE_EXPEDITED is compatible with nohz_full. Adapt the CMD_QUERY mask accordingly. 
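For reference, a userspace sketch of the intended query-then-use pattern (hypothetical wrapper function; the system call is typically invoked via syscall(2), and error handling is omitted):

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	static void private_expedited_barrier(void)
	{
		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

		/* Only issue the command if this kernel supports it. */
		if (mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED))
			membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
	}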
Changes since v1: - move membarrier code under kernel/sched/ because it uses the scheduler runqueue, - only add the barrier when we switch from a kernel thread. The case where we switch from a user-space thread is already handled by the atomic_dec_and_test() in mmdrop(). - add a comment to mmdrop() documenting the requirement on the implicit memory barrier. CC: Peter Zijlstra CC: Paul E. McKenney CC: Boqun Feng CC: Andrew Hunter CC: Maged Michael CC: gromer@google.com CC: Avi Kivity CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman Signed-off-by: Mathieu Desnoyers Signed-off-by: Paul E. McKenney Tested-by: Dave Watson --- MAINTAINERS | 2 +- arch/arm64/kernel/process.c | 2 + include/uapi/linux/membarrier.h | 23 +++++- kernel/Makefile | 1 - kernel/membarrier.c | 70 ------------------ kernel/sched/Makefile | 1 + kernel/sched/core.c | 25 +++++++ kernel/sched/membarrier.c | 152 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 202 insertions(+), 74 deletions(-) delete mode 100644 kernel/membarrier.c create mode 100644 kernel/sched/membarrier.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..3b035584272f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8621,7 +8621,7 @@ M: Mathieu Desnoyers M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported -F: kernel/membarrier.c +F: kernel/sched/membarrier.c F: include/uapi/linux/membarrier.h MEMORY MANAGEMENT diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 659ae8094ed5..c8f7d98d8cb9 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. + * This full barrier is also required by the membarrier system + * call. */ dsb(ish); diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index e0b108bd2624..6d47b3249d8a 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -40,14 +40,33 @@ * (non-running threads are de facto in such a * state). This covers threads from all processes * running on the system. This command returns 0. + * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: + * Execute a memory barrier on each running + * thread belonging to the same process as the current + * thread. Upon return from system call, the + * caller thread is ensured that all its running + * threads siblings have passed through a state + * where all memory accesses to user-space + * addresses match program order between entry + * to and return from the system call + * (non-running threads are de facto in such a + * state). This only covers threads from the + * same processes as the caller thread. This + * command returns 0. The "expedited" commands + * complete faster than the non-expedited ones, + * they never block, but have the downside of + * causing extra overhead. * * Command to be passed to the membarrier system call. The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to * the value 0. 
*/ enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_SHARED = (1 << 0), + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), + /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ + /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), }; #endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. - */ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. - * - * If this system call is not implemented, -ENOSYS is returned. If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - * barrier() smp_mb() sys_membarrier() - * barrier() X X O - * smp_mb() X O O - * sys_membarrier() O O O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. 
*/ - if (tick_nohz_full_enabled()) - return -ENOSYS; - if (unlikely(flags)) - return -EINVAL; - switch (cmd) { - case MEMBARRIER_CMD_QUERY: - return MEMBARRIER_CMD_BITMASK; - case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) - synchronize_sched(); - return 0; - default: - return -EINVAL; - } -} diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfee6ea7db49..f77269c6c2f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2640,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3329,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). + */ ++*switch_count; trace_sched_switch(preempt, prev, next); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. 
*/ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. 
*/ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} -- cgit v1.2.3 From 27a0342ac162bf2ba30c288cfb7b72eabed38d8b Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 17 Aug 2017 16:42:17 +0200 Subject: soc/tegra: Register SoC device Move this code from arch/arm/mach-tegra and make it common among 32-bit and 64-bit Tegra SoCs. This is slightly complicated by the fact that on 32-bit Tegra, the SoC device is used as the parent for all devices that are instantiated from device tree. Signed-off-by: Thierry Reding --- arch/arm/mach-tegra/tegra.c | 29 +--------------------- drivers/soc/tegra/Kconfig | 5 ++++ drivers/soc/tegra/fuse/fuse-tegra.c | 48 +++++++++++++++++++++++++++++++++++-- include/soc/tegra/fuse.h | 2 ++ 4 files changed, 54 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-tegra/tegra.c b/arch/arm/mach-tegra/tegra.c index 649e9e8c7bcc..02e712d2ea30 100644 --- a/arch/arm/mach-tegra/tegra.c +++ b/arch/arm/mach-tegra/tegra.c @@ -84,35 +84,8 @@ static void __init tegra_dt_init_irq(void) static void __init tegra_dt_init(void) { - struct soc_device_attribute *soc_dev_attr; - struct soc_device *soc_dev; - struct device *parent = NULL; + struct device *parent = tegra_soc_device_register(); - soc_dev_attr = kzalloc(sizeof(*soc_dev_attr), GFP_KERNEL); - if (!soc_dev_attr) - goto out; - - soc_dev_attr->family = kasprintf(GFP_KERNEL, "Tegra"); - soc_dev_attr->revision = kasprintf(GFP_KERNEL, "%d", - tegra_sku_info.revision); - soc_dev_attr->soc_id = kasprintf(GFP_KERNEL, "%u", tegra_get_chip_id()); - - soc_dev = soc_device_register(soc_dev_attr); - if (IS_ERR(soc_dev)) { - kfree(soc_dev_attr->family); - kfree(soc_dev_attr->revision); - kfree(soc_dev_attr->soc_id); - kfree(soc_dev_attr); - goto out; - } - - parent = soc_device_to_device(soc_dev); - - /* - * Finished with the static registrations now; fill in the missing - * devices - */ -out: of_platform_default_populate(NULL, NULL, parent); } diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index 1beb7c347344..e9e277178c94 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -107,6 +107,11 @@ config ARCH_TEGRA_186_SOC endif endif +config SOC_TEGRA_FUSE + def_bool y + depends on ARCH_TEGRA + select SOC_BUS + config SOC_TEGRA_FLOWCTRL bool diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 7413f60fa855..e4f78de8f95f 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -19,10 +19,12 @@ #include #include #include -#include +#include #include #include -#include +#include +#include +#include #include #include @@ -210,6 +212,31 @@ static void tegra_enable_fuse_clk(void __iomem *base) writel(reg, base + 0x14); } +struct device * __init tegra_soc_device_register(void) +{ + struct soc_device_attribute *attr; + struct soc_device *dev; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return NULL; + + attr->family = kasprintf(GFP_KERNEL, "Tegra"); + attr->revision = kasprintf(GFP_KERNEL, "%d", tegra_sku_info.revision); + attr->soc_id = kasprintf(GFP_KERNEL, "%u", tegra_get_chip_id()); + + dev = soc_device_register(attr); + if (IS_ERR(dev)) { + kfree(attr->soc_id); + kfree(attr->revision); + kfree(attr->family); + kfree(attr); + return ERR_CAST(dev); + } + + return soc_device_to_device(dev); +} + static int 
__init tegra_init_fuse(void) { const struct of_device_id *match; @@ -311,6 +338,23 @@ static int __init tegra_init_fuse(void) pr_debug("Tegra CPU Speedo ID %d, SoC Speedo ID %d\n", tegra_sku_info.cpu_speedo_id, tegra_sku_info.soc_speedo_id); + return 0; } early_initcall(tegra_init_fuse); + +#ifdef CONFIG_ARM64 +static int __init tegra_init_soc(void) +{ + struct device *soc; + + soc = tegra_soc_device_register(); + if (IS_ERR(soc)) { + pr_err("failed to register SoC device: %ld\n", PTR_ERR(soc)); + return PTR_ERR(soc); + } + + return 0; +} +device_initcall(tegra_init_soc) +#endif diff --git a/include/soc/tegra/fuse.h b/include/soc/tegra/fuse.h index b4c9219e7f95..9b6ea0c72117 100644 --- a/include/soc/tegra/fuse.h +++ b/include/soc/tegra/fuse.h @@ -65,6 +65,8 @@ int tegra_fuse_readl(unsigned long offset, u32 *value); extern struct tegra_sku_info tegra_sku_info; +struct device *tegra_soc_device_register(void); + #endif /* __ASSEMBLY__ */ #endif /* __SOC_TEGRA_FUSE_H__ */ -- cgit v1.2.3 From d3a024abbc438277851c510b51ec9b158821488b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jun 2017 15:47:44 -0700 Subject: locking: Remove spin_unlock_wait() generic definitions There is no agreed-upon definition of spin_unlock_wait()'s semantics, and it appears that all callers could do just as well with a lock/unlock pair. This commit therefore removes spin_unlock_wait() and related definitions from core code. Signed-off-by: Paul E. McKenney Cc: Arnd Bergmann Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Alan Stern Cc: Andrea Parri Cc: Linus Torvalds --- include/asm-generic/qspinlock.h | 14 ----- include/linux/spinlock.h | 11 ---- include/linux/spinlock_up.h | 6 --- kernel/locking/qspinlock.c | 117 ---------------------------------------- 4 files changed, 148 deletions(-) (limited to 'include') diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 9f0681bf1e87..66260777d644 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -21,17 +21,6 @@ #include -/** - * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock - * @lock : Pointer to queued spinlock structure - * - * There is a very slight possibility of live-lock if the lockers keep coming - * and the waiter is just unfortunate enough to not see any unlock state. - */ -#ifndef queued_spin_unlock_wait -extern void queued_spin_unlock_wait(struct qspinlock *lock); -#endif - /** * queued_spin_is_locked - is the spinlock locked? * @lock: Pointer to queued spinlock structure @@ -41,8 +30,6 @@ extern void queued_spin_unlock_wait(struct qspinlock *lock); static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { /* - * See queued_spin_unlock_wait(). - * * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL * isn't immediately observable. */ @@ -135,6 +122,5 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) #define arch_spin_lock_flags(l, f) queued_spin_lock(l) -#define arch_spin_unlock_wait(l) queued_spin_unlock_wait(l) #endif /* __ASM_GENERIC_QSPINLOCK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 59248dcc6ef3..ef018a6e4985 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -130,12 +130,6 @@ do { \ #define smp_mb__before_spinlock() smp_wmb() #endif -/** - * raw_spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. 
- */ -#define raw_spin_unlock_wait(lock) arch_spin_unlock_wait(&(lock)->raw_lock) - #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) @@ -369,11 +363,6 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock) raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) -static __always_inline void spin_unlock_wait(spinlock_t *lock) -{ - raw_spin_unlock_wait(&lock->rlock); -} - static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 0d9848de677d..612fb530af41 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -26,11 +26,6 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define arch_spin_is_locked(x) ((x)->slock == 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { lock->slock = 0; @@ -73,7 +68,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) -#define arch_spin_unlock_wait(lock) do { barrier(); (void)(lock); } while (0) /* for sched/core.c and kernel_lock.c: */ # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) # define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. - * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. 
- * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. - * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. - */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** -- cgit v1.2.3 From c8c03f1858331e85d397bacccd34ef409aae993c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Aug 2017 17:08:07 -0700 Subject: pty: fix the cached path of the pty slave file descriptor in the master Christian Brauner reported that if you use the TIOCGPTPEER ioctl() to get a slave pty file descriptor, the resulting file descriptor doesn't look right in /proc//fd/. In particular, he wanted to use readlink() on /proc/self/fd/ to get the pathname of the slave pty (basically implementing "ptsname{_r}()"). The reason for that was that we had generated the wrong 'struct path' when we create the pty in ptmx_open(). In particular, the dentry was correct, but the vfsmount pointed to the mount of the ptmx node. That _can_ be correct - in case you use "/dev/pts/ptmx" to open the master - but usually is not. The normal case is to use /dev/ptmx, which then looks up the pts/ directory, and then the vfsmount of the ptmx node is obviously the /dev directory, not the /dev/pts/ directory. 
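For context, a userspace sketch of what the reporter was doing (hypothetical helper; TIOCGPTPEER visibility depends on the kernel/libc headers, and error handling is trimmed):

	#include <limits.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>

	/* Resolve the slave pty path for a master fd, a la ptsname(). */
	static int pts_name(int master, char *buf, size_t len)
	{
		char proc[64];
		ssize_t n;
		int slave = ioctl(master, TIOCGPTPEER, O_RDWR | O_NOCTTY);

		if (slave < 0)
			return -1;
		snprintf(proc, sizeof(proc), "/proc/self/fd/%d", slave);
		n = readlink(proc, buf, len - 1);
		close(slave);
		if (n < 0)
			return -1;
		buf[n] = '\0';	/* e.g. "/dev/pts/3" once the struct path is right */
		return 0;
	}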
We actually did have the right vfsmount available, but in the wrong place (it gets looked up in 'devpts_acquire()' when we get a reference to the pts filesystem), and so ptmx_open() used the wrong mnt pointer. The end result of this confusion was that the pty worked fine, but if you did TIOCGPTPEER to get the slave side of the pty, the end result would also work, but would have that dodgy 'struct path'. And then when doing d_path() on it to get the pathname, the vfsmount would not match the root of the pts directory, and d_path() would return an empty pathname, thinking that the entry had escaped a bind mount into another mount. This fixes the problem by making devpts_acquire() return the vfsmount for the pts filesystem, allowing ptmx_open() to trivially just use the right mount for the pts dentry, and create the proper 'struct path'. Reported-by: Christian Brauner Cc: Al Viro Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 +++++-- fs/devpts/inode.c | 4 +++- include/linux/devpts_fs.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include')
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..1fc80ea87c13 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,6 +793,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; + struct vfsmount *mnt; int retval; int index; @@ -805,7 +806,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp); + fsi = devpts_acquire(filp, &mnt); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -849,7 +850,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = filp->f_path.mnt; + pts_path->mnt = mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -866,6 +867,7 @@ err_path_put: path_put(pts_path); kfree(pts_path); err_release: + mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -874,6 +876,7 @@ out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); + mntput(mnt); out_free_file: tty_free_file(filp); return retval;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..44dfbca9306f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp) +struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) { struct pts_fs_info *result; struct path path; @@ -142,6 +142,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); + *ptsmnt = NULL; /* Has the devpts filesystem already been found?
*/ sb = path.mnt->mnt_sb; @@ -165,6 +166,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); + *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac2..7883e901f65c 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *); +struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); -- cgit v1.2.3 From bc8230ee8e2ba967af780cdaf2dcc0f8e5eb45ca Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 8 Jun 2017 14:39:48 +0200 Subject: quota: Convert dqio_mutex to rwsem Convert dqio_mutex to rwsem and call it dqio_sem. No functional changes yet. Signed-off-by: Jan Kara --- fs/ext4/super.c | 13 ++++--------- fs/ocfs2/quota_global.c | 20 ++++++++++---------- fs/ocfs2/quota_local.c | 10 +++++----- fs/quota/dquot.c | 28 ++++++++++++++-------------- fs/quota/quota_tree.c | 2 +- fs/super.c | 2 +- include/linux/quota.h | 2 +- 7 files changed, 36 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d61a70e2193a..4c168b90903c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5263,18 +5263,13 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* Helper function for writing quotas on sync - we need to start transaction - * before quota file is locked for write. Otherwise the are possible deadlocks: - * Process 1 Process 2 - * ext4_create() quota_sync() - * jbd2_journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) jbd2_journal_start() - * - */ #ifdef CONFIG_QUOTA +/* + * Helper functions so that transaction is started before we acquire dqio_sem + * to keep correct lock ordering of transaction > dqio_sem + */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index cec495a921e3..4134d557a8e5 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -33,7 +33,7 @@ * Locking of quotas with OCFS2 is rather complex. Here are rules that * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected - * by dqio_mutex or dquot->dq_lock. + * by dqio_sem or dquot->dq_lock. * - any modification of global quota file holds inode cluster lock, i_mutex, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. 
@@ -42,9 +42,9 @@ * * A rough sketch of locking dependencies (lf = local file, gf = global file): * Normal filesystem operation: - * start_trans -> dqio_mutex -> write to lf + * start_trans -> dqio_sem -> write to lf * Syncing of local and global file: - * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf * -> write to lf * Acquire dquot for the first time: @@ -60,7 +60,7 @@ * Recovery: * inode cluster lock of recovered lf * -> read bitmaps -> ip_alloc_sem of lf - * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * -> ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf */ @@ -611,7 +611,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); status = ocfs2_sync_dquot(dquot); if (status < 0) mlog_errno(status); @@ -619,7 +619,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) status = ocfs2_local_write_dquot(dquot); if (status < 0) mlog_errno(status); - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -666,9 +666,9 @@ static int ocfs2_write_dquot(struct dquot *dquot) mlog_errno(status); goto out; } - mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); status = ocfs2_local_write_dquot(dquot); - mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out: return status; @@ -939,7 +939,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); status = ocfs2_sync_dquot(dquot); if (status < 0) { mlog_errno(status); @@ -948,7 +948,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* Now write updated local dquot structure */ status = ocfs2_local_write_dquot(dquot); out_dlock: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 32c5a40c1257..1311eff1c050 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -520,7 +520,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, mlog_errno(status); goto out_drop_lock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); spin_lock(&dq_data_lock); /* Add usage from quota entry into quota changes * of our node. 
Auxiliary variables are important @@ -553,7 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); out_commit: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(OCFS2_SB(sb), handle); out_drop_lock: ocfs2_unlock_global_qf(oinfo, 1); @@ -693,7 +693,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) /* We don't need the lock and we have to acquire quota file locks * which will later depend on this lock */ - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); info->dqi_max_spc_limit = 0x7fffffffffffffffLL; info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); @@ -772,7 +772,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) goto out_err; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); return 0; out_err: if (oinfo) { @@ -786,7 +786,7 @@ out_err: kfree(oinfo); } brelse(bh); - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); return -1; } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 53a17496c5c5..29d447598642 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -120,7 +120,7 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex + * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -406,7 +406,7 @@ int dquot_acquire(struct dquot *dquot) struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); - mutex_lock(&dqopt->dqio_mutex); + down_write(&dqopt->dqio_sem); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) @@ -436,7 +436,7 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: - mutex_unlock(&dqopt->dqio_mutex); + up_write(&dqopt->dqio_sem); mutex_unlock(&dquot->dq_lock); return ret; } @@ -450,7 +450,7 @@ int dquot_commit(struct dquot *dquot) int ret = 0; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); - mutex_lock(&dqopt->dqio_mutex); + down_write(&dqopt->dqio_sem); spin_lock(&dq_list_lock); if (!clear_dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); @@ -464,7 +464,7 @@ int dquot_commit(struct dquot *dquot) else ret = -EIO; out_sem: - mutex_unlock(&dqopt->dqio_mutex); + up_write(&dqopt->dqio_sem); return ret; } EXPORT_SYMBOL(dquot_commit); @@ -481,7 +481,7 @@ int dquot_release(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out_dqlock; - mutex_lock(&dqopt->dqio_mutex); + down_write(&dqopt->dqio_sem); if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ @@ -493,7 +493,7 @@ int dquot_release(struct dquot *dquot) ret = ret2; } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); - mutex_unlock(&dqopt->dqio_mutex); + up_write(&dqopt->dqio_sem); out_dqlock: mutex_unlock(&dquot->dq_lock); return ret; @@ -2060,9 +2060,9 @@ int dquot_commit_info(struct super_block *sb, int type) int ret; struct quota_info *dqopt = sb_dqopt(sb); - mutex_lock(&dqopt->dqio_mutex); + down_write(&dqopt->dqio_sem); ret = dqopt->ops[type]->write_file_info(sb, type); - mutex_unlock(&dqopt->dqio_mutex); + 
up_write(&dqopt->dqio_sem); return ret; } EXPORT_SYMBOL(dquot_commit_info); @@ -2076,9 +2076,9 @@ int dquot_get_next_id(struct super_block *sb, struct kqid *qid) return -ESRCH; if (!dqopt->ops[qid->type]->get_next_id) return -ENOSYS; - mutex_lock(&dqopt->dqio_mutex); + down_write(&dqopt->dqio_sem); err = dqopt->ops[qid->type]->get_next_id(sb, qid); - mutex_unlock(&dqopt->dqio_mutex); + up_write(&dqopt->dqio_sem); return err; } EXPORT_SYMBOL(dquot_get_next_id); @@ -2328,15 +2328,15 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->info[type].dqi_format = fmt; dqopt->info[type].dqi_fmt_id = format_id; INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); - mutex_lock(&dqopt->dqio_mutex); + down_write(&dqopt->dqio_sem); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) { - mutex_unlock(&dqopt->dqio_mutex); + up_write(&dqopt->dqio_sem); goto out_file_init; } if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) dqopt->info[type].dqi_flags |= DQF_SYS_FILE; - mutex_unlock(&dqopt->dqio_mutex); + up_write(&dqopt->dqio_sem); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 0738972e8d3f..54f85eb2609c 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -379,7 +379,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!ddquot) return -ENOMEM; - /* dq_off is guarded by dqio_mutex */ + /* dq_off is guarded by dqio_sem */ if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { diff --git a/fs/super.c b/fs/super.c index 6bc3352adcf3..221cfa1f4e92 100644 --- a/fs/super.c +++ b/fs/super.c @@ -242,7 +242,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); - mutex_init(&s->s_dquot.dqio_mutex); + init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; diff --git a/include/linux/quota.h b/include/linux/quota.h index bfd077ca6ac3..3a6df7461642 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -521,7 +521,7 @@ static inline void quota_send_warning(struct kqid qid, dev_t dev, struct quota_info { unsigned int flags; /* Flags for diskquotas on this device */ - struct mutex dqio_mutex; /* lock device while I/O in progress */ + struct rw_semaphore dqio_sem; /* Lock quota file while I/O in progress */ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ const struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ -- cgit v1.2.3 From 1312b7e0caca44e7ff312bc2eaa888943384e3e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Aug 2017 00:31:58 +0200 Subject: ACPICA: Make it possible to enable runtime GPEs earlier Runtime GPEs have corresponding _Lxx/_Exx methods and are enabled automatically during the initialization of the ACPI subsystem through acpi_update_all_gpes() with the assumption that acpi_setup_gpe_for_wake() will be called in advance for all of the GPEs pointed to by _PRW objects in the namespace that may be affected by acpi_update_all_gpes(). That is, acpi_ev_initialize_gpe_block() can only be called for a GPE block after acpi_setup_gpe_for_wake() has been called for all of the _PRW (wakeup) GPEs in it. 
The platform firmware on some systems, however, expects GPEs to be enabled before the enumeration of devices which is when acpi_setup_gpe_for_wake() is called and that goes against the above assumption. For this reason, introduce a new flag to be set by acpi_ev_initialize_gpe_block() when automatically enabling a GPE to indicate to acpi_setup_gpe_for_wake() that it needs to drop the reference to the GPE coming from acpi_ev_initialize_gpe_block() and modify acpi_setup_gpe_for_wake() accordingly. These changes allow acpi_setup_gpe_for_wake() and acpi_ev_initialize_gpe_block() to be invoked in any order. Signed-off-by: Rafael J. Wysocki Tested-by: Mika Westerberg --- drivers/acpi/acpica/evgpeblk.c | 2 ++ drivers/acpi/acpica/evxfgpe.c | 8 ++++++++ include/acpi/actypes.h | 3 ++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index c27386cca493..3a3cb8624f41 100644 --- a/drivers/acpi/acpica/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -496,6 +496,8 @@ acpi_ev_initialize_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, continue; } + gpe_event_info->flags |= ACPI_GPE_AUTO_ENABLED; + if (event_status & ACPI_EVENT_FLAG_STATUS_SET) { ACPI_INFO(("GPE 0x%02X active on init", gpe_number)); diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 57718a3e029a..67c7c4ce276c 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -435,6 +435,14 @@ acpi_setup_gpe_for_wake(acpi_handle wake_device, */ gpe_event_info->flags = (ACPI_GPE_DISPATCH_NOTIFY | ACPI_GPE_LEVEL_TRIGGERED); + } else if (gpe_event_info->flags & ACPI_GPE_AUTO_ENABLED) { + /* + * A reference to this GPE has been added during the GPE block + * initialization, so drop it now to prevent the GPE from being + * permanently enabled and clear its ACPI_GPE_AUTO_ENABLED flag. + */ + (void)acpi_ev_remove_gpe_reference(gpe_event_info); + gpe_event_info->flags &= ~ACPI_GPE_AUTO_ENABLED; } /* diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 2fcbaec8b368..71eddf645566 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -775,7 +775,7 @@ typedef u32 acpi_event_status; * | | | | +-- Type of dispatch:to method, handler, notify, or none * | | | +----- Interrupt type: edge or level triggered * | | +------- Is a Wake GPE - * | +--------- Is GPE masked by the software GPE masking mechanism + * | +--------- Has been enabled automatically at init time * +------------ */ #define ACPI_GPE_DISPATCH_NONE (u8) 0x00 @@ -791,6 +791,7 @@ typedef u32 acpi_event_status; #define ACPI_GPE_XRUPT_TYPE_MASK (u8) 0x08 #define ACPI_GPE_CAN_WAKE (u8) 0x10 +#define ACPI_GPE_AUTO_ENABLED (u8) 0x20 /* * Flags for GPE and Lock interfaces -- cgit v1.2.3 From 5d72801538eb59cfd9ca25d00aa439cfbc02ac9a Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 17 Aug 2017 13:32:37 -0400 Subject: lsm_audit: update my email address Update my email address since epoch.ncsc.mil no longer exists. MAINTAINERS and CREDITS are already correct. 
Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 2 +- security/lsm_audit.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 22b5d4e687ce..d1c2901f1542 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -4,7 +4,7 @@ * * Author : Etienne BASSET * - * All credits to : Stephen Smalley, + * All credits to : Stephen Smalley, * All BUGS to : Etienne BASSET */ #ifndef _LSM_COMMON_LOGGING_ diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 28d4c3a528ab..67703dbe29ea 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -2,7 +2,7 @@ * common LSM auditing functions * * Based on code written for SELinux by : - * Stephen Smalley, + * Stephen Smalley, * James Morris * Author : Etienne Basset, * -- cgit v1.2.3 From 503330f3820fab13aa2a7b1f9e7633686acc7c79 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 2 Aug 2017 17:18:50 +0200 Subject: quota: Remove dq_wait_unused from dquot Currently every dquot carries a wait_queue_head_t used only when we are turning quotas off to wait for last users to drop dquot references. Since such rare case is not performance sensitive in any means, just use a global waitqueue for this and save space in struct dquot. Also convert the logic to use wait_event() instead of open-coding it. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 17 +++++++---------- include/linux/quota.h | 1 - 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 93adcdd6a260..361a2a6f13e1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -126,6 +126,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); DEFINE_STATIC_SRCU(dquot_srcu); +static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq); + void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) { @@ -527,22 +529,18 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { - DEFINE_WAIT(wait); - dqgrab(dquot); - prepare_to_wait(&dquot->dq_wait_unused, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&dq_list_lock); - /* Once dqput() wakes us up, we know it's time to free + /* + * Once dqput() wakes us up, we know it's time to free * the dquot. * IMPORTANT: we rely on the fact that there is always * at most one process waiting for dquot to free. * Otherwise dq_count would be > 1 and we would never * wake up. */ - if (atomic_read(&dquot->dq_count) > 1) - schedule(); - finish_wait(&dquot->dq_wait_unused, &wait); + wait_event(dquot_ref_wq, + atomic_read(&dquot->dq_count) == 1); dqput(dquot); /* At this moment dquot() need not exist (it could be * reclaimed by prune_dqcache(). Hence we must @@ -754,7 +752,7 @@ we_slept: /* Releasing dquot during quotaoff phase? 
*/ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) - wake_up(&dquot->dq_wait_unused); + wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } @@ -809,7 +807,6 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); - init_waitqueue_head(&dquot->dq_wait_unused); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); diff --git a/include/linux/quota.h b/include/linux/quota.h index 3a6df7461642..ad6809f099ac 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -299,7 +299,6 @@ struct dquot { struct list_head dq_dirty; /* List of dirty dquots */ struct mutex dq_lock; /* dquot IO lock */ atomic_t dq_count; /* Use count */ - wait_queue_head_t dq_wait_unused; /* Wait queue for dquot to become unused */ struct super_block *dq_sb; /* superblock this applies to */ struct kqid dq_id; /* ID this applies to (uid, gid, projid) */ loff_t dq_off; /* Offset of dquot on disk */ -- cgit v1.2.3 From 834057bf846691552a8906f7ed3f67546e5f897c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Aug 2017 11:18:23 +0200 Subject: quota: Allow disabling tracking of dirty dquots in a list Filesystems that are journalling quotas generally don't need tracking of dirty dquots in a list since forcing a transaction commit flushes all quotas anyway. Allow filesystem to say it doesn't want dquots to be tracked as it reduces contention on the dq_list_lock. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 6 ++++++ include/linux/quota.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 361a2a6f13e1..b867578e62c0 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -344,6 +344,9 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) return 0; + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) + return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); + /* If quota is dirty already, we don't have to acquire dq_list_lock */ if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return 1; @@ -385,6 +388,9 @@ static inline void dqput_all(struct dquot **dquot) static inline int clear_dquot_dirty(struct dquot *dquot) { + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) + return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); + spin_lock(&dq_list_lock); if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); diff --git a/include/linux/quota.h b/include/linux/quota.h index ad6809f099ac..eccc1cb6274e 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -490,6 +490,9 @@ enum { */ #define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) /* Allow negative quota usage */ +/* Do not track dirty dquots in a list */ +#define DQUOT_NOLIST_DIRTY (1 << (DQUOT_STATE_LAST + 2)) + static inline unsigned int dquot_state_flag(unsigned int flags, int type) { return flags << type; -- cgit v1.2.3 From 0ed60de34a975804a256fb3fe233b1466c603be6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Aug 2017 17:07:28 +0200 Subject: quota: Inline functions into their callsites inode_add_rsv_space() and inode_sub_rsv_space() had only one callsite. Inline them there directly. inode_claim_rsv_space() and inode_reclaim_rsv_space() had two callsites so inline them there as well. This will simplify further locking changes. 
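A condensed before/after sketch (simplified from the patch itself, not a complete function set) shows why the inlining helps: once the one-line helper is gone, the i_lock critical section is visible at the call site, and a later patch can extend it over the adjacent quota updates.

/* Before: the one-line helper hides the locking from the caller. */
void inode_claim_rsv_space(struct inode *inode, qsize_t number)
{
	spin_lock(&inode->i_lock);
	*inode_reserved_space(inode) -= number;
	__inode_add_bytes(inode, number);
	spin_unlock(&inode->i_lock);
}

/* After: the same work open-coded in the caller
 * (e.g. dquot_claim_space_nodirty()), so the i_lock section can
 * later be widened around neighbouring quota accounting updates. */
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
spin_unlock(&inode->i_lock);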
Signed-off-by: Jan Kara --- fs/quota/dquot.c | 72 +++++++++++++++++++----------------------------- include/linux/quotaops.h | 5 ---- 2 files changed, 28 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b867578e62c0..d881d5a073b9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1583,40 +1583,6 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -void inode_add_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_add_rsv_space); - -void inode_claim_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - __inode_add_bytes(inode, number); - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_claim_rsv_space); - -void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; - __inode_sub_bytes(inode, number); - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_reclaim_rsv_space); - -void inode_sub_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_sub_rsv_space); - static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; @@ -1632,18 +1598,24 @@ static qsize_t inode_get_rsv_space(struct inode *inode) static void inode_incr_space(struct inode *inode, qsize_t number, int reserve) { - if (reserve) - inode_add_rsv_space(inode, number); - else + if (reserve) { + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); + } else { inode_add_bytes(inode, number); + } } static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) { - if (reserve) - inode_sub_rsv_space(inode, number); - else + if (reserve) { + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); + } else { inode_sub_bytes(inode, number); + } } /* @@ -1759,7 +1731,10 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) int cnt, index; if (!dquot_active(inode)) { - inode_claim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); return 0; } @@ -1772,7 +1747,10 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) dquot_claim_reserved_space(dquots[cnt], number); } /* Update inode bytes */ - inode_claim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); @@ -1789,7 +1767,10 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) int cnt, index; if (!dquot_active(inode)) { - inode_reclaim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); return; } @@ -1802,7 +1783,10 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) dquot_reclaim_reserved_space(dquots[cnt], number); } /* Update inode bytes */ - inode_reclaim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + 
__inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index dda22f45fc1b..0ce6fc49962e 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -38,11 +38,6 @@ void __quota_error(struct super_block *sb, const char *func, /* * declaration of quota_function calls in kernel. */ -void inode_add_rsv_space(struct inode *inode, qsize_t number); -void inode_claim_rsv_space(struct inode *inode, qsize_t number); -void inode_sub_rsv_space(struct inode *inode, qsize_t number); -void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); - int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); -- cgit v1.2.3 From f4a8116a4c8c8f754d0ec1498a2ba4b63d114e6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Aug 2017 09:54:36 +0200 Subject: fs: Provide __inode_get_bytes() Provide helper __inode_get_bytes() which assumes i_lock is already acquired. Quota code will need this to be able to use i_lock to protect consistency of quota accounting information and inode usage. Signed-off-by: Jan Kara --- fs/stat.c | 2 +- include/linux/fs.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/stat.c b/fs/stat.c index c35610845ab1..8a6aa8caf891 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -710,7 +710,7 @@ loff_t inode_get_bytes(struct inode *inode) loff_t ret; spin_lock(&inode->i_lock); - ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..d6e9ab7f184f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2998,6 +2998,10 @@ void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); +static inline loff_t __inode_get_bytes(struct inode *inode) +{ + return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; +} loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, -- cgit v1.2.3 From 7b9ca4c61bc278b771fb57d6290a31ab1fc7fdac Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Aug 2017 13:19:50 +0200 Subject: quota: Reduce contention on dq_data_lock dq_data_lock is currently used to protect all modifications of quota accounting information, consistency of quota accounting on the inode, and dquot pointers from inode. As a result contention on the lock can be pretty heavy. Reduce the contention on the lock by protecting quota accounting information by a new dquot->dq_dqb_lock and consistency of quota accounting with inode usage by inode->i_lock. This change reduces time to create 500000 files on ext4 on ramdisk by 50 different processes in separate directories by 6% when user quota is turned on. When those 50 processes belong to 50 different users, the improvement is about 9%. 
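Reduced to its core, the new scheme looks roughly like the sketch below (condensed from the patched __dquot_alloc_space(); limit checks, warnings and error backout omitted): inode->i_lock nests outside each dquot's dq_dqb_lock, so inode usage and quota usage stay mutually consistent while updates to different dquots no longer serialize on one global spinlock.

/* Lock order after this patch: inode->i_lock > dquot->dq_dqb_lock. */
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
	if (!dquots[cnt])
		continue;
	spin_lock(&dquots[cnt]->dq_dqb_lock);
	dquots[cnt]->dq_dqb.dqb_curspace += number;	/* per-dquot data */
	spin_unlock(&dquots[cnt]->dq_dqb_lock);
}
__inode_add_bytes(inode, number);	/* i_blocks/i_bytes under i_lock */
spin_unlock(&inode->i_lock);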
Signed-off-by: Jan Kara --- fs/ext4/super.c | 4 +- fs/ocfs2/quota_global.c | 8 +- fs/ocfs2/quota_local.c | 8 +- fs/quota/dquot.c | 287 ++++++++++++++++++++++++++++-------------------- fs/quota/quota_tree.c | 8 +- include/linux/quota.h | 3 +- 6 files changed, 186 insertions(+), 132 deletions(-) (limited to 'include') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a877a0f309..67ce21224dab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5194,7 +5194,7 @@ static int ext4_statfs_project(struct super_block *sb, dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); limit = (dquot->dq_dqb.dqb_bsoftlimit ? dquot->dq_dqb.dqb_bsoftlimit : @@ -5217,7 +5217,7 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 78f3a869748c..c94b6baaa551 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -504,7 +504,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) /* Update space and inode usage. Get also other information from * global quota file so that we don't overwrite any changes there. * We are */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); spacechange = dquot->dq_dqb.dqb_curspace - OCFS2_DQUOT(dquot)->dq_origspace; inodechange = dquot->dq_dqb.dqb_curinodes - @@ -560,7 +560,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); err = ocfs2_qinfo_lock(info, freeing); if (err < 0) { mlog(ML_ERROR, "Failed to lock quota info, losing quota write" @@ -924,10 +924,10 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (dquot->dq_flags & mask) sync = 1; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* This is a slight hack but we can't afford getting global quota * lock if we already have a transaction started. */ if (!sync || journal_current_handle()) { diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2eeedcc2e9da..aa700fd10610 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -521,7 +521,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, goto out_drop_lock; } down_write(&sb_dqopt(sb)->dqio_sem); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); /* Add usage from quota entry into quota changes * of our node. Auxiliary variables are important * due to signedness */ @@ -529,7 +529,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, inodechange = le64_to_cpu(dqblk->dqb_inodemod); dquot->dq_dqb.dqb_curspace += spacechange; dquot->dq_dqb.dqb_curinodes += inodechange; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* We want to drop reference held by the crashed * node. Since we have our own reference we know * global structure actually won't be freed. 
*/ @@ -877,12 +877,12 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, od->dq_dquot.dq_id)); - spin_lock(&dq_data_lock); + spin_lock(&od->dq_dquot.dq_dqb_lock); dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - od->dq_origspace); dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - od->dq_originodes); - spin_unlock(&dq_data_lock); + spin_unlock(&od->dq_dquot.dq_dqb_lock); trace_olq_set_dquot( (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 411142a2f074..d51797f850c5 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -82,16 +82,19 @@ #include /* - * There are three quota SMP locks. dq_list_lock protects all lists with quotas - * and quota formats. - * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and - * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. - * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly - * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects - * modifications of quota state (on quotaon and quotaoff) and readers who care - * about latest values take it as well. + * There are five quota SMP locks: + * * dq_list_lock protects all lists with quotas and quota formats. + * * dquot->dq_dqb_lock protects data from dq_dqb + * * inode->i_lock protects inode->i_blocks, i_bytes and also guards + * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that + * dquot_transfer() can stabilize amount it transfers + * * dq_data_lock protects mem_dqinfo structures and modifications of dquot + * pointers in the inode + * * dq_state_lock protects modifications of quota state (on quotaon and + * quotaoff) and readers who care about latest values take it as well. * - * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * The spinlock ordering is hence: + * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock, * dq_list_lock > dq_state_lock * * Note that some things (eg. 
sb pointer, type, id) doesn't change during @@ -246,6 +249,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); +static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); static inline unsigned int @@ -816,6 +820,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); + spin_lock_init(&dquot->dq_dqb_lock); return dquot; } @@ -1073,21 +1078,6 @@ static void drop_dquot_ref(struct super_block *sb, int type) } } -static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_curinodes += number; -} - -static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_curspace += number; -} - -static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_rsvspace += number; -} - static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { @@ -1246,21 +1236,24 @@ static int ignore_hardlimit(struct dquot *dquot) !(info->dqi_flags & DQF_ROOT_SQUASH)); } -/* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, qsize_t inodes, - struct dquot_warn *warn) +static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, + struct dquot_warn *warn) { - qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; + qsize_t newinodes; + int ret = 0; + spin_lock(&dquot->dq_dqb_lock); + newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return 0; + goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1269,7 +1262,8 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1279,30 +1273,40 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } +add: + dquot->dq_dqb.dqb_curinodes = newinodes; - return 0; +out: + spin_unlock(&dquot->dq_dqb_lock); + return ret; } -/* needs dq_data_lock */ -static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, - struct dquot_warn *warn) +static int dquot_add_space(struct dquot *dquot, qsize_t space, + qsize_t rsv_space, unsigned int flags, + struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; + int ret = 0; + spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return 0; + goto add; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace - + space; + + space + rsv_space; + + if (flags & DQUOT_SPACE_NOFAIL) + goto add; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { - if (!prealloc) + if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1310,28 +1314,34 @@ static int 
check_bdq(struct dquot *dquot, qsize_t space, int prealloc, dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { - if (!prealloc) + if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { - if (!prealloc) { + if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; - } - else + } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return -EDQUOT; + ret = -EDQUOT; + goto out; + } } - - return 0; +add: + dquot->dq_dqb.dqb_rsvspace += rsv_space; + dquot->dq_dqb.dqb_curspace += space; +out: + spin_unlock(&dquot->dq_dqb_lock); + return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1466,8 +1476,15 @@ static int __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) - dquot_resv_space(dquots[cnt], rsv); + if (unlikely(rsv)) { + spin_lock(&inode->i_lock); + /* Get reservation again under proper lock */ + rsv = __inode_get_rsv_space(inode); + spin_lock(&dquots[cnt]->dq_dqb_lock); + dquots[cnt]->dq_dqb.dqb_rsvspace += rsv; + spin_unlock(&dquots[cnt]->dq_dqb_lock); + spin_unlock(&inode->i_lock); + } } } out_lock: @@ -1562,6 +1579,13 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } +static qsize_t __inode_get_rsv_space(struct inode *inode) +{ + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + return *inode_reserved_space(inode); +} + static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; @@ -1569,7 +1593,7 @@ static qsize_t inode_get_rsv_space(struct inode *inode) if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); - ret = *inode_reserved_space(inode); + ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } @@ -1610,33 +1634,41 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) continue; - ret = check_bdq(dquots[cnt], number, - !(flags & DQUOT_SPACE_WARN), &warn[cnt]); - if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { - spin_unlock(&dq_data_lock); + if (flags & DQUOT_SPACE_RESERVE) { + ret = dquot_add_space(dquots[cnt], 0, number, flags, + &warn[cnt]); + } else { + ret = dquot_add_space(dquots[cnt], number, 0, flags, + &warn[cnt]); + } + if (ret) { + /* Back out changes we already did */ + for (cnt--; cnt >= 0; cnt--) { + if (!dquots[cnt]) + continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); + if (flags & DQUOT_SPACE_RESERVE) { + dquots[cnt]->dq_dqb.dqb_rsvspace -= + number; + } else { + dquots[cnt]->dq_dqb.dqb_curspace -= + number; + } + spin_unlock(&dquots[cnt]->dq_dqb_lock); + } + spin_unlock(&inode->i_lock); goto out_flush_warn; } } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!dquots[cnt]) - continue; - if (reserve) - dquot_resv_space(dquots[cnt], number); - else - dquot_incr_space(dquots[cnt], number); - } - if (reserve) { - spin_lock(&inode->i_lock); + if (reserve) *inode_reserved_space(inode) += number; - 
spin_unlock(&inode->i_lock); - } else { - inode_add_bytes(inode, number); - } - spin_unlock(&dq_data_lock); + else + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; @@ -1665,23 +1697,26 @@ int dquot_alloc_inode(struct inode *inode) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) continue; - ret = check_idq(dquots[cnt], 1, &warn[cnt]); - if (ret) + ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]); + if (ret) { + for (cnt--; cnt >= 0; cnt--) { + if (!dquots[cnt]) + continue; + /* Back out changes we already did */ + spin_lock(&dquots[cnt]->dq_dqb_lock); + dquots[cnt]->dq_dqb.dqb_curinodes--; + spin_unlock(&dquots[cnt]->dq_dqb_lock); + } goto warn_put_all; - } - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!dquots[cnt]) - continue; - dquot_incr_inodes(dquots[cnt], 1); + } } warn_put_all: - spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); if (ret == 0) mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); @@ -1708,24 +1743,24 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (dquots[cnt]) { struct dquot *dquot = dquots[cnt]; + spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) number = dquot->dq_dqb.dqb_rsvspace; dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; + spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ - spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); - spin_unlock(&dq_data_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return 0; @@ -1750,24 +1785,24 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (dquots[cnt]) { struct dquot *dquot = dquots[cnt]; + spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) number = dquot->dq_dqb.dqb_curspace; dquot->dq_dqb.dqb_rsvspace += number; dquot->dq_dqb.dqb_curspace -= number; + spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ - spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); - spin_unlock(&dq_data_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return; @@ -1797,13 +1832,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; if (!dquots[cnt]) continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); wtype = info_bdq_free(dquots[cnt], number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquots[cnt], wtype); @@ -1811,15 +1847,13 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) dquot_free_reserved_space(dquots[cnt], number); else dquot_decr_space(dquots[cnt], number); + 
spin_unlock(&dquots[cnt]->dq_dqb_lock); } - if (reserve) { - spin_lock(&inode->i_lock); + if (reserve) *inode_reserved_space(inode) -= number; - spin_unlock(&inode->i_lock); - } else { - inode_sub_bytes(inode, number); - } - spin_unlock(&dq_data_lock); + else + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; @@ -1845,19 +1879,21 @@ void dquot_free_inode(struct inode *inode) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; if (!dquots[cnt]) continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); wtype = info_idq_free(dquots[cnt], 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquots[cnt], wtype); dquot_decr_inodes(dquots[cnt], 1); + spin_unlock(&dquots[cnt]->dq_dqb_lock); } - spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); @@ -1878,7 +1914,7 @@ EXPORT_SYMBOL(dquot_free_inode); */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { - qsize_t space, cur_space; + qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot *transfer_from[MAXQUOTAS] = {}; @@ -1905,14 +1941,18 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) } spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } - cur_space = inode_get_bytes(inode); - rsv_space = inode_get_rsv_space(inode); - space = cur_space + rsv_space; - /* Build the transfer_from list and check the limits */ + cur_space = __inode_get_bytes(inode); + rsv_space = __inode_get_rsv_space(inode); + /* + * Build the transfer_from list, check limits, and update usage in + * the target structures. + */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for same uid or gid or for turned off quota-type. 
@@ -1924,28 +1964,33 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = i_dquot(inode)[cnt]; - ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]); + ret = dquot_add_inodes(transfer_to[cnt], inode_usage, + &warn_to[cnt]); if (ret) goto over_quota; - ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); - if (ret) + ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0, + &warn_to[cnt]); + if (ret) { + dquot_decr_inodes(transfer_to[cnt], inode_usage); goto over_quota; + } } - /* - * Finally perform the needed transfer from transfer_from to transfer_to - */ + /* Decrease usage for source structures and update quota pointers */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!is_valid[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; + + spin_lock(&transfer_from[cnt]->dq_dqb_lock); wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); - wtype = info_bdq_free(transfer_from[cnt], space); + wtype = info_bdq_free(transfer_from[cnt], + cur_space + rsv_space); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); @@ -1953,14 +1998,11 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); + spin_unlock(&transfer_from[cnt]->dq_dqb_lock); } - - dquot_incr_inodes(transfer_to[cnt], inode_usage); - dquot_incr_space(transfer_to[cnt], cur_space); - dquot_resv_space(transfer_to[cnt], rsv_space); - i_dquot(inode)[cnt] = transfer_to[cnt]; } + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(transfer_from); @@ -1974,6 +2016,17 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) transfer_to[cnt] = transfer_from[cnt]; return 0; over_quota: + /* Back out changes we already did */ + for (cnt--; cnt >= 0; cnt--) { + if (!is_valid[cnt]) + continue; + spin_lock(&transfer_to[cnt]->dq_dqb_lock); + dquot_decr_inodes(transfer_to[cnt], inode_usage); + dquot_decr_space(transfer_to[cnt], cur_space); + dquot_free_reserved_space(transfer_to[cnt], rsv_space); + spin_unlock(&transfer_to[cnt]->dq_dqb_lock); + } + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); flush_warnings(warn_to); return ret; @@ -2524,7 +2577,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; @@ -2533,7 +2586,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, @@ -2597,7 +2650,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; @@ -2663,7 +2716,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) 
clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); mark_dquot_dirty(dquot); return 0; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 54f85eb2609c..bb3f59bcfcf5 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -389,9 +389,9 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) return ret; } } - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->mem2disk_dqblk(ddquot, dquot); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { @@ -649,14 +649,14 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) kfree(ddquot); goto out; } - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->disk2mem_dqblk(dquot, ddquot); if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); kfree(ddquot); out: dqstats_inc(DQST_READS); diff --git a/include/linux/quota.h b/include/linux/quota.h index eccc1cb6274e..074123975595 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -298,12 +298,13 @@ struct dquot { struct list_head dq_free; /* Free list element */ struct list_head dq_dirty; /* List of dirty dquots */ struct mutex dq_lock; /* dquot IO lock */ + spinlock_t dq_dqb_lock; /* Lock protecting dq_dqb changes */ atomic_t dq_count; /* Use count */ struct super_block *dq_sb; /* superblock this applies to */ struct kqid dq_id; /* ID this applies to (uid, gid, projid) */ loff_t dq_off; /* Offset of dquot on disk */ unsigned long dq_flags; /* See DQ_* */ - struct mem_dqblk dq_dqb; /* Diskquota usage */ + struct mem_dqblk dq_dqb; /* Diskquota usage [dq_dqb_lock] */ }; /* Operations which must be implemented by each quota format */ -- cgit v1.2.3 From c71b02e4d207cbcf097f9746d5f7967b22905e70 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 9 Aug 2017 21:11:00 -0700 Subject: Revert "pstore: Honor dmesg_restrict sysctl on dmesg dumps" This reverts commit 68c4a4f8abc60c9440ede9cd123d48b78325f7a3, with various conflict clean-ups. The capability check required too much privilege compared to simple DAC controls. A system builder was forced to have crash handler processes run with CAP_SYSLOG which would give it the ability to read (and wipe) the _current_ dmesg, which is much more access than being given access only to the historical log stored in pstorefs. With the prior commit to make the root directory 0750, the files are protected by default but a system builder can now opt to give access to a specific group (via chgrp on the pstorefs root directory) without being forced to also give away CAP_SYSLOG. 
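Concretely, assuming the default mount point, something like "chgrp crash /sys/fs/pstore" (with a site-defined "crash" group) now suffices to let a crash handler read the historical logs without holding CAP_SYSLOG.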
Suggested-by: Nick Kralevich Signed-off-by: Kees Cook Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky --- fs/pstore/inode.c | 22 ---------------------- include/linux/syslog.h | 9 --------- kernel/printk/printk.c | 3 +-- 3 files changed, 1 insertion(+), 33 deletions(-) (limited to 'include') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index f1e88b695090..d814723fb27d 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -36,7 +36,6 @@ #include #include #include -#include #include "internal.h" @@ -132,18 +131,6 @@ static const struct seq_operations pstore_ftrace_seq_ops = { .show = pstore_ftrace_seq_show, }; -static int pstore_check_syslog_permissions(struct pstore_private *ps) -{ - switch (ps->record->type) { - case PSTORE_TYPE_DMESG: - case PSTORE_TYPE_CONSOLE: - return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - default: - return 0; - } -} - static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -163,10 +150,6 @@ static int pstore_file_open(struct inode *inode, struct file *file) int err; const struct seq_operations *sops = NULL; - err = pstore_check_syslog_permissions(ps); - if (err) - return err; - if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; @@ -204,11 +187,6 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; - int err; - - err = pstore_check_syslog_permissions(p); - if (err) - return err; if (!record->psi->erase) return -EPERM; diff --git a/include/linux/syslog.h b/include/linux/syslog.h index c3a7f0cc3a27..e1c3632f4e81 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -49,13 +49,4 @@ int do_syslog(int type, char __user *buf, int count, int source); -#ifdef CONFIG_PRINTK -int check_syslog_permissions(int type, int source); -#else -static inline int check_syslog_permissions(int type, int source) -{ - return 0; -} -#endif - #endif /* _LINUX_SYSLOG_H */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc47863f629c..97bda7b0655b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -649,7 +649,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -int check_syslog_permissions(int type, int source) +static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -677,7 +677,6 @@ int check_syslog_permissions(int type, int source) ok: return security_syslog(type); } -EXPORT_SYMBOL_GPL(check_syslog_permissions); static void append_char(char **pp, char *e, char c) { -- cgit v1.2.3 From 726fb6b4f2a82a14a906f39bdabac4863b87c01a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 15 Aug 2017 18:16:59 -0700 Subject: ACPI / PM: Check low power idle constraints for debug only For SoC to achieve its lowest power platform idle state a set of hardware preconditions must be met. These preconditions or constraints can be obtained by issuing a device specific method (_DSM) with function "1". Refer to the document provided in the link below. Here during initialization (from attach() callback of LPS0 device), invoke function 1 to get the device constraints. Each enabled constraint is stored in a table. The devices in this table are used to check whether they were in required minimum state, while entering suspend. 
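In condensed form (a sketch of the data and of the wake-time pass, not the complete _DSM parser), each table entry pairs an ACPI handle with the minimum D-state the device must have reached:

struct lpi_constraints {
	acpi_handle handle;	/* device listed by _DSM function 1 */
	int min_dstate;		/* required minimum power state */
};

/* Wake-time pass, reduced to its core: */
for (i = 0; i < lpi_constraints_table_size; i++) {
	struct acpi_device *adev;

	if (acpi_bus_get_device(lpi_constraints_table[i].handle, &adev))
		continue;
	if (adev->power.state < lpi_constraints_table[i].min_dstate)
		acpi_handle_info(adev->handle, "LPI: Constraint not met\n");
}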
This check is done from platform freeze wake() callback, only when /sys/power/pm_debug_messages attribute is non zero. If any constraint is not met and device is ACPI power managed then it prints the device information to kernel logs. Also if debug is enabled in acpi/sleep.c, the constraint table and state of each device on wake is dumped in kernel logs. Since pm_debug_messages_on setting is used as condition to check constraints outside kernel/power/main.c, pm_debug_messages_on is changed to a global variable. Link: http://www.uefi.org/sites/default/files/resources/Intel_ACPI_Low_Power_S0_Idle.pdf Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/suspend.h | 2 + kernel/power/main.c | 2 +- 3 files changed, 171 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index f7a8abbeac6e..61b2e9f17c4e 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -669,6 +669,7 @@ static const struct acpi_device_id lps0_device_ids[] = { #define ACPI_LPS0_DSM_UUID "c4eb40a0-6cd2-11e2-bcfd-0800200c9a66" +#define ACPI_LPS0_GET_DEVICE_CONSTRAINTS 1 #define ACPI_LPS0_SCREEN_OFF 3 #define ACPI_LPS0_SCREEN_ON 4 #define ACPI_LPS0_ENTRY 5 @@ -680,6 +681,166 @@ static acpi_handle lps0_device_handle; static guid_t lps0_dsm_guid; static char lps0_dsm_func_mask; +/* Device constraint entry structure */ +struct lpi_device_info { + char *name; + int enabled; + union acpi_object *package; +}; + +/* Constraint package structure */ +struct lpi_device_constraint { + int uid; + int min_dstate; + int function_states; +}; + +struct lpi_constraints { + acpi_handle handle; + int min_dstate; +}; + +static struct lpi_constraints *lpi_constraints_table; +static int lpi_constraints_table_size; + +static void lpi_device_get_constraints(void) +{ + union acpi_object *out_obj; + int i; + + out_obj = acpi_evaluate_dsm_typed(lps0_device_handle, &lps0_dsm_guid, + 1, ACPI_LPS0_GET_DEVICE_CONSTRAINTS, + NULL, ACPI_TYPE_PACKAGE); + + acpi_handle_debug(lps0_device_handle, "_DSM function 1 eval %s\n", + out_obj ? 
"successful" : "failed"); + + if (!out_obj) + return; + + lpi_constraints_table = kcalloc(out_obj->package.count, + sizeof(*lpi_constraints_table), + GFP_KERNEL); + if (!lpi_constraints_table) + goto free_acpi_buffer; + + acpi_handle_debug(lps0_device_handle, "LPI: constraints list begin:\n"); + + for (i = 0; i < out_obj->package.count; i++) { + struct lpi_constraints *constraint; + acpi_status status; + union acpi_object *package = &out_obj->package.elements[i]; + struct lpi_device_info info = { }; + int package_count = 0, j; + + if (!package) + continue; + + for (j = 0; j < package->package.count; ++j) { + union acpi_object *element = + &(package->package.elements[j]); + + switch (element->type) { + case ACPI_TYPE_INTEGER: + info.enabled = element->integer.value; + break; + case ACPI_TYPE_STRING: + info.name = element->string.pointer; + break; + case ACPI_TYPE_PACKAGE: + package_count = element->package.count; + info.package = element->package.elements; + break; + } + } + + if (!info.enabled || !info.package || !info.name) + continue; + + constraint = &lpi_constraints_table[lpi_constraints_table_size]; + + status = acpi_get_handle(NULL, info.name, &constraint->handle); + if (ACPI_FAILURE(status)) + continue; + + acpi_handle_debug(lps0_device_handle, + "index:%d Name:%s\n", i, info.name); + + constraint->min_dstate = -1; + + for (j = 0; j < package_count; ++j) { + union acpi_object *info_obj = &info.package[j]; + union acpi_object *cnstr_pkg; + union acpi_object *obj; + struct lpi_device_constraint dev_info; + + switch (info_obj->type) { + case ACPI_TYPE_INTEGER: + /* version */ + break; + case ACPI_TYPE_PACKAGE: + if (info_obj->package.count < 2) + break; + + cnstr_pkg = info_obj->package.elements; + obj = &cnstr_pkg[0]; + dev_info.uid = obj->integer.value; + obj = &cnstr_pkg[1]; + dev_info.min_dstate = obj->integer.value; + + acpi_handle_debug(lps0_device_handle, + "uid:%d min_dstate:%s\n", + dev_info.uid, + acpi_power_state_string(dev_info.min_dstate)); + + constraint->min_dstate = dev_info.min_dstate; + break; + } + } + + if (constraint->min_dstate < 0) { + acpi_handle_debug(lps0_device_handle, + "Incomplete constraint defined\n"); + continue; + } + + lpi_constraints_table_size++; + } + + acpi_handle_debug(lps0_device_handle, "LPI: constraints list end\n"); + +free_acpi_buffer: + ACPI_FREE(out_obj); +} + +static void lpi_check_constraints(void) +{ + int i; + + for (i = 0; i < lpi_constraints_table_size; ++i) { + struct acpi_device *adev; + + if (acpi_bus_get_device(lpi_constraints_table[i].handle, &adev)) + continue; + + acpi_handle_debug(adev->handle, + "LPI: required min power state:%s current power state:%s\n", + acpi_power_state_string(lpi_constraints_table[i].min_dstate), + acpi_power_state_string(adev->power.state)); + + if (!adev->flags.power_manageable) { + acpi_handle_info(adev->handle, "LPI: Device not power manageble\n"); + continue; + } + + if (adev->power.state < lpi_constraints_table[i].min_dstate) + acpi_handle_info(adev->handle, + "LPI: Constraint not met; min power state:%s current power state:%s\n", + acpi_power_state_string(lpi_constraints_table[i].min_dstate), + acpi_power_state_string(adev->power.state)); + } +} + static void acpi_sleep_run_lps0_dsm(unsigned int func) { union acpi_object *out_obj; @@ -729,6 +890,9 @@ static int lps0_device_attach(struct acpi_device *adev, "_DSM function 0 evaluation failed\n"); } ACPI_FREE(out_obj); + + lpi_device_get_constraints(); + return 0; } @@ -766,6 +930,10 @@ static int acpi_s2idle_prepare(void) static void 
acpi_s2idle_wake(void) { + + if (pm_debug_messages_on) + lpi_check_constraints(); + /* * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means * that the SCI has triggered while suspended, so cancel the wakeup in diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0d41daf7e89d..d10b7980799d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -495,9 +495,11 @@ static inline void unlock_system_sleep(void) {} #ifdef CONFIG_PM_SLEEP_DEBUG extern bool pm_print_times_enabled; +extern bool pm_debug_messages_on; extern __printf(2, 3) void __pm_pr_dbg(bool defer, const char *fmt, ...); #else #define pm_print_times_enabled (false) +#define pm_debug_messages_on (false) #include diff --git a/kernel/power/main.c b/kernel/power/main.c index 3074ea4cec0a..3a2ca9066583 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -361,7 +361,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, power_attr_ro(pm_wakeup_irq); -static bool pm_debug_messages_on __read_mostly; +bool pm_debug_messages_on __read_mostly; static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -- cgit v1.2.3 From 74def747bcd09692bdbf8c6a15350795b0f11ca8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 09:39:14 +0100 Subject: genirq: Restrict effective affinity to interrupts actually using it Just because CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK is selected doesn't mean that all the interrupts are using the effective affinity mask. For a number of them, this mask is likely to be empty. In order to deal with this, let's restrict the use of the effective affinity mask to those interrupts that have a non-empty effective affinity. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Andrew Lunn Cc: James Hogan Cc: Jason Cooper Cc: Paul Burton Cc: Chris Zankel Cc: Kevin Cernekee Cc: Wei Xu Cc: Max Filippov Cc: Florian Fainelli Cc: Gregory Clement Cc: Matt Redfearn Cc: Sebastian Hesselbarth Link: http://lkml.kernel.org/r/20170818083925.10108-2-marc.zyngier@arm.com --- include/linux/irq.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index d2d543794093..dcfac6c8ba18 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -781,7 +781,10 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) static inline struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { - return d->common->effective_affinity; + if (!cpumask_empty(d->common->effective_affinity)) + return d->common->effective_affinity; + + return d->common->affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) -- cgit v1.2.3 From 7703b08cc93b3586f9eb733f3a2b10bed634a5cf Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:31 -0700 Subject: genirq: Add handle_fasteoi_{level,edge}_irq flow handlers A follow-on patch for gpio-thunderx uses an irqdomain hierarchy which requires slightly different flow handlers; add them to chip.c, which contains most of the other flow handlers. Make these conditionally compiled based on CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS.
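As a rough illustration (not part of this patch; the driver and callback names are hypothetical), a hierarchical GPIO irqchip that selects IRQ_FASTEOI_HIERARCHY_HANDLERS would typically switch between the two handlers in its irq_set_type() callback:

	static int my_gpio_irq_set_type(struct irq_data *d, unsigned int type)
	{
		/* Edge triggers want an explicit ack, level triggers a mask-ack */
		if (type & IRQ_TYPE_EDGE_BOTH)
			irq_set_handler_locked(d, handle_fasteoi_ack_irq);
		else
			irq_set_handler_locked(d, handle_fasteoi_mask_irq);

		/* Hand the trigger type on to the parent domain */
		return irq_chip_set_type_parent(d, type);
	}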
Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Alexandre Courbot Cc: Marc Zyngier Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-3-git-send-email-david.daney@cavium.com --- include/linux/irq.h | 2 + kernel/irq/Kconfig | 4 ++ kernel/irq/chip.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index d2d543794093..d4728bf6a537 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -568,6 +568,8 @@ extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); extern int irq_chip_pm_put(struct irq_data *data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +extern void handle_fasteoi_ack_irq(struct irq_desc *desc); +extern void handle_fasteoi_mask_irq(struct irq_desc *desc); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1d06af787932..a117adf7084b 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -73,6 +73,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Support for hierarchical fasteoi+edge and fasteoi+level handlers +config IRQ_FASTEOI_HIERARCHY_HANDLERS + bool + # Generic IRQ IPI support config GENERIC_IRQ_IPI bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6514f07acaad..23958980189d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1092,6 +1092,112 @@ void irq_cpu_offline(void) } #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + +#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS +/** + * handle_fasteoi_ack_irq - irq handler for edge hierarchy + * stacked on transparent controllers + * + * @desc: the interrupt description structure for this irq + * + * Like handle_fasteoi_irq(), but for use with hierarchy where + * the irq_chip also needs to have its ->irq_ack() function + * called. + */ +void handle_fasteoi_ack_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + raw_spin_lock(&desc->lock); + + if (!irq_may_run(desc)) + goto out; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + /* + * If its disabled or no action available + * then mask it and get out of here: + */ + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + mask_irq(desc); + goto out; + } + + kstat_incr_irqs_this_cpu(desc); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + + /* Start handling the irq */ + desc->irq_data.chip->irq_ack(&desc->irq_data); + + preflow_handler(desc); + handle_irq_event(desc); + + cond_unmask_eoi_irq(desc, chip); + + raw_spin_unlock(&desc->lock); + return; +out: + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); + +/** + * handle_fasteoi_mask_irq - irq handler for level hierarchy + * stacked on transparent controllers + * + * @desc: the interrupt description structure for this irq + * + * Like handle_fasteoi_irq(), but for use with hierarchy where + * the irq_chip also needs to have its ->irq_mask_ack() function + * called. 
+ */ +void handle_fasteoi_mask_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + raw_spin_lock(&desc->lock); + mask_ack_irq(desc); + + if (!irq_may_run(desc)) + goto out; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + /* + * If its disabled or no action available + * then mask it and get out of here: + */ + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + mask_irq(desc); + goto out; + } + + kstat_incr_irqs_this_cpu(desc); + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + + preflow_handler(desc); + handle_irq_event(desc); + + cond_unmask_eoi_irq(desc, chip); + + raw_spin_unlock(&desc->lock); + return; +out: + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); + +#endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3 From 495c38d3001fd226cf91df1d031320f349bcaf35 Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 17 Aug 2017 17:53:34 -0700 Subject: irqdomain: Add irq_domain_{push,pop}_irq() functions For an already existing irqdomain hierarchy, as might be obtained via a call to pci_enable_msix_range(), a PCI driver wishing to add an additional irqdomain to the hierarchy needs to be able to insert the irqdomain to that already initialized hierarchy. Calling irq_domain_create_hierarchy() allows the new irqdomain to be created, but no existing code allows for initializing the associated irq_data. Add a couple of helper functions (irq_domain_push_irq() and irq_domain_pop_irq()) to initialize the irq_data for the new irqdomain added to an existing hierarchy. Signed-off-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Mark Rutland Cc: Alexandre Courbot Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/1503017616-3252-6-git-send-email-david.daney@cavium.com --- include/linux/irqdomain.h | 3 + kernel/irq/irqdomain.c | 169 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index cac77a5c5555..2318f29054af 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -460,6 +460,9 @@ extern void irq_domain_free_irqs_common(struct irq_domain *domain, extern void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs); +extern int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg); +extern int irq_domain_pop_irq(struct irq_domain *domain, int virq); + extern int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 24fda7557cef..1ff9912211e9 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1449,6 +1449,175 @@ out_free_desc: return ret; } +/* The irq_data was moved, fix the revmap to refer to the new location */ +static void irq_domain_fix_revmap(struct irq_data *d) +{ + void **slot; + + if (d->hwirq < d->domain->revmap_size) + return; /* Not using radix tree. */ + + /* Fix up the revmap. 
*/ + mutex_lock(&revmap_trees_mutex); + slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq); + if (slot) + radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); + mutex_unlock(&revmap_trees_mutex); +} + +/** + * irq_domain_push_irq() - Push a domain in to the top of a hierarchy. + * @domain: Domain to push. + * @virq: Irq to push the domain in to. + * @arg: Passed to the irq_domain_ops alloc() function. + * + * For an already existing irqdomain hierarchy, as might be obtained + * via a call to pci_enable_msix(), add an additional domain to the + * head of the processing chain. Must be called before request_irq() + * has been called. + */ +int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) +{ + struct irq_data *child_irq_data; + struct irq_data *root_irq_data = irq_get_irq_data(virq); + struct irq_desc *desc; + int rv = 0; + + /* + * Check that no action has been set, which indicates the virq + * is in a state where this function doesn't have to deal with + * races between interrupt handling and maintaining the + * hierarchy. This will catch gross misuse. Attempting to + * make the check race free would require holding locks across + * calls to struct irq_domain_ops->alloc(), which could lead + * to deadlock, so we just do a simple check before starting. + */ + desc = irq_to_desc(virq); + if (!desc) + return -EINVAL; + if (WARN_ON(desc->action)) + return -EBUSY; + + if (domain == NULL) + return -EINVAL; + + if (WARN_ON(!irq_domain_is_hierarchy(domain))) + return -EINVAL; + + if (domain->parent != root_irq_data->domain) + return -EINVAL; + + if (!root_irq_data) + return -EINVAL; + + child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, + irq_data_get_node(root_irq_data)); + if (!child_irq_data) + return -ENOMEM; + + mutex_lock(&irq_domain_mutex); + + /* Copy the original irq_data. */ + *child_irq_data = *root_irq_data; + + /* + * Overwrite the root_irq_data, which is embedded in struct + * irq_desc, with values for this domain. + */ + root_irq_data->parent_data = child_irq_data; + root_irq_data->domain = domain; + root_irq_data->mask = 0; + root_irq_data->hwirq = 0; + root_irq_data->chip = NULL; + root_irq_data->chip_data = NULL; + + /* May (probably does) set hwirq, chip, etc. */ + rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); + if (rv) { + /* Restore the original irq_data. */ + *root_irq_data = *child_irq_data; + goto error; + } + + irq_domain_fix_revmap(child_irq_data); + irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data); + +error: + mutex_unlock(&irq_domain_mutex); + + return rv; +} +EXPORT_SYMBOL_GPL(irq_domain_push_irq); + +/** + * irq_domain_pop_irq() - Remove a domain from the top of a hierarchy. + * @domain: Domain to remove. + * @virq: Irq to remove the domain from. + * + * Undo the effects of a call to irq_domain_push_irq(). Must be + * called either before request_irq() or after free_irq(). + */ +int irq_domain_pop_irq(struct irq_domain *domain, int virq) +{ + struct irq_data *root_irq_data = irq_get_irq_data(virq); + struct irq_data *child_irq_data; + struct irq_data *tmp_irq_data; + struct irq_desc *desc; + + /* + * Check that no action is set, which indicates the virq is in + * a state where this function doesn't have to deal with races + * between interrupt handling and maintaining the hierarchy. + * This will catch gross misuse. 
Attempting to make the check + * race free would require holding locks across calls to + * struct irq_domain_ops->free(), which could lead to + * deadlock, so we just do a simple check before starting. + */ + desc = irq_to_desc(virq); + if (!desc) + return -EINVAL; + if (WARN_ON(desc->action)) + return -EBUSY; + + if (domain == NULL) + return -EINVAL; + + if (!root_irq_data) + return -EINVAL; + + tmp_irq_data = irq_domain_get_irq_data(domain, virq); + + /* We can only "pop" if this domain is at the top of the list */ + if (WARN_ON(root_irq_data != tmp_irq_data)) + return -EINVAL; + + if (WARN_ON(root_irq_data->domain != domain)) + return -EINVAL; + + child_irq_data = root_irq_data->parent_data; + if (WARN_ON(!child_irq_data)) + return -EINVAL; + + mutex_lock(&irq_domain_mutex); + + root_irq_data->parent_data = NULL; + + irq_domain_clear_mapping(domain, root_irq_data->hwirq); + irq_domain_free_irqs_hierarchy(domain, virq, 1); + + /* Restore the original irq_data. */ + *root_irq_data = *child_irq_data; + + irq_domain_fix_revmap(root_irq_data); + + mutex_unlock(&irq_domain_mutex); + + kfree(child_irq_data); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_pop_irq); + /** * irq_domain_free_irqs - Free IRQ number and associated data structures * @virq: base IRQ number -- cgit v1.2.3 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumes a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme turbo modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period, which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low-pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed, the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups.
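To put illustrative numbers on this (assumed values, not from the patch): with watchdog_thresh = 10s the hrtimer sample period is (2 * 10s) / 5 = 4s and the filter threshold is 2 * 4s = 8s, i.e. 4/5 of the 10s watchdog period. On a CPU running at 3x the nominal frequency the perf NMI can arrive after only ~3.3s; since 3.3s < 8s the sample is discarded and the counter rearmed instead of being reported as a hard lockup.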
Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- arch/x86/Kconfig | 1 + include/linux/nmi.h | 8 +++++++ kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 7 ++++++ 5 files changed, 76 insertions(+) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..9101bfc85539 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8aa01fd859fb..a36abe2da13e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -168,6 +168,14 @@ extern int sysctl_hardlockup_all_cpu_backtrace; #define sysctl_softlockup_all_cpu_backtrace 0 #define sysctl_hardlockup_all_cpu_backtrace 0 #endif + +#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ + defined(CONFIG_HARDLOCKUP_DETECTOR) +void watchdog_update_hrtimer_threshold(u64 period); +#else +static inline void watchdog_update_hrtimer_threshold(u64 period) { } +#endif + extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. 
+ * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..c617b9d1d6cb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -797,6 +797,13 @@ config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR +# +# Enables a timestamp based low pass filter to compensate for perf based +# hard lockup detection which runs too fast due to turbo modes. +# +config HARDLOCKUP_CHECK_TIMESTAMP + bool + # # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard # lockup detector rather than the perf based detector. -- cgit v1.2.3 From b94417eaa5f5a20d58a99328a401c0b5a812ec7d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sun, 13 Aug 2017 11:58:30 +0800 Subject: btrfs: use BTRFS_FSID_SIZE for fsid We have define for FSID size so use it. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 1e4908dcd065..dc1d0df91e0b 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -74,11 +74,11 @@ struct prelim_ref; { BTRFS_BLOCK_GROUP_RAID5, "RAID5"}, \ { BTRFS_BLOCK_GROUP_RAID6, "RAID6"} -#define BTRFS_UUID_SIZE 16 -#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_UUID_SIZE) +#define BTRFS_FSID_SIZE 16 +#define TP_STRUCT__entry_fsid __array(u8, fsid, BTRFS_FSID_SIZE) #define TP_fast_assign_fsid(fs_info) \ - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE) + memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE) #define TP_STRUCT__entry_btrfs(args...) 
\ TP_STRUCT__entry( \ @@ -618,7 +618,7 @@ TRACE_EVENT(btrfs_add_block_group, TP_ARGS(fs_info, block_group, create), TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) + __array( u8, fsid, BTRFS_FSID_SIZE ) __field( u64, offset ) __field( u64, size ) __field( u64, flags ) @@ -628,7 +628,7 @@ TRACE_EVENT(btrfs_add_block_group, ), TP_fast_assign( - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); + memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE); __entry->offset = block_group->key.objectid; __entry->size = block_group->key.offset; __entry->flags = block_group->flags; @@ -975,7 +975,7 @@ TRACE_EVENT(btrfs_trigger_flush, TP_ARGS(fs_info, flags, bytes, flush, reason), TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) + __array( u8, fsid, BTRFS_FSID_SIZE ) __field( u64, flags ) __field( u64, bytes ) __field( int, flush ) @@ -983,7 +983,7 @@ TRACE_EVENT(btrfs_trigger_flush, ), TP_fast_assign( - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); + memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE); __entry->flags = flags; __entry->bytes = bytes; __entry->flush = flush; @@ -1016,7 +1016,7 @@ TRACE_EVENT(btrfs_flush_space, TP_ARGS(fs_info, flags, num_bytes, state, ret), TP_STRUCT__entry( - __array( u8, fsid, BTRFS_UUID_SIZE ) + __array( u8, fsid, BTRFS_FSID_SIZE ) __field( u64, flags ) __field( u64, num_bytes ) __field( int, state ) @@ -1024,7 +1024,7 @@ TRACE_EVENT(btrfs_flush_space, ), TP_fast_assign( - memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE); + memcpy(__entry->fsid, fs_info->fsid, BTRFS_FSID_SIZE); __entry->flags = flags; __entry->num_bytes = num_bytes; __entry->state = state; -- cgit v1.2.3 From d352ae205d8b05f3f7558d10f474d8436581b3e2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:03 -0700 Subject: blk-mq: Make blk_mq_reinit_tagset() calls easier to read Since blk_mq_ops.reinit_request is only called from inside blk_mq_reinit_tagset(), make this function pointer an argument of blk_mq_reinit_tagset() instead of a member of struct blk_mq_ops. This patch does not change any functionality but makes blk_mq_reinit_tagset() calls easier to read and to analyze. 
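For callers, the conversion is mechanical: drop .reinit_request from the blk_mq_ops and pass the function at the call site instead. A minimal sketch (the callback name is hypothetical, mirroring the nvme conversions below):

	static int my_reinit_request(void *data, struct request *rq)
	{
		/* re-establish any driver-private state attached to rq */
		return 0;
	}

	ret = blk_mq_reinit_tagset(&ctrl->tag_set, my_reinit_request);
	if (ret)
		goto requeue;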
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 9 +++++---- drivers/nvme/host/fc.c | 4 +--- drivers/nvme/host/rdma.c | 11 ++++++----- include/linux/blk-mq.h | 5 ++--- 4 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dc9e6dac5a2a..6714507aa6c7 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -298,11 +298,12 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)) { int i, j, ret = 0; - if (!set->ops->reinit_request) + if (WARN_ON_ONCE(!reinit_request)) goto out; for (i = 0; i < set->nr_hw_queues; i++) { @@ -315,8 +316,8 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set) if (!tags->static_rqs[j]) continue; - ret = set->ops->reinit_request(set->driver_data, - tags->static_rqs[j]); + ret = reinit_request(set->driver_data, + tags->static_rqs[j]); if (ret) goto out; } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5c2a08ef08ba..1438be649866 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2168,7 +2168,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = { .complete = nvme_fc_complete_rq, .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, - .reinit_request = nvme_fc_reinit_request, .init_hctx = nvme_fc_init_hctx, .poll = nvme_fc_poll, .timeout = nvme_fc_timeout, @@ -2269,7 +2268,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set); + ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); if (ret) goto out_free_io_queues; @@ -2655,7 +2654,6 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = { .complete = nvme_fc_complete_rq, .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, - .reinit_request = nvme_fc_reinit_request, .init_hctx = nvme_fc_init_admin_hctx, .timeout = nvme_fc_timeout, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index da04df1af231..9ff0eb3a625e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -704,14 +704,16 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_rdma_free_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set); + ret = blk_mq_reinit_tagset(&ctrl->tag_set, + nvme_rdma_reinit_request); if (ret) goto requeue; } nvme_rdma_stop_and_free_queue(&ctrl->queues[0]); - ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set); + ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set, + nvme_rdma_reinit_request); if (ret) goto requeue; @@ -1503,7 +1505,6 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, - .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_hctx, .poll = nvme_rdma_poll, .timeout = nvme_rdma_timeout, @@ -1514,7 +1515,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .complete = nvme_rdma_complete_rq, .init_request = nvme_rdma_init_request, .exit_request = nvme_rdma_exit_request, - .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_admin_hctx, .timeout = nvme_rdma_timeout, }; @@ 
-1712,7 +1712,8 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } if (ctrl->ctrl.queue_count > 1) { - ret = blk_mq_reinit_tagset(&ctrl->tag_set); + ret = blk_mq_reinit_tagset(&ctrl->tag_set, + nvme_rdma_reinit_request); if (ret) goto del_dead_ctrl; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 14542308d25b..50c6485cb04f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -97,7 +97,6 @@ typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int); -typedef int (reinit_request_fn)(void *, struct request *); typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); @@ -143,7 +142,6 @@ struct blk_mq_ops { */ init_request_fn *init_request; exit_request_fn *exit_request; - reinit_request_fn *reinit_request; /* Called from inside blk_get_request() */ void (*initialize_rq_fn)(struct request *rq); @@ -261,7 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From e1cba4b85daa71b710384d451ff6238d5e4d1ff6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Aug 2017 15:33:09 -0400 Subject: cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup A new mount option "cpuset_v2_mode" is added to the v1 cgroupfs filesystem to enable the cpuset controller to use v2 behavior in a v1 cgroup. This mount option applies only to the cpuset controller and has no effect on other controllers. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 5 +++++ kernel/cgroup/cgroup-v1.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 59e4ad9e7bac..ade4a78a54c2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -74,6 +74,11 @@ enum { * aren't writeable from inside the namespace. */ CGRP_ROOT_NS_DELEGATE = (1 << 3), + + /* + * Enable cpuset controller in v1 cgroup to use v2 behavior.
+ */ + CGRP_ROOT_CPUSET_V2_MODE = (1 << 4), }; /* cftype->flags */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f0e8601b13cb..024085daab1a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -895,6 +895,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) + seq_puts(seq, ",cpuset_v2_mode"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) @@ -949,6 +951,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) opts->cpuset_clone_children = true; continue; } + if (!strcmp(token, "cpuset_v2_mode")) { + opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; + continue; + } if (!strcmp(token, "xattr")) { opts->flags |= CGRP_ROOT_XATTR; continue; -- cgit v1.2.3 From f01f969e25ecc6f88844bcdd63f1f27610418bbc Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 18 Aug 2017 20:27:54 +0530 Subject: PCI: endpoint: Add an API to get matching "pci_epf_device_id" Add an API to get "pci_epf_device_id" matching the EPF name. This can be used by the EPF driver to get the driver data corresponding to the EPF device name. Signed-off-by: Kishon Vijay Abraham I [bhelgaas: folded in "while" loop termination fix from Colin Ian King ] Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/pci-epf-core.c | 16 ++++++++++++++++ include/linux/pci-epf.h | 2 ++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index 6877d6a5bcc9..f14e2be52658 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -267,6 +267,22 @@ err_ret: } EXPORT_SYMBOL_GPL(pci_epf_create); +const struct pci_epf_device_id * +pci_epf_match_device(const struct pci_epf_device_id *id, struct pci_epf *epf) +{ + if (!id || !epf) + return NULL; + + while (*id->name) { + if (strcmp(epf->name, id->name) == 0) + return id; + id++; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(pci_epf_match_device); + static void pci_epf_dev_release(struct device *dev) { struct pci_epf *epf = to_pci_epf(dev); diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 0d529cb90143..6ed63b5e106b 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -149,6 +149,8 @@ static inline void *epf_get_drvdata(struct pci_epf *epf) return dev_get_drvdata(&epf->dev); } +const struct pci_epf_device_id * +pci_epf_match_device(const struct pci_epf_device_id *id, struct pci_epf *epf); struct pci_epf *pci_epf_create(const char *name); void pci_epf_destroy(struct pci_epf *epf); int __pci_epf_register_driver(struct pci_epf_driver *driver, -- cgit v1.2.3 From 300238cecede382fbd9084499e57f3b30acc75fa Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Mon, 31 Jul 2017 16:36:55 -0300 Subject: dma-buf/sync_file: document flags field Documentation for it was missing. 
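For reference, flags currently carries a single bit, POLL_ENABLED, manipulated with the standard bitops; the poll() implementation sets it the first time userspace polls so the fence callback is registered only once. A sketch of that pattern (based on the existing sync_file code, not added by this patch):

	if (!test_and_set_bit(POLL_ENABLED, &sync_file->flags))
		dma_fence_add_callback(sync_file->fence, &sync_file->cb,
				       fence_check_cb_func);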
Signed-off-by: Gustavo Padovan Acked-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20170731193655.6176-1-gustavo@padovan.org --- include/linux/sync_file.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sync_file.h b/include/linux/sync_file.h index 0ad87c434ae6..790ca021203a 100644 --- a/include/linux/sync_file.h +++ b/include/linux/sync_file.h @@ -25,8 +25,12 @@ * @file: file representing this fence * @sync_file_list: membership in global file list * @wq: wait queue for fence signaling + * @flags: flags for the sync_file * @fence: fence with the fences in the sync_file * @cb: fence callback information + * + * flags: + * POLL_ENABLED: whether userspace is currently poll()'ing or not */ struct sync_file { struct file *file; -- cgit v1.2.3 From 92d50fc1602ecef44babe411c475344e55e1cdd9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 19 Jul 2017 15:01:06 +0200 Subject: PCI/IB: add support for pci driver attribute groups Some drivers (specifically the nes IB driver), want to create a lot of sysfs driver attributes. Instead of open-coding the creation and removal of these files (and getting it wrong btw), it's a better idea to let the driver core handle all of this logic for us. So add a new field to the pci driver structure, **groups, that allows pci drivers to specify an attribute group list it wishes to have created when it is registered with the driver core. Big bonus is now the driver doesn't race with userspace when the sysfs files are created vs. when the kobject is announced, so any script/tool that actually wanted to use these files will not have to poll waiting for them to show up. Cc: Faisal Latif Cc: Doug Ledford Cc: Sean Hefty Cc: Hal Rosenstock Cc: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes.c | 67 ++++++++++++++--------------------------- drivers/pci/pci-driver.c | 1 + include/linux/pci.h | 1 + 3 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index a30aa6527f7e..84162e9ae5c0 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -808,13 +808,6 @@ static void nes_remove(struct pci_dev *pcidev) } -static struct pci_driver nes_pci_driver = { - .name = DRV_NAME, - .id_table = nes_pci_table, - .probe = nes_probe, - .remove = nes_remove, -}; - static ssize_t adapter_show(struct device_driver *ddp, char *buf) { unsigned int devfn = 0xffffffff; @@ -1156,35 +1149,29 @@ static DRIVER_ATTR_RW(idx_addr); static DRIVER_ATTR_RW(idx_data); static DRIVER_ATTR_RW(wqm_quanta); -static int nes_create_driver_sysfs(struct pci_driver *drv) -{ - int error; - error = driver_create_file(&drv->driver, &driver_attr_adapter); - error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd); - error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data); - error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd); - error |= driver_create_file(&drv->driver, &driver_attr_flash_data); - error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr); - error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data); - error |= driver_create_file(&drv->driver, &driver_attr_idx_addr); - error |= driver_create_file(&drv->driver, &driver_attr_idx_data); - error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta); - return error; -} +static struct attribute *nes_attrs[] = { + 
&driver_attr_adapter.attr, + &driver_attr_eeprom_cmd.attr, + &driver_attr_eeprom_data.attr, + &driver_attr_flash_cmd.attr, + &driver_attr_flash_data.attr, + &driver_attr_nonidx_addr.attr, + &driver_attr_nonidx_data.attr, + &driver_attr_idx_addr.attr, + &driver_attr_idx_data.attr, + &driver_attr_wqm_quanta.attr, + NULL, +}; +ATTRIBUTE_GROUPS(nes); + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = nes_remove, + .groups = nes_groups, +}; -static void nes_remove_driver_sysfs(struct pci_driver *drv) -{ - driver_remove_file(&drv->driver, &driver_attr_adapter); - driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd); - driver_remove_file(&drv->driver, &driver_attr_eeprom_data); - driver_remove_file(&drv->driver, &driver_attr_flash_cmd); - driver_remove_file(&drv->driver, &driver_attr_flash_data); - driver_remove_file(&drv->driver, &driver_attr_nonidx_addr); - driver_remove_file(&drv->driver, &driver_attr_nonidx_data); - driver_remove_file(&drv->driver, &driver_attr_idx_addr); - driver_remove_file(&drv->driver, &driver_attr_idx_data); - driver_remove_file(&drv->driver, &driver_attr_wqm_quanta); -} /** * nes_init_module - module initialization entry point @@ -1192,20 +1179,13 @@ static void nes_remove_driver_sysfs(struct pci_driver *drv) static int __init nes_init_module(void) { int retval; - int retval1; retval = nes_cm_start(); if (retval) { printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); return retval; } - retval = pci_register_driver(&nes_pci_driver); - if (retval >= 0) { - retval1 = nes_create_driver_sysfs(&nes_pci_driver); - if (retval1 < 0) - printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n"); - } - return retval; + return pci_register_driver(&nes_pci_driver); } @@ -1215,7 +1195,6 @@ static int __init nes_init_module(void) static void __exit nes_exit_module(void) { nes_cm_stop(); - nes_remove_driver_sysfs(&nes_pci_driver); pci_unregister_driver(&nes_pci_driver); } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d51e8738f9c2..4450feaf5c00 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1307,6 +1307,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, drv->driver.bus = &pci_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; + drv->driver.groups = drv->groups; spin_lock_init(&drv->dynids.lock); INIT_LIST_HEAD(&drv->dynids.list); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..c421af60f2df 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -729,6 +729,7 @@ struct pci_driver { void (*shutdown) (struct pci_dev *dev); int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* PF pdev */ const struct pci_error_handlers *err_handler; + const struct attribute_group **groups; struct device_driver driver; struct pci_dynids dynids; }; -- cgit v1.2.3 From 5f8a4db715ca21c3195b5eea24e26574c0f5acfa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 31 Jul 2017 08:50:05 +0200 Subject: infiniband: avoid overflow warning A sockaddr_in structure on the stack getting passed into rdma_ip2gid triggers this warning, since we memcpy into a larger sockaddr_in6 structure: In function 'memcpy', inlined from 'rdma_ip2gid' at include/rdma/ib_addr.h:175:3, inlined from 'addr_event.isra.4.constprop' at drivers/infiniband/core/roce_gid_mgmt.c:693:2, inlined from 'inetaddr_event' at drivers/infiniband/core/roce_gid_mgmt.c:716:9: include/linux/string.h:305:4: error: call to 
'__read_overflow2' declared with attribute error: detected read beyond size of object passed as 2nd parameter The warning seems appropriate here, but the code is also clearly correct, so we really just want to shut up this instance of the output. The best way I found so far is to avoid the memcpy() call and instead replace it with a struct assignment. Fixes: 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions") Cc: Daniel Micay Cc: Kees Cook Signed-off-by: Arnd Bergmann Signed-off-by: Doug Ledford --- include/rdma/ib_addr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index b73a14edc85e..454e6ea742a5 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -172,7 +172,8 @@ static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) (struct in6_addr *)gid); break; case AF_INET6: - memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, 16); + *(struct in6_addr *)&gid->raw = + ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; -- cgit v1.2.3 From ce7c252a8c741aba7c38f817b86e34361f561e42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Aug 2017 15:30:35 -0400 Subject: SUNRPC: Add a separate spinlock to protect the RPC request receive list This further reduces contention with the transport_lock, and allows us to convert to using a non-bh-safe spinlock, since the list is now never accessed from a bh context. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/svcsock.c | 6 +++--- net/sunrpc/xprt.c | 20 ++++++++++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 8 ++++---- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 7 +++++-- net/sunrpc/xprtsock.c | 30 ++++++++++++++++-------------- 6 files changed, 41 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 65b9e0224753..a97e6de5f9f2 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -232,6 +232,7 @@ struct rpc_xprt { */ spinlock_t transport_lock; /* lock transport info */ spinlock_t reserve_lock; /* lock slot table */ + spinlock_t recv_lock; /* lock receive list */ u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2b720fa35c4f..272063ca81e8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1001,7 +1001,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock_bh(&bc_xprt->transport_lock); + spin_lock(&bc_xprt->recv_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1019,7 +1019,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1028,7 +1028,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return -EAGAIN; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3eb9ec16eec4..2af189c5ac3e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -872,17 +872,17 @@ void 
xprt_unpin_rqst(struct rpc_rqst *req) } static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) -__must_hold(&req->rq_xprt->transport_lock) +__must_hold(&req->rq_xprt->recv_lock) { struct rpc_task *task = req->rq_task; if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { - spin_unlock_bh(&req->rq_xprt->transport_lock); + spin_unlock(&req->rq_xprt->recv_lock); set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, TASK_UNINTERRUPTIBLE); clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - spin_lock_bh(&req->rq_xprt->transport_lock); + spin_lock(&req->rq_xprt->recv_lock); } } @@ -1008,13 +1008,13 @@ void xprt_transmit(struct rpc_task *task) /* * Add to the list only if we're expecting a reply */ - spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ + spin_lock(&xprt->recv_lock); list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -1329,15 +1329,18 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); + spin_lock(&xprt->recv_lock); + if (!list_empty(&req->rq_list)) { + list_del(&req->rq_list); + xprt_wait_on_pinned_rqst(req); + } + spin_unlock(&xprt->recv_lock); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) xprt->ops->release_request(task); - if (!list_empty(&req->rq_list)) - list_del(&req->rq_list); xprt->last_used = jiffies; xprt_schedule_autodisconnect(xprt); - xprt_wait_on_pinned_rqst(req); spin_unlock_bh(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); @@ -1361,6 +1364,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); + spin_lock_init(&xprt->recv_lock); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ca4d6e4528f3..dfa748a0c8de 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1051,7 +1051,7 @@ rpcrdma_reply_handler(struct work_struct *work) * RPC completion while holding the transport lock to ensure * the rep, rqst, and rq_task pointers remain stable. */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); if (!rqst) goto out_norqst; @@ -1136,7 +1136,7 @@ out: xprt_release_rqst_cong(rqst->rq_task); xprt_complete_rqst(rqst->rq_task, status); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); return; @@ -1187,12 +1187,12 @@ out_rdmaerr: r_xprt->rx_stats.bad_reply_count++; goto out; -/* The req was still available, but by the time the transport_lock +/* The req was still available, but by the time the recv_lock * was acquired, the rqst and task had been released. Thus the RPC * has already been terminated. 
*/ out_norqst: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); rpcrdma_buffer_put(req); dprintk("RPC: %s: race, no rqst left for req %p\n", __func__, req); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c676ed0efb5a..0d574cda242d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -52,7 +52,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -69,17 +69,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(req->rq_task); + spin_unlock_bh(&xprt->transport_lock); + ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); rcvbuf->len = 0; out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); out: return ret; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a344bea15fc7..2b918137aaa0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -969,12 +969,12 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; copied = rovr->rq_private_buf.buflen; @@ -983,16 +983,16 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { dprintk("RPC: sk_buff copy failed\n"); - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); goto out_unpin; } - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); out_unpin: xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_local_data_receive(struct sock_xprt *transport) @@ -1055,12 +1055,12 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1069,7 +1069,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. 
*/ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); goto out_unpin; } @@ -1077,11 +1077,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); + spin_unlock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); out_unpin: xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1344,24 +1346,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); /* Find and lock the request corresponding to this xid */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, transport->tcp_xid); if (!req) { dprintk("RPC: XID %08x request not found!\n", ntohl(transport->tcp_xid)); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); return -1; } xprt_pin_rqst(req); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); xs_tcp_read_common(xprt, desc, req); - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_rqst(req->rq_task, transport->tcp_copied); xprt_unpin_rqst(req); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); return 0; } -- cgit v1.2.3 From 62ede7779904bc75bdd84f1ff0016113956ce3b4 Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Mon, 14 Aug 2017 14:17:43 -0400 Subject: Add OPA extended LID support This patch series primarily increases sizes of variables that hold lid values from 16 to 32 bits. Additionally, it adds a check in the IB mad stack to verify a properly formatted MAD when OPA extended LIDs are used. 
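The resulting caller pattern (taken from the hunks below; both helpers are defined at the end of this diff) is:

	u16 slid = ib_lid_cpu16(in_wc->slid);	/* 16-bit CPU byte order */
	__be16 lid = ib_lid_be16(attr.lid);	/* 16-bit big endian */

Both helpers WARN_ON_ONCE() when the upper 16 bits of the 32-bit LID are set, flagging OPA extended LIDs that cannot be represented in an IB-sized field.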
Signed-off-by: Don Hiatt Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 ++-- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 11 ++++++----- drivers/infiniband/hw/hfi1/mad.c | 2 +- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 8 ++++---- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_cmd.c | 4 ++-- drivers/infiniband/hw/mthca/mthca_mad.c | 4 ++-- drivers/infiniband/sw/rdmavt/cq.c | 2 +- include/rdma/ib_verbs.h | 26 ++++++++++++++++++++------ 11 files changed, 41 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 7a389697e2ec..fa3b0a428195 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1770,7 +1770,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = ib_slid_be16(wc->slid); + req_msg->primary_local_lid = ib_lid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1780,7 +1780,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = ib_slid_be16(wc->slid); + req_msg->alt_local_lid = ib_lid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index ff3c67a7aaad..c1696e6084b2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = ib_slid_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 39a0f1dc84e4..a21881e22bad 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,12 +275,13 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { - resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); } else { - resp.lid = (u16)attr.lid; - resp.sm_lid = (u16)attr.sm_lid; + resp.lid = ib_lid_cpu16(attr.lid); + resp.sm_lid = ib_lid_cpu16(attr.sm_lid); } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; @@ -1206,9 +1207,9 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; if (rdma_cap_opa_ah(ib_dev, wc->port_num)) - tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); else - tmp.slid = ib_slid_cpu16(wc->slid); + tmp.slid = ib_lid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; diff --git 
a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 8daa3a5f7e95..11be4d19e607 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -4216,7 +4216,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, const struct ib_wc *in_wc) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid = ib_slid_cpu16(in_wc->slid); + u16 slid = ib_lid_cpu16(in_wc->slid); u16 pkey; if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 5a897b0106a9..0e4f60cfc59d 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -528,7 +528,7 @@ static int set_guid_rec(struct ib_device *ibdev, memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); - guid_info_rec.lid = cpu_to_be16((u16)attr.lid); + guid_info_rec.lid = ib_lid_be16(attr.lid); guid_info_rec.block_num = index; memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 04fb44e7699e..0793a21d76f4 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -169,7 +169,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, op_modifier |= 0x4; - in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, @@ -625,7 +625,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = ib_slid_be16(wc->slid); + tun_mad->hdr.slid_mac_47_32 = ib_lid_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, @@ -826,7 +826,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); @@ -860,7 +860,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = (u16)pattr.lid; + prev_lid = ib_lid_cpu16(pattr.lid); err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index cd2264ac88ae..18cfe5bf0fa3 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -78,7 +78,7 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 slid; int err; - slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? 
ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index e19ae0b9b439..d0f062fc2a4b 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1921,7 +1921,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); - MTHCA_PUT(inbox, ib_slid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, ib_lid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) @@ -1929,7 +1929,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, op_modifier |= 0x4; - in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index a9caadab22cf..093f7755c843 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -205,7 +205,7 @@ int mthca_process_mad(struct ib_device *ibdev, u16 *out_mad_pkey_index) { int err; - u16 slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + u16 slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); u16 prev_lid = 0; struct ib_port_attr pattr; const struct ib_mad *in_mad = (const struct ib_mad *)in; @@ -256,7 +256,7 @@ int mthca_process_mad(struct ib_device *ibdev, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = (u16)pattr.lid; + prev_lid = ib_lid_cpu16(pattr.lid); err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0335a3df74d5..97d71e49c092 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -107,7 +107,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = ib_slid_cpu16(entry->slid); + wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); wc->uqueue[head].sl = entry->sl; wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; wc->uqueue[head].port_num = entry->port_num; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4db4ad56ace6..70a183179224 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3724,16 +3724,30 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, return RDMA_AH_ATTR_TYPE_IB; } -/* Return slid in 16bit CPU encoding */ -static inline u16 ib_slid_cpu16(u32 slid) +/** + * ib_lid_cpu16 - Return lid in 16bit CPU encoding. + * In the current implementation the only way to get + * the 32bit lid is from other sources for OPA. + * For IB, lids will always be 16 bits so cast the + * value accordingly.
+ * + * @lid: A 32bit LID + */ +static inline u16 ib_lid_cpu16(u32 lid) { - return (u16)slid; + WARN_ON_ONCE(lid & 0xFFFF0000); + return (u16)lid; } -/* Return slid in 16bit BE encoding */ -static inline u16 ib_slid_be16(u32 slid) +/** + * ib_lid_be16 - Return lid in 16bit BE encoding. + * + * @lid: A 32bit LID + */ +static inline __be16 ib_lid_be16(u32 lid) { - return cpu_to_be16((u16)slid); + WARN_ON_ONCE(lid & 0xFFFF0000); + return cpu_to_be16((u16)lid); } /** -- cgit v1.2.3 From a0917e0bc6efc05834c0c1eafebd579a9c75e6e9 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 18 Aug 2017 15:04:54 -0400 Subject: datagram: When peeking datagrams with offset < 0 don't skip empty skbs Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove headers from UDP packets before queueing"), when udp packets are being peeked the requested extra offset is always 0 as there is no need to skip the udp header. However, when the offset is 0 and the next skb is of length 0, it is only returned once. The behaviour can be seen with the following python script: from socket import *; f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0); f.bind(('::', 0)); addr=('::1', f.getsockname()[1]); g.sendto(b'', addr) g.sendto(b'b', addr) print(f.recvfrom(10, MSG_PEEK)); print(f.recvfrom(10, MSG_PEEK)); Where the expected output should be the empty string twice. Instead, make sk_peek_offset return negative values, and pass those values to __skb_try_recv_datagram/__skb_try_recv_from_queue. If the passed offset to __skb_try_recv_from_queue is negative, the checked skb is never skipped. __skb_try_recv_from_queue will then ensure the offset is reset back to 0 if a peek is requested without an offset, unless no packets are found. Also simplify the if condition in __skb_try_recv_from_queue. If _off is greater than 0, and _off is greater than or equal to skb->len, then (_off || skb->len) must always be true assuming skb->len >= 0 is always true. Also remove a redundant check around a call to sk_peek_offset in af_unix.c, as it double checked if MSG_PEEK was set in the flags. V2: - Moved the negative fixup into __skb_try_recv_from_queue, and removed the now-redundant checks - Fix peeking in udp{,v6}_recvmsg to report the right value when the offset is 0 V3: - Marked new branch in __skb_try_recv_from_queue as unlikely. Signed-off-by: Matthew Dawson Acked-by: Willem de Bruijn Signed-off-by: David S. Miller
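For illustration, a rough C equivalent of the two peeks in the script above (a hypothetical snippet, not part of the patch; fd stands for the bound, non-blocking IPv6 datagram socket with the empty datagram already queued):

#include <stdio.h>
#include <sys/socket.h>

/* fd: bound, non-blocking IPv6 datagram socket, empty datagram queued first */
static void peek_twice(int fd)
{
        char buf[10];

        /* With the fix applied, both peeks report the same zero-length datagram. */
        printf("%zd\n", recvfrom(fd, buf, sizeof(buf), MSG_PEEK, NULL, NULL));
        printf("%zd\n", recvfrom(fd, buf, sizeof(buf), MSG_PEEK, NULL, NULL));
}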
--- include/net/sock.h | 4 +--- net/core/datagram.c | 12 +++++++++--- net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 3 ++- net/unix/af_unix.c | 5 +---- 5 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7c0632c7e870..aeeec62992ca 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { - s32 off = READ_ONCE(sk->sk_peek_off); - if (off >= 0) - return off; + return READ_ONCE(sk->sk_peek_off); } return 0; diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3..a21ca8dee5ea 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, int *peeked, int *off, int *err, struct sk_buff **last) { + bool peek_at_off = false; struct sk_buff *skb; - int _off = *off; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { _off -= skb->len; continue; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a7c804f73990..cd1d044a7fa5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 578142b7ca3e..20039c8501eb 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7b52a380d710..be8982b4f8c0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, */ mutex_lock(&u->iolock); - if (flags & MSG_PEEK) - skip = sk_peek_offset(sk, flags); - else - skip = 0; + skip = max(sk_peek_offset(sk, flags), 0); do { int chunk; -- cgit v1.2.3 From 9620fef27ed2cdb37bf6fd028f32bea2ef5119a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Aug 2017 12:08:07 -0700 Subject: ipv4: convert dst_metrics.refcnt from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This makes it possible to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/dst.h | 3 ++- net/core/dst.c | 6 +++--- net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/route.c | 4 ++-- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index f73611ec4017..93568bd0a352 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -14,6 +14,7 @@ #include #include #include +#include <linux/refcount.h> #include #include @@ -107,7 +108,7 @@ struct dst_entry { struct dst_metrics { u32 metrics[RTAX_MAX]; - atomic_t refcnt; + refcount_t refcnt; }; extern const struct dst_metrics dst_default_metrics; diff --git a/net/core/dst.c b/net/core/dst.c index 00aa972ad1a1..d6ead757c258 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -55,7 +55,7 @@ const struct dst_metrics dst_default_metrics = { * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - .refcnt = ATOMIC_INIT(1), + .refcnt = REFCOUNT_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -213,7 +213,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - atomic_set(&p->refcnt, 1); + refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; @@ -225,7 +225,7 @@ u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { - if (atomic_dec_and_test(&old_p->refcnt)) + if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d521caf57385..394d800db50c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -220,7 +220,7 @@ static void free_fib_info_rcu(struct rcu_head *head) } endfor_nexthops(fi); m = fi->fib_metrics; - if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) kfree(m); kfree(fi); } @@ -1090,7 +1090,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, kfree(fi); return ERR_PTR(err); } - atomic_set(&fi->fib_metrics->refcnt, 1); + refcount_set(&fi->fib_metrics->refcnt, 1); } else { fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d400c0543106..872b4cb136d3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1398,7 +1398,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; - if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) kfree(p); if (!list_empty(&rt->rt_uncached)) { @@ -1456,7 +1456,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); if (fi->fib_metrics != &dst_default_metrics) { rt->dst._metrics |= DST_METRICS_REFCOUNTED; - atomic_inc(&fi->fib_metrics->refcnt); + refcount_inc(&fi->fib_metrics->refcnt); } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; -- cgit v1.2.3 From 739f79fc9db1b38f96b5a5109b247a650fbebf6d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 18 Aug 2017 15:15:48 -0700 Subject: mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback() Jaegeuk and Brad report a NULL pointer crash when writeback ending tries to update the memcg stats: BUG: unable to
handle kernel NULL pointer dereference at 00000000000003b0 IP: test_clear_page_writeback+0x12e/0x2c0 [...] RIP: 0010:test_clear_page_writeback+0x12e/0x2c0 Call Trace: end_page_writeback+0x47/0x70 f2fs_write_end_io+0x76/0x180 [f2fs] bio_endio+0x9f/0x120 blk_update_request+0xa8/0x2f0 scsi_end_request+0x39/0x1d0 scsi_io_completion+0x211/0x690 scsi_finish_command+0xd9/0x120 scsi_softirq_done+0x127/0x150 __blk_mq_complete_request_remote+0x13/0x20 flush_smp_call_function_queue+0x56/0x110 generic_smp_call_function_single_interrupt+0x13/0x30 smp_call_function_single_interrupt+0x27/0x40 call_function_single_interrupt+0x89/0x90 RIP: 0010:native_safe_halt+0x6/0x10 (gdb) l *(test_clear_page_writeback+0x12e) 0xffffffff811bae3e is in test_clear_page_writeback (./include/linux/memcontrol.h:619). 614 mod_node_page_state(page_pgdat(page), idx, val); 615 if (mem_cgroup_disabled() || !page->mem_cgroup) 616 return; 617 mod_memcg_state(page->mem_cgroup, idx, val); 618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; 619 this_cpu_add(pn->lruvec_stat->count[idx], val); 620 } 621 622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, 623 gfp_t gfp_mask, The issue is that writeback doesn't hold a page reference and the page might get freed after PG_writeback is cleared (and the mapping is unlocked) in test_clear_page_writeback(). The stat functions looking up the page's node or zone are safe, as those attributes are static across allocation and free cycles. But page->mem_cgroup is not, and it will get cleared if we race with truncation or migration. It appears this race window has been around for a while, but less likely to trigger when the memcg stats were updated first thing after PG_writeback is cleared. Recent changes reshuffled this code to update the global node stats before the memcg ones, though, stretching the race window out to an extent where people can reproduce the problem. Update test_clear_page_writeback() to look up and pin page->mem_cgroup before clearing PG_writeback, then not use that pointer afterward. It is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()") but leaves the pageref-holding callsites that aren't affected alone. 
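Condensed, the fix boils down to the following pattern (a sketch distilled from the hunks below; the writeback-tag handling and the other counters are omitted):

int test_clear_page_writeback(struct page *page)
{
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
        int ret;

        /* Pin the memcg and look up the lruvec while page->mem_cgroup is stable. */
        memcg = lock_page_memcg(page);
        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));

        ret = TestClearPageWriteback(page);
        /* The page may be freed from here on; touch only the pinned pointers. */
        if (ret)
                dec_lruvec_state(lruvec, NR_WRITEBACK);
        __unlock_page_memcg(memcg);
        return ret;
}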
Link: http://lkml.kernel.org/r/20170809183825.GA26387@cmpxchg.org Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()") Signed-off-by: Johannes Weiner Reported-by: Jaegeuk Kim Tested-by: Jaegeuk Kim Reported-by: Bradley Bolen Tested-by: Brad Bolen Cc: Vladimir Davydov Cc: Michal Hocko Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ++++++++-- mm/memcontrol.c | 43 +++++++++++++++++++++++++++++++------------ mm/page-writeback.c | 15 ++++++++++++--- 3 files changed, 51 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3914e3dd6168..9b15a4bcfa77 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait); extern int do_swap_account; #endif -void lock_page_memcg(struct page *page); +struct mem_cgroup *lock_page_memcg(struct page *page); +void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, @@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void lock_page_memcg(struct page *page) +static inline struct mem_cgroup *lock_page_memcg(struct page *page) +{ + return NULL; +} + +static inline void __unlock_page_memcg(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3df3c04d73ab..e09741af816f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1611,9 +1611,13 @@ cleanup: * @page: the page * * This function protects unlocked LRU pages from being moved to - * another cgroup and stabilizes their page->mem_cgroup binding. + * another cgroup. + * + * It ensures lifetime of the returned memcg. Caller is responsible + * for the lifetime of the page; __unlock_page_memcg() is available + * when @page might get freed inside the locked section. */ -void lock_page_memcg(struct page *page) +struct mem_cgroup *lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page itself from being freed. E.g. writeback + * doesn't hold a page reference and relies on PG_writeback to + * keep off truncation, migration and so forth. + */ rcu_read_lock(); if (mem_cgroup_disabled()) - return; + return NULL; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return; + return NULL; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1649,18 +1659,18 @@ again: memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return; + return memcg; } EXPORT_SYMBOL(lock_page_memcg); /** - * unlock_page_memcg - unlock a page->mem_cgroup binding - * @page: the page + * __unlock_page_memcg - unlock and unpin a memcg + * @memcg: the memcg + * + * Unlock and unpin a memcg returned by lock_page_memcg(). 
*/ -void unlock_page_memcg(struct page *page) +void __unlock_page_memcg(struct mem_cgroup *memcg) { - struct mem_cgroup *memcg = page->mem_cgroup; - if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page) rcu_read_unlock(); } + +/** + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page + */ +void unlock_page_memcg(struct page *page) +{ + __unlock_page_memcg(page->mem_cgroup); +} EXPORT_SYMBOL(unlock_page_memcg); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 96e93b214d31..bf050ab025b7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); + struct mem_cgroup *memcg; + struct lruvec *lruvec; int ret; - lock_page_memcg(page); + memcg = lock_page_memcg(page); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } + /* + * NOTE: Page might be free now! Writeback doesn't hold a page + * reference on its own, it relies on truncation to wait for + * the clearing of PG_writeback. The below can only access + * page state that is static across allocation cycles. + */ if (ret) { - dec_lruvec_page_state(page, NR_WRITEBACK); + dec_lruvec_state(lruvec, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } - unlock_page_memcg(page); + __unlock_page_memcg(memcg); return ret; } -- cgit v1.2.3 From 8ada92799ec4de00f4bc0f10b1ededa256c1ab22 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:55 -0700 Subject: wait: add wait_event_killable_timeout() These are the few pending fixes I have queued up for v4.13-final. One is a generic regression fix for recursive loops on kmod and the other one is a trivial print out correction. During the v4.13 development we assumed that recursive kmod loops were no longer possible. Clearly that is not true. The regression fix makes use of a new killable wait. We use a killable wait to be paranoid in how signals might be sent to modprobe and only accept a proper SIGKILL. The signal will only be available to userspace to issue *iff* a thread has already entered a wait state, and that happens only if we've already throttled after 50 kmod threads have been hit. Note that although it may seem excessive to trigger a failure after 5 seconds if all kmod threads remain busy, prior to the series of changes that went into v4.13 we would actually *always* fatally fail any request which came in if the limit was already reached. The new waiting implemented in v4.13 actually gives us *more* breathing room -- the wait for 5 seconds is a wait for *any* kmod thread to finish. We give up and fail *iff* no kmod thread has finished and they're *all* running straight for 5 consecutive seconds. If 50 kmod threads are running consecutively for 5 seconds something else must be really bad. Recursive loops with kmod are bad but they're also hard to implement properly as a selftest without fooling current userspace tools like kmod [1].
For instance kmod will complain when you run depmod if it finds a recursive loop with symbol dependency between modules; as such, this type of recursive loop cannot go upstream as the modules_install target will fail after running depmod. These tests already exist on userspace kmod upstream though (refer to the testsuite/module-playground/mod-loop-*.c files). The same is not true if request_module() is used though, or worse if aliases are used. Likewise the issue with 64-bit kernels booting 32-bit userspace without a binfmt handler built-in is also currently not detected and proactively avoided by userspace kmod tools, or kconfig for all architectures. Although we could complain in the kernel when some of these individual recursive issues creep up, proactively avoiding these situations in userspace at build time is what we should keep striving for. Lastly, since recursive loops could happen with kmod it may mean recursive loops may also be possible with other kernel usermode helpers; this should be investigated, and if we can come up with a more sensible generic solution long term, even better! [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/log/?h=20170809-kmod-for-v4.13-final [1] https://git.kernel.org/pub/scm/utils/kernel/kmod/kmod.git This patch (of 3): This wait is similar to wait_event_interruptible_timeout() but only accepts SIGKILL interrupt signal. Other signals are ignored. Link: http://lkml.kernel.org/r/20170809234635.13443-2-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: "Eric W. Biederman" Cc: Shuah Khan Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Matt Redfearn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5b74e36c0ca8..dc19880c02f5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_killable_timeout(wq_head, condition, timeout) \ + ___wait_event(wq_head, ___wait_cond_timeout(condition), \ + TASK_KILLABLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses + * @wq_head: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout, in jiffies + * + * The process is put to sleep (TASK_KILLABLE) until the + * @condition evaluates to true or a kill signal is received. + * The @condition is checked each time the waitqueue @wq_head is woken up. + * + * wake_up() has to be called after changing any variable that could + * change the result of the wait condition. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was + * interrupted by a kill signal. + * + * Only kill signals interrupt this process. + */ +#define wait_event_killable_timeout(wq_head, condition, timeout) \ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_killable_timeout(wq_head, \ + condition, timeout); \ + __ret; \ +}) + #define __wait_event_lock_irq(wq_head, condition, lock, cmd) \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ -- cgit v1.2.3
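As a usage illustration of the new helper (a hypothetical caller, not part of the patch; the waitqueue, flag, and wake-up path are assumed):

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq); /* assumed waitqueue */
static bool my_done;                   /* set elsewhere, followed by wake_up(&my_wq) */

static int wait_for_done(void)
{
        /* Sleep up to 5s in TASK_KILLABLE; only SIGKILL can interrupt the wait. */
        long ret = wait_event_killable_timeout(my_wq, my_done, 5 * HZ);

        if (ret == -ERESTARTSYS)
                return -EINTR;         /* killed */
        if (!ret)
                return -ETIMEDOUT;     /* condition still false after 5s */
        return 0;                      /* condition true; ret holds the remaining jiffies */
}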
From 3010f876500f9ba921afaeccec30c45ca6584dc8 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 18 Aug 2017 15:16:05 -0700 Subject: mm: discard memblock data later There is an existing use-after-free bug when deferred struct pages are enabled: The memblock_add() allocates memory for the memory array if more than 128 entries are needed. See comment in e820__memblock_setup(): * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries * than that - so allow memblock resizing. This memblock memory is freed here: free_low_memory_core_early() We access the freed memblock.memory later in boot when deferred pages are initialized in this path: deferred_init_memmap() for_each_mem_pfn_range() __next_mem_pfn_range() type = &memblock.memory; One possible explanation for why this use-after-free hasn't been hit before is that the limit of INIT_MEMBLOCK_REGIONS has never been exceeded, at least on systems where deferred struct pages were enabled. Tested by reducing INIT_MEMBLOCK_REGIONS down to 4 from the current 128, and verifying in qemu that this code is getting executed and that the freed pages are sane. Link: http://lkml.kernel.org/r/1502485554-318703-2-git-send-email-pasha.tatashin@oracle.com Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd") Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 6 ++++-- mm/memblock.c | 38 +++++++++++++++++--------------------- mm/nobootmem.c | 16 ---------------- mm/page_alloc.c | 4 ++++ 4 files changed, 25 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 77d427974f57..bae11c7e7bf3 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -61,6 +61,7 @@ extern int memblock_debug; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata +void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock @@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, int nid, ulong flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); -phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); @@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, phys_addr_t *out_end); +void __memblock_free_early(phys_addr_t base, phys_addr_t size); +void __memblock_free_late(phys_addr_t base, phys_addr_t size); + /** * for_each_mem_range - iterate through memblock areas from type_a and not * included
in type_b. Or just type_a if type_b is NULL. diff --git a/mm/memblock.c b/mm/memblock.c index 2cb25fe4452c..bf14aea6ab70 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - -phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( - phys_addr_t *addr) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - *addr = __pa(memblock.reserved.regions); - - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.reserved.max); -} - -phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info( - phys_addr_t *addr) +/** + * Discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) { - if (memblock.memory.regions == memblock_memory_init_regions) - return 0; + phys_addr_t addr, size; - *addr = __pa(memblock.memory.regions); + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + __memblock_free_late(addr, size); + } - return PAGE_ALIGN(sizeof(struct memblock_region) * - memblock.memory.max); + if (memblock.memory.regions != memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + __memblock_free_late(addr, size); + } } - #endif /** diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 36454d0f96ee..3637809a18d0 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void) NULL) count += __free_memory_core(start, end); -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK - { - phys_addr_t size; - - /* Free memblock.reserved array if it was allocated */ - size = get_allocated_memblock_reserved_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - - /* Free memblock.memory array if it was allocated */ - size = get_allocated_memblock_memory_regions_info(&start); - if (size) - count += __free_memory_core(start, start + size); - } -#endif - return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d00f746c2fd..1bad301820c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ + memblock_discard(); +#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3 From 6b31d5955cb29a51c5baffee382f213d75e98fb8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 18 Aug 2017 15:16:15 -0700 Subject: mm, oom: fix potential data corruption when oom_reaper races with writer Wenwei Tao has noticed that our current assumption that the oom victim is dying and never making any visible changes after it dies, and so the oom_reaper can tear it down, is not entirely true. __task_will_free_mem considers a task dying when SIGNAL_GROUP_EXIT is set but do_group_exit sends SIGKILL to all threads _after_ the flag is set. So there is a race window when some threads won't have fatal_signal_pending while the oom_reaper could start unmapping the address space. Moreover some paths might not check for fatal signals before each PF/g-u-p/copy_from_user. We already have a protection for oom_reaper vs.
PF races by checking MMF_UNSTABLE. This has been, however, checked only for kernel threads (use_mm users) which can outlive the oom victim. A simple fix would be to extend the current check in handle_mm_fault for all tasks but that wouldn't be sufficient because the current check assumes that a kernel thread would bail out after EFAULT from get_user*/copy_from_user and never re-read the same address which would succeed because the PF path has established page tables already. This seems to be the case for the only existing use_mm user currently (virtio driver) but it is rather fragile in general. This is even more fragile in general for more complex paths such as generic_perform_write which can re-read the same address multiple times (e.g. iov_iter_copy_from_user_atomic to fail and then iov_iter_fault_in_readable on retry). Therefore we have to implement MMF_UNSTABLE protection in a robust way and never make potentially corrupted content visible. That requires hooking deeper into the PF path and checking for the flag _every time_ before a pte for anonymous memory is established (that means all !VM_SHARED mappings). The corruption can be triggered artificially (http://lkml.kernel.org/r/201708040646.v746kkhC024636@www262.sakura.ne.jp) but there doesn't seem to be any real-life bug report. The race window should be too tight to trigger most of the time. Link: http://lkml.kernel.org/r/20170807113839.16695-3-mhocko@kernel.org Fixes: aac453635549 ("mm, oom: introduce oom reaper") Signed-off-by: Michal Hocko Reported-by: Wenwei Tao Tested-by: Tetsuo Handa Cc: "Kirill A. Shutemov" Cc: Andrea Argangeli Cc: David Rientjes Cc: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 22 ++++++++++++++++++++++ mm/huge_memory.c | 30 ++++++++++++++++++++++-------- mm/memory.c | 46 ++++++++++++++++++++-------------------------- 3 files changed, 64 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/oom.h b/include/linux/oom.h index 8a266e2be5a6..76aac4ce39bc 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -6,6 +6,8 @@ #include #include #include +#include <linux/sched/coredump.h> /* MMF_* */ +#include <linux/mm.h> /* VM_FAULT* */ struct zonelist; struct notifier_block; @@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } +/* + * Checks whether a page fault on the given mm is still reliable. + * This is no longer true if the oom reaper started to reap the + * address space which is reflected by MMF_UNSTABLE flag set in + * the mm. At that moment any !shared mapping would lose the content + * and could cause a memory corruption (zero pages instead of the + * original content). + * + * User should call this before establishing a page table entry for + * a !shared mapping and under the proper page table lock. + * + * Return 0 when the PF is safe, VM_FAULT_SIGBUS otherwise.
+ */ +static inline int check_stable_address_space(struct mm_struct *mm) +{ + if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + return VM_FAULT_SIGBUS; + return 0; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 216114f6ef0b..90731e3b7e58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/oom.h> #include #include @@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto release; } clear_huge_page(page, haddr, HPAGE_PMD_NR); @@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - mem_cgroup_cancel_charge(page, memcg, true); - put_page(page); - pte_free(vma->vm_mm, pgtable); + goto unlock_release; } else { pmd_t entry; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { int ret; @@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, } return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + mem_cgroup_cancel_charge(page, memcg, true); + put_page(page); + return ret; + } /* @@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) ret = 0; set = false; if (pmd_none(*vmf->pmd)) { - if (userfaultfd_missing(vma)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + } else if (userfaultfd_missing(vma)) { spin_unlock(vmf->ptl); ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); diff --git a/mm/memory.c b/mm/memory.c index c717b5bcc80e..fe2fba27ded2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -68,6 +68,7 @@ #include #include #include +#include <linux/oom.h> #include #include @@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; + int ret = 0; pte_t entry; /* File mapping without ->vm_ops ?
*/ @@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) goto unlock; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf) if (!pte_none(*vmf->pte)) goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2978,7 +2987,7 @@ setpte: update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; + return ret; release: mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, int finish_fault(struct vm_fault *vmf) { struct page *page; - int ret; + int ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3260,7 +3269,15 @@ int finish_fault(struct vm_fault *vmf) page = vmf->cow_page; else page = vmf->page; - ret = alloc_set_pte(vmf, vmf->memcg, page); + + /* + * check even for read faults because we might have lost our CoWed + * page + */ + if (!(vmf->vma->vm_flags & VM_SHARED)) + ret = check_stable_address_space(vmf->vma->vm_mm); + if (!ret) + ret = alloc_set_pte(vmf, vmf->memcg, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; @@ -3900,29 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } - /* - * This mm has been already reaped by the oom reaper and so the - * refault cannot be trusted in general. Anonymous refaults would - * lose data and give a zero page instead e.g. This is especially - * problem for use_mm() because regular tasks will just die and - * the corrupted data will not be visible anywhere while kthread - * will outlive the oom victim and potentially propagate the date - * further. - */ - if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR) - && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) { - - /* - * We are going to enforce SIGBUS but the PF path might have - * dropped the mmap_sem already so take it again so that - * we do not break expectations of all arch specific PF paths - * and g-u-p - */ - if (ret & VM_FAULT_RETRY) - down_read(&vma->vm_mm->mmap_sem); - ret = VM_FAULT_SIGBUS; - } - return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); -- cgit v1.2.3 From 0888e372c37fa31882c8ed89fb2f8188b08b6718 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Thu, 17 Aug 2017 00:35:11 +0000 Subject: net: inet: diag: expose sockets cgroup classid This is useful for directly looking up a task based on class id rather than having to scan through all open file descriptors. Signed-off-by: Sasha Levin Signed-off-by: David S. 
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbe201047df6..678496897a68 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -142,6 +142,7 @@ enum { INET_DIAG_PAD, INET_DIAG_MARK, INET_DIAG_BBRINFO, + INET_DIAG_CLASS_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3828b3a805cd..67325d5832d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -274,6 +274,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) { + u32 classid = 0; + +#ifdef CONFIG_SOCK_CGROUP_DATA + classid = sock_cgroup_classid(&sk->sk_cgrp_data); +#endif + + if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) + goto errout; + } + out: nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 4c03bdd7b5c084c3c6973cb2419edac5363c051f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 17 Aug 2017 18:22:37 +0200 Subject: xdp: adjust xdp redirect tracepoint to include return error code The return error code needs to be included in the tracepoint xdp:xdp_redirect, else it's not possible to distinguish successful or failed XDP_REDIRECT transmits. XDP has no queuing mechanism. Thus, it is fairly easy to overrun a NIC transmit queue. The eBPF program invoking helpers (bpf_redirect or bpf_redirect_map) to redirect a packet doesn't get any feedback whether the packet was actually transmitted. Info on failed transmits in the tracepoint xdp:xdp_redirect is interesting, as this opens the door to providing a feedback loop to the receiving XDP program. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller
--- include/trace/events/xdp.h | 11 +++++++---- net/core/filter.c | 19 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 7b1eb7b4be41..0e42e69f773b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -53,15 +53,16 @@ TRACE_EVENT(xdp_redirect, TP_PROTO(const struct net_device *from, const struct net_device *to, - const struct bpf_prog *xdp, u32 act), + const struct bpf_prog *xdp, u32 act, int err), - TP_ARGS(from, to, xdp, act), + TP_ARGS(from, to, xdp, act, err), TP_STRUCT__entry( __string(name_from, from->name) __string(name_to, to->name) __array(u8, prog_tag, 8) __field(u32, act) + __field(int, err) ), TP_fast_assign( @@ -70,12 +71,14 @@ TRACE_EVENT(xdp_redirect, __assign_str(name_from, from->name); __assign_str(name_to, to->name); __entry->act = act; + __entry->err = err; ), - TP_printk("prog=%s from=%s to=%s action=%s", + TP_printk("prog=%s from=%s to=%s action=%s err=%d", __print_hex_str(__entry->prog_tag, 8), __get_str(name_from), __get_str(name_to), - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB)) + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->err) ); #endif /* _TRACE_XDP_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 0f4df86d936a..fa2115695037 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2535,14 +2535,16 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_map *map = ri->map; u32 index = ri->ifindex; struct net_device *fwd; - int err = -EINVAL; + int err; ri->ifindex = 0; ri->map = NULL; fwd = __dev_map_lookup_elem(map, index); - if (!fwd) + if (!fwd) { + err = -EINVAL; goto out; + } if (ri->map_to_flush && (ri->map_to_flush != map)) xdp_do_flush_map(); @@ -2552,7 +2554,7 @@ int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->map_to_flush = map; out: - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); return err; } @@ -2562,6 +2564,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct net_device *fwd; u32 index = ri->ifindex; + int err; if (ri->map) return xdp_do_redirect_map(dev, xdp, xdp_prog); @@ -2570,12 +2573,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; if (unlikely(!fwd)) { bpf_warn_invalid_xdp_redirect(index); - return -EINVAL; + err = -EINVAL; + goto out; } - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); - - return __bpf_tx_xdp(fwd, NULL, xdp, 0); + err = __bpf_tx_xdp(fwd, NULL, xdp, 0); +out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); -- cgit v1.2.3 From 606c07f3086017a8c2d7ce0843807e81b541edcc Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Fri, 18 Aug 2017 09:03:44 -0500 Subject: net: ethtool: Add macro to clear a link mode setting There are currently macros to set and test an ETHTOOL_LINK_MODE_ setting, but not to clear one. Add a macro to clear an ETHTOOL_LINK_MODE_ setting. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller
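For illustration, a hypothetical driver snippet combining the macro added below with the existing add helper (the mode choice and condition are invented for the sketch):

#include <linux/ethtool.h>

static void fixup_advertising(struct ethtool_link_ksettings *ks, bool kr_ok)
{
        ethtool_link_ksettings_add_link_mode(ks, advertising, 10000baseKR_Full);

        /* Drop the mode again, e.g. once we learn the PHY cannot do it. */
        if (!kr_ok)
                ethtool_link_ksettings_del_link_mode(ks, advertising,
                                                     10000baseKR_Full);
}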
--- include/linux/ethtool.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index afdbb701fdb4..4587a4c36923 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -136,6 +136,17 @@ struct ethtool_link_ksettings { #define ethtool_link_ksettings_add_link_mode(ptr, name, mode) \ __set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) +/** + * ethtool_link_ksettings_del_link_mode - clear bit in link_ksettings + * link mode mask + * @ptr : pointer to struct ethtool_link_ksettings + * @name : one of supported/advertising/lp_advertising + * @mode : one of the ETHTOOL_LINK_MODE_*_BIT + * (not atomic, no bound checking) + */ +#define ethtool_link_ksettings_del_link_mode(ptr, name, mode) \ + __clear_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) + /** * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask * @ptr : pointer to struct ethtool_link_ksettings -- cgit v1.2.3 From b793dc5c6edfb106fd57d12ad6aca64bf160b403 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 18 Aug 2017 13:46:20 -0700 Subject: net: constify netdev_class_file These functions are wrappers around class_create_file, which can take a const attribute. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++---- net/core/net-sysfs.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0f1c4cb2441e..eaa77bd9cb80 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4013,17 +4013,17 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi return rc; } -int netdev_class_create_file_ns(struct class_attribute *class_attr, +int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns); -void netdev_class_remove_file_ns(struct class_attribute *class_attr, +void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns); -static inline int netdev_class_create_file(struct class_attribute *class_attr) +static inline int netdev_class_create_file(const struct class_attribute *class_attr) { return netdev_class_create_file_ns(class_attr, NULL); } -static inline void netdev_class_remove_file(struct class_attribute *class_attr) +static inline void netdev_class_remove_file(const struct class_attribute *class_attr) { netdev_class_remove_file_ns(class_attr, NULL); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 46ff41bf0210..40937ee63f14 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1618,14 +1618,14 @@ int netdev_register_kobject(struct net_device *ndev) return error; } -int netdev_class_create_file_ns(struct class_attribute *class_attr, +int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { return class_create_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_create_file_ns); -void netdev_class_remove_file_ns(struct class_attribute *class_attr, +void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns) { class_remove_file_ns(&net_class, class_attr, ns); -- cgit v1.2.3 From 737aec57c672c1308d602afecd841455c39561e5 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 18 Aug 2017 13:46:22 -0700 Subject: net: constify net_ns_type_operations This can be const.
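Together with the previous patch, this lets such attributes live in rodata. For illustration, a hypothetical read-only class attribute (names invented for the sketch):

#include <linux/device.h>
#include <linux/netdevice.h>

static ssize_t foo_show(struct class *class, struct class_attribute *attr,
                        char *buf)
{
        return sprintf(buf, "foo\n");   /* placeholder value */
}

/* The constified API accepts a const attribute... */
static const struct class_attribute class_attr_foo = __ATTR_RO(foo);

/* ...and registration is unchanged: netdev_class_create_file(&class_attr_foo); */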
Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/net-sysfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eaa77bd9cb80..b0c928598dab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4028,7 +4028,7 @@ static inline void netdev_class_remove_file(const struct class_attribute *class_ netdev_class_remove_file_ns(class_attr, NULL); } -extern struct kobj_ns_type_operations net_ns_type_operations; +extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 99061b0a1ebd..2de441692f28 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1461,7 +1461,7 @@ static const void *net_netlink_ns(struct sock *sk) return sock_net(sk); } -struct kobj_ns_type_operations net_ns_type_operations = { +const struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, -- cgit v1.2.3 From 718ad681eff47d3d04879ff5f5290bdab0b8bad6 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 18 Aug 2017 13:46:24 -0700 Subject: net: drop unused attribute argument from sysfs queue funcs The show and store functions don't need/use the attribute. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- include/linux/netdevice.h | 5 ++--- net/core/net-sysfs.c | 37 +++++++++++-------------------------- 3 files changed, 14 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4302f313d9a7..52ae78ca3d38 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2376,7 +2376,7 @@ err: #ifdef CONFIG_SYSFS static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, char *buf) + char *buf) { struct virtnet_info *vi = netdev_priv(queue->dev); unsigned int queue_index = get_netdev_rx_queue_index(queue); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0c928598dab..c5475b37a631 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -694,10 +694,9 @@ struct netdev_rx_queue { */ struct rx_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf); + ssize_t (*show)(struct netdev_rx_queue *queue, char *buf); ssize_t (*store)(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, const char *buf, size_t len); + const char *buf, size_t len); }; #ifdef CONFIG_XPS diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 76ec74d4a65b..48714c8024f3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -661,7 +661,7 @@ static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, @@ -673,7 +673,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops 
rx_queue_sysfs_ops = { @@ -682,8 +682,7 @@ static const struct sysfs_ops rx_queue_sysfs_ops = { }; #ifdef CONFIG_RPS -static ssize_t show_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, char *buf) +static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; @@ -706,8 +705,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, } static ssize_t store_rps_map(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attribute, - const char *buf, size_t len) + const char *buf, size_t len) { struct rps_map *old_map, *map; cpumask_var_t mask; @@ -765,7 +763,6 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, char *buf) { struct rps_dev_flow_table *flow_table; @@ -788,8 +785,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu) } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, - struct rx_queue_attribute *attr, - const char *buf, size_t len) + const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; @@ -975,10 +971,9 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf); + ssize_t (*show)(struct netdev_queue *queue, char *buf); ssize_t (*store)(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, const char *buf, size_t len); + const char *buf, size_t len); }; #define to_netdev_queue_attr(_attr) container_of(_attr, \ struct netdev_queue_attribute, attr) @@ -994,7 +989,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, attribute, buf); + return attribute->show(queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1007,7 +1002,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, attribute, buf, count); + return attribute->store(queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1016,7 +1011,6 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { }; static ssize_t show_trans_timeout(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) { unsigned long trans_timeout; @@ -1040,7 +1034,6 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) } static ssize_t show_traffic_class(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) { struct net_device *dev = queue->dev; @@ -1055,14 +1048,12 @@ static ssize_t show_traffic_class(struct netdev_queue *queue, #ifdef CONFIG_XPS static ssize_t show_tx_maxrate(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) { return sprintf(buf, "%lu\n", queue->tx_maxrate); } static ssize_t set_tx_maxrate(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1130,7 +1121,6 @@ static ssize_t bql_set(const char *buf, const size_t count, } static ssize_t bql_show_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf) { struct dql *dql = &queue->dql; @@ -1139,7 +1129,6 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue, } 
static ssize_t bql_set_hold_time(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, const char *buf, size_t len) { struct dql *dql = &queue->dql; @@ -1160,7 +1149,6 @@ static struct netdev_queue_attribute bql_hold_time_attribute = bql_set_hold_time); static ssize_t bql_show_inflight(struct netdev_queue *queue, - struct netdev_queue_attribute *attr, char *buf) { struct dql *dql = &queue->dql; @@ -1173,14 +1161,12 @@ static struct netdev_queue_attribute bql_inflight_attribute = #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ - struct netdev_queue_attribute *attr, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1211,7 +1197,7 @@ static const struct attribute_group dql_group = { #ifdef CONFIG_XPS static ssize_t show_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, char *buf) + char *buf) { struct net_device *dev = queue->dev; int cpu, len, num_tc = 1, tc = 0; @@ -1258,8 +1244,7 @@ static ssize_t show_xps_map(struct netdev_queue *queue, } static ssize_t store_xps_map(struct netdev_queue *queue, - struct netdev_queue_attribute *attribute, - const char *buf, size_t len) + const char *buf, size_t len) { struct net_device *dev = queue->dev; unsigned long index; -- cgit v1.2.3 From cd030a78f7aa06fe216f6665a6ea84b8f3e5b3d3 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Tue, 15 Aug 2017 13:55:29 +0800 Subject: clk: sunxi-ng: support R40 SoC The Allwinner R40 SoC has a clock controller module in the style of the SoCs beyond sun6i; however, it's richer and more complex. Add support for it.
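As a back-of-the-envelope illustration of the factor-clock formulas in the new driver (a sketch based on the pll-periph0 formula cited in the code below; the N/K values are examples, not R40 defaults):

/* pll-periph0 (1x) = 24MHz * N * K / 2, per the pll-periph0-sata comment below */
static unsigned long r40_pll_periph0_rate(unsigned long n, unsigned long k)
{
        return 24000000UL * n * k / 2;
}

/* e.g. n = 25, k = 2 -> 600000000 Hz (600 MHz) */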
Signed-off-by: Icenowy Zheng Signed-off-by: Chen-Yu Tsai --- drivers/clk/sunxi-ng/Kconfig | 5 + drivers/clk/sunxi-ng/Makefile | 1 + drivers/clk/sunxi-ng/ccu-sun8i-r40.c | 1290 +++++++++++++++++++++++++++++ drivers/clk/sunxi-ng/ccu-sun8i-r40.h | 69 ++ include/dt-bindings/clock/sun8i-r40-ccu.h | 187 +++++ include/dt-bindings/reset/sun8i-r40-ccu.h | 130 +++ 6 files changed, 1682 insertions(+) create mode 100644 drivers/clk/sunxi-ng/ccu-sun8i-r40.c create mode 100644 drivers/clk/sunxi-ng/ccu-sun8i-r40.h create mode 100644 include/dt-bindings/clock/sun8i-r40-ccu.h create mode 100644 include/dt-bindings/reset/sun8i-r40-ccu.h (limited to 'include') diff --git a/drivers/clk/sunxi-ng/Kconfig b/drivers/clk/sunxi-ng/Kconfig index 7342928c35cd..7a360737fe3c 100644 --- a/drivers/clk/sunxi-ng/Kconfig +++ b/drivers/clk/sunxi-ng/Kconfig @@ -48,6 +48,11 @@ config SUN8I_V3S_CCU config SUN8I_DE2_CCU bool "Support for the Allwinner SoCs DE2 CCU" +config SUN8I_R40_CCU + bool "Support for the Allwinner R40 CCU" + default MACH_SUN8I + depends on MACH_SUN8I || COMPILE_TEST + config SUN9I_A80_CCU bool "Support for the Allwinner A80 CCU" default MACH_SUN9I diff --git a/drivers/clk/sunxi-ng/Makefile b/drivers/clk/sunxi-ng/Makefile index 0c45fa50283d..58741cd647c3 100644 --- a/drivers/clk/sunxi-ng/Makefile +++ b/drivers/clk/sunxi-ng/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_SUN8I_H3_CCU) += ccu-sun8i-h3.o obj-$(CONFIG_SUN8I_V3S_CCU) += ccu-sun8i-v3s.o obj-$(CONFIG_SUN8I_DE2_CCU) += ccu-sun8i-de2.o obj-$(CONFIG_SUN8I_R_CCU) += ccu-sun8i-r.o +obj-$(CONFIG_SUN8I_R40_CCU) += ccu-sun8i-r40.o obj-$(CONFIG_SUN9I_A80_CCU) += ccu-sun9i-a80.o obj-$(CONFIG_SUN9I_A80_CCU) += ccu-sun9i-a80-de.o obj-$(CONFIG_SUN9I_A80_CCU) += ccu-sun9i-a80-usb.o diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-r40.c b/drivers/clk/sunxi-ng/ccu-sun8i-r40.c new file mode 100644 index 000000000000..933f2e68f42a --- /dev/null +++ b/drivers/clk/sunxi-ng/ccu-sun8i-r40.c @@ -0,0 +1,1290 @@ +/* + * Copyright (c) 2017 Icenowy Zheng + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#include "ccu_common.h" +#include "ccu_reset.h" + +#include "ccu_div.h" +#include "ccu_gate.h" +#include "ccu_mp.h" +#include "ccu_mult.h" +#include "ccu_nk.h" +#include "ccu_nkm.h" +#include "ccu_nkmp.h" +#include "ccu_nm.h" +#include "ccu_phase.h" + +#include "ccu-sun8i-r40.h" + +/* TODO: The result of N*K is required to be in [10, 88] range. */ +static struct ccu_nkmp pll_cpu_clk = { + .enable = BIT(31), + .lock = BIT(28), + .n = _SUNXI_CCU_MULT(8, 5), + .k = _SUNXI_CCU_MULT(4, 2), + .m = _SUNXI_CCU_DIV(0, 2), + .p = _SUNXI_CCU_DIV_MAX(16, 2, 4), + .common = { + .reg = 0x000, + .hw.init = CLK_HW_INIT("pll-cpu", + "osc24M", + &ccu_nkmp_ops, + CLK_SET_RATE_UNGATE), + }, +}; + +/* + * The Audio PLL is supposed to have 4 outputs: 3 fixed factors from + * the base (2x, 4x and 8x), and one variable divider (the one true + * pll audio). 
+ * + * We don't have any need for the variable divider for now, so we just + * hardcode it to match with the clock names + */ +#define SUN8I_R40_PLL_AUDIO_REG 0x008 + +static SUNXI_CCU_NM_WITH_GATE_LOCK(pll_audio_base_clk, "pll-audio-base", + "osc24M", 0x008, + 8, 7, /* N */ + 0, 5, /* M */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +/* TODO: The result of N/M is required to be in [8, 25] range. */ +static SUNXI_CCU_NM_WITH_FRAC_GATE_LOCK(pll_video0_clk, "pll-video0", + "osc24M", 0x0010, + 8, 7, /* N */ + 0, 4, /* M */ + BIT(24), /* frac enable */ + BIT(25), /* frac select */ + 270000000, /* frac rate 0 */ + 297000000, /* frac rate 1 */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +/* TODO: The result of N/M is required to be in [8, 25] range. */ +static SUNXI_CCU_NM_WITH_FRAC_GATE_LOCK(pll_ve_clk, "pll-ve", + "osc24M", 0x0018, + 8, 7, /* N */ + 0, 4, /* M */ + BIT(24), /* frac enable */ + BIT(25), /* frac select */ + 270000000, /* frac rate 0 */ + 297000000, /* frac rate 1 */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +/* TODO: The result of N*K is required to be in [10, 77] range. */ +static SUNXI_CCU_NKM_WITH_GATE_LOCK(pll_ddr0_clk, "pll-ddr0", + "osc24M", 0x020, + 8, 5, /* N */ + 4, 2, /* K */ + 0, 2, /* M */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +/* TODO: The result of N*K is required to be in [21, 58] range. */ +static struct ccu_nk pll_periph0_clk = { + .enable = BIT(31), + .lock = BIT(28), + .n = _SUNXI_CCU_MULT(8, 5), + .k = _SUNXI_CCU_MULT(4, 2), + .fixed_post_div = 2, + .common = { + .reg = 0x028, + .features = CCU_FEATURE_FIXED_POSTDIV, + .hw.init = CLK_HW_INIT("pll-periph0", "osc24M", + &ccu_nk_ops, + CLK_SET_RATE_UNGATE), + }, +}; + +static struct ccu_div pll_periph0_sata_clk = { + .enable = BIT(24), + .div = _SUNXI_CCU_DIV(0, 2), + /* + * The formula of pll-periph0 (1x) is 24MHz*N*K/2, and the formula + * of pll-periph0-sata is 24MHz*N*K/M/6, so the postdiv here is + * 6/2 = 3. + */ + .fixed_post_div = 3, + .common = { + .reg = 0x028, + .features = CCU_FEATURE_FIXED_POSTDIV, + .hw.init = CLK_HW_INIT("pll-periph0-sata", + "pll-periph0", + &ccu_div_ops, 0), + }, +}; + +/* TODO: The result of N*K is required to be in [21, 58] range. */ +static struct ccu_nk pll_periph1_clk = { + .enable = BIT(31), + .lock = BIT(28), + .n = _SUNXI_CCU_MULT(8, 5), + .k = _SUNXI_CCU_MULT(4, 2), + .fixed_post_div = 2, + .common = { + .reg = 0x02c, + .features = CCU_FEATURE_FIXED_POSTDIV, + .hw.init = CLK_HW_INIT("pll-periph1", "osc24M", + &ccu_nk_ops, + CLK_SET_RATE_UNGATE), + }, +}; + +/* TODO: The result of N/M is required to be in [8, 25] range. 
*/ +static SUNXI_CCU_NM_WITH_FRAC_GATE_LOCK(pll_video1_clk, "pll-video1", + "osc24M", 0x030, + 8, 7, /* N */ + 0, 4, /* M */ + BIT(24), /* frac enable */ + BIT(25), /* frac select */ + 270000000, /* frac rate 0 */ + 297000000, /* frac rate 1 */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +static struct ccu_nkm pll_sata_clk = { + .enable = BIT(31), + .lock = BIT(28), + .n = _SUNXI_CCU_MULT(8, 5), + .k = _SUNXI_CCU_MULT(4, 2), + .m = _SUNXI_CCU_DIV(0, 2), + .fixed_post_div = 6, + .common = { + .reg = 0x034, + .features = CCU_FEATURE_FIXED_POSTDIV, + .hw.init = CLK_HW_INIT("pll-sata", "osc24M", + &ccu_nkm_ops, + CLK_SET_RATE_UNGATE), + }, +}; + +static const char * const pll_sata_out_parents[] = { "pll-sata", + "pll-periph0-sata" }; +static SUNXI_CCU_MUX_WITH_GATE(pll_sata_out_clk, "pll-sata-out", + pll_sata_out_parents, 0x034, + 30, 1, /* mux */ + BIT(14), /* gate */ + CLK_SET_RATE_PARENT); + +/* TODO: The result of N/M is required to be in [8, 25] range. */ +static SUNXI_CCU_NM_WITH_FRAC_GATE_LOCK(pll_gpu_clk, "pll-gpu", + "osc24M", 0x038, + 8, 7, /* N */ + 0, 4, /* M */ + BIT(24), /* frac enable */ + BIT(25), /* frac select */ + 270000000, /* frac rate 0 */ + 297000000, /* frac rate 1 */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +/* + * The MIPI PLL has 2 modes: "MIPI" and "HDMI". + * + * The MIPI mode is a standard NKM-style clock. The HDMI mode is an + * integer / fractional clock with switchable multipliers and dividers. + * This is not supported here. We hardcode the PLL to MIPI mode. + * + * TODO: In the MIPI mode, M/N is required to be equal or lesser than 3, + * which cannot be implemented now. + */ +#define SUN8I_R40_PLL_MIPI_REG 0x040 + +static const char * const pll_mipi_parents[] = { "pll-video0" }; +static struct ccu_nkm pll_mipi_clk = { + .enable = BIT(31) | BIT(23) | BIT(22), + .lock = BIT(28), + .n = _SUNXI_CCU_MULT(8, 4), + .k = _SUNXI_CCU_MULT_MIN(4, 2, 2), + .m = _SUNXI_CCU_DIV(0, 4), + .mux = _SUNXI_CCU_MUX(21, 1), + .common = { + .reg = 0x040, + .hw.init = CLK_HW_INIT_PARENTS("pll-mipi", + pll_mipi_parents, + &ccu_nkm_ops, + CLK_SET_RATE_UNGATE) + }, +}; + +/* TODO: The result of N/M is required to be in [8, 25] range. */ +static SUNXI_CCU_NM_WITH_FRAC_GATE_LOCK(pll_de_clk, "pll-de", + "osc24M", 0x048, + 8, 7, /* N */ + 0, 4, /* M */ + BIT(24), /* frac enable */ + BIT(25), /* frac select */ + 270000000, /* frac rate 0 */ + 297000000, /* frac rate 1 */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +/* TODO: The N factor is required to be in [16, 75] range. 
*/ +static SUNXI_CCU_NM_WITH_GATE_LOCK(pll_ddr1_clk, "pll-ddr1", + "osc24M", 0x04c, + 8, 7, /* N */ + 0, 2, /* M */ + BIT(31), /* gate */ + BIT(28), /* lock */ + CLK_SET_RATE_UNGATE); + +static const char * const cpu_parents[] = { "osc32k", "osc24M", + "pll-cpu", "pll-cpu" }; +static SUNXI_CCU_MUX(cpu_clk, "cpu", cpu_parents, + 0x050, 16, 2, CLK_IS_CRITICAL | CLK_SET_RATE_PARENT); + +static SUNXI_CCU_M(axi_clk, "axi", "cpu", 0x050, 0, 2, 0); + +static const char * const ahb1_parents[] = { "osc32k", "osc24M", + "axi", "pll-periph0" }; +static const struct ccu_mux_var_prediv ahb1_predivs[] = { + { .index = 3, .shift = 6, .width = 2 }, +}; +static struct ccu_div ahb1_clk = { + .div = _SUNXI_CCU_DIV_FLAGS(4, 2, CLK_DIVIDER_POWER_OF_TWO), + + .mux = { + .shift = 12, + .width = 2, + + .var_predivs = ahb1_predivs, + .n_var_predivs = ARRAY_SIZE(ahb1_predivs), + }, + + .common = { + .reg = 0x054, + .features = CCU_FEATURE_VARIABLE_PREDIV, + .hw.init = CLK_HW_INIT_PARENTS("ahb1", + ahb1_parents, + &ccu_div_ops, + 0), + }, +}; + +static struct clk_div_table apb1_div_table[] = { + { .val = 0, .div = 2 }, + { .val = 1, .div = 2 }, + { .val = 2, .div = 4 }, + { .val = 3, .div = 8 }, + { /* Sentinel */ }, +}; +static SUNXI_CCU_DIV_TABLE(apb1_clk, "apb1", "ahb1", + 0x054, 8, 2, apb1_div_table, 0); + +static const char * const apb2_parents[] = { "osc32k", "osc24M", + "pll-periph0-2x", + "pll-periph0-2x" }; +static SUNXI_CCU_MP_WITH_MUX(apb2_clk, "apb2", apb2_parents, 0x058, + 0, 5, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + 0); + +static SUNXI_CCU_GATE(bus_mipi_dsi_clk, "bus-mipi-dsi", "ahb1", + 0x060, BIT(1), 0); +static SUNXI_CCU_GATE(bus_ce_clk, "bus-ce", "ahb1", + 0x060, BIT(5), 0); +static SUNXI_CCU_GATE(bus_dma_clk, "bus-dma", "ahb1", + 0x060, BIT(6), 0); +static SUNXI_CCU_GATE(bus_mmc0_clk, "bus-mmc0", "ahb1", + 0x060, BIT(8), 0); +static SUNXI_CCU_GATE(bus_mmc1_clk, "bus-mmc1", "ahb1", + 0x060, BIT(9), 0); +static SUNXI_CCU_GATE(bus_mmc2_clk, "bus-mmc2", "ahb1", + 0x060, BIT(10), 0); +static SUNXI_CCU_GATE(bus_mmc3_clk, "bus-mmc3", "ahb1", + 0x060, BIT(11), 0); +static SUNXI_CCU_GATE(bus_nand_clk, "bus-nand", "ahb1", + 0x060, BIT(13), 0); +static SUNXI_CCU_GATE(bus_dram_clk, "bus-dram", "ahb1", + 0x060, BIT(14), 0); +static SUNXI_CCU_GATE(bus_emac_clk, "bus-emac", "ahb1", + 0x060, BIT(17), 0); +static SUNXI_CCU_GATE(bus_ts_clk, "bus-ts", "ahb1", + 0x060, BIT(18), 0); +static SUNXI_CCU_GATE(bus_hstimer_clk, "bus-hstimer", "ahb1", + 0x060, BIT(19), 0); +static SUNXI_CCU_GATE(bus_spi0_clk, "bus-spi0", "ahb1", + 0x060, BIT(20), 0); +static SUNXI_CCU_GATE(bus_spi1_clk, "bus-spi1", "ahb1", + 0x060, BIT(21), 0); +static SUNXI_CCU_GATE(bus_spi2_clk, "bus-spi2", "ahb1", + 0x060, BIT(22), 0); +static SUNXI_CCU_GATE(bus_spi3_clk, "bus-spi3", "ahb1", + 0x060, BIT(23), 0); +static SUNXI_CCU_GATE(bus_sata_clk, "bus-sata", "ahb1", + 0x060, BIT(24), 0); +static SUNXI_CCU_GATE(bus_otg_clk, "bus-otg", "ahb1", + 0x060, BIT(25), 0); +static SUNXI_CCU_GATE(bus_ehci0_clk, "bus-ehci0", "ahb1", + 0x060, BIT(26), 0); +static SUNXI_CCU_GATE(bus_ehci1_clk, "bus-ehci1", "ahb1", + 0x060, BIT(27), 0); +static SUNXI_CCU_GATE(bus_ehci2_clk, "bus-ehci2", "ahb1", + 0x060, BIT(28), 0); +static SUNXI_CCU_GATE(bus_ohci0_clk, "bus-ohci0", "ahb1", + 0x060, BIT(29), 0); +static SUNXI_CCU_GATE(bus_ohci1_clk, "bus-ohci1", "ahb1", + 0x060, BIT(30), 0); +static SUNXI_CCU_GATE(bus_ohci2_clk, "bus-ohci2", "ahb1", + 0x060, BIT(31), 0); + +static SUNXI_CCU_GATE(bus_ve_clk, "bus-ve", "ahb1", + 0x064, BIT(0), 0); +static 
SUNXI_CCU_GATE(bus_mp_clk, "bus-mp", "ahb1", + 0x064, BIT(2), 0); +static SUNXI_CCU_GATE(bus_deinterlace_clk, "bus-deinterlace", "ahb1", + 0x064, BIT(5), 0); +static SUNXI_CCU_GATE(bus_csi0_clk, "bus-csi0", "ahb1", + 0x064, BIT(8), 0); +static SUNXI_CCU_GATE(bus_csi1_clk, "bus-csi1", "ahb1", + 0x064, BIT(9), 0); +static SUNXI_CCU_GATE(bus_hdmi0_clk, "bus-hdmi0", "ahb1", + 0x064, BIT(10), 0); +static SUNXI_CCU_GATE(bus_hdmi1_clk, "bus-hdmi1", "ahb1", + 0x064, BIT(11), 0); +static SUNXI_CCU_GATE(bus_de_clk, "bus-de", "ahb1", + 0x064, BIT(12), 0); +static SUNXI_CCU_GATE(bus_tve0_clk, "bus-tve0", "ahb1", + 0x064, BIT(13), 0); +static SUNXI_CCU_GATE(bus_tve1_clk, "bus-tve1", "ahb1", + 0x064, BIT(14), 0); +static SUNXI_CCU_GATE(bus_tve_top_clk, "bus-tve-top", "ahb1", + 0x064, BIT(15), 0); +static SUNXI_CCU_GATE(bus_gmac_clk, "bus-gmac", "ahb1", + 0x064, BIT(17), 0); +static SUNXI_CCU_GATE(bus_gpu_clk, "bus-gpu", "ahb1", + 0x064, BIT(20), 0); +static SUNXI_CCU_GATE(bus_tvd0_clk, "bus-tvd0", "ahb1", + 0x064, BIT(21), 0); +static SUNXI_CCU_GATE(bus_tvd1_clk, "bus-tvd1", "ahb1", + 0x064, BIT(22), 0); +static SUNXI_CCU_GATE(bus_tvd2_clk, "bus-tvd2", "ahb1", + 0x064, BIT(23), 0); +static SUNXI_CCU_GATE(bus_tvd3_clk, "bus-tvd3", "ahb1", + 0x064, BIT(24), 0); +static SUNXI_CCU_GATE(bus_tvd_top_clk, "bus-tvd-top", "ahb1", + 0x064, BIT(25), 0); +static SUNXI_CCU_GATE(bus_tcon_lcd0_clk, "bus-tcon-lcd0", "ahb1", + 0x064, BIT(26), 0); +static SUNXI_CCU_GATE(bus_tcon_lcd1_clk, "bus-tcon-lcd1", "ahb1", + 0x064, BIT(27), 0); +static SUNXI_CCU_GATE(bus_tcon_tv0_clk, "bus-tcon-tv0", "ahb1", + 0x064, BIT(28), 0); +static SUNXI_CCU_GATE(bus_tcon_tv1_clk, "bus-tcon-tv1", "ahb1", + 0x064, BIT(29), 0); +static SUNXI_CCU_GATE(bus_tcon_top_clk, "bus-tcon-top", "ahb1", + 0x064, BIT(30), 0); + +static SUNXI_CCU_GATE(bus_codec_clk, "bus-codec", "apb1", + 0x068, BIT(0), 0); +static SUNXI_CCU_GATE(bus_spdif_clk, "bus-spdif", "apb1", + 0x068, BIT(1), 0); +static SUNXI_CCU_GATE(bus_ac97_clk, "bus-ac97", "apb1", + 0x068, BIT(2), 0); +static SUNXI_CCU_GATE(bus_pio_clk, "bus-pio", "apb1", + 0x068, BIT(5), 0); +static SUNXI_CCU_GATE(bus_ir0_clk, "bus-ir0", "apb1", + 0x068, BIT(6), 0); +static SUNXI_CCU_GATE(bus_ir1_clk, "bus-ir1", "apb1", + 0x068, BIT(7), 0); +static SUNXI_CCU_GATE(bus_ths_clk, "bus-ths", "apb1", + 0x068, BIT(8), 0); +static SUNXI_CCU_GATE(bus_keypad_clk, "bus-keypad", "apb1", + 0x068, BIT(10), 0); +static SUNXI_CCU_GATE(bus_i2s0_clk, "bus-i2s0", "apb1", + 0x068, BIT(12), 0); +static SUNXI_CCU_GATE(bus_i2s1_clk, "bus-i2s1", "apb1", + 0x068, BIT(13), 0); +static SUNXI_CCU_GATE(bus_i2s2_clk, "bus-i2s2", "apb1", + 0x068, BIT(14), 0); + +static SUNXI_CCU_GATE(bus_i2c0_clk, "bus-i2c0", "apb2", + 0x06c, BIT(0), 0); +static SUNXI_CCU_GATE(bus_i2c1_clk, "bus-i2c1", "apb2", + 0x06c, BIT(1), 0); +static SUNXI_CCU_GATE(bus_i2c2_clk, "bus-i2c2", "apb2", + 0x06c, BIT(2), 0); +static SUNXI_CCU_GATE(bus_i2c3_clk, "bus-i2c3", "apb2", + 0x06c, BIT(3), 0); +/* + * In the datasheet this bit is "Reserved"; however, the gate exists in the BSP source + * code.
+ */ +static SUNXI_CCU_GATE(bus_can_clk, "bus-can", "apb2", + 0x06c, BIT(4), 0); +static SUNXI_CCU_GATE(bus_scr_clk, "bus-scr", "apb2", + 0x06c, BIT(5), 0); +static SUNXI_CCU_GATE(bus_ps20_clk, "bus-ps20", "apb2", + 0x06c, BIT(6), 0); +static SUNXI_CCU_GATE(bus_ps21_clk, "bus-ps21", "apb2", + 0x06c, BIT(7), 0); +static SUNXI_CCU_GATE(bus_i2c4_clk, "bus-i2c4", "apb2", + 0x06c, BIT(15), 0); +static SUNXI_CCU_GATE(bus_uart0_clk, "bus-uart0", "apb2", + 0x06c, BIT(16), 0); +static SUNXI_CCU_GATE(bus_uart1_clk, "bus-uart1", "apb2", + 0x06c, BIT(17), 0); +static SUNXI_CCU_GATE(bus_uart2_clk, "bus-uart2", "apb2", + 0x06c, BIT(18), 0); +static SUNXI_CCU_GATE(bus_uart3_clk, "bus-uart3", "apb2", + 0x06c, BIT(19), 0); +static SUNXI_CCU_GATE(bus_uart4_clk, "bus-uart4", "apb2", + 0x06c, BIT(20), 0); +static SUNXI_CCU_GATE(bus_uart5_clk, "bus-uart5", "apb2", + 0x06c, BIT(21), 0); +static SUNXI_CCU_GATE(bus_uart6_clk, "bus-uart6", "apb2", + 0x06c, BIT(22), 0); +static SUNXI_CCU_GATE(bus_uart7_clk, "bus-uart7", "apb2", + 0x06c, BIT(23), 0); + +static SUNXI_CCU_GATE(bus_dbg_clk, "bus-dbg", "ahb1", + 0x070, BIT(7), 0); + +static const char * const ths_parents[] = { "osc24M" }; +static struct ccu_div ths_clk = { + .enable = BIT(31), + .div = _SUNXI_CCU_DIV_FLAGS(0, 2, CLK_DIVIDER_POWER_OF_TWO), + .mux = _SUNXI_CCU_MUX(24, 2), + .common = { + .reg = 0x074, + .hw.init = CLK_HW_INIT_PARENTS("ths", + ths_parents, + &ccu_div_ops, + 0), + }, +}; + +static const char * const mod0_default_parents[] = { "osc24M", "pll-periph0", + "pll-periph1" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(nand_clk, "nand", mod0_default_parents, 0x080, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc0_clk, "mmc0", mod0_default_parents, 0x088, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc1_clk, "mmc1", mod0_default_parents, 0x08c, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc2_clk, "mmc2", mod0_default_parents, 0x090, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc3_clk, "mmc3", mod0_default_parents, 0x094, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static const char * const ts_parents[] = { "osc24M", "pll-periph0", }; +static SUNXI_CCU_MP_WITH_MUX_GATE(ts_clk, "ts", ts_parents, 0x098, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 4, /* mux */ + BIT(31), /* gate */ + 0); + +static const char * const ce_parents[] = { "osc24M", "pll-periph0-2x", + "pll-periph1-2x" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(ce_clk, "ce", ce_parents, 0x09c, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi0_clk, "spi0", mod0_default_parents, 0x0a0, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi1_clk, "spi1", mod0_default_parents, 0x0a4, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi2_clk, "spi2", mod0_default_parents, 0x0a8, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi3_clk, "spi3", mod0_default_parents, 0x0ac, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static const char * const i2s_parents[] = { "pll-audio-8x", 
"pll-audio-4x", + "pll-audio-2x", "pll-audio" }; +static SUNXI_CCU_MUX_WITH_GATE(i2s0_clk, "i2s0", i2s_parents, + 0x0b0, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_MUX_WITH_GATE(i2s1_clk, "i2s1", i2s_parents, + 0x0b4, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_MUX_WITH_GATE(i2s2_clk, "i2s2", i2s_parents, + 0x0b8, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_MUX_WITH_GATE(ac97_clk, "ac97", i2s_parents, + 0x0bc, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_MUX_WITH_GATE(spdif_clk, "spdif", i2s_parents, + 0x0c0, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static const char * const keypad_parents[] = { "osc24M", "osc32k" }; +static const u8 keypad_table[] = { 0, 2 }; +static struct ccu_mp keypad_clk = { + .enable = BIT(31), + .m = _SUNXI_CCU_DIV(0, 5), + .p = _SUNXI_CCU_DIV(16, 2), + .mux = _SUNXI_CCU_MUX_TABLE(24, 2, keypad_table), + .common = { + .reg = 0x0c4, + .hw.init = CLK_HW_INIT_PARENTS("keypad", + keypad_parents, + &ccu_mp_ops, + 0), + } +}; + +static const char * const sata_parents[] = { "pll-sata-out", "sata-ext" }; +static SUNXI_CCU_MUX_WITH_GATE(sata_clk, "sata", sata_parents, + 0x0c8, 24, 1, BIT(31), CLK_SET_RATE_PARENT); + +/* + * There are 3 OHCI 12M clock source selection bits in this register. + * We will force them to 0 (12M divided from 48M). + */ +#define SUN8I_R40_USB_CLK_REG 0x0cc + +static SUNXI_CCU_GATE(usb_phy0_clk, "usb-phy0", "osc24M", + 0x0cc, BIT(8), 0); +static SUNXI_CCU_GATE(usb_phy1_clk, "usb-phy1", "osc24M", + 0x0cc, BIT(9), 0); +static SUNXI_CCU_GATE(usb_phy2_clk, "usb-phy2", "osc24M", + 0x0cc, BIT(10), 0); +static SUNXI_CCU_GATE(usb_ohci0_clk, "usb-ohci0", "osc12M", + 0x0cc, BIT(16), 0); +static SUNXI_CCU_GATE(usb_ohci1_clk, "usb-ohci1", "osc12M", + 0x0cc, BIT(17), 0); +static SUNXI_CCU_GATE(usb_ohci2_clk, "usb-ohci2", "osc12M", + 0x0cc, BIT(18), 0); + +static const char * const ir_parents[] = { "osc24M", "pll-periph0", + "pll-periph1", "osc32k" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(ir0_clk, "ir0", ir_parents, 0x0d0, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(ir1_clk, "ir1", ir_parents, 0x0d4, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static const char * const dram_parents[] = { "pll-ddr0", "pll-ddr1" }; +static SUNXI_CCU_M_WITH_MUX(dram_clk, "dram", dram_parents, + 0x0f4, 0, 2, 20, 2, CLK_IS_CRITICAL); + +static SUNXI_CCU_GATE(dram_ve_clk, "dram-ve", "dram", + 0x100, BIT(0), 0); +static SUNXI_CCU_GATE(dram_csi0_clk, "dram-csi0", "dram", + 0x100, BIT(1), 0); +static SUNXI_CCU_GATE(dram_csi1_clk, "dram-csi1", "dram", + 0x100, BIT(2), 0); +static SUNXI_CCU_GATE(dram_ts_clk, "dram-ts", "dram", + 0x100, BIT(3), 0); +static SUNXI_CCU_GATE(dram_tvd_clk, "dram-tvd", "dram", + 0x100, BIT(4), 0); +static SUNXI_CCU_GATE(dram_mp_clk, "dram-mp", "dram", + 0x100, BIT(5), 0); +static SUNXI_CCU_GATE(dram_deinterlace_clk, "dram-deinterlace", "dram", + 0x100, BIT(6), 0); + +static const char * const de_parents[] = { "pll-periph0-2x", "pll-de" }; +static SUNXI_CCU_M_WITH_MUX_GATE(de_clk, "de", de_parents, + 0x104, 0, 4, 24, 3, BIT(31), 0); +static SUNXI_CCU_M_WITH_MUX_GATE(mp_clk, "mp", de_parents, + 0x108, 0, 4, 24, 3, BIT(31), 0); + +static const char * const tcon_parents[] = { "pll-video0", "pll-video1", + "pll-video0-2x", "pll-video1-2x", + "pll-mipi" }; +static SUNXI_CCU_MUX_WITH_GATE(tcon_lcd0_clk, "tcon-lcd0", tcon_parents, + 0x110, 24, 3, BIT(31), CLK_SET_RATE_PARENT); +static 
SUNXI_CCU_MUX_WITH_GATE(tcon_lcd1_clk, "tcon-lcd1", tcon_parents, + 0x114, 24, 3, BIT(31), CLK_SET_RATE_PARENT); +static SUNXI_CCU_M_WITH_MUX_GATE(tcon_tv0_clk, "tcon-tv0", tcon_parents, + 0x118, 0, 4, 24, 3, BIT(31), 0); +static SUNXI_CCU_M_WITH_MUX_GATE(tcon_tv1_clk, "tcon-tv1", tcon_parents, + 0x11c, 0, 4, 24, 3, BIT(31), 0); + +static const char * const deinterlace_parents[] = { "pll-periph0", + "pll-periph1" }; +static SUNXI_CCU_M_WITH_MUX_GATE(deinterlace_clk, "deinterlace", + deinterlace_parents, 0x124, 0, 4, 24, 3, + BIT(31), 0); + +static const char * const csi_mclk_parents[] = { "osc24M", "pll-video1", + "pll-periph1" }; +static SUNXI_CCU_M_WITH_MUX_GATE(csi1_mclk_clk, "csi1-mclk", csi_mclk_parents, + 0x130, 0, 5, 8, 3, BIT(15), 0); + +static const char * const csi_sclk_parents[] = { "pll-periph0", "pll-periph1" }; +static SUNXI_CCU_M_WITH_MUX_GATE(csi_sclk_clk, "csi-sclk", csi_sclk_parents, + 0x134, 16, 4, 24, 3, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(csi0_mclk_clk, "csi0-mclk", csi_mclk_parents, + 0x134, 0, 5, 8, 3, BIT(15), 0); + +static SUNXI_CCU_M_WITH_GATE(ve_clk, "ve", "pll-ve", + 0x13c, 16, 3, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_GATE(codec_clk, "codec", "pll-audio", + 0x140, BIT(31), CLK_SET_RATE_PARENT); +static SUNXI_CCU_GATE(avs_clk, "avs", "osc24M", + 0x144, BIT(31), 0); + +static const char * const hdmi_parents[] = { "pll-video0", "pll-video1" }; +static SUNXI_CCU_M_WITH_MUX_GATE(hdmi_clk, "hdmi", hdmi_parents, + 0x150, 0, 4, 24, 2, BIT(31), 0); + +static SUNXI_CCU_GATE(hdmi_slow_clk, "hdmi-slow", "osc24M", + 0x154, BIT(31), 0); + +/* + * In the SoC's user manual, the P factor is mentioned, but not used in + * the frequency formula. + * + * Here the factor is included, according to the BSP kernel source, + * which contains the P factor of this clock. 
+ */ +static const char * const mbus_parents[] = { "osc24M", "pll-periph0-2x", + "pll-ddr0" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(mbus_clk, "mbus", mbus_parents, 0x15c, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + CLK_IS_CRITICAL); + +static const char * const dsi_dphy_parents[] = { "pll-video0", "pll-video1", + "pll-periph0" }; +static SUNXI_CCU_M_WITH_MUX_GATE(dsi_dphy_clk, "dsi-dphy", dsi_dphy_parents, + 0x168, 0, 4, 8, 2, BIT(15), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(tve0_clk, "tve0", tcon_parents, + 0x180, 0, 4, 24, 3, BIT(31), 0); +static SUNXI_CCU_M_WITH_MUX_GATE(tve1_clk, "tve1", tcon_parents, + 0x184, 0, 4, 24, 3, BIT(31), 0); + +static const char * const tvd_parents[] = { "pll-video0", "pll-video1", + "pll-video0-2x", "pll-video1-2x" }; +static SUNXI_CCU_M_WITH_MUX_GATE(tvd0_clk, "tvd0", tvd_parents, + 0x188, 0, 4, 24, 3, BIT(31), 0); +static SUNXI_CCU_M_WITH_MUX_GATE(tvd1_clk, "tvd1", tvd_parents, + 0x18c, 0, 4, 24, 3, BIT(31), 0); +static SUNXI_CCU_M_WITH_MUX_GATE(tvd2_clk, "tvd2", tvd_parents, + 0x190, 0, 4, 24, 3, BIT(31), 0); +static SUNXI_CCU_M_WITH_MUX_GATE(tvd3_clk, "tvd3", tvd_parents, + 0x194, 0, 4, 24, 3, BIT(31), 0); + +static SUNXI_CCU_M_WITH_GATE(gpu_clk, "gpu", "pll-gpu", + 0x1a0, 0, 3, BIT(31), CLK_SET_RATE_PARENT); + +static const char * const out_parents[] = { "osc24M", "osc32k", "osc24M" }; +static const struct ccu_mux_fixed_prediv out_predivs[] = { + { .index = 0, .div = 750, }, +}; + +static struct ccu_mp outa_clk = { + .enable = BIT(31), + .m = _SUNXI_CCU_DIV(8, 5), + .p = _SUNXI_CCU_DIV(20, 2), + .mux = { + .shift = 24, + .width = 2, + .fixed_predivs = out_predivs, + .n_predivs = ARRAY_SIZE(out_predivs), + }, + .common = { + .reg = 0x1f0, + .features = CCU_FEATURE_FIXED_PREDIV, + .hw.init = CLK_HW_INIT_PARENTS("outa", out_parents, + &ccu_mp_ops, 0), + } +}; + +static struct ccu_mp outb_clk = { + .enable = BIT(31), + .m = _SUNXI_CCU_DIV(8, 5), + .p = _SUNXI_CCU_DIV(20, 2), + .mux = { + .shift = 24, + .width = 2, + .fixed_predivs = out_predivs, + .n_predivs = ARRAY_SIZE(out_predivs), + }, + .common = { + .reg = 0x1f4, + .features = CCU_FEATURE_FIXED_PREDIV, + .hw.init = CLK_HW_INIT_PARENTS("outb", out_parents, + &ccu_mp_ops, 0), + } +}; + +static struct ccu_common *sun8i_r40_ccu_clks[] = { + &pll_cpu_clk.common, + &pll_audio_base_clk.common, + &pll_video0_clk.common, + &pll_ve_clk.common, + &pll_ddr0_clk.common, + &pll_periph0_clk.common, + &pll_periph0_sata_clk.common, + &pll_periph1_clk.common, + &pll_video1_clk.common, + &pll_sata_clk.common, + &pll_sata_out_clk.common, + &pll_gpu_clk.common, + &pll_mipi_clk.common, + &pll_de_clk.common, + &pll_ddr1_clk.common, + &cpu_clk.common, + &axi_clk.common, + &ahb1_clk.common, + &apb1_clk.common, + &apb2_clk.common, + &bus_mipi_dsi_clk.common, + &bus_ce_clk.common, + &bus_dma_clk.common, + &bus_mmc0_clk.common, + &bus_mmc1_clk.common, + &bus_mmc2_clk.common, + &bus_mmc3_clk.common, + &bus_nand_clk.common, + &bus_dram_clk.common, + &bus_emac_clk.common, + &bus_ts_clk.common, + &bus_hstimer_clk.common, + &bus_spi0_clk.common, + &bus_spi1_clk.common, + &bus_spi2_clk.common, + &bus_spi3_clk.common, + &bus_sata_clk.common, + &bus_otg_clk.common, + &bus_ehci0_clk.common, + &bus_ehci1_clk.common, + &bus_ehci2_clk.common, + &bus_ohci0_clk.common, + &bus_ohci1_clk.common, + &bus_ohci2_clk.common, + &bus_ve_clk.common, + &bus_mp_clk.common, + &bus_deinterlace_clk.common, + &bus_csi0_clk.common, + &bus_csi1_clk.common, + &bus_hdmi0_clk.common, + &bus_hdmi1_clk.common, + &bus_de_clk.common, + 
&bus_tve0_clk.common, + &bus_tve1_clk.common, + &bus_tve_top_clk.common, + &bus_gmac_clk.common, + &bus_gpu_clk.common, + &bus_tvd0_clk.common, + &bus_tvd1_clk.common, + &bus_tvd2_clk.common, + &bus_tvd3_clk.common, + &bus_tvd_top_clk.common, + &bus_tcon_lcd0_clk.common, + &bus_tcon_lcd1_clk.common, + &bus_tcon_tv0_clk.common, + &bus_tcon_tv1_clk.common, + &bus_tcon_top_clk.common, + &bus_codec_clk.common, + &bus_spdif_clk.common, + &bus_ac97_clk.common, + &bus_pio_clk.common, + &bus_ir0_clk.common, + &bus_ir1_clk.common, + &bus_ths_clk.common, + &bus_keypad_clk.common, + &bus_i2s0_clk.common, + &bus_i2s1_clk.common, + &bus_i2s2_clk.common, + &bus_i2c0_clk.common, + &bus_i2c1_clk.common, + &bus_i2c2_clk.common, + &bus_i2c3_clk.common, + &bus_can_clk.common, + &bus_scr_clk.common, + &bus_ps20_clk.common, + &bus_ps21_clk.common, + &bus_i2c4_clk.common, + &bus_uart0_clk.common, + &bus_uart1_clk.common, + &bus_uart2_clk.common, + &bus_uart3_clk.common, + &bus_uart4_clk.common, + &bus_uart5_clk.common, + &bus_uart6_clk.common, + &bus_uart7_clk.common, + &bus_dbg_clk.common, + &ths_clk.common, + &nand_clk.common, + &mmc0_clk.common, + &mmc1_clk.common, + &mmc2_clk.common, + &mmc3_clk.common, + &ts_clk.common, + &ce_clk.common, + &spi0_clk.common, + &spi1_clk.common, + &spi2_clk.common, + &spi3_clk.common, + &i2s0_clk.common, + &i2s1_clk.common, + &i2s2_clk.common, + &ac97_clk.common, + &spdif_clk.common, + &keypad_clk.common, + &sata_clk.common, + &usb_phy0_clk.common, + &usb_phy1_clk.common, + &usb_phy2_clk.common, + &usb_ohci0_clk.common, + &usb_ohci1_clk.common, + &usb_ohci2_clk.common, + &ir0_clk.common, + &ir1_clk.common, + &dram_clk.common, + &dram_ve_clk.common, + &dram_csi0_clk.common, + &dram_csi1_clk.common, + &dram_ts_clk.common, + &dram_tvd_clk.common, + &dram_mp_clk.common, + &dram_deinterlace_clk.common, + &de_clk.common, + &mp_clk.common, + &tcon_lcd0_clk.common, + &tcon_lcd1_clk.common, + &tcon_tv0_clk.common, + &tcon_tv1_clk.common, + &deinterlace_clk.common, + &csi1_mclk_clk.common, + &csi_sclk_clk.common, + &csi0_mclk_clk.common, + &ve_clk.common, + &codec_clk.common, + &avs_clk.common, + &hdmi_clk.common, + &hdmi_slow_clk.common, + &mbus_clk.common, + &dsi_dphy_clk.common, + &tve0_clk.common, + &tve1_clk.common, + &tvd0_clk.common, + &tvd1_clk.common, + &tvd2_clk.common, + &tvd3_clk.common, + &gpu_clk.common, + &outa_clk.common, + &outb_clk.common, +}; + +/* Fixed Factor clocks */ +static CLK_FIXED_FACTOR(osc12M_clk, "osc12M", "osc24M", 2, 1, 0); + +/* We hardcode the divider to 4 for now */ +static CLK_FIXED_FACTOR(pll_audio_clk, "pll-audio", + "pll-audio-base", 4, 1, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_audio_2x_clk, "pll-audio-2x", + "pll-audio-base", 2, 1, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_audio_4x_clk, "pll-audio-4x", + "pll-audio-base", 1, 1, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_audio_8x_clk, "pll-audio-8x", + "pll-audio-base", 1, 2, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_periph0_2x_clk, "pll-periph0-2x", + "pll-periph0", 1, 2, 0); +static CLK_FIXED_FACTOR(pll_periph1_2x_clk, "pll-periph1-2x", + "pll-periph1", 1, 2, 0); +static CLK_FIXED_FACTOR(pll_video0_2x_clk, "pll-video0-2x", + "pll-video0", 1, 2, 0); +static CLK_FIXED_FACTOR(pll_video1_2x_clk, "pll-video1-2x", + "pll-video1", 1, 2, 0); + +static struct clk_hw_onecell_data sun8i_r40_hw_clks = { + .hws = { + [CLK_OSC_12M] = &osc12M_clk.hw, + [CLK_PLL_CPU] = &pll_cpu_clk.common.hw, + [CLK_PLL_AUDIO_BASE] = &pll_audio_base_clk.common.hw, + [CLK_PLL_AUDIO] = 
&pll_audio_clk.hw, + [CLK_PLL_AUDIO_2X] = &pll_audio_2x_clk.hw, + [CLK_PLL_AUDIO_4X] = &pll_audio_4x_clk.hw, + [CLK_PLL_AUDIO_8X] = &pll_audio_8x_clk.hw, + [CLK_PLL_VIDEO0] = &pll_video0_clk.common.hw, + [CLK_PLL_VIDEO0_2X] = &pll_video0_2x_clk.hw, + [CLK_PLL_VE] = &pll_ve_clk.common.hw, + [CLK_PLL_DDR0] = &pll_ddr0_clk.common.hw, + [CLK_PLL_PERIPH0] = &pll_periph0_clk.common.hw, + [CLK_PLL_PERIPH0_SATA] = &pll_periph0_sata_clk.common.hw, + [CLK_PLL_PERIPH0_2X] = &pll_periph0_2x_clk.hw, + [CLK_PLL_PERIPH1] = &pll_periph1_clk.common.hw, + [CLK_PLL_PERIPH1_2X] = &pll_periph1_2x_clk.hw, + [CLK_PLL_VIDEO1] = &pll_video1_clk.common.hw, + [CLK_PLL_VIDEO1_2X] = &pll_video1_2x_clk.hw, + [CLK_PLL_SATA] = &pll_sata_clk.common.hw, + [CLK_PLL_SATA_OUT] = &pll_sata_out_clk.common.hw, + [CLK_PLL_GPU] = &pll_gpu_clk.common.hw, + [CLK_PLL_MIPI] = &pll_mipi_clk.common.hw, + [CLK_PLL_DE] = &pll_de_clk.common.hw, + [CLK_PLL_DDR1] = &pll_ddr1_clk.common.hw, + [CLK_CPU] = &cpu_clk.common.hw, + [CLK_AXI] = &axi_clk.common.hw, + [CLK_AHB1] = &ahb1_clk.common.hw, + [CLK_APB1] = &apb1_clk.common.hw, + [CLK_APB2] = &apb2_clk.common.hw, + [CLK_BUS_MIPI_DSI] = &bus_mipi_dsi_clk.common.hw, + [CLK_BUS_CE] = &bus_ce_clk.common.hw, + [CLK_BUS_DMA] = &bus_dma_clk.common.hw, + [CLK_BUS_MMC0] = &bus_mmc0_clk.common.hw, + [CLK_BUS_MMC1] = &bus_mmc1_clk.common.hw, + [CLK_BUS_MMC2] = &bus_mmc2_clk.common.hw, + [CLK_BUS_MMC3] = &bus_mmc3_clk.common.hw, + [CLK_BUS_NAND] = &bus_nand_clk.common.hw, + [CLK_BUS_DRAM] = &bus_dram_clk.common.hw, + [CLK_BUS_EMAC] = &bus_emac_clk.common.hw, + [CLK_BUS_TS] = &bus_ts_clk.common.hw, + [CLK_BUS_HSTIMER] = &bus_hstimer_clk.common.hw, + [CLK_BUS_SPI0] = &bus_spi0_clk.common.hw, + [CLK_BUS_SPI1] = &bus_spi1_clk.common.hw, + [CLK_BUS_SPI2] = &bus_spi2_clk.common.hw, + [CLK_BUS_SPI3] = &bus_spi3_clk.common.hw, + [CLK_BUS_SATA] = &bus_sata_clk.common.hw, + [CLK_BUS_OTG] = &bus_otg_clk.common.hw, + [CLK_BUS_EHCI0] = &bus_ehci0_clk.common.hw, + [CLK_BUS_EHCI1] = &bus_ehci1_clk.common.hw, + [CLK_BUS_EHCI2] = &bus_ehci2_clk.common.hw, + [CLK_BUS_OHCI0] = &bus_ohci0_clk.common.hw, + [CLK_BUS_OHCI1] = &bus_ohci1_clk.common.hw, + [CLK_BUS_OHCI2] = &bus_ohci2_clk.common.hw, + [CLK_BUS_VE] = &bus_ve_clk.common.hw, + [CLK_BUS_MP] = &bus_mp_clk.common.hw, + [CLK_BUS_DEINTERLACE] = &bus_deinterlace_clk.common.hw, + [CLK_BUS_CSI0] = &bus_csi0_clk.common.hw, + [CLK_BUS_CSI1] = &bus_csi1_clk.common.hw, + [CLK_BUS_HDMI0] = &bus_hdmi0_clk.common.hw, + [CLK_BUS_HDMI1] = &bus_hdmi1_clk.common.hw, + [CLK_BUS_DE] = &bus_de_clk.common.hw, + [CLK_BUS_TVE0] = &bus_tve0_clk.common.hw, + [CLK_BUS_TVE1] = &bus_tve1_clk.common.hw, + [CLK_BUS_TVE_TOP] = &bus_tve_top_clk.common.hw, + [CLK_BUS_GMAC] = &bus_gmac_clk.common.hw, + [CLK_BUS_GPU] = &bus_gpu_clk.common.hw, + [CLK_BUS_TVD0] = &bus_tvd0_clk.common.hw, + [CLK_BUS_TVD1] = &bus_tvd1_clk.common.hw, + [CLK_BUS_TVD2] = &bus_tvd2_clk.common.hw, + [CLK_BUS_TVD3] = &bus_tvd3_clk.common.hw, + [CLK_BUS_TVD_TOP] = &bus_tvd_top_clk.common.hw, + [CLK_BUS_TCON_LCD0] = &bus_tcon_lcd0_clk.common.hw, + [CLK_BUS_TCON_LCD1] = &bus_tcon_lcd1_clk.common.hw, + [CLK_BUS_TCON_TV0] = &bus_tcon_tv0_clk.common.hw, + [CLK_BUS_TCON_TV1] = &bus_tcon_tv1_clk.common.hw, + [CLK_BUS_TCON_TOP] = &bus_tcon_top_clk.common.hw, + [CLK_BUS_CODEC] = &bus_codec_clk.common.hw, + [CLK_BUS_SPDIF] = &bus_spdif_clk.common.hw, + [CLK_BUS_AC97] = &bus_ac97_clk.common.hw, + [CLK_BUS_PIO] = &bus_pio_clk.common.hw, + [CLK_BUS_IR0] = &bus_ir0_clk.common.hw, + [CLK_BUS_IR1] = &bus_ir1_clk.common.hw, + [CLK_BUS_THS] = 
&bus_ths_clk.common.hw, + [CLK_BUS_KEYPAD] = &bus_keypad_clk.common.hw, + [CLK_BUS_I2S0] = &bus_i2s0_clk.common.hw, + [CLK_BUS_I2S1] = &bus_i2s1_clk.common.hw, + [CLK_BUS_I2S2] = &bus_i2s2_clk.common.hw, + [CLK_BUS_I2C0] = &bus_i2c0_clk.common.hw, + [CLK_BUS_I2C1] = &bus_i2c1_clk.common.hw, + [CLK_BUS_I2C2] = &bus_i2c2_clk.common.hw, + [CLK_BUS_I2C3] = &bus_i2c3_clk.common.hw, + [CLK_BUS_CAN] = &bus_can_clk.common.hw, + [CLK_BUS_SCR] = &bus_scr_clk.common.hw, + [CLK_BUS_PS20] = &bus_ps20_clk.common.hw, + [CLK_BUS_PS21] = &bus_ps21_clk.common.hw, + [CLK_BUS_I2C4] = &bus_i2c4_clk.common.hw, + [CLK_BUS_UART0] = &bus_uart0_clk.common.hw, + [CLK_BUS_UART1] = &bus_uart1_clk.common.hw, + [CLK_BUS_UART2] = &bus_uart2_clk.common.hw, + [CLK_BUS_UART3] = &bus_uart3_clk.common.hw, + [CLK_BUS_UART4] = &bus_uart4_clk.common.hw, + [CLK_BUS_UART5] = &bus_uart5_clk.common.hw, + [CLK_BUS_UART6] = &bus_uart6_clk.common.hw, + [CLK_BUS_UART7] = &bus_uart7_clk.common.hw, + [CLK_BUS_DBG] = &bus_dbg_clk.common.hw, + [CLK_THS] = &ths_clk.common.hw, + [CLK_NAND] = &nand_clk.common.hw, + [CLK_MMC0] = &mmc0_clk.common.hw, + [CLK_MMC1] = &mmc1_clk.common.hw, + [CLK_MMC2] = &mmc2_clk.common.hw, + [CLK_MMC3] = &mmc3_clk.common.hw, + [CLK_TS] = &ts_clk.common.hw, + [CLK_CE] = &ce_clk.common.hw, + [CLK_SPI0] = &spi0_clk.common.hw, + [CLK_SPI1] = &spi1_clk.common.hw, + [CLK_SPI2] = &spi2_clk.common.hw, + [CLK_SPI3] = &spi3_clk.common.hw, + [CLK_I2S0] = &i2s0_clk.common.hw, + [CLK_I2S1] = &i2s1_clk.common.hw, + [CLK_I2S2] = &i2s2_clk.common.hw, + [CLK_AC97] = &ac97_clk.common.hw, + [CLK_SPDIF] = &spdif_clk.common.hw, + [CLK_KEYPAD] = &keypad_clk.common.hw, + [CLK_SATA] = &sata_clk.common.hw, + [CLK_USB_PHY0] = &usb_phy0_clk.common.hw, + [CLK_USB_PHY1] = &usb_phy1_clk.common.hw, + [CLK_USB_PHY2] = &usb_phy2_clk.common.hw, + [CLK_USB_OHCI0] = &usb_ohci0_clk.common.hw, + [CLK_USB_OHCI1] = &usb_ohci1_clk.common.hw, + [CLK_USB_OHCI2] = &usb_ohci2_clk.common.hw, + [CLK_IR0] = &ir0_clk.common.hw, + [CLK_IR1] = &ir1_clk.common.hw, + [CLK_DRAM] = &dram_clk.common.hw, + [CLK_DRAM_VE] = &dram_ve_clk.common.hw, + [CLK_DRAM_CSI0] = &dram_csi0_clk.common.hw, + [CLK_DRAM_CSI1] = &dram_csi1_clk.common.hw, + [CLK_DRAM_TS] = &dram_ts_clk.common.hw, + [CLK_DRAM_TVD] = &dram_tvd_clk.common.hw, + [CLK_DRAM_MP] = &dram_mp_clk.common.hw, + [CLK_DRAM_DEINTERLACE] = &dram_deinterlace_clk.common.hw, + [CLK_DE] = &de_clk.common.hw, + [CLK_MP] = &mp_clk.common.hw, + [CLK_TCON_LCD0] = &tcon_lcd0_clk.common.hw, + [CLK_TCON_LCD1] = &tcon_lcd1_clk.common.hw, + [CLK_TCON_TV0] = &tcon_tv0_clk.common.hw, + [CLK_TCON_TV1] = &tcon_tv1_clk.common.hw, + [CLK_DEINTERLACE] = &deinterlace_clk.common.hw, + [CLK_CSI1_MCLK] = &csi1_mclk_clk.common.hw, + [CLK_CSI_SCLK] = &csi_sclk_clk.common.hw, + [CLK_CSI0_MCLK] = &csi0_mclk_clk.common.hw, + [CLK_VE] = &ve_clk.common.hw, + [CLK_CODEC] = &codec_clk.common.hw, + [CLK_AVS] = &avs_clk.common.hw, + [CLK_HDMI] = &hdmi_clk.common.hw, + [CLK_HDMI_SLOW] = &hdmi_slow_clk.common.hw, + [CLK_MBUS] = &mbus_clk.common.hw, + [CLK_DSI_DPHY] = &dsi_dphy_clk.common.hw, + [CLK_TVE0] = &tve0_clk.common.hw, + [CLK_TVE1] = &tve1_clk.common.hw, + [CLK_TVD0] = &tvd0_clk.common.hw, + [CLK_TVD1] = &tvd1_clk.common.hw, + [CLK_TVD2] = &tvd2_clk.common.hw, + [CLK_TVD3] = &tvd3_clk.common.hw, + [CLK_GPU] = &gpu_clk.common.hw, + [CLK_OUTA] = &outa_clk.common.hw, + [CLK_OUTB] = &outb_clk.common.hw, + }, + .num = CLK_NUMBER, +}; + +static struct ccu_reset_map sun8i_r40_ccu_resets[] = { + [RST_USB_PHY0] = { 0x0cc, BIT(0) }, + [RST_USB_PHY1] = { 0x0cc, 
BIT(1) }, + [RST_USB_PHY2] = { 0x0cc, BIT(2) }, + + [RST_DRAM] = { 0x0f4, BIT(31) }, + [RST_MBUS] = { 0x0fc, BIT(31) }, + + [RST_BUS_MIPI_DSI] = { 0x2c0, BIT(1) }, + [RST_BUS_CE] = { 0x2c0, BIT(5) }, + [RST_BUS_DMA] = { 0x2c0, BIT(6) }, + [RST_BUS_MMC0] = { 0x2c0, BIT(8) }, + [RST_BUS_MMC1] = { 0x2c0, BIT(9) }, + [RST_BUS_MMC2] = { 0x2c0, BIT(10) }, + [RST_BUS_MMC3] = { 0x2c0, BIT(11) }, + [RST_BUS_NAND] = { 0x2c0, BIT(13) }, + [RST_BUS_DRAM] = { 0x2c0, BIT(14) }, + [RST_BUS_EMAC] = { 0x2c0, BIT(17) }, + [RST_BUS_TS] = { 0x2c0, BIT(18) }, + [RST_BUS_HSTIMER] = { 0x2c0, BIT(19) }, + [RST_BUS_SPI0] = { 0x2c0, BIT(20) }, + [RST_BUS_SPI1] = { 0x2c0, BIT(21) }, + [RST_BUS_SPI2] = { 0x2c0, BIT(22) }, + [RST_BUS_SPI3] = { 0x2c0, BIT(23) }, + [RST_BUS_SATA] = { 0x2c0, BIT(24) }, + [RST_BUS_OTG] = { 0x2c0, BIT(25) }, + [RST_BUS_EHCI0] = { 0x2c0, BIT(26) }, + [RST_BUS_EHCI1] = { 0x2c0, BIT(27) }, + [RST_BUS_EHCI2] = { 0x2c0, BIT(28) }, + [RST_BUS_OHCI0] = { 0x2c0, BIT(29) }, + [RST_BUS_OHCI1] = { 0x2c0, BIT(30) }, + [RST_BUS_OHCI2] = { 0x2c0, BIT(31) }, + + [RST_BUS_VE] = { 0x2c4, BIT(0) }, + [RST_BUS_MP] = { 0x2c4, BIT(2) }, + [RST_BUS_DEINTERLACE] = { 0x2c4, BIT(5) }, + [RST_BUS_CSI0] = { 0x2c4, BIT(8) }, + [RST_BUS_CSI1] = { 0x2c4, BIT(9) }, + [RST_BUS_HDMI0] = { 0x2c4, BIT(10) }, + [RST_BUS_HDMI1] = { 0x2c4, BIT(11) }, + [RST_BUS_DE] = { 0x2c4, BIT(12) }, + [RST_BUS_TVE0] = { 0x2c4, BIT(13) }, + [RST_BUS_TVE1] = { 0x2c4, BIT(14) }, + [RST_BUS_TVE_TOP] = { 0x2c4, BIT(15) }, + [RST_BUS_GMAC] = { 0x2c4, BIT(17) }, + [RST_BUS_GPU] = { 0x2c4, BIT(20) }, + [RST_BUS_TVD0] = { 0x2c4, BIT(21) }, + [RST_BUS_TVD1] = { 0x2c4, BIT(22) }, + [RST_BUS_TVD2] = { 0x2c4, BIT(23) }, + [RST_BUS_TVD3] = { 0x2c4, BIT(24) }, + [RST_BUS_TVD_TOP] = { 0x2c4, BIT(25) }, + [RST_BUS_TCON_LCD0] = { 0x2c4, BIT(26) }, + [RST_BUS_TCON_LCD1] = { 0x2c4, BIT(27) }, + [RST_BUS_TCON_TV0] = { 0x2c4, BIT(28) }, + [RST_BUS_TCON_TV1] = { 0x2c4, BIT(29) }, + [RST_BUS_TCON_TOP] = { 0x2c4, BIT(30) }, + [RST_BUS_DBG] = { 0x2c4, BIT(31) }, + + [RST_BUS_LVDS] = { 0x2c8, BIT(0) }, + + [RST_BUS_CODEC] = { 0x2d0, BIT(0) }, + [RST_BUS_SPDIF] = { 0x2d0, BIT(1) }, + [RST_BUS_AC97] = { 0x2d0, BIT(2) }, + [RST_BUS_IR0] = { 0x2d0, BIT(6) }, + [RST_BUS_IR1] = { 0x2d0, BIT(7) }, + [RST_BUS_THS] = { 0x2d0, BIT(8) }, + [RST_BUS_KEYPAD] = { 0x2d0, BIT(10) }, + [RST_BUS_I2S0] = { 0x2d0, BIT(12) }, + [RST_BUS_I2S1] = { 0x2d0, BIT(13) }, + [RST_BUS_I2S2] = { 0x2d0, BIT(14) }, + + [RST_BUS_I2C0] = { 0x2d8, BIT(0) }, + [RST_BUS_I2C1] = { 0x2d8, BIT(1) }, + [RST_BUS_I2C2] = { 0x2d8, BIT(2) }, + [RST_BUS_I2C3] = { 0x2d8, BIT(3) }, + [RST_BUS_CAN] = { 0x2d8, BIT(4) }, + [RST_BUS_SCR] = { 0x2d8, BIT(5) }, + [RST_BUS_PS20] = { 0x2d8, BIT(6) }, + [RST_BUS_PS21] = { 0x2d8, BIT(7) }, + [RST_BUS_I2C4] = { 0x2d8, BIT(15) }, + [RST_BUS_UART0] = { 0x2d8, BIT(16) }, + [RST_BUS_UART1] = { 0x2d8, BIT(17) }, + [RST_BUS_UART2] = { 0x2d8, BIT(18) }, + [RST_BUS_UART3] = { 0x2d8, BIT(19) }, + [RST_BUS_UART4] = { 0x2d8, BIT(20) }, + [RST_BUS_UART5] = { 0x2d8, BIT(21) }, + [RST_BUS_UART6] = { 0x2d8, BIT(22) }, + [RST_BUS_UART7] = { 0x2d8, BIT(23) }, +}; + +static const struct sunxi_ccu_desc sun8i_r40_ccu_desc = { + .ccu_clks = sun8i_r40_ccu_clks, + .num_ccu_clks = ARRAY_SIZE(sun8i_r40_ccu_clks), + + .hw_clks = &sun8i_r40_hw_clks, + + .resets = sun8i_r40_ccu_resets, + .num_resets = ARRAY_SIZE(sun8i_r40_ccu_resets), +}; + +static struct ccu_pll_nb sun8i_r40_pll_cpu_nb = { + .common = &pll_cpu_clk.common, + /* copy from pll_cpu_clk */ + .enable = BIT(31), + .lock = BIT(28), +}; + 
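The clock table and reset map above are everything a peripheral driver needs; both are consumed through the generic clk and reset frameworks using the indices from the new dt-bindings headers. The probe sketch below is a hedged illustration only: the "foo" device, its clock-names and the choice of MMC0 are assumptions, while the framework calls and the CLK_/RST_ constants are real.

/* Assumed device tree wiring for the hypothetical consumer:
 *	clocks = <&ccu CLK_BUS_MMC0>, <&ccu CLK_MMC0>;
 *	clock-names = "ahb", "mmc";
 *	resets = <&ccu RST_BUS_MMC0>;
 */
#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/reset.h>

static int foo_probe(struct platform_device *pdev)
{
	struct clk *bus, *mod;
	struct reset_control *rst;
	int ret;

	bus = devm_clk_get(&pdev->dev, "ahb");	/* AHB1 bus gate */
	if (IS_ERR(bus))
		return PTR_ERR(bus);

	mod = devm_clk_get(&pdev->dev, "mmc");	/* mod0-style module clock */
	if (IS_ERR(mod))
		return PTR_ERR(mod);

	rst = devm_reset_control_get_exclusive(&pdev->dev, NULL);
	if (IS_ERR(rst))
		return PTR_ERR(rst);

	ret = reset_control_deassert(rst);	/* release the IP's reset line */
	if (ret)
		return ret;

	ret = clk_prepare_enable(bus);		/* ungate the bus clock first */
	if (ret)
		return ret;

	return clk_prepare_enable(mod);		/* then start the module clock */
}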
+static struct ccu_mux_nb sun8i_r40_cpu_nb = { + .common = &cpu_clk.common, + .cm = &cpu_clk.mux, + .delay_us = 1, /* > 8 clock cycles at 24 MHz */ + .bypass_index = 1, /* index of 24 MHz oscillator */ +}; + +static void __init sun8i_r40_ccu_setup(struct device_node *node) +{ + void __iomem *reg; + u32 val; + + reg = of_io_request_and_map(node, 0, of_node_full_name(node)); + if (IS_ERR(reg)) { + pr_err("%s: Could not map the clock registers\n", + of_node_full_name(node)); + return; + } + + /* Force the PLL-Audio-1x divider to 4 */ + val = readl(reg + SUN8I_R40_PLL_AUDIO_REG); + val &= ~GENMASK(19, 16); + writel(val | (3 << 16), reg + SUN8I_R40_PLL_AUDIO_REG); + + /* Force PLL-MIPI to MIPI mode */ + val = readl(reg + SUN8I_R40_PLL_MIPI_REG); + val &= ~BIT(16); + writel(val, reg + SUN8I_R40_PLL_MIPI_REG); + + /* Force OHCI 12M parent to 12M divided from 48M */ + val = readl(reg + SUN8I_R40_USB_CLK_REG); + val &= ~GENMASK(25, 20); + writel(val, reg + SUN8I_R40_USB_CLK_REG); + + sunxi_ccu_probe(node, reg, &sun8i_r40_ccu_desc); + + /* Gate then ungate PLL CPU after any rate changes */ + ccu_pll_notifier_register(&sun8i_r40_pll_cpu_nb); + + /* Reparent CPU during PLL CPU rate changes */ + ccu_mux_notifier_register(pll_cpu_clk.common.hw.clk, + &sun8i_r40_cpu_nb); +} +CLK_OF_DECLARE(sun8i_r40_ccu, "allwinner,sun8i-r40-ccu", + sun8i_r40_ccu_setup); diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-r40.h b/drivers/clk/sunxi-ng/ccu-sun8i-r40.h new file mode 100644 index 000000000000..0db8e1e97af8 --- /dev/null +++ b/drivers/clk/sunxi-ng/ccu-sun8i-r40.h @@ -0,0 +1,69 @@ +/* + * Copyright 2017 Icenowy Zheng + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _CCU_SUN8I_R40_H_ +#define _CCU_SUN8I_R40_H_ + +#include +#include + +#define CLK_OSC_12M 0 +#define CLK_PLL_CPU 1 +#define CLK_PLL_AUDIO_BASE 2 +#define CLK_PLL_AUDIO 3 +#define CLK_PLL_AUDIO_2X 4 +#define CLK_PLL_AUDIO_4X 5 +#define CLK_PLL_AUDIO_8X 6 +#define CLK_PLL_VIDEO0 7 +#define CLK_PLL_VIDEO0_2X 8 +#define CLK_PLL_VE 9 +#define CLK_PLL_DDR0 10 +#define CLK_PLL_PERIPH0 11 +#define CLK_PLL_PERIPH0_SATA 12 +#define CLK_PLL_PERIPH0_2X 13 +#define CLK_PLL_PERIPH1 14 +#define CLK_PLL_PERIPH1_2X 15 +#define CLK_PLL_VIDEO1 16 +#define CLK_PLL_VIDEO1_2X 17 +#define CLK_PLL_SATA 18 +#define CLK_PLL_SATA_OUT 19 +#define CLK_PLL_GPU 20 +#define CLK_PLL_MIPI 21 +#define CLK_PLL_DE 22 +#define CLK_PLL_DDR1 23 + +/* The CPU clock is exported */ + +#define CLK_AXI 25 +#define CLK_AHB1 26 +#define CLK_APB1 27 +#define CLK_APB2 28 + +/* All the bus gates are exported */ + +/* The first bunch of module clocks are exported */ + +#define CLK_DRAM 132 + +/* All the DRAM gates are exported */ + +/* Some more module clocks are exported */ + +#define CLK_MBUS 155 + +/* Another bunch of module clocks are exported */ + +#define CLK_NUMBER (CLK_OUTB + 1) + +#endif /* _CCU_SUN8I_R40_H_ */ diff --git a/include/dt-bindings/clock/sun8i-r40-ccu.h b/include/dt-bindings/clock/sun8i-r40-ccu.h new file mode 100644 index 000000000000..4fa5f69fc297 --- /dev/null +++ b/include/dt-bindings/clock/sun8i-r40-ccu.h @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2017 Icenowy Zheng + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _DT_BINDINGS_CLK_SUN8I_R40_H_ +#define _DT_BINDINGS_CLK_SUN8I_R40_H_ + +#define CLK_CPU 24 + +#define CLK_BUS_MIPI_DSI 29 +#define CLK_BUS_CE 30 +#define CLK_BUS_DMA 31 +#define CLK_BUS_MMC0 32 +#define CLK_BUS_MMC1 33 +#define CLK_BUS_MMC2 34 +#define CLK_BUS_MMC3 35 +#define CLK_BUS_NAND 36 +#define CLK_BUS_DRAM 37 +#define CLK_BUS_EMAC 38 +#define CLK_BUS_TS 39 +#define CLK_BUS_HSTIMER 40 +#define CLK_BUS_SPI0 41 +#define CLK_BUS_SPI1 42 +#define CLK_BUS_SPI2 43 +#define CLK_BUS_SPI3 44 +#define CLK_BUS_SATA 45 +#define CLK_BUS_OTG 46 +#define CLK_BUS_EHCI0 47 +#define CLK_BUS_EHCI1 48 +#define CLK_BUS_EHCI2 49 +#define CLK_BUS_OHCI0 50 +#define CLK_BUS_OHCI1 51 +#define CLK_BUS_OHCI2 52 +#define CLK_BUS_VE 53 +#define CLK_BUS_MP 54 +#define CLK_BUS_DEINTERLACE 55 +#define CLK_BUS_CSI0 56 +#define CLK_BUS_CSI1 57 +#define CLK_BUS_HDMI1 58 +#define CLK_BUS_HDMI0 59 +#define CLK_BUS_DE 60 +#define CLK_BUS_TVE0 61 +#define CLK_BUS_TVE1 62 +#define CLK_BUS_TVE_TOP 63 +#define CLK_BUS_GMAC 64 +#define CLK_BUS_GPU 65 +#define CLK_BUS_TVD0 66 +#define CLK_BUS_TVD1 67 +#define CLK_BUS_TVD2 68 +#define CLK_BUS_TVD3 69 +#define CLK_BUS_TVD_TOP 70 +#define CLK_BUS_TCON_LCD0 71 +#define CLK_BUS_TCON_LCD1 72 +#define CLK_BUS_TCON_TV0 73 +#define CLK_BUS_TCON_TV1 74 +#define CLK_BUS_TCON_TOP 75 +#define CLK_BUS_CODEC 76 +#define CLK_BUS_SPDIF 77 +#define CLK_BUS_AC97 78 +#define CLK_BUS_PIO 79 +#define CLK_BUS_IR0 80 +#define CLK_BUS_IR1 81 +#define CLK_BUS_THS 82 +#define CLK_BUS_KEYPAD 83 +#define CLK_BUS_I2S0 84 +#define CLK_BUS_I2S1 85 +#define CLK_BUS_I2S2 86 +#define CLK_BUS_I2C0 87 +#define CLK_BUS_I2C1 88 +#define CLK_BUS_I2C2 89 +#define CLK_BUS_I2C3 90 +#define CLK_BUS_CAN 91 +#define CLK_BUS_SCR 92 +#define CLK_BUS_PS20 93 +#define CLK_BUS_PS21 94 +#define CLK_BUS_I2C4 95 +#define CLK_BUS_UART0 96 +#define CLK_BUS_UART1 97 +#define CLK_BUS_UART2 98 +#define CLK_BUS_UART3 99 +#define CLK_BUS_UART4 100 +#define CLK_BUS_UART5 101 +#define CLK_BUS_UART6 102 +#define CLK_BUS_UART7 103 +#define CLK_BUS_DBG 104 + +#define CLK_THS 105 +#define CLK_NAND 106 +#define CLK_MMC0 107 +#define CLK_MMC1 108 +#define CLK_MMC2 109 +#define CLK_MMC3 110 +#define CLK_TS 111 +#define CLK_CE 112 +#define CLK_SPI0 113 +#define CLK_SPI1 114 +#define CLK_SPI2 115 +#define CLK_SPI3 116 +#define CLK_I2S0 117 +#define CLK_I2S1 118 +#define CLK_I2S2 119 +#define CLK_AC97 120 +#define CLK_SPDIF 121 +#define CLK_KEYPAD 122 +#define CLK_SATA 123 +#define CLK_USB_PHY0 124 +#define CLK_USB_PHY1 125 +#define CLK_USB_PHY2 126 +#define CLK_USB_OHCI0 127 +#define CLK_USB_OHCI1 128 +#define CLK_USB_OHCI2 129 +#define CLK_IR0 130 +#define CLK_IR1 131 + +#define CLK_DRAM_VE 133 +#define CLK_DRAM_CSI0 134 +#define CLK_DRAM_CSI1 135 +#define CLK_DRAM_TS 136 +#define CLK_DRAM_TVD 137 +#define CLK_DRAM_MP 138 +#define CLK_DRAM_DEINTERLACE 139 +#define CLK_DE 140 +#define CLK_MP 141 +#define CLK_TCON_LCD0 142 +#define CLK_TCON_LCD1 143 +#define CLK_TCON_TV0 144 +#define CLK_TCON_TV1 145 +#define CLK_DEINTERLACE 146 +#define CLK_CSI1_MCLK 147 +#define CLK_CSI_SCLK 148 +#define CLK_CSI0_MCLK 149 +#define CLK_VE 150 +#define CLK_CODEC 151 +#define CLK_AVS 152 +#define CLK_HDMI 153 +#define CLK_HDMI_SLOW 154 + +#define CLK_DSI_DPHY 156 +#define CLK_TVE0 157 +#define CLK_TVE1 158 +#define CLK_TVD0 159 +#define CLK_TVD1 160 +#define CLK_TVD2 161 +#define CLK_TVD3 162 +#define CLK_GPU 163 +#define CLK_OUTA 164 +#define CLK_OUTB 165 + +#endif /* _DT_BINDINGS_CLK_SUN8I_R40_H_ */ diff --git 
a/include/dt-bindings/reset/sun8i-r40-ccu.h b/include/dt-bindings/reset/sun8i-r40-ccu.h new file mode 100644 index 000000000000..c5ebcf6672e4 --- /dev/null +++ b/include/dt-bindings/reset/sun8i-r40-ccu.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2017 Icenowy Zheng + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _DT_BINDINGS_RST_SUN8I_R40_H_ +#define _DT_BINDINGS_RST_SUN8I_R40_H_ + +#define RST_USB_PHY0 0 +#define RST_USB_PHY1 1 +#define RST_USB_PHY2 2 + +#define RST_DRAM 3 +#define RST_MBUS 4 + +#define RST_BUS_MIPI_DSI 5 +#define RST_BUS_CE 6 +#define RST_BUS_DMA 7 +#define RST_BUS_MMC0 8 +#define RST_BUS_MMC1 9 +#define RST_BUS_MMC2 10 +#define RST_BUS_MMC3 11 +#define RST_BUS_NAND 12 +#define RST_BUS_DRAM 13 +#define RST_BUS_EMAC 14 +#define RST_BUS_TS 15 +#define RST_BUS_HSTIMER 16 +#define RST_BUS_SPI0 17 +#define RST_BUS_SPI1 18 +#define RST_BUS_SPI2 19 +#define RST_BUS_SPI3 20 +#define RST_BUS_SATA 21 +#define RST_BUS_OTG 22 +#define RST_BUS_EHCI0 23 +#define RST_BUS_EHCI1 24 +#define RST_BUS_EHCI2 25 +#define RST_BUS_OHCI0 26 +#define RST_BUS_OHCI1 27 +#define RST_BUS_OHCI2 28 +#define RST_BUS_VE 29 +#define RST_BUS_MP 30 +#define RST_BUS_DEINTERLACE 31 +#define RST_BUS_CSI0 32 +#define RST_BUS_CSI1 33 +#define RST_BUS_HDMI0 34 +#define RST_BUS_HDMI1 35 +#define RST_BUS_DE 36 +#define RST_BUS_TVE0 37 +#define RST_BUS_TVE1 38 +#define RST_BUS_TVE_TOP 39 +#define RST_BUS_GMAC 40 +#define RST_BUS_GPU 41 +#define RST_BUS_TVD0 42 +#define RST_BUS_TVD1 43 +#define RST_BUS_TVD2 44 +#define RST_BUS_TVD3 45 +#define RST_BUS_TVD_TOP 46 +#define RST_BUS_TCON_LCD0 47 +#define RST_BUS_TCON_LCD1 48 +#define RST_BUS_TCON_TV0 49 +#define RST_BUS_TCON_TV1 50 +#define RST_BUS_TCON_TOP 51 +#define RST_BUS_DBG 52 +#define RST_BUS_LVDS 53 +#define RST_BUS_CODEC 54 +#define RST_BUS_SPDIF 55 +#define RST_BUS_AC97 56 +#define RST_BUS_IR0 57 +#define RST_BUS_IR1 58 +#define RST_BUS_THS 59 +#define RST_BUS_KEYPAD 60 +#define RST_BUS_I2S0 61 +#define RST_BUS_I2S1 62 +#define RST_BUS_I2S2 63 +#define RST_BUS_I2C0 64 +#define RST_BUS_I2C1 65 +#define RST_BUS_I2C2 66 +#define RST_BUS_I2C3 67 +#define RST_BUS_CAN 68 +#define RST_BUS_SCR 69 +#define RST_BUS_PS20 70 +#define RST_BUS_PS21 71 +#define RST_BUS_I2C4 72 +#define RST_BUS_UART0 73 +#define RST_BUS_UART1 74 +#define RST_BUS_UART2 75 +#define RST_BUS_UART3 76 +#define RST_BUS_UART4 77 +#define RST_BUS_UART5 78 +#define RST_BUS_UART6 79 +#define RST_BUS_UART7 80 + +#endif /* _DT_BINDINGS_RST_SUN8I_R40_H_ */ -- cgit v1.2.3 From 99d1712bc41c7c9a5a473c104a4ad15427757b22 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2017 15:15:29 +0200 Subject: netfilter: exthdr: tcp option set support This allows setting 2 and 4 byte quantities in the tcp option space. Main purpose is to allow native replacement for xt_TCPMSS to work around pmtu blackholes. Writes to kind and len are not allowed at the moment; it does not seem useful to do this, as it causes corruption of the tcp option space. We can always lift this restriction later if a use-case appears.
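Because the option is patched in place, the TCP checksum must be updated incrementally rather than recomputed, which the diff below does with inet_proto_csum_replace2() and inet_proto_csum_replace4(). For reference, here is a standalone, runnable sketch of the underlying RFC 1624 arithmetic; none of it is code from this patch, and the sample values are made up.

#include <stdint.h>
#include <stdio.h>

/* RFC 1624, eqn. 3: HC' = ~(~HC + ~m + m') for a 16-bit field changing
 * from m to m'. Byte-order handling is omitted here; the kernel helpers
 * operate on __be16/__be32 quantities but use the same arithmetic.
 */
static uint16_t csum_update16(uint16_t csum, uint16_t old, uint16_t new)
{
	uint32_t sum = (uint16_t)~csum;

	sum += (uint16_t)~old;
	sum += new;
	while (sum >> 16)	/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);

	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t csum = 0x1c46;	/* made-up original checksum */

	/* e.g. an MSS option clamped from 1460 down to 1360 */
	printf("0x%04x -> 0x%04x\n", csum, csum_update16(csum, 1460, 1360));
	return 0;
}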
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +- net/netfilter/nft_exthdr.c | 164 ++++++++++++++++++++++++++++++- 2 files changed, 165 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index be25cf69295b..40fd199f7531 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -732,7 +732,8 @@ enum nft_exthdr_op { * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) - * @NFTA_EXTHDR_OP: option match type (NLA_U8) + * @NFTA_EXTHDR_OP: option match type (NLA_U32) + * @NFTA_EXTHDR_SREG: option match type (NLA_U32) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -742,6 +743,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_LEN, NFTA_EXTHDR_FLAGS, NFTA_EXTHDR_OP, + NFTA_EXTHDR_SREG, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index e3a6eebe7e0c..f5a0bf5e3bdd 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -8,6 +8,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include #include #include #include @@ -23,6 +24,7 @@ struct nft_exthdr { u8 len; u8 op; enum nft_registers dreg:8; + enum nft_registers sreg:8; u8 flags; }; @@ -124,6 +126,88 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, optl, tcphdr_len, offset; + struct tcphdr *tcph; + u8 *opt; + u32 src; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); + if (!tcph) + return; + + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + union { + u8 octet; + __be16 v16; + __be32 v32; + } old, new; + + optl = optlen(opt, i); + + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len || priv->len + priv->offset > optl) + return; + + if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + return; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, + &tcphdr_len); + if (!tcph) + return; + + src = regs->data[priv->sreg]; + offset = i + priv->offset; + + switch (priv->len) { + case 2: + old.v16 = get_unaligned((u16 *)(opt + offset)); + new.v16 = src; + + switch (priv->type) { + case TCPOPT_MSS: + /* increase can cause connection to stall */ + if (ntohs(old.v16) <= ntohs(new.v16)) + return; + break; + } + + if (old.v16 == new.v16) + return; + + put_unaligned(new.v16, (u16*)(opt + offset)); + inet_proto_csum_replace2(&tcph->check, pkt->skb, + old.v16, new.v16, false); + break; + case 4: + new.v32 = src; + old.v32 = get_unaligned((u32 *)(opt + offset)); + + if (old.v32 == new.v32) + return; + + put_unaligned(new.v32, (u32*)(opt + offset)); + inet_proto_csum_replace4(&tcph->check, pkt->skb, + old.v32, new.v32, false); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return; + } +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -180,6 +264,55 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, priv->len); } +static int 
nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; + int err; + + if (!tb[NFTA_EXTHDR_SREG] || + !tb[NFTA_EXTHDR_TYPE] || + !tb[NFTA_EXTHDR_OFFSET] || + !tb[NFTA_EXTHDR_LEN]) + return -EINVAL; + + if (tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS]) + return -EINVAL; + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); + if (err < 0) + return err; + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); + if (err < 0) + return err; + + if (offset < 2) + return -EOPNOTSUPP; + + switch (len) { + case 2: break; + case 4: break; + default: + return -EOPNOTSUPP; + } + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); + if (err < 0) + return err; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->offset = offset; + priv->len = len; + priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); + priv->flags = flags; + priv->op = op; + + return nft_validate_register_load(priv->sreg, priv->len); +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -208,6 +341,16 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } +static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_EXTHDR_SREG, priv->sreg)) + return -1; + + return nft_exthdr_dump_common(skb, priv); +} + static struct nft_expr_type nft_exthdr_type; static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, @@ -225,6 +368,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_set_eval, + .init = nft_exthdr_tcp_set_init, + .dump = nft_exthdr_dump_set, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -234,12 +385,21 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (!tb[NFTA_EXTHDR_OP]) return &nft_exthdr_ipv6_ops; + if (tb[NFTA_EXTHDR_SREG] && tb[NFTA_EXTHDR_DREG]) + return ERR_PTR(-EOPNOTSUPP); + op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP])); switch (op) { case NFT_EXTHDR_OP_TCPOPT: - return &nft_exthdr_tcp_ops; + if (tb[NFTA_EXTHDR_SREG]) + return &nft_exthdr_tcp_set_ops; + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_tcp_ops; + break; case NFT_EXTHDR_OP_IPV6: - return &nft_exthdr_ipv6_ops; + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv6_ops; + break; } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From 6b5dc98e8fac041a3decfc3186e08c1c570ea691 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2017 15:48:04 +0200 Subject: netfilter: rt: add support to fetch path mss to be used in combination with tcp option set support to mimic iptables TCPMSS --clamp-mss-to-pmtu. v2: Eric Dumazet points out dst must be initialized. 
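Combined with the previous patch, this yields a native clamp-to-pmtu
rule; the "rt mss" keywords are the nftables userspace name assumed
here for NFT_RT_TCPMSS:

  nft add rule inet filter forward tcp flags syn tcp option maxseg size set rt mss

get_tcpmss() guards the stored value: when the routed MTU is unusable
(mtu <= minlen or mtu > 0xffff) it falls back to TCP_MSS_DEFAULT, so
the source register always carries a sane 16-bit MSS.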
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nft_rt.c | 66 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 40fd199f7531..b49da72efa68 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -811,11 +811,13 @@ enum nft_meta_keys { * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid) * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 + * @NFT_RT_TCPMSS: fetch current path tcp mss */ enum nft_rt_keys { NFT_RT_CLASSID, NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, + NFT_RT_TCPMSS, }; /** diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index c7383d8f88d0..e142e65d3176 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -23,6 +23,42 @@ struct nft_rt { enum nft_registers dreg:8; }; +static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) +{ + u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst); + const struct sk_buff *skb = pkt->skb; + const struct nf_afinfo *ai; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ip_hdr(skb)->saddr; + minlen = sizeof(struct iphdr); + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ipv6_hdr(skb)->saddr; + break; + } + + ai = nf_get_afinfo(nft_pf(pkt)); + if (ai) { + struct dst_entry *dst = NULL; + + ai->route(nft_net(pkt), &dst, &fl, false); + if (dst) { + mtu = min(mtu, dst_mtu(dst)); + dst_release(dst); + } + } + + if (mtu <= minlen || mtu > 0xffff) + return TCP_MSS_DEFAULT; + + return mtu - minlen; +} + static void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -57,6 +93,9 @@ static void nft_rt_get_eval(const struct nft_expr *expr, &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; + case NFT_RT_TCPMSS: + nft_reg_store16(dest, get_tcpmss(pkt, dst)); + break; default: WARN_ON(1); goto err; @@ -94,6 +133,9 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, case NFT_RT_NEXTHOP6: len = sizeof(struct in6_addr); break; + case NFT_RT_TCPMSS: + len = sizeof(u16); + break; default: return -EOPNOTSUPP; } @@ -118,6 +160,29 @@ nla_put_failure: return -1; } +static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_rt *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->key) { + case NFT_RT_NEXTHOP4: + case NFT_RT_NEXTHOP6: + case NFT_RT_CLASSID: + return 0; + case NFT_RT_TCPMSS: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING); + break; + default: + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + static struct nft_expr_type nft_rt_type; static const struct nft_expr_ops nft_rt_get_ops = { .type = &nft_rt_type, @@ -125,6 +190,7 @@ static const struct nft_expr_ops nft_rt_get_ops = { .eval = nft_rt_get_eval, .init = nft_rt_get_init, .dump = nft_rt_get_dump, + .validate = nft_rt_validate, }; static struct nft_expr_type nft_rt_type __read_mostly = { -- cgit v1.2.3 From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 18 Aug 2017 11:28:00 -0700 Subject: bpf: Allow selecting numa node during map creation The current map creation API does not allow to 
provide the numa-node preference. The memory usually comes from where the map-creation-process is running. The performance is not ideal if the bpf_prog is known to always run in a numa node different from the map-creation-process. One of the use case is sharding on CPU to different LRU maps (i.e. an array of LRU maps). Here is the test result of map_perf_test on the INNER_LRU_HASH_PREALLOC test if we force the lru map used by CPU0 to be allocated from a remote numa node: [ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ] ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec 4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec 3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec 6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec 2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec 1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec 7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec 0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec #<<< After specifying numa node: ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec 3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec 1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec 6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec 2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec 4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec 7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec 0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<< This patch adds one field, numa_node, to the bpf_attr. Since numa node 0 is a valid node, a new flag BPF_F_NUMA_NODE is also added. The numa_node field is honored if and only if the BPF_F_NUMA_NODE flag is set. Numa node selection is not supported for percpu map. This patch does not change all the kmalloc. F.e. 'htab = kzalloc()' is not changed since the object is small enough to stay in the cache. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 +++++++++- include/uapi/linux/bpf.h | 10 +++++++++- kernel/bpf/arraymap.c | 7 +++++-- kernel/bpf/devmap.c | 9 ++++++--- kernel/bpf/hashtab.c | 19 +++++++++++++++---- kernel/bpf/lpm_trie.c | 9 +++++++-- kernel/bpf/sockmap.c | 10 +++++++--- kernel/bpf/stackmap.c | 8 +++++--- kernel/bpf/syscall.c | 14 ++++++++++---- 9 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1cc6c5ff61ec..55b88e329804 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,7 @@ struct bpf_map { u32 map_flags; u32 pages; u32 id; + int numa_node; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -264,7 +265,7 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); -void *bpf_map_area_alloc(size_t size); +void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; @@ -316,6 +317,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +/* Return map's numa specified by userspace */ +static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) +{ + return (attr->map_flags & BPF_F_NUMA_NODE) ? + attr->numa_node : NUMA_NO_NODE; +} + #else static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ecbe812a2cc..843818dff96d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_attach_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +/* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list @@ -173,6 +174,8 @@ enum bpf_attach_type { * across different LRU lists. */ #define BPF_F_NO_COMMON_LRU (1U << 1) +/* Specify numa node during map creation */ +#define BPF_F_NUMA_NODE (1U << 2) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ @@ -180,8 +183,13 @@ union bpf_attr { __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ - __u32 map_flags; /* prealloc or not */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). 
+ */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d771a3872500..96e9c5c1dfc9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_array *array; u64 array_size; u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags) + attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) @@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size); + array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); @@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; array->map.map_flags = attr->map_flags; + array->map.numa_node = numa_node; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 18a72a8add43..67f4f00ce33a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -80,7 +80,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags) + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); @@ -93,6 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) dtab->map.value_size = attr->value_size; dtab->map.max_entries = attr->max_entries; dtab->map.map_flags = attr->map_flags; + dtab->map.numa_node = bpf_map_attr_numa_node(attr); err = -ENOMEM; @@ -119,7 +120,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) goto free_dtab; dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *)); + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); if (!dtab->netdev_map) goto free_dtab; @@ -344,7 +346,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); if (!dev) return -ENOMEM; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..47ae748c3a49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,6 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab_is_percpu(htab) && !htab_is_lru(htab)) num_entries += num_possible_cpus(); - htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, + htab->map.numa_node); if (!htab->elems) return -ENOMEM; @@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr 
*attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ return ERR_PTR(-EPERM); - if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); @@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (lru && !prealloc) return ERR_PTR(-ENOTSUPP); + if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; htab->map.map_flags = attr->map_flags; + htab->map.numa_node = numa_node; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * - sizeof(struct bucket)); + sizeof(struct bucket), + htab->map.numa_node); if (!htab->buckets) goto free_htab; @@ -689,7 +699,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!l_new) return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b09185f0f17d..1b767844a76f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, + trie->map.numa_node); if (!node) return NULL; @@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) + static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; @@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || - attr->map_flags != BPF_F_NO_PREALLOC || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->map_flags & ~LPM_CREATE_FLAG_MASK || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || @@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; trie->map.map_flags = attr->map_flags; + trie->map.numa_node = bpf_map_attr_numa_node(attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 39de541fbcdc..78b2bb9370ac 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -443,7 +443,9 @@ static struct smap_psock *smap_init_psock(struct sock *sock, { struct smap_psock *psock; - psock = 
kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN); + psock = kzalloc_node(sizeof(struct smap_psock), + GFP_ATOMIC | __GFP_NOWARN, + stab->map.numa_node); if (!psock) return ERR_PTR(-ENOMEM); @@ -465,7 +467,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags) + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) @@ -481,6 +483,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) stab->map.value_size = attr->value_size; stab->map.max_entries = attr->max_entries; stab->map.map_flags = attr->map_flags; + stab->map.numa_node = bpf_map_attr_numa_node(attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -495,7 +498,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *)); + sizeof(struct sock *), + stab->map.numa_node); if (!stab->sock_map) goto free_stab; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 31147d730abf..135be433e9a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; int err; - smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries); + smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, + smap->map.numa_node); if (!smap->elems) return -ENOMEM; @@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags) + if (attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); /* check sanity of attributes */ @@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); - smap = bpf_map_area_alloc(cost); + smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); @@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b8cb1b3c9bfb..9378f3ba2cbf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -105,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size) +void *bpf_map_area_alloc(size_t size, int numa_node) { /* We definitely need __GFP_NORETRY, so OOM killer doesn't * trigger under memory pressure as we really just want to @@ -115,12 +115,13 @@ void *bpf_map_area_alloc(size_t size) void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc(size, GFP_USER | flags); + area = kmalloc_node(size, GFP_USER | flags, numa_node); if (area != NULL) return area; } - return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); + return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, + __builtin_return_address(0)); } void bpf_map_area_free(void *area) @@ -309,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, 
CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd +#define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) { + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; int err; @@ -320,6 +322,10 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (numa_node != NUMA_NO_NODE && + (numa_node >= nr_node_ids || !node_online(numa_node))) + return -EINVAL; + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map = find_and_alloc_map(attr); if (IS_ERR(map)) -- cgit v1.2.3 From d6e1e46f69fbe956e877cdd00dbfb002baddf577 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 19 Aug 2017 23:34:03 -0700 Subject: bpf: linux/bpf.h needs linux/numa.h Reported-by: kbuild test robot Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 55b88e329804..830f472d8df5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,6 +14,7 @@ #include #include #include +#include struct perf_event; struct bpf_prog; -- cgit v1.2.3 From 5405fa26c25e18ab735daddc46c8a7a78f138f70 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 15 Jun 2017 18:29:23 +0300 Subject: net/mlx5: Add PCIe outbound stalls counters infrastructure Add capability bit in MCAM register and counters to MPCNT register. Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c99daffc3c3c..ba533b39c885 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1854,7 +1854,17 @@ struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits { u8 crc_error_tlp[0x20]; - u8 reserved_at_140[0x680]; + u8 reserved_at_140[0x40]; + + u8 outbound_stalled_reads[0x20]; + + u8 outbound_stalled_writes[0x20]; + + u8 outbound_stalled_reads_events[0x20]; + + u8 outbound_stalled_writes_events[0x20]; + + u8 reserved_at_200[0x5c0]; }; struct mlx5_ifc_cmd_inter_comp_event_bits { @@ -7744,8 +7754,9 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x7d]; - + u8 reserved_at_0[0x7b]; + u8 pcie_outbound_stalled[0x1]; + u8 reserved_at_7c[0x1]; u8 mtpps_enh_out_per_adj[0x1]; u8 mtpps_fs[0x1]; u8 pcie_performance_group[0x1]; -- cgit v1.2.3 From 2dba07975acbc30c77a22d38030ee5a86ab8a748 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 18 Jun 2017 14:56:45 +0300 Subject: net/mlx5: Add RX buffer fullness counters infrastructure Add capability bit in PCAM register and counters to PPCNT register. 
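As a reading aid for the mlx5_ifc layouts touched here: every field is
declared as u8 name[width-in-bits], and gaps encode their bit offset in
the reserved_at_ name, so splitting a reservation must keep the widths
summing to the original. For the two hunks below:

  0x7b + 0x1 = 0x7c                 (pcam_enhanced_features, old reserved_at_0[0x7c])
  0x100 + 4 * 0x20 + 0x600 = 0x780  (eth_extended counters, old reserved_at_40[0x780])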
Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ba533b39c885..cf7ff52c594e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1538,7 +1538,17 @@ struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits { u8 port_transmit_wait_low[0x20]; - u8 reserved_at_40[0x780]; + u8 reserved_at_40[0x100]; + + u8 rx_buffer_almost_full_high[0x20]; + + u8 rx_buffer_almost_full_low[0x20]; + + u8 rx_buffer_full_high[0x20]; + + u8 rx_buffer_full_low[0x20]; + + u8 reserved_at_1c0[0x600]; }; struct mlx5_ifc_eth_3635_cntrs_grp_data_layout_bits { @@ -7723,8 +7733,9 @@ struct mlx5_ifc_peir_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x7c]; + u8 reserved_at_0[0x7b]; + u8 rx_buffer_fullness_counters[0x1]; u8 ptys_connector_type[0x1]; u8 reserved_at_7d[0x1]; u8 ppcnt_discard_group[0x1]; -- cgit v1.2.3 From efae7f78c45ba37bdc23a95d219b59ac85bdd0a7 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Fri, 12 May 2017 02:47:02 +0300 Subject: net/mlx5e: Add outbound PCI buffer overflow counter Add outbound_pci_buffer_overflow to ethtool output for monitoring the number of packets that were dropped due to lack of PCIe buffers on receive path from NIC port toward the host(s). This counter is valid only in case that tx_overflow_buffer_pkt is supported in MCAM enhanced features. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 12 ++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 14 ++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 6 ++++-- 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 8c013a521319..d453a11f41fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -250,9 +250,13 @@ static void mlx5e_fill_stats_strings(struct mlx5e_priv *priv, uint8_t *data) strcpy(data + (idx++) * ETH_GSTRING_LEN, pcie_perf_stats_desc[i].format); - for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++) + for (i = 0; i < NUM_PCIE_PERF_COUNTERS64(priv); i++) strcpy(data + (idx++) * ETH_GSTRING_LEN, - pcie_perf_stall_stats_desc[i].format); + pcie_perf_stats_desc64[i].format); + + for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++) + strcpy(data + (idx++) * ETH_GSTRING_LEN, + pcie_perf_stall_stats_desc[i].format); for (prio = 0; prio < NUM_PPORT_PRIO; prio++) { for (i = 0; i < NUM_PPORT_PER_PRIO_TRAFFIC_COUNTERS; i++) @@ -389,6 +393,10 @@ void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv, data[idx++] = MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters, pcie_perf_stats_desc, i); + for (i = 0; i < NUM_PCIE_PERF_COUNTERS64(priv); i++) + data[idx++] = MLX5E_READ_CTR64_BE(&priv->stats.pcie.pcie_perf_counters, + pcie_perf_stats_desc64, i); + for (i = 0; i < NUM_PCIE_PERF_STALL_COUNTERS(priv); i++) data[idx++] = MLX5E_READ_CTR32_BE(&priv->stats.pcie.pcie_perf_counters, pcie_perf_stall_stats_desc, i); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index be49df4bedd9..40b5c73e5e26 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -307,6 +307,12 @@ static const struct counter_desc pport_eth_ext_stats_desc[] = { MLX5_GET(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \ counter_set.pcie_perf_cntrs_grp_data_layout.c) +#define PCIE_PERF_OFF64(c) \ + MLX5_BYTE_OFF(mpcnt_reg, counter_set.pcie_perf_cntrs_grp_data_layout.c##_high) +#define PCIE_PERF_GET64(pcie_stats, c) \ + MLX5_GET64(mpcnt_reg, (pcie_stats)->pcie_perf_counters, \ + counter_set.pcie_perf_cntrs_grp_data_layout.c##_high) + struct mlx5e_pcie_stats { __be64 pcie_perf_counters[MLX5_ST_SZ_QW(mpcnt_reg)]; }; @@ -316,6 +322,10 @@ static const struct counter_desc pcie_perf_stats_desc[] = { { "tx_pci_signal_integrity", PCIE_PERF_OFF(tx_errors) }, }; +static const struct counter_desc pcie_perf_stats_desc64[] = { + { "outbound_pci_buffer_overflow", PCIE_PERF_OFF64(tx_overflow_buffer_pkt) }, +}; + static const struct counter_desc pcie_perf_stall_stats_desc[] = { { "outbound_pci_stalled_rd", PCIE_PERF_OFF(outbound_stalled_reads) }, { "outbound_pci_stalled_wr", PCIE_PERF_OFF(outbound_stalled_writes) }, @@ -415,6 +425,9 @@ static const struct counter_desc sq_stats_desc[] = { #define NUM_PCIE_PERF_COUNTERS(priv) \ (ARRAY_SIZE(pcie_perf_stats_desc) * \ MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_performance_group)) +#define NUM_PCIE_PERF_COUNTERS64(priv) \ + (ARRAY_SIZE(pcie_perf_stats_desc64) * \ + MLX5_CAP_MCAM_FEATURE((priv)->mdev, tx_overflow_buffer_pkt)) #define NUM_PCIE_PERF_STALL_COUNTERS(priv) \ (ARRAY_SIZE(pcie_perf_stall_stats_desc) * \ MLX5_CAP_MCAM_FEATURE((priv)->mdev, pcie_outbound_stalled)) @@ -433,6 +446,7 @@ static const struct counter_desc sq_stats_desc[] = { NUM_PPORT_PRIO + \ NUM_PPORT_ETH_EXT_COUNTERS(priv)) #define NUM_PCIE_COUNTERS(priv) (NUM_PCIE_PERF_COUNTERS(priv) + \ + NUM_PCIE_PERF_COUNTERS64(priv) +\ NUM_PCIE_PERF_STALL_COUNTERS(priv)) #define NUM_RQ_STATS ARRAY_SIZE(rq_stats_desc) #define NUM_SQ_STATS ARRAY_SIZE(sq_stats_desc) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cf7ff52c594e..ae7d09b9c52f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1864,7 +1864,9 @@ struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits { u8 crc_error_tlp[0x20]; - u8 reserved_at_140[0x40]; + u8 tx_overflow_buffer_pkt_high[0x20]; + + u8 tx_overflow_buffer_pkt_low[0x20]; u8 outbound_stalled_reads[0x20]; @@ -7767,7 +7769,7 @@ struct mlx5_ifc_pcam_reg_bits { struct mlx5_ifc_mcam_enhanced_features_bits { u8 reserved_at_0[0x7b]; u8 pcie_outbound_stalled[0x1]; - u8 reserved_at_7c[0x1]; + u8 tx_overflow_buffer_pkt[0x1]; u8 mtpps_enh_out_per_adj[0x1]; u8 mtpps_fs[0x1]; u8 pcie_performance_group[0x1]; -- cgit v1.2.3 From 67c672ecd3a535f00efea18baac81983a8956084 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 12 Aug 2017 05:57:05 -0400 Subject: media: v4l2-ctrls.h: better document the arguments for v4l2_ctrl_fill The arguments for this function are pointers. Make it clear at its documentation. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-ctrls.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 2d2aed56922f..dacfe54057f8 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -340,17 +340,17 @@ struct v4l2_ctrl_config { * v4l2_ctrl_fill - Fill in the control fields based on the control ID. 
* * @id: ID of the control - * @name: name of the control - * @type: type of the control - * @min: minimum value for the control - * @max: maximum value for the control - * @step: control step - * @def: default value for the control - * @flags: flags to be used on the control + * @name: pointer to be filled with a string with the name of the control + * @type: pointer for storing the type of the control + * @min: pointer for storing the minimum value for the control + * @max: pointer for storing the maximum value for the control + * @step: pointer for storing the control step + * @def: pointer for storing the default value for the control + * @flags: pointer for storing the flags to be used on the control * * This works for all standard V4L2 controls. * For non-standard controls it will only fill in the given arguments - * and @name will be %NULL. + * and @name content will be set to %NULL. * * This function will overwrite the contents of @name, @type and @flags. * The contents of @min, @max, @step and @def may be modified depending on -- cgit v1.2.3 From 9a6b2a87405a5022660022722d4a830b768e8033 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 15 Aug 2017 15:26:25 -0400 Subject: media: cec: rename pin events/function The CEC_EVENT_PIN_LOW/HIGH defines and the cec_queue_pin_event() function did not specify that these were about CEC pin events. Since in the future there will also be HPD pin events it is wise to rename the event defines and function to CEC_EVENT_PIN_CEC_LOW/HIGH and cec_queue_pin_cec_event() now before these become part of the ABI. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst | 2 +- Documentation/media/uapi/cec/cec-ioc-dqevent.rst | 8 ++++---- Documentation/media/uapi/cec/cec-ioc-g-mode.rst | 2 +- drivers/media/cec/cec-adap.c | 7 ++++--- drivers/media/cec/cec-api.c | 4 ++-- drivers/media/cec/cec-pin.c | 5 +++-- include/media/cec.h | 9 +++++---- include/uapi/linux/cec.h | 4 ++-- 8 files changed, 22 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst b/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst index 0a7aa21f24f4..6c1f6efb822e 100644 --- a/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst +++ b/Documentation/media/uapi/cec/cec-ioc-adap-g-caps.rst @@ -127,7 +127,7 @@ returns the information to the application. The ioctl never fails. - 0x00000080 - The CEC hardware can monitor CEC pin changes from low to high voltage and vice versa. When in pin monitoring mode the application will - receive ``CEC_EVENT_PIN_LOW`` and ``CEC_EVENT_PIN_HIGH`` events. + receive ``CEC_EVENT_PIN_CEC_LOW`` and ``CEC_EVENT_PIN_CEC_HIGH`` events. diff --git a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst index 766d8b0ce431..db615e3405c0 100644 --- a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst +++ b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst @@ -146,16 +146,16 @@ it is guaranteed that the state did change in between the two events. - 2 - Generated if one or more CEC messages were lost because the application didn't dequeue CEC messages fast enough. - * .. _`CEC-EVENT-PIN-LOW`: + * .. _`CEC-EVENT-PIN-CEC-LOW`: - - ``CEC_EVENT_PIN_LOW`` + - ``CEC_EVENT_PIN_CEC_LOW`` - 3 - Generated if the CEC pin goes from a high voltage to a low voltage. Only applies to adapters that have the ``CEC_CAP_MONITOR_PIN`` capability set. - * .. _`CEC-EVENT-PIN-HIGH`: + * .. 
_`CEC-EVENT-PIN-CEC-HIGH`: - - ``CEC_EVENT_PIN_HIGH`` + - ``CEC_EVENT_PIN_CEC_HIGH`` - 4 - Generated if the CEC pin goes from a low voltage to a high voltage. Only applies to adapters that have the ``CEC_CAP_MONITOR_PIN`` diff --git a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst index 494154e9d449..4d8e0647e832 100644 --- a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst +++ b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst @@ -159,7 +159,7 @@ Available follower modes are: This mode requires that the :ref:`CEC_CAP_MONITOR_PIN ` capability is set, otherwise the ``EINVAL`` error code is returned. While in pin monitoring mode this file descriptor can receive the - ``CEC_EVENT_PIN_LOW`` and ``CEC_EVENT_PIN_HIGH`` events to see the + ``CEC_EVENT_PIN_CEC_LOW`` and ``CEC_EVENT_PIN_CEC_HIGH`` events to see the low-level CEC pin transitions. This is very useful for debugging. This mode is only allowed if the process has the ``CAP_NET_ADMIN`` capability. If that is not set, then the ``EPERM`` error code is returned. diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 8ac39ddf892c..d9adeb505c09 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -154,10 +154,11 @@ static void cec_queue_event(struct cec_adapter *adap, } /* Notify userspace that the CEC pin changed state at the given time. */ -void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts) +void cec_queue_pin_cec_event(struct cec_adapter *adap, bool is_high, ktime_t ts) { struct cec_event ev = { - .event = is_high ? CEC_EVENT_PIN_HIGH : CEC_EVENT_PIN_LOW, + .event = is_high ? CEC_EVENT_PIN_CEC_HIGH : + CEC_EVENT_PIN_CEC_LOW, }; struct cec_fh *fh; @@ -167,7 +168,7 @@ void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts) cec_queue_event_fh(fh, &ev, ktime_to_ns(ts)); mutex_unlock(&adap->devnode.lock); } -EXPORT_SYMBOL_GPL(cec_queue_pin_event); +EXPORT_SYMBOL_GPL(cec_queue_pin_cec_event); /* * Queue a new message for this filehandle. diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c index 00d43d74020f..87649b0c6381 100644 --- a/drivers/media/cec/cec-api.c +++ b/drivers/media/cec/cec-api.c @@ -449,8 +449,8 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, .flags = CEC_EVENT_FL_INITIAL_STATE, }; - ev.event = adap->pin->cur_value ? CEC_EVENT_PIN_HIGH : - CEC_EVENT_PIN_LOW; + ev.event = adap->pin->cur_value ? 
CEC_EVENT_PIN_CEC_HIGH : + CEC_EVENT_PIN_CEC_LOW; cec_queue_event_fh(fh, &ev, 0); #endif adap->monitor_pin_cnt++; diff --git a/drivers/media/cec/cec-pin.c b/drivers/media/cec/cec-pin.c index 03f800e5ec1f..31a26d3b8bd8 100644 --- a/drivers/media/cec/cec-pin.c +++ b/drivers/media/cec/cec-pin.c @@ -609,8 +609,9 @@ static int cec_pin_thread_func(void *_adap) while (atomic_read(&pin->work_pin_events)) { unsigned int idx = pin->work_pin_events_rd; - cec_queue_pin_event(adap, pin->work_pin_is_high[idx], - pin->work_pin_ts[idx]); + cec_queue_pin_cec_event(adap, + pin->work_pin_is_high[idx], + pin->work_pin_ts[idx]); pin->work_pin_events_rd = (idx + 1) % CEC_NUM_PIN_EVENTS; atomic_dec(&pin->work_pin_events); } diff --git a/include/media/cec.h b/include/media/cec.h index 1bec7bde4792..224359c9941a 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -91,7 +91,7 @@ struct cec_event_entry { }; #define CEC_NUM_CORE_EVENTS 2 -#define CEC_NUM_EVENTS CEC_EVENT_PIN_HIGH +#define CEC_NUM_EVENTS CEC_EVENT_PIN_CEC_HIGH struct cec_fh { struct list_head list; @@ -280,14 +280,15 @@ static inline void cec_received_msg(struct cec_adapter *adap, } /** - * cec_queue_pin_event() - queue a pin event with a given timestamp. + * cec_queue_pin_cec_event() - queue a CEC pin event with a given timestamp. * * @adap: pointer to the cec adapter - * @is_high: when true the pin is high, otherwise it is low + * @is_high: when true the CEC pin is high, otherwise it is low * @ts: the timestamp for this event * */ -void cec_queue_pin_event(struct cec_adapter *adap, bool is_high, ktime_t ts); +void cec_queue_pin_cec_event(struct cec_adapter *adap, + bool is_high, ktime_t ts); /** * cec_get_edid_phys_addr() - find and return the physical address diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index d87a67b0bb06..4351c3481aea 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -408,8 +408,8 @@ struct cec_log_addrs { * didn't empty the message queue in time */ #define CEC_EVENT_LOST_MSGS 2 -#define CEC_EVENT_PIN_LOW 3 -#define CEC_EVENT_PIN_HIGH 4 +#define CEC_EVENT_PIN_CEC_LOW 3 +#define CEC_EVENT_PIN_CEC_HIGH 4 #define CEC_EVENT_FL_INITIAL_STATE (1 << 0) #define CEC_EVENT_FL_DROPPED_EVENTS (1 << 1) -- cgit v1.2.3 From cb7474949371ba8d7591a62924f05b627a2601d9 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 16 Aug 2017 03:13:02 -0400 Subject: media: cec-pin: fix irq handling The free_irq() function could be called from interrupt context, which is invalid. Move this to the thread. In the interrupt handler we just request that the thread disables the irq. This is done through an atomic so we don't need to add any spinlocks. 
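The handoff is easy to get wrong, so here is a minimal, self-contained
userspace sketch of the pattern (plain C11 atomics standing in for the
kernel's atomic_t; all names are illustrative, not the driver's): the
interrupt side only posts a request, and the thread consumes it with an
exchange, so each request is acted on exactly once and the blocking
work runs in a context that may sleep.

  #include <stdatomic.h>
  #include <stdio.h>

  enum { IRQ_UNCHANGED, IRQ_DISABLE, IRQ_ENABLE };

  static atomic_int irq_change = IRQ_UNCHANGED;

  /* "interrupt context": must not block, so only post the request */
  static void fake_irq_handler(void)
  {
          atomic_store(&irq_change, IRQ_DISABLE);
  }

  /* "thread context": consume and reset in one atomic step */
  static void thread_iteration(void)
  {
          switch (atomic_exchange(&irq_change, IRQ_UNCHANGED)) {
          case IRQ_DISABLE:
                  printf("thread: disable irq, free_irq() may sleep here\n");
                  break;
          case IRQ_ENABLE:
                  printf("thread: (re)enable irq\n");
                  break;
          default:
                  break;  /* nothing requested since last pass */
          }
  }

  int main(void)
  {
          fake_irq_handler();
          thread_iteration();     /* performs the disable */
          thread_iteration();     /* no-op: request already consumed */
          return 0;
  }

This mirrors the atomic_xchg(&pin->work_irq_change,
CEC_PIN_IRQ_UNCHANGED) switch added to cec_pin_thread_func() below.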
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-pin.c | 34 +++++++++++++++++++++------------- include/media/cec-pin.h | 6 +++++- 2 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-pin.c b/drivers/media/cec/cec-pin.c index 31a26d3b8bd8..970774de3dcd 100644 --- a/drivers/media/cec/cec-pin.c +++ b/drivers/media/cec/cec-pin.c @@ -557,7 +557,7 @@ static enum hrtimer_restart cec_pin_timer(struct hrtimer *timer) adap->is_configured || adap->monitor_all_cnt) break; /* Switch to interrupt mode */ - pin->work_enable_irq = true; + atomic_set(&pin->work_irq_change, CEC_PIN_IRQ_ENABLE); pin->state = CEC_ST_RX_IRQ; wake_up_interruptible(&pin->kthread_waitq); return HRTIMER_NORESTART; @@ -591,7 +591,7 @@ static int cec_pin_thread_func(void *_adap) kthread_should_stop() || pin->work_rx_msg.len || pin->work_tx_status || - pin->work_enable_irq || + atomic_read(&pin->work_irq_change) || atomic_read(&pin->work_pin_events)); if (pin->work_rx_msg.len) { @@ -606,6 +606,7 @@ static int cec_pin_thread_func(void *_adap) cec_transmit_attempt_done_ts(adap, tx_status, pin->work_tx_ts); } + while (atomic_read(&pin->work_pin_events)) { unsigned int idx = pin->work_pin_events_rd; @@ -615,14 +616,26 @@ static int cec_pin_thread_func(void *_adap) pin->work_pin_events_rd = (idx + 1) % CEC_NUM_PIN_EVENTS; atomic_dec(&pin->work_pin_events); } - if (pin->work_enable_irq) { - pin->work_enable_irq = false; + + switch (atomic_xchg(&pin->work_irq_change, + CEC_PIN_IRQ_UNCHANGED)) { + case CEC_PIN_IRQ_DISABLE: + pin->ops->disable_irq(adap); + cec_pin_high(pin); + cec_pin_to_idle(pin); + hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); + break; + case CEC_PIN_IRQ_ENABLE: pin->enable_irq_failed = !pin->ops->enable_irq(adap); if (pin->enable_irq_failed) { cec_pin_to_idle(pin); hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); } + break; + default: + break; } + if (kthread_should_stop()) break; } @@ -641,7 +654,7 @@ static int cec_pin_adap_enable(struct cec_adapter *adap, bool enable) cec_pin_to_idle(pin); pin->tx_msg.len = 0; pin->timer_ts = 0; - pin->work_enable_irq = false; + atomic_set(&pin->work_irq_change, CEC_PIN_IRQ_UNCHANGED); pin->kthread = kthread_run(cec_pin_thread_func, adap, "cec-pin"); if (IS_ERR(pin->kthread)) { @@ -682,7 +695,7 @@ static int cec_pin_adap_transmit(struct cec_adapter *adap, u8 attempts, pin->work_tx_status = 0; pin->tx_bit = 0; if (pin->state == CEC_ST_RX_IRQ) { - pin->work_enable_irq = false; + atomic_set(&pin->work_irq_change, CEC_PIN_IRQ_UNCHANGED); pin->ops->disable_irq(adap); cec_pin_high(pin); cec_pin_to_idle(pin); @@ -745,13 +758,8 @@ void cec_pin_changed(struct cec_adapter *adap, bool value) cec_pin_update(pin, value, false); if (!value && (adap->is_configuring || adap->is_configured || - adap->monitor_all_cnt)) { - pin->work_enable_irq = false; - pin->ops->disable_irq(adap); - cec_pin_high(pin); - cec_pin_to_idle(pin); - hrtimer_start(&pin->timer, 0, HRTIMER_MODE_REL); - } + adap->monitor_all_cnt)) + atomic_set(&pin->work_irq_change, CEC_PIN_IRQ_DISABLE); } EXPORT_SYMBOL_GPL(cec_pin_changed); diff --git a/include/media/cec-pin.h b/include/media/cec-pin.h index 44b82d29d480..d28d07fa312e 100644 --- a/include/media/cec-pin.h +++ b/include/media/cec-pin.h @@ -113,6 +113,10 @@ struct cec_pin_ops { #define CEC_NUM_PIN_EVENTS 128 +#define CEC_PIN_IRQ_UNCHANGED 0 +#define CEC_PIN_IRQ_DISABLE 1 +#define CEC_PIN_IRQ_ENABLE 2 + struct cec_pin { struct cec_adapter *adap; const struct cec_pin_ops *ops; @@ 
-137,8 +141,8 @@ struct cec_pin { struct cec_msg work_rx_msg; u8 work_tx_status; - bool work_enable_irq; ktime_t work_tx_ts; + atomic_t work_irq_change; atomic_t work_pin_events; unsigned int work_pin_events_wr; unsigned int work_pin_events_rd; -- cgit v1.2.3 From 68d9c47b1679ec8d55a005d39fc7a958ece82095 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 21 Jul 2017 15:28:33 -0400 Subject: media: Convert to using %pOF instead of full_name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we have a custom printf format specifier, convert users of full_name to use %pOF instead. This is preparation to remove storing of the full path string for each node. Signed-off-by: Rob Herring Acked-by: Niklas Söderlund Acked-by: Laurent Pinchart Reviewed-by: Matthias Brugger Acked-by: Nicolas Ferre Acked-by: Lad, Prabhakar Cc: Kyungmin Park Cc: Andrzej Hajda Cc: Mauro Carvalho Chehab Cc: Songjun Wu Cc: Kukjin Kim Cc: Krzysztof Kozlowski Cc: Javier Martinez Canillas Cc: Minghsiu Tsai Cc: Houlong Wei Cc: Andrew-CT Chen Cc: Guennadi Liakhovetski Cc: Hyun Kwon Cc: Michal Simek Cc: "Sören Brinkmann" Cc: linux-arm-kernel@lists.infradead.org Cc: linux-samsung-soc@vger.kernel.org Cc: linux-mediatek@lists.infradead.org Cc: linux-renesas-soc@vger.kernel.org Reviewed-by: Sylwester Nawrocki Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/s5c73m3/s5c73m3-core.c | 3 +- drivers/media/i2c/s5k5baf.c | 7 ++-- drivers/media/platform/am437x/am437x-vpfe.c | 4 +- drivers/media/platform/atmel/atmel-isc.c | 4 +- drivers/media/platform/davinci/vpif_capture.c | 16 ++++---- drivers/media/platform/exynos4-is/fimc-is.c | 8 ++-- drivers/media/platform/exynos4-is/fimc-lite.c | 3 +- drivers/media/platform/exynos4-is/media-dev.c | 8 ++-- drivers/media/platform/exynos4-is/mipi-csis.c | 4 +- drivers/media/platform/mtk-mdp/mtk_mdp_comp.c | 6 +-- drivers/media/platform/mtk-mdp/mtk_mdp_core.c | 8 ++-- drivers/media/platform/omap3isp/isp.c | 8 ++-- drivers/media/platform/pxa_camera.c | 2 +- drivers/media/platform/rcar-vin/rcar-core.c | 4 +- drivers/media/platform/soc_camera/soc_camera.c | 6 +-- drivers/media/platform/xilinx/xilinx-vipp.c | 52 +++++++++++++------------- drivers/media/v4l2-core/v4l2-clk.c | 3 +- include/media/v4l2-clk.h | 4 +- 18 files changed, 70 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/drivers/media/i2c/s5c73m3/s5c73m3-core.c b/drivers/media/i2c/s5c73m3/s5c73m3-core.c index f434fb2ee6fc..cdc4f2392ef9 100644 --- a/drivers/media/i2c/s5c73m3/s5c73m3-core.c +++ b/drivers/media/i2c/s5c73m3/s5c73m3-core.c @@ -1635,8 +1635,7 @@ static int s5c73m3_get_platform_data(struct s5c73m3 *state) node_ep = of_graph_get_next_endpoint(node, NULL); if (!node_ep) { - dev_warn(dev, "no endpoint defined for node: %s\n", - node->full_name); + dev_warn(dev, "no endpoint defined for node: %pOF\n", node); return 0; } diff --git a/drivers/media/i2c/s5k5baf.c b/drivers/media/i2c/s5k5baf.c index f01722dc04d0..ff46d2c96cea 100644 --- a/drivers/media/i2c/s5k5baf.c +++ b/drivers/media/i2c/s5k5baf.c @@ -1863,8 +1863,7 @@ static int s5k5baf_parse_device_node(struct s5k5baf *state, struct device *dev) node_ep = of_graph_get_next_endpoint(node, NULL); if (!node_ep) { - dev_err(dev, "no endpoint defined at node %s\n", - node->full_name); + dev_err(dev, "no endpoint defined at node %pOF\n", node); return -EINVAL; } @@ -1882,8 +1881,8 @@ static int s5k5baf_parse_device_node(struct s5k5baf *state, struct device *dev) case V4L2_MBUS_PARALLEL: break; 
default: - dev_err(dev, "unsupported bus in endpoint defined at node %s\n", - node->full_name); + dev_err(dev, "unsupported bus in endpoint defined at node %pOF\n", + node); return -EINVAL; } diff --git a/drivers/media/platform/am437x/am437x-vpfe.c b/drivers/media/platform/am437x/am437x-vpfe.c index 466aba8b0e00..dfcc484cab89 100644 --- a/drivers/media/platform/am437x/am437x-vpfe.c +++ b/drivers/media/platform/am437x/am437x-vpfe.c @@ -2490,8 +2490,8 @@ vpfe_get_pdata(struct platform_device *pdev) rem = of_graph_get_remote_port_parent(endpoint); if (!rem) { - dev_err(&pdev->dev, "Remote device at %s not found\n", - endpoint->full_name); + dev_err(&pdev->dev, "Remote device at %pOF not found\n", + endpoint); goto done; } diff --git a/drivers/media/platform/atmel/atmel-isc.c b/drivers/media/platform/atmel/atmel-isc.c index d4df3d4ccd85..d7103c5f92c3 100644 --- a/drivers/media/platform/atmel/atmel-isc.c +++ b/drivers/media/platform/atmel/atmel-isc.c @@ -1700,8 +1700,8 @@ static int isc_parse_dt(struct device *dev, struct isc_device *isc) rem = of_graph_get_remote_port_parent(epn); if (!rem) { - dev_notice(dev, "Remote device at %s not found\n", - of_node_full_name(epn)); + dev_notice(dev, "Remote device at %pOF not found\n", + epn); continue; } diff --git a/drivers/media/platform/davinci/vpif_capture.c b/drivers/media/platform/davinci/vpif_capture.c index fe2a5748e554..0ef36cec21d1 100644 --- a/drivers/media/platform/davinci/vpif_capture.c +++ b/drivers/media/platform/davinci/vpif_capture.c @@ -1397,9 +1397,9 @@ static int vpif_async_bound(struct v4l2_async_notifier *notifier, vpif_obj.config->chan_config->inputs[i].subdev_name = (char *)to_of_node(subdev->fwnode)->full_name; vpif_dbg(2, debug, - "%s: setting input %d subdev_name = %s\n", + "%s: setting input %d subdev_name = %pOF\n", __func__, i, - to_of_node(subdev->fwnode)->full_name); + to_of_node(subdev->fwnode)); return 0; } } @@ -1557,8 +1557,8 @@ vpif_capture_get_pdata(struct platform_device *pdev) dev_err(&pdev->dev, "Could not parse the endpoint\n"); goto done; } - dev_dbg(&pdev->dev, "Endpoint %s, bus_width = %d\n", - endpoint->full_name, bus_cfg.bus.parallel.bus_width); + dev_dbg(&pdev->dev, "Endpoint %pOF, bus_width = %d\n", + endpoint, bus_cfg.bus.parallel.bus_width); flags = bus_cfg.bus.parallel.flags; if (flags & V4L2_MBUS_HSYNC_ACTIVE_HIGH) @@ -1569,13 +1569,13 @@ vpif_capture_get_pdata(struct platform_device *pdev) rem = of_graph_get_remote_port_parent(endpoint); if (!rem) { - dev_dbg(&pdev->dev, "Remote device at %s not found\n", - endpoint->full_name); + dev_dbg(&pdev->dev, "Remote device at %pOF not found\n", + endpoint); goto done; } - dev_dbg(&pdev->dev, "Remote device %s, %s found\n", - rem->name, rem->full_name); + dev_dbg(&pdev->dev, "Remote device %s, %pOF found\n", + rem->name, rem); sdinfo->name = rem->full_name; pdata->asd[i] = devm_kzalloc(&pdev->dev, diff --git a/drivers/media/platform/exynos4-is/fimc-is.c b/drivers/media/platform/exynos4-is/fimc-is.c index 340d906db370..5ddb2321e9e4 100644 --- a/drivers/media/platform/exynos4-is/fimc-is.c +++ b/drivers/media/platform/exynos4-is/fimc-is.c @@ -174,8 +174,8 @@ static int fimc_is_parse_sensor_config(struct fimc_is *is, unsigned int index, sensor->drvdata = fimc_is_sensor_get_drvdata(node); if (!sensor->drvdata) { - dev_err(&is->pdev->dev, "no driver data found for: %s\n", - node->full_name); + dev_err(&is->pdev->dev, "no driver data found for: %pOF\n", + node); return -EINVAL; } @@ -191,8 +191,8 @@ static int fimc_is_parse_sensor_config(struct fimc_is *is, unsigned 
int index, /* Use MIPI-CSIS channel id to determine the ISP I2C bus index. */ ret = of_property_read_u32(port, "reg", &tmp); if (ret < 0) { - dev_err(&is->pdev->dev, "reg property not found at: %s\n", - port->full_name); + dev_err(&is->pdev->dev, "reg property not found at: %pOF\n", + port); of_node_put(port); return ret; } diff --git a/drivers/media/platform/exynos4-is/fimc-lite.c b/drivers/media/platform/exynos4-is/fimc-lite.c index 30282c512e23..4a3c9948ca54 100644 --- a/drivers/media/platform/exynos4-is/fimc-lite.c +++ b/drivers/media/platform/exynos4-is/fimc-lite.c @@ -1493,8 +1493,7 @@ static int fimc_lite_probe(struct platform_device *pdev) if (!drv_data || fimc->index >= drv_data->num_instances || fimc->index < 0) { - dev_err(dev, "Wrong %s node alias\n", - dev->of_node->full_name); + dev_err(dev, "Wrong %pOF node alias\n", dev->of_node); return -EINVAL; } diff --git a/drivers/media/platform/exynos4-is/media-dev.c b/drivers/media/platform/exynos4-is/media-dev.c index 7d1cf78846c4..d4656d5175d7 100644 --- a/drivers/media/platform/exynos4-is/media-dev.c +++ b/drivers/media/platform/exynos4-is/media-dev.c @@ -412,8 +412,8 @@ static int fimc_md_parse_port_node(struct fimc_md *fmd, rem = of_graph_get_remote_port_parent(ep); of_node_put(ep); if (rem == NULL) { - v4l2_info(&fmd->v4l2_dev, "Remote device at %s not found\n", - ep->full_name); + v4l2_info(&fmd->v4l2_dev, "Remote device at %pOF not found\n", + ep); return 0; } @@ -430,8 +430,8 @@ static int fimc_md_parse_port_node(struct fimc_md *fmd, */ pd->sensor_bus_type = FIMC_BUS_TYPE_MIPI_CSI2; } else { - v4l2_err(&fmd->v4l2_dev, "Wrong port id (%u) at node %s\n", - endpoint.base.port, rem->full_name); + v4l2_err(&fmd->v4l2_dev, "Wrong port id (%u) at node %pOF\n", + endpoint.base.port, rem); } /* * For FIMC-IS handled sensors, that are placed under i2c-isp device diff --git a/drivers/media/platform/exynos4-is/mipi-csis.c b/drivers/media/platform/exynos4-is/mipi-csis.c index 98c89873c2dc..560aadabcb11 100644 --- a/drivers/media/platform/exynos4-is/mipi-csis.c +++ b/drivers/media/platform/exynos4-is/mipi-csis.c @@ -730,8 +730,8 @@ static int s5pcsis_parse_dt(struct platform_device *pdev, node = of_graph_get_next_endpoint(node, NULL); if (!node) { - dev_err(&pdev->dev, "No port node at %s\n", - pdev->dev.of_node->full_name); + dev_err(&pdev->dev, "No port node at %pOF\n", + pdev->dev.of_node); return -EINVAL; } /* Get port node and validate MIPI-CSI channel id. 
*/ diff --git a/drivers/media/platform/mtk-mdp/mtk_mdp_comp.c b/drivers/media/platform/mtk-mdp/mtk_mdp_comp.c index aa8f9fd1f1a2..e728d32d9408 100644 --- a/drivers/media/platform/mtk-mdp/mtk_mdp_comp.c +++ b/drivers/media/platform/mtk-mdp/mtk_mdp_comp.c @@ -134,15 +134,13 @@ int mtk_mdp_comp_init(struct device *dev, struct device_node *node, larb_node = of_parse_phandle(node, "mediatek,larb", 0); if (!larb_node) { dev_err(dev, - "Missing mediadek,larb phandle in %s node\n", - node->full_name); + "Missing mediadek,larb phandle in %pOF node\n", node); return -EINVAL; } larb_pdev = of_find_device_by_node(larb_node); if (!larb_pdev) { - dev_warn(dev, "Waiting for larb device %s\n", - larb_node->full_name); + dev_warn(dev, "Waiting for larb device %pOF\n", larb_node); of_node_put(larb_node); return -EPROBE_DEFER; } diff --git a/drivers/media/platform/mtk-mdp/mtk_mdp_core.c b/drivers/media/platform/mtk-mdp/mtk_mdp_core.c index 81347558b24a..bbb24fb95b95 100644 --- a/drivers/media/platform/mtk-mdp/mtk_mdp_core.c +++ b/drivers/media/platform/mtk-mdp/mtk_mdp_core.c @@ -137,16 +137,16 @@ static int mtk_mdp_probe(struct platform_device *pdev) continue; if (!of_device_is_available(node)) { - dev_err(dev, "Skipping disabled component %s\n", - node->full_name); + dev_err(dev, "Skipping disabled component %pOF\n", + node); continue; } comp_type = (enum mtk_mdp_comp_type)of_id->data; comp_id = mtk_mdp_comp_get_id(dev, node, comp_type); if (comp_id < 0) { - dev_warn(dev, "Skipping unknown component %s\n", - node->full_name); + dev_warn(dev, "Skipping unknown component %pOF\n", + node); continue; } diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c index c3014c82d64d..83aea08b832d 100644 --- a/drivers/media/platform/omap3isp/isp.c +++ b/drivers/media/platform/omap3isp/isp.c @@ -2024,8 +2024,8 @@ static int isp_fwnode_parse(struct device *dev, struct fwnode_handle *fwnode, if (ret) return ret; - dev_dbg(dev, "parsing endpoint %s, interface %u\n", - to_of_node(fwnode)->full_name, vep.base.port); + dev_dbg(dev, "parsing endpoint %pOF, interface %u\n", + to_of_node(fwnode), vep.base.port); switch (vep.base.port) { case ISP_OF_PHY_PARALLEL: @@ -2136,8 +2136,8 @@ static int isp_fwnode_parse(struct device *dev, struct fwnode_handle *fwnode, break; default: - dev_warn(dev, "%s: invalid interface %u\n", - to_of_node(fwnode)->full_name, vep.base.port); + dev_warn(dev, "%pOF: invalid interface %u\n", + to_of_node(fwnode), vep.base.port); return -EINVAL; } diff --git a/drivers/media/platform/pxa_camera.c b/drivers/media/platform/pxa_camera.c index 4a800a4147ca..edca993c2b1f 100644 --- a/drivers/media/platform/pxa_camera.c +++ b/drivers/media/platform/pxa_camera.c @@ -2331,7 +2331,7 @@ static int pxa_camera_pdata_from_dt(struct device *dev, asd->match.fwnode.fwnode = of_fwnode_handle(remote); of_node_put(remote); } else { - dev_notice(dev, "no remote for %s\n", of_node_full_name(np)); + dev_notice(dev, "no remote for %pOF\n", np); } out: diff --git a/drivers/media/platform/rcar-vin/rcar-core.c b/drivers/media/platform/rcar-vin/rcar-core.c index 77dff047c41c..142de447aaaa 100644 --- a/drivers/media/platform/rcar-vin/rcar-core.c +++ b/drivers/media/platform/rcar-vin/rcar-core.c @@ -222,8 +222,8 @@ static int rvin_digital_graph_init(struct rvin_dev *vin) subdevs[0] = &vin->digital.asd; - vin_dbg(vin, "Found digital subdevice %s\n", - of_node_full_name(to_of_node(subdevs[0]->match.fwnode.fwnode))); + vin_dbg(vin, "Found digital subdevice %pOF\n", + 
to_of_node(subdevs[0]->match.fwnode.fwnode)); vin->notifier.num_subdevs = 1; vin->notifier.subdevs = subdevs; diff --git a/drivers/media/platform/soc_camera/soc_camera.c b/drivers/media/platform/soc_camera/soc_camera.c index dd349efae3af..1f3c450c7a69 100644 --- a/drivers/media/platform/soc_camera/soc_camera.c +++ b/drivers/media/platform/soc_camera/soc_camera.c @@ -1550,8 +1550,7 @@ static int soc_of_bind(struct soc_camera_host *ici, v4l2_clk_name_i2c(clk_name, sizeof(clk_name), client->adapter->nr, client->addr); else - v4l2_clk_name_of(clk_name, sizeof(clk_name), - of_node_full_name(remote)); + v4l2_clk_name_of(clk_name, sizeof(clk_name), remote); icd->clk = v4l2_clk_register(&soc_camera_clk_ops, clk_name, icd); if (IS_ERR(icd->clk)) { @@ -1590,8 +1589,7 @@ static void scan_of_host(struct soc_camera_host *ici) ren = of_graph_get_remote_port(epn); if (!ren) { - dev_notice(dev, "no remote for %s\n", - of_node_full_name(epn)); + dev_notice(dev, "no remote for %pOF\n", epn); continue; } diff --git a/drivers/media/platform/xilinx/xilinx-vipp.c b/drivers/media/platform/xilinx/xilinx-vipp.c index ac4704388920..ebfdf334d99c 100644 --- a/drivers/media/platform/xilinx/xilinx-vipp.c +++ b/drivers/media/platform/xilinx/xilinx-vipp.c @@ -90,12 +90,12 @@ static int xvip_graph_build_one(struct xvip_composite_device *xdev, of_node_put(ep); ep = next; - dev_dbg(xdev->dev, "processing endpoint %s\n", ep->full_name); + dev_dbg(xdev->dev, "processing endpoint %pOF\n", ep); ret = v4l2_fwnode_parse_link(of_fwnode_handle(ep), &link); if (ret < 0) { - dev_err(xdev->dev, "failed to parse link for %s\n", - ep->full_name); + dev_err(xdev->dev, "failed to parse link for %pOF\n", + ep); continue; } @@ -103,9 +103,9 @@ static int xvip_graph_build_one(struct xvip_composite_device *xdev, * the link. */ if (link.local_port >= local->num_pads) { - dev_err(xdev->dev, "invalid port number %u for %s\n", + dev_err(xdev->dev, "invalid port number %u for %pOF\n", link.local_port, - to_of_node(link.local_node)->full_name); + to_of_node(link.local_node)); v4l2_fwnode_put_link(&link); ret = -EINVAL; break; @@ -114,8 +114,8 @@ static int xvip_graph_build_one(struct xvip_composite_device *xdev, local_pad = &local->pads[link.local_port]; if (local_pad->flags & MEDIA_PAD_FL_SINK) { - dev_dbg(xdev->dev, "skipping sink port %s:%u\n", - to_of_node(link.local_node)->full_name, + dev_dbg(xdev->dev, "skipping sink port %pOF:%u\n", + to_of_node(link.local_node), link.local_port); v4l2_fwnode_put_link(&link); continue; @@ -123,8 +123,8 @@ static int xvip_graph_build_one(struct xvip_composite_device *xdev, /* Skip DMA engines, they will be processed separately. 
*/ if (link.remote_node == of_fwnode_handle(xdev->dev->of_node)) { - dev_dbg(xdev->dev, "skipping DMA port %s:%u\n", - to_of_node(link.local_node)->full_name, + dev_dbg(xdev->dev, "skipping DMA port %pOF:%u\n", + to_of_node(link.local_node), link.local_port); v4l2_fwnode_put_link(&link); continue; @@ -134,8 +134,8 @@ static int xvip_graph_build_one(struct xvip_composite_device *xdev, ent = xvip_graph_find_entity(xdev, to_of_node(link.remote_node)); if (ent == NULL) { - dev_err(xdev->dev, "no entity found for %s\n", - to_of_node(link.remote_node)->full_name); + dev_err(xdev->dev, "no entity found for %pOF\n", + to_of_node(link.remote_node)); v4l2_fwnode_put_link(&link); ret = -ENODEV; break; @@ -144,9 +144,8 @@ static int xvip_graph_build_one(struct xvip_composite_device *xdev, remote = ent->entity; if (link.remote_port >= remote->num_pads) { - dev_err(xdev->dev, "invalid port number %u on %s\n", - link.remote_port, - to_of_node(link.remote_node)->full_name); + dev_err(xdev->dev, "invalid port number %u on %pOF\n", + link.remote_port, to_of_node(link.remote_node)); v4l2_fwnode_put_link(&link); ret = -EINVAL; break; @@ -216,12 +215,12 @@ static int xvip_graph_build_dma(struct xvip_composite_device *xdev) of_node_put(ep); ep = next; - dev_dbg(xdev->dev, "processing endpoint %s\n", ep->full_name); + dev_dbg(xdev->dev, "processing endpoint %pOF\n", ep); ret = v4l2_fwnode_parse_link(of_fwnode_handle(ep), &link); if (ret < 0) { - dev_err(xdev->dev, "failed to parse link for %s\n", - ep->full_name); + dev_err(xdev->dev, "failed to parse link for %pOF\n", + ep); continue; } @@ -242,17 +241,17 @@ static int xvip_graph_build_dma(struct xvip_composite_device *xdev) ent = xvip_graph_find_entity(xdev, to_of_node(link.remote_node)); if (ent == NULL) { - dev_err(xdev->dev, "no entity found for %s\n", - to_of_node(link.remote_node)->full_name); + dev_err(xdev->dev, "no entity found for %pOF\n", + to_of_node(link.remote_node)); v4l2_fwnode_put_link(&link); ret = -ENODEV; break; } if (link.remote_port >= ent->entity->num_pads) { - dev_err(xdev->dev, "invalid port number %u on %s\n", + dev_err(xdev->dev, "invalid port number %u on %pOF\n", link.remote_port, - to_of_node(link.remote_node)->full_name); + to_of_node(link.remote_node)); v4l2_fwnode_put_link(&link); ret = -EINVAL; break; @@ -337,8 +336,8 @@ static int xvip_graph_notify_bound(struct v4l2_async_notifier *notifier, continue; if (entity->subdev) { - dev_err(xdev->dev, "duplicate subdev for node %s\n", - entity->node->full_name); + dev_err(xdev->dev, "duplicate subdev for node %pOF\n", + entity->node); return -EINVAL; } @@ -360,14 +359,14 @@ static int xvip_graph_parse_one(struct xvip_composite_device *xdev, struct device_node *ep = NULL; int ret = 0; - dev_dbg(xdev->dev, "parsing node %s\n", node->full_name); + dev_dbg(xdev->dev, "parsing node %pOF\n", node); while (1) { ep = of_graph_get_next_endpoint(node, ep); if (ep == NULL) break; - dev_dbg(xdev->dev, "handling endpoint %s\n", ep->full_name); + dev_dbg(xdev->dev, "handling endpoint %pOF\n", ep); remote = of_graph_get_remote_port_parent(ep); if (remote == NULL) { @@ -452,8 +451,7 @@ static int xvip_graph_dma_init_one(struct xvip_composite_device *xdev, ret = xvip_dma_init(xdev, dma, type, index); if (ret < 0) { - dev_err(xdev->dev, "%s initialization failed\n", - node->full_name); + dev_err(xdev->dev, "%pOF initialization failed\n", node); return ret; } diff --git a/drivers/media/v4l2-core/v4l2-clk.c b/drivers/media/v4l2-core/v4l2-clk.c index 297e10e69898..90628d7a04de 100644 --- 
a/drivers/media/v4l2-core/v4l2-clk.c +++ b/drivers/media/v4l2-core/v4l2-clk.c @@ -61,8 +61,7 @@ struct v4l2_clk *v4l2_clk_get(struct device *dev, const char *id) /* if dev_name is not found, try use the OF name to find again */ if (PTR_ERR(clk) == -ENODEV && dev->of_node) { - v4l2_clk_name_of(clk_name, sizeof(clk_name), - of_node_full_name(dev->of_node)); + v4l2_clk_name_of(clk_name, sizeof(clk_name), dev->of_node); clk = v4l2_clk_find(clk_name); } diff --git a/include/media/v4l2-clk.h b/include/media/v4l2-clk.h index 2b94662d005c..7ec857f805a6 100644 --- a/include/media/v4l2-clk.h +++ b/include/media/v4l2-clk.h @@ -70,7 +70,7 @@ static inline struct v4l2_clk *v4l2_clk_register_fixed(const char *dev_id, #define v4l2_clk_name_i2c(name, size, adap, client) snprintf(name, size, \ "%d-%04x", adap, client) -#define v4l2_clk_name_of(name, size, of_full_name) snprintf(name, size, \ - "of-%s", of_full_name) +#define v4l2_clk_name_of(name, size, node) snprintf(name, size, \ + "of-%pOF", node) #endif -- cgit v1.2.3 From 518f4b26be1ebf6ce220c4e37b5c7e5410c4d064 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sat, 1 Jul 2017 12:13:19 -0400 Subject: media: rc-core: rename input_name to device_name When an ir-spi is registered, you get this message. rc rc0: Unspecified device as /devices/platform/soc/3f215080.spi/spi_master/spi32766/spi32766.128/rc/rc0 "Unspecified device" refers to input_name, which makes no sense for IR TX only devices. So, rename to device_name. Also make driver_name const char* so that no casts are needed anywhere. Now ir-spi reports: rc rc0: IR SPI as /devices/platform/soc/3f215080.spi/spi_master/spi32766/spi32766.128/rc/rc0 Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/hid/hid-picolcd_cir.c | 2 +- drivers/media/cec/cec-core.c | 4 ++-- drivers/media/common/siano/smsir.c | 4 ++-- drivers/media/i2c/ir-kbd-i2c.c | 2 +- drivers/media/pci/bt8xx/bttv-input.c | 2 +- drivers/media/pci/cx23885/cx23885-input.c | 2 +- drivers/media/pci/cx88/cx88-input.c | 2 +- drivers/media/pci/dm1105/dm1105.c | 2 +- drivers/media/pci/mantis/mantis_common.h | 2 +- drivers/media/pci/mantis/mantis_input.c | 4 ++-- drivers/media/pci/saa7134/saa7134-input.c | 2 +- drivers/media/pci/smipcie/smipcie-ir.c | 4 ++-- drivers/media/pci/smipcie/smipcie.h | 2 +- drivers/media/pci/ttpci/budget-ci.c | 2 +- drivers/media/rc/ati_remote.c | 2 +- drivers/media/rc/ene_ir.c | 4 ++-- drivers/media/rc/fintek-cir.c | 2 +- drivers/media/rc/gpio-ir-recv.c | 2 +- drivers/media/rc/igorplugusb.c | 2 +- drivers/media/rc/iguanair.c | 2 +- drivers/media/rc/img-ir/img-ir-hw.c | 2 +- drivers/media/rc/img-ir/img-ir-raw.c | 2 +- drivers/media/rc/imon.c | 2 +- drivers/media/rc/ir-hix5hd2.c | 2 +- drivers/media/rc/ir-spi.c | 1 + drivers/media/rc/ite-cir.c | 2 +- drivers/media/rc/mceusb.c | 2 +- drivers/media/rc/meson-ir.c | 2 +- drivers/media/rc/mtk-cir.c | 2 +- drivers/media/rc/nuvoton-cir.c | 2 +- drivers/media/rc/rc-loopback.c | 2 +- drivers/media/rc/rc-main.c | 8 ++++---- drivers/media/rc/redrat3.c | 2 +- drivers/media/rc/serial_ir.c | 10 +++++----- drivers/media/rc/sir_ir.c | 2 +- drivers/media/rc/st_rc.c | 2 +- drivers/media/rc/streamzap.c | 2 +- drivers/media/rc/sunxi-cir.c | 2 +- drivers/media/rc/ttusbir.c | 2 +- drivers/media/rc/winbond-cir.c | 2 +- drivers/media/usb/au0828/au0828-input.c | 2 +- drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 5 ++--- drivers/media/usb/dvb-usb/dvb-usb-remote.c | 2 +- drivers/media/usb/em28xx/em28xx-input.c | 2 +- drivers/media/usb/tm6000/tm6000-input.c | 2 +- include/media/cec.h 
| 2 +- include/media/rc-core.h | 6 +++--- 47 files changed, 62 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-picolcd_cir.c b/drivers/hid/hid-picolcd_cir.c index 8ffbb6f65a65..b3b2233f3c65 100644 --- a/drivers/hid/hid-picolcd_cir.c +++ b/drivers/hid/hid-picolcd_cir.c @@ -116,7 +116,7 @@ int picolcd_init_cir(struct picolcd_data *data, struct hid_report *report) rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; rdev->open = picolcd_cir_open; rdev->close = picolcd_cir_close; - rdev->input_name = data->hdev->name; + rdev->device_name = data->hdev->name; rdev->input_phys = data->hdev->phys; rdev->input_id.bustype = data->hdev->bus; rdev->input_id.vendor = data->hdev->vendor; diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index 52f085ba104a..efb7bbbc941f 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -263,12 +263,12 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, return ERR_PTR(-ENOMEM); } - snprintf(adap->input_name, sizeof(adap->input_name), + snprintf(adap->device_name, sizeof(adap->device_name), "RC for %s", name); snprintf(adap->input_phys, sizeof(adap->input_phys), "%s/input0", name); - adap->rc->input_name = adap->input_name; + adap->rc->device_name = adap->device_name; adap->rc->input_phys = adap->input_phys; adap->rc->input_id.bustype = BUS_CEC; adap->rc->input_id.vendor = 0; diff --git a/drivers/media/common/siano/smsir.c b/drivers/media/common/siano/smsir.c index 7c898b06d85c..941c342896cc 100644 --- a/drivers/media/common/siano/smsir.c +++ b/drivers/media/common/siano/smsir.c @@ -73,7 +73,7 @@ int sms_ir_init(struct smscore_device_t *coredev) strlcpy(coredev->ir.phys, coredev->devpath, sizeof(coredev->ir.phys)); strlcat(coredev->ir.phys, "/ir0", sizeof(coredev->ir.phys)); - dev->input_name = coredev->ir.name; + dev->device_name = coredev->ir.name; dev->input_phys = coredev->ir.phys; dev->dev.parent = coredev->device; @@ -91,7 +91,7 @@ int sms_ir_init(struct smscore_device_t *coredev) dev->driver_name = MODULE_NAME; pr_debug("Input device (IR) %s is set for key events\n", - dev->input_name); + dev->device_name); err = rc_register_device(dev); if (err < 0) { diff --git a/drivers/media/i2c/ir-kbd-i2c.c b/drivers/media/i2c/ir-kbd-i2c.c index cee7fd9cf08b..af909bf5dd5b 100644 --- a/drivers/media/i2c/ir-kbd-i2c.c +++ b/drivers/media/i2c/ir-kbd-i2c.c @@ -452,7 +452,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) */ rc->input_id.bustype = BUS_I2C; rc->input_phys = ir->phys; - rc->input_name = ir->name; + rc->device_name = ir->name; /* * Initialize the other fields of rc_dev diff --git a/drivers/media/pci/bt8xx/bttv-input.c b/drivers/media/pci/bt8xx/bttv-input.c index 2fd07a8afcd2..bb8eda51ee27 100644 --- a/drivers/media/pci/bt8xx/bttv-input.c +++ b/drivers/media/pci/bt8xx/bttv-input.c @@ -535,7 +535,7 @@ int bttv_input_init(struct bttv *btv) snprintf(ir->phys, sizeof(ir->phys), "pci-%s/ir0", pci_name(btv->c.pci)); - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; rc->input_id.bustype = BUS_PCI; rc->input_id.version = 1; diff --git a/drivers/media/pci/cx23885/cx23885-input.c b/drivers/media/pci/cx23885/cx23885-input.c index 4367cb3162b6..c9b8e3f4d9fb 100644 --- a/drivers/media/pci/cx23885/cx23885-input.c +++ b/drivers/media/pci/cx23885/cx23885-input.c @@ -351,7 +351,7 @@ int cx23885_input_init(struct cx23885_dev *dev) } kernel_ir->rc = rc; - rc->input_name = kernel_ir->name; + rc->device_name = 
kernel_ir->name; rc->input_phys = kernel_ir->phys; rc->input_id.bustype = BUS_PCI; rc->input_id.version = 1; diff --git a/drivers/media/pci/cx88/cx88-input.c b/drivers/media/pci/cx88/cx88-input.c index 01f2e472a2a0..a5dbee776455 100644 --- a/drivers/media/pci/cx88/cx88-input.c +++ b/drivers/media/pci/cx88/cx88-input.c @@ -464,7 +464,7 @@ int cx88_ir_init(struct cx88_core *core, struct pci_dev *pci) snprintf(ir->name, sizeof(ir->name), "cx88 IR (%s)", core->board.name); snprintf(ir->phys, sizeof(ir->phys), "pci-%s/ir0", pci_name(pci)); - dev->input_name = ir->name; + dev->device_name = ir->name; dev->input_phys = ir->phys; dev->input_id.bustype = BUS_PCI; dev->input_id.version = 1; diff --git a/drivers/media/pci/dm1105/dm1105.c b/drivers/media/pci/dm1105/dm1105.c index 190ca9c17237..0bc618f36385 100644 --- a/drivers/media/pci/dm1105/dm1105.c +++ b/drivers/media/pci/dm1105/dm1105.c @@ -748,7 +748,7 @@ static int dm1105_ir_init(struct dm1105_dev *dm1105) dev->driver_name = MODULE_NAME; dev->map_name = RC_MAP_DM1105_NEC; - dev->input_name = "DVB on-card IR receiver"; + dev->device_name = "DVB on-card IR receiver"; dev->input_phys = dm1105->ir.input_phys; dev->input_id.bustype = BUS_PCI; dev->input_id.version = 1; diff --git a/drivers/media/pci/mantis/mantis_common.h b/drivers/media/pci/mantis/mantis_common.h index d48778a366a9..a664c319ef0a 100644 --- a/drivers/media/pci/mantis/mantis_common.h +++ b/drivers/media/pci/mantis/mantis_common.h @@ -176,7 +176,7 @@ struct mantis_pci { struct work_struct uart_work; struct rc_dev *rc; - char input_name[80]; + char device_name[80]; char input_phys[80]; char *rc_map_name; }; diff --git a/drivers/media/pci/mantis/mantis_input.c b/drivers/media/pci/mantis/mantis_input.c index 50d10cb7d49d..001b7f827699 100644 --- a/drivers/media/pci/mantis/mantis_input.c +++ b/drivers/media/pci/mantis/mantis_input.c @@ -46,12 +46,12 @@ int mantis_input_init(struct mantis_pci *mantis) goto out; } - snprintf(mantis->input_name, sizeof(mantis->input_name), + snprintf(mantis->device_name, sizeof(mantis->device_name), "Mantis %s IR receiver", mantis->hwconfig->model_name); snprintf(mantis->input_phys, sizeof(mantis->input_phys), "pci-%s/ir0", pci_name(mantis->pdev)); - dev->input_name = mantis->input_name; + dev->device_name = mantis->device_name; dev->input_phys = mantis->input_phys; dev->input_id.bustype = BUS_PCI; dev->input_id.vendor = mantis->vendor_id; diff --git a/drivers/media/pci/saa7134/saa7134-input.c b/drivers/media/pci/saa7134/saa7134-input.c index 78849c19f68a..ba1fc77a6f7b 100644 --- a/drivers/media/pci/saa7134/saa7134-input.c +++ b/drivers/media/pci/saa7134/saa7134-input.c @@ -870,7 +870,7 @@ int saa7134_input_init1(struct saa7134_dev *dev) if (raw_decode) rc->driver_type = RC_DRIVER_IR_RAW; - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; rc->input_id.bustype = BUS_PCI; rc->input_id.version = 1; diff --git a/drivers/media/pci/smipcie/smipcie-ir.c b/drivers/media/pci/smipcie/smipcie-ir.c index d2730c3fdbae..fc3375720a35 100644 --- a/drivers/media/pci/smipcie/smipcie-ir.c +++ b/drivers/media/pci/smipcie/smipcie-ir.c @@ -188,14 +188,14 @@ int smi_ir_init(struct smi_dev *dev) return -ENOMEM; /* init input device */ - snprintf(ir->input_name, sizeof(ir->input_name), "IR (%s)", + snprintf(ir->device_name, sizeof(ir->device_name), "IR (%s)", dev->info->name); snprintf(ir->input_phys, sizeof(ir->input_phys), "pci-%s/ir0", pci_name(dev->pci_dev)); rc_dev->driver_name = "SMI_PCIe"; rc_dev->input_phys = ir->input_phys; - 
rc_dev->input_name = ir->input_name; + rc_dev->device_name = ir->device_name; rc_dev->input_id.bustype = BUS_PCI; rc_dev->input_id.version = 1; rc_dev->input_id.vendor = dev->pci_dev->subsystem_vendor; diff --git a/drivers/media/pci/smipcie/smipcie.h b/drivers/media/pci/smipcie/smipcie.h index 611e4f02cadd..c8368c78ddd5 100644 --- a/drivers/media/pci/smipcie/smipcie.h +++ b/drivers/media/pci/smipcie/smipcie.h @@ -240,7 +240,7 @@ struct smi_rc { struct smi_dev *dev; struct rc_dev *rc_dev; char input_phys[64]; - char input_name[64]; + char device_name[64]; struct work_struct work; u8 irData[256]; diff --git a/drivers/media/pci/ttpci/budget-ci.c b/drivers/media/pci/ttpci/budget-ci.c index 5b8aab4c7b81..cf185c83cc9e 100644 --- a/drivers/media/pci/ttpci/budget-ci.c +++ b/drivers/media/pci/ttpci/budget-ci.c @@ -186,7 +186,7 @@ static int msp430_ir_init(struct budget_ci *budget_ci) "pci-%s/ir0", pci_name(saa->pci)); dev->driver_name = MODULE_NAME; - dev->input_name = budget_ci->ir.name; + dev->device_name = budget_ci->ir.name; dev->input_phys = budget_ci->ir.phys; dev->input_id.bustype = BUS_PCI; dev->input_id.version = 1; diff --git a/drivers/media/rc/ati_remote.c b/drivers/media/rc/ati_remote.c index a4c6ad4f67c1..a7f76002b30a 100644 --- a/drivers/media/rc/ati_remote.c +++ b/drivers/media/rc/ati_remote.c @@ -766,7 +766,7 @@ static void ati_remote_rc_init(struct ati_remote *ati_remote) rdev->open = ati_remote_rc_open; rdev->close = ati_remote_rc_close; - rdev->input_name = ati_remote->rc_name; + rdev->device_name = ati_remote->rc_name; rdev->input_phys = ati_remote->rc_phys; usb_to_input_id(ati_remote->udev, &rdev->input_id); diff --git a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c index 60da963f40dc..41f6b1c52407 100644 --- a/drivers/media/rc/ene_ir.c +++ b/drivers/media/rc/ene_ir.c @@ -1060,7 +1060,7 @@ static int ene_probe(struct pnp_dev *pnp_dev, const struct pnp_device_id *id) rdev->s_idle = ene_set_idle; rdev->driver_name = ENE_DRIVER_NAME; rdev->map_name = RC_MAP_RC6_MCE; - rdev->input_name = "ENE eHome Infrared Remote Receiver"; + rdev->device_name = "ENE eHome Infrared Remote Receiver"; if (dev->hw_learning_and_tx_capable) { rdev->s_learning_mode = ene_set_learning_mode; @@ -1070,7 +1070,7 @@ static int ene_probe(struct pnp_dev *pnp_dev, const struct pnp_device_id *id) rdev->s_tx_carrier = ene_set_tx_carrier; rdev->s_tx_duty_cycle = ene_set_tx_duty_cycle; rdev->s_carrier_report = ene_set_carrier_report; - rdev->input_name = "ENE eHome Infrared Remote Transceiver"; + rdev->device_name = "ENE eHome Infrared Remote Transceiver"; } dev->rdev = rdev; diff --git a/drivers/media/rc/fintek-cir.c b/drivers/media/rc/fintek-cir.c index 0d3562712f27..57155e4c9f38 100644 --- a/drivers/media/rc/fintek-cir.c +++ b/drivers/media/rc/fintek-cir.c @@ -532,7 +532,7 @@ static int fintek_probe(struct pnp_dev *pdev, const struct pnp_device_id *dev_id rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; rdev->open = fintek_open; rdev->close = fintek_close; - rdev->input_name = FINTEK_DESCRIPTION; + rdev->device_name = FINTEK_DESCRIPTION; rdev->input_phys = "fintek/cir0"; rdev->input_id.bustype = BUS_HOST; rdev->input_id.vendor = VENDOR_ID_FINTEK; diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index b4f773b9dc1d..561c27a4be64 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -150,7 +150,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) } rcdev->priv = gpio_dev; - rcdev->input_name = GPIO_IR_DEVICE_NAME; + rcdev->device_name 
= GPIO_IR_DEVICE_NAME; rcdev->input_phys = GPIO_IR_DEVICE_NAME "/input0"; rcdev->input_id.bustype = BUS_HOST; rcdev->input_id.vendor = 0x0001; diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c index cb6d4f1247da..5babc6371df4 100644 --- a/drivers/media/rc/igorplugusb.c +++ b/drivers/media/rc/igorplugusb.c @@ -194,7 +194,7 @@ static int igorplugusb_probe(struct usb_interface *intf, if (!rc) goto fail; - rc->input_name = DRIVER_DESC; + rc->device_name = DRIVER_DESC; rc->input_phys = ir->phys; usb_to_input_id(udev, &rc->input_id); rc->dev.parent = &intf->dev; diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 8711a7ff55cc..4357dd36d7b9 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -487,7 +487,7 @@ static int iguanair_probe(struct usb_interface *intf, usb_make_path(ir->udev, ir->phys, sizeof(ir->phys)); - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; usb_to_input_id(ir->udev, &rc->input_id); rc->dev.parent = &intf->dev; diff --git a/drivers/media/rc/img-ir/img-ir-hw.c b/drivers/media/rc/img-ir/img-ir-hw.c index 8d1439622533..dd46973e0cbf 100644 --- a/drivers/media/rc/img-ir/img-ir-hw.c +++ b/drivers/media/rc/img-ir/img-ir-hw.c @@ -1083,7 +1083,7 @@ int img_ir_probe_hw(struct img_ir_priv *priv) rdev->priv = priv; rdev->map_name = RC_MAP_EMPTY; rdev->allowed_protocols = img_ir_allowed_protos(priv); - rdev->input_name = "IMG Infrared Decoder"; + rdev->device_name = "IMG Infrared Decoder"; rdev->s_filter = img_ir_set_normal_filter; rdev->s_wakeup_filter = img_ir_set_wakeup_filter; diff --git a/drivers/media/rc/img-ir/img-ir-raw.c b/drivers/media/rc/img-ir/img-ir-raw.c index 8d2f8e2006e7..7f23a863310c 100644 --- a/drivers/media/rc/img-ir/img-ir-raw.c +++ b/drivers/media/rc/img-ir/img-ir-raw.c @@ -117,7 +117,7 @@ int img_ir_probe_raw(struct img_ir_priv *priv) } rdev->priv = priv; rdev->map_name = RC_MAP_EMPTY; - rdev->input_name = "IMG Infrared Decoder Raw"; + rdev->device_name = "IMG Infrared Decoder Raw"; /* Register raw decoder */ error = rc_register_device(rdev); diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index 717ba782d7a7..ab560fdaae7c 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -2063,7 +2063,7 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) sizeof(ictx->phys_rdev)); strlcat(ictx->phys_rdev, "/input0", sizeof(ictx->phys_rdev)); - rdev->input_name = ictx->name_rdev; + rdev->device_name = ictx->name_rdev; rdev->input_phys = ictx->phys_rdev; usb_to_input_id(ictx->usbdev_intf0, &rdev->input_id); rdev->dev.parent = ictx->dev; diff --git a/drivers/media/rc/ir-hix5hd2.c b/drivers/media/rc/ir-hix5hd2.c index 50951f686852..0e639fb6f7b9 100644 --- a/drivers/media/rc/ir-hix5hd2.c +++ b/drivers/media/rc/ir-hix5hd2.c @@ -249,7 +249,7 @@ static int hix5hd2_ir_probe(struct platform_device *pdev) rdev->driver_name = IR_HIX5HD2_NAME; map_name = of_get_property(node, "linux,rc-map-name", NULL); rdev->map_name = map_name ?: RC_MAP_EMPTY; - rdev->input_name = IR_HIX5HD2_NAME; + rdev->device_name = IR_HIX5HD2_NAME; rdev->input_phys = IR_HIX5HD2_NAME "/input0"; rdev->input_id.bustype = BUS_HOST; rdev->input_id.vendor = 0x0001; diff --git a/drivers/media/rc/ir-spi.c b/drivers/media/rc/ir-spi.c index 7e383b3fedd5..29ed0638cb74 100644 --- a/drivers/media/rc/ir-spi.c +++ b/drivers/media/rc/ir-spi.c @@ -155,6 +155,7 @@ static int ir_spi_probe(struct spi_device *spi) idata->rc->tx_ir = ir_spi_tx; idata->rc->s_tx_carrier = 
ir_spi_set_tx_carrier; idata->rc->s_tx_duty_cycle = ir_spi_set_duty_cycle; + idata->rc->device_name = "IR SPI"; idata->rc->driver_name = IR_SPI_DRIVER_NAME; idata->rc->priv = idata; idata->spi = spi; diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index e9e4befbbebb..c8eea30b4e50 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -1576,7 +1576,7 @@ static int ite_probe(struct pnp_dev *pdev, const struct pnp_device_id rdev->s_tx_duty_cycle = ite_set_tx_duty_cycle; } - rdev->input_name = dev_desc->model; + rdev->device_name = dev_desc->model; rdev->input_id.bustype = BUS_HOST; rdev->input_id.vendor = PCI_VENDOR_ID_ITE; rdev->input_id.product = 0; diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index eb130694bbb8..d9c7bbd25253 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1264,7 +1264,7 @@ static struct rc_dev *mceusb_init_rc_dev(struct mceusb_dev *ir) usb_make_path(ir->usbdev, ir->phys, sizeof(ir->phys)); - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; usb_to_input_id(ir->usbdev, &rc->input_id); rc->dev.parent = dev; diff --git a/drivers/media/rc/meson-ir.c b/drivers/media/rc/meson-ir.c index 65566d569cb1..dfe3da487be0 100644 --- a/drivers/media/rc/meson-ir.c +++ b/drivers/media/rc/meson-ir.c @@ -138,7 +138,7 @@ static int meson_ir_probe(struct platform_device *pdev) } ir->rc->priv = ir; - ir->rc->input_name = DRIVER_NAME; + ir->rc->device_name = DRIVER_NAME; ir->rc->input_phys = DRIVER_NAME "/input0"; ir->rc->input_id.bustype = BUS_HOST; map_name = of_get_property(node, "linux,rc-map-name", NULL); diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c index 667277205fee..da4461fabce6 100644 --- a/drivers/media/rc/mtk-cir.c +++ b/drivers/media/rc/mtk-cir.c @@ -343,7 +343,7 @@ static int mtk_ir_probe(struct platform_device *pdev) } ir->rc->priv = ir; - ir->rc->input_name = MTK_IR_DEV; + ir->rc->device_name = MTK_IR_DEV; ir->rc->input_phys = MTK_IR_DEV "/input0"; ir->rc->input_id.bustype = BUS_HOST; ir->rc->input_id.vendor = 0x0001; diff --git a/drivers/media/rc/nuvoton-cir.c b/drivers/media/rc/nuvoton-cir.c index ec4b25bd2ec2..c44f723081cb 100644 --- a/drivers/media/rc/nuvoton-cir.c +++ b/drivers/media/rc/nuvoton-cir.c @@ -1134,7 +1134,7 @@ static int nvt_probe(struct pnp_dev *pdev, const struct pnp_device_id *dev_id) rdev->tx_ir = nvt_tx_ir; rdev->s_tx_carrier = nvt_set_tx_carrier; rdev->s_wakeup_filter = nvt_ir_raw_set_wakeup_filter; - rdev->input_name = "Nuvoton w836x7hg Infrared Remote Transceiver"; + rdev->device_name = "Nuvoton w836x7hg Infrared Remote Transceiver"; rdev->input_phys = "nuvoton/cir0"; rdev->input_id.bustype = BUS_HOST; rdev->input_id.vendor = PCI_VENDOR_ID_WINBOND2; diff --git a/drivers/media/rc/rc-loopback.c b/drivers/media/rc/rc-loopback.c index 62195af24fbe..46cc9c29d68a 100644 --- a/drivers/media/rc/rc-loopback.c +++ b/drivers/media/rc/rc-loopback.c @@ -219,7 +219,7 @@ static int __init loop_init(void) return -ENOMEM; } - rc->input_name = "rc-core loopback device"; + rc->device_name = "rc-core loopback device"; rc->input_phys = "rc-core/virtual"; rc->input_id.bustype = BUS_VIRTUAL; rc->input_id.version = 1; diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 33c04ccccdff..f306e67b8b66 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -530,7 +530,7 @@ u32 rc_g_keycode_from_table(struct rc_dev *dev, u32 scancode) if (keycode != KEY_RESERVED) IR_dprintk(1, "%s: scancode 0x%04x 
keycode 0x%02x\n", - dev->input_name, scancode, keycode); + dev->device_name, scancode, keycode); return keycode; } @@ -663,7 +663,7 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_type protocol, dev->last_keycode = keycode; IR_dprintk(1, "%s: key down event, key 0x%04x, protocol 0x%04x, scancode 0x%08x\n", - dev->input_name, keycode, protocol, scancode); + dev->device_name, keycode, protocol, scancode); input_report_key(dev->input_dev, keycode, 1); led_trigger_event(led_feedback, LED_FULL); @@ -1663,7 +1663,7 @@ static int rc_prepare_rx_device(struct rc_dev *dev) dev->input_dev->dev.parent = &dev->dev; memcpy(&dev->input_dev->id, &dev->input_id, sizeof(dev->input_id)); dev->input_dev->phys = dev->input_phys; - dev->input_dev->name = dev->input_name; + dev->input_dev->name = dev->device_name; return 0; @@ -1759,7 +1759,7 @@ int rc_register_device(struct rc_dev *dev) path = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); dev_info(&dev->dev, "%s as %s\n", - dev->input_name ?: "Unspecified device", path ?: "N/A"); + dev->device_name ?: "Unspecified device", path ?: "N/A"); kfree(path); if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { diff --git a/drivers/media/rc/redrat3.c b/drivers/media/rc/redrat3.c index 56d43be2756b..29fa37b99553 100644 --- a/drivers/media/rc/redrat3.c +++ b/drivers/media/rc/redrat3.c @@ -951,7 +951,7 @@ static struct rc_dev *redrat3_init_rc_dev(struct redrat3_dev *rr3) usb_make_path(rr3->udev, rr3->phys, sizeof(rr3->phys)); - rc->input_name = rr3->name; + rc->device_name = rr3->name; rc->input_phys = rr3->phys; usb_to_input_id(rr3->udev, &rc->input_id); rc->dev.parent = dev; diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c index 77d5d4cbed0a..9a5e9fa01196 100644 --- a/drivers/media/rc/serial_ir.c +++ b/drivers/media/rc/serial_ir.c @@ -513,19 +513,19 @@ static int serial_ir_probe(struct platform_device *dev) switch (type) { case IR_HOMEBREW: - rcdev->input_name = "Serial IR type home-brew"; + rcdev->device_name = "Serial IR type home-brew"; break; case IR_IRDEO: - rcdev->input_name = "Serial IR type IRdeo"; + rcdev->device_name = "Serial IR type IRdeo"; break; case IR_IRDEO_REMOTE: - rcdev->input_name = "Serial IR type IRdeo remote"; + rcdev->device_name = "Serial IR type IRdeo remote"; break; case IR_ANIMAX: - rcdev->input_name = "Serial IR type AnimaX"; + rcdev->device_name = "Serial IR type AnimaX"; break; case IR_IGOR: - rcdev->input_name = "Serial IR type IgorPlug"; + rcdev->device_name = "Serial IR type IgorPlug"; break; } diff --git a/drivers/media/rc/sir_ir.c b/drivers/media/rc/sir_ir.c index be67f7c9a75b..83b4410664af 100644 --- a/drivers/media/rc/sir_ir.c +++ b/drivers/media/rc/sir_ir.c @@ -308,7 +308,7 @@ static int sir_ir_probe(struct platform_device *dev) if (!rcdev) return -ENOMEM; - rcdev->input_name = "SIR IrDA port"; + rcdev->device_name = "SIR IrDA port"; rcdev->input_phys = KBUILD_MODNAME "/input0"; rcdev->input_id.bustype = BUS_HOST; rcdev->input_id.vendor = 0x0001; diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c index a08e1dd06124..272de9c8a9f6 100644 --- a/drivers/media/rc/st_rc.c +++ b/drivers/media/rc/st_rc.c @@ -299,7 +299,7 @@ static int st_rc_probe(struct platform_device *pdev) rdev->close = st_rc_close; rdev->driver_name = IR_ST_NAME; rdev->map_name = RC_MAP_EMPTY; - rdev->input_name = "ST Remote Control Receiver"; + rdev->device_name = "ST Remote Control Receiver"; ret = rc_register_device(rdev); if (ret < 0) diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c index 
b09c45abb5f3..829f2b348a46 100644 --- a/drivers/media/rc/streamzap.c +++ b/drivers/media/rc/streamzap.c @@ -299,7 +299,7 @@ static struct rc_dev *streamzap_init_rc_dev(struct streamzap_ir *sz) usb_make_path(sz->usbdev, sz->phys, sizeof(sz->phys)); strlcat(sz->phys, "/input0", sizeof(sz->phys)); - rdev->input_name = sz->name; + rdev->device_name = sz->name; rdev->input_phys = sz->phys; usb_to_input_id(sz->usbdev, &rdev->input_id); rdev->dev.parent = dev; diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index 4b785dd775c1..87933eb14205 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -215,7 +215,7 @@ static int sunxi_ir_probe(struct platform_device *pdev) } ir->rc->priv = ir; - ir->rc->input_name = SUNXI_IR_DEV; + ir->rc->device_name = SUNXI_IR_DEV; ir->rc->input_phys = "sunxi-ir/input0"; ir->rc->input_id.bustype = BUS_HOST; ir->rc->input_id.vendor = 0x0001; diff --git a/drivers/media/rc/ttusbir.c b/drivers/media/rc/ttusbir.c index 23be7702e2df..5002a91e830e 100644 --- a/drivers/media/rc/ttusbir.c +++ b/drivers/media/rc/ttusbir.c @@ -309,7 +309,7 @@ static int ttusbir_probe(struct usb_interface *intf, usb_make_path(tt->udev, tt->phys, sizeof(tt->phys)); - rc->input_name = DRIVER_DESC; + rc->device_name = DRIVER_DESC; rc->input_phys = tt->phys; usb_to_input_id(tt->udev, &rc->input_id); rc->dev.parent = &intf->dev; diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index 5a4d4a611197..ea7be6d35ff8 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -1068,7 +1068,7 @@ wbcir_probe(struct pnp_dev *device, const struct pnp_device_id *dev_id) } data->dev->driver_name = DRVNAME; - data->dev->input_name = WBCIR_NAME; + data->dev->device_name = WBCIR_NAME; data->dev->input_phys = "wbcir/cir0"; data->dev->input_id.bustype = BUS_HOST; data->dev->input_id.vendor = PCI_VENDOR_ID_WINBOND; diff --git a/drivers/media/usb/au0828/au0828-input.c b/drivers/media/usb/au0828/au0828-input.c index 9d82ec0a4b64..9ae42ebefa8a 100644 --- a/drivers/media/usb/au0828/au0828-input.c +++ b/drivers/media/usb/au0828/au0828-input.c @@ -335,7 +335,7 @@ int au0828_rc_register(struct au0828_dev *dev) usb_make_path(dev->usbdev, ir->phys, sizeof(ir->phys)); strlcat(ir->phys, "/input0", sizeof(ir->phys)); - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; rc->input_id.bustype = BUS_USB; rc->input_id.version = 1; diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c index 955fb0d07507..096bb75a24e5 100644 --- a/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c +++ b/drivers/media/usb/dvb-usb-v2/dvb_usb_core.c @@ -154,13 +154,12 @@ static int dvb_usbv2_remote_init(struct dvb_usb_device *d) } dev->dev.parent = &d->udev->dev; - dev->input_name = d->name; + dev->device_name = d->name; usb_make_path(d->udev, d->rc_phys, sizeof(d->rc_phys)); strlcat(d->rc_phys, "/ir0", sizeof(d->rc_phys)); dev->input_phys = d->rc_phys; usb_to_input_id(d->udev, &dev->input_id); - /* TODO: likely RC-core should took const char * */ - dev->driver_name = (char *) d->props->driver_name; + dev->driver_name = d->props->driver_name; dev->map_name = d->rc.map_name; dev->allowed_protocols = d->rc.allowed_protos; dev->change_protocol = d->rc.change_protocol; diff --git a/drivers/media/usb/dvb-usb/dvb-usb-remote.c b/drivers/media/usb/dvb-usb/dvb-usb-remote.c index f05f1fc80729..0b03f9bd9c26 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb-remote.c +++ 
b/drivers/media/usb/dvb-usb/dvb-usb-remote.c @@ -279,7 +279,7 @@ static int rc_core_dvb_usb_remote_init(struct dvb_usb_device *d) dev->change_protocol = d->props.rc.core.change_protocol; dev->allowed_protocols = d->props.rc.core.allowed_protos; usb_to_input_id(d->udev, &dev->input_id); - dev->input_name = "IR-receiver inside an USB DVB receiver"; + dev->device_name = "IR-receiver inside an USB DVB receiver"; dev->input_phys = d->rc_phys; dev->dev.parent = &d->udev->dev; dev->priv = d; diff --git a/drivers/media/usb/em28xx/em28xx-input.c b/drivers/media/usb/em28xx/em28xx-input.c index ca9673917ad5..d8746b96a0f8 100644 --- a/drivers/media/usb/em28xx/em28xx-input.c +++ b/drivers/media/usb/em28xx/em28xx-input.c @@ -807,7 +807,7 @@ static int em28xx_ir_init(struct em28xx *dev) usb_make_path(udev, ir->phys, sizeof(ir->phys)); strlcat(ir->phys, "/input0", sizeof(ir->phys)); - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; rc->input_id.bustype = BUS_USB; rc->input_id.version = 1; diff --git a/drivers/media/usb/tm6000/tm6000-input.c b/drivers/media/usb/tm6000/tm6000-input.c index 1a033f57fcc1..83e33aef0105 100644 --- a/drivers/media/usb/tm6000/tm6000-input.c +++ b/drivers/media/usb/tm6000/tm6000-input.c @@ -458,7 +458,7 @@ int tm6000_ir_init(struct tm6000_core *dev) rc_type = RC_BIT_UNKNOWN; tm6000_ir_change_protocol(rc, &rc_type); - rc->input_name = ir->name; + rc->device_name = ir->name; rc->input_phys = ir->phys; rc->input_id.bustype = BUS_USB; rc->input_id.version = 1; diff --git a/include/media/cec.h b/include/media/cec.h index 224359c9941a..d97aa6c32abd 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -203,7 +203,7 @@ struct cec_adapter { u16 phys_addrs[15]; u32 sequence; - char input_name[32]; + char device_name[32]; char input_phys[32]; char input_drv[32]; }; diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 78dea39a9b39..16bd89caa22d 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -72,7 +72,7 @@ enum rc_filter_type { * @dev: driver model's view of this device * @managed_alloc: devm_rc_allocate_device was used to create rc_dev * @sysfs_groups: sysfs attribute groups - * @input_name: name of the input child device + * @device_name: name of the rc child device * @input_phys: physical path to the input child device * @input_id: id of the input child device (struct input_id) * @driver_name: name of the hardware driver which registered this device @@ -138,10 +138,10 @@ struct rc_dev { struct device dev; bool managed_alloc; const struct attribute_group *sysfs_groups[5]; - const char *input_name; + const char *device_name; const char *input_phys; struct input_id input_id; - char *driver_name; + const char *driver_name; const char *map_name; struct rc_map rc_map; struct mutex lock; -- cgit v1.2.3 From e8ffda78623677f7935b30901a173405b4861bda Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sun, 30 Jul 2017 09:23:11 -0400 Subject: media: rc: ir-nec-decoder: move scancode composing code into a shared function The NEC scancode composing and protocol type detection in ir_nec_decode() is generic enough to be a shared function. Let's create an inline function in rc-core.h, so that other remote control drivers can reuse this function to save some code. 
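As an illustration of the intended reuse, a driver that has already recovered
the four NEC bytes from its hardware could report a keypress as in the sketch
below. The helper name and signature are the ones added by this patch;
report_nec_frame() and the byte layout of "raw" are hypothetical:

	#include <media/rc-core.h>

	/*
	 * Hypothetical example: "raw" packs an NEC frame as
	 * [not_command | command | not_address | address], low byte first.
	 */
	static void report_nec_frame(struct rc_dev *rcd, u32 raw)
	{
		u8 address     = raw & 0xff;
		u8 not_address = (raw >> 8) & 0xff;
		u8 command     = (raw >> 16) & 0xff;
		u8 not_command = (raw >> 24) & 0xff;
		enum rc_type rc_type;
		u32 scancode;

		/* Composes the scancode and picks NEC, NECX or NEC32 */
		scancode = ir_nec_bytes_to_scancode(address, not_address,
						    command, not_command,
						    &rc_type);
		rc_keydown(rcd, rc_type, scancode, 0);
	}
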
Signed-off-by: Shawn Guo Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/ir-nec-decoder.c | 32 +++----------------------------- include/media/rc-core.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 75b9137f6faf..23d2bc8c190d 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -51,7 +51,6 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) u32 scancode; enum rc_type rc_type; u8 address, not_address, command, not_command; - bool send_32bits = false; if (!is_timing_event(ev)) { if (ev.reset) @@ -157,34 +156,9 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) command = bitrev8((data->bits >> 8) & 0xff); not_command = bitrev8((data->bits >> 0) & 0xff); - if ((command ^ not_command) != 0xff) { - IR_dprintk(1, "NEC checksum error: received 0x%08x\n", - data->bits); - send_32bits = true; - } - - if (send_32bits) { - /* NEC transport, but modified protocol, used by at - * least Apple and TiVo remotes */ - scancode = not_address << 24 | - address << 16 | - not_command << 8 | - command; - IR_dprintk(1, "NEC (modified) scancode 0x%08x\n", scancode); - rc_type = RC_TYPE_NEC32; - } else if ((address ^ not_address) != 0xff) { - /* Extended NEC */ - scancode = address << 16 | - not_address << 8 | - command; - IR_dprintk(1, "NEC (Ext) scancode 0x%06x\n", scancode); - rc_type = RC_TYPE_NECX; - } else { - /* Normal NEC */ - scancode = address << 8 | command; - IR_dprintk(1, "NEC scancode 0x%04x\n", scancode); - rc_type = RC_TYPE_NEC; - } + scancode = ir_nec_bytes_to_scancode(address, not_address, + command, not_command, + &rc_type); if (data->is_nec_x) data->necx_repeat = true; diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 16bd89caa22d..b6c18840d125 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -340,4 +340,35 @@ static inline u32 ir_extract_bits(u32 data, u32 mask) return value; } +/* Get NEC scancode and protocol type from address and command bytes */ +static inline u32 ir_nec_bytes_to_scancode(u8 address, u8 not_address, + u8 command, u8 not_command, + enum rc_type *protocol) +{ + u32 scancode; + + if ((command ^ not_command) != 0xff) { + /* NEC transport, but modified protocol, used by at + * least Apple and TiVo remotes + */ + scancode = not_address << 24 | + address << 16 | + not_command << 8 | + command; + *protocol = RC_TYPE_NEC32; + } else if ((address ^ not_address) != 0xff) { + /* Extended NEC */ + scancode = address << 16 | + not_address << 8 | + command; + *protocol = RC_TYPE_NECX; + } else { + /* Normal NEC */ + scancode = address << 8 | command; + *protocol = RC_TYPE_NEC; + } + + return scancode; +} + #endif /* _RC_CORE */ -- cgit v1.2.3 From b429996ced6f599b20ffc79789a54c65a21b0d96 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Sun, 30 Jul 2017 09:23:13 -0400 Subject: media: rc: add zx-irdec remote control driver It adds the remote control driver and corresponding keymap file for IRDEC block found on ZTE ZX family SoCs. 
Signed-off-by: Shawn Guo Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/rc/Kconfig | 11 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/keymaps/Makefile | 3 +- drivers/media/rc/keymaps/rc-zx-irdec.c | 79 ++++++++++++++ drivers/media/rc/zx-irdec.c | 183 +++++++++++++++++++++++++++++++++ include/media/rc-map.h | 1 + 6 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 drivers/media/rc/keymaps/rc-zx-irdec.c create mode 100644 drivers/media/rc/zx-irdec.c (limited to 'include') diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index d27be2516974..d9ce8ff55d0c 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -469,6 +469,17 @@ config IR_SIR To compile this driver as a module, choose M here: the module will be called sir-ir. +config IR_ZX + tristate "ZTE ZX IR remote control" + depends on RC_CORE + depends on ARCH_ZX || COMPILE_TEST + ---help--- + Say Y if you want to use the IR remote control available + on ZTE ZX family SoCs. + + To compile this driver as a module, choose M here: the + module will be called zx-irdec. + endif #RC_DEVICES endif #RC_CORE diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 466c40216347..9bc6a3980ed0 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -43,3 +43,4 @@ obj-$(CONFIG_IR_IMG) += img-ir/ obj-$(CONFIG_IR_SERIAL) += serial_ir.o obj-$(CONFIG_IR_SIR) += sir_ir.o obj-$(CONFIG_IR_MTK) += mtk-cir.o +obj-$(CONFIG_IR_ZX) += zx-irdec.o diff --git a/drivers/media/rc/keymaps/Makefile b/drivers/media/rc/keymaps/Makefile index 2945f99907b5..af6496d709fb 100644 --- a/drivers/media/rc/keymaps/Makefile +++ b/drivers/media/rc/keymaps/Makefile @@ -109,4 +109,5 @@ obj-$(CONFIG_RC_MAP) += rc-adstech-dvb-t-pci.o \ rc-videomate-tv-pvr.o \ rc-winfast.o \ rc-winfast-usbii-deluxe.o \ - rc-su3000.o + rc-su3000.o \ + rc-zx-irdec.o diff --git a/drivers/media/rc/keymaps/rc-zx-irdec.c b/drivers/media/rc/keymaps/rc-zx-irdec.c new file mode 100644 index 000000000000..cc889df47eb8 --- /dev/null +++ b/drivers/media/rc/keymaps/rc-zx-irdec.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2017 Sanechips Technology Co., Ltd. + * Copyright 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include + +static struct rc_map_table zx_irdec_table[] = { + { 0x01, KEY_1 }, + { 0x02, KEY_2 }, + { 0x03, KEY_3 }, + { 0x04, KEY_4 }, + { 0x05, KEY_5 }, + { 0x06, KEY_6 }, + { 0x07, KEY_7 }, + { 0x08, KEY_8 }, + { 0x09, KEY_9 }, + { 0x31, KEY_0 }, + { 0x16, KEY_DELETE }, + { 0x0a, KEY_MODE }, /* Input method */ + { 0x0c, KEY_VOLUMEUP }, + { 0x18, KEY_VOLUMEDOWN }, + { 0x0b, KEY_CHANNELUP }, + { 0x15, KEY_CHANNELDOWN }, + { 0x0d, KEY_PAGEUP }, + { 0x13, KEY_PAGEDOWN }, + { 0x46, KEY_FASTFORWARD }, + { 0x43, KEY_REWIND }, + { 0x44, KEY_PLAYPAUSE }, + { 0x45, KEY_STOP }, + { 0x49, KEY_OK }, + { 0x47, KEY_UP }, + { 0x4b, KEY_DOWN }, + { 0x48, KEY_LEFT }, + { 0x4a, KEY_RIGHT }, + { 0x4d, KEY_MENU }, + { 0x56, KEY_APPSELECT }, /* Application */ + { 0x4c, KEY_BACK }, + { 0x1e, KEY_INFO }, + { 0x4e, KEY_F1 }, + { 0x4f, KEY_F2 }, + { 0x50, KEY_F3 }, + { 0x51, KEY_F4 }, + { 0x1c, KEY_AUDIO }, + { 0x12, KEY_MUTE }, + { 0x11, KEY_DOT }, /* Location */ + { 0x1d, KEY_SETUP }, + { 0x40, KEY_POWER }, +}; + +static struct rc_map_list zx_irdec_map = { + .map = { + .scan = zx_irdec_table, + .size = ARRAY_SIZE(zx_irdec_table), + .rc_type = RC_TYPE_NEC, + .name = RC_MAP_ZX_IRDEC, + } +}; + +static int __init init_rc_map_zx_irdec(void) +{ + return rc_map_register(&zx_irdec_map); +} + +static void __exit exit_rc_map_zx_irdec(void) +{ + rc_map_unregister(&zx_irdec_map); +} + +module_init(init_rc_map_zx_irdec) +module_exit(exit_rc_map_zx_irdec) + +MODULE_AUTHOR("Shawn Guo "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/media/rc/zx-irdec.c b/drivers/media/rc/zx-irdec.c new file mode 100644 index 000000000000..9452bac9262c --- /dev/null +++ b/drivers/media/rc/zx-irdec.c @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2017 Sanechips Technology Co., Ltd. + * Copyright 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DRIVER_NAME "zx-irdec" + +#define ZX_IR_ENABLE 0x04 +#define ZX_IREN BIT(0) +#define ZX_IR_CTRL 0x08 +#define ZX_DEGL_MASK GENMASK(21, 20) +#define ZX_DEGL_VALUE(x) (((x) << 20) & ZX_DEGL_MASK) +#define ZX_WDBEGIN_MASK GENMASK(18, 8) +#define ZX_WDBEGIN_VALUE(x) (((x) << 8) & ZX_WDBEGIN_MASK) +#define ZX_IR_INTEN 0x10 +#define ZX_IR_INTSTCLR 0x14 +#define ZX_IR_CODE 0x30 +#define ZX_IR_CNUM 0x34 +#define ZX_NECRPT BIT(16) + +struct zx_irdec { + void __iomem *base; + struct rc_dev *rcd; +}; + +static void zx_irdec_set_mask(struct zx_irdec *irdec, unsigned int reg, + u32 mask, u32 value) +{ + u32 data; + + data = readl(irdec->base + reg); + data &= ~mask; + data |= value & mask; + writel(data, irdec->base + reg); +} + +static irqreturn_t zx_irdec_irq(int irq, void *dev_id) +{ + struct zx_irdec *irdec = dev_id; + u8 address, not_address; + u8 command, not_command; + u32 rawcode, scancode; + enum rc_type rc_type; + + /* Clear interrupt */ + writel(1, irdec->base + ZX_IR_INTSTCLR); + + /* Check repeat frame */ + if (readl(irdec->base + ZX_IR_CNUM) & ZX_NECRPT) { + rc_repeat(irdec->rcd); + goto done; + } + + rawcode = readl(irdec->base + ZX_IR_CODE); + not_command = (rawcode >> 24) & 0xff; + command = (rawcode >> 16) & 0xff; + not_address = (rawcode >> 8) & 0xff; + address = rawcode & 0xff; + + scancode = ir_nec_bytes_to_scancode(address, not_address, + command, not_command, + &rc_type); + rc_keydown(irdec->rcd, rc_type, scancode, 0); + +done: + return IRQ_HANDLED; +} + +static int zx_irdec_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct zx_irdec *irdec; + struct resource *res; + struct rc_dev *rcd; + int irq; + int ret; + + irdec = devm_kzalloc(dev, sizeof(*irdec), GFP_KERNEL); + if (!irdec) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + irdec->base = devm_ioremap_resource(dev, res); + if (IS_ERR(irdec->base)) + return PTR_ERR(irdec->base); + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + rcd = devm_rc_allocate_device(dev, RC_DRIVER_SCANCODE); + if (!rcd) { + dev_err(dev, "failed to allocate rc device\n"); + return -ENOMEM; + } + + irdec->rcd = rcd; + + rcd->priv = irdec; + rcd->input_phys = DRIVER_NAME "/input0"; + rcd->input_id.bustype = BUS_HOST; + rcd->map_name = RC_MAP_ZX_IRDEC; + rcd->allowed_protocols = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32; + rcd->driver_name = DRIVER_NAME; + rcd->device_name = DRIVER_NAME; + + platform_set_drvdata(pdev, irdec); + + ret = devm_rc_register_device(dev, rcd); + if (ret) { + dev_err(dev, "failed to register rc device\n"); + return ret; + } + + ret = devm_request_irq(dev, irq, zx_irdec_irq, 0, NULL, irdec); + if (ret) { + dev_err(dev, "failed to request irq\n"); + return ret; + } + + /* + * Initialize deglitch level and watchdog counter beginner as + * recommended by vendor BSP code. 
+ */ + zx_irdec_set_mask(irdec, ZX_IR_CTRL, ZX_DEGL_MASK, ZX_DEGL_VALUE(0)); + zx_irdec_set_mask(irdec, ZX_IR_CTRL, ZX_WDBEGIN_MASK, + ZX_WDBEGIN_VALUE(0x21c)); + + /* Enable interrupt */ + writel(1, irdec->base + ZX_IR_INTEN); + + /* Enable the decoder */ + zx_irdec_set_mask(irdec, ZX_IR_ENABLE, ZX_IREN, ZX_IREN); + + return 0; +} + +static int zx_irdec_remove(struct platform_device *pdev) +{ + struct zx_irdec *irdec = platform_get_drvdata(pdev); + + /* Disable the decoder */ + zx_irdec_set_mask(irdec, ZX_IR_ENABLE, ZX_IREN, 0); + + /* Disable interrupt */ + writel(0, irdec->base + ZX_IR_INTEN); + + return 0; +} + +static const struct of_device_id zx_irdec_match[] = { + { .compatible = "zte,zx296718-irdec" }, + { }, +}; +MODULE_DEVICE_TABLE(of, zx_irdec_match); + +static struct platform_driver zx_irdec_driver = { + .probe = zx_irdec_probe, + .remove = zx_irdec_remove, + .driver = { + .name = DRIVER_NAME, + .of_match_table = zx_irdec_match, + }, +}; +module_platform_driver(zx_irdec_driver); + +MODULE_DESCRIPTION("ZTE ZX IR remote control driver"); +MODULE_AUTHOR("Shawn Guo "); +MODULE_LICENSE("GPL v2"); diff --git a/include/media/rc-map.h b/include/media/rc-map.h index 1a815a572fa1..c69482852f29 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -313,6 +313,7 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_WINFAST "rc-winfast" #define RC_MAP_WINFAST_USBII_DELUXE "rc-winfast-usbii-deluxe" #define RC_MAP_SU3000 "rc-su3000" +#define RC_MAP_ZX_IRDEC "rc-zx-irdec" /* * Please, do not just append newer Remote Controller names at the end. -- cgit v1.2.3 From 86fe1ac0d563477b1d10d49a92237e3f3d74e7be Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 7 Aug 2017 08:38:10 -0400 Subject: media: rc: simplify ir_raw_event_store_edge() Since commit 12749b198fa4 ("[media] rc: saa7134: add trailing space for timely decoding"), the workaround of inserting reset events is no longer needed. Note that the initial reset is not needed either; other rc-core drivers that don't use ir_raw_event_store_edge() never call this at all. Verified on a HVR-1150 and Raspberry Pi. Fixes: 3f5c4c73322e ("[media] rc: fix ghost keypresses with certain hw") Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/pci/saa7134/saa7134-input.c | 2 +- drivers/media/rc/gpio-ir-recv.c | 6 +----- drivers/media/rc/img-ir/img-ir-raw.c | 4 ++-- drivers/media/rc/rc-core-priv.h | 1 - drivers/media/rc/rc-ir-raw.c | 31 +++++-------------------------- include/media/rc-core.h | 10 +--------- 6 files changed, 10 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/drivers/media/pci/saa7134/saa7134-input.c b/drivers/media/pci/saa7134/saa7134-input.c index 4b58c129be92..e7b386ee3ff9 100644 --- a/drivers/media/pci/saa7134/saa7134-input.c +++ b/drivers/media/pci/saa7134/saa7134-input.c @@ -1055,7 +1055,7 @@ static int saa7134_raw_decode_irq(struct saa7134_dev *dev) saa_clearb(SAA7134_GPIO_GPMODE3, SAA7134_GPIO_GPRESCAN); saa_setb(SAA7134_GPIO_GPMODE3, SAA7134_GPIO_GPRESCAN); space = saa_readl(SAA7134_GPIO_GPSTATUS0 >> 2) & ir->mask_keydown; - ir_raw_event_store_edge(dev->remote->dev, space ? 
IR_SPACE : IR_PULSE); + ir_raw_event_store_edge(dev->remote->dev, !space); return 1; } diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 512e31593a77..24c7ac8f1b82 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -76,7 +76,6 @@ static irqreturn_t gpio_ir_recv_irq(int irq, void *dev_id) struct gpio_rc_dev *gpio_dev = dev_id; int gval; int rc = 0; - enum raw_event_type type = IR_SPACE; gval = gpio_get_value(gpio_dev->gpio_nr); @@ -86,10 +85,7 @@ static irqreturn_t gpio_ir_recv_irq(int irq, void *dev_id) if (gpio_dev->active_low) gval = !gval; - if (gval == 1) - type = IR_PULSE; - - rc = ir_raw_event_store_edge(gpio_dev->rcdev, type); + rc = ir_raw_event_store_edge(gpio_dev->rcdev, gval == 1); if (rc < 0) goto err_get_value; diff --git a/drivers/media/rc/img-ir/img-ir-raw.c b/drivers/media/rc/img-ir/img-ir-raw.c index 7f23a863310c..64714efc1145 100644 --- a/drivers/media/rc/img-ir/img-ir-raw.c +++ b/drivers/media/rc/img-ir/img-ir-raw.c @@ -40,9 +40,9 @@ static void img_ir_refresh_raw(struct img_ir_priv *priv, u32 irq_status) /* report the edge to the IR raw decoders */ if (ir_status) /* low */ - ir_raw_event_store_edge(rc_dev, IR_SPACE); + ir_raw_event_store_edge(rc_dev, false); else /* high */ - ir_raw_event_store_edge(rc_dev, IR_PULSE); + ir_raw_event_store_edge(rc_dev, true); ir_raw_event_handle(rc_dev); } diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index cae13efc1a88..5e5b10fbc47e 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -41,7 +41,6 @@ struct ir_raw_event_ctrl { /* fifo for the pulse/space durations */ DECLARE_KFIFO(kfifo, struct ir_raw_event, MAX_IR_EVENT_SIZE); ktime_t last_event; /* when last event occurred */ - enum raw_event_type last_type; /* last event type */ struct rc_dev *dev; /* pointer to the parent rc_dev */ /* edge driver */ struct timer_list edge_handle; diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c index ef5efd994eef..1761be8c7028 100644 --- a/drivers/media/rc/rc-ir-raw.c +++ b/drivers/media/rc/rc-ir-raw.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL_GPL(ir_raw_event_store); /** * ir_raw_event_store_edge() - notify raw ir decoders of the start of a pulse/space * @dev: the struct rc_dev device descriptor - * @type: the type of the event that has occurred + * @pulse: true for pulse, false for space * * This routine (which may be called from an interrupt context) is used to * store the beginning of an ir pulse or space (or the start/end of ir @@ -96,43 +96,22 @@ EXPORT_SYMBOL_GPL(ir_raw_event_store); * hardware which does not provide durations directly but only interrupts * (or similar events) on state change. */ -int ir_raw_event_store_edge(struct rc_dev *dev, enum raw_event_type type) +int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse) { ktime_t now; - s64 delta; /* ns */ DEFINE_IR_RAW_EVENT(ev); int rc = 0; - int delay; if (!dev->raw) return -EINVAL; now = ktime_get(); - delta = ktime_to_ns(ktime_sub(now, dev->raw->last_event)); - delay = MS_TO_NS(dev->input_dev->rep[REP_DELAY]); + ev.duration = ktime_sub(now, dev->raw->last_event); + ev.pulse = !pulse; - /* Check for a long duration since last event or if we're - * being called for the first time, note that delta can't - * possibly be negative. 
- */ - if (delta > delay || !dev->raw->last_type) - type |= IR_START_EVENT; - else - ev.duration = delta; - - if (type & IR_START_EVENT) - ir_raw_event_reset(dev); - else if (dev->raw->last_type & IR_SPACE) { - ev.pulse = false; - rc = ir_raw_event_store(dev, &ev); - } else if (dev->raw->last_type & IR_PULSE) { - ev.pulse = true; - rc = ir_raw_event_store(dev, &ev); - } else - return 0; + rc = ir_raw_event_store(dev, &ev); dev->raw->last_event = now; - dev->raw->last_type = type; /* timer could be set to timeout (125ms by default) */ if (!timer_pending(&dev->raw->edge_handle) || diff --git a/include/media/rc-core.h b/include/media/rc-core.h index b6c18840d125..5be527ff851d 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -272,14 +272,6 @@ u32 rc_g_keycode_from_table(struct rc_dev *dev, u32 scancode); * The Raw interface is specific to InfraRed. It may be a good idea to * split it later into a separate header. */ - -enum raw_event_type { - IR_SPACE = (1 << 0), - IR_PULSE = (1 << 1), - IR_START_EVENT = (1 << 2), - IR_STOP_EVENT = (1 << 3), -}; - struct ir_raw_event { union { u32 duration; @@ -308,7 +300,7 @@ static inline void init_ir_raw_event(struct ir_raw_event *ev) void ir_raw_event_handle(struct rc_dev *dev); int ir_raw_event_store(struct rc_dev *dev, struct ir_raw_event *ev); -int ir_raw_event_store_edge(struct rc_dev *dev, enum raw_event_type type); +int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse); int ir_raw_event_store_with_filter(struct rc_dev *dev, struct ir_raw_event *ev); void ir_raw_event_set_idle(struct rc_dev *dev, bool idle); -- cgit v1.2.3 From a9a249a2c997506a64eaee22f1458fda893f62a8 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 7 Aug 2017 09:31:24 -0400 Subject: media: cec: fix remote control passthrough The 'Press and Hold' operation was not correctly implemented, in particular the requirement that the repeat doesn't start until the second identical keypress arrives. The REP_DELAY value also had to be adjusted (see the comment in the code) to achieve the desired behavior. The 'enabled_protocols' field was also never set, fix that too. Since CEC is a fixed protocol the driver has to set this field. Signed-off-by: Hans Verkuil Acked-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-adap.c | 56 ++++++++++++++++++++++++++++++++++++++++---- drivers/media/cec/cec-core.c | 13 ++++++++++ include/media/cec.h | 5 ++++ 3 files changed, 69 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index d9adeb505c09..31d25e00d011 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -1767,6 +1767,9 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, int la_idx = cec_log_addr2idx(adap, dest_laddr); bool from_unregistered = init_laddr == 0xf; struct cec_msg tx_cec_msg = { }; +#ifdef CONFIG_MEDIA_CEC_RC + int scancode; +#endif dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg); @@ -1855,11 +1858,9 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, */ case 0x60: if (msg->len == 2) - rc_keydown(adap->rc, RC_TYPE_CEC, - msg->msg[2], 0); + scancode = msg->msg[2]; else - rc_keydown(adap->rc, RC_TYPE_CEC, - msg->msg[2] << 8 | msg->msg[3], 0); + scancode = msg->msg[2] << 8 | msg->msg[3]; break; /* * Other function messages that are not handled. 
@@ -1872,11 +1873,54 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, */ case 0x56: case 0x57: case 0x67: case 0x68: case 0x69: case 0x6a: + scancode = -1; break; default: - rc_keydown(adap->rc, RC_TYPE_CEC, msg->msg[2], 0); + scancode = msg->msg[2]; + break; + } + + /* Was repeating, but keypress timed out */ + if (adap->rc_repeating && !adap->rc->keypressed) { + adap->rc_repeating = false; + adap->rc_last_scancode = -1; + } + /* Different keypress from last time, ends repeat mode */ + if (adap->rc_last_scancode != scancode) { + rc_keyup(adap->rc); + adap->rc_repeating = false; + } + /* We can't handle this scancode */ + if (scancode < 0) { + adap->rc_last_scancode = scancode; + break; + } + + /* Send key press */ + rc_keydown(adap->rc, RC_TYPE_CEC, scancode, 0); + + /* When in repeating mode, we're done */ + if (adap->rc_repeating) + break; + + /* + * We are not repeating, but the new scancode is + * the same as the last one, and this second key press is + * within 550 ms (the 'Follower Safety Timeout') from the + * previous key press, so we now enable the repeating mode. + */ + if (adap->rc_last_scancode == scancode && + msg->rx_ts - adap->rc_last_keypress < 550 * NSEC_PER_MSEC) { + adap->rc_repeating = true; break; } + /* + * Not in repeating mode, so avoid triggering repeat mode + * by calling keyup. + */ + rc_keyup(adap->rc); + adap->rc_last_scancode = scancode; + adap->rc_last_keypress = msg->rx_ts; #endif break; @@ -1886,6 +1930,8 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, break; #ifdef CONFIG_MEDIA_CEC_RC rc_keyup(adap->rc); + adap->rc_repeating = false; + adap->rc_last_scancode = -1; #endif break; diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index efb7bbbc941f..fcd01577cd1c 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -276,9 +276,11 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, adap->rc->input_id.version = 1; adap->rc->driver_name = CEC_NAME; adap->rc->allowed_protocols = RC_BIT_CEC; + adap->rc->enabled_protocols = RC_BIT_CEC; adap->rc->priv = adap; adap->rc->map_name = RC_MAP_CEC; adap->rc->timeout = MS_TO_NS(100); + adap->rc_last_scancode = -1; #endif return adap; } @@ -310,6 +312,17 @@ int cec_register_adapter(struct cec_adapter *adap, adap->rc = NULL; return res; } + /* + * The REP_DELAY for CEC is really the time between the initial + * 'User Control Pressed' message and the second. The first + * keypress is always seen as non-repeating, the second + * (provided it has the same UI Command) will start the 'Press + * and Hold' (aka repeat) behavior. By setting REP_DELAY to the + * same value as REP_PERIOD the expected CEC behavior is + * reproduced. + */ + adap->rc->input_dev->rep[REP_DELAY] = + adap->rc->input_dev->rep[REP_PERIOD]; } #endif diff --git a/include/media/cec.h b/include/media/cec.h index d97aa6c32abd..60b26fc18464 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -190,6 +190,11 @@ struct cec_adapter { u32 tx_timeouts; +#ifdef CONFIG_MEDIA_CEC_RC + bool rc_repeating; + int rc_last_scancode; + u64 rc_last_keypress; +#endif #ifdef CONFIG_CEC_NOTIFIER struct cec_notifier *notifier; #endif -- cgit v1.2.3 From 6d741bfed5ed06ed42a16d30f1ed7afdcaf7f092 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Mon, 7 Aug 2017 16:20:58 -0400 Subject: media: rc: rename RC_TYPE_* to RC_PROTO_* and RC_BIT_* to RC_PROTO_BIT_* RC_TYPE is confusing and it's just the protocol. So rename it. 
Suggested-by: Hans Verkuil Signed-off-by: Sean Young Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/hid/hid-picolcd_cir.c | 2 +- drivers/media/cec/cec-adap.c | 2 +- drivers/media/cec/cec-core.c | 4 +- drivers/media/common/siano/smsir.c | 2 +- drivers/media/i2c/ir-kbd-i2c.c | 57 +++--- drivers/media/pci/bt8xx/bttv-input.c | 16 +- drivers/media/pci/cx18/cx18-i2c.c | 4 +- drivers/media/pci/cx23885/cx23885-input.c | 14 +- drivers/media/pci/cx88/cx88-input.c | 28 +-- drivers/media/pci/dm1105/dm1105.c | 2 +- drivers/media/pci/ivtv/ivtv-i2c.c | 14 +- drivers/media/pci/mantis/mantis_input.c | 2 +- drivers/media/pci/saa7134/saa7134-input.c | 46 +++-- drivers/media/pci/smipcie/smipcie-ir.c | 2 +- drivers/media/pci/ttpci/budget-ci.c | 5 +- drivers/media/rc/ati_remote.c | 5 +- drivers/media/rc/ene_ir.c | 2 +- drivers/media/rc/fintek-cir.c | 2 +- drivers/media/rc/gpio-ir-recv.c | 2 +- drivers/media/rc/igorplugusb.c | 9 +- drivers/media/rc/iguanair.c | 2 +- drivers/media/rc/img-ir/img-ir-hw.c | 4 +- drivers/media/rc/img-ir/img-ir-hw.h | 4 +- drivers/media/rc/img-ir/img-ir-jvc.c | 4 +- drivers/media/rc/img-ir/img-ir-nec.c | 20 +- drivers/media/rc/img-ir/img-ir-rc5.c | 4 +- drivers/media/rc/img-ir/img-ir-rc6.c | 4 +- drivers/media/rc/img-ir/img-ir-sanyo.c | 4 +- drivers/media/rc/img-ir/img-ir-sharp.c | 4 +- drivers/media/rc/img-ir/img-ir-sony.c | 27 +-- drivers/media/rc/imon.c | 49 ++--- drivers/media/rc/ir-hix5hd2.c | 2 +- drivers/media/rc/ir-jvc-decoder.c | 6 +- drivers/media/rc/ir-mce_kbd-decoder.c | 6 +- drivers/media/rc/ir-nec-decoder.c | 17 +- drivers/media/rc/ir-rc5-decoder.c | 25 +-- drivers/media/rc/ir-rc6-decoder.c | 30 +-- drivers/media/rc/ir-sanyo-decoder.c | 6 +- drivers/media/rc/ir-sharp-decoder.c | 6 +- drivers/media/rc/ir-sony-decoder.c | 23 +-- drivers/media/rc/ir-xmp-decoder.c | 4 +- drivers/media/rc/ite-cir.c | 2 +- drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c | 8 +- drivers/media/rc/keymaps/rc-alink-dtu-m.c | 8 +- drivers/media/rc/keymaps/rc-anysee.c | 8 +- drivers/media/rc/keymaps/rc-apac-viewcomp.c | 8 +- drivers/media/rc/keymaps/rc-asus-pc39.c | 8 +- drivers/media/rc/keymaps/rc-asus-ps3-100.c | 8 +- drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c | 8 +- drivers/media/rc/keymaps/rc-ati-x10.c | 8 +- drivers/media/rc/keymaps/rc-avermedia-a16d.c | 8 +- drivers/media/rc/keymaps/rc-avermedia-cardbus.c | 8 +- drivers/media/rc/keymaps/rc-avermedia-dvbt.c | 8 +- drivers/media/rc/keymaps/rc-avermedia-m135a.c | 8 +- .../media/rc/keymaps/rc-avermedia-m733a-rm-k6.c | 8 +- drivers/media/rc/keymaps/rc-avermedia-rm-ks.c | 8 +- drivers/media/rc/keymaps/rc-avermedia.c | 8 +- drivers/media/rc/keymaps/rc-avertv-303.c | 8 +- drivers/media/rc/keymaps/rc-azurewave-ad-tu700.c | 8 +- drivers/media/rc/keymaps/rc-behold-columbus.c | 8 +- drivers/media/rc/keymaps/rc-behold.c | 8 +- drivers/media/rc/keymaps/rc-budget-ci-old.c | 8 +- drivers/media/rc/keymaps/rc-cec.c | 2 +- drivers/media/rc/keymaps/rc-cinergy-1400.c | 8 +- drivers/media/rc/keymaps/rc-cinergy.c | 8 +- drivers/media/rc/keymaps/rc-d680-dmb.c | 8 +- drivers/media/rc/keymaps/rc-delock-61959.c | 8 +- drivers/media/rc/keymaps/rc-dib0700-nec.c | 8 +- drivers/media/rc/keymaps/rc-dib0700-rc5.c | 8 +- drivers/media/rc/keymaps/rc-digitalnow-tinytwin.c | 8 +- drivers/media/rc/keymaps/rc-digittrade.c | 8 +- drivers/media/rc/keymaps/rc-dm1105-nec.c | 8 +- drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c | 8 +- drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c | 8 +- drivers/media/rc/keymaps/rc-dtt200u.c | 8 +- 
drivers/media/rc/keymaps/rc-dvbsky.c | 8 +- drivers/media/rc/keymaps/rc-dvico-mce.c | 8 +- drivers/media/rc/keymaps/rc-dvico-portable.c | 8 +- drivers/media/rc/keymaps/rc-em-terratec.c | 8 +- drivers/media/rc/keymaps/rc-encore-enltv-fm53.c | 8 +- drivers/media/rc/keymaps/rc-encore-enltv.c | 8 +- drivers/media/rc/keymaps/rc-encore-enltv2.c | 8 +- drivers/media/rc/keymaps/rc-evga-indtube.c | 8 +- drivers/media/rc/keymaps/rc-eztv.c | 8 +- drivers/media/rc/keymaps/rc-flydvb.c | 8 +- drivers/media/rc/keymaps/rc-flyvideo.c | 8 +- drivers/media/rc/keymaps/rc-fusionhdtv-mce.c | 8 +- drivers/media/rc/keymaps/rc-gadmei-rm008z.c | 8 +- drivers/media/rc/keymaps/rc-geekbox.c | 8 +- drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c | 8 +- drivers/media/rc/keymaps/rc-gotview7135.c | 8 +- drivers/media/rc/keymaps/rc-hauppauge.c | 8 +- drivers/media/rc/keymaps/rc-imon-mce.c | 8 +- drivers/media/rc/keymaps/rc-imon-pad.c | 8 +- drivers/media/rc/keymaps/rc-iodata-bctv7e.c | 8 +- drivers/media/rc/keymaps/rc-it913x-v1.c | 8 +- drivers/media/rc/keymaps/rc-it913x-v2.c | 8 +- drivers/media/rc/keymaps/rc-kaiomy.c | 8 +- drivers/media/rc/keymaps/rc-kworld-315u.c | 8 +- drivers/media/rc/keymaps/rc-kworld-pc150u.c | 8 +- .../media/rc/keymaps/rc-kworld-plus-tv-analog.c | 8 +- drivers/media/rc/keymaps/rc-leadtek-y04g0051.c | 8 +- drivers/media/rc/keymaps/rc-lme2510.c | 8 +- drivers/media/rc/keymaps/rc-manli.c | 8 +- .../media/rc/keymaps/rc-medion-x10-digitainer.c | 8 +- drivers/media/rc/keymaps/rc-medion-x10-or2x.c | 8 +- drivers/media/rc/keymaps/rc-medion-x10.c | 8 +- drivers/media/rc/keymaps/rc-msi-digivox-ii.c | 8 +- drivers/media/rc/keymaps/rc-msi-digivox-iii.c | 8 +- drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c | 8 +- drivers/media/rc/keymaps/rc-msi-tvanywhere.c | 8 +- drivers/media/rc/keymaps/rc-nebula.c | 8 +- .../media/rc/keymaps/rc-nec-terratec-cinergy-xs.c | 8 +- drivers/media/rc/keymaps/rc-norwood.c | 8 +- drivers/media/rc/keymaps/rc-npgtech.c | 8 +- drivers/media/rc/keymaps/rc-pctv-sedna.c | 8 +- drivers/media/rc/keymaps/rc-pinnacle-color.c | 8 +- drivers/media/rc/keymaps/rc-pinnacle-grey.c | 8 +- drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c | 8 +- drivers/media/rc/keymaps/rc-pixelview-002t.c | 8 +- drivers/media/rc/keymaps/rc-pixelview-mk12.c | 8 +- drivers/media/rc/keymaps/rc-pixelview-new.c | 8 +- drivers/media/rc/keymaps/rc-pixelview.c | 8 +- .../media/rc/keymaps/rc-powercolor-real-angel.c | 8 +- drivers/media/rc/keymaps/rc-proteus-2309.c | 8 +- drivers/media/rc/keymaps/rc-purpletv.c | 8 +- drivers/media/rc/keymaps/rc-pv951.c | 8 +- drivers/media/rc/keymaps/rc-rc6-mce.c | 8 +- .../media/rc/keymaps/rc-real-audio-220-32-keys.c | 8 +- drivers/media/rc/keymaps/rc-reddo.c | 8 +- drivers/media/rc/keymaps/rc-snapstream-firefly.c | 8 +- drivers/media/rc/keymaps/rc-streamzap.c | 8 +- drivers/media/rc/keymaps/rc-su3000.c | 8 +- drivers/media/rc/keymaps/rc-tbs-nec.c | 8 +- drivers/media/rc/keymaps/rc-technisat-ts35.c | 8 +- drivers/media/rc/keymaps/rc-technisat-usb2.c | 8 +- .../media/rc/keymaps/rc-terratec-cinergy-c-pci.c | 8 +- .../media/rc/keymaps/rc-terratec-cinergy-s2-hd.c | 8 +- drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c | 8 +- drivers/media/rc/keymaps/rc-terratec-slim-2.c | 8 +- drivers/media/rc/keymaps/rc-terratec-slim.c | 8 +- drivers/media/rc/keymaps/rc-tevii-nec.c | 8 +- drivers/media/rc/keymaps/rc-tivo.c | 8 +- .../media/rc/keymaps/rc-total-media-in-hand-02.c | 8 +- drivers/media/rc/keymaps/rc-total-media-in-hand.c | 8 +- drivers/media/rc/keymaps/rc-trekstor.c | 8 +- 
drivers/media/rc/keymaps/rc-tt-1500.c | 8 +- drivers/media/rc/keymaps/rc-twinhan-dtv-cab-ci.c | 8 +- drivers/media/rc/keymaps/rc-twinhan1027.c | 8 +- drivers/media/rc/keymaps/rc-videomate-m1f.c | 8 +- drivers/media/rc/keymaps/rc-videomate-s350.c | 8 +- drivers/media/rc/keymaps/rc-videomate-tv-pvr.c | 8 +- drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c | 8 +- drivers/media/rc/keymaps/rc-winfast.c | 8 +- drivers/media/rc/keymaps/rc-zx-irdec.c | 2 +- drivers/media/rc/mceusb.c | 2 +- drivers/media/rc/meson-ir.c | 2 +- drivers/media/rc/mtk-cir.c | 2 +- drivers/media/rc/nuvoton-cir.c | 4 +- drivers/media/rc/rc-core-priv.h | 2 +- drivers/media/rc/rc-ir-raw.c | 4 +- drivers/media/rc/rc-loopback.c | 4 +- drivers/media/rc/rc-main.c | 152 ++++++++------- drivers/media/rc/redrat3.c | 2 +- drivers/media/rc/serial_ir.c | 2 +- drivers/media/rc/sir_ir.c | 2 +- drivers/media/rc/st_rc.c | 2 +- drivers/media/rc/streamzap.c | 2 +- drivers/media/rc/sunxi-cir.c | 2 +- drivers/media/rc/ttusbir.c | 2 +- drivers/media/rc/winbond-cir.c | 33 ++-- drivers/media/rc/zx-irdec.c | 9 +- drivers/media/usb/au0828/au0828-input.c | 4 +- drivers/media/usb/cx231xx/cx231xx-input.c | 6 +- drivers/media/usb/dvb-usb-v2/af9015.c | 11 +- drivers/media/usb/dvb-usb-v2/af9035.c | 14 +- drivers/media/usb/dvb-usb-v2/anysee.c | 4 +- drivers/media/usb/dvb-usb-v2/az6007.c | 11 +- drivers/media/usb/dvb-usb-v2/dvb_usb.h | 2 +- drivers/media/usb/dvb-usb-v2/dvbsky.c | 4 +- drivers/media/usb/dvb-usb-v2/lmedm04.c | 6 +- drivers/media/usb/dvb-usb-v2/rtl28xxu.c | 13 +- drivers/media/usb/dvb-usb/cxusb.c | 30 +-- drivers/media/usb/dvb-usb/dib0700.h | 2 +- drivers/media/usb/dvb-usb/dib0700_core.c | 28 +-- drivers/media/usb/dvb-usb/dib0700_devices.c | 152 +++++++-------- drivers/media/usb/dvb-usb/dtt200u.c | 12 +- drivers/media/usb/dvb-usb/dvb-usb.h | 2 +- drivers/media/usb/dvb-usb/dw2102.c | 21 +- drivers/media/usb/dvb-usb/m920x.c | 4 +- drivers/media/usb/dvb-usb/pctv452e.c | 6 +- drivers/media/usb/dvb-usb/technisat-usb2.c | 2 +- drivers/media/usb/dvb-usb/ttusb2.c | 4 +- drivers/media/usb/em28xx/em28xx-input.c | 124 ++++++------ drivers/media/usb/hdpvr/hdpvr-i2c.c | 3 +- drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c | 10 +- drivers/media/usb/tm6000/tm6000-input.c | 38 ++-- include/media/i2c/ir-kbd-i2c.h | 8 +- include/media/rc-core.h | 33 ++-- include/media/rc-map.h | 215 +++++++++++---------- 200 files changed, 1204 insertions(+), 1158 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-picolcd_cir.c b/drivers/hid/hid-picolcd_cir.c index b3b2233f3c65..32747b7f917e 100644 --- a/drivers/hid/hid-picolcd_cir.c +++ b/drivers/hid/hid-picolcd_cir.c @@ -113,7 +113,7 @@ int picolcd_init_cir(struct picolcd_data *data, struct hid_report *report) return -ENOMEM; rdev->priv = data; - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->open = picolcd_cir_open; rdev->close = picolcd_cir_close; rdev->device_name = data->hdev->name; diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c index 31d25e00d011..9f8b64f2b253 100644 --- a/drivers/media/cec/cec-adap.c +++ b/drivers/media/cec/cec-adap.c @@ -1897,7 +1897,7 @@ static int cec_receive_notify(struct cec_adapter *adap, struct cec_msg *msg, } /* Send key press */ - rc_keydown(adap->rc, RC_TYPE_CEC, scancode, 0); + rc_keydown(adap->rc, RC_PROTO_CEC, scancode, 0); /* When in repeating mode, we're done */ if (adap->rc_repeating) diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index 
fcd01577cd1c..e9db90997b0a 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -275,8 +275,8 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, adap->rc->input_id.product = 0; adap->rc->input_id.version = 1; adap->rc->driver_name = CEC_NAME; - adap->rc->allowed_protocols = RC_BIT_CEC; - adap->rc->enabled_protocols = RC_BIT_CEC; + adap->rc->allowed_protocols = RC_PROTO_BIT_CEC; + adap->rc->enabled_protocols = RC_PROTO_BIT_CEC; adap->rc->priv = adap; adap->rc->map_name = RC_MAP_CEC; adap->rc->timeout = MS_TO_NS(100); diff --git a/drivers/media/common/siano/smsir.c b/drivers/media/common/siano/smsir.c index 941c342896cc..e77bb0c95e69 100644 --- a/drivers/media/common/siano/smsir.c +++ b/drivers/media/common/siano/smsir.c @@ -86,7 +86,7 @@ int sms_ir_init(struct smscore_device_t *coredev) #endif dev->priv = coredev; - dev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + dev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; dev->map_name = sms_get_board(board_id)->rc_codes; dev->driver_name = MODULE_NAME; diff --git a/drivers/media/i2c/ir-kbd-i2c.c b/drivers/media/i2c/ir-kbd-i2c.c index af909bf5dd5b..a374e2a0ac3d 100644 --- a/drivers/media/i2c/ir-kbd-i2c.c +++ b/drivers/media/i2c/ir-kbd-i2c.c @@ -59,8 +59,8 @@ module_param(debug, int, 0644); /* debug level (0,1,2) */ /* ----------------------------------------------------------------------- */ -static int get_key_haup_common(struct IR_i2c *ir, enum rc_type *protocol, - u32 *scancode, u8 *ptoggle, int size) +static int get_key_haup_common(struct IR_i2c *ir, enum rc_proto *protocol, + u32 *scancode, u8 *ptoggle, int size) { unsigned char buf[6]; int start, range, toggle, dev, code, ircode, vendor; @@ -99,7 +99,7 @@ static int get_key_haup_common(struct IR_i2c *ir, enum rc_type *protocol, dprintk(1, "ir hauppauge (rc5): s%d r%d t%d dev=%d code=%d\n", start, range, toggle, dev, code); - *protocol = RC_TYPE_RC5; + *protocol = RC_PROTO_RC5; *scancode = RC_SCANCODE_RC5(dev, code); *ptoggle = toggle; @@ -111,13 +111,13 @@ static int get_key_haup_common(struct IR_i2c *ir, enum rc_type *protocol, if (vendor == 0x800f) { *ptoggle = (dev & 0x80) != 0; - *protocol = RC_TYPE_RC6_MCE; + *protocol = RC_PROTO_RC6_MCE; dev &= 0x7f; dprintk(1, "ir hauppauge (rc6-mce): t%d vendor=%d dev=%d code=%d\n", *ptoggle, vendor, dev, code); } else { *ptoggle = 0; - *protocol = RC_TYPE_RC6_6A_32; + *protocol = RC_PROTO_RC6_6A_32; dprintk(1, "ir hauppauge (rc6-6a-32): vendor=%d dev=%d code=%d\n", vendor, dev, code); } @@ -130,13 +130,13 @@ static int get_key_haup_common(struct IR_i2c *ir, enum rc_type *protocol, return 0; } -static int get_key_haup(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_haup(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { return get_key_haup_common(ir, protocol, scancode, toggle, 3); } -static int get_key_haup_xvr(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_haup_xvr(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { int ret; @@ -155,7 +155,7 @@ static int get_key_haup_xvr(struct IR_i2c *ir, enum rc_type *protocol, return get_key_haup_common(ir, protocol, scancode, toggle, 6); } -static int get_key_pixelview(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_pixelview(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char b; @@ -166,13 +166,13 @@ static int get_key_pixelview(struct IR_i2c *ir, enum rc_type *protocol, return -EIO; } - *protocol = RC_TYPE_OTHER; + *protocol = 
RC_PROTO_OTHER; *scancode = b; *toggle = 0; return 1; } -static int get_key_fusionhdtv(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_fusionhdtv(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char buf[4]; @@ -191,13 +191,13 @@ static int get_key_fusionhdtv(struct IR_i2c *ir, enum rc_type *protocol, if(buf[0] != 0x1 || buf[1] != 0xfe) return 0; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = buf[2]; *toggle = 0; return 1; } -static int get_key_knc1(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_knc1(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char b; @@ -221,13 +221,13 @@ static int get_key_knc1(struct IR_i2c *ir, enum rc_type *protocol, /* keep old data */ return 1; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; *toggle = 0; return 1; } -static int get_key_avermedia_cardbus(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_avermedia_cardbus(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char subaddr, key, keygroup; @@ -262,7 +262,7 @@ static int get_key_avermedia_cardbus(struct IR_i2c *ir, enum rc_type *protocol, } key |= (keygroup & 1) << 6; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = key; if (ir->c->addr == 0x41) /* AVerMedia EM78P153 */ *scancode |= keygroup << 8; @@ -274,7 +274,7 @@ static int get_key_avermedia_cardbus(struct IR_i2c *ir, enum rc_type *protocol, static int ir_key_poll(struct IR_i2c *ir) { - enum rc_type protocol; + enum rc_proto protocol; u32 scancode; u8 toggle; int rc; @@ -315,7 +315,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) { char *ir_codes = NULL; const char *name = NULL; - u64 rc_type = RC_BIT_UNKNOWN; + u64 rc_proto = RC_PROTO_BIT_UNKNOWN; struct IR_i2c *ir; struct rc_dev *rc = NULL; struct i2c_adapter *adap = client->adapter; @@ -334,7 +334,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) case 0x64: name = "Pixelview"; ir->get_key = get_key_pixelview; - rc_type = RC_BIT_OTHER; + rc_proto = RC_PROTO_BIT_OTHER; ir_codes = RC_MAP_EMPTY; break; case 0x18: @@ -342,38 +342,39 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) case 0x1a: name = "Hauppauge"; ir->get_key = get_key_haup; - rc_type = RC_BIT_RC5; + rc_proto = RC_PROTO_BIT_RC5; ir_codes = RC_MAP_HAUPPAUGE; break; case 0x30: name = "KNC One"; ir->get_key = get_key_knc1; - rc_type = RC_BIT_OTHER; + rc_proto = RC_PROTO_BIT_OTHER; ir_codes = RC_MAP_EMPTY; break; case 0x6b: name = "FusionHDTV"; ir->get_key = get_key_fusionhdtv; - rc_type = RC_BIT_UNKNOWN; + rc_proto = RC_PROTO_BIT_UNKNOWN; ir_codes = RC_MAP_FUSIONHDTV_MCE; break; case 0x40: name = "AVerMedia Cardbus remote"; ir->get_key = get_key_avermedia_cardbus; - rc_type = RC_BIT_OTHER; + rc_proto = RC_PROTO_BIT_OTHER; ir_codes = RC_MAP_AVERMEDIA_CARDBUS; break; case 0x41: name = "AVerMedia EM78P153"; ir->get_key = get_key_avermedia_cardbus; - rc_type = RC_BIT_OTHER; + rc_proto = RC_PROTO_BIT_OTHER; /* RM-KV remote, seems to be same as RM-K6 */ ir_codes = RC_MAP_AVERMEDIA_M733A_RM_K6; break; case 0x71: name = "Hauppauge/Zilog Z8"; ir->get_key = get_key_haup_xvr; - rc_type = RC_BIT_RC5 | RC_BIT_RC6_MCE | RC_BIT_RC6_6A_32; + rc_proto = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_RC6_6A_32; ir_codes = RC_MAP_HAUPPAUGE; break; } @@ -388,7 +389,7 @@ static int ir_probe(struct i2c_client *client, const struct 
i2c_device_id *id) name = init_data->name; if (init_data->type) - rc_type = init_data->type; + rc_proto = init_data->type; if (init_data->polling_interval) ir->polling_interval = init_data->polling_interval; @@ -431,7 +432,7 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) ir->rc = rc; /* Make sure we are all setup before going on */ - if (!name || !ir->get_key || !rc_type || !ir_codes) { + if (!name || !ir->get_key || !rc_proto || !ir_codes) { dprintk(1, ": Unsupported device at address 0x%02x\n", addr); err = -ENODEV; @@ -458,8 +459,8 @@ static int ir_probe(struct i2c_client *client, const struct i2c_device_id *id) * Initialize the other fields of rc_dev */ rc->map_name = ir->ir_codes; - rc->allowed_protocols = rc_type; - rc->enabled_protocols = rc_type; + rc->allowed_protocols = rc_proto; + rc->enabled_protocols = rc_proto; if (!rc->driver_name) rc->driver_name = MODULE_NAME; diff --git a/drivers/media/pci/bt8xx/bttv-input.c b/drivers/media/pci/bt8xx/bttv-input.c index bb8eda51ee27..73d655d073d6 100644 --- a/drivers/media/pci/bt8xx/bttv-input.c +++ b/drivers/media/pci/bt8xx/bttv-input.c @@ -69,12 +69,13 @@ static void ir_handle_key(struct bttv *btv) if ((ir->mask_keydown && (gpio & ir->mask_keydown)) || (ir->mask_keyup && !(gpio & ir->mask_keyup))) { - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, 0); } else { /* HACK: Probably, ir->mask_keydown is missing for this board */ if (btv->c.type == BTTV_BOARD_WINFAST2000) - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); rc_keyup(ir->dev); } @@ -99,7 +100,7 @@ static void ir_enltv_handle_key(struct bttv *btv) gpio, data, (gpio & ir->mask_keyup) ? " up" : "up/down"); - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, 0); if (keyup) rc_keyup(ir->dev); } else { @@ -113,7 +114,8 @@ static void ir_enltv_handle_key(struct bttv *btv) if (keyup) rc_keyup(ir->dev); else - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); } ir->last_gpio = data | keyup; @@ -235,7 +237,7 @@ static void bttv_rc5_timer_end(unsigned long data) } scancode = RC_SCANCODE_RC5(system, command); - rc_keydown(ir->dev, RC_TYPE_RC5, scancode, toggle); + rc_keydown(ir->dev, RC_PROTO_RC5, scancode, toggle); dprintk("scancode %x, toggle %x\n", scancode, toggle); } @@ -327,7 +329,7 @@ static void bttv_ir_stop(struct bttv *btv) * Get_key functions used by I2C remotes */ -static int get_key_pv951(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_pv951(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char b; @@ -355,7 +357,7 @@ static int get_key_pv951(struct IR_i2c *ir, enum rc_type *protocol, * the device is bound to the vendor-provided RC. 
*/ - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; *toggle = 0; return 1; diff --git a/drivers/media/pci/cx18/cx18-i2c.c b/drivers/media/pci/cx18/cx18-i2c.c index eabdd4c5520a..b89fbcbfb491 100644 --- a/drivers/media/pci/cx18/cx18-i2c.c +++ b/drivers/media/pci/cx18/cx18-i2c.c @@ -93,8 +93,8 @@ static int cx18_i2c_new_ir(struct cx18 *cx, struct i2c_adapter *adap, u32 hw, case CX18_HW_Z8F0811_IR_RX_HAUP: init_data->ir_codes = RC_MAP_HAUPPAUGE; init_data->internal_get_key_func = IR_KBD_GET_KEY_HAUP_XVR; - init_data->type = RC_BIT_RC5 | RC_BIT_RC6_MCE | - RC_BIT_RC6_6A_32; + init_data->type = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_RC6_6A_32; init_data->name = cx->card_name; info.platform_data = init_data; break; diff --git a/drivers/media/pci/cx23885/cx23885-input.c b/drivers/media/pci/cx23885/cx23885-input.c index c9b8e3f4d9fb..944b70831f12 100644 --- a/drivers/media/pci/cx23885/cx23885-input.c +++ b/drivers/media/pci/cx23885/cx23885-input.c @@ -284,32 +284,32 @@ int cx23885_input_init(struct cx23885_dev *dev) case CX23885_BOARD_HAUPPAUGE_HVR1290: case CX23885_BOARD_HAUPPAUGE_HVR1250: /* Integrated CX2388[58] IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; /* The grey Hauppauge RC-5 remote */ rc_map = RC_MAP_HAUPPAUGE; break; case CX23885_BOARD_TERRATEC_CINERGY_T_PCIE_DUAL: /* Integrated CX23885 IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; /* The grey Terratec remote with orange buttons */ rc_map = RC_MAP_NEC_TERRATEC_CINERGY_XS; break; case CX23885_BOARD_TEVII_S470: /* Integrated CX23885 IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; /* A guess at the remote */ rc_map = RC_MAP_TEVII_NEC; break; case CX23885_BOARD_MYGICA_X8507: /* Integrated CX23885 IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; /* A guess at the remote */ rc_map = RC_MAP_TOTAL_MEDIA_IN_HAND_02; break; case CX23885_BOARD_TBS_6980: case CX23885_BOARD_TBS_6981: /* Integrated CX23885 IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; /* A guess at the remote */ rc_map = RC_MAP_TBS_NEC; break; @@ -320,12 +320,12 @@ int cx23885_input_init(struct cx23885_dev *dev) case CX23885_BOARD_DVBSKY_S952: case CX23885_BOARD_DVBSKY_T982: /* Integrated CX23885 IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; rc_map = RC_MAP_DVBSKY; break; case CX23885_BOARD_TT_CT2_4500_CI: /* Integrated CX23885 IR controller */ - allowed_protos = RC_BIT_ALL_IR_DECODER; + allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; rc_map = RC_MAP_TT_1500; break; default: diff --git a/drivers/media/pci/cx88/cx88-input.c b/drivers/media/pci/cx88/cx88-input.c index a5dbee776455..e02449bf2041 100644 --- a/drivers/media/pci/cx88/cx88-input.c +++ b/drivers/media/pci/cx88/cx88-input.c @@ -132,7 +132,7 @@ static void cx88_ir_handle_key(struct cx88_IR *ir) data = (data << 4) | ((gpio_key & 0xf0) >> 4); - rc_keydown(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown(ir->dev, RC_PROTO_UNKNOWN, data, 0); } else if (ir->core->boardnr == CX88_BOARD_PROLINK_PLAYTVPVR || ir->core->boardnr == CX88_BOARD_PIXELVIEW_PLAYTV_ULTRA_PRO) { @@ -146,7 +146,7 @@ static void cx88_ir_handle_key(struct cx88_IR *ir) scancode = RC_SCANCODE_NECX(addr, cmd); if (0 == (gpio & ir->mask_keyup)) - 
rc_keydown_notimeout(ir->dev, RC_TYPE_NECX, scancode, + rc_keydown_notimeout(ir->dev, RC_PROTO_NECX, scancode, 0); else rc_keyup(ir->dev); @@ -154,20 +154,22 @@ static void cx88_ir_handle_key(struct cx88_IR *ir) } else if (ir->mask_keydown) { /* bit set on keydown */ if (gpio & ir->mask_keydown) - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); else rc_keyup(ir->dev); } else if (ir->mask_keyup) { /* bit cleared on keydown */ if (0 == (gpio & ir->mask_keyup)) - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); else rc_keyup(ir->dev); } else { /* can't distinguish keydown/up :-/ */ - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, 0); rc_keyup(ir->dev); } } @@ -267,7 +269,7 @@ int cx88_ir_init(struct cx88_core *core, struct pci_dev *pci) struct cx88_IR *ir; struct rc_dev *dev; char *ir_codes = NULL; - u64 rc_type = RC_BIT_OTHER; + u64 rc_proto = RC_PROTO_BIT_OTHER; int err = -ENOMEM; u32 hardware_mask = 0; /* For devices with a hardware mask, when * used with a full-code IR table @@ -348,7 +350,7 @@ int cx88_ir_init(struct cx88_core *core, struct pci_dev *pci) * 002-T mini RC, provided with newer PV hardware */ ir_codes = RC_MAP_PIXELVIEW_MK12; - rc_type = RC_BIT_NECX; + rc_proto = RC_PROTO_BIT_NECX; ir->gpio_addr = MO_GP1_IO; ir->mask_keyup = 0x80; ir->polling = 10; /* ms */ @@ -487,7 +489,7 @@ int cx88_ir_init(struct cx88_core *core, struct pci_dev *pci) dev->timeout = 10 * 1000 * 1000; /* 10 ms */ } else { dev->driver_type = RC_DRIVER_SCANCODE; - dev->allowed_protocols = rc_type; + dev->allowed_protocols = rc_proto; } ir->core = core; @@ -557,7 +559,7 @@ void cx88_ir_irq(struct cx88_core *core) ir_raw_event_handle(ir->dev); } -static int get_key_pvr2000(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_pvr2000(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { int flags, code; @@ -582,7 +584,7 @@ static int get_key_pvr2000(struct IR_i2c *ir, enum rc_type *protocol, dprintk("IR Key/Flags: (0x%02x/0x%02x)\n", code & 0xff, flags & 0xff); - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = code & 0xff; *toggle = 0; return 1; @@ -612,7 +614,7 @@ void cx88_i2c_init_ir(struct cx88_core *core) case CX88_BOARD_LEADTEK_PVR2000: addr_list = pvr2000_addr_list; core->init_data.name = "cx88 Leadtek PVR 2000 remote"; - core->init_data.type = RC_BIT_UNKNOWN; + core->init_data.type = RC_PROTO_BIT_UNKNOWN; core->init_data.get_key = get_key_pvr2000; core->init_data.ir_codes = RC_MAP_EMPTY; break; @@ -633,8 +635,8 @@ void cx88_i2c_init_ir(struct cx88_core *core) /* Hauppauge XVR */ core->init_data.name = "cx88 Hauppauge XVR remote"; core->init_data.ir_codes = RC_MAP_HAUPPAUGE; - core->init_data.type = RC_BIT_RC5 | RC_BIT_RC6_MCE | - RC_BIT_RC6_6A_32; + core->init_data.type = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_RC6_6A_32; core->init_data.internal_get_key_func = IR_KBD_GET_KEY_HAUP_XVR; info.platform_data = &core->init_data; diff --git a/drivers/media/pci/dm1105/dm1105.c b/drivers/media/pci/dm1105/dm1105.c index 0bc618f36385..7c3900dec368 100644 --- a/drivers/media/pci/dm1105/dm1105.c +++ b/drivers/media/pci/dm1105/dm1105.c @@ -675,7 +675,7 @@ static void dm1105_emit_key(struct work_struct *work) data = (ircom >> 8) & 0x7f; /* FIXME: UNKNOWN because we don't generate a full NEC scancode (yet?) 
*/ - rc_keydown(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown(ir->dev, RC_PROTO_UNKNOWN, data, 0); } /* work handler */ diff --git a/drivers/media/pci/ivtv/ivtv-i2c.c b/drivers/media/pci/ivtv/ivtv-i2c.c index dea80efd5836..69b4fa6f0362 100644 --- a/drivers/media/pci/ivtv/ivtv-i2c.c +++ b/drivers/media/pci/ivtv/ivtv-i2c.c @@ -148,7 +148,7 @@ static const char * const hw_devicenames[] = { "ir_video", /* IVTV_HW_I2C_IR_RX_ADAPTEC */ }; -static int get_key_adaptec(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_adaptec(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char keybuf[4]; @@ -168,7 +168,7 @@ static int get_key_adaptec(struct IR_i2c *ir, enum rc_type *protocol, keybuf[2] &= 0x7f; keybuf[3] |= 0x80; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = keybuf[3] | keybuf[2] << 8 | keybuf[1] << 16 |keybuf[0] << 24; *toggle = 0; return 1; @@ -201,22 +201,22 @@ static int ivtv_i2c_new_ir(struct ivtv *itv, u32 hw, const char *type, u8 addr) init_data->ir_codes = RC_MAP_AVERMEDIA_CARDBUS; init_data->internal_get_key_func = IR_KBD_GET_KEY_AVERMEDIA_CARDBUS; - init_data->type = RC_BIT_OTHER; + init_data->type = RC_PROTO_BIT_OTHER; init_data->name = "AVerMedia AVerTV card"; break; case IVTV_HW_I2C_IR_RX_HAUP_EXT: case IVTV_HW_I2C_IR_RX_HAUP_INT: init_data->ir_codes = RC_MAP_HAUPPAUGE; init_data->internal_get_key_func = IR_KBD_GET_KEY_HAUP; - init_data->type = RC_BIT_RC5; + init_data->type = RC_PROTO_BIT_RC5; init_data->name = itv->card_name; break; case IVTV_HW_Z8F0811_IR_RX_HAUP: /* Default to grey remote */ init_data->ir_codes = RC_MAP_HAUPPAUGE; init_data->internal_get_key_func = IR_KBD_GET_KEY_HAUP_XVR; - init_data->type = RC_BIT_RC5 | RC_BIT_RC6_MCE | - RC_BIT_RC6_6A_32; + init_data->type = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_RC6_6A_32; init_data->name = itv->card_name; break; case IVTV_HW_I2C_IR_RX_ADAPTEC: @@ -224,7 +224,7 @@ static int ivtv_i2c_new_ir(struct ivtv *itv, u32 hw, const char *type, u8 addr) init_data->name = itv->card_name; /* FIXME: The protocol and RC_MAP needs to be corrected */ init_data->ir_codes = RC_MAP_EMPTY; - init_data->type = RC_BIT_UNKNOWN; + init_data->type = RC_PROTO_BIT_UNKNOWN; break; } diff --git a/drivers/media/pci/mantis/mantis_input.c b/drivers/media/pci/mantis/mantis_input.c index 001b7f827699..7519dcc934dd 100644 --- a/drivers/media/pci/mantis/mantis_input.c +++ b/drivers/media/pci/mantis/mantis_input.c @@ -31,7 +31,7 @@ void mantis_input_process(struct mantis_pci *mantis, int scancode) { if (mantis->rc) - rc_keydown(mantis->rc, RC_TYPE_UNKNOWN, scancode, 0); + rc_keydown(mantis->rc, RC_PROTO_UNKNOWN, scancode, 0); } int mantis_input_init(struct mantis_pci *mantis) diff --git a/drivers/media/pci/saa7134/saa7134-input.c b/drivers/media/pci/saa7134/saa7134-input.c index a14b86d88afb..9337e4615519 100644 --- a/drivers/media/pci/saa7134/saa7134-input.c +++ b/drivers/media/pci/saa7134/saa7134-input.c @@ -83,14 +83,16 @@ static int build_key(struct saa7134_dev *dev) if (data == ir->mask_keycode) rc_keyup(ir->dev); else - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); return 0; } if (ir->polling) { if ((ir->mask_keydown && (0 != (gpio & ir->mask_keydown))) || (ir->mask_keyup && (0 == (gpio & ir->mask_keyup)))) { - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); } else { rc_keyup(ir->dev); } @@ -98,7 +100,8 @@ static int 
build_key(struct saa7134_dev *dev) else { /* IRQ driven mode - handle key press and release in one go */ if ((ir->mask_keydown && (0 != (gpio & ir->mask_keydown))) || (ir->mask_keyup && (0 == (gpio & ir->mask_keyup)))) { - rc_keydown_notimeout(ir->dev, RC_TYPE_UNKNOWN, data, 0); + rc_keydown_notimeout(ir->dev, RC_PROTO_UNKNOWN, data, + 0); rc_keyup(ir->dev); } } @@ -108,7 +111,7 @@ static int build_key(struct saa7134_dev *dev) /* --------------------- Chip specific I2C key builders ----------------- */ -static int get_key_flydvb_trio(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_flydvb_trio(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { int gpio; @@ -154,13 +157,14 @@ static int get_key_flydvb_trio(struct IR_i2c *ir, enum rc_type *protocol, return -EIO; } - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; *toggle = 0; return 1; } -static int get_key_msi_tvanywhere_plus(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_msi_tvanywhere_plus(struct IR_i2c *ir, + enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char b; @@ -201,14 +205,14 @@ static int get_key_msi_tvanywhere_plus(struct IR_i2c *ir, enum rc_type *protocol /* Button pressed */ input_dbg("get_key_msi_tvanywhere_plus: Key = 0x%02X\n", b); - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; *toggle = 0; return 1; } /* copied and modified from get_key_msi_tvanywhere_plus() */ -static int get_key_kworld_pc150u(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_kworld_pc150u(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char b; @@ -249,13 +253,13 @@ static int get_key_kworld_pc150u(struct IR_i2c *ir, enum rc_type *protocol, /* Button pressed */ input_dbg("get_key_kworld_pc150u: Key = 0x%02X\n", b); - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; *toggle = 0; return 1; } -static int get_key_purpletv(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_purpletv(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char b; @@ -274,13 +278,13 @@ static int get_key_purpletv(struct IR_i2c *ir, enum rc_type *protocol, if (b & 0x80) return 1; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; *toggle = 0; return 1; } -static int get_key_hvr1110(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_hvr1110(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char buf[5]; @@ -304,14 +308,14 @@ static int get_key_hvr1110(struct IR_i2c *ir, enum rc_type *protocol, * * FIXME: start bits could maybe be used...? 
*/ - *protocol = RC_TYPE_RC5; + *protocol = RC_PROTO_RC5; *scancode = RC_SCANCODE_RC5(buf[3] & 0x1f, buf[4] >> 2); *toggle = !!(buf[3] & 0x40); return 1; } -static int get_key_beholdm6xx(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_beholdm6xx(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { unsigned char data[12]; @@ -338,7 +342,7 @@ static int get_key_beholdm6xx(struct IR_i2c *ir, enum rc_type *protocol, if (data[9] != (unsigned char)(~data[8])) return 0; - *protocol = RC_TYPE_NECX; + *protocol = RC_PROTO_NECX; *scancode = RC_SCANCODE_NECX(data[11] << 8 | data[10], data[9]); *toggle = 0; return 1; @@ -347,7 +351,7 @@ static int get_key_beholdm6xx(struct IR_i2c *ir, enum rc_type *protocol, /* Common (grey or coloured) pinnacle PCTV remote handling * */ -static int get_key_pinnacle(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_pinnacle(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle, int parity_offset, int marker, int code_modulo) { @@ -384,7 +388,7 @@ static int get_key_pinnacle(struct IR_i2c *ir, enum rc_type *protocol, code %= code_modulo; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = code; *toggle = 0; @@ -401,7 +405,7 @@ static int get_key_pinnacle(struct IR_i2c *ir, enum rc_type *protocol, * * Sylvain Pasche */ -static int get_key_pinnacle_grey(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_pinnacle_grey(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { @@ -413,7 +417,7 @@ static int get_key_pinnacle_grey(struct IR_i2c *ir, enum rc_type *protocol, * * Ricardo Cerqueira */ -static int get_key_pinnacle_color(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_pinnacle_color(struct IR_i2c *ir, enum rc_proto *protocol, u32 *scancode, u8 *toggle) { /* code_modulo parameter (0x88) is used to reduce code value to fit inside IR_KEYTAB_SIZE @@ -858,7 +862,7 @@ int saa7134_input_init1(struct saa7134_dev *dev) rc->close = saa7134_ir_close; if (raw_decode) { rc->driver_type = RC_DRIVER_IR_RAW; - rc->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; } rc->device_name = ir->name; @@ -1022,7 +1026,7 @@ void saa7134_probe_i2c_ir(struct saa7134_dev *dev) dev->init_data.name = "BeholdTV"; dev->init_data.get_key = get_key_beholdm6xx; dev->init_data.ir_codes = RC_MAP_BEHOLD; - dev->init_data.type = RC_BIT_NECX; + dev->init_data.type = RC_PROTO_BIT_NECX; info.addr = 0x2d; break; case SAA7134_BOARD_AVERMEDIA_CARDBUS_501: diff --git a/drivers/media/pci/smipcie/smipcie-ir.c b/drivers/media/pci/smipcie/smipcie-ir.c index fc3375720a35..c5595af6b976 100644 --- a/drivers/media/pci/smipcie/smipcie-ir.c +++ b/drivers/media/pci/smipcie/smipcie-ir.c @@ -144,7 +144,7 @@ static void smi_ir_decode(struct work_struct *work) rc5_system = (dwIRCode & 0x7C0) >> 6; toggle = (dwIRCode & 0x800) ? 
1 : 0; scancode = rc5_system << 8 | rc5_command; - rc_keydown(rc_dev, RC_TYPE_RC5, scancode, toggle); + rc_keydown(rc_dev, RC_PROTO_RC5, scancode, toggle); } } end_ir_decode: diff --git a/drivers/media/pci/ttpci/budget-ci.c b/drivers/media/pci/ttpci/budget-ci.c index cf185c83cc9e..57af11804fd6 100644 --- a/drivers/media/pci/ttpci/budget-ci.c +++ b/drivers/media/pci/ttpci/budget-ci.c @@ -158,14 +158,15 @@ static void msp430_ir_interrupt(unsigned long data) return; if (budget_ci->ir.full_rc5) { - rc_keydown(dev, RC_TYPE_RC5, + rc_keydown(dev, RC_PROTO_RC5, RC_SCANCODE_RC5(budget_ci->ir.rc5_device, budget_ci->ir.ir_key), !!(command & 0x20)); return; } /* FIXME: We should generate complete scancodes for all devices */ - rc_keydown(dev, RC_TYPE_UNKNOWN, budget_ci->ir.ir_key, !!(command & 0x20)); + rc_keydown(dev, RC_PROTO_UNKNOWN, budget_ci->ir.ir_key, + !!(command & 0x20)); } static int msp430_ir_init(struct budget_ci *budget_ci) diff --git a/drivers/media/rc/ati_remote.c b/drivers/media/rc/ati_remote.c index a7f76002b30a..d0871d60a723 100644 --- a/drivers/media/rc/ati_remote.c +++ b/drivers/media/rc/ati_remote.c @@ -622,7 +622,8 @@ static void ati_remote_input_report(struct urb *urb) * it would cause ghost repeats which would be a * regression for this driver. */ - rc_keydown_notimeout(ati_remote->rdev, RC_TYPE_OTHER, + rc_keydown_notimeout(ati_remote->rdev, + RC_PROTO_OTHER, scancode, data[2]); rc_keyup(ati_remote->rdev); } @@ -760,7 +761,7 @@ static void ati_remote_rc_init(struct ati_remote *ati_remote) struct rc_dev *rdev = ati_remote->rdev; rdev->priv = ati_remote; - rdev->allowed_protocols = RC_BIT_OTHER; + rdev->allowed_protocols = RC_PROTO_BIT_OTHER; rdev->driver_name = "ati_remote"; rdev->open = ati_remote_rc_open; diff --git a/drivers/media/rc/ene_ir.c b/drivers/media/rc/ene_ir.c index 41f6b1c52407..af7ba23e16e1 100644 --- a/drivers/media/rc/ene_ir.c +++ b/drivers/media/rc/ene_ir.c @@ -1053,7 +1053,7 @@ static int ene_probe(struct pnp_dev *pnp_dev, const struct pnp_device_id *id) if (!dev->hw_learning_and_tx_capable) learning_mode_force = false; - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->priv = dev; rdev->open = ene_open; rdev->close = ene_close; diff --git a/drivers/media/rc/fintek-cir.c b/drivers/media/rc/fintek-cir.c index 57155e4c9f38..f2639d0c2fca 100644 --- a/drivers/media/rc/fintek-cir.c +++ b/drivers/media/rc/fintek-cir.c @@ -529,7 +529,7 @@ static int fintek_probe(struct pnp_dev *pdev, const struct pnp_device_id *dev_id /* Set up the rc device */ rdev->priv = fintek; - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->open = fintek_open; rdev->close = fintek_close; rdev->device_name = FINTEK_DESCRIPTION; diff --git a/drivers/media/rc/gpio-ir-recv.c b/drivers/media/rc/gpio-ir-recv.c index 24c7ac8f1b82..7248b3662285 100644 --- a/drivers/media/rc/gpio-ir-recv.c +++ b/drivers/media/rc/gpio-ir-recv.c @@ -143,7 +143,7 @@ static int gpio_ir_recv_probe(struct platform_device *pdev) if (pdata->allowed_protos) rcdev->allowed_protocols = pdata->allowed_protos; else - rcdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rcdev->map_name = pdata->map_name ?: RC_MAP_EMPTY; gpio_dev->rcdev = rcdev; diff --git a/drivers/media/rc/igorplugusb.c b/drivers/media/rc/igorplugusb.c index 5babc6371df4..a5ea86be8f44 100644 --- a/drivers/media/rc/igorplugusb.c +++ b/drivers/media/rc/igorplugusb.c @@ -202,10 +202,11 
@@ static int igorplugusb_probe(struct usb_interface *intf, * This device can only store 36 pulses + spaces, which is not enough * for the NEC protocol and many others. */ - rc->allowed_protocols = RC_BIT_ALL_IR_DECODER & ~(RC_BIT_NEC | - RC_BIT_NECX | RC_BIT_NEC32 | RC_BIT_RC6_6A_20 | - RC_BIT_RC6_6A_24 | RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | - RC_BIT_SONY20 | RC_BIT_SANYO); + rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER & + ~(RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32 | + RC_PROTO_BIT_RC6_6A_20 | RC_PROTO_BIT_RC6_6A_24 | + RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_SONY20 | RC_PROTO_BIT_SANYO); rc->priv = ir; rc->driver_name = DRIVER_NAME; diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c index 4357dd36d7b9..30e24da67226 100644 --- a/drivers/media/rc/iguanair.c +++ b/drivers/media/rc/iguanair.c @@ -491,7 +491,7 @@ static int iguanair_probe(struct usb_interface *intf, rc->input_phys = ir->phys; usb_to_input_id(ir->udev, &rc->input_id); rc->dev.parent = &intf->dev; - rc->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->priv = ir; rc->open = iguanair_open; rc->close = iguanair_close; diff --git a/drivers/media/rc/img-ir/img-ir-hw.c b/drivers/media/rc/img-ir/img-ir-hw.c index dd46973e0cbf..82fdf4cc0824 100644 --- a/drivers/media/rc/img-ir/img-ir-hw.c +++ b/drivers/media/rc/img-ir/img-ir-hw.c @@ -589,7 +589,7 @@ static void img_ir_set_decoder(struct img_ir_priv *priv, /* clear the wakeup scancode filter */ rdev->scancode_wakeup_filter.data = 0; rdev->scancode_wakeup_filter.mask = 0; - rdev->wakeup_protocol = RC_TYPE_UNKNOWN; + rdev->wakeup_protocol = RC_PROTO_UNKNOWN; /* clear raw filters */ _img_ir_set_filter(priv, NULL); @@ -823,7 +823,7 @@ static void img_ir_handle_data(struct img_ir_priv *priv, u32 len, u64 raw) int ret = IMG_IR_SCANCODE; struct img_ir_scancode_req request; - request.protocol = RC_TYPE_UNKNOWN; + request.protocol = RC_PROTO_UNKNOWN; request.toggle = 0; if (dec->scancode) diff --git a/drivers/media/rc/img-ir/img-ir-hw.h b/drivers/media/rc/img-ir/img-ir-hw.h index 91a297731661..58b68dd6c67d 100644 --- a/drivers/media/rc/img-ir/img-ir-hw.h +++ b/drivers/media/rc/img-ir/img-ir-hw.h @@ -135,13 +135,13 @@ struct img_ir_timing_regvals { /** * struct img_ir_scancode_req - Scancode request data. * @protocol: Protocol code of received message (defaults to - * RC_TYPE_UNKNOWN). + * RC_PROTO_UNKNOWN). * @scancode: Scan code of received message (must be written by * handler if IMG_IR_SCANCODE is returned). * @toggle: Toggle bit (defaults to 0). 
*/ struct img_ir_scancode_req { - enum rc_type protocol; + enum rc_proto protocol; u32 scancode; u8 toggle; }; diff --git a/drivers/media/rc/img-ir/img-ir-jvc.c b/drivers/media/rc/img-ir/img-ir-jvc.c index d3e2fc0bcfe1..4b07c76fbe1b 100644 --- a/drivers/media/rc/img-ir/img-ir-jvc.c +++ b/drivers/media/rc/img-ir/img-ir-jvc.c @@ -23,7 +23,7 @@ static int img_ir_jvc_scancode(int len, u64 raw, u64 enabled_protocols, cust = (raw >> 0) & 0xff; data = (raw >> 8) & 0xff; - request->protocol = RC_TYPE_JVC; + request->protocol = RC_PROTO_JVC; request->scancode = cust << 8 | data; return IMG_IR_SCANCODE; } @@ -52,7 +52,7 @@ static int img_ir_jvc_filter(const struct rc_scancode_filter *in, * http://support.jvc.com/consumer/support/documents/RemoteCodes.pdf */ struct img_ir_decoder img_ir_jvc = { - .type = RC_BIT_JVC, + .type = RC_PROTO_BIT_JVC, .control = { .decoden = 1, .code_type = IMG_IR_CODETYPE_PULSEDIST, diff --git a/drivers/media/rc/img-ir/img-ir-nec.c b/drivers/media/rc/img-ir/img-ir-nec.c index 044fd42b22a0..2fc0678ad2d7 100644 --- a/drivers/media/rc/img-ir/img-ir-nec.c +++ b/drivers/media/rc/img-ir/img-ir-nec.c @@ -35,20 +35,20 @@ static int img_ir_nec_scancode(int len, u64 raw, u64 enabled_protocols, bitrev8(addr_inv) << 16 | bitrev8(data) << 8 | bitrev8(data_inv); - request->protocol = RC_TYPE_NEC32; + request->protocol = RC_PROTO_NEC32; } else if ((addr_inv ^ addr) != 0xff) { /* Extended NEC */ /* scan encoding: AAaaDD */ request->scancode = addr << 16 | addr_inv << 8 | data; - request->protocol = RC_TYPE_NECX; + request->protocol = RC_PROTO_NECX; } else { /* Normal NEC */ /* scan encoding: AADD */ request->scancode = addr << 8 | data; - request->protocol = RC_TYPE_NEC; + request->protocol = RC_PROTO_NEC; } return IMG_IR_SCANCODE; } @@ -63,7 +63,7 @@ static int img_ir_nec_filter(const struct rc_scancode_filter *in, data = in->data & 0xff; data_m = in->mask & 0xff; - protocols &= RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32; + protocols &= RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32; /* * If only one bit is set, we were requested to do an exact @@ -72,14 +72,14 @@ static int img_ir_nec_filter(const struct rc_scancode_filter *in, */ if (!is_power_of_2(protocols)) { if ((in->data | in->mask) & 0xff000000) - protocols = RC_BIT_NEC32; + protocols = RC_PROTO_BIT_NEC32; else if ((in->data | in->mask) & 0x00ff0000) - protocols = RC_BIT_NECX; + protocols = RC_PROTO_BIT_NECX; else - protocols = RC_BIT_NEC; + protocols = RC_PROTO_BIT_NEC; } - if (protocols == RC_BIT_NEC32) { + if (protocols == RC_PROTO_BIT_NEC32) { /* 32-bit NEC (used by Apple and TiVo remotes) */ /* scan encoding: as transmitted, MSBit = first received bit */ addr = bitrev8(in->data >> 24); @@ -90,7 +90,7 @@ static int img_ir_nec_filter(const struct rc_scancode_filter *in, data_m = bitrev8(in->mask >> 8); data_inv = bitrev8(in->data >> 0); data_inv_m = bitrev8(in->mask >> 0); - } else if (protocols == RC_BIT_NECX) { + } else if (protocols == RC_PROTO_BIT_NECX) { /* Extended NEC */ /* scan encoding AAaaDD */ addr = (in->data >> 16) & 0xff; @@ -128,7 +128,7 @@ static int img_ir_nec_filter(const struct rc_scancode_filter *in, * http://wiki.altium.com/display/ADOH/NEC+Infrared+Transmission+Protocol */ struct img_ir_decoder img_ir_nec = { - .type = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32, + .type = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32, .control = { .decoden = 1, .code_type = IMG_IR_CODETYPE_PULSEDIST, diff --git a/drivers/media/rc/img-ir/img-ir-rc5.c b/drivers/media/rc/img-ir/img-ir-rc5.c index 
a8a28a377eee..a1bc8705472b 100644 --- a/drivers/media/rc/img-ir/img-ir-rc5.c +++ b/drivers/media/rc/img-ir/img-ir-rc5.c @@ -33,7 +33,7 @@ static int img_ir_rc5_scancode(int len, u64 raw, u64 enabled_protocols, if (!start) return -EINVAL; - request->protocol = RC_TYPE_RC5; + request->protocol = RC_PROTO_RC5; request->scancode = addr << 8 | cmd; request->toggle = tgl; return IMG_IR_SCANCODE; @@ -52,7 +52,7 @@ static int img_ir_rc5_filter(const struct rc_scancode_filter *in, * see http://www.sbprojects.com/knowledge/ir/rc5.php */ struct img_ir_decoder img_ir_rc5 = { - .type = RC_BIT_RC5, + .type = RC_PROTO_BIT_RC5, .control = { .bitoriend2 = 1, .code_type = IMG_IR_CODETYPE_BIPHASE, diff --git a/drivers/media/rc/img-ir/img-ir-rc6.c b/drivers/media/rc/img-ir/img-ir-rc6.c index de1e27534968..5f34f59ca257 100644 --- a/drivers/media/rc/img-ir/img-ir-rc6.c +++ b/drivers/media/rc/img-ir/img-ir-rc6.c @@ -54,7 +54,7 @@ static int img_ir_rc6_scancode(int len, u64 raw, u64 enabled_protocols, if (mode) return -EINVAL; - request->protocol = RC_TYPE_RC6_0; + request->protocol = RC_PROTO_RC6_0; request->scancode = addr << 8 | cmd; request->toggle = trl2; return IMG_IR_SCANCODE; @@ -73,7 +73,7 @@ static int img_ir_rc6_filter(const struct rc_scancode_filter *in, * see http://www.sbprojects.com/knowledge/ir/rc6.php */ struct img_ir_decoder img_ir_rc6 = { - .type = RC_BIT_RC6_0, + .type = RC_PROTO_BIT_RC6_0, .control = { .bitorien = 1, .code_type = IMG_IR_CODETYPE_BIPHASE, diff --git a/drivers/media/rc/img-ir/img-ir-sanyo.c b/drivers/media/rc/img-ir/img-ir-sanyo.c index f394994ffc22..55a755bb437c 100644 --- a/drivers/media/rc/img-ir/img-ir-sanyo.c +++ b/drivers/media/rc/img-ir/img-ir-sanyo.c @@ -44,7 +44,7 @@ static int img_ir_sanyo_scancode(int len, u64 raw, u64 enabled_protocols, return -EINVAL; /* Normal Sanyo */ - request->protocol = RC_TYPE_SANYO; + request->protocol = RC_PROTO_SANYO; request->scancode = addr << 8 | data; return IMG_IR_SCANCODE; } @@ -80,7 +80,7 @@ static int img_ir_sanyo_filter(const struct rc_scancode_filter *in, /* Sanyo decoder */ struct img_ir_decoder img_ir_sanyo = { - .type = RC_BIT_SANYO, + .type = RC_PROTO_BIT_SANYO, .control = { .decoden = 1, .code_type = IMG_IR_CODETYPE_PULSEDIST, diff --git a/drivers/media/rc/img-ir/img-ir-sharp.c b/drivers/media/rc/img-ir/img-ir-sharp.c index fe5acc4f030e..2d2530902cfa 100644 --- a/drivers/media/rc/img-ir/img-ir-sharp.c +++ b/drivers/media/rc/img-ir/img-ir-sharp.c @@ -32,7 +32,7 @@ static int img_ir_sharp_scancode(int len, u64 raw, u64 enabled_protocols, /* probably the second half of the message */ return -EINVAL; - request->protocol = RC_TYPE_SHARP; + request->protocol = RC_PROTO_SHARP; request->scancode = addr << 8 | cmd; return IMG_IR_SCANCODE; } @@ -73,7 +73,7 @@ static int img_ir_sharp_filter(const struct rc_scancode_filter *in, * See also http://www.sbprojects.com/knowledge/ir/sharp.php */ struct img_ir_decoder img_ir_sharp = { - .type = RC_BIT_SHARP, + .type = RC_PROTO_BIT_SHARP, .control = { .decoden = 0, .decodend2 = 1, diff --git a/drivers/media/rc/img-ir/img-ir-sony.c b/drivers/media/rc/img-ir/img-ir-sony.c index 3fcba271a419..a942d0be908c 100644 --- a/drivers/media/rc/img-ir/img-ir-sony.c +++ b/drivers/media/rc/img-ir/img-ir-sony.c @@ -19,32 +19,32 @@ static int img_ir_sony_scancode(int len, u64 raw, u64 enabled_protocols, switch (len) { case 12: - if (!(enabled_protocols & RC_BIT_SONY12)) + if (!(enabled_protocols & RC_PROTO_BIT_SONY12)) return -EINVAL; func = raw & 0x7f; /* first 7 bits */ raw >>= 7; dev = raw & 0x1f; /* next 5 
bits */ subdev = 0; - request->protocol = RC_TYPE_SONY12; + request->protocol = RC_PROTO_SONY12; break; case 15: - if (!(enabled_protocols & RC_BIT_SONY15)) + if (!(enabled_protocols & RC_PROTO_BIT_SONY15)) return -EINVAL; func = raw & 0x7f; /* first 7 bits */ raw >>= 7; dev = raw & 0xff; /* next 8 bits */ subdev = 0; - request->protocol = RC_TYPE_SONY15; + request->protocol = RC_PROTO_SONY15; break; case 20: - if (!(enabled_protocols & RC_BIT_SONY20)) + if (!(enabled_protocols & RC_PROTO_BIT_SONY20)) return -EINVAL; func = raw & 0x7f; /* first 7 bits */ raw >>= 7; dev = raw & 0x1f; /* next 5 bits */ raw >>= 5; subdev = raw & 0xff; /* next 8 bits */ - request->protocol = RC_TYPE_SONY20; + request->protocol = RC_PROTO_SONY20; break; default: return -EINVAL; @@ -68,7 +68,8 @@ static int img_ir_sony_filter(const struct rc_scancode_filter *in, func = (in->data >> 0) & 0x7f; func_m = (in->mask >> 0) & 0x7f; - protocols &= RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20; + protocols &= RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | + RC_PROTO_BIT_SONY20; /* * If only one bit is set, we were requested to do an exact @@ -77,20 +78,20 @@ static int img_ir_sony_filter(const struct rc_scancode_filter *in, */ if (!is_power_of_2(protocols)) { if (subdev & subdev_m) - protocols = RC_BIT_SONY20; + protocols = RC_PROTO_BIT_SONY20; else if (dev & dev_m & 0xe0) - protocols = RC_BIT_SONY15; + protocols = RC_PROTO_BIT_SONY15; else - protocols = RC_BIT_SONY12; + protocols = RC_PROTO_BIT_SONY12; } - if (protocols == RC_BIT_SONY20) { + if (protocols == RC_PROTO_BIT_SONY20) { /* can't encode subdev and higher device bits */ if (dev & dev_m & 0xe0) return -EINVAL; len = 20; dev_m &= 0x1f; - } else if (protocols == RC_BIT_SONY15) { + } else if (protocols == RC_PROTO_BIT_SONY15) { len = 15; subdev_m = 0; } else { @@ -128,7 +129,7 @@ static int img_ir_sony_filter(const struct rc_scancode_filter *in, * http://picprojects.org.uk/projects/sirc/sonysirc.pdf */ struct img_ir_decoder img_ir_sony = { - .type = RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20, + .type = RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | RC_PROTO_BIT_SONY20, .control = { .decoden = 1, .code_type = IMG_IR_CODETYPE_PULSELEN, diff --git a/drivers/media/rc/imon.c b/drivers/media/rc/imon.c index ab560fdaae7c..7b3f31cc63d2 100644 --- a/drivers/media/rc/imon.c +++ b/drivers/media/rc/imon.c @@ -148,7 +148,7 @@ struct imon_context { u32 last_keycode; /* last reported input keycode */ u32 rc_scancode; /* the computed remote scancode */ u8 rc_toggle; /* the computed remote toggle bit */ - u64 rc_type; /* iMON or MCE (RC6) IR protocol? */ + u64 rc_proto; /* iMON or MCE (RC6) IR protocol? */ bool release_code; /* some keys send a release code */ u8 display_type; /* store the display type */ @@ -1118,7 +1118,7 @@ static void imon_touch_display_timeout(unsigned long data) * it is not, so we must acquire it prior to calling send_packet, which * requires that the lock is held. 
*/ -static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_type) +static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto) { int retval; struct imon_context *ictx = rc->priv; @@ -1127,25 +1127,25 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_type) unsigned char ir_proto_packet[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x86 }; - if (*rc_type && !(*rc_type & rc->allowed_protocols)) + if (*rc_proto && !(*rc_proto & rc->allowed_protocols)) dev_warn(dev, "Looks like you're trying to use an IR protocol this device does not support\n"); - if (*rc_type & RC_BIT_RC6_MCE) { + if (*rc_proto & RC_PROTO_BIT_RC6_MCE) { dev_dbg(dev, "Configuring IR receiver for MCE protocol\n"); ir_proto_packet[0] = 0x01; - *rc_type = RC_BIT_RC6_MCE; - } else if (*rc_type & RC_BIT_OTHER) { + *rc_proto = RC_PROTO_BIT_RC6_MCE; + } else if (*rc_proto & RC_PROTO_BIT_OTHER) { dev_dbg(dev, "Configuring IR receiver for iMON protocol\n"); if (!pad_stabilize) dev_dbg(dev, "PAD stabilize functionality disabled\n"); /* ir_proto_packet[0] = 0x00; // already the default */ - *rc_type = RC_BIT_OTHER; + *rc_proto = RC_PROTO_BIT_OTHER; } else { dev_warn(dev, "Unsupported IR protocol specified, overriding to iMON IR protocol\n"); if (!pad_stabilize) dev_dbg(dev, "PAD stabilize functionality disabled\n"); /* ir_proto_packet[0] = 0x00; // already the default */ - *rc_type = RC_BIT_OTHER; + *rc_proto = RC_PROTO_BIT_OTHER; } memcpy(ictx->usb_tx_buf, &ir_proto_packet, sizeof(ir_proto_packet)); @@ -1159,7 +1159,7 @@ static int imon_ir_change_protocol(struct rc_dev *rc, u64 *rc_type) if (retval) goto out; - ictx->rc_type = *rc_type; + ictx->rc_proto = *rc_proto; ictx->pad_mouse = false; out: @@ -1435,7 +1435,7 @@ static void imon_pad_to_keys(struct imon_context *ictx, unsigned char *buf) rel_x = buf[2]; rel_y = buf[3]; - if (ictx->rc_type == RC_BIT_OTHER && pad_stabilize) { + if (ictx->rc_proto == RC_PROTO_BIT_OTHER && pad_stabilize) { if ((buf[1] == 0) && ((rel_x != 0) || (rel_y != 0))) { dir = stabilize((int)rel_x, (int)rel_y, timeout, threshold); @@ -1502,7 +1502,7 @@ static void imon_pad_to_keys(struct imon_context *ictx, unsigned char *buf) buf[0] = 0x01; buf[1] = buf[4] = buf[5] = buf[6] = buf[7] = 0; - if (ictx->rc_type == RC_BIT_OTHER && pad_stabilize) { + if (ictx->rc_proto == RC_PROTO_BIT_OTHER && pad_stabilize) { dir = stabilize((int)rel_x, (int)rel_y, timeout, threshold); if (!dir) { @@ -1706,7 +1706,7 @@ static void imon_incoming_scancode(struct imon_context *ictx, ictx->release_code = false; } else { scancode = be32_to_cpu(*((__be32 *)buf)); - if (ictx->rc_type == RC_BIT_RC6_MCE) { + if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE) { ktype = IMON_KEY_IMON; if (buf[0] == 0x80) ktype = IMON_KEY_MCE; @@ -1769,10 +1769,10 @@ static void imon_incoming_scancode(struct imon_context *ictx, if (press_type == 0) rc_keyup(ictx->rdev); else { - if (ictx->rc_type == RC_BIT_RC6_MCE || - ictx->rc_type == RC_BIT_OTHER) + if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE || + ictx->rc_proto == RC_PROTO_BIT_OTHER) rc_keydown(ictx->rdev, - ictx->rc_type == RC_BIT_RC6_MCE ? RC_TYPE_RC6_MCE : RC_TYPE_OTHER, + ictx->rc_proto == RC_PROTO_BIT_RC6_MCE ? 
RC_PROTO_RC6_MCE : RC_PROTO_OTHER, ictx->rc_scancode, ictx->rc_toggle); spin_lock_irqsave(&ictx->kc_lock, flags); ictx->last_keycode = ictx->kc; @@ -1936,7 +1936,7 @@ static void imon_get_ffdc_type(struct imon_context *ictx) { u8 ffdc_cfg_byte = ictx->usb_rx_buf[6]; u8 detected_display_type = IMON_DISPLAY_TYPE_NONE; - u64 allowed_protos = RC_BIT_OTHER; + u64 allowed_protos = RC_PROTO_BIT_OTHER; switch (ffdc_cfg_byte) { /* iMON Knob, no display, iMON IR + vol knob */ @@ -1967,27 +1967,27 @@ static void imon_get_ffdc_type(struct imon_context *ictx) case 0x9e: dev_info(ictx->dev, "0xffdc iMON VFD, MCE IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; - allowed_protos = RC_BIT_RC6_MCE; + allowed_protos = RC_PROTO_BIT_RC6_MCE; break; /* iMON LCD, MCE IR */ case 0x9f: dev_info(ictx->dev, "0xffdc iMON LCD, MCE IR"); detected_display_type = IMON_DISPLAY_TYPE_LCD; - allowed_protos = RC_BIT_RC6_MCE; + allowed_protos = RC_PROTO_BIT_RC6_MCE; break; default: dev_info(ictx->dev, "Unknown 0xffdc device, defaulting to VFD and iMON IR"); detected_display_type = IMON_DISPLAY_TYPE_VFD; /* We don't know which one it is, allow user to set the * RC6 one from userspace if OTHER wasn't correct. */ - allowed_protos |= RC_BIT_RC6_MCE; + allowed_protos |= RC_PROTO_BIT_RC6_MCE; break; } printk(KERN_CONT " (id 0x%02x)\n", ffdc_cfg_byte); ictx->display_type = detected_display_type; - ictx->rc_type = allowed_protos; + ictx->rc_proto = allowed_protos; } static void imon_set_display_type(struct imon_context *ictx) @@ -2070,10 +2070,11 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) rdev->priv = ictx; if (ictx->dev_descr->flags & IMON_IR_RAW) - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; else /* iMON PAD or MCE */ - rdev->allowed_protocols = RC_BIT_OTHER | RC_BIT_RC6_MCE; + rdev->allowed_protocols = RC_PROTO_BIT_OTHER | + RC_PROTO_BIT_RC6_MCE; rdev->change_protocol = imon_ir_change_protocol; rdev->driver_name = MOD_NAME; @@ -2086,12 +2087,12 @@ static struct rc_dev *imon_init_rdev(struct imon_context *ictx) if (ictx->product == 0xffdc) { imon_get_ffdc_type(ictx); - rdev->allowed_protocols = ictx->rc_type; + rdev->allowed_protocols = ictx->rc_proto; } imon_set_display_type(ictx); - if (ictx->rc_type == RC_BIT_RC6_MCE || + if (ictx->rc_proto == RC_PROTO_BIT_RC6_MCE || ictx->dev_descr->flags & IMON_IR_RAW) rdev->map_name = RC_MAP_IMON_MCE; else diff --git a/drivers/media/rc/ir-hix5hd2.c b/drivers/media/rc/ir-hix5hd2.c index 0e639fb6f7b9..0ce11c41dfae 100644 --- a/drivers/media/rc/ir-hix5hd2.c +++ b/drivers/media/rc/ir-hix5hd2.c @@ -242,7 +242,7 @@ static int hix5hd2_ir_probe(struct platform_device *pdev) clk_prepare_enable(priv->clock); priv->rate = clk_get_rate(priv->clock); - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->priv = priv; rdev->open = hix5hd2_ir_open; rdev->close = hix5hd2_ir_close; diff --git a/drivers/media/rc/ir-jvc-decoder.c b/drivers/media/rc/ir-jvc-decoder.c index 674bf156edcb..e2bd68c42edf 100644 --- a/drivers/media/rc/ir-jvc-decoder.c +++ b/drivers/media/rc/ir-jvc-decoder.c @@ -137,7 +137,7 @@ again: scancode = (bitrev8((data->bits >> 8) & 0xff) << 8) | (bitrev8((data->bits >> 0) & 0xff) << 0); IR_dprintk(1, "JVC scancode 0x%04x\n", scancode); - rc_keydown(dev, RC_TYPE_JVC, scancode, data->toggle); + rc_keydown(dev, RC_PROTO_JVC, scancode, data->toggle); data->first = false; data->old_bits = data->bits; } else if (data->bits == data->old_bits) { @@ -193,7 
+193,7 @@ static const struct ir_raw_timings_pd ir_jvc_timings = { * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. */ -static int ir_jvc_encode(enum rc_type protocol, u32 scancode, +static int ir_jvc_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_event *e = events; @@ -209,7 +209,7 @@ static int ir_jvc_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler jvc_handler = { - .protocols = RC_BIT_JVC, + .protocols = RC_PROTO_BIT_JVC, .decode = ir_jvc_decode, .encode = ir_jvc_encode, }; diff --git a/drivers/media/rc/ir-mce_kbd-decoder.c b/drivers/media/rc/ir-mce_kbd-decoder.c index 5de14ae77421..7c572a643656 100644 --- a/drivers/media/rc/ir-mce_kbd-decoder.c +++ b/drivers/media/rc/ir-mce_kbd-decoder.c @@ -444,14 +444,14 @@ static const struct ir_raw_timings_manchester ir_mce_kbd_timings = { * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. */ -static int ir_mce_kbd_encode(enum rc_type protocol, u32 scancode, +static int ir_mce_kbd_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_event *e = events; int len, ret; u64 raw; - if (protocol == RC_TYPE_MCIR2_KBD) { + if (protocol == RC_PROTO_MCIR2_KBD) { raw = scancode | ((u64)MCIR2_KEYBOARD_HEADER << MCIR2_KEYBOARD_NBITS); len = MCIR2_KEYBOARD_NBITS + MCIR2_HEADER_NBITS + 1; @@ -469,7 +469,7 @@ static int ir_mce_kbd_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler mce_kbd_handler = { - .protocols = RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE, + .protocols = RC_PROTO_BIT_MCIR2_KBD | RC_PROTO_BIT_MCIR2_MSE, .decode = ir_mce_kbd_decode, .encode = ir_mce_kbd_encode, .raw_register = ir_mce_kbd_register, diff --git a/drivers/media/rc/ir-nec-decoder.c b/drivers/media/rc/ir-nec-decoder.c index 23d2bc8c190d..817c18f2ddd1 100644 --- a/drivers/media/rc/ir-nec-decoder.c +++ b/drivers/media/rc/ir-nec-decoder.c @@ -49,7 +49,7 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) { struct nec_dec *data = &dev->raw->nec; u32 scancode; - enum rc_type rc_type; + enum rc_proto rc_proto; u8 address, not_address, command, not_command; if (!is_timing_event(ev)) { @@ -158,12 +158,12 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) scancode = ir_nec_bytes_to_scancode(address, not_address, command, not_command, - &rc_type); + &rc_proto); if (data->is_nec_x) data->necx_repeat = true; - rc_keydown(dev, rc_type, scancode, 0); + rc_keydown(dev, rc_proto, scancode, 0); data->state = STATE_INACTIVE; return 0; } @@ -180,19 +180,19 @@ static int ir_nec_decode(struct rc_dev *dev, struct ir_raw_event ev) * @scancode: a single NEC scancode. * @raw: raw data to be modulated. 
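As a reference for the conversion below, a sketch of how the three NEC scancode layouts are packed (hypothetical helpers, mirroring the shifts in ir_nec_scancode_to_raw()):

/* NEC:   0x00AADD   - 8-bit address, 8-bit command
 * NECX:  0xAAaaDD   - 16-bit address, 8-bit command
 * NEC32: 0xaaAAddDD - address, command and their complements verbatim
 */
static u32 nec_scancode(u8 address, u8 command)
{
	return ((u32)address << 8) | command;
}

static u32 necx_scancode(u16 address, u8 command)
{
	return ((u32)address << 8) | command;
}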
*/ -static u32 ir_nec_scancode_to_raw(enum rc_type protocol, u32 scancode) +static u32 ir_nec_scancode_to_raw(enum rc_proto protocol, u32 scancode) { unsigned int addr, addr_inv, data, data_inv; data = scancode & 0xff; - if (protocol == RC_TYPE_NEC32) { + if (protocol == RC_PROTO_NEC32) { /* 32-bit NEC (used by Apple and TiVo remotes) */ /* scan encoding: aaAAddDD */ addr_inv = (scancode >> 24) & 0xff; addr = (scancode >> 16) & 0xff; data_inv = (scancode >> 8) & 0xff; - } else if (protocol == RC_TYPE_NECX) { + } else if (protocol == RC_PROTO_NECX) { /* Extended NEC */ /* scan encoding AAaaDD */ addr = (scancode >> 16) & 0xff; @@ -236,7 +236,7 @@ static const struct ir_raw_timings_pd ir_nec_timings = { * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. */ -static int ir_nec_encode(enum rc_type protocol, u32 scancode, +static int ir_nec_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_event *e = events; @@ -255,7 +255,8 @@ static int ir_nec_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler nec_handler = { - .protocols = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32, + .protocols = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32, .decode = ir_nec_decode, .encode = ir_nec_encode, }; diff --git a/drivers/media/rc/ir-rc5-decoder.c b/drivers/media/rc/ir-rc5-decoder.c index fcfedf95def7..1292f534de43 100644 --- a/drivers/media/rc/ir-rc5-decoder.c +++ b/drivers/media/rc/ir-rc5-decoder.c @@ -51,7 +51,7 @@ static int ir_rc5_decode(struct rc_dev *dev, struct ir_raw_event ev) struct rc5_dec *data = &dev->raw->rc5; u8 toggle; u32 scancode; - enum rc_type protocol; + enum rc_proto protocol; if (!is_timing_event(ev)) { if (ev.reset) @@ -124,7 +124,7 @@ again: if (data->is_rc5x && data->count == RC5X_NBITS) { /* RC5X */ u8 xdata, command, system; - if (!(dev->enabled_protocols & RC_BIT_RC5X_20)) { + if (!(dev->enabled_protocols & RC_PROTO_BIT_RC5X_20)) { data->state = STATE_INACTIVE; return 0; } @@ -134,12 +134,12 @@ again: toggle = (data->bits & 0x20000) ? 1 : 0; command += (data->bits & 0x40000) ? 0 : 0x40; scancode = system << 16 | command << 8 | xdata; - protocol = RC_TYPE_RC5X_20; + protocol = RC_PROTO_RC5X_20; } else if (!data->is_rc5x && data->count == RC5_NBITS) { /* RC5 */ u8 command, system; - if (!(dev->enabled_protocols & RC_BIT_RC5)) { + if (!(dev->enabled_protocols & RC_PROTO_BIT_RC5)) { data->state = STATE_INACTIVE; return 0; } @@ -148,12 +148,12 @@ again: toggle = (data->bits & 0x00800) ? 1 : 0; command += (data->bits & 0x01000) ? 0 : 0x40; scancode = system << 8 | command; - protocol = RC_TYPE_RC5; + protocol = RC_PROTO_RC5; } else if (!data->is_rc5x && data->count == RC5_SZ_NBITS) { /* RC5 StreamZap */ u8 command, system; - if (!(dev->enabled_protocols & RC_BIT_RC5_SZ)) { + if (!(dev->enabled_protocols & RC_PROTO_BIT_RC5_SZ)) { data->state = STATE_INACTIVE; return 0; } @@ -161,7 +161,7 @@ again: system = (data->bits & 0x02FC0) >> 6; toggle = (data->bits & 0x01000) ? 1 : 0; scancode = system << 6 | command; - protocol = RC_TYPE_RC5_SZ; + protocol = RC_PROTO_RC5_SZ; } else break; @@ -221,7 +221,7 @@ static const struct ir_raw_timings_manchester ir_rc5_sz_timings = { * encoding. In this case all @max events will have been written. * -EINVAL if the scancode is ambiguous or invalid. 
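The RC5 scancode layout that ir_rc5_encode() takes apart below can be summarized by a small packing helper (a sketch, not part of the patch):

/* Sketch: plain RC5 puts the 5-bit system in bits 8..12 and the
 * 7-bit command in bits 0..6; command bit 6 is the extended bit
 * recovered from the inverted second start bit. */
static u32 rc5_scancode(u8 system, u8 command)
{
	return ((u32)(system & 0x1f) << 8) | (command & 0x7f);
}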
*/ -static int ir_rc5_encode(enum rc_type protocol, u32 scancode, +static int ir_rc5_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { int ret; @@ -229,7 +229,7 @@ static int ir_rc5_encode(enum rc_type protocol, u32 scancode, unsigned int data, xdata, command, commandx, system, pre_space_data; /* Detect protocol and convert scancode to raw data */ - if (protocol == RC_TYPE_RC5) { + if (protocol == RC_PROTO_RC5) { /* decode scancode */ command = (scancode & 0x003f) >> 0; commandx = (scancode & 0x0040) >> 6; @@ -242,7 +242,7 @@ static int ir_rc5_encode(enum rc_type protocol, u32 scancode, RC5_NBITS, data); if (ret < 0) return ret; - } else if (protocol == RC_TYPE_RC5X_20) { + } else if (protocol == RC_PROTO_RC5X_20) { /* decode scancode */ xdata = (scancode & 0x00003f) >> 0; command = (scancode & 0x003f00) >> 8; @@ -264,7 +264,7 @@ static int ir_rc5_encode(enum rc_type protocol, u32 scancode, data); if (ret < 0) return ret; - } else if (protocol == RC_TYPE_RC5_SZ) { + } else if (protocol == RC_PROTO_RC5_SZ) { /* RC5-SZ scancode is raw enough for Manchester as it is */ ret = ir_raw_gen_manchester(&e, max, &ir_rc5_sz_timings, RC5_SZ_NBITS, scancode & 0x2fff); @@ -278,7 +278,8 @@ static int ir_rc5_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler rc5_handler = { - .protocols = RC_BIT_RC5 | RC_BIT_RC5X_20 | RC_BIT_RC5_SZ, + .protocols = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | + RC_PROTO_BIT_RC5_SZ, .decode = ir_rc5_decode, .encode = ir_rc5_encode, }; diff --git a/drivers/media/rc/ir-rc6-decoder.c b/drivers/media/rc/ir-rc6-decoder.c index 6fe2268dada0..5d0d2fe3b7a7 100644 --- a/drivers/media/rc/ir-rc6-decoder.c +++ b/drivers/media/rc/ir-rc6-decoder.c @@ -88,7 +88,7 @@ static int ir_rc6_decode(struct rc_dev *dev, struct ir_raw_event ev) struct rc6_dec *data = &dev->raw->rc6; u32 scancode; u8 toggle; - enum rc_type protocol; + enum rc_proto protocol; if (!is_timing_event(ev)) { if (ev.reset) @@ -229,7 +229,7 @@ again: case RC6_MODE_0: scancode = data->body; toggle = data->toggle; - protocol = RC_TYPE_RC6_0; + protocol = RC_PROTO_RC6_0; IR_dprintk(1, "RC6(0) scancode 0x%04x (toggle: %u)\n", scancode, toggle); break; @@ -244,20 +244,20 @@ again: scancode = data->body; switch (data->count) { case 20: - protocol = RC_TYPE_RC6_6A_20; + protocol = RC_PROTO_RC6_6A_20; toggle = 0; break; case 24: - protocol = RC_TYPE_RC6_6A_24; + protocol = RC_PROTO_RC6_6A_24; toggle = 0; break; case 32: if ((scancode & RC6_6A_LCC_MASK) == RC6_6A_MCE_CC) { - protocol = RC_TYPE_RC6_MCE; + protocol = RC_PROTO_RC6_MCE; toggle = !!(scancode & RC6_6A_MCE_TOGGLE_MASK); scancode &= ~RC6_6A_MCE_TOGGLE_MASK; } else { - protocol = RC_TYPE_RC6_6A_32; + protocol = RC_PROTO_RC6_6A_32; toggle = 0; } break; @@ -322,13 +322,13 @@ static const struct ir_raw_timings_manchester ir_rc6_timings[4] = { * encoding. In this case all @max events will have been written. * -EINVAL if the scancode is ambiguous or invalid. 
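For reference, the count-to-protocol mapping the RC6 decoder above implements, restated as a sketch (the 32-bit case is split into MCE and plain 6A-32 by the customer-code check shown in the hunk):

/* Sketch: pick the RC6 protocol variant from the payload width. */
static enum rc_proto rc6_6a_proto(unsigned int count, bool is_mce)
{
	switch (count) {
	case 20:
		return RC_PROTO_RC6_6A_20;
	case 24:
		return RC_PROTO_RC6_6A_24;
	case 32:
		return is_mce ? RC_PROTO_RC6_MCE : RC_PROTO_RC6_6A_32;
	default:
		return RC_PROTO_UNKNOWN;
	}
}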
*/ -static int ir_rc6_encode(enum rc_type protocol, u32 scancode, +static int ir_rc6_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { int ret; struct ir_raw_event *e = events; - if (protocol == RC_TYPE_RC6_0) { + if (protocol == RC_PROTO_RC6_0) { /* Modulate the preamble */ ret = ir_raw_gen_manchester(&e, max, &ir_rc6_timings[0], 0, 0); if (ret < 0) @@ -358,14 +358,14 @@ static int ir_rc6_encode(enum rc_type protocol, u32 scancode, int bits; switch (protocol) { - case RC_TYPE_RC6_MCE: - case RC_TYPE_RC6_6A_32: + case RC_PROTO_RC6_MCE: + case RC_PROTO_RC6_6A_32: bits = 32; break; - case RC_TYPE_RC6_6A_24: + case RC_PROTO_RC6_6A_24: bits = 24; break; - case RC_TYPE_RC6_6A_20: + case RC_PROTO_RC6_6A_20: bits = 20; break; default: @@ -403,9 +403,9 @@ static int ir_rc6_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler rc6_handler = { - .protocols = RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | - RC_BIT_RC6_6A_24 | RC_BIT_RC6_6A_32 | - RC_BIT_RC6_MCE, + .protocols = RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | + RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | + RC_PROTO_BIT_RC6_MCE, .decode = ir_rc6_decode, .encode = ir_rc6_encode, }; diff --git a/drivers/media/rc/ir-sanyo-decoder.c b/drivers/media/rc/ir-sanyo-decoder.c index e6a906a34f90..758c60956850 100644 --- a/drivers/media/rc/ir-sanyo-decoder.c +++ b/drivers/media/rc/ir-sanyo-decoder.c @@ -161,7 +161,7 @@ static int ir_sanyo_decode(struct rc_dev *dev, struct ir_raw_event ev) scancode = address << 8 | command; IR_dprintk(1, "SANYO scancode: 0x%06x\n", scancode); - rc_keydown(dev, RC_TYPE_SANYO, scancode, 0); + rc_keydown(dev, RC_PROTO_SANYO, scancode, 0); data->state = STATE_INACTIVE; return 0; } @@ -195,7 +195,7 @@ static const struct ir_raw_timings_pd ir_sanyo_timings = { * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. */ -static int ir_sanyo_encode(enum rc_type protocol, u32 scancode, +static int ir_sanyo_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_event *e = events; @@ -215,7 +215,7 @@ static int ir_sanyo_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler sanyo_handler = { - .protocols = RC_BIT_SANYO, + .protocols = RC_PROTO_BIT_SANYO, .decode = ir_sanyo_decode, .encode = ir_sanyo_encode, }; diff --git a/drivers/media/rc/ir-sharp-decoder.c b/drivers/media/rc/ir-sharp-decoder.c index b47e89e2c1bd..ed43a4212479 100644 --- a/drivers/media/rc/ir-sharp-decoder.c +++ b/drivers/media/rc/ir-sharp-decoder.c @@ -161,7 +161,7 @@ static int ir_sharp_decode(struct rc_dev *dev, struct ir_raw_event ev) scancode = address << 8 | command; IR_dprintk(1, "Sharp scancode 0x%04x\n", scancode); - rc_keydown(dev, RC_TYPE_SHARP, scancode, 0); + rc_keydown(dev, RC_PROTO_SHARP, scancode, 0); data->state = STATE_INACTIVE; return 0; } @@ -196,7 +196,7 @@ static const struct ir_raw_timings_pd ir_sharp_timings = { * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. 
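The Sharp scancode layout used by the decoder above and unpacked again by ir_sharp_encode() below, as an illustrative helper (not in the patch):

/* Sketch: Sharp pairs a 5-bit address (bits 8..12) with an
 * 8-bit command (bits 0..7). */
static u32 sharp_scancode(u8 address, u8 command)
{
	return ((u32)(address & 0x1f) << 8) | command;
}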
*/ -static int ir_sharp_encode(enum rc_type protocol, u32 scancode, +static int ir_sharp_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_event *e = events; @@ -223,7 +223,7 @@ static int ir_sharp_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler sharp_handler = { - .protocols = RC_BIT_SHARP, + .protocols = RC_PROTO_BIT_SHARP, .decode = ir_sharp_decode, .encode = ir_sharp_encode, }; diff --git a/drivers/media/rc/ir-sony-decoder.c b/drivers/media/rc/ir-sony-decoder.c index 355fa8198f5a..a47ced763031 100644 --- a/drivers/media/rc/ir-sony-decoder.c +++ b/drivers/media/rc/ir-sony-decoder.c @@ -42,7 +42,7 @@ enum sony_state { static int ir_sony_decode(struct rc_dev *dev, struct ir_raw_event ev) { struct sony_dec *data = &dev->raw->sony; - enum rc_type protocol; + enum rc_proto protocol; u32 scancode; u8 device, subdevice, function; @@ -121,31 +121,31 @@ static int ir_sony_decode(struct rc_dev *dev, struct ir_raw_event ev) switch (data->count) { case 12: - if (!(dev->enabled_protocols & RC_BIT_SONY12)) + if (!(dev->enabled_protocols & RC_PROTO_BIT_SONY12)) goto finish_state_machine; device = bitrev8((data->bits << 3) & 0xF8); subdevice = 0; function = bitrev8((data->bits >> 4) & 0xFE); - protocol = RC_TYPE_SONY12; + protocol = RC_PROTO_SONY12; break; case 15: - if (!(dev->enabled_protocols & RC_BIT_SONY15)) + if (!(dev->enabled_protocols & RC_PROTO_BIT_SONY15)) goto finish_state_machine; device = bitrev8((data->bits >> 0) & 0xFF); subdevice = 0; function = bitrev8((data->bits >> 7) & 0xFE); - protocol = RC_TYPE_SONY15; + protocol = RC_PROTO_SONY15; break; case 20: - if (!(dev->enabled_protocols & RC_BIT_SONY20)) + if (!(dev->enabled_protocols & RC_PROTO_BIT_SONY20)) goto finish_state_machine; device = bitrev8((data->bits >> 5) & 0xF8); subdevice = bitrev8((data->bits >> 0) & 0xFF); function = bitrev8((data->bits >> 12) & 0xFE); - protocol = RC_TYPE_SONY20; + protocol = RC_PROTO_SONY20; break; default: IR_dprintk(1, "Sony invalid bitcount %u\n", data->count); @@ -190,17 +190,17 @@ static const struct ir_raw_timings_pl ir_sony_timings = { * -ENOBUFS if there isn't enough space in the array to fit the * encoding. In this case all @max events will have been written. 
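All three Sony variants decoded above share one scancode layout, which ir_sony_encode() below takes apart again; a sketch (helper is illustrative only):

/* Sketch: Sony/SIRC keeps the 7-bit function in bits 0..6, the
 * subdevice in bits 8..15, and the device from bit 16 up
 * (5 bits for 12/20-bit frames, 8 bits for 15-bit frames). */
static u32 sony_scancode(u8 device, u8 subdevice, u8 function)
{
	return ((u32)device << 16) | ((u32)subdevice << 8) |
	       (function & 0x7f);
}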
*/ -static int ir_sony_encode(enum rc_type protocol, u32 scancode, +static int ir_sony_encode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max) { struct ir_raw_event *e = events; u32 raw, len; int ret; - if (protocol == RC_TYPE_SONY12) { + if (protocol == RC_PROTO_SONY12) { raw = (scancode & 0x7f) | ((scancode & 0x1f0000) >> 9); len = 12; - } else if (protocol == RC_TYPE_SONY15) { + } else if (protocol == RC_PROTO_SONY15) { raw = (scancode & 0x7f) | ((scancode & 0xff0000) >> 9); len = 15; } else { @@ -217,7 +217,8 @@ static int ir_sony_encode(enum rc_type protocol, u32 scancode, } static struct ir_raw_handler sony_handler = { - .protocols = RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20, + .protocols = RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | + RC_PROTO_BIT_SONY20, .decode = ir_sony_decode, .encode = ir_sony_encode, }; diff --git a/drivers/media/rc/ir-xmp-decoder.c b/drivers/media/rc/ir-xmp-decoder.c index 18596190bbb8..6f464be1c8d7 100644 --- a/drivers/media/rc/ir-xmp-decoder.c +++ b/drivers/media/rc/ir-xmp-decoder.c @@ -141,7 +141,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) IR_dprintk(1, "XMP scancode 0x%06x\n", scancode); if (toggle == 0) { - rc_keydown(dev, RC_TYPE_XMP, scancode, 0); + rc_keydown(dev, RC_PROTO_XMP, scancode, 0); } else { rc_repeat(dev); IR_dprintk(1, "Repeat last key\n"); @@ -196,7 +196,7 @@ static int ir_xmp_decode(struct rc_dev *dev, struct ir_raw_event ev) } static struct ir_raw_handler xmp_handler = { - .protocols = RC_BIT_XMP, + .protocols = RC_PROTO_BIT_XMP, .decode = ir_xmp_decode, }; diff --git a/drivers/media/rc/ite-cir.c b/drivers/media/rc/ite-cir.c index c8eea30b4e50..65e104c7ddfc 100644 --- a/drivers/media/rc/ite-cir.c +++ b/drivers/media/rc/ite-cir.c @@ -1556,7 +1556,7 @@ static int ite_probe(struct pnp_dev *pdev, const struct pnp_device_id /* set up ir-core props */ rdev->priv = itdev; - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->open = ite_open; rdev->close = ite_close; rdev->s_idle = ite_s_idle; diff --git a/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c b/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c index 01d901fbfc8b..2d303c2cee3b 100644 --- a/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c +++ b/drivers/media/rc/keymaps/rc-adstech-dvb-t-pci.c @@ -66,10 +66,10 @@ static struct rc_map_table adstech_dvb_t_pci[] = { static struct rc_map_list adstech_dvb_t_pci_map = { .map = { - .scan = adstech_dvb_t_pci, - .size = ARRAY_SIZE(adstech_dvb_t_pci), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_ADSTECH_DVB_T_PCI, + .scan = adstech_dvb_t_pci, + .size = ARRAY_SIZE(adstech_dvb_t_pci), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_ADSTECH_DVB_T_PCI, } }; diff --git a/drivers/media/rc/keymaps/rc-alink-dtu-m.c b/drivers/media/rc/keymaps/rc-alink-dtu-m.c index 4e6ade8e616f..3818c33734a1 100644 --- a/drivers/media/rc/keymaps/rc-alink-dtu-m.c +++ b/drivers/media/rc/keymaps/rc-alink-dtu-m.c @@ -45,10 +45,10 @@ static struct rc_map_table alink_dtu_m[] = { static struct rc_map_list alink_dtu_m_map = { .map = { - .scan = alink_dtu_m, - .size = ARRAY_SIZE(alink_dtu_m), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_ALINK_DTU_M, + .scan = alink_dtu_m, + .size = ARRAY_SIZE(alink_dtu_m), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_ALINK_DTU_M, } }; diff --git a/drivers/media/rc/keymaps/rc-anysee.c b/drivers/media/rc/keymaps/rc-anysee.c index c735fe10a390..e75e51b34d29 100644 --- 
a/drivers/media/rc/keymaps/rc-anysee.c +++ b/drivers/media/rc/keymaps/rc-anysee.c @@ -70,10 +70,10 @@ static struct rc_map_table anysee[] = { static struct rc_map_list anysee_map = { .map = { - .scan = anysee, - .size = ARRAY_SIZE(anysee), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_ANYSEE, + .scan = anysee, + .size = ARRAY_SIZE(anysee), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_ANYSEE, } }; diff --git a/drivers/media/rc/keymaps/rc-apac-viewcomp.c b/drivers/media/rc/keymaps/rc-apac-viewcomp.c index bf9efa007e1c..65bc8957d9c3 100644 --- a/drivers/media/rc/keymaps/rc-apac-viewcomp.c +++ b/drivers/media/rc/keymaps/rc-apac-viewcomp.c @@ -57,10 +57,10 @@ static struct rc_map_table apac_viewcomp[] = { static struct rc_map_list apac_viewcomp_map = { .map = { - .scan = apac_viewcomp, - .size = ARRAY_SIZE(apac_viewcomp), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_APAC_VIEWCOMP, + .scan = apac_viewcomp, + .size = ARRAY_SIZE(apac_viewcomp), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_APAC_VIEWCOMP, } }; diff --git a/drivers/media/rc/keymaps/rc-asus-pc39.c b/drivers/media/rc/keymaps/rc-asus-pc39.c index 9e674ba5dd4f..530e1d1158d1 100644 --- a/drivers/media/rc/keymaps/rc-asus-pc39.c +++ b/drivers/media/rc/keymaps/rc-asus-pc39.c @@ -68,10 +68,10 @@ static struct rc_map_table asus_pc39[] = { static struct rc_map_list asus_pc39_map = { .map = { - .scan = asus_pc39, - .size = ARRAY_SIZE(asus_pc39), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_ASUS_PC39, + .scan = asus_pc39, + .size = ARRAY_SIZE(asus_pc39), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_ASUS_PC39, } }; diff --git a/drivers/media/rc/keymaps/rc-asus-ps3-100.c b/drivers/media/rc/keymaps/rc-asus-ps3-100.c index e45de35f528f..c91ba332984c 100644 --- a/drivers/media/rc/keymaps/rc-asus-ps3-100.c +++ b/drivers/media/rc/keymaps/rc-asus-ps3-100.c @@ -67,10 +67,10 @@ static struct rc_map_table asus_ps3_100[] = { static struct rc_map_list asus_ps3_100_map = { .map = { - .scan = asus_ps3_100, - .size = ARRAY_SIZE(asus_ps3_100), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_ASUS_PS3_100, + .scan = asus_ps3_100, + .size = ARRAY_SIZE(asus_ps3_100), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_ASUS_PS3_100, } }; diff --git a/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c b/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c index 91392d4cfd6d..11b4bdd2392b 100644 --- a/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c +++ b/drivers/media/rc/keymaps/rc-ati-tv-wonder-hd-600.c @@ -46,10 +46,10 @@ static struct rc_map_table ati_tv_wonder_hd_600[] = { static struct rc_map_list ati_tv_wonder_hd_600_map = { .map = { - .scan = ati_tv_wonder_hd_600, - .size = ARRAY_SIZE(ati_tv_wonder_hd_600), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_ATI_TV_WONDER_HD_600, + .scan = ati_tv_wonder_hd_600, + .size = ARRAY_SIZE(ati_tv_wonder_hd_600), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_ATI_TV_WONDER_HD_600, } }; diff --git a/drivers/media/rc/keymaps/rc-ati-x10.c b/drivers/media/rc/keymaps/rc-ati-x10.c index 4bdc709ec54d..11f1eb6ad712 100644 --- a/drivers/media/rc/keymaps/rc-ati-x10.c +++ b/drivers/media/rc/keymaps/rc-ati-x10.c @@ -114,10 +114,10 @@ static struct rc_map_table ati_x10[] = { static struct rc_map_list ati_x10_map = { .map = { - .scan = ati_x10, - .size = ARRAY_SIZE(ati_x10), - .rc_type = RC_TYPE_OTHER, - .name = RC_MAP_ATI_X10, + .scan = ati_x10, + .size = ARRAY_SIZE(ati_x10), + .rc_proto = RC_PROTO_OTHER, + .name = RC_MAP_ATI_X10, } }; diff --git 
a/drivers/media/rc/keymaps/rc-avermedia-a16d.c b/drivers/media/rc/keymaps/rc-avermedia-a16d.c index ff30a71d623e..510dc90ebf49 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-a16d.c +++ b/drivers/media/rc/keymaps/rc-avermedia-a16d.c @@ -52,10 +52,10 @@ static struct rc_map_table avermedia_a16d[] = { static struct rc_map_list avermedia_a16d_map = { .map = { - .scan = avermedia_a16d, - .size = ARRAY_SIZE(avermedia_a16d), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_AVERMEDIA_A16D, + .scan = avermedia_a16d, + .size = ARRAY_SIZE(avermedia_a16d), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_AVERMEDIA_A16D, } }; diff --git a/drivers/media/rc/keymaps/rc-avermedia-cardbus.c b/drivers/media/rc/keymaps/rc-avermedia-cardbus.c index d7471a6de9b4..4bbc1e68d1b8 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-cardbus.c +++ b/drivers/media/rc/keymaps/rc-avermedia-cardbus.c @@ -74,10 +74,10 @@ static struct rc_map_table avermedia_cardbus[] = { static struct rc_map_list avermedia_cardbus_map = { .map = { - .scan = avermedia_cardbus, - .size = ARRAY_SIZE(avermedia_cardbus), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_AVERMEDIA_CARDBUS, + .scan = avermedia_cardbus, + .size = ARRAY_SIZE(avermedia_cardbus), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_AVERMEDIA_CARDBUS, } }; diff --git a/drivers/media/rc/keymaps/rc-avermedia-dvbt.c b/drivers/media/rc/keymaps/rc-avermedia-dvbt.c index e2417d6331fe..f6b8547dbad3 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-dvbt.c +++ b/drivers/media/rc/keymaps/rc-avermedia-dvbt.c @@ -55,10 +55,10 @@ static struct rc_map_table avermedia_dvbt[] = { static struct rc_map_list avermedia_dvbt_map = { .map = { - .scan = avermedia_dvbt, - .size = ARRAY_SIZE(avermedia_dvbt), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_AVERMEDIA_DVBT, + .scan = avermedia_dvbt, + .size = ARRAY_SIZE(avermedia_dvbt), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_AVERMEDIA_DVBT, } }; diff --git a/drivers/media/rc/keymaps/rc-avermedia-m135a.c b/drivers/media/rc/keymaps/rc-avermedia-m135a.c index 843598a5f1b5..9882e2cde975 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-m135a.c +++ b/drivers/media/rc/keymaps/rc-avermedia-m135a.c @@ -124,10 +124,10 @@ static struct rc_map_table avermedia_m135a[] = { static struct rc_map_list avermedia_m135a_map = { .map = { - .scan = avermedia_m135a, - .size = ARRAY_SIZE(avermedia_m135a), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_AVERMEDIA_M135A, + .scan = avermedia_m135a, + .size = ARRAY_SIZE(avermedia_m135a), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_AVERMEDIA_M135A, } }; diff --git a/drivers/media/rc/keymaps/rc-avermedia-m733a-rm-k6.c b/drivers/media/rc/keymaps/rc-avermedia-m733a-rm-k6.c index b24e7481ac21..d86126e10375 100644 --- a/drivers/media/rc/keymaps/rc-avermedia-m733a-rm-k6.c +++ b/drivers/media/rc/keymaps/rc-avermedia-m733a-rm-k6.c @@ -72,10 +72,10 @@ static struct rc_map_table avermedia_m733a_rm_k6[] = { static struct rc_map_list avermedia_m733a_rm_k6_map = { .map = { - .scan = avermedia_m733a_rm_k6, - .size = ARRAY_SIZE(avermedia_m733a_rm_k6), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_AVERMEDIA_M733A_RM_K6, + .scan = avermedia_m733a_rm_k6, + .size = ARRAY_SIZE(avermedia_m733a_rm_k6), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_AVERMEDIA_M733A_RM_K6, } }; diff --git a/drivers/media/rc/keymaps/rc-avermedia-rm-ks.c b/drivers/media/rc/keymaps/rc-avermedia-rm-ks.c index 2583400ca1b4..5d92d36d9174 100644 --- 
a/drivers/media/rc/keymaps/rc-avermedia-rm-ks.c +++ b/drivers/media/rc/keymaps/rc-avermedia-rm-ks.c @@ -56,10 +56,10 @@ static struct rc_map_table avermedia_rm_ks[] = { static struct rc_map_list avermedia_rm_ks_map = { .map = { - .scan = avermedia_rm_ks, - .size = ARRAY_SIZE(avermedia_rm_ks), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_AVERMEDIA_RM_KS, + .scan = avermedia_rm_ks, + .size = ARRAY_SIZE(avermedia_rm_ks), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_AVERMEDIA_RM_KS, } }; diff --git a/drivers/media/rc/keymaps/rc-avermedia.c b/drivers/media/rc/keymaps/rc-avermedia.c index 3f68fbecc188..6503f11c7df5 100644 --- a/drivers/media/rc/keymaps/rc-avermedia.c +++ b/drivers/media/rc/keymaps/rc-avermedia.c @@ -63,10 +63,10 @@ static struct rc_map_table avermedia[] = { static struct rc_map_list avermedia_map = { .map = { - .scan = avermedia, - .size = ARRAY_SIZE(avermedia), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_AVERMEDIA, + .scan = avermedia, + .size = ARRAY_SIZE(avermedia), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_AVERMEDIA, } }; diff --git a/drivers/media/rc/keymaps/rc-avertv-303.c b/drivers/media/rc/keymaps/rc-avertv-303.c index c35bc5b835c4..fbdd7ada57ce 100644 --- a/drivers/media/rc/keymaps/rc-avertv-303.c +++ b/drivers/media/rc/keymaps/rc-avertv-303.c @@ -62,10 +62,10 @@ static struct rc_map_table avertv_303[] = { static struct rc_map_list avertv_303_map = { .map = { - .scan = avertv_303, - .size = ARRAY_SIZE(avertv_303), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_AVERTV_303, + .scan = avertv_303, + .size = ARRAY_SIZE(avertv_303), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_AVERTV_303, } }; diff --git a/drivers/media/rc/keymaps/rc-azurewave-ad-tu700.c b/drivers/media/rc/keymaps/rc-azurewave-ad-tu700.c index ea7f2d0f31eb..18d7dcb869b0 100644 --- a/drivers/media/rc/keymaps/rc-azurewave-ad-tu700.c +++ b/drivers/media/rc/keymaps/rc-azurewave-ad-tu700.c @@ -79,10 +79,10 @@ static struct rc_map_table azurewave_ad_tu700[] = { static struct rc_map_list azurewave_ad_tu700_map = { .map = { - .scan = azurewave_ad_tu700, - .size = ARRAY_SIZE(azurewave_ad_tu700), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_AZUREWAVE_AD_TU700, + .scan = azurewave_ad_tu700, + .size = ARRAY_SIZE(azurewave_ad_tu700), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_AZUREWAVE_AD_TU700, } }; diff --git a/drivers/media/rc/keymaps/rc-behold-columbus.c b/drivers/media/rc/keymaps/rc-behold-columbus.c index 1fc344e9daa7..d256743be998 100644 --- a/drivers/media/rc/keymaps/rc-behold-columbus.c +++ b/drivers/media/rc/keymaps/rc-behold-columbus.c @@ -85,10 +85,10 @@ static struct rc_map_table behold_columbus[] = { static struct rc_map_list behold_columbus_map = { .map = { - .scan = behold_columbus, - .size = ARRAY_SIZE(behold_columbus), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_BEHOLD_COLUMBUS, + .scan = behold_columbus, + .size = ARRAY_SIZE(behold_columbus), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_BEHOLD_COLUMBUS, } }; diff --git a/drivers/media/rc/keymaps/rc-behold.c b/drivers/media/rc/keymaps/rc-behold.c index 520a96f2ff86..93dc795adc67 100644 --- a/drivers/media/rc/keymaps/rc-behold.c +++ b/drivers/media/rc/keymaps/rc-behold.c @@ -118,10 +118,10 @@ static struct rc_map_table behold[] = { static struct rc_map_list behold_map = { .map = { - .scan = behold, - .size = ARRAY_SIZE(behold), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_BEHOLD, + .scan = behold, + .size = 
ARRAY_SIZE(behold), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_BEHOLD, } }; diff --git a/drivers/media/rc/keymaps/rc-budget-ci-old.c b/drivers/media/rc/keymaps/rc-budget-ci-old.c index b196a5f436a3..81ea1424d9e5 100644 --- a/drivers/media/rc/keymaps/rc-budget-ci-old.c +++ b/drivers/media/rc/keymaps/rc-budget-ci-old.c @@ -70,10 +70,10 @@ static struct rc_map_table budget_ci_old[] = { static struct rc_map_list budget_ci_old_map = { .map = { - .scan = budget_ci_old, - .size = ARRAY_SIZE(budget_ci_old), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_BUDGET_CI_OLD, + .scan = budget_ci_old, + .size = ARRAY_SIZE(budget_ci_old), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_BUDGET_CI_OLD, } }; diff --git a/drivers/media/rc/keymaps/rc-cec.c b/drivers/media/rc/keymaps/rc-cec.c index 354c8e724b8e..76d34abb7c85 100644 --- a/drivers/media/rc/keymaps/rc-cec.c +++ b/drivers/media/rc/keymaps/rc-cec.c @@ -160,7 +160,7 @@ static struct rc_map_list cec_map = { .map = { .scan = cec, .size = ARRAY_SIZE(cec), - .rc_type = RC_TYPE_CEC, + .rc_proto = RC_PROTO_CEC, .name = RC_MAP_CEC, } }; diff --git a/drivers/media/rc/keymaps/rc-cinergy-1400.c b/drivers/media/rc/keymaps/rc-cinergy-1400.c index a099c080bf8c..bcb96b3dda85 100644 --- a/drivers/media/rc/keymaps/rc-cinergy-1400.c +++ b/drivers/media/rc/keymaps/rc-cinergy-1400.c @@ -61,10 +61,10 @@ static struct rc_map_table cinergy_1400[] = { static struct rc_map_list cinergy_1400_map = { .map = { - .scan = cinergy_1400, - .size = ARRAY_SIZE(cinergy_1400), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_CINERGY_1400, + .scan = cinergy_1400, + .size = ARRAY_SIZE(cinergy_1400), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_CINERGY_1400, } }; diff --git a/drivers/media/rc/keymaps/rc-cinergy.c b/drivers/media/rc/keymaps/rc-cinergy.c index b0f4328bdd6f..fd56c402aae5 100644 --- a/drivers/media/rc/keymaps/rc-cinergy.c +++ b/drivers/media/rc/keymaps/rc-cinergy.c @@ -55,10 +55,10 @@ static struct rc_map_table cinergy[] = { static struct rc_map_list cinergy_map = { .map = { - .scan = cinergy, - .size = ARRAY_SIZE(cinergy), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_CINERGY, + .scan = cinergy, + .size = ARRAY_SIZE(cinergy), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_CINERGY, } }; diff --git a/drivers/media/rc/keymaps/rc-d680-dmb.c b/drivers/media/rc/keymaps/rc-d680-dmb.c index bb5745d29d8a..2c94b9d88b67 100644 --- a/drivers/media/rc/keymaps/rc-d680-dmb.c +++ b/drivers/media/rc/keymaps/rc-d680-dmb.c @@ -51,10 +51,10 @@ static struct rc_map_table rc_map_d680_dmb_table[] = { static struct rc_map_list d680_dmb_map = { .map = { - .scan = rc_map_d680_dmb_table, - .size = ARRAY_SIZE(rc_map_d680_dmb_table), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_D680_DMB, + .scan = rc_map_d680_dmb_table, + .size = ARRAY_SIZE(rc_map_d680_dmb_table), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_D680_DMB, } }; diff --git a/drivers/media/rc/keymaps/rc-delock-61959.c b/drivers/media/rc/keymaps/rc-delock-61959.c index 01bed864f09d..62de69d78d92 100644 --- a/drivers/media/rc/keymaps/rc-delock-61959.c +++ b/drivers/media/rc/keymaps/rc-delock-61959.c @@ -58,10 +58,10 @@ static struct rc_map_table delock_61959[] = { static struct rc_map_list delock_61959_map = { .map = { - .scan = delock_61959, - .size = ARRAY_SIZE(delock_61959), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DELOCK_61959, + .scan = delock_61959, + .size = 
ARRAY_SIZE(delock_61959), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_DELOCK_61959, } }; diff --git a/drivers/media/rc/keymaps/rc-dib0700-nec.c b/drivers/media/rc/keymaps/rc-dib0700-nec.c index a0fa543c9f9e..1b4df106b7b5 100644 --- a/drivers/media/rc/keymaps/rc-dib0700-nec.c +++ b/drivers/media/rc/keymaps/rc-dib0700-nec.c @@ -101,10 +101,10 @@ static struct rc_map_table dib0700_nec_table[] = { static struct rc_map_list dib0700_nec_map = { .map = { - .scan = dib0700_nec_table, - .size = ARRAY_SIZE(dib0700_nec_table), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DIB0700_NEC_TABLE, + .scan = dib0700_nec_table, + .size = ARRAY_SIZE(dib0700_nec_table), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_DIB0700_NEC_TABLE, } }; diff --git a/drivers/media/rc/keymaps/rc-dib0700-rc5.c b/drivers/media/rc/keymaps/rc-dib0700-rc5.c index 907941145eb7..b0f8151bb824 100644 --- a/drivers/media/rc/keymaps/rc-dib0700-rc5.c +++ b/drivers/media/rc/keymaps/rc-dib0700-rc5.c @@ -212,10 +212,10 @@ static struct rc_map_table dib0700_rc5_table[] = { static struct rc_map_list dib0700_rc5_map = { .map = { - .scan = dib0700_rc5_table, - .size = ARRAY_SIZE(dib0700_rc5_table), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_DIB0700_RC5_TABLE, + .scan = dib0700_rc5_table, + .size = ARRAY_SIZE(dib0700_rc5_table), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_DIB0700_RC5_TABLE, } }; diff --git a/drivers/media/rc/keymaps/rc-digitalnow-tinytwin.c b/drivers/media/rc/keymaps/rc-digitalnow-tinytwin.c index bed78acb9198..01ca8b39359f 100644 --- a/drivers/media/rc/keymaps/rc-digitalnow-tinytwin.c +++ b/drivers/media/rc/keymaps/rc-digitalnow-tinytwin.c @@ -75,10 +75,10 @@ static struct rc_map_table digitalnow_tinytwin[] = { static struct rc_map_list digitalnow_tinytwin_map = { .map = { - .scan = digitalnow_tinytwin, - .size = ARRAY_SIZE(digitalnow_tinytwin), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DIGITALNOW_TINYTWIN, + .scan = digitalnow_tinytwin, + .size = ARRAY_SIZE(digitalnow_tinytwin), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_DIGITALNOW_TINYTWIN, } }; diff --git a/drivers/media/rc/keymaps/rc-digittrade.c b/drivers/media/rc/keymaps/rc-digittrade.c index a3b97a1fe223..a54b1d632ca6 100644 --- a/drivers/media/rc/keymaps/rc-digittrade.c +++ b/drivers/media/rc/keymaps/rc-digittrade.c @@ -59,10 +59,10 @@ static struct rc_map_table digittrade[] = { static struct rc_map_list digittrade_map = { .map = { - .scan = digittrade, - .size = ARRAY_SIZE(digittrade), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DIGITTRADE, + .scan = digittrade, + .size = ARRAY_SIZE(digittrade), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_DIGITTRADE, } }; diff --git a/drivers/media/rc/keymaps/rc-dm1105-nec.c b/drivers/media/rc/keymaps/rc-dm1105-nec.c index 46e7ae414cc8..c353445d10ed 100644 --- a/drivers/media/rc/keymaps/rc-dm1105-nec.c +++ b/drivers/media/rc/keymaps/rc-dm1105-nec.c @@ -53,10 +53,10 @@ static struct rc_map_table dm1105_nec[] = { static struct rc_map_list dm1105_nec_map = { .map = { - .scan = dm1105_nec, - .size = ARRAY_SIZE(dm1105_nec), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_DM1105_NEC, + .scan = dm1105_nec, + .size = ARRAY_SIZE(dm1105_nec), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_DM1105_NEC, } }; diff --git a/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c b/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c index d2826b46fea2..5bafd5b70f5e 100644 --- a/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c +++ b/drivers/media/rc/keymaps/rc-dntv-live-dvb-t.c @@ -55,10 +55,10 @@ static struct rc_map_table 
dntv_live_dvb_t[] = { static struct rc_map_list dntv_live_dvb_t_map = { .map = { - .scan = dntv_live_dvb_t, - .size = ARRAY_SIZE(dntv_live_dvb_t), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_DNTV_LIVE_DVB_T, + .scan = dntv_live_dvb_t, + .size = ARRAY_SIZE(dntv_live_dvb_t), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_DNTV_LIVE_DVB_T, } }; diff --git a/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c b/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c index 0d74769467b5..360167c8829b 100644 --- a/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c +++ b/drivers/media/rc/keymaps/rc-dntv-live-dvbt-pro.c @@ -74,10 +74,10 @@ static struct rc_map_table dntv_live_dvbt_pro[] = { static struct rc_map_list dntv_live_dvbt_pro_map = { .map = { - .scan = dntv_live_dvbt_pro, - .size = ARRAY_SIZE(dntv_live_dvbt_pro), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_DNTV_LIVE_DVBT_PRO, + .scan = dntv_live_dvbt_pro, + .size = ARRAY_SIZE(dntv_live_dvbt_pro), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_DNTV_LIVE_DVBT_PRO, } }; diff --git a/drivers/media/rc/keymaps/rc-dtt200u.c b/drivers/media/rc/keymaps/rc-dtt200u.c index 25650e9e4664..c932d8b6c509 100644 --- a/drivers/media/rc/keymaps/rc-dtt200u.c +++ b/drivers/media/rc/keymaps/rc-dtt200u.c @@ -35,10 +35,10 @@ static struct rc_map_table dtt200u_table[] = { static struct rc_map_list dtt200u_map = { .map = { - .scan = dtt200u_table, - .size = ARRAY_SIZE(dtt200u_table), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DTT200U, + .scan = dtt200u_table, + .size = ARRAY_SIZE(dtt200u_table), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_DTT200U, } }; diff --git a/drivers/media/rc/keymaps/rc-dvbsky.c b/drivers/media/rc/keymaps/rc-dvbsky.c index c5115a1165d1..d6c0b4c1e20e 100644 --- a/drivers/media/rc/keymaps/rc-dvbsky.c +++ b/drivers/media/rc/keymaps/rc-dvbsky.c @@ -54,10 +54,10 @@ static struct rc_map_table rc5_dvbsky[] = { static struct rc_map_list rc5_dvbsky_map = { .map = { - .scan = rc5_dvbsky, - .size = ARRAY_SIZE(rc5_dvbsky), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_DVBSKY, + .scan = rc5_dvbsky, + .size = ARRAY_SIZE(rc5_dvbsky), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_DVBSKY, } }; diff --git a/drivers/media/rc/keymaps/rc-dvico-mce.c b/drivers/media/rc/keymaps/rc-dvico-mce.c index d1e861f4d095..e4cee190b923 100644 --- a/drivers/media/rc/keymaps/rc-dvico-mce.c +++ b/drivers/media/rc/keymaps/rc-dvico-mce.c @@ -61,10 +61,10 @@ static struct rc_map_table rc_map_dvico_mce_table[] = { static struct rc_map_list dvico_mce_map = { .map = { - .scan = rc_map_dvico_mce_table, - .size = ARRAY_SIZE(rc_map_dvico_mce_table), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DVICO_MCE, + .scan = rc_map_dvico_mce_table, + .size = ARRAY_SIZE(rc_map_dvico_mce_table), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_DVICO_MCE, } }; diff --git a/drivers/media/rc/keymaps/rc-dvico-portable.c b/drivers/media/rc/keymaps/rc-dvico-portable.c index ac4cb515cbf1..cdd21f54aa61 100644 --- a/drivers/media/rc/keymaps/rc-dvico-portable.c +++ b/drivers/media/rc/keymaps/rc-dvico-portable.c @@ -52,10 +52,10 @@ static struct rc_map_table rc_map_dvico_portable_table[] = { static struct rc_map_list dvico_portable_map = { .map = { - .scan = rc_map_dvico_portable_table, - .size = ARRAY_SIZE(rc_map_dvico_portable_table), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_DVICO_PORTABLE, + .scan = rc_map_dvico_portable_table, + .size = ARRAY_SIZE(rc_map_dvico_portable_table), + .rc_proto = RC_PROTO_NEC, + .name = 
RC_MAP_DVICO_PORTABLE, } }; diff --git a/drivers/media/rc/keymaps/rc-em-terratec.c b/drivers/media/rc/keymaps/rc-em-terratec.c index 7f1e06be175b..18e1a2679c20 100644 --- a/drivers/media/rc/keymaps/rc-em-terratec.c +++ b/drivers/media/rc/keymaps/rc-em-terratec.c @@ -46,10 +46,10 @@ static struct rc_map_table em_terratec[] = { static struct rc_map_list em_terratec_map = { .map = { - .scan = em_terratec, - .size = ARRAY_SIZE(em_terratec), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_EM_TERRATEC, + .scan = em_terratec, + .size = ARRAY_SIZE(em_terratec), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_EM_TERRATEC, } }; diff --git a/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c b/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c index 4fc3904daf06..72ffd5cb0108 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv-fm53.c @@ -58,10 +58,10 @@ static struct rc_map_table encore_enltv_fm53[] = { static struct rc_map_list encore_enltv_fm53_map = { .map = { - .scan = encore_enltv_fm53, - .size = ARRAY_SIZE(encore_enltv_fm53), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_ENCORE_ENLTV_FM53, + .scan = encore_enltv_fm53, + .size = ARRAY_SIZE(encore_enltv_fm53), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_ENCORE_ENLTV_FM53, } }; diff --git a/drivers/media/rc/keymaps/rc-encore-enltv.c b/drivers/media/rc/keymaps/rc-encore-enltv.c index f1914e23d203..e0381e7aa964 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv.c @@ -89,10 +89,10 @@ static struct rc_map_table encore_enltv[] = { static struct rc_map_list encore_enltv_map = { .map = { - .scan = encore_enltv, - .size = ARRAY_SIZE(encore_enltv), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_ENCORE_ENLTV, + .scan = encore_enltv, + .size = ARRAY_SIZE(encore_enltv), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_ENCORE_ENLTV, } }; diff --git a/drivers/media/rc/keymaps/rc-encore-enltv2.c b/drivers/media/rc/keymaps/rc-encore-enltv2.c index 9c6c55240d18..e9b0bfba319c 100644 --- a/drivers/media/rc/keymaps/rc-encore-enltv2.c +++ b/drivers/media/rc/keymaps/rc-encore-enltv2.c @@ -67,10 +67,10 @@ static struct rc_map_table encore_enltv2[] = { static struct rc_map_list encore_enltv2_map = { .map = { - .scan = encore_enltv2, - .size = ARRAY_SIZE(encore_enltv2), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_ENCORE_ENLTV2, + .scan = encore_enltv2, + .size = ARRAY_SIZE(encore_enltv2), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_ENCORE_ENLTV2, } }; diff --git a/drivers/media/rc/keymaps/rc-evga-indtube.c b/drivers/media/rc/keymaps/rc-evga-indtube.c index 2370d2a3deb6..b77c5e908668 100644 --- a/drivers/media/rc/keymaps/rc-evga-indtube.c +++ b/drivers/media/rc/keymaps/rc-evga-indtube.c @@ -38,10 +38,10 @@ static struct rc_map_table evga_indtube[] = { static struct rc_map_list evga_indtube_map = { .map = { - .scan = evga_indtube, - .size = ARRAY_SIZE(evga_indtube), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_EVGA_INDTUBE, + .scan = evga_indtube, + .size = ARRAY_SIZE(evga_indtube), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_EVGA_INDTUBE, } }; diff --git a/drivers/media/rc/keymaps/rc-eztv.c b/drivers/media/rc/keymaps/rc-eztv.c index b5c96ed84376..5013b3b2aa93 100644 --- a/drivers/media/rc/keymaps/rc-eztv.c +++ b/drivers/media/rc/keymaps/rc-eztv.c 
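Each of these keymap files changes only the .rc_proto spelling inside otherwise identical module boilerplate; for context, a sketch of the registration scaffolding that brackets every rc_map_list (shown for eztv_map, whose hunk follows):

/* Sketch: keymap modules register their table with rc-core at load
 * time and unregister it on removal; this patch touches none of it. */
static int __init init_rc_map_eztv(void)
{
	return rc_map_register(&eztv_map);
}

static void __exit exit_rc_map_eztv(void)
{
	rc_map_unregister(&eztv_map);
}

module_init(init_rc_map_eztv)
module_exit(exit_rc_map_eztv)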
@@ -73,10 +73,10 @@ static struct rc_map_table eztv[] = { static struct rc_map_list eztv_map = { .map = { - .scan = eztv, - .size = ARRAY_SIZE(eztv), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_EZTV, + .scan = eztv, + .size = ARRAY_SIZE(eztv), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_EZTV, } }; diff --git a/drivers/media/rc/keymaps/rc-flydvb.c b/drivers/media/rc/keymaps/rc-flydvb.c index 25cb89fac03c..418b32521273 100644 --- a/drivers/media/rc/keymaps/rc-flydvb.c +++ b/drivers/media/rc/keymaps/rc-flydvb.c @@ -54,10 +54,10 @@ static struct rc_map_table flydvb[] = { static struct rc_map_list flydvb_map = { .map = { - .scan = flydvb, - .size = ARRAY_SIZE(flydvb), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_FLYDVB, + .scan = flydvb, + .size = ARRAY_SIZE(flydvb), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_FLYDVB, } }; diff --git a/drivers/media/rc/keymaps/rc-flyvideo.c b/drivers/media/rc/keymaps/rc-flyvideo.c index e71377dd0534..93fb87ecf061 100644 --- a/drivers/media/rc/keymaps/rc-flyvideo.c +++ b/drivers/media/rc/keymaps/rc-flyvideo.c @@ -47,10 +47,10 @@ static struct rc_map_table flyvideo[] = { static struct rc_map_list flyvideo_map = { .map = { - .scan = flyvideo, - .size = ARRAY_SIZE(flyvideo), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_FLYVIDEO, + .scan = flyvideo, + .size = ARRAY_SIZE(flyvideo), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_FLYVIDEO, } }; diff --git a/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c b/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c index cf0608dc83d5..9ed3f749262b 100644 --- a/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c +++ b/drivers/media/rc/keymaps/rc-fusionhdtv-mce.c @@ -75,10 +75,10 @@ static struct rc_map_table fusionhdtv_mce[] = { static struct rc_map_list fusionhdtv_mce_map = { .map = { - .scan = fusionhdtv_mce, - .size = ARRAY_SIZE(fusionhdtv_mce), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_FUSIONHDTV_MCE, + .scan = fusionhdtv_mce, + .size = ARRAY_SIZE(fusionhdtv_mce), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_FUSIONHDTV_MCE, } }; diff --git a/drivers/media/rc/keymaps/rc-gadmei-rm008z.c b/drivers/media/rc/keymaps/rc-gadmei-rm008z.c index 03575bdb2eca..3443b721d092 100644 --- a/drivers/media/rc/keymaps/rc-gadmei-rm008z.c +++ b/drivers/media/rc/keymaps/rc-gadmei-rm008z.c @@ -58,10 +58,10 @@ static struct rc_map_table gadmei_rm008z[] = { static struct rc_map_list gadmei_rm008z_map = { .map = { - .scan = gadmei_rm008z, - .size = ARRAY_SIZE(gadmei_rm008z), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_GADMEI_RM008Z, + .scan = gadmei_rm008z, + .size = ARRAY_SIZE(gadmei_rm008z), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_GADMEI_RM008Z, } }; diff --git a/drivers/media/rc/keymaps/rc-geekbox.c b/drivers/media/rc/keymaps/rc-geekbox.c index affc4c481888..4aa1b54bb52e 100644 --- a/drivers/media/rc/keymaps/rc-geekbox.c +++ b/drivers/media/rc/keymaps/rc-geekbox.c @@ -31,10 +31,10 @@ static struct rc_map_table geekbox[] = { static struct rc_map_list geekbox_map = { .map = { - .scan = geekbox, - .size = ARRAY_SIZE(geekbox), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_GEEKBOX, + .scan = geekbox, + .size = ARRAY_SIZE(geekbox), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_GEEKBOX, } }; diff --git a/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c b/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c index 
b2ab13b0dcb1..d140e8d45bcc 100644 --- a/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c +++ b/drivers/media/rc/keymaps/rc-genius-tvgo-a11mce.c @@ -61,10 +61,10 @@ static struct rc_map_table genius_tvgo_a11mce[] = { static struct rc_map_list genius_tvgo_a11mce_map = { .map = { - .scan = genius_tvgo_a11mce, - .size = ARRAY_SIZE(genius_tvgo_a11mce), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_GENIUS_TVGO_A11MCE, + .scan = genius_tvgo_a11mce, + .size = ARRAY_SIZE(genius_tvgo_a11mce), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_GENIUS_TVGO_A11MCE, } }; diff --git a/drivers/media/rc/keymaps/rc-gotview7135.c b/drivers/media/rc/keymaps/rc-gotview7135.c index 229a36ac7f0a..51230fbb52ba 100644 --- a/drivers/media/rc/keymaps/rc-gotview7135.c +++ b/drivers/media/rc/keymaps/rc-gotview7135.c @@ -56,10 +56,10 @@ static struct rc_map_table gotview7135[] = { static struct rc_map_list gotview7135_map = { .map = { - .scan = gotview7135, - .size = ARRAY_SIZE(gotview7135), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_GOTVIEW7135, + .scan = gotview7135, + .size = ARRAY_SIZE(gotview7135), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_GOTVIEW7135, } }; diff --git a/drivers/media/rc/keymaps/rc-hauppauge.c b/drivers/media/rc/keymaps/rc-hauppauge.c index 36d57f7c532b..890164b68d64 100644 --- a/drivers/media/rc/keymaps/rc-hauppauge.c +++ b/drivers/media/rc/keymaps/rc-hauppauge.c @@ -269,10 +269,10 @@ static struct rc_map_table rc5_hauppauge_new[] = { static struct rc_map_list rc5_hauppauge_new_map = { .map = { - .scan = rc5_hauppauge_new, - .size = ARRAY_SIZE(rc5_hauppauge_new), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_HAUPPAUGE, + .scan = rc5_hauppauge_new, + .size = ARRAY_SIZE(rc5_hauppauge_new), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_HAUPPAUGE, } }; diff --git a/drivers/media/rc/keymaps/rc-imon-mce.c b/drivers/media/rc/keymaps/rc-imon-mce.c index f0da960560b0..6a69ce1451f1 100644 --- a/drivers/media/rc/keymaps/rc-imon-mce.c +++ b/drivers/media/rc/keymaps/rc-imon-mce.c @@ -118,11 +118,11 @@ static struct rc_map_table imon_mce[] = { static struct rc_map_list imon_mce_map = { .map = { - .scan = imon_mce, - .size = ARRAY_SIZE(imon_mce), + .scan = imon_mce, + .size = ARRAY_SIZE(imon_mce), /* its RC6, but w/a hardware decoder */ - .rc_type = RC_TYPE_RC6_MCE, - .name = RC_MAP_IMON_MCE, + .rc_proto = RC_PROTO_RC6_MCE, + .name = RC_MAP_IMON_MCE, } }; diff --git a/drivers/media/rc/keymaps/rc-imon-pad.c b/drivers/media/rc/keymaps/rc-imon-pad.c index 999c6295c70e..a7296ffbf218 100644 --- a/drivers/media/rc/keymaps/rc-imon-pad.c +++ b/drivers/media/rc/keymaps/rc-imon-pad.c @@ -132,11 +132,11 @@ static struct rc_map_table imon_pad[] = { static struct rc_map_list imon_pad_map = { .map = { - .scan = imon_pad, - .size = ARRAY_SIZE(imon_pad), + .scan = imon_pad, + .size = ARRAY_SIZE(imon_pad), /* actual protocol details unknown, hardware decoder */ - .rc_type = RC_TYPE_OTHER, - .name = RC_MAP_IMON_PAD, + .rc_proto = RC_PROTO_OTHER, + .name = RC_MAP_IMON_PAD, } }; diff --git a/drivers/media/rc/keymaps/rc-iodata-bctv7e.c b/drivers/media/rc/keymaps/rc-iodata-bctv7e.c index 9ee154cb0c6b..8cf87a15c4f2 100644 --- a/drivers/media/rc/keymaps/rc-iodata-bctv7e.c +++ b/drivers/media/rc/keymaps/rc-iodata-bctv7e.c @@ -65,10 +65,10 @@ static struct rc_map_table iodata_bctv7e[] = { static struct rc_map_list iodata_bctv7e_map = { .map = { - .scan = iodata_bctv7e, - .size = ARRAY_SIZE(iodata_bctv7e), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR 
type */ - .name = RC_MAP_IODATA_BCTV7E, + .scan = iodata_bctv7e, + .size = ARRAY_SIZE(iodata_bctv7e), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_IODATA_BCTV7E, } }; diff --git a/drivers/media/rc/keymaps/rc-it913x-v1.c b/drivers/media/rc/keymaps/rc-it913x-v1.c index 0ac775fd109d..908d14848ae8 100644 --- a/drivers/media/rc/keymaps/rc-it913x-v1.c +++ b/drivers/media/rc/keymaps/rc-it913x-v1.c @@ -71,10 +71,10 @@ static struct rc_map_table it913x_v1_rc[] = { static struct rc_map_list it913x_v1_map = { .map = { - .scan = it913x_v1_rc, - .size = ARRAY_SIZE(it913x_v1_rc), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_IT913X_V1, + .scan = it913x_v1_rc, + .size = ARRAY_SIZE(it913x_v1_rc), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_IT913X_V1, } }; diff --git a/drivers/media/rc/keymaps/rc-it913x-v2.c b/drivers/media/rc/keymaps/rc-it913x-v2.c index bd42a30ec06f..05ab7fa4f90b 100644 --- a/drivers/media/rc/keymaps/rc-it913x-v2.c +++ b/drivers/media/rc/keymaps/rc-it913x-v2.c @@ -70,10 +70,10 @@ static struct rc_map_table it913x_v2_rc[] = { static struct rc_map_list it913x_v2_map = { .map = { - .scan = it913x_v2_rc, - .size = ARRAY_SIZE(it913x_v2_rc), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_IT913X_V2, + .scan = it913x_v2_rc, + .size = ARRAY_SIZE(it913x_v2_rc), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_IT913X_V2, } }; diff --git a/drivers/media/rc/keymaps/rc-kaiomy.c b/drivers/media/rc/keymaps/rc-kaiomy.c index 60803a732c08..e791f1e1b43b 100644 --- a/drivers/media/rc/keymaps/rc-kaiomy.c +++ b/drivers/media/rc/keymaps/rc-kaiomy.c @@ -64,10 +64,10 @@ static struct rc_map_table kaiomy[] = { static struct rc_map_list kaiomy_map = { .map = { - .scan = kaiomy, - .size = ARRAY_SIZE(kaiomy), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_KAIOMY, + .scan = kaiomy, + .size = ARRAY_SIZE(kaiomy), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_KAIOMY, } }; diff --git a/drivers/media/rc/keymaps/rc-kworld-315u.c b/drivers/media/rc/keymaps/rc-kworld-315u.c index ba087eed1ed9..71dce0138f0e 100644 --- a/drivers/media/rc/keymaps/rc-kworld-315u.c +++ b/drivers/media/rc/keymaps/rc-kworld-315u.c @@ -60,10 +60,10 @@ static struct rc_map_table kworld_315u[] = { static struct rc_map_list kworld_315u_map = { .map = { - .scan = kworld_315u, - .size = ARRAY_SIZE(kworld_315u), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_KWORLD_315U, + .scan = kworld_315u, + .size = ARRAY_SIZE(kworld_315u), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_KWORLD_315U, } }; diff --git a/drivers/media/rc/keymaps/rc-kworld-pc150u.c b/drivers/media/rc/keymaps/rc-kworld-pc150u.c index b92e571f4def..3846059060aa 100644 --- a/drivers/media/rc/keymaps/rc-kworld-pc150u.c +++ b/drivers/media/rc/keymaps/rc-kworld-pc150u.c @@ -78,10 +78,10 @@ static struct rc_map_table kworld_pc150u[] = { static struct rc_map_list kworld_pc150u_map = { .map = { - .scan = kworld_pc150u, - .size = ARRAY_SIZE(kworld_pc150u), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_KWORLD_PC150U, + .scan = kworld_pc150u, + .size = ARRAY_SIZE(kworld_pc150u), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_KWORLD_PC150U, } }; diff --git a/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c b/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c index edc868564f99..e0322ed16c94 100644 --- a/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c +++ b/drivers/media/rc/keymaps/rc-kworld-plus-tv-analog.c @@ -76,10 +76,10 @@ static struct rc_map_table kworld_plus_tv_analog[] = { static struct 
rc_map_list kworld_plus_tv_analog_map = { .map = { - .scan = kworld_plus_tv_analog, - .size = ARRAY_SIZE(kworld_plus_tv_analog), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_KWORLD_PLUS_TV_ANALOG, + .scan = kworld_plus_tv_analog, + .size = ARRAY_SIZE(kworld_plus_tv_analog), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_KWORLD_PLUS_TV_ANALOG, } }; diff --git a/drivers/media/rc/keymaps/rc-leadtek-y04g0051.c b/drivers/media/rc/keymaps/rc-leadtek-y04g0051.c index 03d762d986ee..e534a5601b6d 100644 --- a/drivers/media/rc/keymaps/rc-leadtek-y04g0051.c +++ b/drivers/media/rc/keymaps/rc-leadtek-y04g0051.c @@ -76,10 +76,10 @@ static struct rc_map_table leadtek_y04g0051[] = { static struct rc_map_list leadtek_y04g0051_map = { .map = { - .scan = leadtek_y04g0051, - .size = ARRAY_SIZE(leadtek_y04g0051), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_LEADTEK_Y04G0051, + .scan = leadtek_y04g0051, + .size = ARRAY_SIZE(leadtek_y04g0051), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_LEADTEK_Y04G0051, } }; diff --git a/drivers/media/rc/keymaps/rc-lme2510.c b/drivers/media/rc/keymaps/rc-lme2510.c index 2b0027c41332..9c93f90f5c2b 100644 --- a/drivers/media/rc/keymaps/rc-lme2510.c +++ b/drivers/media/rc/keymaps/rc-lme2510.c @@ -87,10 +87,10 @@ static struct rc_map_table lme2510_rc[] = { static struct rc_map_list lme2510_map = { .map = { - .scan = lme2510_rc, - .size = ARRAY_SIZE(lme2510_rc), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_LME2510, + .scan = lme2510_rc, + .size = ARRAY_SIZE(lme2510_rc), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_LME2510, } }; diff --git a/drivers/media/rc/keymaps/rc-manli.c b/drivers/media/rc/keymaps/rc-manli.c index 92424ef2aaa6..da566902a4dd 100644 --- a/drivers/media/rc/keymaps/rc-manli.c +++ b/drivers/media/rc/keymaps/rc-manli.c @@ -111,10 +111,10 @@ static struct rc_map_table manli[] = { static struct rc_map_list manli_map = { .map = { - .scan = manli, - .size = ARRAY_SIZE(manli), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_MANLI, + .scan = manli, + .size = ARRAY_SIZE(manli), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_MANLI, } }; diff --git a/drivers/media/rc/keymaps/rc-medion-x10-digitainer.c b/drivers/media/rc/keymaps/rc-medion-x10-digitainer.c index 966f9b3c71da..c9973340e546 100644 --- a/drivers/media/rc/keymaps/rc-medion-x10-digitainer.c +++ b/drivers/media/rc/keymaps/rc-medion-x10-digitainer.c @@ -98,10 +98,10 @@ static struct rc_map_table medion_x10_digitainer[] = { static struct rc_map_list medion_x10_digitainer_map = { .map = { - .scan = medion_x10_digitainer, - .size = ARRAY_SIZE(medion_x10_digitainer), - .rc_type = RC_TYPE_OTHER, - .name = RC_MAP_MEDION_X10_DIGITAINER, + .scan = medion_x10_digitainer, + .size = ARRAY_SIZE(medion_x10_digitainer), + .rc_proto = RC_PROTO_OTHER, + .name = RC_MAP_MEDION_X10_DIGITAINER, } }; diff --git a/drivers/media/rc/keymaps/rc-medion-x10-or2x.c b/drivers/media/rc/keymaps/rc-medion-x10-or2x.c index b077300ecb5c..103ad88d242c 100644 --- a/drivers/media/rc/keymaps/rc-medion-x10-or2x.c +++ b/drivers/media/rc/keymaps/rc-medion-x10-or2x.c @@ -83,10 +83,10 @@ static struct rc_map_table medion_x10_or2x[] = { static struct rc_map_list medion_x10_or2x_map = { .map = { - .scan = medion_x10_or2x, - .size = ARRAY_SIZE(medion_x10_or2x), - .rc_type = RC_TYPE_OTHER, - .name = RC_MAP_MEDION_X10_OR2X, + .scan = medion_x10_or2x, + .size = ARRAY_SIZE(medion_x10_or2x), + .rc_proto = RC_PROTO_OTHER, + .name = RC_MAP_MEDION_X10_OR2X, } }; diff --git 
a/drivers/media/rc/keymaps/rc-medion-x10.c b/drivers/media/rc/keymaps/rc-medion-x10.c index 479cdb897810..bbffa5dfe420 100644 --- a/drivers/media/rc/keymaps/rc-medion-x10.c +++ b/drivers/media/rc/keymaps/rc-medion-x10.c @@ -93,10 +93,10 @@ static struct rc_map_table medion_x10[] = { static struct rc_map_list medion_x10_map = { .map = { - .scan = medion_x10, - .size = ARRAY_SIZE(medion_x10), - .rc_type = RC_TYPE_OTHER, - .name = RC_MAP_MEDION_X10, + .scan = medion_x10, + .size = ARRAY_SIZE(medion_x10), + .rc_proto = RC_PROTO_OTHER, + .name = RC_MAP_MEDION_X10, } }; diff --git a/drivers/media/rc/keymaps/rc-msi-digivox-ii.c b/drivers/media/rc/keymaps/rc-msi-digivox-ii.c index 2fa71d0d72d7..94aa12d4b73c 100644 --- a/drivers/media/rc/keymaps/rc-msi-digivox-ii.c +++ b/drivers/media/rc/keymaps/rc-msi-digivox-ii.c @@ -44,10 +44,10 @@ static struct rc_map_table msi_digivox_ii[] = { static struct rc_map_list msi_digivox_ii_map = { .map = { - .scan = msi_digivox_ii, - .size = ARRAY_SIZE(msi_digivox_ii), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_MSI_DIGIVOX_II, + .scan = msi_digivox_ii, + .size = ARRAY_SIZE(msi_digivox_ii), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_MSI_DIGIVOX_II, } }; diff --git a/drivers/media/rc/keymaps/rc-msi-digivox-iii.c b/drivers/media/rc/keymaps/rc-msi-digivox-iii.c index 303a0b73175b..8fec0c1dcb12 100644 --- a/drivers/media/rc/keymaps/rc-msi-digivox-iii.c +++ b/drivers/media/rc/keymaps/rc-msi-digivox-iii.c @@ -62,10 +62,10 @@ static struct rc_map_table msi_digivox_iii[] = { static struct rc_map_list msi_digivox_iii_map = { .map = { - .scan = msi_digivox_iii, - .size = ARRAY_SIZE(msi_digivox_iii), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_MSI_DIGIVOX_III, + .scan = msi_digivox_iii, + .size = ARRAY_SIZE(msi_digivox_iii), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_MSI_DIGIVOX_III, } }; diff --git a/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c b/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c index fd7a55c56167..dfa0ed1d7667 100644 --- a/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c +++ b/drivers/media/rc/keymaps/rc-msi-tvanywhere-plus.c @@ -100,10 +100,10 @@ static struct rc_map_table msi_tvanywhere_plus[] = { static struct rc_map_list msi_tvanywhere_plus_map = { .map = { - .scan = msi_tvanywhere_plus, - .size = ARRAY_SIZE(msi_tvanywhere_plus), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_MSI_TVANYWHERE_PLUS, + .scan = msi_tvanywhere_plus, + .size = ARRAY_SIZE(msi_tvanywhere_plus), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_MSI_TVANYWHERE_PLUS, } }; diff --git a/drivers/media/rc/keymaps/rc-msi-tvanywhere.c b/drivers/media/rc/keymaps/rc-msi-tvanywhere.c index 4233a8d4d63e..2111816a3f59 100644 --- a/drivers/media/rc/keymaps/rc-msi-tvanywhere.c +++ b/drivers/media/rc/keymaps/rc-msi-tvanywhere.c @@ -46,10 +46,10 @@ static struct rc_map_table msi_tvanywhere[] = { static struct rc_map_list msi_tvanywhere_map = { .map = { - .scan = msi_tvanywhere, - .size = ARRAY_SIZE(msi_tvanywhere), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_MSI_TVANYWHERE, + .scan = msi_tvanywhere, + .size = ARRAY_SIZE(msi_tvanywhere), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_MSI_TVANYWHERE, } }; diff --git a/drivers/media/rc/keymaps/rc-nebula.c b/drivers/media/rc/keymaps/rc-nebula.c index 4c50f33c7c41..109b6e1a8b1a 100644 --- a/drivers/media/rc/keymaps/rc-nebula.c +++ b/drivers/media/rc/keymaps/rc-nebula.c @@ -73,10 +73,10 @@ static struct rc_map_table nebula[] = { static struct rc_map_list 
nebula_map = { .map = { - .scan = nebula, - .size = ARRAY_SIZE(nebula), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_NEBULA, + .scan = nebula, + .size = ARRAY_SIZE(nebula), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_NEBULA, } }; diff --git a/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c b/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c index 292bbad35d21..bb2d3a2962c0 100644 --- a/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c +++ b/drivers/media/rc/keymaps/rc-nec-terratec-cinergy-xs.c @@ -134,10 +134,10 @@ static struct rc_map_table nec_terratec_cinergy_xs[] = { static struct rc_map_list nec_terratec_cinergy_xs_map = { .map = { - .scan = nec_terratec_cinergy_xs, - .size = ARRAY_SIZE(nec_terratec_cinergy_xs), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_NEC_TERRATEC_CINERGY_XS, + .scan = nec_terratec_cinergy_xs, + .size = ARRAY_SIZE(nec_terratec_cinergy_xs), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_NEC_TERRATEC_CINERGY_XS, } }; diff --git a/drivers/media/rc/keymaps/rc-norwood.c b/drivers/media/rc/keymaps/rc-norwood.c index ca1b82a2c54f..cd25df336749 100644 --- a/drivers/media/rc/keymaps/rc-norwood.c +++ b/drivers/media/rc/keymaps/rc-norwood.c @@ -62,10 +62,10 @@ static struct rc_map_table norwood[] = { static struct rc_map_list norwood_map = { .map = { - .scan = norwood, - .size = ARRAY_SIZE(norwood), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_NORWOOD, + .scan = norwood, + .size = ARRAY_SIZE(norwood), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_NORWOOD, } }; diff --git a/drivers/media/rc/keymaps/rc-npgtech.c b/drivers/media/rc/keymaps/rc-npgtech.c index 1fb946024512..140bbc20a764 100644 --- a/drivers/media/rc/keymaps/rc-npgtech.c +++ b/drivers/media/rc/keymaps/rc-npgtech.c @@ -57,10 +57,10 @@ static struct rc_map_table npgtech[] = { static struct rc_map_list npgtech_map = { .map = { - .scan = npgtech, - .size = ARRAY_SIZE(npgtech), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_NPGTECH, + .scan = npgtech, + .size = ARRAY_SIZE(npgtech), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_NPGTECH, } }; diff --git a/drivers/media/rc/keymaps/rc-pctv-sedna.c b/drivers/media/rc/keymaps/rc-pctv-sedna.c index 5ef01ab3fd50..52b4558b7bd0 100644 --- a/drivers/media/rc/keymaps/rc-pctv-sedna.c +++ b/drivers/media/rc/keymaps/rc-pctv-sedna.c @@ -57,10 +57,10 @@ static struct rc_map_table pctv_sedna[] = { static struct rc_map_list pctv_sedna_map = { .map = { - .scan = pctv_sedna, - .size = ARRAY_SIZE(pctv_sedna), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PCTV_SEDNA, + .scan = pctv_sedna, + .size = ARRAY_SIZE(pctv_sedna), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PCTV_SEDNA, } }; diff --git a/drivers/media/rc/keymaps/rc-pinnacle-color.c b/drivers/media/rc/keymaps/rc-pinnacle-color.c index a218b471a4ca..973c9c34e304 100644 --- a/drivers/media/rc/keymaps/rc-pinnacle-color.c +++ b/drivers/media/rc/keymaps/rc-pinnacle-color.c @@ -71,10 +71,10 @@ static struct rc_map_table pinnacle_color[] = { static struct rc_map_list pinnacle_color_map = { .map = { - .scan = pinnacle_color, - .size = ARRAY_SIZE(pinnacle_color), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PINNACLE_COLOR, + .scan = pinnacle_color, + .size = ARRAY_SIZE(pinnacle_color), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PINNACLE_COLOR, } }; diff --git a/drivers/media/rc/keymaps/rc-pinnacle-grey.c 
b/drivers/media/rc/keymaps/rc-pinnacle-grey.c index 4a3f467a47a2..22e44b0d2a93 100644 --- a/drivers/media/rc/keymaps/rc-pinnacle-grey.c +++ b/drivers/media/rc/keymaps/rc-pinnacle-grey.c @@ -66,10 +66,10 @@ static struct rc_map_table pinnacle_grey[] = { static struct rc_map_list pinnacle_grey_map = { .map = { - .scan = pinnacle_grey, - .size = ARRAY_SIZE(pinnacle_grey), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PINNACLE_GREY, + .scan = pinnacle_grey, + .size = ARRAY_SIZE(pinnacle_grey), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PINNACLE_GREY, } }; diff --git a/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c b/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c index e89cc10b68bf..186dcf8e0491 100644 --- a/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c +++ b/drivers/media/rc/keymaps/rc-pinnacle-pctv-hd.c @@ -47,10 +47,10 @@ static struct rc_map_table pinnacle_pctv_hd[] = { static struct rc_map_list pinnacle_pctv_hd_map = { .map = { - .scan = pinnacle_pctv_hd, - .size = ARRAY_SIZE(pinnacle_pctv_hd), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_PINNACLE_PCTV_HD, + .scan = pinnacle_pctv_hd, + .size = ARRAY_SIZE(pinnacle_pctv_hd), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_PINNACLE_PCTV_HD, } }; diff --git a/drivers/media/rc/keymaps/rc-pixelview-002t.c b/drivers/media/rc/keymaps/rc-pixelview-002t.c index d967c3816fdc..b235ada2e28f 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-002t.c +++ b/drivers/media/rc/keymaps/rc-pixelview-002t.c @@ -54,10 +54,10 @@ static struct rc_map_table pixelview_002t[] = { static struct rc_map_list pixelview_map = { .map = { - .scan = pixelview_002t, - .size = ARRAY_SIZE(pixelview_002t), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_PIXELVIEW_002T, + .scan = pixelview_002t, + .size = ARRAY_SIZE(pixelview_002t), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_PIXELVIEW_002T, } }; diff --git a/drivers/media/rc/keymaps/rc-pixelview-mk12.c b/drivers/media/rc/keymaps/rc-pixelview-mk12.c index 224d0efaa6e5..453d52d663fe 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-mk12.c +++ b/drivers/media/rc/keymaps/rc-pixelview-mk12.c @@ -60,10 +60,10 @@ static struct rc_map_table pixelview_mk12[] = { static struct rc_map_list pixelview_map = { .map = { - .scan = pixelview_mk12, - .size = ARRAY_SIZE(pixelview_mk12), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_PIXELVIEW_MK12, + .scan = pixelview_mk12, + .size = ARRAY_SIZE(pixelview_mk12), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_PIXELVIEW_MK12, } }; diff --git a/drivers/media/rc/keymaps/rc-pixelview-new.c b/drivers/media/rc/keymaps/rc-pixelview-new.c index 781d788d6b6d..ef97095ec8f1 100644 --- a/drivers/media/rc/keymaps/rc-pixelview-new.c +++ b/drivers/media/rc/keymaps/rc-pixelview-new.c @@ -60,10 +60,10 @@ static struct rc_map_table pixelview_new[] = { static struct rc_map_list pixelview_new_map = { .map = { - .scan = pixelview_new, - .size = ARRAY_SIZE(pixelview_new), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PIXELVIEW_NEW, + .scan = pixelview_new, + .size = ARRAY_SIZE(pixelview_new), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PIXELVIEW_NEW, } }; diff --git a/drivers/media/rc/keymaps/rc-pixelview.c b/drivers/media/rc/keymaps/rc-pixelview.c index 39e6feaa35a3..cfd8f80d3617 100644 --- a/drivers/media/rc/keymaps/rc-pixelview.c +++ b/drivers/media/rc/keymaps/rc-pixelview.c @@ -59,10 +59,10 @@ static struct rc_map_table pixelview[] = { static struct rc_map_list pixelview_map = { .map = { - .scan = pixelview, - .size = 
ARRAY_SIZE(pixelview), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PIXELVIEW, + .scan = pixelview, + .size = ARRAY_SIZE(pixelview), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PIXELVIEW, } }; diff --git a/drivers/media/rc/keymaps/rc-powercolor-real-angel.c b/drivers/media/rc/keymaps/rc-powercolor-real-angel.c index e96fa3ab9f4b..b63f82bcf29a 100644 --- a/drivers/media/rc/keymaps/rc-powercolor-real-angel.c +++ b/drivers/media/rc/keymaps/rc-powercolor-real-angel.c @@ -58,10 +58,10 @@ static struct rc_map_table powercolor_real_angel[] = { static struct rc_map_list powercolor_real_angel_map = { .map = { - .scan = powercolor_real_angel, - .size = ARRAY_SIZE(powercolor_real_angel), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_POWERCOLOR_REAL_ANGEL, + .scan = powercolor_real_angel, + .size = ARRAY_SIZE(powercolor_real_angel), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_POWERCOLOR_REAL_ANGEL, } }; diff --git a/drivers/media/rc/keymaps/rc-proteus-2309.c b/drivers/media/rc/keymaps/rc-proteus-2309.c index eef626ee02df..be34c517e4e1 100644 --- a/drivers/media/rc/keymaps/rc-proteus-2309.c +++ b/drivers/media/rc/keymaps/rc-proteus-2309.c @@ -46,10 +46,10 @@ static struct rc_map_table proteus_2309[] = { static struct rc_map_list proteus_2309_map = { .map = { - .scan = proteus_2309, - .size = ARRAY_SIZE(proteus_2309), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PROTEUS_2309, + .scan = proteus_2309, + .size = ARRAY_SIZE(proteus_2309), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PROTEUS_2309, } }; diff --git a/drivers/media/rc/keymaps/rc-purpletv.c b/drivers/media/rc/keymaps/rc-purpletv.c index cec6fe466829..84c40b97ee00 100644 --- a/drivers/media/rc/keymaps/rc-purpletv.c +++ b/drivers/media/rc/keymaps/rc-purpletv.c @@ -58,10 +58,10 @@ static struct rc_map_table purpletv[] = { static struct rc_map_list purpletv_map = { .map = { - .scan = purpletv, - .size = ARRAY_SIZE(purpletv), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PURPLETV, + .scan = purpletv, + .size = ARRAY_SIZE(purpletv), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PURPLETV, } }; diff --git a/drivers/media/rc/keymaps/rc-pv951.c b/drivers/media/rc/keymaps/rc-pv951.c index 5ac89ce8c053..be190ddebfc4 100644 --- a/drivers/media/rc/keymaps/rc-pv951.c +++ b/drivers/media/rc/keymaps/rc-pv951.c @@ -55,10 +55,10 @@ static struct rc_map_table pv951[] = { static struct rc_map_list pv951_map = { .map = { - .scan = pv951, - .size = ARRAY_SIZE(pv951), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_PV951, + .scan = pv951, + .size = ARRAY_SIZE(pv951), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_PV951, } }; diff --git a/drivers/media/rc/keymaps/rc-rc6-mce.c b/drivers/media/rc/keymaps/rc-rc6-mce.c index 5be567506bcd..0d87b20a0c43 100644 --- a/drivers/media/rc/keymaps/rc-rc6-mce.c +++ b/drivers/media/rc/keymaps/rc-rc6-mce.c @@ -96,10 +96,10 @@ static struct rc_map_table rc6_mce[] = { static struct rc_map_list rc6_mce_map = { .map = { - .scan = rc6_mce, - .size = ARRAY_SIZE(rc6_mce), - .rc_type = RC_TYPE_RC6_MCE, - .name = RC_MAP_RC6_MCE, + .scan = rc6_mce, + .size = ARRAY_SIZE(rc6_mce), + .rc_proto = RC_PROTO_RC6_MCE, + .name = RC_MAP_RC6_MCE, } }; diff --git a/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c b/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c index 9f778bd091db..957fa21747ea 100644 --- 
a/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c +++ b/drivers/media/rc/keymaps/rc-real-audio-220-32-keys.c @@ -55,10 +55,10 @@ static struct rc_map_table real_audio_220_32_keys[] = { static struct rc_map_list real_audio_220_32_keys_map = { .map = { - .scan = real_audio_220_32_keys, - .size = ARRAY_SIZE(real_audio_220_32_keys), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_REAL_AUDIO_220_32_KEYS, + .scan = real_audio_220_32_keys, + .size = ARRAY_SIZE(real_audio_220_32_keys), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_REAL_AUDIO_220_32_KEYS, } }; diff --git a/drivers/media/rc/keymaps/rc-reddo.c b/drivers/media/rc/keymaps/rc-reddo.c index b80b336e9284..3b37acc7b144 100644 --- a/drivers/media/rc/keymaps/rc-reddo.c +++ b/drivers/media/rc/keymaps/rc-reddo.c @@ -62,10 +62,10 @@ static struct rc_map_table reddo[] = { static struct rc_map_list reddo_map = { .map = { - .scan = reddo, - .size = ARRAY_SIZE(reddo), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_REDDO, + .scan = reddo, + .size = ARRAY_SIZE(reddo), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_REDDO, } }; diff --git a/drivers/media/rc/keymaps/rc-snapstream-firefly.c b/drivers/media/rc/keymaps/rc-snapstream-firefly.c index c7f33ec719b4..30630a6f76ac 100644 --- a/drivers/media/rc/keymaps/rc-snapstream-firefly.c +++ b/drivers/media/rc/keymaps/rc-snapstream-firefly.c @@ -83,10 +83,10 @@ static struct rc_map_table snapstream_firefly[] = { static struct rc_map_list snapstream_firefly_map = { .map = { - .scan = snapstream_firefly, - .size = ARRAY_SIZE(snapstream_firefly), - .rc_type = RC_TYPE_OTHER, - .name = RC_MAP_SNAPSTREAM_FIREFLY, + .scan = snapstream_firefly, + .size = ARRAY_SIZE(snapstream_firefly), + .rc_proto = RC_PROTO_OTHER, + .name = RC_MAP_SNAPSTREAM_FIREFLY, } }; diff --git a/drivers/media/rc/keymaps/rc-streamzap.c b/drivers/media/rc/keymaps/rc-streamzap.c index 23c061174ed7..b53bca9e4576 100644 --- a/drivers/media/rc/keymaps/rc-streamzap.c +++ b/drivers/media/rc/keymaps/rc-streamzap.c @@ -57,10 +57,10 @@ static struct rc_map_table streamzap[] = { static struct rc_map_list streamzap_map = { .map = { - .scan = streamzap, - .size = ARRAY_SIZE(streamzap), - .rc_type = RC_TYPE_RC5_SZ, - .name = RC_MAP_STREAMZAP, + .scan = streamzap, + .size = ARRAY_SIZE(streamzap), + .rc_proto = RC_PROTO_RC5_SZ, + .name = RC_MAP_STREAMZAP, } }; diff --git a/drivers/media/rc/keymaps/rc-su3000.c b/drivers/media/rc/keymaps/rc-su3000.c index 8dbd3e9bc951..d9af7e3c55d9 100644 --- a/drivers/media/rc/keymaps/rc-su3000.c +++ b/drivers/media/rc/keymaps/rc-su3000.c @@ -51,10 +51,10 @@ static struct rc_map_table su3000[] = { static struct rc_map_list su3000_map = { .map = { - .scan = su3000, - .size = ARRAY_SIZE(su3000), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_SU3000, + .scan = su3000, + .size = ARRAY_SIZE(su3000), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_SU3000, } }; diff --git a/drivers/media/rc/keymaps/rc-tbs-nec.c b/drivers/media/rc/keymaps/rc-tbs-nec.c index 24ce2a252502..05facc043272 100644 --- a/drivers/media/rc/keymaps/rc-tbs-nec.c +++ b/drivers/media/rc/keymaps/rc-tbs-nec.c @@ -52,10 +52,10 @@ static struct rc_map_table tbs_nec[] = { static struct rc_map_list tbs_nec_map = { .map = { - .scan = tbs_nec, - .size = ARRAY_SIZE(tbs_nec), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TBS_NEC, + .scan = tbs_nec, + .size = ARRAY_SIZE(tbs_nec), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TBS_NEC, } }; diff --git a/drivers/media/rc/keymaps/rc-technisat-ts35.c 
b/drivers/media/rc/keymaps/rc-technisat-ts35.c index 3328cbefabad..dff7021734ba 100644 --- a/drivers/media/rc/keymaps/rc-technisat-ts35.c +++ b/drivers/media/rc/keymaps/rc-technisat-ts35.c @@ -53,10 +53,10 @@ static struct rc_map_table technisat_ts35[] = { static struct rc_map_list technisat_ts35_map = { .map = { - .scan = technisat_ts35, - .size = ARRAY_SIZE(technisat_ts35), - .rc_type = RC_TYPE_UNKNOWN, - .name = RC_MAP_TECHNISAT_TS35, + .scan = technisat_ts35, + .size = ARRAY_SIZE(technisat_ts35), + .rc_proto = RC_PROTO_UNKNOWN, + .name = RC_MAP_TECHNISAT_TS35, } }; diff --git a/drivers/media/rc/keymaps/rc-technisat-usb2.c b/drivers/media/rc/keymaps/rc-technisat-usb2.c index 02c9c243c060..58b3baf5ee96 100644 --- a/drivers/media/rc/keymaps/rc-technisat-usb2.c +++ b/drivers/media/rc/keymaps/rc-technisat-usb2.c @@ -66,10 +66,10 @@ static struct rc_map_table technisat_usb2[] = { static struct rc_map_list technisat_usb2_map = { .map = { - .scan = technisat_usb2, - .size = ARRAY_SIZE(technisat_usb2), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_TECHNISAT_USB2, + .scan = technisat_usb2, + .size = ARRAY_SIZE(technisat_usb2), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_TECHNISAT_USB2, } }; diff --git a/drivers/media/rc/keymaps/rc-terratec-cinergy-c-pci.c b/drivers/media/rc/keymaps/rc-terratec-cinergy-c-pci.c index 7958f458527a..7ae88ccf1def 100644 --- a/drivers/media/rc/keymaps/rc-terratec-cinergy-c-pci.c +++ b/drivers/media/rc/keymaps/rc-terratec-cinergy-c-pci.c @@ -65,10 +65,10 @@ static struct rc_map_table terratec_cinergy_c_pci[] = { static struct rc_map_list terratec_cinergy_c_pci_map = { .map = { - .scan = terratec_cinergy_c_pci, - .size = ARRAY_SIZE(terratec_cinergy_c_pci), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TERRATEC_CINERGY_C_PCI, + .scan = terratec_cinergy_c_pci, + .size = ARRAY_SIZE(terratec_cinergy_c_pci), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TERRATEC_CINERGY_C_PCI, } }; diff --git a/drivers/media/rc/keymaps/rc-terratec-cinergy-s2-hd.c b/drivers/media/rc/keymaps/rc-terratec-cinergy-s2-hd.c index 1e096bbda4a0..bf0171b05ac2 100644 --- a/drivers/media/rc/keymaps/rc-terratec-cinergy-s2-hd.c +++ b/drivers/media/rc/keymaps/rc-terratec-cinergy-s2-hd.c @@ -63,10 +63,10 @@ static struct rc_map_table terratec_cinergy_s2_hd[] = { static struct rc_map_list terratec_cinergy_s2_hd_map = { .map = { - .scan = terratec_cinergy_s2_hd, - .size = ARRAY_SIZE(terratec_cinergy_s2_hd), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TERRATEC_CINERGY_S2_HD, + .scan = terratec_cinergy_s2_hd, + .size = ARRAY_SIZE(terratec_cinergy_s2_hd), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TERRATEC_CINERGY_S2_HD, } }; diff --git a/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c b/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c index 97eb83ab5a35..3d0f6f7e5bea 100644 --- a/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c +++ b/drivers/media/rc/keymaps/rc-terratec-cinergy-xs.c @@ -69,10 +69,10 @@ static struct rc_map_table terratec_cinergy_xs[] = { static struct rc_map_list terratec_cinergy_xs_map = { .map = { - .scan = terratec_cinergy_xs, - .size = ARRAY_SIZE(terratec_cinergy_xs), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TERRATEC_CINERGY_XS, + .scan = terratec_cinergy_xs, + .size = ARRAY_SIZE(terratec_cinergy_xs), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TERRATEC_CINERGY_XS, } }; diff --git a/drivers/media/rc/keymaps/rc-terratec-slim-2.c 
b/drivers/media/rc/keymaps/rc-terratec-slim-2.c index 4c149ef712dc..df57e0a45820 100644 --- a/drivers/media/rc/keymaps/rc-terratec-slim-2.c +++ b/drivers/media/rc/keymaps/rc-terratec-slim-2.c @@ -49,10 +49,10 @@ static struct rc_map_table terratec_slim_2[] = { static struct rc_map_list terratec_slim_2_map = { .map = { - .scan = terratec_slim_2, - .size = ARRAY_SIZE(terratec_slim_2), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_TERRATEC_SLIM_2, + .scan = terratec_slim_2, + .size = ARRAY_SIZE(terratec_slim_2), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_TERRATEC_SLIM_2, } }; diff --git a/drivers/media/rc/keymaps/rc-terratec-slim.c b/drivers/media/rc/keymaps/rc-terratec-slim.c index 3d8a19cdb5a2..628272c58d65 100644 --- a/drivers/media/rc/keymaps/rc-terratec-slim.c +++ b/drivers/media/rc/keymaps/rc-terratec-slim.c @@ -56,10 +56,10 @@ static struct rc_map_table terratec_slim[] = { static struct rc_map_list terratec_slim_map = { .map = { - .scan = terratec_slim, - .size = ARRAY_SIZE(terratec_slim), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_TERRATEC_SLIM, + .scan = terratec_slim, + .size = ARRAY_SIZE(terratec_slim), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_TERRATEC_SLIM, } }; diff --git a/drivers/media/rc/keymaps/rc-tevii-nec.c b/drivers/media/rc/keymaps/rc-tevii-nec.c index 38e0c0875596..31f8a0fd1f2c 100644 --- a/drivers/media/rc/keymaps/rc-tevii-nec.c +++ b/drivers/media/rc/keymaps/rc-tevii-nec.c @@ -65,10 +65,10 @@ static struct rc_map_table tevii_nec[] = { static struct rc_map_list tevii_nec_map = { .map = { - .scan = tevii_nec, - .size = ARRAY_SIZE(tevii_nec), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TEVII_NEC, + .scan = tevii_nec, + .size = ARRAY_SIZE(tevii_nec), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TEVII_NEC, } }; diff --git a/drivers/media/rc/keymaps/rc-tivo.c b/drivers/media/rc/keymaps/rc-tivo.c index 5cc1b456e329..1962e33c8f4e 100644 --- a/drivers/media/rc/keymaps/rc-tivo.c +++ b/drivers/media/rc/keymaps/rc-tivo.c @@ -75,10 +75,10 @@ static struct rc_map_table tivo[] = { static struct rc_map_list tivo_map = { .map = { - .scan = tivo, - .size = ARRAY_SIZE(tivo), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_TIVO, + .scan = tivo, + .size = ARRAY_SIZE(tivo), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_TIVO, } }; diff --git a/drivers/media/rc/keymaps/rc-total-media-in-hand-02.c b/drivers/media/rc/keymaps/rc-total-media-in-hand-02.c index 47270f72ebf0..eeeca142f7b1 100644 --- a/drivers/media/rc/keymaps/rc-total-media-in-hand-02.c +++ b/drivers/media/rc/keymaps/rc-total-media-in-hand-02.c @@ -62,10 +62,10 @@ static struct rc_map_table total_media_in_hand_02[] = { static struct rc_map_list total_media_in_hand_02_map = { .map = { - .scan = total_media_in_hand_02, - .size = ARRAY_SIZE(total_media_in_hand_02), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_TOTAL_MEDIA_IN_HAND_02, + .scan = total_media_in_hand_02, + .size = ARRAY_SIZE(total_media_in_hand_02), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_TOTAL_MEDIA_IN_HAND_02, } }; diff --git a/drivers/media/rc/keymaps/rc-total-media-in-hand.c b/drivers/media/rc/keymaps/rc-total-media-in-hand.c index 5b9f9ec13680..bc73bee309d8 100644 --- a/drivers/media/rc/keymaps/rc-total-media-in-hand.c +++ b/drivers/media/rc/keymaps/rc-total-media-in-hand.c @@ -62,10 +62,10 @@ static struct rc_map_table total_media_in_hand[] = { static struct rc_map_list total_media_in_hand_map = { .map = { - .scan = total_media_in_hand, - .size = ARRAY_SIZE(total_media_in_hand), - .rc_type = RC_TYPE_NEC, - .name = 
RC_MAP_TOTAL_MEDIA_IN_HAND, + .scan = total_media_in_hand, + .size = ARRAY_SIZE(total_media_in_hand), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_TOTAL_MEDIA_IN_HAND, } }; diff --git a/drivers/media/rc/keymaps/rc-trekstor.c b/drivers/media/rc/keymaps/rc-trekstor.c index f9a2e0fabb9f..63f966219342 100644 --- a/drivers/media/rc/keymaps/rc-trekstor.c +++ b/drivers/media/rc/keymaps/rc-trekstor.c @@ -57,10 +57,10 @@ static struct rc_map_table trekstor[] = { static struct rc_map_list trekstor_map = { .map = { - .scan = trekstor, - .size = ARRAY_SIZE(trekstor), - .rc_type = RC_TYPE_NEC, - .name = RC_MAP_TREKSTOR, + .scan = trekstor, + .size = ARRAY_SIZE(trekstor), + .rc_proto = RC_PROTO_NEC, + .name = RC_MAP_TREKSTOR, } }; diff --git a/drivers/media/rc/keymaps/rc-tt-1500.c b/drivers/media/rc/keymaps/rc-tt-1500.c index c766d3b2b6b0..374c230705d2 100644 --- a/drivers/media/rc/keymaps/rc-tt-1500.c +++ b/drivers/media/rc/keymaps/rc-tt-1500.c @@ -59,10 +59,10 @@ static struct rc_map_table tt_1500[] = { static struct rc_map_list tt_1500_map = { .map = { - .scan = tt_1500, - .size = ARRAY_SIZE(tt_1500), - .rc_type = RC_TYPE_RC5, - .name = RC_MAP_TT_1500, + .scan = tt_1500, + .size = ARRAY_SIZE(tt_1500), + .rc_proto = RC_PROTO_RC5, + .name = RC_MAP_TT_1500, } }; diff --git a/drivers/media/rc/keymaps/rc-twinhan-dtv-cab-ci.c b/drivers/media/rc/keymaps/rc-twinhan-dtv-cab-ci.c index 202500cb3061..240d720d440c 100644 --- a/drivers/media/rc/keymaps/rc-twinhan-dtv-cab-ci.c +++ b/drivers/media/rc/keymaps/rc-twinhan-dtv-cab-ci.c @@ -75,10 +75,10 @@ static struct rc_map_table twinhan_dtv_cab_ci[] = { static struct rc_map_list twinhan_dtv_cab_ci_map = { .map = { - .scan = twinhan_dtv_cab_ci, - .size = ARRAY_SIZE(twinhan_dtv_cab_ci), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TWINHAN_DTV_CAB_CI, + .scan = twinhan_dtv_cab_ci, + .size = ARRAY_SIZE(twinhan_dtv_cab_ci), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TWINHAN_DTV_CAB_CI, } }; diff --git a/drivers/media/rc/keymaps/rc-twinhan1027.c b/drivers/media/rc/keymaps/rc-twinhan1027.c index 509299b90c90..2275b37c61d2 100644 --- a/drivers/media/rc/keymaps/rc-twinhan1027.c +++ b/drivers/media/rc/keymaps/rc-twinhan1027.c @@ -64,10 +64,10 @@ static struct rc_map_table twinhan_vp1027[] = { static struct rc_map_list twinhan_vp1027_map = { .map = { - .scan = twinhan_vp1027, - .size = ARRAY_SIZE(twinhan_vp1027), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_TWINHAN_VP1027_DVBS, + .scan = twinhan_vp1027, + .size = ARRAY_SIZE(twinhan_vp1027), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_TWINHAN_VP1027_DVBS, } }; diff --git a/drivers/media/rc/keymaps/rc-videomate-m1f.c b/drivers/media/rc/keymaps/rc-videomate-m1f.c index 23ee05e53949..fe02e047bd01 100644 --- a/drivers/media/rc/keymaps/rc-videomate-m1f.c +++ b/drivers/media/rc/keymaps/rc-videomate-m1f.c @@ -69,10 +69,10 @@ static struct rc_map_table videomate_k100[] = { static struct rc_map_list videomate_k100_map = { .map = { - .scan = videomate_k100, - .size = ARRAY_SIZE(videomate_k100), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_VIDEOMATE_K100, + .scan = videomate_k100, + .size = ARRAY_SIZE(videomate_k100), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_VIDEOMATE_K100, } }; diff --git a/drivers/media/rc/keymaps/rc-videomate-s350.c b/drivers/media/rc/keymaps/rc-videomate-s350.c index 8a354775a2d8..b4f103269872 100644 --- a/drivers/media/rc/keymaps/rc-videomate-s350.c +++ 
b/drivers/media/rc/keymaps/rc-videomate-s350.c @@ -62,10 +62,10 @@ static struct rc_map_table videomate_s350[] = { static struct rc_map_list videomate_s350_map = { .map = { - .scan = videomate_s350, - .size = ARRAY_SIZE(videomate_s350), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_VIDEOMATE_S350, + .scan = videomate_s350, + .size = ARRAY_SIZE(videomate_s350), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_VIDEOMATE_S350, } }; diff --git a/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c b/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c index eb0cda7766c4..c431fdf44057 100644 --- a/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c +++ b/drivers/media/rc/keymaps/rc-videomate-tv-pvr.c @@ -64,10 +64,10 @@ static struct rc_map_table videomate_tv_pvr[] = { static struct rc_map_list videomate_tv_pvr_map = { .map = { - .scan = videomate_tv_pvr, - .size = ARRAY_SIZE(videomate_tv_pvr), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_VIDEOMATE_TV_PVR, + .scan = videomate_tv_pvr, + .size = ARRAY_SIZE(videomate_tv_pvr), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_VIDEOMATE_TV_PVR, } }; diff --git a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c index c1dd598e828e..5a437e61bd5d 100644 --- a/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c +++ b/drivers/media/rc/keymaps/rc-winfast-usbii-deluxe.c @@ -59,10 +59,10 @@ static struct rc_map_table winfast_usbii_deluxe[] = { static struct rc_map_list winfast_usbii_deluxe_map = { .map = { - .scan = winfast_usbii_deluxe, - .size = ARRAY_SIZE(winfast_usbii_deluxe), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_WINFAST_USBII_DELUXE, + .scan = winfast_usbii_deluxe, + .size = ARRAY_SIZE(winfast_usbii_deluxe), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_WINFAST_USBII_DELUXE, } }; diff --git a/drivers/media/rc/keymaps/rc-winfast.c b/drivers/media/rc/keymaps/rc-winfast.c index 8a779da1e973..53685d1f9a47 100644 --- a/drivers/media/rc/keymaps/rc-winfast.c +++ b/drivers/media/rc/keymaps/rc-winfast.c @@ -79,10 +79,10 @@ static struct rc_map_table winfast[] = { static struct rc_map_list winfast_map = { .map = { - .scan = winfast, - .size = ARRAY_SIZE(winfast), - .rc_type = RC_TYPE_UNKNOWN, /* Legacy IR type */ - .name = RC_MAP_WINFAST, + .scan = winfast, + .size = ARRAY_SIZE(winfast), + .rc_proto = RC_PROTO_UNKNOWN, /* Legacy IR type */ + .name = RC_MAP_WINFAST, } }; diff --git a/drivers/media/rc/keymaps/rc-zx-irdec.c b/drivers/media/rc/keymaps/rc-zx-irdec.c index cc889df47eb8..5bf3ab002afc 100644 --- a/drivers/media/rc/keymaps/rc-zx-irdec.c +++ b/drivers/media/rc/keymaps/rc-zx-irdec.c @@ -57,7 +57,7 @@ static struct rc_map_list zx_irdec_map = { .map = { .scan = zx_irdec_table, .size = ARRAY_SIZE(zx_irdec_table), - .rc_type = RC_TYPE_NEC, + .rc_proto = RC_PROTO_NEC, .name = RC_MAP_ZX_IRDEC, } }; diff --git a/drivers/media/rc/mceusb.c b/drivers/media/rc/mceusb.c index 60258889b162..bf7aaff3aa37 100644 --- a/drivers/media/rc/mceusb.c +++ b/drivers/media/rc/mceusb.c @@ -1267,7 +1267,7 @@ static struct rc_dev *mceusb_init_rc_dev(struct mceusb_dev *ir) usb_to_input_id(ir->usbdev, &rc->input_id); rc->dev.parent = dev; rc->priv = ir; - rc->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->timeout = MS_TO_NS(100); if (!ir->flags.no_tx) { rc->s_tx_mask = mceusb_set_tx_mask; diff --git a/drivers/media/rc/meson-ir.c b/drivers/media/rc/meson-ir.c 
b/drivers/media/rc/meson-ir.c
index dfe3da487be0..f2204eb77e2a 100644
--- a/drivers/media/rc/meson-ir.c
+++ b/drivers/media/rc/meson-ir.c
@@ -143,7 +143,7 @@ static int meson_ir_probe(struct platform_device *pdev)
 	ir->rc->input_id.bustype = BUS_HOST;
 	map_name = of_get_property(node, "linux,rc-map-name", NULL);
 	ir->rc->map_name = map_name ? map_name : RC_MAP_EMPTY;
-	ir->rc->allowed_protocols = RC_BIT_ALL_IR_DECODER;
+	ir->rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER;
 	ir->rc->rx_resolution = US_TO_NS(MESON_TRATE);
 	ir->rc->timeout = MS_TO_NS(200);
 	ir->rc->driver_name = DRIVER_NAME;
diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c
index da4461fabce6..e88eb64e8e69 100644
--- a/drivers/media/rc/mtk-cir.c
+++ b/drivers/media/rc/mtk-cir.c
@@ -353,7 +353,7 @@ static int mtk_ir_probe(struct platform_device *pdev)
 	ir->rc->map_name = map_name ?: RC_MAP_EMPTY;
 	ir->rc->dev.parent = dev;
 	ir->rc->driver_name = MTK_IR_DEV;
-	ir->rc->allowed_protocols = RC_BIT_ALL;
+	ir->rc->allowed_protocols = RC_PROTO_BIT_ALL;
 	ir->rc->rx_resolution = MTK_IR_SAMPLE;
 	ir->rc->timeout = MTK_MAX_SAMPLES * (MTK_IR_SAMPLE + 1);
diff --git a/drivers/media/rc/nuvoton-cir.c b/drivers/media/rc/nuvoton-cir.c
index 5249cb58ea73..5e1d866a61a5 100644
--- a/drivers/media/rc/nuvoton-cir.c
+++ b/drivers/media/rc/nuvoton-cir.c
@@ -1023,8 +1023,8 @@ static int nvt_probe(struct pnp_dev *pdev, const struct pnp_device_id *dev_id)
 	/* Set up the rc device */
 	rdev->priv = nvt;
-	rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER;
-	rdev->allowed_wakeup_protocols = RC_BIT_ALL_IR_ENCODER;
+	rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER;
+	rdev->allowed_wakeup_protocols = RC_PROTO_BIT_ALL_IR_ENCODER;
 	rdev->encode_wakeup = true;
 	rdev->open = nvt_open;
 	rdev->close = nvt_close;
diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h
index 5e5b10fbc47e..7da9c96cb058 100644
--- a/drivers/media/rc/rc-core-priv.h
+++ b/drivers/media/rc/rc-core-priv.h
@@ -27,7 +27,7 @@ struct ir_raw_handler {
 	u64 protocols; /* which are handled by this handler */
 	int (*decode)(struct rc_dev *dev, struct ir_raw_event event);
-	int (*encode)(enum rc_type protocol, u32 scancode,
+	int (*encode)(enum rc_proto protocol, u32 scancode,
 		      struct ir_raw_event *events, unsigned int max);
 	/* These two should only be used by the lirc decoder */
diff --git a/drivers/media/rc/rc-ir-raw.c b/drivers/media/rc/rc-ir-raw.c
index 1761be8c7028..f495709e28fb 100644
--- a/drivers/media/rc/rc-ir-raw.c
+++ b/drivers/media/rc/rc-ir-raw.c
@@ -213,7 +213,7 @@ ir_raw_get_allowed_protocols(void)
 	return atomic64_read(&available_protocols);
 }
-static int change_protocol(struct rc_dev *dev, u64 *rc_type)
+static int change_protocol(struct rc_dev *dev, u64 *rc_proto)
 {
 	/* the caller will update dev->enabled_protocols */
 	return 0;
@@ -450,7 +450,7 @@ EXPORT_SYMBOL(ir_raw_gen_pl);
 *	-EINVAL if the scancode is ambiguous or invalid, or if no
 *	compatible encoder was found.
 */
-int ir_raw_encode_scancode(enum rc_type protocol, u32 scancode,
+int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode,
 			   struct ir_raw_event *events, unsigned int max)
 {
 	struct ir_raw_handler *handler;
diff --git a/drivers/media/rc/rc-loopback.c b/drivers/media/rc/rc-loopback.c
index 46cc9c29d68a..3822d9ebcb46 100644
--- a/drivers/media/rc/rc-loopback.c
+++ b/drivers/media/rc/rc-loopback.c
@@ -226,8 +226,8 @@ static int __init loop_init(void)
 	rc->driver_name = DRIVER_NAME;
 	rc->map_name = RC_MAP_EMPTY;
 	rc->priv = &loopdev;
-	rc->allowed_protocols = RC_BIT_ALL_IR_DECODER;
-	rc->allowed_wakeup_protocols = RC_BIT_ALL_IR_ENCODER;
+	rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER;
+	rc->allowed_wakeup_protocols = RC_PROTO_BIT_ALL_IR_ENCODER;
 	rc->encode_wakeup = true;
 	rc->timeout = 100 * 1000 * 1000; /* 100 ms */
 	rc->min_timeout = 1;
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index f2d3cc450d08..981cccd6b988 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -35,48 +35,48 @@ static const struct {
 	unsigned int repeat_period;
 	unsigned int scancode_bits;
 } protocols[] = {
-	[RC_TYPE_UNKNOWN] = { .name = "unknown", .repeat_period = 250 },
-	[RC_TYPE_OTHER] = { .name = "other", .repeat_period = 250 },
-	[RC_TYPE_RC5] = { .name = "rc-5",
+	[RC_PROTO_UNKNOWN] = { .name = "unknown", .repeat_period = 250 },
+	[RC_PROTO_OTHER] = { .name = "other", .repeat_period = 250 },
+	[RC_PROTO_RC5] = { .name = "rc-5",
 		.scancode_bits = 0x1f7f, .repeat_period = 164 },
-	[RC_TYPE_RC5X_20] = { .name = "rc-5x-20",
+	[RC_PROTO_RC5X_20] = { .name = "rc-5x-20",
 		.scancode_bits = 0x1f7f3f, .repeat_period = 164 },
-	[RC_TYPE_RC5_SZ] = { .name = "rc-5-sz",
+	[RC_PROTO_RC5_SZ] = { .name = "rc-5-sz",
 		.scancode_bits = 0x2fff, .repeat_period = 164 },
-	[RC_TYPE_JVC] = { .name = "jvc",
+	[RC_PROTO_JVC] = { .name = "jvc",
 		.scancode_bits = 0xffff, .repeat_period = 250 },
-	[RC_TYPE_SONY12] = { .name = "sony-12",
+	[RC_PROTO_SONY12] = { .name = "sony-12",
 		.scancode_bits = 0x1f007f, .repeat_period = 100 },
-	[RC_TYPE_SONY15] = { .name = "sony-15",
+	[RC_PROTO_SONY15] = { .name = "sony-15",
 		.scancode_bits = 0xff007f, .repeat_period = 100 },
-	[RC_TYPE_SONY20] = { .name = "sony-20",
+	[RC_PROTO_SONY20] = { .name = "sony-20",
 		.scancode_bits = 0x1fff7f, .repeat_period = 100 },
-	[RC_TYPE_NEC] = { .name = "nec",
+	[RC_PROTO_NEC] = { .name = "nec",
 		.scancode_bits = 0xffff, .repeat_period = 160 },
-	[RC_TYPE_NECX] = { .name = "nec-x",
+	[RC_PROTO_NECX] = { .name = "nec-x",
 		.scancode_bits = 0xffffff, .repeat_period = 160 },
-	[RC_TYPE_NEC32] = { .name = "nec-32",
+	[RC_PROTO_NEC32] = { .name = "nec-32",
 		.scancode_bits = 0xffffffff, .repeat_period = 160 },
-	[RC_TYPE_SANYO] = { .name = "sanyo",
+	[RC_PROTO_SANYO] = { .name = "sanyo",
 		.scancode_bits = 0x1fffff, .repeat_period = 250 },
-	[RC_TYPE_MCIR2_KBD] = { .name = "mcir2-kbd",
+	[RC_PROTO_MCIR2_KBD] = { .name = "mcir2-kbd",
 		.scancode_bits = 0xffff, .repeat_period = 150 },
-	[RC_TYPE_MCIR2_MSE] = { .name = "mcir2-mse",
+	[RC_PROTO_MCIR2_MSE] = { .name = "mcir2-mse",
 		.scancode_bits = 0x1fffff, .repeat_period = 150 },
-	[RC_TYPE_RC6_0] = { .name = "rc-6-0",
+	[RC_PROTO_RC6_0] = { .name = "rc-6-0",
 		.scancode_bits = 0xffff, .repeat_period = 164 },
-	[RC_TYPE_RC6_6A_20] = { .name = "rc-6-6a-20",
+	[RC_PROTO_RC6_6A_20] = { .name = "rc-6-6a-20",
 		.scancode_bits = 0xfffff, .repeat_period = 164 },
-	[RC_TYPE_RC6_6A_24] = { .name = "rc-6-6a-24",
+	[RC_PROTO_RC6_6A_24] = { .name = "rc-6-6a-24",
 		.scancode_bits = 0xffffff, .repeat_period = 164 },
-	[RC_TYPE_RC6_6A_32] = { .name = "rc-6-6a-32",
+	[RC_PROTO_RC6_6A_32] = { .name = "rc-6-6a-32",
 		.scancode_bits = 0xffffffff, .repeat_period = 164 },
-	[RC_TYPE_RC6_MCE] = { .name = "rc-6-mce",
+	[RC_PROTO_RC6_MCE] = { .name = "rc-6-mce",
 		.scancode_bits = 0xffff7fff, .repeat_period = 164 },
-	[RC_TYPE_SHARP] = { .name = "sharp",
+	[RC_PROTO_SHARP] = { .name = "sharp",
 		.scancode_bits = 0x1fff, .repeat_period = 250 },
-	[RC_TYPE_XMP] = { .name = "xmp", .repeat_period = 250 },
-	[RC_TYPE_CEC] = { .name = "cec", .repeat_period = 550 },
+	[RC_PROTO_XMP] = { .name = "xmp", .repeat_period = 250 },
+	[RC_PROTO_CEC] = { .name = "cec", .repeat_period = 550 },
 };
 /* Used to keep track of known keymaps */
@@ -156,10 +156,10 @@ static struct rc_map_table empty[] = {
 static struct rc_map_list empty_map = {
 	.map = {
-		.scan = empty,
-		.size = ARRAY_SIZE(empty),
-		.rc_type = RC_TYPE_UNKNOWN,	/* Legacy IR type */
-		.name = RC_MAP_EMPTY,
+		.scan = empty,
+		.size = ARRAY_SIZE(empty),
+		.rc_proto = RC_PROTO_UNKNOWN,	/* Legacy IR type */
+		.name = RC_MAP_EMPTY,
 	}
 };
@@ -167,7 +167,7 @@ static struct rc_map_list empty_map = {
  * ir_create_table() - initializes a scancode table
  * @rc_map:	the rc_map to initialize
  * @name:	name to assign to the table
- * @rc_type:	ir type to assign to the new table
+ * @rc_proto:	ir type to assign to the new table
  * @size:	initial size of the table
  * @return:	zero on success or a negative error code
 *
@@ -175,12 +175,12 @@ static struct rc_map_list empty_map = {
  * memory to hold at least the specified number of elements.
  */
 static int ir_create_table(struct rc_map *rc_map,
-			   const char *name, u64 rc_type, size_t size)
+			   const char *name, u64 rc_proto, size_t size)
 {
 	rc_map->name = kstrdup(name, GFP_KERNEL);
 	if (!rc_map->name)
 		return -ENOMEM;
-	rc_map->rc_type = rc_type;
+	rc_map->rc_proto = rc_proto;
 	rc_map->alloc = roundup_pow_of_two(size * sizeof(struct rc_map_table));
 	rc_map->size = rc_map->alloc / sizeof(struct rc_map_table);
 	rc_map->scan = kmalloc(rc_map->alloc, GFP_KERNEL);
@@ -435,7 +435,7 @@ static int ir_setkeytable(struct rc_dev *dev,
 	int rc;
 	rc = ir_create_table(rc_map, from->name,
-			     from->rc_type, from->size);
+			     from->rc_proto, from->size);
 	if (rc)
 		return rc;
@@ -688,7 +688,7 @@ EXPORT_SYMBOL_GPL(rc_repeat);
  * This function is used internally to register a keypress, it must be
  * called with keylock held.
  */
-static void ir_do_keydown(struct rc_dev *dev, enum rc_type protocol,
+static void ir_do_keydown(struct rc_dev *dev, enum rc_proto protocol,
 			  u32 scancode, u32 keycode, u8 toggle)
 {
 	bool new_event = (!dev->keypressed ||
@@ -730,7 +730,8 @@ static void ir_do_keydown(struct rc_dev *dev, enum rc_type protocol,
  * This routine is used to signal that a key has been pressed on the
  * remote control.
  */
-void rc_keydown(struct rc_dev *dev, enum rc_type protocol, u32 scancode, u8 toggle)
+void rc_keydown(struct rc_dev *dev, enum rc_proto protocol, u32 scancode,
+		u8 toggle)
 {
 	unsigned long flags;
 	u32 keycode = rc_g_keycode_from_table(dev, scancode);
@@ -759,7 +760,7 @@ EXPORT_SYMBOL_GPL(rc_keydown);
  * This routine is used to signal that a key has been pressed on the
  * remote control. The driver must manually call rc_keyup() at a later stage.
 */
-void rc_keydown_notimeout(struct rc_dev *dev, enum rc_type protocol,
+void rc_keydown_notimeout(struct rc_dev *dev, enum rc_proto protocol,
 			  u32 scancode, u8 toggle)
 {
 	unsigned long flags;
@@ -782,7 +783,7 @@ static int rc_validate_filter(struct rc_dev *dev,
 			      struct rc_scancode_filter *filter)
 {
 	u32 mask, s = filter->data;
-	enum rc_type protocol = dev->wakeup_protocol;
+	enum rc_proto protocol = dev->wakeup_protocol;
 	if (protocol >= ARRAY_SIZE(protocols))
 		return -EINVAL;
@@ -790,19 +791,19 @@ static int rc_validate_filter(struct rc_dev *dev,
 	mask = protocols[protocol].scancode_bits;
 	switch (protocol) {
-	case RC_TYPE_NECX:
+	case RC_PROTO_NECX:
 		if ((((s >> 16) ^ ~(s >> 8)) & 0xff) == 0)
 			return -EINVAL;
 		break;
-	case RC_TYPE_NEC32:
+	case RC_PROTO_NEC32:
 		if ((((s >> 24) ^ ~(s >> 16)) & 0xff) == 0)
 			return -EINVAL;
 		break;
-	case RC_TYPE_RC6_MCE:
+	case RC_PROTO_RC6_MCE:
 		if ((s & 0xffff0000) != 0x800f0000)
 			return -EINVAL;
 		break;
-	case RC_TYPE_RC6_6A_32:
+	case RC_PROTO_RC6_6A_32:
 		if ((s & 0xffff0000) == 0x800f0000)
 			return -EINVAL;
 		break;
@@ -890,30 +891,30 @@ static const struct {
 	const char *name;
 	const char *module_name;
 } proto_names[] = {
-	{ RC_BIT_NONE, "none", NULL },
-	{ RC_BIT_OTHER, "other", NULL },
-	{ RC_BIT_UNKNOWN, "unknown", NULL },
-	{ RC_BIT_RC5 |
-	  RC_BIT_RC5X_20, "rc-5", "ir-rc5-decoder" },
-	{ RC_BIT_NEC |
-	  RC_BIT_NECX |
-	  RC_BIT_NEC32, "nec", "ir-nec-decoder" },
-	{ RC_BIT_RC6_0 |
-	  RC_BIT_RC6_6A_20 |
-	  RC_BIT_RC6_6A_24 |
-	  RC_BIT_RC6_6A_32 |
-	  RC_BIT_RC6_MCE, "rc-6", "ir-rc6-decoder" },
-	{ RC_BIT_JVC, "jvc", "ir-jvc-decoder" },
-	{ RC_BIT_SONY12 |
-	  RC_BIT_SONY15 |
-	  RC_BIT_SONY20, "sony", "ir-sony-decoder" },
-	{ RC_BIT_RC5_SZ, "rc-5-sz", "ir-rc5-decoder" },
-	{ RC_BIT_SANYO, "sanyo", "ir-sanyo-decoder" },
-	{ RC_BIT_SHARP, "sharp", "ir-sharp-decoder" },
-	{ RC_BIT_MCIR2_KBD |
-	  RC_BIT_MCIR2_MSE, "mce_kbd", "ir-mce_kbd-decoder" },
-	{ RC_BIT_XMP, "xmp", "ir-xmp-decoder" },
-	{ RC_BIT_CEC, "cec", NULL },
+	{ RC_PROTO_BIT_NONE, "none", NULL },
+	{ RC_PROTO_BIT_OTHER, "other", NULL },
+	{ RC_PROTO_BIT_UNKNOWN, "unknown", NULL },
+	{ RC_PROTO_BIT_RC5 |
+	  RC_PROTO_BIT_RC5X_20, "rc-5", "ir-rc5-decoder" },
+	{ RC_PROTO_BIT_NEC |
+	  RC_PROTO_BIT_NECX |
+	  RC_PROTO_BIT_NEC32, "nec", "ir-nec-decoder" },
+	{ RC_PROTO_BIT_RC6_0 |
+	  RC_PROTO_BIT_RC6_6A_20 |
+	  RC_PROTO_BIT_RC6_6A_24 |
+	  RC_PROTO_BIT_RC6_6A_32 |
+	  RC_PROTO_BIT_RC6_MCE, "rc-6", "ir-rc6-decoder" },
+	{ RC_PROTO_BIT_JVC, "jvc", "ir-jvc-decoder" },
+	{ RC_PROTO_BIT_SONY12 |
+	  RC_PROTO_BIT_SONY15 |
+	  RC_PROTO_BIT_SONY20, "sony", "ir-sony-decoder" },
+	{ RC_PROTO_BIT_RC5_SZ, "rc-5-sz", "ir-rc5-decoder" },
+	{ RC_PROTO_BIT_SANYO, "sanyo", "ir-sanyo-decoder" },
+	{ RC_PROTO_BIT_SHARP, "sharp", "ir-sharp-decoder" },
+	{ RC_PROTO_BIT_MCIR2_KBD |
+	  RC_PROTO_BIT_MCIR2_MSE, "mce_kbd", "ir-mce_kbd-decoder" },
+	{ RC_PROTO_BIT_XMP, "xmp", "ir-xmp-decoder" },
+	{ RC_PROTO_BIT_CEC, "cec", NULL },
 };
 /**
@@ -1083,8 +1084,9 @@ static void ir_raw_load_modules(u64 *protocols)
 	int i, ret;
 	for (i = 0; i < ARRAY_SIZE(proto_names); i++) {
-		if (proto_names[i].type == RC_BIT_NONE ||
-		    proto_names[i].type & (RC_BIT_OTHER | RC_BIT_UNKNOWN))
+		if (proto_names[i].type == RC_PROTO_BIT_NONE ||
+		    proto_names[i].type & (RC_PROTO_BIT_OTHER |
+					   RC_PROTO_BIT_UNKNOWN))
 			continue;
 		available = ir_raw_get_allowed_protocols();
@@ -1302,7 +1304,7 @@ static ssize_t store_filter(struct device *device,
 	 * Refuse to set a filter unless a protocol is enabled
 	 * and the filter is valid for that protocol
 	 */
-	if (dev->wakeup_protocol != RC_TYPE_UNKNOWN)
+	if
(dev->wakeup_protocol != RC_PROTO_UNKNOWN) ret = rc_validate_filter(dev, &new_filter); else ret = -EINVAL; @@ -1349,7 +1351,7 @@ static ssize_t show_wakeup_protocols(struct device *device, { struct rc_dev *dev = to_rc_dev(device); u64 allowed; - enum rc_type enabled; + enum rc_proto enabled; char *tmp = buf; int i; @@ -1398,7 +1400,7 @@ static ssize_t store_wakeup_protocols(struct device *device, const char *buf, size_t len) { struct rc_dev *dev = to_rc_dev(device); - enum rc_type protocol; + enum rc_proto protocol; ssize_t rc; u64 allowed; int i; @@ -1408,7 +1410,7 @@ static ssize_t store_wakeup_protocols(struct device *device, allowed = dev->allowed_wakeup_protocols; if (sysfs_streq(buf, "none")) { - protocol = RC_TYPE_UNKNOWN; + protocol = RC_PROTO_UNKNOWN; } else { for (i = 0; i < ARRAY_SIZE(protocols); i++) { if ((allowed & (1ULL << i)) && @@ -1438,7 +1440,7 @@ static ssize_t store_wakeup_protocols(struct device *device, dev->wakeup_protocol = protocol; IR_dprintk(1, "Wakeup protocol changed to %d\n", protocol); - if (protocol == RC_TYPE_RC6_MCE) + if (protocol == RC_PROTO_RC6_MCE) dev->scancode_wakeup_filter.data = 0x800f0000; else dev->scancode_wakeup_filter.data = 0; @@ -1619,7 +1621,7 @@ static int rc_prepare_rx_device(struct rc_dev *dev) { int rc; struct rc_map *rc_map; - u64 rc_type; + u64 rc_proto; if (!dev->map_name) return -EINVAL; @@ -1634,17 +1636,17 @@ static int rc_prepare_rx_device(struct rc_dev *dev) if (rc) return rc; - rc_type = BIT_ULL(rc_map->rc_type); + rc_proto = BIT_ULL(rc_map->rc_proto); if (dev->change_protocol) { - rc = dev->change_protocol(dev, &rc_type); + rc = dev->change_protocol(dev, &rc_proto); if (rc < 0) goto out_table; - dev->enabled_protocols = rc_type; + dev->enabled_protocols = rc_proto; } if (dev->driver_type == RC_DRIVER_IR_RAW) - ir_raw_load_modules(&rc_type); + ir_raw_load_modules(&rc_proto); set_bit(EV_KEY, dev->input_dev->evbit); set_bit(EV_REP, dev->input_dev->evbit); diff --git a/drivers/media/rc/redrat3.c b/drivers/media/rc/redrat3.c index 29fa37b99553..6784cb9fc4e7 100644 --- a/drivers/media/rc/redrat3.c +++ b/drivers/media/rc/redrat3.c @@ -956,7 +956,7 @@ static struct rc_dev *redrat3_init_rc_dev(struct redrat3_dev *rr3) usb_to_input_id(rr3->udev, &rc->input_id); rc->dev.parent = dev; rc->priv = rr3; - rc->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->min_timeout = MS_TO_NS(RR3_RX_MIN_TIMEOUT); rc->max_timeout = MS_TO_NS(RR3_RX_MAX_TIMEOUT); rc->timeout = US_TO_NS(redrat3_get_timeout(rr3)); diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c index 9a5e9fa01196..4b8d5f38baf6 100644 --- a/drivers/media/rc/serial_ir.c +++ b/drivers/media/rc/serial_ir.c @@ -537,7 +537,7 @@ static int serial_ir_probe(struct platform_device *dev) rcdev->open = serial_ir_open; rcdev->close = serial_ir_close; rcdev->dev.parent = &serial_ir.pdev->dev; - rcdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rcdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rcdev->driver_name = KBUILD_MODNAME; rcdev->map_name = RC_MAP_RC6_MCE; rcdev->min_timeout = 1; diff --git a/drivers/media/rc/sir_ir.c b/drivers/media/rc/sir_ir.c index 83b4410664af..bc906fb128d5 100644 --- a/drivers/media/rc/sir_ir.c +++ b/drivers/media/rc/sir_ir.c @@ -315,7 +315,7 @@ static int sir_ir_probe(struct platform_device *dev) rcdev->input_id.product = 0x0001; rcdev->input_id.version = 0x0100; rcdev->tx_ir = sir_tx_ir; - rcdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rcdev->allowed_protocols = 
RC_PROTO_BIT_ALL_IR_DECODER; rcdev->driver_name = KBUILD_MODNAME; rcdev->map_name = RC_MAP_RC6_MCE; rcdev->timeout = IR_DEFAULT_TIMEOUT; diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c index dc15dc8a7d21..a8e39c635f34 100644 --- a/drivers/media/rc/st_rc.c +++ b/drivers/media/rc/st_rc.c @@ -290,7 +290,7 @@ static int st_rc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rc_dev); st_rc_hardware_init(rc_dev); - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; /* rx sampling rate is 10Mhz */ rdev->rx_resolution = 100; rdev->timeout = US_TO_NS(MAX_SYMB_TIME); diff --git a/drivers/media/rc/streamzap.c b/drivers/media/rc/streamzap.c index 829f2b348a46..f03a174ddf9d 100644 --- a/drivers/media/rc/streamzap.c +++ b/drivers/media/rc/streamzap.c @@ -304,7 +304,7 @@ static struct rc_dev *streamzap_init_rc_dev(struct streamzap_ir *sz) usb_to_input_id(sz->usbdev, &rdev->input_id); rdev->dev.parent = dev; rdev->priv = sz; - rdev->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rdev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rdev->driver_name = DRIVER_NAME; rdev->map_name = RC_MAP_STREAMZAP; diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index f619b76273be..97f367b446c4 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -224,7 +224,7 @@ static int sunxi_ir_probe(struct platform_device *pdev) ir->map_name = of_get_property(dn, "linux,rc-map-name", NULL); ir->rc->map_name = ir->map_name ?: RC_MAP_EMPTY; ir->rc->dev.parent = dev; - ir->rc->allowed_protocols = RC_BIT_ALL_IR_DECODER; + ir->rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; ir->rc->rx_resolution = SUNXI_IR_SAMPLE; ir->rc->timeout = MS_TO_NS(SUNXI_IR_TIMEOUT); ir->rc->driver_name = SUNXI_IR_DEV; diff --git a/drivers/media/rc/ttusbir.c b/drivers/media/rc/ttusbir.c index 5002a91e830e..aafea3c5170b 100644 --- a/drivers/media/rc/ttusbir.c +++ b/drivers/media/rc/ttusbir.c @@ -313,7 +313,7 @@ static int ttusbir_probe(struct usb_interface *intf, rc->input_phys = tt->phys; usb_to_input_id(tt->udev, &rc->input_id); rc->dev.parent = &intf->dev; - rc->allowed_protocols = RC_BIT_ALL_IR_DECODER; + rc->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; rc->priv = tt; rc->driver_name = DRIVER_NAME; rc->map_name = RC_MAP_TT_1500; diff --git a/drivers/media/rc/winbond-cir.c b/drivers/media/rc/winbond-cir.c index a18eb232ed81..3ca7ab48293d 100644 --- a/drivers/media/rc/winbond-cir.c +++ b/drivers/media/rc/winbond-cir.c @@ -697,7 +697,7 @@ wbcir_shutdown(struct pnp_dev *device) } switch (rc->wakeup_protocol) { - case RC_TYPE_RC5: + case RC_PROTO_RC5: /* Mask = 13 bits, ex toggle */ mask[0] = (mask_sc & 0x003f); mask[0] |= (mask_sc & 0x0300) >> 2; @@ -714,7 +714,7 @@ wbcir_shutdown(struct pnp_dev *device) proto = IR_PROTOCOL_RC5; break; - case RC_TYPE_NEC: + case RC_PROTO_NEC: mask[1] = bitrev8(mask_sc); mask[0] = mask[1]; mask[3] = bitrev8(mask_sc >> 8); @@ -728,7 +728,7 @@ wbcir_shutdown(struct pnp_dev *device) proto = IR_PROTOCOL_NEC; break; - case RC_TYPE_NECX: + case RC_PROTO_NECX: mask[1] = bitrev8(mask_sc); mask[0] = mask[1]; mask[2] = bitrev8(mask_sc >> 8); @@ -742,7 +742,7 @@ wbcir_shutdown(struct pnp_dev *device) proto = IR_PROTOCOL_NEC; break; - case RC_TYPE_NEC32: + case RC_PROTO_NEC32: mask[0] = bitrev8(mask_sc); mask[1] = bitrev8(mask_sc >> 8); mask[2] = bitrev8(mask_sc >> 16); @@ -756,7 +756,7 @@ wbcir_shutdown(struct pnp_dev *device) proto = IR_PROTOCOL_NEC; break; - case RC_TYPE_RC6_0: + case 
RC_PROTO_RC6_0: /* Command */ match[0] = wbcir_to_rc6cells(wake_sc >> 0); mask[0] = wbcir_to_rc6cells(mask_sc >> 0); @@ -779,9 +779,9 @@ wbcir_shutdown(struct pnp_dev *device) proto = IR_PROTOCOL_RC6; break; - case RC_TYPE_RC6_6A_24: - case RC_TYPE_RC6_6A_32: - case RC_TYPE_RC6_MCE: + case RC_PROTO_RC6_6A_24: + case RC_PROTO_RC6_6A_32: + case RC_PROTO_RC6_MCE: i = 0; /* Command */ @@ -800,13 +800,13 @@ wbcir_shutdown(struct pnp_dev *device) match[i] = wbcir_to_rc6cells(wake_sc >> 16); mask[i++] = wbcir_to_rc6cells(mask_sc >> 16); - if (rc->wakeup_protocol == RC_TYPE_RC6_6A_20) { + if (rc->wakeup_protocol == RC_PROTO_RC6_6A_20) { rc6_csl = 52; } else { match[i] = wbcir_to_rc6cells(wake_sc >> 20); mask[i++] = wbcir_to_rc6cells(mask_sc >> 20); - if (rc->wakeup_protocol == RC_TYPE_RC6_6A_24) { + if (rc->wakeup_protocol == RC_PROTO_RC6_6A_24) { rc6_csl = 60; } else { /* Customer range bit and bits 15 - 8 */ @@ -1086,12 +1086,13 @@ wbcir_probe(struct pnp_dev *device, const struct pnp_device_id *dev_id) data->dev->timeout = IR_DEFAULT_TIMEOUT; data->dev->max_timeout = 10 * IR_DEFAULT_TIMEOUT; data->dev->rx_resolution = US_TO_NS(2); - data->dev->allowed_protocols = RC_BIT_ALL_IR_DECODER; - data->dev->allowed_wakeup_protocols = RC_BIT_NEC | RC_BIT_NECX | - RC_BIT_NEC32 | RC_BIT_RC5 | RC_BIT_RC6_0 | - RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | - RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE; - data->dev->wakeup_protocol = RC_TYPE_RC6_MCE; + data->dev->allowed_protocols = RC_PROTO_BIT_ALL_IR_DECODER; + data->dev->allowed_wakeup_protocols = RC_PROTO_BIT_NEC | + RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32 | RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | + RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | + RC_PROTO_BIT_RC6_MCE; + data->dev->wakeup_protocol = RC_PROTO_RC6_MCE; data->dev->scancode_wakeup_filter.data = 0x800f040c; data->dev->scancode_wakeup_filter.mask = 0xffff7fff; data->dev->s_wakeup_filter = wbcir_set_wakeup_filter; diff --git a/drivers/media/rc/zx-irdec.c b/drivers/media/rc/zx-irdec.c index 9452bac9262c..12d322ec8a29 100644 --- a/drivers/media/rc/zx-irdec.c +++ b/drivers/media/rc/zx-irdec.c @@ -54,7 +54,7 @@ static irqreturn_t zx_irdec_irq(int irq, void *dev_id) u8 address, not_address; u8 command, not_command; u32 rawcode, scancode; - enum rc_type rc_type; + enum rc_proto rc_proto; /* Clear interrupt */ writel(1, irdec->base + ZX_IR_INTSTCLR); @@ -73,8 +73,8 @@ static irqreturn_t zx_irdec_irq(int irq, void *dev_id) scancode = ir_nec_bytes_to_scancode(address, not_address, command, not_command, - &rc_type); - rc_keydown(irdec->rcd, rc_type, scancode, 0); + &rc_proto); + rc_keydown(irdec->rcd, rc_proto, scancode, 0); done: return IRQ_HANDLED; @@ -114,7 +114,8 @@ static int zx_irdec_probe(struct platform_device *pdev) rcd->input_phys = DRIVER_NAME "/input0"; rcd->input_id.bustype = BUS_HOST; rcd->map_name = RC_MAP_ZX_IRDEC; - rcd->allowed_protocols = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32; + rcd->allowed_protocols = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32; rcd->driver_name = DRIVER_NAME; rcd->device_name = DRIVER_NAME; diff --git a/drivers/media/usb/au0828/au0828-input.c b/drivers/media/usb/au0828/au0828-input.c index 9ae42ebefa8a..7996eb83a54e 100644 --- a/drivers/media/usb/au0828/au0828-input.c +++ b/drivers/media/usb/au0828/au0828-input.c @@ -343,8 +343,8 @@ int au0828_rc_register(struct au0828_dev *dev) rc->input_id.product = le16_to_cpu(dev->usbdev->descriptor.idProduct); rc->dev.parent = &dev->usbdev->dev; rc->driver_name = "au0828-input"; - 
rc->allowed_protocols = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | - RC_BIT_RC5; + rc->allowed_protocols = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32 | RC_PROTO_BIT_RC5; /* all done */ err = rc_register_device(rc); diff --git a/drivers/media/usb/cx231xx/cx231xx-input.c b/drivers/media/usb/cx231xx/cx231xx-input.c index eecf074b0a48..02ebeb16055f 100644 --- a/drivers/media/usb/cx231xx/cx231xx-input.c +++ b/drivers/media/usb/cx231xx/cx231xx-input.c @@ -24,7 +24,7 @@ #define MODULE_NAME "cx231xx-input" -static int get_key_isdbt(struct IR_i2c *ir, enum rc_type *protocol, +static int get_key_isdbt(struct IR_i2c *ir, enum rc_proto *protocol, u32 *pscancode, u8 *toggle) { int rc; @@ -50,7 +50,7 @@ static int get_key_isdbt(struct IR_i2c *ir, enum rc_type *protocol, dev_dbg(&ir->rc->dev, "cmd %02x, scan = %02x\n", cmd, scancode); - *protocol = RC_TYPE_OTHER; + *protocol = RC_PROTO_OTHER; *pscancode = scancode; *toggle = 0; return 1; @@ -91,7 +91,7 @@ int cx231xx_ir_init(struct cx231xx *dev) /* The i2c micro-controller only outputs the cmd part of NEC protocol */ dev->init_data.rc_dev->scancode_mask = 0xff; dev->init_data.rc_dev->driver_name = "cx231xx"; - dev->init_data.type = RC_BIT_NEC; + dev->init_data.type = RC_PROTO_BIT_NEC; info.addr = 0x30; /* Load and bind ir-kbd-i2c */ diff --git a/drivers/media/usb/dvb-usb-v2/af9015.c b/drivers/media/usb/dvb-usb-v2/af9015.c index 23bbbf367b51..8013659c41b1 100644 --- a/drivers/media/usb/dvb-usb-v2/af9015.c +++ b/drivers/media/usb/dvb-usb-v2/af9015.c @@ -1237,7 +1237,7 @@ static int af9015_rc_query(struct dvb_usb_device *d) /* Only process key if canary killed */ if (buf[16] != 0xff && buf[0] != 0x01) { - enum rc_type proto; + enum rc_proto proto; dev_dbg(&d->udev->dev, "%s: key pressed %*ph\n", __func__, 4, buf + 12); @@ -1253,13 +1253,13 @@ static int af9015_rc_query(struct dvb_usb_device *d) /* NEC */ state->rc_keycode = RC_SCANCODE_NEC(buf[12], buf[14]); - proto = RC_TYPE_NEC; + proto = RC_PROTO_NEC; } else { /* NEC extended*/ state->rc_keycode = RC_SCANCODE_NECX(buf[12] << 8 | buf[13], buf[14]); - proto = RC_TYPE_NECX; + proto = RC_PROTO_NECX; } } else { /* 32 bit NEC */ @@ -1267,7 +1267,7 @@ static int af9015_rc_query(struct dvb_usb_device *d) buf[13] << 16 | buf[14] << 8 | buf[15]); - proto = RC_TYPE_NEC32; + proto = RC_PROTO_NEC32; } rc_keydown(d->rc_dev, proto, state->rc_keycode, 0); } else { @@ -1336,7 +1336,8 @@ static int af9015_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) if (!rc->map_name) rc->map_name = RC_MAP_EMPTY; - rc->allowed_protos = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32; + rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32; rc->query = af9015_rc_query; rc->interval = 500; diff --git a/drivers/media/usb/dvb-usb-v2/af9035.c b/drivers/media/usb/dvb-usb-v2/af9035.c index ccf4a5c68877..666d319d3d1a 100644 --- a/drivers/media/usb/dvb-usb-v2/af9035.c +++ b/drivers/media/usb/dvb-usb-v2/af9035.c @@ -1828,7 +1828,7 @@ static int af9035_rc_query(struct dvb_usb_device *d) { struct usb_interface *intf = d->intf; int ret; - enum rc_type proto; + enum rc_proto proto; u32 key; u8 buf[4]; struct usb_req req = { CMD_IR_GET, 0, 0, NULL, 4, buf }; @@ -1843,17 +1843,17 @@ static int af9035_rc_query(struct dvb_usb_device *d) if ((buf[0] + buf[1]) == 0xff) { /* NEC standard 16bit */ key = RC_SCANCODE_NEC(buf[0], buf[2]); - proto = RC_TYPE_NEC; + proto = RC_PROTO_NEC; } else { /* NEC extended 24bit */ key = RC_SCANCODE_NECX(buf[0] << 8 | buf[1], buf[2]); - proto = RC_TYPE_NECX; + proto = 
RC_PROTO_NECX; } } else { /* NEC full code 32bit */ key = RC_SCANCODE_NEC32(buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]); - proto = RC_TYPE_NEC32; + proto = RC_PROTO_NEC32; } dev_dbg(&intf->dev, "%*ph\n", 4, buf); @@ -1881,11 +1881,11 @@ static int af9035_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) switch (state->ir_type) { case 0: /* NEC */ default: - rc->allowed_protos = RC_BIT_NEC | RC_BIT_NECX | - RC_BIT_NEC32; + rc->allowed_protos = RC_PROTO_BIT_NEC | + RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32; break; case 1: /* RC6 */ - rc->allowed_protos = RC_BIT_RC6_MCE; + rc->allowed_protos = RC_PROTO_BIT_RC6_MCE; break; } diff --git a/drivers/media/usb/dvb-usb-v2/anysee.c b/drivers/media/usb/dvb-usb-v2/anysee.c index 6795c0c609b1..20ee7eea2a91 100644 --- a/drivers/media/usb/dvb-usb-v2/anysee.c +++ b/drivers/media/usb/dvb-usb-v2/anysee.c @@ -1142,7 +1142,7 @@ static int anysee_rc_query(struct dvb_usb_device *d) if (ircode[0]) { dev_dbg(&d->udev->dev, "%s: key pressed %02x\n", __func__, ircode[1]); - rc_keydown(d->rc_dev, RC_TYPE_NEC, + rc_keydown(d->rc_dev, RC_PROTO_NEC, RC_SCANCODE_NEC(0x08, ircode[1]), 0); } @@ -1151,7 +1151,7 @@ static int anysee_rc_query(struct dvb_usb_device *d) static int anysee_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { - rc->allowed_protos = RC_BIT_NEC; + rc->allowed_protos = RC_PROTO_BIT_NEC; rc->query = anysee_rc_query; rc->interval = 250; /* windows driver uses 500ms */ diff --git a/drivers/media/usb/dvb-usb-v2/az6007.c b/drivers/media/usb/dvb-usb-v2/az6007.c index 72f26300c236..1414d59e85ba 100644 --- a/drivers/media/usb/dvb-usb-v2/az6007.c +++ b/drivers/media/usb/dvb-usb-v2/az6007.c @@ -208,7 +208,7 @@ static int az6007_rc_query(struct dvb_usb_device *d) { struct az6007_device_state *st = d_to_priv(d); unsigned code; - enum rc_type proto; + enum rc_proto proto; az6007_read(d, AZ6007_READ_IR, 0, 0, st->data, 10); @@ -218,18 +218,18 @@ static int az6007_rc_query(struct dvb_usb_device *d) if ((st->data[3] ^ st->data[4]) == 0xff) { if ((st->data[1] ^ st->data[2]) == 0xff) { code = RC_SCANCODE_NEC(st->data[1], st->data[3]); - proto = RC_TYPE_NEC; + proto = RC_PROTO_NEC; } else { code = RC_SCANCODE_NECX(st->data[1] << 8 | st->data[2], st->data[3]); - proto = RC_TYPE_NECX; + proto = RC_PROTO_NECX; } } else { code = RC_SCANCODE_NEC32(st->data[1] << 24 | st->data[2] << 16 | st->data[3] << 8 | st->data[4]); - proto = RC_TYPE_NEC32; + proto = RC_PROTO_NEC32; } rc_keydown(d->rc_dev, proto, code, st->data[5]); @@ -241,7 +241,8 @@ static int az6007_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { pr_debug("Getting az6007 Remote Control properties\n"); - rc->allowed_protos = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32; + rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32; rc->query = az6007_rc_query; rc->interval = 400; diff --git a/drivers/media/usb/dvb-usb-v2/dvb_usb.h b/drivers/media/usb/dvb-usb-v2/dvb_usb.h index 35f27e2e4e28..0005bdb2207d 100644 --- a/drivers/media/usb/dvb-usb-v2/dvb_usb.h +++ b/drivers/media/usb/dvb-usb-v2/dvb_usb.h @@ -138,7 +138,7 @@ struct dvb_usb_driver_info { struct dvb_usb_rc { const char *map_name; u64 allowed_protos; - int (*change_protocol)(struct rc_dev *dev, u64 *rc_type); + int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); int (*query) (struct dvb_usb_device *d); unsigned int interval; enum rc_driver_type driver_type; diff --git a/drivers/media/usb/dvb-usb-v2/dvbsky.c b/drivers/media/usb/dvb-usb-v2/dvbsky.c index 5730760e4e93..131b6c08e199 100644 --- 
a/drivers/media/usb/dvb-usb-v2/dvbsky.c +++ b/drivers/media/usb/dvb-usb-v2/dvbsky.c @@ -211,7 +211,7 @@ static int dvbsky_rc_query(struct dvb_usb_device *d) rc5_system = (code & 0x7C0) >> 6; toggle = (code & 0x800) ? 1 : 0; scancode = rc5_system << 8 | rc5_command; - rc_keydown(d->rc_dev, RC_TYPE_RC5, scancode, toggle); + rc_keydown(d->rc_dev, RC_PROTO_RC5, scancode, toggle); } return 0; } @@ -223,7 +223,7 @@ static int dvbsky_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) return 0; } - rc->allowed_protos = RC_BIT_RC5; + rc->allowed_protos = RC_PROTO_BIT_RC5; rc->query = dvbsky_rc_query; rc->interval = 300; return 0; diff --git a/drivers/media/usb/dvb-usb-v2/lmedm04.c b/drivers/media/usb/dvb-usb-v2/lmedm04.c index a91fdad8f8d4..5e320fa4a795 100644 --- a/drivers/media/usb/dvb-usb-v2/lmedm04.c +++ b/drivers/media/usb/dvb-usb-v2/lmedm04.c @@ -347,8 +347,8 @@ static void lme2510_int_response(struct urb *lme_urb) ibuf[5]); deb_info(1, "INT Key = 0x%08x", key); - rc_keydown(adap_to_d(adap)->rc_dev, RC_TYPE_NEC32, key, - 0); + rc_keydown(adap_to_d(adap)->rc_dev, RC_PROTO_NEC32, key, + 0); break; case 0xbb: switch (st->tuner_config) { @@ -1232,7 +1232,7 @@ static int lme2510_get_stream_config(struct dvb_frontend *fe, u8 *ts_type, static int lme2510_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { - rc->allowed_protos = RC_BIT_NEC32; + rc->allowed_protos = RC_PROTO_BIT_NEC32; return 0; } diff --git a/drivers/media/usb/dvb-usb-v2/rtl28xxu.c b/drivers/media/usb/dvb-usb-v2/rtl28xxu.c index e16ca07acf1d..95a7b9123f8e 100644 --- a/drivers/media/usb/dvb-usb-v2/rtl28xxu.c +++ b/drivers/media/usb/dvb-usb-v2/rtl28xxu.c @@ -1631,24 +1631,24 @@ static int rtl2831u_rc_query(struct dvb_usb_device *d) goto err; if (buf[4] & 0x01) { - enum rc_type proto; + enum rc_proto proto; if (buf[2] == (u8) ~buf[3]) { if (buf[0] == (u8) ~buf[1]) { /* NEC standard (16 bit) */ rc_code = RC_SCANCODE_NEC(buf[0], buf[2]); - proto = RC_TYPE_NEC; + proto = RC_PROTO_NEC; } else { /* NEC extended (24 bit) */ rc_code = RC_SCANCODE_NECX(buf[0] << 8 | buf[1], buf[2]); - proto = RC_TYPE_NECX; + proto = RC_PROTO_NECX; } } else { /* NEC full (32 bit) */ rc_code = RC_SCANCODE_NEC32(buf[0] << 24 | buf[1] << 16 | buf[2] << 8 | buf[3]); - proto = RC_TYPE_NEC32; + proto = RC_PROTO_NEC32; } rc_keydown(d->rc_dev, proto, rc_code, 0); @@ -1673,7 +1673,8 @@ static int rtl2831u_get_rc_config(struct dvb_usb_device *d, struct dvb_usb_rc *rc) { rc->map_name = RC_MAP_EMPTY; - rc->allowed_protos = RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32; + rc->allowed_protos = RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32; rc->query = rtl2831u_rc_query; rc->interval = 400; @@ -1778,7 +1779,7 @@ static int rtl2832u_get_rc_config(struct dvb_usb_device *d, /* load empty to enable rc */ if (!rc->map_name) rc->map_name = RC_MAP_EMPTY; - rc->allowed_protos = RC_BIT_ALL_IR_DECODER; + rc->allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER; rc->driver_type = RC_DRIVER_IR_RAW; rc->query = rtl2832u_rc_query; rc->interval = 200; diff --git a/drivers/media/usb/dvb-usb/cxusb.c b/drivers/media/usb/dvb-usb/cxusb.c index 99a3f3625944..37dea0adc695 100644 --- a/drivers/media/usb/dvb-usb/cxusb.c +++ b/drivers/media/usb/dvb-usb/cxusb.c @@ -458,7 +458,7 @@ static int cxusb_rc_query(struct dvb_usb_device *d) cxusb_ctrl_msg(d, CMD_GET_IR_CODE, NULL, 0, ircode, 4); if (ircode[2] || ircode[3]) - rc_keydown(d->rc_dev, RC_TYPE_NEC, + rc_keydown(d->rc_dev, RC_PROTO_NEC, RC_SCANCODE_NEC(~ircode[2] & 0xff, ircode[3]), 0); return 0; } @@ -473,7 +473,7 @@ static 
int cxusb_bluebird2_rc_query(struct dvb_usb_device *d) return 0; if (ircode[1] || ircode[2]) - rc_keydown(d->rc_dev, RC_TYPE_NEC, + rc_keydown(d->rc_dev, RC_PROTO_NEC, RC_SCANCODE_NEC(~ircode[1] & 0xff, ircode[2]), 0); return 0; } @@ -486,7 +486,7 @@ static int cxusb_d680_dmb_rc_query(struct dvb_usb_device *d) return 0; if (ircode[0] || ircode[1]) - rc_keydown(d->rc_dev, RC_TYPE_UNKNOWN, + rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, RC_SCANCODE_RC5(ircode[0], ircode[1]), 0); return 0; } @@ -1646,7 +1646,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_lgh064f_properties = { .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -1703,7 +1703,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_dee1601_properties = { .rc_codes = RC_MAP_DVICO_MCE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -1768,7 +1768,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_lgz201_properties = { .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -1824,7 +1824,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_dtt7579_properties = { .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -1879,7 +1879,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_dualdig4_properties = { .rc_codes = RC_MAP_DVICO_MCE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_bluebird2_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, @@ -1933,7 +1933,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_nano2_properties = { .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_bluebird2_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, @@ -1989,7 +1989,7 @@ static struct dvb_usb_device_properties cxusb_bluebird_nano2_needsfirmware_prope .rc_codes = RC_MAP_DVICO_PORTABLE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, @@ -2088,7 +2088,7 @@ struct dvb_usb_device_properties cxusb_bluebird_dualdig4_rev2_properties = { .rc_codes = RC_MAP_DVICO_MCE, .module_name = KBUILD_MODNAME, .rc_query = cxusb_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .num_device_descs = 1, @@ -2142,7 +2142,7 @@ static struct dvb_usb_device_properties cxusb_d680_dmb_properties = { .rc_codes = RC_MAP_D680_DMB, .module_name = KBUILD_MODNAME, .rc_query = cxusb_d680_dmb_rc_query, - .allowed_protos = RC_BIT_UNKNOWN, + .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .num_device_descs = 1, @@ -2197,7 +2197,7 @@ static struct dvb_usb_device_properties cxusb_mygica_d689_properties = { .rc_codes = RC_MAP_D680_DMB, .module_name = KBUILD_MODNAME, .rc_query = cxusb_d680_dmb_rc_query, - .allowed_protos = RC_BIT_UNKNOWN, + .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .num_device_descs = 1, @@ -2251,7 +2251,7 @@ static struct dvb_usb_device_properties cxusb_mygica_t230_properties = { 
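The three cxusb query handlers above all follow the legacy dvb-usb polling pattern: read a few bytes from the device, then report one decoded scancode together with its protocol. A minimal sketch of that pattern under the new names — read_ir_bytes() is a hypothetical stand-in for the driver's control-message helper, while rc_keydown() and RC_SCANCODE_NEC() are the real rc-core calls used throughout this patch:

static int example_rc_query(struct dvb_usb_device *d)
{
        u8 ircode[4];

        /* hypothetical device read; real drivers use cxusb_ctrl_msg() etc. */
        if (read_ir_bytes(d, ircode, sizeof(ircode)) < 0)
                return 0;       /* treat a failed poll as "no key" */

        if (ircode[2] || ircode[3])     /* assumed NEC address/command bytes */
                rc_keydown(d->rc_dev, RC_PROTO_NEC,
                           RC_SCANCODE_NEC(~ircode[2] & 0xff, ircode[3]), 0);
        return 0;
}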
.rc_codes = RC_MAP_TOTAL_MEDIA_IN_HAND_02, .module_name = KBUILD_MODNAME, .rc_query = cxusb_d680_dmb_rc_query, - .allowed_protos = RC_BIT_UNKNOWN, + .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .num_device_descs = 1, @@ -2305,7 +2305,7 @@ static struct dvb_usb_device_properties cxusb_mygica_t230c_properties = { .rc_codes = RC_MAP_TOTAL_MEDIA_IN_HAND_02, .module_name = KBUILD_MODNAME, .rc_query = cxusb_d680_dmb_rc_query, - .allowed_protos = RC_BIT_UNKNOWN, + .allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .num_device_descs = 1, diff --git a/drivers/media/usb/dvb-usb/dib0700.h b/drivers/media/usb/dvb-usb/dib0700.h index 8fd8f5b489d2..f89ab3b5a6c4 100644 --- a/drivers/media/usb/dvb-usb/dib0700.h +++ b/drivers/media/usb/dvb-usb/dib0700.h @@ -64,7 +64,7 @@ extern int dib0700_streaming_ctrl(struct dvb_usb_adapter *adap, int onoff); extern struct i2c_algorithm dib0700_i2c_algo; extern int dib0700_identify_state(struct usb_device *udev, struct dvb_usb_device_properties *props, struct dvb_usb_device_description **desc, int *cold); -extern int dib0700_change_protocol(struct rc_dev *dev, u64 *rc_type); +extern int dib0700_change_protocol(struct rc_dev *dev, u64 *rc_proto); extern int dib0700_set_i2c_speed(struct dvb_usb_device *d, u16 scl_kHz); extern int dib0700_device_count; diff --git a/drivers/media/usb/dvb-usb/dib0700_core.c b/drivers/media/usb/dvb-usb/dib0700_core.c index bea1b4764a66..1ee7ec558293 100644 --- a/drivers/media/usb/dvb-usb/dib0700_core.c +++ b/drivers/media/usb/dvb-usb/dib0700_core.c @@ -638,7 +638,7 @@ int dib0700_streaming_ctrl(struct dvb_usb_adapter *adap, int onoff) return ret; } -int dib0700_change_protocol(struct rc_dev *rc, u64 *rc_type) +int dib0700_change_protocol(struct rc_dev *rc, u64 *rc_proto) { struct dvb_usb_device *d = rc->priv; struct dib0700_state *st = d->priv; @@ -654,19 +654,19 @@ int dib0700_change_protocol(struct rc_dev *rc, u64 *rc_type) st->buf[2] = 0; /* Set the IR mode */ - if (*rc_type & RC_BIT_RC5) { + if (*rc_proto & RC_PROTO_BIT_RC5) { new_proto = 1; - *rc_type = RC_BIT_RC5; - } else if (*rc_type & RC_BIT_NEC) { + *rc_proto = RC_PROTO_BIT_RC5; + } else if (*rc_proto & RC_PROTO_BIT_NEC) { new_proto = 0; - *rc_type = RC_BIT_NEC; - } else if (*rc_type & RC_BIT_RC6_MCE) { + *rc_proto = RC_PROTO_BIT_NEC; + } else if (*rc_proto & RC_PROTO_BIT_RC6_MCE) { if (st->fw_version < 0x10200) { ret = -EINVAL; goto out; } new_proto = 2; - *rc_type = RC_BIT_RC6_MCE; + *rc_proto = RC_PROTO_BIT_RC6_MCE; } else { ret = -EINVAL; goto out; @@ -680,7 +680,7 @@ int dib0700_change_protocol(struct rc_dev *rc, u64 *rc_type) goto out; } - d->props.rc.core.protocol = *rc_type; + d->props.rc.core.protocol = *rc_proto; out: mutex_unlock(&d->usb_mutex); @@ -712,7 +712,7 @@ static void dib0700_rc_urb_completion(struct urb *purb) { struct dvb_usb_device *d = purb->context; struct dib0700_rc_response *poll_reply; - enum rc_type protocol; + enum rc_proto protocol; u32 keycode; u8 toggle; @@ -745,7 +745,7 @@ static void dib0700_rc_urb_completion(struct urb *purb) purb->actual_length); switch (d->props.rc.core.protocol) { - case RC_BIT_NEC: + case RC_PROTO_BIT_NEC: toggle = 0; /* NEC protocol sends repeat code as 0 0 0 FF */ @@ -764,25 +764,25 @@ static void dib0700_rc_urb_completion(struct urb *purb) poll_reply->nec.not_system << 16 | poll_reply->nec.data << 8 | poll_reply->nec.not_data); - protocol = RC_TYPE_NEC32; + protocol = RC_PROTO_NEC32; } else if ((poll_reply->nec.system ^ poll_reply->nec.not_system) != 0xff) { deb_data("NEC extended protocol\n"); keycode = 
RC_SCANCODE_NECX(poll_reply->nec.system << 8 | poll_reply->nec.not_system, poll_reply->nec.data); - protocol = RC_TYPE_NECX; + protocol = RC_PROTO_NECX; } else { deb_data("NEC normal protocol\n"); keycode = RC_SCANCODE_NEC(poll_reply->nec.system, poll_reply->nec.data); - protocol = RC_TYPE_NEC; + protocol = RC_PROTO_NEC; } break; default: deb_data("RC5 protocol\n"); - protocol = RC_TYPE_RC5; + protocol = RC_PROTO_RC5; toggle = poll_reply->report_id; keycode = RC_SCANCODE_RC5(poll_reply->rc5.system, poll_reply->rc5.data); diff --git a/drivers/media/usb/dvb-usb/dib0700_devices.c b/drivers/media/usb/dvb-usb/dib0700_devices.c index 6a57fc6d3472..6020170fe99a 100644 --- a/drivers/media/usb/dvb-usb/dib0700_devices.c +++ b/drivers/media/usb/dvb-usb/dib0700_devices.c @@ -514,7 +514,7 @@ static int stk7700ph_tuner_attach(struct dvb_usb_adapter *adap) */ static int dib0700_rc_query_old_firmware(struct dvb_usb_device *d) { - enum rc_type protocol; + enum rc_proto protocol; u32 scancode; u8 toggle; int i; @@ -547,7 +547,7 @@ static int dib0700_rc_query_old_firmware(struct dvb_usb_device *d) dib0700_rc_setup(d, NULL); /* reset ir sensor data to prevent false events */ switch (d->props.rc.core.protocol) { - case RC_BIT_NEC: + case RC_PROTO_BIT_NEC: /* NEC protocol sends repeat code as 0 0 0 FF */ if ((st->buf[3 - 2] == 0x00) && (st->buf[3 - 3] == 0x00) && (st->buf[3] == 0xff)) { @@ -555,14 +555,14 @@ static int dib0700_rc_query_old_firmware(struct dvb_usb_device *d) return 0; } - protocol = RC_TYPE_NEC; + protocol = RC_PROTO_NEC; scancode = RC_SCANCODE_NEC(st->buf[3 - 2], st->buf[3 - 3]); toggle = 0; break; default: /* RC-5 protocol changes toggle bit on new keypress */ - protocol = RC_TYPE_RC5; + protocol = RC_PROTO_RC5; scancode = RC_SCANCODE_RC5(st->buf[3 - 2], st->buf[3 - 3]); toggle = st->buf[3 - 1]; break; @@ -3909,9 +3909,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_interval = DEFAULT_RC_INTERVAL, .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -3949,9 +3949,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_interval = DEFAULT_RC_INTERVAL, .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4014,9 +4014,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_interval = DEFAULT_RC_INTERVAL, .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4059,9 +4059,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4140,9 
+4140,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4185,9 +4185,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4242,9 +4242,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4308,9 +4308,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4357,9 +4357,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_NEC_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4430,9 +4430,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4466,9 +4466,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4542,9 +4542,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4586,9 +4586,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_NEC_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - 
.allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4635,9 +4635,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4672,9 +4672,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4709,9 +4709,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4746,9 +4746,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4783,9 +4783,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4820,9 +4820,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4871,9 +4871,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4906,9 +4906,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = 
dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4943,9 +4943,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -4981,9 +4981,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, { DIB0700_DEFAULT_DEVICE_PROPERTIES, @@ -5035,9 +5035,9 @@ struct dvb_usb_device_properties dib0700_devices[] = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .module_name = "dib0700", .rc_query = dib0700_rc_query_old_firmware, - .allowed_protos = RC_BIT_RC5 | - RC_BIT_RC6_MCE | - RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_NEC, .change_protocol = dib0700_change_protocol, }, }, diff --git a/drivers/media/usb/dvb-usb/dtt200u.c b/drivers/media/usb/dvb-usb/dtt200u.c index fcbff7fb0c4e..512370786696 100644 --- a/drivers/media/usb/dvb-usb/dtt200u.c +++ b/drivers/media/usb/dvb-usb/dtt200u.c @@ -100,14 +100,14 @@ static int dtt200u_rc_query(struct dvb_usb_device *d) goto ret; if (st->data[0] == 1) { - enum rc_type proto = RC_TYPE_NEC; + enum rc_proto proto = RC_PROTO_NEC; scancode = st->data[1]; if ((u8) ~st->data[1] != st->data[2]) { /* Extended NEC */ scancode = scancode << 8; scancode |= st->data[2]; - proto = RC_TYPE_NECX; + proto = RC_PROTO_NECX; } scancode = scancode << 8; scancode |= st->data[3]; @@ -213,7 +213,7 @@ static struct dvb_usb_device_properties dtt200u_properties = { .rc_interval = 300, .rc_codes = RC_MAP_DTT200U, .rc_query = dtt200u_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -265,7 +265,7 @@ static struct dvb_usb_device_properties wt220u_properties = { .rc_interval = 300, .rc_codes = RC_MAP_DTT200U, .rc_query = dtt200u_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -317,7 +317,7 @@ static struct dvb_usb_device_properties wt220u_fc_properties = { .rc_interval = 300, .rc_codes = RC_MAP_DTT200U, .rc_query = dtt200u_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, @@ -369,7 +369,7 @@ static struct dvb_usb_device_properties wt220u_zl0353_properties = { .rc_interval = 300, .rc_codes = RC_MAP_DTT200U, .rc_query = dtt200u_rc_query, - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, }, .generic_bulk_ctrl_endpoint = 0x01, diff --git a/drivers/media/usb/dvb-usb/dvb-usb.h b/drivers/media/usb/dvb-usb/dvb-usb.h index 67f898b6f6d0..72468fdffa18 100644 --- a/drivers/media/usb/dvb-usb/dvb-usb.h +++ b/drivers/media/usb/dvb-usb/dvb-usb.h @@ -202,7 +202,7 @@ struct dvb_rc { u64 protocol; u64 allowed_protos; enum rc_driver_type driver_type; - int (*change_protocol)(struct rc_dev *dev, u64 *rc_type); + int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); char *module_name; int (*rc_query) (struct dvb_usb_device *d); int rc_interval; diff --git 
a/drivers/media/usb/dvb-usb/dw2102.c b/drivers/media/usb/dvb-usb/dw2102.c index 57b187240110..11109b1e641f 100644 --- a/drivers/media/usb/dvb-usb/dw2102.c +++ b/drivers/media/usb/dvb-usb/dw2102.c @@ -1671,7 +1671,7 @@ static int dw2102_rc_query(struct dvb_usb_device *d) if (msg.buf[0] != 0xff) { deb_rc("%s: rc code: %x, %x\n", __func__, key[0], key[1]); - rc_keydown(d->rc_dev, RC_TYPE_UNKNOWN, key[0], 0); + rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, key[0], 0); } } @@ -1692,7 +1692,8 @@ static int prof_rc_query(struct dvb_usb_device *d) if (msg.buf[0] != 0xff) { deb_rc("%s: rc code: %x, %x\n", __func__, key[0], key[1]); - rc_keydown(d->rc_dev, RC_TYPE_UNKNOWN, key[0]^0xff, 0); + rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, key[0] ^ 0xff, + 0); } } @@ -1713,7 +1714,7 @@ static int su3000_rc_query(struct dvb_usb_device *d) if (msg.buf[0] != 0xff) { deb_rc("%s: rc code: %x, %x\n", __func__, key[0], key[1]); - rc_keydown(d->rc_dev, RC_TYPE_RC5, + rc_keydown(d->rc_dev, RC_PROTO_RC5, RC_SCANCODE_RC5(key[1], key[0]), 0); } } @@ -1912,7 +1913,7 @@ static struct dvb_usb_device_properties dw2102_properties = { .rc_interval = 150, .rc_codes = RC_MAP_DM1105_NEC, .module_name = "dw2102", - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, @@ -1967,7 +1968,7 @@ static struct dvb_usb_device_properties dw2104_properties = { .rc_interval = 150, .rc_codes = RC_MAP_DM1105_NEC, .module_name = "dw2102", - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, @@ -2018,7 +2019,7 @@ static struct dvb_usb_device_properties dw3101_properties = { .rc_interval = 150, .rc_codes = RC_MAP_DM1105_NEC, .module_name = "dw2102", - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, @@ -2067,7 +2068,7 @@ static struct dvb_usb_device_properties s6x0_properties = { .rc_interval = 150, .rc_codes = RC_MAP_TEVII_NEC, .module_name = "dw2102", - .allowed_protos = RC_BIT_NEC, + .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, @@ -2161,7 +2162,7 @@ static struct dvb_usb_device_properties su3000_properties = { .rc_interval = 150, .rc_codes = RC_MAP_SU3000, .module_name = "dw2102", - .allowed_protos = RC_BIT_RC5, + .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, @@ -2230,7 +2231,7 @@ static struct dvb_usb_device_properties t220_properties = { .rc_interval = 150, .rc_codes = RC_MAP_SU3000, .module_name = "dw2102", - .allowed_protos = RC_BIT_RC5, + .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, @@ -2279,7 +2280,7 @@ static struct dvb_usb_device_properties tt_s2_4600_properties = { .rc_interval = 250, .rc_codes = RC_MAP_TT_1500, .module_name = "dw2102", - .allowed_protos = RC_BIT_RC5, + .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, diff --git a/drivers/media/usb/dvb-usb/m920x.c b/drivers/media/usb/dvb-usb/m920x.c index 70672e1e5ec7..32081c2ce0da 100644 --- a/drivers/media/usb/dvb-usb/m920x.c +++ b/drivers/media/usb/dvb-usb/m920x.c @@ -241,7 +241,7 @@ static int m920x_rc_core_query(struct dvb_usb_device *d) else if (state == REMOTE_KEY_REPEAT) rc_repeat(d->rc_dev); else - rc_keydown(d->rc_dev, RC_TYPE_UNKNOWN, rc_state[1], 0); + rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, rc_state[1], 0); out: kfree(rc_state); @@ -1208,7 +1208,7 @@ static struct dvb_usb_device_properties vp7049_properties = { .rc_interval = 150, .rc_codes = RC_MAP_TWINHAN_VP1027_DVBS, .rc_query = m920x_rc_core_query, - .allowed_protos = RC_BIT_UNKNOWN, + 
.allowed_protos = RC_PROTO_BIT_UNKNOWN, }, .size_of_priv = sizeof(struct m920x_state), diff --git a/drivers/media/usb/dvb-usb/pctv452e.c b/drivers/media/usb/dvb-usb/pctv452e.c index d54ebe7e0215..601ade7ca48d 100644 --- a/drivers/media/usb/dvb-usb/pctv452e.c +++ b/drivers/media/usb/dvb-usb/pctv452e.c @@ -600,7 +600,7 @@ static int pctv452e_rc_query(struct dvb_usb_device *d) info("%s: cmd=0x%02x sys=0x%02x\n", __func__, rx[6], rx[7]); - rc_keydown(d->rc_dev, RC_TYPE_RC5, state->last_rc_key, 0); + rc_keydown(d->rc_dev, RC_PROTO_RC5, state->last_rc_key, 0); } else if (state->last_rc_key) { rc_keyup(d->rc_dev); state->last_rc_key = 0; @@ -958,7 +958,7 @@ static struct dvb_usb_device_properties pctv452e_properties = { .rc.core = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, - .allowed_protos = RC_BIT_RC5, + .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = pctv452e_rc_query, .rc_interval = 100, }, @@ -1011,7 +1011,7 @@ static struct dvb_usb_device_properties tt_connect_s2_3600_properties = { .rc.core = { .rc_codes = RC_MAP_TT_1500, - .allowed_protos = RC_BIT_RC5, + .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = pctv452e_rc_query, .rc_interval = 100, }, diff --git a/drivers/media/usb/dvb-usb/technisat-usb2.c b/drivers/media/usb/dvb-usb/technisat-usb2.c index 9f7dd1afcb15..18d0f8f5283f 100644 --- a/drivers/media/usb/dvb-usb/technisat-usb2.c +++ b/drivers/media/usb/dvb-usb/technisat-usb2.c @@ -749,7 +749,7 @@ static struct dvb_usb_device_properties technisat_usb2_devices = { .rc_codes = RC_MAP_TECHNISAT_USB2, .module_name = "technisat-usb2", .rc_query = technisat_usb2_rc_query, - .allowed_protos = RC_BIT_ALL_IR_DECODER, + .allowed_protos = RC_PROTO_BIT_ALL_IR_DECODER, .driver_type = RC_DRIVER_IR_RAW, } }; diff --git a/drivers/media/usb/dvb-usb/ttusb2.c b/drivers/media/usb/dvb-usb/ttusb2.c index 9e0d6a4166d2..e7020f245f53 100644 --- a/drivers/media/usb/dvb-usb/ttusb2.c +++ b/drivers/media/usb/dvb-usb/ttusb2.c @@ -459,7 +459,7 @@ static int tt3650_rc_query(struct dvb_usb_device *d) /* got a "press" event */ st->last_rc_key = RC_SCANCODE_RC5(rx[3], rx[2]); deb_info("%s: cmd=0x%02x sys=0x%02x\n", __func__, rx[2], rx[3]); - rc_keydown(d->rc_dev, RC_TYPE_RC5, st->last_rc_key, rx[1]); + rc_keydown(d->rc_dev, RC_PROTO_RC5, st->last_rc_key, rx[1]); } else if (st->last_rc_key) { rc_keyup(d->rc_dev); st->last_rc_key = 0; @@ -766,7 +766,7 @@ static struct dvb_usb_device_properties ttusb2_properties_ct3650 = { .rc_interval = 150, /* Less than IR_KEYPRESS_TIMEOUT */ .rc_codes = RC_MAP_TT_1500, .rc_query = tt3650_rc_query, - .allowed_protos = RC_BIT_RC5, + .allowed_protos = RC_PROTO_BIT_RC5, }, .num_adapters = 1, diff --git a/drivers/media/usb/em28xx/em28xx-input.c b/drivers/media/usb/em28xx/em28xx-input.c index d8746b96a0f8..046223de1e91 100644 --- a/drivers/media/usb/em28xx/em28xx-input.c +++ b/drivers/media/usb/em28xx/em28xx-input.c @@ -55,7 +55,7 @@ struct em28xx_ir_poll_result { unsigned int toggle_bit:1; unsigned int read_count:7; - enum rc_type protocol; + enum rc_proto protocol; u32 scancode; }; @@ -70,11 +70,12 @@ struct em28xx_IR { struct delayed_work work; unsigned int full_code:1; unsigned int last_readcount; - u64 rc_type; + u64 rc_proto; struct i2c_client *i2c_client; - int (*get_key_i2c)(struct i2c_client *ir, enum rc_type *protocol, u32 *scancode); + int (*get_key_i2c)(struct i2c_client *ir, enum rc_proto *protocol, + u32 *scancode); int (*get_key)(struct em28xx_IR *, struct em28xx_ir_poll_result *); }; @@ -83,7 +84,7 @@ struct em28xx_IR { **********************************************************/ 
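Each i2c helper below implements the same get_key contract: a negative errno on bus failure, 0 when no key is pending, and 1 after filling *protocol and *scancode. A minimal sketch of that contract, with the 0xff idle marker invented for illustration:

static int example_get_key(struct i2c_client *i2c_dev,
                           enum rc_proto *protocol, u32 *scancode)
{
        unsigned char b;

        /* poll a single status/scancode byte from the remote */
        if (i2c_master_recv(i2c_dev, &b, 1) != 1)
                return -EIO;

        if (b == 0xff)                  /* assumed "no key pressed" value */
                return 0;

        *protocol = RC_PROTO_UNKNOWN;   /* raw, hardware-specific code */
        *scancode = b;
        return 1;
}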
static int em28xx_get_key_terratec(struct i2c_client *i2c_dev, - enum rc_type *protocol, u32 *scancode) + enum rc_proto *protocol, u32 *scancode) { unsigned char b; @@ -101,13 +102,13 @@ static int em28xx_get_key_terratec(struct i2c_client *i2c_dev, /* keep old data */ return 1; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = b; return 1; } static int em28xx_get_key_em_haup(struct i2c_client *i2c_dev, - enum rc_type *protocol, u32 *scancode) + enum rc_proto *protocol, u32 *scancode) { unsigned char buf[2]; int size; @@ -131,13 +132,14 @@ static int em28xx_get_key_em_haup(struct i2c_client *i2c_dev, * So, the code translation is not complete. Yet, it is enough to * work with the provided RC5 IR. */ - *protocol = RC_TYPE_RC5; + *protocol = RC_PROTO_RC5; *scancode = (bitrev8(buf[1]) & 0x1f) << 8 | bitrev8(buf[0]) >> 2; return 1; } static int em28xx_get_key_pinnacle_usb_grey(struct i2c_client *i2c_dev, - enum rc_type *protocol, u32 *scancode) + enum rc_proto *protocol, + u32 *scancode) { unsigned char buf[3]; @@ -149,13 +151,14 @@ static int em28xx_get_key_pinnacle_usb_grey(struct i2c_client *i2c_dev, if (buf[0] != 0x00) return 0; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = buf[2] & 0x3f; return 1; } static int em28xx_get_key_winfast_usbii_deluxe(struct i2c_client *i2c_dev, - enum rc_type *protocol, u32 *scancode) + enum rc_proto *protocol, + u32 *scancode) { unsigned char subaddr, keydetect, key; @@ -175,7 +178,7 @@ static int em28xx_get_key_winfast_usbii_deluxe(struct i2c_client *i2c_dev, if (key == 0x00) return 0; - *protocol = RC_TYPE_UNKNOWN; + *protocol = RC_PROTO_UNKNOWN; *scancode = key; return 1; } @@ -207,19 +210,19 @@ static int default_polling_getkey(struct em28xx_IR *ir, poll_result->read_count = (msg[0] & 0x7f); /* Remote Control Address/Data (Regs 0x46/0x47) */ - switch (ir->rc_type) { - case RC_BIT_RC5: - poll_result->protocol = RC_TYPE_RC5; + switch (ir->rc_proto) { + case RC_PROTO_BIT_RC5: + poll_result->protocol = RC_PROTO_RC5; poll_result->scancode = RC_SCANCODE_RC5(msg[1], msg[2]); break; - case RC_BIT_NEC: - poll_result->protocol = RC_TYPE_NEC; + case RC_PROTO_BIT_NEC: + poll_result->protocol = RC_PROTO_NEC; poll_result->scancode = RC_SCANCODE_NEC(msg[1], msg[2]); break; default: - poll_result->protocol = RC_TYPE_UNKNOWN; + poll_result->protocol = RC_PROTO_UNKNOWN; poll_result->scancode = msg[1] << 8 | msg[2]; break; } @@ -252,37 +255,37 @@ static int em2874_polling_getkey(struct em28xx_IR *ir, * Remote Control Address (Reg 0x52) * Remote Control Data (Reg 0x53-0x55) */ - switch (ir->rc_type) { - case RC_BIT_RC5: - poll_result->protocol = RC_TYPE_RC5; + switch (ir->rc_proto) { + case RC_PROTO_BIT_RC5: + poll_result->protocol = RC_PROTO_RC5; poll_result->scancode = RC_SCANCODE_RC5(msg[1], msg[2]); break; - case RC_BIT_NEC: + case RC_PROTO_BIT_NEC: poll_result->scancode = msg[1] << 8 | msg[2]; if ((msg[3] ^ msg[4]) != 0xff) { /* 32 bits NEC */ - poll_result->protocol = RC_TYPE_NEC32; + poll_result->protocol = RC_PROTO_NEC32; poll_result->scancode = RC_SCANCODE_NEC32((msg[1] << 24) | (msg[2] << 16) | (msg[3] << 8) | (msg[4])); } else if ((msg[1] ^ msg[2]) != 0xff) { /* 24 bits NEC */ - poll_result->protocol = RC_TYPE_NECX; + poll_result->protocol = RC_PROTO_NECX; poll_result->scancode = RC_SCANCODE_NECX(msg[1] << 8 | msg[2], msg[3]); } else { /* Normal NEC */ - poll_result->protocol = RC_TYPE_NEC; + poll_result->protocol = RC_PROTO_NEC; poll_result->scancode = RC_SCANCODE_NEC(msg[1], msg[3]); } break; - case 
RC_BIT_RC6_0: - poll_result->protocol = RC_TYPE_RC6_0; + case RC_PROTO_BIT_RC6_0: + poll_result->protocol = RC_PROTO_RC6_0; poll_result->scancode = RC_SCANCODE_RC6_0(msg[1], msg[2]); break; default: - poll_result->protocol = RC_TYPE_UNKNOWN; + poll_result->protocol = RC_PROTO_UNKNOWN; poll_result->scancode = (msg[1] << 24) | (msg[2] << 16) | (msg[3] << 8) | msg[4]; break; @@ -298,7 +301,7 @@ static int em2874_polling_getkey(struct em28xx_IR *ir, static int em28xx_i2c_ir_handle_key(struct em28xx_IR *ir) { static u32 scancode; - enum rc_type protocol; + enum rc_proto protocol; int rc; rc = ir->get_key_i2c(ir->i2c_client, &protocol, &scancode); @@ -338,7 +341,7 @@ static void em28xx_ir_handle_key(struct em28xx_IR *ir) poll_result.toggle_bit); else rc_keydown(ir->rc, - RC_TYPE_UNKNOWN, + RC_PROTO_UNKNOWN, poll_result.scancode & 0xff, poll_result.toggle_bit); @@ -383,70 +386,71 @@ static void em28xx_ir_stop(struct rc_dev *rc) cancel_delayed_work_sync(&ir->work); } -static int em2860_ir_change_protocol(struct rc_dev *rc_dev, u64 *rc_type) +static int em2860_ir_change_protocol(struct rc_dev *rc_dev, u64 *rc_proto) { struct em28xx_IR *ir = rc_dev->priv; struct em28xx *dev = ir->dev; /* Adjust xclk based on IR table for RC5/NEC tables */ - if (*rc_type & RC_BIT_RC5) { + if (*rc_proto & RC_PROTO_BIT_RC5) { dev->board.xclk |= EM28XX_XCLK_IR_RC5_MODE; ir->full_code = 1; - *rc_type = RC_BIT_RC5; - } else if (*rc_type & RC_BIT_NEC) { + *rc_proto = RC_PROTO_BIT_RC5; + } else if (*rc_proto & RC_PROTO_BIT_NEC) { dev->board.xclk &= ~EM28XX_XCLK_IR_RC5_MODE; ir->full_code = 1; - *rc_type = RC_BIT_NEC; - } else if (*rc_type & RC_BIT_UNKNOWN) { - *rc_type = RC_BIT_UNKNOWN; + *rc_proto = RC_PROTO_BIT_NEC; + } else if (*rc_proto & RC_PROTO_BIT_UNKNOWN) { + *rc_proto = RC_PROTO_BIT_UNKNOWN; } else { - *rc_type = ir->rc_type; + *rc_proto = ir->rc_proto; return -EINVAL; } em28xx_write_reg_bits(dev, EM28XX_R0F_XCLK, dev->board.xclk, EM28XX_XCLK_IR_RC5_MODE); - ir->rc_type = *rc_type; + ir->rc_proto = *rc_proto; return 0; } -static int em2874_ir_change_protocol(struct rc_dev *rc_dev, u64 *rc_type) +static int em2874_ir_change_protocol(struct rc_dev *rc_dev, u64 *rc_proto) { struct em28xx_IR *ir = rc_dev->priv; struct em28xx *dev = ir->dev; u8 ir_config = EM2874_IR_RC5; /* Adjust xclk and set type based on IR table for RC5/NEC/RC6 tables */ - if (*rc_type & RC_BIT_RC5) { + if (*rc_proto & RC_PROTO_BIT_RC5) { dev->board.xclk |= EM28XX_XCLK_IR_RC5_MODE; ir->full_code = 1; - *rc_type = RC_BIT_RC5; - } else if (*rc_type & RC_BIT_NEC) { + *rc_proto = RC_PROTO_BIT_RC5; + } else if (*rc_proto & RC_PROTO_BIT_NEC) { dev->board.xclk &= ~EM28XX_XCLK_IR_RC5_MODE; ir_config = EM2874_IR_NEC | EM2874_IR_NEC_NO_PARITY; ir->full_code = 1; - *rc_type = RC_BIT_NEC; - } else if (*rc_type & RC_BIT_RC6_0) { + *rc_proto = RC_PROTO_BIT_NEC; + } else if (*rc_proto & RC_PROTO_BIT_RC6_0) { dev->board.xclk |= EM28XX_XCLK_IR_RC5_MODE; ir_config = EM2874_IR_RC6_MODE_0; ir->full_code = 1; - *rc_type = RC_BIT_RC6_0; - } else if (*rc_type & RC_BIT_UNKNOWN) { - *rc_type = RC_BIT_UNKNOWN; + *rc_proto = RC_PROTO_BIT_RC6_0; + } else if (*rc_proto & RC_PROTO_BIT_UNKNOWN) { + *rc_proto = RC_PROTO_BIT_UNKNOWN; } else { - *rc_type = ir->rc_type; + *rc_proto = ir->rc_proto; return -EINVAL; } em28xx_write_regs(dev, EM2874_R50_IR_CONFIG, &ir_config, 1); em28xx_write_reg_bits(dev, EM28XX_R0F_XCLK, dev->board.xclk, EM28XX_XCLK_IR_RC5_MODE); - ir->rc_type = *rc_type; + ir->rc_proto = *rc_proto; return 0; } -static int em28xx_ir_change_protocol(struct rc_dev 
*rc_dev, u64 *rc_type) + +static int em28xx_ir_change_protocol(struct rc_dev *rc_dev, u64 *rc_proto) { struct em28xx_IR *ir = rc_dev->priv; struct em28xx *dev = ir->dev; @@ -455,12 +459,12 @@ static int em28xx_ir_change_protocol(struct rc_dev *rc_dev, u64 *rc_type) switch (dev->chip_id) { case CHIP_ID_EM2860: case CHIP_ID_EM2883: - return em2860_ir_change_protocol(rc_dev, rc_type); + return em2860_ir_change_protocol(rc_dev, rc_proto); case CHIP_ID_EM2884: case CHIP_ID_EM2874: case CHIP_ID_EM28174: case CHIP_ID_EM28178: - return em2874_ir_change_protocol(rc_dev, rc_type); + return em2874_ir_change_protocol(rc_dev, rc_proto); default: dev_err(&ir->dev->intf->dev, "Unrecognized em28xx chip id 0x%02x: IR not supported\n", @@ -686,7 +690,7 @@ static int em28xx_ir_init(struct em28xx *dev) struct em28xx_IR *ir; struct rc_dev *rc; int err = -ENOMEM; - u64 rc_type; + u64 rc_proto; u16 i2c_rc_dev_addr = 0; if (dev->is_audio_only) { @@ -749,7 +753,7 @@ static int em28xx_ir_init(struct em28xx *dev) case EM2820_BOARD_HAUPPAUGE_WINTV_USB_2: rc->map_name = RC_MAP_HAUPPAUGE; ir->get_key_i2c = em28xx_get_key_em_haup; - rc->allowed_protocols = RC_BIT_RC5; + rc->allowed_protocols = RC_PROTO_BIT_RC5; break; case EM2820_BOARD_LEADTEK_WINFAST_USBII_DELUXE: rc->map_name = RC_MAP_WINFAST_USBII_DELUXE; @@ -771,7 +775,8 @@ static int em28xx_ir_init(struct em28xx *dev) switch (dev->chip_id) { case CHIP_ID_EM2860: case CHIP_ID_EM2883: - rc->allowed_protocols = RC_BIT_RC5 | RC_BIT_NEC; + rc->allowed_protocols = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_NEC; ir->get_key = default_polling_getkey; break; case CHIP_ID_EM2884: @@ -779,8 +784,9 @@ static int em28xx_ir_init(struct em28xx *dev) case CHIP_ID_EM28174: case CHIP_ID_EM28178: ir->get_key = em2874_polling_getkey; - rc->allowed_protocols = RC_BIT_RC5 | RC_BIT_NEC | - RC_BIT_NECX | RC_BIT_NEC32 | RC_BIT_RC6_0; + rc->allowed_protocols = RC_PROTO_BIT_RC5 | + RC_PROTO_BIT_NEC | RC_PROTO_BIT_NECX | + RC_PROTO_BIT_NEC32 | RC_PROTO_BIT_RC6_0; break; default: err = -ENODEV; @@ -791,8 +797,8 @@ static int em28xx_ir_init(struct em28xx *dev) rc->map_name = dev->board.ir_codes; /* By default, keep protocol field untouched */ - rc_type = RC_BIT_UNKNOWN; - err = em28xx_ir_change_protocol(rc, &rc_type); + rc_proto = RC_PROTO_BIT_UNKNOWN; + err = em28xx_ir_change_protocol(rc, &rc_proto); if (err) goto error; } diff --git a/drivers/media/usb/hdpvr/hdpvr-i2c.c b/drivers/media/usb/hdpvr/hdpvr-i2c.c index fcab55038d99..b9074b978ae3 100644 --- a/drivers/media/usb/hdpvr/hdpvr-i2c.c +++ b/drivers/media/usb/hdpvr/hdpvr-i2c.c @@ -55,7 +55,8 @@ struct i2c_client *hdpvr_register_ir_rx_i2c(struct hdpvr_device *dev) /* Our default information for ir-kbd-i2c.c to use */ init_data->ir_codes = RC_MAP_HAUPPAUGE; init_data->internal_get_key_func = IR_KBD_GET_KEY_HAUP_XVR; - init_data->type = RC_BIT_RC5 | RC_BIT_RC6_MCE | RC_BIT_RC6_6A_32; + init_data->type = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_RC6_6A_32; init_data->name = "HD-PVR"; init_data->polling_interval = 405; /* ms, duplicated from Windows */ hdpvr_ir_rx_i2c_board_info.platform_data = init_data; diff --git a/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c b/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c index 20a52b785fff..e3c8f9414e45 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-i2c-core.c @@ -567,7 +567,7 @@ static void pvr2_i2c_register_ir(struct pvr2_hdw *hdw) case PVR2_IR_SCHEME_29XXX: /* Original 29xxx device */ init_data->ir_codes = RC_MAP_HAUPPAUGE; 
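As in the hdpvr hunk above, bridge drivers hand ir-kbd-i2c its configuration through struct IR_i2c_init_data; note that .type takes the RC_PROTO_BIT_* masks (several protocols may be allowed at once), while the get_key callbacks report a single enum rc_proto. A sketch of that wiring — the name string and polling interval are invented:

static struct IR_i2c_init_data example_init_data = {
        .ir_codes = RC_MAP_HAUPPAUGE,
        .internal_get_key_func = IR_KBD_GET_KEY_HAUP,
        .type = RC_PROTO_BIT_RC5,       /* a bitmask, not an enum value */
        .name = "example bridge",       /* invented */
        .polling_interval = 100,        /* ms */
};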
init_data->internal_get_key_func = IR_KBD_GET_KEY_HAUP; - init_data->type = RC_BIT_RC5; + init_data->type = RC_PROTO_BIT_RC5; init_data->name = hdw->hdw_desc->description; init_data->polling_interval = 100; /* ms From ir-kbd-i2c */ /* IR Receiver */ @@ -580,11 +580,11 @@ static void pvr2_i2c_register_ir(struct pvr2_hdw *hdw) break; case PVR2_IR_SCHEME_ZILOG: /* HVR-1950 style */ case PVR2_IR_SCHEME_24XXX_MCE: /* 24xxx MCE device */ - init_data->ir_codes = RC_MAP_HAUPPAUGE; + init_data->ir_codes = RC_MAP_HAUPPAUGE; init_data->internal_get_key_func = IR_KBD_GET_KEY_HAUP_XVR; - init_data->type = RC_BIT_RC5 | RC_BIT_RC6_MCE | - RC_BIT_RC6_6A_32; - init_data->name = hdw->hdw_desc->description; + init_data->type = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC6_MCE | + RC_PROTO_BIT_RC6_6A_32; + init_data->name = hdw->hdw_desc->description; /* IR Receiver */ info.addr = 0x71; info.platform_data = init_data; diff --git a/drivers/media/usb/tm6000/tm6000-input.c b/drivers/media/usb/tm6000/tm6000-input.c index 83e33aef0105..91889ad9cdd7 100644 --- a/drivers/media/usb/tm6000/tm6000-input.c +++ b/drivers/media/usb/tm6000/tm6000-input.c @@ -66,7 +66,7 @@ struct tm6000_IR { struct urb *int_urb; /* IR device properties */ - u64 rc_type; + u64 rc_proto; }; void tm6000_ir_wait(struct tm6000_core *dev, u8 state) @@ -103,13 +103,13 @@ static int tm6000_ir_config(struct tm6000_IR *ir) * IR, in order to discard such decoding */ - switch (ir->rc_type) { - case RC_BIT_NEC: + switch (ir->rc_proto) { + case RC_PROTO_BIT_NEC: leader = 900; /* ms */ pulse = 700; /* ms - the actual value would be 562 */ break; default: - case RC_BIT_RC5: + case RC_PROTO_BIT_RC5: leader = 900; /* ms - from the NEC decoding */ pulse = 1780; /* ms - The actual value would be 1776 */ break; @@ -117,12 +117,12 @@ static int tm6000_ir_config(struct tm6000_IR *ir) pulse = ir_clock_mhz * pulse; leader = ir_clock_mhz * leader; - if (ir->rc_type == RC_BIT_NEC) + if (ir->rc_proto == RC_PROTO_BIT_NEC) leader = leader | 0x8000; dprintk(2, "%s: %s, %d MHz, leader = 0x%04x, pulse = 0x%06x \n", __func__, - (ir->rc_type == RC_BIT_NEC) ? "NEC" : "RC-5", + (ir->rc_proto == RC_PROTO_BIT_NEC) ? "NEC" : "RC-5", ir_clock_mhz, leader, pulse); /* Remote WAKEUP = enable, normal mode, from IR decoder output */ @@ -162,24 +162,24 @@ static void tm6000_ir_keydown(struct tm6000_IR *ir, { u8 device, command; u32 scancode; - enum rc_type protocol; + enum rc_proto protocol; if (len < 1) return; command = buf[0]; device = (len > 1 ? 
buf[1] : 0x0); - switch (ir->rc_type) { - case RC_BIT_RC5: - protocol = RC_TYPE_RC5; + switch (ir->rc_proto) { + case RC_PROTO_BIT_RC5: + protocol = RC_PROTO_RC5; scancode = RC_SCANCODE_RC5(device, command); break; - case RC_BIT_NEC: - protocol = RC_TYPE_NEC; + case RC_PROTO_BIT_NEC: + protocol = RC_PROTO_NEC; scancode = RC_SCANCODE_NEC(device, command); break; default: - protocol = RC_TYPE_OTHER; + protocol = RC_PROTO_OTHER; scancode = RC_SCANCODE_OTHER(device << 8 | command); break; } @@ -311,7 +311,7 @@ static void tm6000_ir_stop(struct rc_dev *rc) cancel_delayed_work_sync(&ir->work); } -static int tm6000_ir_change_protocol(struct rc_dev *rc, u64 *rc_type) +static int tm6000_ir_change_protocol(struct rc_dev *rc, u64 *rc_proto) { struct tm6000_IR *ir = rc->priv; @@ -320,7 +320,7 @@ static int tm6000_ir_change_protocol(struct rc_dev *rc, u64 *rc_type) dprintk(2, "%s\n",__func__); - ir->rc_type = *rc_type; + ir->rc_proto = *rc_proto; tm6000_ir_config(ir); /* TODO */ @@ -409,7 +409,7 @@ int tm6000_ir_init(struct tm6000_core *dev) struct tm6000_IR *ir; struct rc_dev *rc; int err = -ENOMEM; - u64 rc_type; + u64 rc_proto; if (!enable_ir) return -ENODEV; @@ -433,7 +433,7 @@ int tm6000_ir_init(struct tm6000_core *dev) ir->rc = rc; /* input setup */ - rc->allowed_protocols = RC_BIT_RC5 | RC_BIT_NEC; + rc->allowed_protocols = RC_PROTO_BIT_RC5 | RC_PROTO_BIT_NEC; /* Needed, in order to support NEC remotes with 24 or 32 bits */ rc->scancode_mask = 0xffff; rc->priv = ir; @@ -455,8 +455,8 @@ int tm6000_ir_init(struct tm6000_core *dev) usb_make_path(dev->udev, ir->phys, sizeof(ir->phys)); strlcat(ir->phys, "/input0", sizeof(ir->phys)); - rc_type = RC_BIT_UNKNOWN; - tm6000_ir_change_protocol(rc, &rc_type); + rc_proto = RC_PROTO_BIT_UNKNOWN; + tm6000_ir_change_protocol(rc, &rc_proto); rc->device_name = ir->name; rc->input_phys = ir->phys; diff --git a/include/media/i2c/ir-kbd-i2c.h b/include/media/i2c/ir-kbd-i2c.h index d8564354debb..ac8c55617a79 100644 --- a/include/media/i2c/ir-kbd-i2c.h +++ b/include/media/i2c/ir-kbd-i2c.h @@ -20,7 +20,8 @@ struct IR_i2c { struct delayed_work work; char name[32]; char phys[32]; - int (*get_key)(struct IR_i2c *ir, enum rc_type *protocol, + int (*get_key)(struct IR_i2c *ir, + enum rc_proto *protocol, u32 *scancode, u8 *toggle); }; @@ -38,14 +39,15 @@ enum ir_kbd_get_key_fn { struct IR_i2c_init_data { char *ir_codes; const char *name; - u64 type; /* RC_BIT_RC5, etc */ + u64 type; /* RC_PROTO_BIT_RC5, etc */ u32 polling_interval; /* 0 means DEFAULT_POLLING_INTERVAL */ /* * Specify either a function pointer or a value indicating one of * ir_kbd_i2c's internal get_key functions */ - int (*get_key)(struct IR_i2c *ir, enum rc_type *protocol, + int (*get_key)(struct IR_i2c *ir, + enum rc_proto *protocol, u32 *scancode, u8 *toggle); enum ir_kbd_get_key_fn internal_get_key_func; diff --git a/include/media/rc-core.h b/include/media/rc-core.h index 5be527ff851d..314a1edb6189 100644 --- a/include/media/rc-core.h +++ b/include/media/rc-core.h @@ -87,11 +87,12 @@ enum rc_filter_type { * @idle: used to keep track of RX state * @encode_wakeup: wakeup filtering uses IR encode API, therefore the allowed * wakeup protocols is the set of all raw encoders - * @allowed_protocols: bitmask with the supported RC_BIT_* protocols - * @enabled_protocols: bitmask with the enabled RC_BIT_* protocols - * @allowed_wakeup_protocols: bitmask with the supported RC_BIT_* wakeup protocols - * @wakeup_protocol: the enabled RC_TYPE_* wakeup protocol or - * RC_TYPE_UNKNOWN if disabled. 
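The rename preserves the long-standing split that this kernel-doc describes: enum rc_proto names one protocol (what rc_keydown() reports), while the RC_PROTO_BIT_* masks from rc-map.h describe protocol sets (what allowed_protocols and enabled_protocols hold). The two are related through BIT_ULL(); a sketch with an invented helper name:

/* true if 'proto' is among the protocols currently enabled on 'dev' */
static inline bool example_proto_enabled(const struct rc_dev *dev,
                                         enum rc_proto proto)
{
        return dev->enabled_protocols & BIT_ULL(proto);
}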
+ * @allowed_protocols: bitmask with the supported RC_PROTO_BIT_* protocols + * @enabled_protocols: bitmask with the enabled RC_PROTO_BIT_* protocols + * @allowed_wakeup_protocols: bitmask with the supported RC_PROTO_BIT_* wakeup + * protocols + * @wakeup_protocol: the enabled RC_PROTO_* wakeup protocol or + * RC_PROTO_UNKNOWN if disabled. * @scancode_filter: scancode filter * @scancode_wakeup_filter: scancode wakeup filters * @scancode_mask: some hardware decoders are not capable of providing the full @@ -154,7 +155,7 @@ struct rc_dev { u64 allowed_protocols; u64 enabled_protocols; u64 allowed_wakeup_protocols; - enum rc_type wakeup_protocol; + enum rc_proto wakeup_protocol; struct rc_scancode_filter scancode_filter; struct rc_scancode_filter scancode_wakeup_filter; u32 scancode_mask; @@ -165,7 +166,7 @@ struct rc_dev { unsigned long keyup_jiffies; struct timer_list timer_keyup; u32 last_keycode; - enum rc_type last_protocol; + enum rc_proto last_protocol; u32 last_scancode; u8 last_toggle; u32 timeout; @@ -173,7 +174,7 @@ struct rc_dev { u32 max_timeout; u32 rx_resolution; u32 tx_resolution; - int (*change_protocol)(struct rc_dev *dev, u64 *rc_type); + int (*change_protocol)(struct rc_dev *dev, u64 *rc_proto); int (*open)(struct rc_dev *dev); void (*close)(struct rc_dev *dev); int (*s_tx_mask)(struct rc_dev *dev, u32 mask); @@ -262,8 +263,10 @@ int rc_open(struct rc_dev *rdev); void rc_close(struct rc_dev *rdev); void rc_repeat(struct rc_dev *dev); -void rc_keydown(struct rc_dev *dev, enum rc_type protocol, u32 scancode, u8 toggle); -void rc_keydown_notimeout(struct rc_dev *dev, enum rc_type protocol, u32 scancode, u8 toggle); +void rc_keydown(struct rc_dev *dev, enum rc_proto protocol, u32 scancode, + u8 toggle); +void rc_keydown_notimeout(struct rc_dev *dev, enum rc_proto protocol, + u32 scancode, u8 toggle); void rc_keyup(struct rc_dev *dev); u32 rc_g_keycode_from_table(struct rc_dev *dev, u32 scancode); @@ -304,7 +307,7 @@ int ir_raw_event_store_edge(struct rc_dev *dev, bool pulse); int ir_raw_event_store_with_filter(struct rc_dev *dev, struct ir_raw_event *ev); void ir_raw_event_set_idle(struct rc_dev *dev, bool idle); -int ir_raw_encode_scancode(enum rc_type protocol, u32 scancode, +int ir_raw_encode_scancode(enum rc_proto protocol, u32 scancode, struct ir_raw_event *events, unsigned int max); static inline void ir_raw_event_reset(struct rc_dev *dev) @@ -335,7 +338,7 @@ static inline u32 ir_extract_bits(u32 data, u32 mask) /* Get NEC scancode and protocol type from address and command bytes */ static inline u32 ir_nec_bytes_to_scancode(u8 address, u8 not_address, u8 command, u8 not_command, - enum rc_type *protocol) + enum rc_proto *protocol) { u32 scancode; @@ -347,17 +350,17 @@ static inline u32 ir_nec_bytes_to_scancode(u8 address, u8 not_address, address << 16 | not_command << 8 | command; - *protocol = RC_TYPE_NEC32; + *protocol = RC_PROTO_NEC32; } else if ((address ^ not_address) != 0xff) { /* Extended NEC */ scancode = address << 16 | not_address << 8 | command; - *protocol = RC_TYPE_NECX; + *protocol = RC_PROTO_NECX; } else { /* Normal NEC */ scancode = address << 8 | command; - *protocol = RC_TYPE_NEC; + *protocol = RC_PROTO_NEC; } return scancode; diff --git a/include/media/rc-map.h b/include/media/rc-map.h index c69482852f29..2a160e6e823c 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -12,113 +12,122 @@ #include /** - * enum rc_type - type of the Remote Controller protocol + * enum rc_proto - the Remote Controller protocol * - * @RC_TYPE_UNKNOWN: 
Protocol not known - * @RC_TYPE_OTHER: Protocol known but proprietary - * @RC_TYPE_RC5: Philips RC5 protocol - * @RC_TYPE_RC5X_20: Philips RC5x 20 bit protocol - * @RC_TYPE_RC5_SZ: StreamZap variant of RC5 - * @RC_TYPE_JVC: JVC protocol - * @RC_TYPE_SONY12: Sony 12 bit protocol - * @RC_TYPE_SONY15: Sony 15 bit protocol - * @RC_TYPE_SONY20: Sony 20 bit protocol - * @RC_TYPE_NEC: NEC protocol - * @RC_TYPE_NECX: Extended NEC protocol - * @RC_TYPE_NEC32: NEC 32 bit protocol - * @RC_TYPE_SANYO: Sanyo protocol - * @RC_TYPE_MCIR2_KBD: RC6-ish MCE keyboard - * @RC_TYPE_MCIR2_MSE: RC6-ish MCE mouse - * @RC_TYPE_RC6_0: Philips RC6-0-16 protocol - * @RC_TYPE_RC6_6A_20: Philips RC6-6A-20 protocol - * @RC_TYPE_RC6_6A_24: Philips RC6-6A-24 protocol - * @RC_TYPE_RC6_6A_32: Philips RC6-6A-32 protocol - * @RC_TYPE_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol - * @RC_TYPE_SHARP: Sharp protocol - * @RC_TYPE_XMP: XMP protocol - * @RC_TYPE_CEC: CEC protocol + * @RC_PROTO_UNKNOWN: Protocol not known + * @RC_PROTO_OTHER: Protocol known but proprietary + * @RC_PROTO_RC5: Philips RC5 protocol + * @RC_PROTO_RC5X_20: Philips RC5x 20 bit protocol + * @RC_PROTO_RC5_SZ: StreamZap variant of RC5 + * @RC_PROTO_JVC: JVC protocol + * @RC_PROTO_SONY12: Sony 12 bit protocol + * @RC_PROTO_SONY15: Sony 15 bit protocol + * @RC_PROTO_SONY20: Sony 20 bit protocol + * @RC_PROTO_NEC: NEC protocol + * @RC_PROTO_NECX: Extended NEC protocol + * @RC_PROTO_NEC32: NEC 32 bit protocol + * @RC_PROTO_SANYO: Sanyo protocol + * @RC_PROTO_MCIR2_KBD: RC6-ish MCE keyboard + * @RC_PROTO_MCIR2_MSE: RC6-ish MCE mouse + * @RC_PROTO_RC6_0: Philips RC6-0-16 protocol + * @RC_PROTO_RC6_6A_20: Philips RC6-6A-20 protocol + * @RC_PROTO_RC6_6A_24: Philips RC6-6A-24 protocol + * @RC_PROTO_RC6_6A_32: Philips RC6-6A-32 protocol + * @RC_PROTO_RC6_MCE: MCE (Philips RC6-6A-32 subtype) protocol + * @RC_PROTO_SHARP: Sharp protocol + * @RC_PROTO_XMP: XMP protocol + * @RC_PROTO_CEC: CEC protocol */ -enum rc_type { - RC_TYPE_UNKNOWN = 0, - RC_TYPE_OTHER = 1, - RC_TYPE_RC5 = 2, - RC_TYPE_RC5X_20 = 3, - RC_TYPE_RC5_SZ = 4, - RC_TYPE_JVC = 5, - RC_TYPE_SONY12 = 6, - RC_TYPE_SONY15 = 7, - RC_TYPE_SONY20 = 8, - RC_TYPE_NEC = 9, - RC_TYPE_NECX = 10, - RC_TYPE_NEC32 = 11, - RC_TYPE_SANYO = 12, - RC_TYPE_MCIR2_KBD = 13, - RC_TYPE_MCIR2_MSE = 14, - RC_TYPE_RC6_0 = 15, - RC_TYPE_RC6_6A_20 = 16, - RC_TYPE_RC6_6A_24 = 17, - RC_TYPE_RC6_6A_32 = 18, - RC_TYPE_RC6_MCE = 19, - RC_TYPE_SHARP = 20, - RC_TYPE_XMP = 21, - RC_TYPE_CEC = 22, +enum rc_proto { + RC_PROTO_UNKNOWN = 0, + RC_PROTO_OTHER = 1, + RC_PROTO_RC5 = 2, + RC_PROTO_RC5X_20 = 3, + RC_PROTO_RC5_SZ = 4, + RC_PROTO_JVC = 5, + RC_PROTO_SONY12 = 6, + RC_PROTO_SONY15 = 7, + RC_PROTO_SONY20 = 8, + RC_PROTO_NEC = 9, + RC_PROTO_NECX = 10, + RC_PROTO_NEC32 = 11, + RC_PROTO_SANYO = 12, + RC_PROTO_MCIR2_KBD = 13, + RC_PROTO_MCIR2_MSE = 14, + RC_PROTO_RC6_0 = 15, + RC_PROTO_RC6_6A_20 = 16, + RC_PROTO_RC6_6A_24 = 17, + RC_PROTO_RC6_6A_32 = 18, + RC_PROTO_RC6_MCE = 19, + RC_PROTO_SHARP = 20, + RC_PROTO_XMP = 21, + RC_PROTO_CEC = 22, }; -#define RC_BIT_NONE 0ULL -#define RC_BIT_UNKNOWN BIT_ULL(RC_TYPE_UNKNOWN) -#define RC_BIT_OTHER BIT_ULL(RC_TYPE_OTHER) -#define RC_BIT_RC5 BIT_ULL(RC_TYPE_RC5) -#define RC_BIT_RC5X_20 BIT_ULL(RC_TYPE_RC5X_20) -#define RC_BIT_RC5_SZ BIT_ULL(RC_TYPE_RC5_SZ) -#define RC_BIT_JVC BIT_ULL(RC_TYPE_JVC) -#define RC_BIT_SONY12 BIT_ULL(RC_TYPE_SONY12) -#define RC_BIT_SONY15 BIT_ULL(RC_TYPE_SONY15) -#define RC_BIT_SONY20 BIT_ULL(RC_TYPE_SONY20) -#define RC_BIT_NEC BIT_ULL(RC_TYPE_NEC) -#define RC_BIT_NECX 
BIT_ULL(RC_TYPE_NECX) -#define RC_BIT_NEC32 BIT_ULL(RC_TYPE_NEC32) -#define RC_BIT_SANYO BIT_ULL(RC_TYPE_SANYO) -#define RC_BIT_MCIR2_KBD BIT_ULL(RC_TYPE_MCIR2_KBD) -#define RC_BIT_MCIR2_MSE BIT_ULL(RC_TYPE_MCIR2_MSE) -#define RC_BIT_RC6_0 BIT_ULL(RC_TYPE_RC6_0) -#define RC_BIT_RC6_6A_20 BIT_ULL(RC_TYPE_RC6_6A_20) -#define RC_BIT_RC6_6A_24 BIT_ULL(RC_TYPE_RC6_6A_24) -#define RC_BIT_RC6_6A_32 BIT_ULL(RC_TYPE_RC6_6A_32) -#define RC_BIT_RC6_MCE BIT_ULL(RC_TYPE_RC6_MCE) -#define RC_BIT_SHARP BIT_ULL(RC_TYPE_SHARP) -#define RC_BIT_XMP BIT_ULL(RC_TYPE_XMP) -#define RC_BIT_CEC BIT_ULL(RC_TYPE_CEC) +#define RC_PROTO_BIT_NONE 0ULL +#define RC_PROTO_BIT_UNKNOWN BIT_ULL(RC_PROTO_UNKNOWN) +#define RC_PROTO_BIT_OTHER BIT_ULL(RC_PROTO_OTHER) +#define RC_PROTO_BIT_RC5 BIT_ULL(RC_PROTO_RC5) +#define RC_PROTO_BIT_RC5X_20 BIT_ULL(RC_PROTO_RC5X_20) +#define RC_PROTO_BIT_RC5_SZ BIT_ULL(RC_PROTO_RC5_SZ) +#define RC_PROTO_BIT_JVC BIT_ULL(RC_PROTO_JVC) +#define RC_PROTO_BIT_SONY12 BIT_ULL(RC_PROTO_SONY12) +#define RC_PROTO_BIT_SONY15 BIT_ULL(RC_PROTO_SONY15) +#define RC_PROTO_BIT_SONY20 BIT_ULL(RC_PROTO_SONY20) +#define RC_PROTO_BIT_NEC BIT_ULL(RC_PROTO_NEC) +#define RC_PROTO_BIT_NECX BIT_ULL(RC_PROTO_NECX) +#define RC_PROTO_BIT_NEC32 BIT_ULL(RC_PROTO_NEC32) +#define RC_PROTO_BIT_SANYO BIT_ULL(RC_PROTO_SANYO) +#define RC_PROTO_BIT_MCIR2_KBD BIT_ULL(RC_PROTO_MCIR2_KBD) +#define RC_PROTO_BIT_MCIR2_MSE BIT_ULL(RC_PROTO_MCIR2_MSE) +#define RC_PROTO_BIT_RC6_0 BIT_ULL(RC_PROTO_RC6_0) +#define RC_PROTO_BIT_RC6_6A_20 BIT_ULL(RC_PROTO_RC6_6A_20) +#define RC_PROTO_BIT_RC6_6A_24 BIT_ULL(RC_PROTO_RC6_6A_24) +#define RC_PROTO_BIT_RC6_6A_32 BIT_ULL(RC_PROTO_RC6_6A_32) +#define RC_PROTO_BIT_RC6_MCE BIT_ULL(RC_PROTO_RC6_MCE) +#define RC_PROTO_BIT_SHARP BIT_ULL(RC_PROTO_SHARP) +#define RC_PROTO_BIT_XMP BIT_ULL(RC_PROTO_XMP) +#define RC_PROTO_BIT_CEC BIT_ULL(RC_PROTO_CEC) -#define RC_BIT_ALL (RC_BIT_UNKNOWN | RC_BIT_OTHER | \ - RC_BIT_RC5 | RC_BIT_RC5X_20 | RC_BIT_RC5_SZ | \ - RC_BIT_JVC | \ - RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20 | \ - RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | \ - RC_BIT_SANYO | \ - RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE | \ - RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ - RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | RC_BIT_SHARP | \ - RC_BIT_XMP | RC_BIT_CEC) +#define RC_PROTO_BIT_ALL \ + (RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \ + RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ + RC_PROTO_BIT_RC5_SZ | RC_PROTO_BIT_JVC | \ + RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | \ + RC_PROTO_BIT_SONY20 | RC_PROTO_BIT_NEC | \ + RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32 | \ + RC_PROTO_BIT_SANYO | \ + RC_PROTO_BIT_MCIR2_KBD | RC_PROTO_BIT_MCIR2_MSE | \ + RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ + RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ + RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ + RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC) /* All rc protocols for which we have decoders */ -#define RC_BIT_ALL_IR_DECODER \ - (RC_BIT_RC5 | RC_BIT_RC5X_20 | RC_BIT_RC5_SZ | \ - RC_BIT_JVC | \ - RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20 | \ - RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | \ - RC_BIT_SANYO | RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE | \ - RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ - RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | RC_BIT_SHARP | \ - RC_BIT_XMP) +#define RC_PROTO_BIT_ALL_IR_DECODER \ + (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ + RC_PROTO_BIT_RC5_SZ | RC_PROTO_BIT_JVC | \ + RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | \ + RC_PROTO_BIT_SONY20 | RC_PROTO_BIT_NEC | \ + RC_PROTO_BIT_NECX | 
RC_PROTO_BIT_NEC32 | \ + RC_PROTO_BIT_SANYO | RC_PROTO_BIT_MCIR2_KBD | \ + RC_PROTO_BIT_MCIR2_MSE | \ + RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ + RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ + RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ + RC_PROTO_BIT_XMP) -#define RC_BIT_ALL_IR_ENCODER \ - (RC_BIT_RC5 | RC_BIT_RC5X_20 | RC_BIT_RC5_SZ | \ - RC_BIT_JVC | \ - RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20 | \ - RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | \ - RC_BIT_SANYO | RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE | \ - RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ - RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | \ - RC_BIT_SHARP) +#define RC_PROTO_BIT_ALL_IR_ENCODER \ + (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ + RC_PROTO_BIT_RC5_SZ | RC_PROTO_BIT_JVC | \ + RC_PROTO_BIT_SONY12 | RC_PROTO_BIT_SONY15 | \ + RC_PROTO_BIT_SONY20 | RC_PROTO_BIT_NEC | \ + RC_PROTO_BIT_NECX | RC_PROTO_BIT_NEC32 | \ + RC_PROTO_BIT_SANYO | RC_PROTO_BIT_MCIR2_KBD | \ + RC_PROTO_BIT_MCIR2_MSE | \ + RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ + RC_PROTO_BIT_RC6_6A_24 | \ + RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \ + RC_PROTO_BIT_SHARP) #define RC_SCANCODE_UNKNOWN(x) (x) #define RC_SCANCODE_OTHER(x) (x) @@ -148,8 +157,8 @@ struct rc_map_table { * @size: Max number of entries * @len: Number of entries that are in use * @alloc: size of \*scan, in bytes - * @rc_type: type of the remote controller protocol, as defined at - * enum &rc_type + * @rc_proto: type of the remote controller protocol, as defined at + * enum &rc_proto * @name: name of the key map table * @lock: lock to protect access to this structure */ @@ -158,7 +167,7 @@ struct rc_map { unsigned int size; unsigned int len; unsigned int alloc; - enum rc_type rc_type; + enum rc_proto rc_proto; const char *name; spinlock_t lock; }; -- cgit v1.2.3 From f00fd7ae4f409abb7b2e5d099248832548199f0c Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 30 Jul 2017 16:14:42 -0600 Subject: iio: Fix some documentation warnings The kerneldoc description for the trig_readonly field of struct iio_dev lacked a colon, leading to this doc build warning: ./include/linux/iio/iio.h:603: warning: No description found for parameter 'trig_readonly' A similar issue for iio_trigger_set_immutable() in trigger.h yielded: ./include/linux/iio/trigger.h:151: warning: No description found for parameter 'indio_dev' ./include/linux/iio/trigger.h:151: warning: No description found for parameter 'trig' Fix the formatting and silence the warnings.
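For context, kernel-doc only binds a description to a struct member or function parameter when the name is followed by a colon; a minimal sketch of the corrected style (field and parameter names taken from the two hunks below, everything else elided):

/**
 * struct iio_dev - industrial I/O device
 * @trig_readonly:	[INTERN] mark the current trigger immutable
 */

/**
 * iio_trigger_set_immutable() - set an immutable trigger on destination
 * @indio_dev:	IIO device structure containing the device
 * @trig:	trigger to assign to device
 */

Writing "@name - description" (or omitting the colon entirely) makes the parser treat the line as plain text, which is what produced the warnings above.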
Signed-off-by: Jonathan Corbet Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 +- include/linux/iio/trigger.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d68bec297a45..c380daa40c0e 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -535,7 +535,7 @@ struct iio_buffer_setup_ops { * @scan_timestamp: [INTERN] set if any buffers have requested timestamp * @scan_index_timestamp:[INTERN] cache of the index to the timestamp * @trig: [INTERN] current device trigger (buffer modes) - * @trig_readonly [INTERN] mark the current trigger immutable + * @trig_readonly: [INTERN] mark the current trigger immutable * @pollfunc: [DRIVER] function run on trigger being received * @pollfunc_event: [DRIVER] function run on events trigger being received * @channels: [DRIVER] channel specification structure table diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index ea08302f2d7b..7142d8d6e470 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -144,8 +144,8 @@ void devm_iio_trigger_unregister(struct device *dev, /** * iio_trigger_set_immutable() - set an immutable trigger on destination * - * @indio_dev - IIO device structure containing the device - * @trig - trigger to assign to device + * @indio_dev: IIO device structure containing the device + * @trig: trigger to assign to device * **/ int iio_trigger_set_immutable(struct iio_dev *indio_dev, struct iio_trigger *trig); -- cgit v1.2.3 From 3bde7afdabe9f37974af806abe646c2ca43c67c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 20 Aug 2017 11:33:25 -0400 Subject: NFS: Remove unused parameter gfp_flags from nfs_pageio_init() Now that the mirror allocation has been moved, the parameter can go. Also remove the redundant symbol export. 
Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 4 +--- fs/nfs/read.c | 2 +- fs/nfs/write.c | 2 +- include/linux/nfs_page.h | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b7d193e2a243..ee66d8c3336c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -711,8 +711,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags, - gfp_t gfp_flags) + int io_flags) { desc->pg_moreio = 0; desc->pg_inode = inode; @@ -733,7 +732,6 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_mirrors = desc->pg_mirrors_static; nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); } -EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a8421d9dab6a..0d42573d423d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -68,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0, GFP_KERNEL); + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b1af5dee5e0a..ae78ac0a7a8a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1452,7 +1452,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags, GFP_NOIO); + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index d67b67ae6c8b..8b1a35aad0c3 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -125,8 +125,7 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how, - gfp_t gfp_flags); + int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, -- cgit v1.2.3 From 68a66d149a8c78ec6720f268597302883e48e9fa Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 19 Aug 2017 15:37:07 +0300 Subject: net_sched: fix order of queue length updates in qdisc_replace() It is important to call qdisc_tree_reduce_backlog() after changing the queue length. The parent qdisc should deactivate the class in ->qlen_notify(), called from qdisc_tree_reduce_backlog(), but this happens only if qdisc->q.qlen is zero. Missed class deactivations lead to crashes/warnings when picking packets from an empty qdisc and to corrupted state when reactivating the class later. Signed-off-by: Konstantin Khlebnikov Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper") Acked-by: Cong Wang Signed-off-by: David S.
Miller --- include/net/sch_generic.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1c123e2b2415..67f815e5d525 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -806,8 +806,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) { - qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); + unsigned int qlen = old->q.qlen; + unsigned int backlog = old->qstats.backlog; + qdisc_reset(old); + qdisc_tree_reduce_backlog(old, qlen, backlog); } sch_tree_unlock(sch); -- cgit v1.2.3 From 9928688492aea161eeeaad7af34f372794cb2f53 Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Mon, 3 Jul 2017 17:42:17 +0900 Subject: drm/bridge: change return type of drm_bridge_add function This patch changes the return type of the drm_bridge_add() function. The function never returns a negative value, only 0, so change its return type to void. Signed-off-by: Inki Dae Signed-off-by: Archit Taneja Link: https://patchwork.freedesktop.org/patch/msgid/1499071350-25168-2-git-send-email-inki.dae@samsung.com --- drivers/gpu/drm/drm_bridge.c | 7 +------ include/drm/drm_bridge.h | 2 +- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c index dc8cdfe1dcac..1638bfe9627c 100644 --- a/drivers/gpu/drm/drm_bridge.c +++ b/drivers/gpu/drm/drm_bridge.c @@ -67,17 +67,12 @@ static LIST_HEAD(bridge_list); * drm_bridge_add - add the given bridge to the global bridge list * * @bridge: bridge control structure - * - * RETURNS: - * Unconditionally returns Zero. */ -int drm_bridge_add(struct drm_bridge *bridge) +void drm_bridge_add(struct drm_bridge *bridge) { mutex_lock(&bridge_lock); list_add_tail(&bridge->list, &bridge_list); mutex_unlock(&bridge_lock); - - return 0; } EXPORT_SYMBOL(drm_bridge_add); diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h index 6522d4cbc9d9..682d01ba920c 100644 --- a/include/drm/drm_bridge.h +++ b/include/drm/drm_bridge.h @@ -245,7 +245,7 @@ struct drm_bridge { void *driver_private; }; -int drm_bridge_add(struct drm_bridge *bridge); +void drm_bridge_add(struct drm_bridge *bridge); void drm_bridge_remove(struct drm_bridge *bridge); struct drm_bridge *of_drm_find_bridge(struct device_node *np); int drm_bridge_attach(struct drm_encoder *encoder, struct drm_bridge *bridge, -- cgit v1.2.3 From 9d6105e19f617406aea46dd19281080c7c5ae0d8 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Mon, 21 Aug 2017 03:28:34 +0200 Subject: mfd: rk808: Fix up failure to get the chip id The rk8xx chip id is: ((MSB << 8) | LSB) & 0xfff0 Signed-off-by: Elaine Zhang Signed-off-by: Joseph Chen Signed-off-by: Heiko Stuebner Signed-off-by: Lee Jones --- drivers/mfd/rk808.c | 21 +++++++++++++++------ include/linux/mfd/rk808.h | 1 + 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c index fd087cbb0bde..8e60ebaeaadb 100644 --- a/drivers/mfd/rk808.c +++ b/drivers/mfd/rk808.c @@ -325,7 +325,7 @@ static int rk808_probe(struct i2c_client *client, void (*pm_pwroff_fn)(void); int nr_pre_init_regs; int nr_cells; - int pm_off = 0; + int pm_off = 0, msb, lsb; int ret; int i; @@ -333,14 +333,23 @@ static int rk808_probe(struct i2c_client *client, if (!rk808) return -ENOMEM; - rk808->variant = i2c_smbus_read_word_data(client, RK808_ID_MSB); - if
(rk808->variant < 0) { - dev_err(&client->dev, "Failed to read the chip id at 0x%02x\n", + /* Read chip variant */ + msb = i2c_smbus_read_byte_data(client, RK808_ID_MSB); + if (msb < 0) { + dev_err(&client->dev, "failed to read the chip id at 0x%x\n", RK808_ID_MSB); - return rk808->variant; + return msb; } - dev_dbg(&client->dev, "Chip id: 0x%x\n", (unsigned int)rk808->variant); + lsb = i2c_smbus_read_byte_data(client, RK808_ID_LSB); + if (lsb < 0) { + dev_err(&client->dev, "failed to read the chip id at 0x%x\n", + RK808_ID_LSB); + return lsb; + } + + rk808->variant = ((msb << 8) | lsb) & RK8XX_ID_MSK; + dev_info(&client->dev, "chip id: 0x%x\n", (unsigned int)rk808->variant); switch (rk808->variant) { case RK808_ID: diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h index 83701ef7d3c7..54feb140c210 100644 --- a/include/linux/mfd/rk808.h +++ b/include/linux/mfd/rk808.h @@ -298,6 +298,7 @@ enum rk818_reg { #define VOUT_LO_INT BIT(0) #define CLK32KOUT2_EN BIT(0) +#define RK8XX_ID_MSK 0xfff0 enum { BUCK_ILMIN_50MA, BUCK_ILMIN_100MA, -- cgit v1.2.3 From a5247ca6708ff7dc089d2774c131455774153eb6 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Mon, 21 Aug 2017 03:28:35 +0200 Subject: mfd: rk808: Add rk805 regs addr and ID Signed-off-by: Elaine Zhang Signed-off-by: Joseph Chen Signed-off-by: Heiko Stuebner Signed-off-by: Lee Jones --- include/linux/mfd/rk808.h | 120 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h index 54feb140c210..d3156594674c 100644 --- a/include/linux/mfd/rk808.h +++ b/include/linux/mfd/rk808.h @@ -206,6 +206,97 @@ enum rk818_reg { #define RK818_USB_ILMIN_2000MA 0x7 #define RK818_USB_CHG_SD_VSEL_MASK 0x70 +/* RK805 */ +enum rk805_reg { + RK805_ID_DCDC1, + RK805_ID_DCDC2, + RK805_ID_DCDC3, + RK805_ID_DCDC4, + RK805_ID_LDO1, + RK805_ID_LDO2, + RK805_ID_LDO3, +}; + +/* CONFIG REGISTER */ +#define RK805_VB_MON_REG 0x21 +#define RK805_THERMAL_REG 0x22 + +/* POWER CHANNELS ENABLE REGISTER */ +#define RK805_DCDC_EN_REG 0x23 +#define RK805_SLP_DCDC_EN_REG 0x25 +#define RK805_SLP_LDO_EN_REG 0x26 +#define RK805_LDO_EN_REG 0x27 + +/* BUCK AND LDO CONFIG REGISTER */ +#define RK805_BUCK_LDO_SLP_LP_EN_REG 0x2A +#define RK805_BUCK1_CONFIG_REG 0x2E +#define RK805_BUCK1_ON_VSEL_REG 0x2F +#define RK805_BUCK1_SLP_VSEL_REG 0x30 +#define RK805_BUCK2_CONFIG_REG 0x32 +#define RK805_BUCK2_ON_VSEL_REG 0x33 +#define RK805_BUCK2_SLP_VSEL_REG 0x34 +#define RK805_BUCK3_CONFIG_REG 0x36 +#define RK805_BUCK4_CONFIG_REG 0x37 +#define RK805_BUCK4_ON_VSEL_REG 0x38 +#define RK805_BUCK4_SLP_VSEL_REG 0x39 +#define RK805_LDO1_ON_VSEL_REG 0x3B +#define RK805_LDO1_SLP_VSEL_REG 0x3C +#define RK805_LDO2_ON_VSEL_REG 0x3D +#define RK805_LDO2_SLP_VSEL_REG 0x3E +#define RK805_LDO3_ON_VSEL_REG 0x3F +#define RK805_LDO3_SLP_VSEL_REG 0x40 + +/* INTERRUPT REGISTER */ +#define RK805_PWRON_LP_INT_TIME_REG 0x47 +#define RK805_PWRON_DB_REG 0x48 +#define RK805_DEV_CTRL_REG 0x4B +#define RK805_INT_STS_REG 0x4C +#define RK805_INT_STS_MSK_REG 0x4D +#define RK805_GPIO_IO_POL_REG 0x50 +#define RK805_OUT_REG 0x52 +#define RK805_ON_SOURCE_REG 0xAE +#define RK805_OFF_SOURCE_REG 0xAF + +#define RK805_NUM_REGULATORS 7 + +#define RK805_PWRON_FALL_RISE_INT_EN 0x0 +#define RK805_PWRON_FALL_RISE_INT_MSK 0x81 + +/* RK805 IRQ Definitions */ +#define RK805_IRQ_PWRON_RISE 0 +#define RK805_IRQ_VB_LOW 1 +#define RK805_IRQ_PWRON 2 +#define RK805_IRQ_PWRON_LP 3 +#define RK805_IRQ_HOTDIE 4 +#define RK805_IRQ_RTC_ALARM 
5 +#define RK805_IRQ_RTC_PERIOD 6 +#define RK805_IRQ_PWRON_FALL 7 + +#define RK805_IRQ_PWRON_RISE_MSK BIT(0) +#define RK805_IRQ_VB_LOW_MSK BIT(1) +#define RK805_IRQ_PWRON_MSK BIT(2) +#define RK805_IRQ_PWRON_LP_MSK BIT(3) +#define RK805_IRQ_HOTDIE_MSK BIT(4) +#define RK805_IRQ_RTC_ALARM_MSK BIT(5) +#define RK805_IRQ_RTC_PERIOD_MSK BIT(6) +#define RK805_IRQ_PWRON_FALL_MSK BIT(7) + +#define RK805_PWR_RISE_INT_STATUS BIT(0) +#define RK805_VB_LOW_INT_STATUS BIT(1) +#define RK805_PWRON_INT_STATUS BIT(2) +#define RK805_PWRON_LP_INT_STATUS BIT(3) +#define RK805_HOTDIE_INT_STATUS BIT(4) +#define RK805_ALARM_INT_STATUS BIT(5) +#define RK805_PERIOD_INT_STATUS BIT(6) +#define RK805_PWR_FALL_INT_STATUS BIT(7) + +#define RK805_BUCK1_2_ILMAX_MASK (3 << 6) +#define RK805_BUCK3_4_ILMAX_MASK (3 << 3) +#define RK805_RTC_PERIOD_INT_MASK (1 << 6) +#define RK805_RTC_ALARM_INT_MASK (1 << 5) +#define RK805_INT_ALARM_EN (1 << 3) +#define RK805_INT_TIMER_EN (1 << 2) + /* RK808 IRQ Definitions */ #define RK808_IRQ_VOUT_LO 0 #define RK808_IRQ_VB_LO 1 @@ -298,7 +389,14 @@ enum rk818_reg { #define VOUT_LO_INT BIT(0) #define CLK32KOUT2_EN BIT(0) +#define TEMP115C 0x0c +#define TEMP_HOTDIE_MSK 0x0c +#define SLP_SD_MSK (0x3 << 2) +#define SHUTDOWN_FUN (0x2 << 2) +#define SLEEP_FUN (0x1 << 2) #define RK8XX_ID_MSK 0xfff0 +#define FPWM_MODE BIT(7) + enum { BUCK_ILMIN_50MA, BUCK_ILMIN_100MA, @@ -322,6 +420,28 @@ enum { }; enum { + RK805_BUCK1_2_ILMAX_2500MA, + RK805_BUCK1_2_ILMAX_3000MA, + RK805_BUCK1_2_ILMAX_3500MA, + RK805_BUCK1_2_ILMAX_4000MA, +}; + +enum { + RK805_BUCK3_ILMAX_1500MA, + RK805_BUCK3_ILMAX_2000MA, + RK805_BUCK3_ILMAX_2500MA, + RK805_BUCK3_ILMAX_3000MA, +}; + +enum { + RK805_BUCK4_ILMAX_2000MA, + RK805_BUCK4_ILMAX_2500MA, + RK805_BUCK4_ILMAX_3000MA, + RK805_BUCK4_ILMAX_3500MA, +}; + +enum { + RK805_ID = 0x8050, RK808_ID = 0x0000, RK818_ID = 0x8181, }; -- cgit v1.2.3 From 6c83fd5142c68294acb0e857b7bac2ce8a5077f7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Aug 2017 14:06:46 +0200 Subject: quota: Add lock annotations to struct members Add annotation which lock protects which struct members to struct dquot and struct mem_dqinfo. Signed-off-by: Jan Kara --- include/linux/quota.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/quota.h b/include/linux/quota.h index 074123975595..5ac9de4fcd6f 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -223,12 +223,12 @@ struct mem_dqinfo { struct quota_format_type *dqi_format; int dqi_fmt_id; /* Id of the dqi_format - used when turning * quotas on after remount RW */ - struct list_head dqi_dirty_list; /* List of dirty dquots */ - unsigned long dqi_flags; - unsigned int dqi_bgrace; - unsigned int dqi_igrace; - qsize_t dqi_max_spc_limit; - qsize_t dqi_max_ino_limit; + struct list_head dqi_dirty_list; /* List of dirty dquots [dq_list_lock] */ + unsigned long dqi_flags; /* DFQ_ flags [dq_data_lock] */ + unsigned int dqi_bgrace; /* Space grace time [dq_data_lock] */ + unsigned int dqi_igrace; /* Inode grace time [dq_data_lock] */ + qsize_t dqi_max_spc_limit; /* Maximum space limit [static] */ + qsize_t dqi_max_ino_limit; /* Maximum inode limit [static] */ void *dqi_priv; }; @@ -293,16 +293,16 @@ static inline void dqstats_dec(unsigned int type) * clear them when it sees fit. 
*/ struct dquot { - struct hlist_node dq_hash; /* Hash list in memory */ - struct list_head dq_inuse; /* List of all quotas */ - struct list_head dq_free; /* Free list element */ - struct list_head dq_dirty; /* List of dirty dquots */ + struct hlist_node dq_hash; /* Hash list in memory [dq_list_lock] */ + struct list_head dq_inuse; /* List of all quotas [dq_list_lock] */ + struct list_head dq_free; /* Free list element [dq_list_lock] */ + struct list_head dq_dirty; /* List of dirty dquots [dq_list_lock] */ struct mutex dq_lock; /* dquot IO lock */ spinlock_t dq_dqb_lock; /* Lock protecting dq_dqb changes */ atomic_t dq_count; /* Use count */ struct super_block *dq_sb; /* superblock this applies to */ struct kqid dq_id; /* ID this applies to (uid, gid, projid) */ - loff_t dq_off; /* Offset of dquot on disk */ + loff_t dq_off; /* Offset of dquot on disk [dq_lock, stable once set] */ unsigned long dq_flags; /* See DQ_* */ struct mem_dqblk dq_dqb; /* Diskquota usage [dq_dqb_lock] */ }; -- cgit v1.2.3 From 7467c9d9598993d8531b2f76c090a5743000612b Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 24 Jul 2017 14:55:13 +0100 Subject: of: return of_get_cpu_node from of_cpu_device_node_get if CPUs are not registered Callsites currently have to choose between of_cpu_device_node_get() when the CPUs are already registered (their of_node is populated by then) and of_get_cpu_node() when the CPUs are not yet registered (their of_nodes are not yet stashed, so the device tree must be parsed). Instead, make of_cpu_device_node_get() fall back to of_get_cpu_node() when the CPUs are not yet registered. This allows of_cpu_device_node_get() to be used anywhere, hiding the details from the caller. Cc: Rob Herring Cc: Frank Rowand Signed-off-by: Sudeep Holla Signed-off-by: Rob Herring --- include/linux/of_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index b4ad8b4f8506..611502524425 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -50,7 +50,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) struct device *cpu_dev; cpu_dev = get_cpu_device(cpu); if (!cpu_dev) - return NULL; + return of_get_cpu_node(cpu, NULL); return of_node_get(cpu_dev->of_node); } -- cgit v1.2.3 From 89e49506bc62520f93e64a278293444319a6aebb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 17 Aug 2017 16:47:00 +0200 Subject: dsa: remove unused net_device arg from handlers Compile tested only, but saw no warnings/errors with allmodconfig build. Signed-off-by: Florian Westphal Signed-off-by: David S.
Miller --- include/net/dsa.h | 6 ++---- net/dsa/dsa.c | 4 ++-- net/dsa/tag_brcm.c | 3 +-- net/dsa/tag_dsa.c | 3 +-- net/dsa/tag_edsa.c | 3 +-- net/dsa/tag_ksz.c | 3 +-- net/dsa/tag_lan9303.c | 2 +- net/dsa/tag_mtk.c | 3 +-- net/dsa/tag_qca.c | 3 +-- net/dsa/tag_trailer.c | 3 +-- 10 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 7f46b521313e..398ca8d70ccd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -104,8 +104,7 @@ struct packet_type; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev); + struct packet_type *pt); int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); }; @@ -134,8 +133,7 @@ struct dsa_switch_tree { /* Copy of tag_ops->rcv for faster access in hot path */ struct sk_buff * (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev); + struct packet_type *pt); /* * The switch port to which the CPU is attached. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 99e38af85fc5..03c58b0eb082 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -186,7 +186,7 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt, struct net_device *unused) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct sk_buff *nskb = NULL; @@ -202,7 +202,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt, orig_dev); + nskb = dst->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index c697d9815177..de74c3f77818 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -89,8 +89,7 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev } static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 12867a4b458f..fbf9ca954773 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -65,8 +65,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 67a9d26f9075..76367ba1b2e2 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -78,8 +78,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index de66ca8e6201..17f30675c15c 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -76,8 +76,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct 
net_device *dev) } static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index e23e7635fa00..0b9826105e42 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -68,7 +68,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) + struct packet_type *pt) { u16 *lan9303_tag; struct dsa_switch_tree *dst = dev->dsa_ptr; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 02163c045a96..ec8ee5f43255 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -44,8 +44,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, } static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1867a3d11f28..1d4c70711c0f 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -63,8 +63,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b09e56214005..8707157dea32 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -56,8 +56,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) } static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev) + struct packet_type *pt) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); -- cgit v1.2.3 From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. 
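To make the hazard concrete, this is the pattern being removed, reconstructed from the kernel/pid.c hunk below (simplified sketch, not a verbatim quote):

/* task_tgid(tsk) reads tsk->group_leader->pids[PIDTYPE_PID].pid;
 * group_leader is stale once the exiting task has passed exit_notify(),
 * and holding rcu_read_lock() around this does not make it valid. */
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
	return pid_nr_ns(task_tgid(tsk), ns);
}

The replacement funnels the lookup through __task_pid_nr_ns(), which dereferences group_leader only after pid_alive() succeeds under rcu_read_lock(), so an already-unhashed task yields 0 instead of a read through a dangling pointer.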
Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/pid.h | 4 +++- include/linux/sched.h | 51 +++++++++++++++++++++++++++------------------------ kernel/pid.c | 11 ++++------- 3 files changed, 34 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 4d179316e431..719582744a2e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -8,7 +8,9 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, - PIDTYPE_MAX + PIDTYPE_MAX, + /* only valid to __task_pid_nr_ns() */ + __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..c05ac5f5aa03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1163,13 +1163,6 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) return tsk->tgid; } -extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. @@ -1185,23 +1178,6 @@ static inline int pid_alive(const struct task_struct *p) return p->pids[PIDTYPE_PID].pid != NULL; } -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); @@ -1223,6 +1199,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- 
cgit v1.2.3 From 7d3f0cd43feea1636dd7746f22fe8249b34d1b79 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Fri, 18 Aug 2017 15:23:24 +0800 Subject: net: sched: Add the invalid handle check in qdisc_class_find Add the invalid handle "0" check to avoid an unnecessary search, because the qdisc uses skb->priority as the handle value to look up, and it is usually "0". Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 5865db91976b..107c52432245 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -393,6 +393,9 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) struct Qdisc_class_common *cl; unsigned int h; + if (!id) + return NULL; + h = qdisc_class_hash(id, hash->hashmask); hlist_for_each_entry(cl, &hash->hash[h], hnode) { if (cl->classid == id) -- cgit v1.2.3 From 9762e7ff163213e18ed8e7cc723d74b54bf9cb98 Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Fri, 18 Aug 2017 11:49:24 +0800 Subject: clk: rockchip: add rk3228 sclk_sdio_src ID This patch exports the sdio src clock for dts reference. Signed-off-by: Elaine Zhang Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rk3228-cru.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/rk3228-cru.h b/include/dt-bindings/clock/rk3228-cru.h index 56f841c22801..55655ab0a4c4 100644 --- a/include/dt-bindings/clock/rk3228-cru.h +++ b/include/dt-bindings/clock/rk3228-cru.h @@ -49,6 +49,7 @@ #define SCLK_EMMC_DRV 117 #define SCLK_SDMMC_SAMPLE 118 #define SCLK_SDIO_SAMPLE 119 +#define SCLK_SDIO_SRC 120 #define SCLK_EMMC_SAMPLE 121 #define SCLK_VOP 122 #define SCLK_HDMI_HDCP 123 -- cgit v1.2.3 From 1858698e0ac63f4fd3a633064893b5fd6ef8aa8d Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Mon, 21 Aug 2017 16:16:04 +0800 Subject: clk: rockchip: add rv1108 ACLK_GMAC and PCLK_GMAC ID This patch exports the gmac aclk and pclk for dts reference. Signed-off-by: Elaine Zhang Reviewed-by: David Wu Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rv1108-cru.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/rv1108-cru.h b/include/dt-bindings/clock/rv1108-cru.h index f269d833e41a..2239ae2a19b9 100644 --- a/include/dt-bindings/clock/rv1108-cru.h +++ b/include/dt-bindings/clock/rv1108-cru.h @@ -110,6 +110,7 @@ #define ACLK_CIF2 207 #define ACLK_CIF3 208 #define ACLK_PERI 209 +#define ACLK_GMAC 210 /* pclk gates */ #define PCLK_GPIO1 256 @@ -141,6 +142,7 @@ #define PCLK_EFUSE0 282 #define PCLK_EFUSE1 283 #define PCLK_WDT 284 +#define PCLK_GMAC 285 /* hclk gates */ #define HCLK_I2S0_8CH 320 -- cgit v1.2.3 From c56f16dab0dfc8a37bc6133f2c2a02ffb873648b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 11 Aug 2017 18:00:15 +0800 Subject: f2fs: add tracepoint for f2fs_gc This patch adds tracepoints for f2fs_gc.
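For illustration, once both events are enabled each f2fs_gc() invocation brackets its work in the trace buffer with a begin/end pair shaped by the TP_printk() templates below; a hypothetical sample (all values invented, only the format is taken from the patch):

f2fs_gc_begin: dev = (253,0), sync = 0, background = 1, nodes = 42, dents = 7, imeta = 2, free_sec:100, free_seg:800, rsv_seg:24, prefree_seg:3
f2fs_gc_end: dev = (253,0), ret = 0, seg_freed = 2, sec_freed = 0, nodes = 40, dents = 5, imeta = 2, free_sec:100, free_seg:802, rsv_seg:24, prefree_seg:1

Comparing the two lines shows how much dirty metadata a single GC pass flushed and how many segments it reclaimed.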
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 50 +++++++++++++++------ include/trace/events/f2fs.h | 107 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e60480f71bb5..57bea2182f30 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -919,7 +919,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -965,6 +965,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } @@ -975,21 +979,17 @@ next: blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, true) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -997,6 +997,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { @@ -1023,17 +1032,20 @@ gc_more: gc_type = FG_GC; } - ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - if (!__get_victim(sbi, &segno, gc_type)) + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; - ret = 0; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; @@ -1050,6 +1062,16 @@ gc_more: stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index db5c98256147..8955e75dd48e 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -569,6 +569,113 @@ TRACE_EVENT(f2fs_background_gc, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + 
__entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, -- cgit v1.2.3 From c7d0045b08a36c2fb7874efc48d747613c6a1ccf Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Mon, 21 Aug 2017 16:16:06 +0800 Subject: clk: rockchip: rename rv1108 macphy clock to mac This MAC has no internal phy for rv1108 and the whole clock infrastructure hasn't been used yet, so it is safe to fix. Signed-off-by: Elaine Zhang Reviewed-by: David Wu Signed-off-by: Heiko Stuebner --- drivers/clk/rockchip/clk-rv1108.c | 12 ++++++------ include/dt-bindings/clock/rv1108-cru.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/clk/rockchip/clk-rv1108.c b/drivers/clk/rockchip/clk-rv1108.c index 0e441ec21e90..c9045f85b498 100644 --- a/drivers/clk/rockchip/clk-rv1108.c +++ b/drivers/clk/rockchip/clk-rv1108.c @@ -140,7 +140,7 @@ PNAME(mux_pll_src_dpll_gpll_usb480m_p) = { "dpll", "gpll", "usb480m" }; PNAME(mux_uart0_p) = { "uart0_src", "uart0_frac", "xin24m" }; PNAME(mux_uart1_p) = { "uart1_src", "uart1_frac", "xin24m" }; PNAME(mux_uart2_p) = { "uart2_src", "uart2_frac", "xin24m" }; -PNAME(mux_sclk_macphy_p) = { "ext_gmac", "sclk_macphy_pre" }; +PNAME(mux_sclk_mac_p) = { "ext_gmac", "sclk_mac_pre" }; PNAME(mux_i2s0_pre_p) = { "i2s0_src", "i2s0_frac", "ext_i2s", "xin12m" }; PNAME(mux_i2s_out_p) = { "i2s0_pre", "xin12m" }; PNAME(mux_i2s1_p) = { "i2s1_src", "i2s1_frac", "dummy", "xin12m" }; @@ -755,14 +755,14 @@ static struct rockchip_clk_branch rv1108_clk_branches[] __initdata = { RV1108_CLKGATE_CON(5), 4, GFLAGS), GATE(HCLK_SFC, "hclk_sfc", "hclk_periph", 0, RV1108_CLKGATE_CON(15), 10, GFLAGS), - COMPOSITE(SCLK_MACPHY_PRE, "sclk_macphy_pre", mux_pll_src_apll_gpll_p, 0, + COMPOSITE(SCLK_MAC_PRE, "sclk_mac_pre", mux_pll_src_apll_gpll_p, 0, RV1108_CLKSEL_CON(24), 12, 1, MFLAGS, 0, 5, DFLAGS, RV1108_CLKGATE_CON(4), 10, GFLAGS), - MUX(SCLK_MACPHY, "sclk_macphy", mux_sclk_macphy_p, CLK_SET_RATE_PARENT, + MUX(SCLK_MAC, "sclk_mac", mux_sclk_mac_p, CLK_SET_RATE_PARENT, RV1108_CLKSEL_CON(24), 8, 1, MFLAGS), - GATE(SCLK_MACPHY_RX, "sclk_macphy_rx", "sclk_macphy", 0, RV1108_CLKGATE_CON(4), 8, GFLAGS), - GATE(SCLK_MAC_REF, "sclk_mac_ref", "sclk_macphy", 0, RV1108_CLKGATE_CON(4), 6, GFLAGS), - GATE(SCLK_MAC_REFOUT, "sclk_mac_refout", "sclk_macphy", 0, RV1108_CLKGATE_CON(4), 7, GFLAGS), + GATE(SCLK_MAC_RX, "sclk_mac_rx", "sclk_mac", 0, RV1108_CLKGATE_CON(4), 8, GFLAGS), + GATE(SCLK_MAC_REF, "sclk_mac_ref", "sclk_mac", 0, RV1108_CLKGATE_CON(4), 6, GFLAGS), + GATE(SCLK_MAC_REFOUT, "sclk_mac_refout", "sclk_mac", 0, RV1108_CLKGATE_CON(4), 7, GFLAGS), GATE(ACLK_GMAC, "aclk_gmac", "aclk_periph", 0, RV1108_CLKGATE_CON(15), 4, GFLAGS), GATE(PCLK_GMAC, "pclk_gmac", "pclk_periph", 0, RV1108_CLKGATE_CON(15),
5, GFLAGS), diff --git a/include/dt-bindings/clock/rv1108-cru.h b/include/dt-bindings/clock/rv1108-cru.h index 2239ae2a19b9..d8d0e0456dc2 100644 --- a/include/dt-bindings/clock/rv1108-cru.h +++ b/include/dt-bindings/clock/rv1108-cru.h @@ -67,9 +67,9 @@ #define SCLK_SPI 108 #define SCLK_SARADC 109 #define SCLK_TSADC 110 -#define SCLK_MACPHY_PRE 111 -#define SCLK_MACPHY 112 -#define SCLK_MACPHY_RX 113 +#define SCLK_MAC_PRE 111 +#define SCLK_MAC 112 +#define SCLK_MAC_RX 113 #define SCLK_MAC_REF 114 #define SCLK_MAC_REFOUT 115 #define SCLK_DSP_PFM 116 -- cgit v1.2.3 From c678fa66341c7b82a57cfed0ba3656162e970f99 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Mon, 21 Aug 2017 10:23:13 -0700 Subject: dmaengine: remove DMA_SG as it is dead code in kernel There are no in kernel consumers for DMA_SG op. Removing operation, dead code, and test code in dmatest. Signed-off-by: Dave Jiang Reviewed-by: Linus Walleij Cc: Gary Hook Cc: Ludovic Desroches Cc: Kedareswara rao Appana Cc: Li Yang Cc: Michal Simek Signed-off-by: Vinod Koul --- Documentation/dmaengine/provider.txt | 7 -- drivers/crypto/ccp/ccp-dmaengine.c | 23 ----- drivers/dma/at_hdmac.c | 140 +----------------------------- drivers/dma/dmaengine.c | 2 - drivers/dma/dmatest.c | 36 +------- drivers/dma/fsldma.c | 118 ------------------------- drivers/dma/mv_xor.c | 162 +---------------------------------- drivers/dma/nbpfaxi.c | 17 ---- drivers/dma/ste_dma40.c | 18 ---- drivers/dma/xgene-dma.c | 155 +-------------------------------- drivers/dma/xilinx/zynqmp_dma.c | 94 -------------------- include/linux/dmaengine.h | 19 ---- 12 files changed, 5 insertions(+), 786 deletions(-) (limited to 'include') diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt index e33bc1c8ed2c..a75f52ff2e49 100644 --- a/Documentation/dmaengine/provider.txt +++ b/Documentation/dmaengine/provider.txt @@ -181,13 +181,6 @@ Currently, the types available are: - Used by the client drivers to register a callback that will be called on a regular basis through the DMA controller interrupt - * DMA_SG - - The device supports memory to memory scatter-gather - transfers. - - Even though a plain memcpy can look like a particular case of a - scatter-gather transfer, with a single chunk to transfer, it's a - distinct transaction type in the mem2mem transfers case - * DMA_PRIVATE - The devices only supports slave transfers, and as such isn't available for async transfers. 
diff --git a/drivers/crypto/ccp/ccp-dmaengine.c b/drivers/crypto/ccp/ccp-dmaengine.c index e00be01fbf5a..3f7d54f4728c 100644 --- a/drivers/crypto/ccp/ccp-dmaengine.c +++ b/drivers/crypto/ccp/ccp-dmaengine.c @@ -502,27 +502,6 @@ static struct dma_async_tx_descriptor *ccp_prep_dma_memcpy( return &desc->tx_desc; } -static struct dma_async_tx_descriptor *ccp_prep_dma_sg( - struct dma_chan *dma_chan, struct scatterlist *dst_sg, - unsigned int dst_nents, struct scatterlist *src_sg, - unsigned int src_nents, unsigned long flags) -{ - struct ccp_dma_chan *chan = container_of(dma_chan, struct ccp_dma_chan, - dma_chan); - struct ccp_dma_desc *desc; - - dev_dbg(chan->ccp->dev, - "%s - src=%p, src_nents=%u dst=%p, dst_nents=%u, flags=%#lx\n", - __func__, src_sg, src_nents, dst_sg, dst_nents, flags); - - desc = ccp_create_desc(dma_chan, dst_sg, dst_nents, src_sg, src_nents, - flags); - if (!desc) - return NULL; - - return &desc->tx_desc; -} - static struct dma_async_tx_descriptor *ccp_prep_dma_interrupt( struct dma_chan *dma_chan, unsigned long flags) { @@ -704,7 +683,6 @@ int ccp_dmaengine_register(struct ccp_device *ccp) dma_dev->directions = DMA_MEM_TO_MEM; dma_dev->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask); - dma_cap_set(DMA_SG, dma_dev->cap_mask); dma_cap_set(DMA_INTERRUPT, dma_dev->cap_mask); /* The DMA channels for this device can be set to public or private, @@ -740,7 +718,6 @@ int ccp_dmaengine_register(struct ccp_device *ccp) dma_dev->device_free_chan_resources = ccp_free_chan_resources; dma_dev->device_prep_dma_memcpy = ccp_prep_dma_memcpy; - dma_dev->device_prep_dma_sg = ccp_prep_dma_sg; dma_dev->device_prep_dma_interrupt = ccp_prep_dma_interrupt; dma_dev->device_issue_pending = ccp_issue_pending; dma_dev->device_tx_status = ccp_tx_status; diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 1baf3404a365..fbab271b3bf9 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -1202,138 +1202,6 @@ err: return NULL; } -/** - * atc_prep_dma_sg - prepare memory to memory scather-gather operation - * @chan: the channel to prepare operation on - * @dst_sg: destination scatterlist - * @dst_nents: number of destination scatterlist entries - * @src_sg: source scatterlist - * @src_nents: number of source scatterlist entries - * @flags: tx descriptor status flags - */ -static struct dma_async_tx_descriptor * -atc_prep_dma_sg(struct dma_chan *chan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, - unsigned long flags) -{ - struct at_dma_chan *atchan = to_at_dma_chan(chan); - struct at_desc *desc = NULL; - struct at_desc *first = NULL; - struct at_desc *prev = NULL; - unsigned int src_width; - unsigned int dst_width; - size_t xfer_count; - u32 ctrla; - u32 ctrlb; - size_t dst_len = 0, src_len = 0; - dma_addr_t dst = 0, src = 0; - size_t len = 0, total_len = 0; - - if (unlikely(dst_nents == 0 || src_nents == 0)) - return NULL; - - if (unlikely(dst_sg == NULL || src_sg == NULL)) - return NULL; - - ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN - | ATC_SRC_ADDR_MODE_INCR - | ATC_DST_ADDR_MODE_INCR - | ATC_FC_MEM2MEM; - - /* - * loop until there is either no more source or no more destination - * scatterlist entry - */ - while (true) { - - /* prepare the next transfer */ - if (dst_len == 0) { - - /* no more destination scatterlist entries */ - if (!dst_sg || !dst_nents) - break; - - dst = sg_dma_address(dst_sg); - dst_len = sg_dma_len(dst_sg); - - dst_sg = sg_next(dst_sg); - 
dst_nents--; - } - - if (src_len == 0) { - - /* no more source scatterlist entries */ - if (!src_sg || !src_nents) - break; - - src = sg_dma_address(src_sg); - src_len = sg_dma_len(src_sg); - - src_sg = sg_next(src_sg); - src_nents--; - } - - len = min_t(size_t, src_len, dst_len); - if (len == 0) - continue; - - /* take care for the alignment */ - src_width = dst_width = atc_get_xfer_width(src, dst, len); - - ctrla = ATC_SRC_WIDTH(src_width) | - ATC_DST_WIDTH(dst_width); - - /* - * The number of transfers to set up refer to the source width - * that depends on the alignment. - */ - xfer_count = len >> src_width; - if (xfer_count > ATC_BTSIZE_MAX) { - xfer_count = ATC_BTSIZE_MAX; - len = ATC_BTSIZE_MAX << src_width; - } - - /* create the transfer */ - desc = atc_desc_get(atchan); - if (!desc) - goto err_desc_get; - - desc->lli.saddr = src; - desc->lli.daddr = dst; - desc->lli.ctrla = ctrla | xfer_count; - desc->lli.ctrlb = ctrlb; - - desc->txd.cookie = 0; - desc->len = len; - - atc_desc_chain(&first, &prev, desc); - - /* update the lengths and addresses for the next loop cycle */ - dst_len -= len; - src_len -= len; - dst += len; - src += len; - - total_len += len; - } - - /* First descriptor of the chain embedds additional information */ - first->txd.cookie = -EBUSY; - first->total_len = total_len; - - /* set end-of-link to the last link descriptor of list*/ - set_desc_eol(desc); - - first->txd.flags = flags; /* client is in control of this ack */ - - return &first->txd; - -err_desc_get: - atc_desc_put(atchan, first); - return NULL; -} - /** * atc_dma_cyclic_check_values * Check for too big/unaligned periods and unaligned DMA buffer @@ -1933,14 +1801,12 @@ static int __init at_dma_probe(struct platform_device *pdev) /* setup platform data for each SoC */ dma_cap_set(DMA_MEMCPY, at91sam9rl_config.cap_mask); - dma_cap_set(DMA_SG, at91sam9rl_config.cap_mask); dma_cap_set(DMA_INTERLEAVE, at91sam9g45_config.cap_mask); dma_cap_set(DMA_MEMCPY, at91sam9g45_config.cap_mask); dma_cap_set(DMA_MEMSET, at91sam9g45_config.cap_mask); dma_cap_set(DMA_MEMSET_SG, at91sam9g45_config.cap_mask); dma_cap_set(DMA_PRIVATE, at91sam9g45_config.cap_mask); dma_cap_set(DMA_SLAVE, at91sam9g45_config.cap_mask); - dma_cap_set(DMA_SG, at91sam9g45_config.cap_mask); /* get DMA parameters from controller type */ plat_dat = at_dma_get_driver_data(pdev); @@ -2078,16 +1944,12 @@ static int __init at_dma_probe(struct platform_device *pdev) atdma->dma_common.residue_granularity = DMA_RESIDUE_GRANULARITY_BURST; } - if (dma_has_cap(DMA_SG, atdma->dma_common.cap_mask)) - atdma->dma_common.device_prep_dma_sg = atc_prep_dma_sg; - dma_writel(atdma, EN, AT_DMA_ENABLE); - dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s%s%s), %d channels\n", + dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s%s), %d channels\n", dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "", dma_has_cap(DMA_MEMSET, atdma->dma_common.cap_mask) ? "set " : "", dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ? "slave " : "", - dma_has_cap(DMA_SG, atdma->dma_common.cap_mask) ? 
"sg-cpy " : "", plat_dat->nr_channels); dma_async_device_register(&atdma->dma_common); diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index d9118ec23025..428b1414263a 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -937,8 +937,6 @@ int dma_async_device_register(struct dma_device *device) !device->device_prep_dma_memset); BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) && !device->device_prep_dma_interrupt); - BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) && - !device->device_prep_dma_sg); BUG_ON(dma_has_cap(DMA_CYCLIC, device->cap_mask) && !device->device_prep_dma_cyclic); BUG_ON(dma_has_cap(DMA_INTERLEAVE, device->cap_mask) && diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 35cb83b39192..34ff53290b03 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -52,15 +52,10 @@ module_param(iterations, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(iterations, "Iterations before stopping test (default: infinite)"); -static unsigned int sg_buffers = 1; -module_param(sg_buffers, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(sg_buffers, - "Number of scatter gather buffers (default: 1)"); - static unsigned int dmatest; module_param(dmatest, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dmatest, - "dmatest 0-memcpy 1-slave_sg 2-memset (default: 0)"); + "dmatest 0-memcpy 1-memset (default: 0)"); static unsigned int xor_sources = 3; module_param(xor_sources, uint, S_IRUGO | S_IWUSR); @@ -471,9 +466,6 @@ static int dmatest_func(void *data) align = dev->fill_align; src_cnt = dst_cnt = 1; is_memset = true; - } else if (thread->type == DMA_SG) { - align = dev->copy_align; - src_cnt = dst_cnt = sg_buffers; } else if (thread->type == DMA_XOR) { /* force odd to ensure dst = src */ src_cnt = min_odd(params->xor_sources | 1, dev->max_xor); @@ -553,8 +545,6 @@ static int dmatest_func(void *data) dma_addr_t srcs[src_cnt]; dma_addr_t *dsts; unsigned int src_off, dst_off, len; - struct scatterlist tx_sg[src_cnt]; - struct scatterlist rx_sg[src_cnt]; total_tests++; @@ -650,15 +640,6 @@ static int dmatest_func(void *data) um->bidi_cnt++; } - sg_init_table(tx_sg, src_cnt); - sg_init_table(rx_sg, src_cnt); - for (i = 0; i < src_cnt; i++) { - sg_dma_address(&rx_sg[i]) = srcs[i]; - sg_dma_address(&tx_sg[i]) = dsts[i] + dst_off; - sg_dma_len(&tx_sg[i]) = len; - sg_dma_len(&rx_sg[i]) = len; - } - if (thread->type == DMA_MEMCPY) tx = dev->device_prep_dma_memcpy(chan, dsts[0] + dst_off, @@ -668,9 +649,6 @@ static int dmatest_func(void *data) dsts[0] + dst_off, *(thread->srcs[0] + src_off), len, flags); - else if (thread->type == DMA_SG) - tx = dev->device_prep_dma_sg(chan, tx_sg, src_cnt, - rx_sg, src_cnt, flags); else if (thread->type == DMA_XOR) tx = dev->device_prep_dma_xor(chan, dsts[0] + dst_off, @@ -853,8 +831,6 @@ static int dmatest_add_threads(struct dmatest_info *info, op = "copy"; else if (type == DMA_MEMSET) op = "set"; - else if (type == DMA_SG) - op = "sg"; else if (type == DMA_XOR) op = "xor"; else if (type == DMA_PQ) @@ -916,15 +892,8 @@ static int dmatest_add_channel(struct dmatest_info *info, } if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) { - if (dmatest == 2) { - cnt = dmatest_add_threads(info, dtc, DMA_MEMSET); - thread_count += cnt > 0 ? cnt : 0; - } - } - - if (dma_has_cap(DMA_SG, dma_dev->cap_mask)) { if (dmatest == 1) { - cnt = dmatest_add_threads(info, dtc, DMA_SG); + cnt = dmatest_add_threads(info, dtc, DMA_MEMSET); thread_count += cnt > 0 ? 
cnt : 0; } } @@ -1002,7 +971,6 @@ static void run_threaded_test(struct dmatest_info *info) request_channels(info, DMA_MEMCPY); request_channels(info, DMA_MEMSET); request_channels(info, DMA_XOR); - request_channels(info, DMA_SG); request_channels(info, DMA_PQ); } diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 3b8b752ede2d..3eaece888e75 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -825,122 +825,6 @@ fail: return NULL; } -static struct dma_async_tx_descriptor *fsl_dma_prep_sg(struct dma_chan *dchan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, - unsigned long flags) -{ - struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL; - struct fsldma_chan *chan = to_fsl_chan(dchan); - size_t dst_avail, src_avail; - dma_addr_t dst, src; - size_t len; - - /* basic sanity checks */ - if (dst_nents == 0 || src_nents == 0) - return NULL; - - if (dst_sg == NULL || src_sg == NULL) - return NULL; - - /* - * TODO: should we check that both scatterlists have the same - * TODO: number of bytes in total? Is that really an error? - */ - - /* get prepared for the loop */ - dst_avail = sg_dma_len(dst_sg); - src_avail = sg_dma_len(src_sg); - - /* run until we are out of scatterlist entries */ - while (true) { - - /* create the largest transaction possible */ - len = min_t(size_t, src_avail, dst_avail); - len = min_t(size_t, len, FSL_DMA_BCR_MAX_CNT); - if (len == 0) - goto fetch; - - dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - dst_avail; - src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - src_avail; - - /* allocate and populate the descriptor */ - new = fsl_dma_alloc_descriptor(chan); - if (!new) { - chan_err(chan, "%s\n", msg_ld_oom); - goto fail; - } - - set_desc_cnt(chan, &new->hw, len); - set_desc_src(chan, &new->hw, src); - set_desc_dst(chan, &new->hw, dst); - - if (!first) - first = new; - else - set_desc_next(chan, &prev->hw, new->async_tx.phys); - - new->async_tx.cookie = 0; - async_tx_ack(&new->async_tx); - prev = new; - - /* Insert the link descriptor to the LD ring */ - list_add_tail(&new->node, &first->tx_list); - - /* update metadata */ - dst_avail -= len; - src_avail -= len; - -fetch: - /* fetch the next dst scatterlist entry */ - if (dst_avail == 0) { - - /* no more entries: we're done */ - if (dst_nents == 0) - break; - - /* fetch the next entry: if there are no more: done */ - dst_sg = sg_next(dst_sg); - if (dst_sg == NULL) - break; - - dst_nents--; - dst_avail = sg_dma_len(dst_sg); - } - - /* fetch the next src scatterlist entry */ - if (src_avail == 0) { - - /* no more entries: we're done */ - if (src_nents == 0) - break; - - /* fetch the next entry: if there are no more: done */ - src_sg = sg_next(src_sg); - if (src_sg == NULL) - break; - - src_nents--; - src_avail = sg_dma_len(src_sg); - } - } - - new->async_tx.flags = flags; /* client is in control of this ack */ - new->async_tx.cookie = -EBUSY; - - /* Set End-of-link to the last link descriptor of new list */ - set_ld_eol(chan, new); - - return &first->async_tx; - -fail: - if (!first) - return NULL; - - fsldma_free_desc_list_reverse(chan, &first->tx_list); - return NULL; -} - static int fsl_dma_device_terminate_all(struct dma_chan *dchan) { struct fsldma_chan *chan; @@ -1357,12 +1241,10 @@ static int fsldma_of_probe(struct platform_device *op) fdev->irq = irq_of_parse_and_map(op->dev.of_node, 0); dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask); - dma_cap_set(DMA_SG, fdev->common.cap_mask); dma_cap_set(DMA_SLAVE, 
fdev->common.cap_mask); fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources; fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources; fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy; - fdev->common.device_prep_dma_sg = fsl_dma_prep_sg; fdev->common.device_tx_status = fsl_tx_status; fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending; fdev->common.device_config = fsl_dma_device_config; diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 25bc5b103aa2..1993889003fd 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -68,36 +68,6 @@ static void mv_desc_init(struct mv_xor_desc_slot *desc, hw_desc->byte_count = byte_count; } -/* Populate the descriptor */ -static void mv_xor_config_sg_ll_desc(struct mv_xor_desc_slot *desc, - dma_addr_t dma_src, dma_addr_t dma_dst, - u32 len, struct mv_xor_desc_slot *prev) -{ - struct mv_xor_desc *hw_desc = desc->hw_desc; - - hw_desc->status = XOR_DESC_DMA_OWNED; - hw_desc->phy_next_desc = 0; - /* Configure for XOR with only one src address -> MEMCPY */ - hw_desc->desc_command = XOR_DESC_OPERATION_XOR | (0x1 << 0); - hw_desc->phy_dest_addr = dma_dst; - hw_desc->phy_src_addr[0] = dma_src; - hw_desc->byte_count = len; - - if (prev) { - struct mv_xor_desc *hw_prev = prev->hw_desc; - - hw_prev->phy_next_desc = desc->async_tx.phys; - } -} - -static void mv_xor_desc_config_eod(struct mv_xor_desc_slot *desc) -{ - struct mv_xor_desc *hw_desc = desc->hw_desc; - - /* Enable end-of-descriptor interrupt */ - hw_desc->desc_command |= XOR_DESC_EOD_INT_EN; -} - static void mv_desc_set_mode(struct mv_xor_desc_slot *desc) { struct mv_xor_desc *hw_desc = desc->hw_desc; @@ -662,132 +632,6 @@ mv_xor_prep_dma_interrupt(struct dma_chan *chan, unsigned long flags) return mv_xor_prep_dma_xor(chan, dest, &src, 1, len, flags); } -/** - * mv_xor_prep_dma_sg - prepare descriptors for a memory sg transaction - * @chan: DMA channel - * @dst_sg: Destination scatter list - * @dst_sg_len: Number of entries in destination scatter list - * @src_sg: Source scatter list - * @src_sg_len: Number of entries in source scatter list - * @flags: transfer ack flags - * - * Return: Async transaction descriptor on success and NULL on failure - */ -static struct dma_async_tx_descriptor * -mv_xor_prep_dma_sg(struct dma_chan *chan, struct scatterlist *dst_sg, - unsigned int dst_sg_len, struct scatterlist *src_sg, - unsigned int src_sg_len, unsigned long flags) -{ - struct mv_xor_chan *mv_chan = to_mv_xor_chan(chan); - struct mv_xor_desc_slot *new; - struct mv_xor_desc_slot *first = NULL; - struct mv_xor_desc_slot *prev = NULL; - size_t len, dst_avail, src_avail; - dma_addr_t dma_dst, dma_src; - int desc_cnt = 0; - int ret; - - dev_dbg(mv_chan_to_devp(mv_chan), - "%s dst_sg_len: %d src_sg_len: %d flags: %ld\n", - __func__, dst_sg_len, src_sg_len, flags); - - dst_avail = sg_dma_len(dst_sg); - src_avail = sg_dma_len(src_sg); - - /* Run until we are out of scatterlist entries */ - while (true) { - /* Allocate and populate the descriptor */ - desc_cnt++; - new = mv_chan_alloc_slot(mv_chan); - if (!new) { - dev_err(mv_chan_to_devp(mv_chan), - "Out of descriptors (desc_cnt=%d)!\n", - desc_cnt); - goto err; - } - - len = min_t(size_t, src_avail, dst_avail); - len = min_t(size_t, len, MV_XOR_MAX_BYTE_COUNT); - if (len == 0) - goto fetch; - - if (len < MV_XOR_MIN_BYTE_COUNT) { - dev_err(mv_chan_to_devp(mv_chan), - "Transfer size of %zu too small!\n", len); - goto err; - } - - dma_dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - - dst_avail; 
- dma_src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - - src_avail; - - /* Check if a new window needs to get added for 'dst' */ - ret = mv_xor_add_io_win(mv_chan, dma_dst); - if (ret) - goto err; - - /* Check if a new window needs to get added for 'src' */ - ret = mv_xor_add_io_win(mv_chan, dma_src); - if (ret) - goto err; - - /* Populate the descriptor */ - mv_xor_config_sg_ll_desc(new, dma_src, dma_dst, len, prev); - prev = new; - dst_avail -= len; - src_avail -= len; - - if (!first) - first = new; - else - list_move_tail(&new->node, &first->sg_tx_list); - -fetch: - /* Fetch the next dst scatterlist entry */ - if (dst_avail == 0) { - if (dst_sg_len == 0) - break; - - /* Fetch the next entry: if there are no more: done */ - dst_sg = sg_next(dst_sg); - if (dst_sg == NULL) - break; - - dst_sg_len--; - dst_avail = sg_dma_len(dst_sg); - } - - /* Fetch the next src scatterlist entry */ - if (src_avail == 0) { - if (src_sg_len == 0) - break; - - /* Fetch the next entry: if there are no more: done */ - src_sg = sg_next(src_sg); - if (src_sg == NULL) - break; - - src_sg_len--; - src_avail = sg_dma_len(src_sg); - } - } - - /* Set the EOD flag in the last descriptor */ - mv_xor_desc_config_eod(new); - first->async_tx.flags = flags; - - return &first->async_tx; - -err: - /* Cleanup: Move all descriptors back into the free list */ - spin_lock_bh(&mv_chan->lock); - mv_desc_clean_slot(first, mv_chan); - spin_unlock_bh(&mv_chan->lock); - - return NULL; -} - static void mv_xor_free_chan_resources(struct dma_chan *chan) { struct mv_xor_chan *mv_chan = to_mv_xor_chan(chan); @@ -1254,8 +1098,6 @@ mv_xor_channel_add(struct mv_xor_device *xordev, dma_dev->device_prep_dma_interrupt = mv_xor_prep_dma_interrupt; if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) dma_dev->device_prep_dma_memcpy = mv_xor_prep_dma_memcpy; - if (dma_has_cap(DMA_SG, dma_dev->cap_mask)) - dma_dev->device_prep_dma_sg = mv_xor_prep_dma_sg; if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { dma_dev->max_xor = 8; dma_dev->device_prep_dma_xor = mv_xor_prep_dma_xor; @@ -1305,11 +1147,10 @@ mv_xor_channel_add(struct mv_xor_device *xordev, goto err_free_irq; } - dev_info(&pdev->dev, "Marvell XOR (%s): ( %s%s%s%s)\n", + dev_info(&pdev->dev, "Marvell XOR (%s): ( %s%s%s)\n", mv_chan->op_in_desc ? "Descriptor Mode" : "Registers Mode", dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "", dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "", - dma_has_cap(DMA_SG, dma_dev->cap_mask) ? "sg " : "", dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? 
"intr " : ""); dma_async_device_register(dma_dev); @@ -1552,7 +1393,6 @@ static int mv_xor_probe(struct platform_device *pdev) dma_cap_zero(cap_mask); dma_cap_set(DMA_MEMCPY, cap_mask); - dma_cap_set(DMA_SG, cap_mask); dma_cap_set(DMA_XOR, cap_mask); dma_cap_set(DMA_INTERRUPT, cap_mask); diff --git a/drivers/dma/nbpfaxi.c b/drivers/dma/nbpfaxi.c index 3f45b9bdf201..d3f918a9ee76 100644 --- a/drivers/dma/nbpfaxi.c +++ b/drivers/dma/nbpfaxi.c @@ -1005,21 +1005,6 @@ static struct dma_async_tx_descriptor *nbpf_prep_memcpy( DMA_MEM_TO_MEM, flags); } -static struct dma_async_tx_descriptor *nbpf_prep_memcpy_sg( - struct dma_chan *dchan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, - unsigned long flags) -{ - struct nbpf_channel *chan = nbpf_to_chan(dchan); - - if (dst_nents != src_nents) - return NULL; - - return nbpf_prep_sg(chan, src_sg, dst_sg, src_nents, - DMA_MEM_TO_MEM, flags); -} - static struct dma_async_tx_descriptor *nbpf_prep_slave_sg( struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction direction, unsigned long flags, void *context) @@ -1417,13 +1402,11 @@ static int nbpf_probe(struct platform_device *pdev) dma_cap_set(DMA_MEMCPY, dma_dev->cap_mask); dma_cap_set(DMA_SLAVE, dma_dev->cap_mask); dma_cap_set(DMA_PRIVATE, dma_dev->cap_mask); - dma_cap_set(DMA_SG, dma_dev->cap_mask); /* Common and MEMCPY operations */ dma_dev->device_alloc_chan_resources = nbpf_alloc_chan_resources; dma_dev->device_free_chan_resources = nbpf_free_chan_resources; - dma_dev->device_prep_dma_sg = nbpf_prep_memcpy_sg; dma_dev->device_prep_dma_memcpy = nbpf_prep_memcpy; dma_dev->device_tx_status = nbpf_tx_status; dma_dev->device_issue_pending = nbpf_issue_pending; diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index c3052fbfd092..97e1d8b00e22 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -2484,19 +2484,6 @@ static struct dma_async_tx_descriptor *d40_prep_memcpy(struct dma_chan *chan, DMA_MEM_TO_MEM, dma_flags); } -static struct dma_async_tx_descriptor * -d40_prep_memcpy_sg(struct dma_chan *chan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, - unsigned long dma_flags) -{ - if (dst_nents != src_nents) - return NULL; - - return d40_prep_sg(chan, src_sg, dst_sg, src_nents, - DMA_MEM_TO_MEM, dma_flags); -} - static struct dma_async_tx_descriptor * d40_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_transfer_direction direction, @@ -2821,9 +2808,6 @@ static void d40_ops_init(struct d40_base *base, struct dma_device *dev) dev->copy_align = DMAENGINE_ALIGN_4_BYTES; } - if (dma_has_cap(DMA_SG, dev->cap_mask)) - dev->device_prep_dma_sg = d40_prep_memcpy_sg; - if (dma_has_cap(DMA_CYCLIC, dev->cap_mask)) dev->device_prep_dma_cyclic = dma40_prep_dma_cyclic; @@ -2865,7 +2849,6 @@ static int __init d40_dmaengine_init(struct d40_base *base, dma_cap_zero(base->dma_memcpy.cap_mask); dma_cap_set(DMA_MEMCPY, base->dma_memcpy.cap_mask); - dma_cap_set(DMA_SG, base->dma_memcpy.cap_mask); d40_ops_init(base, &base->dma_memcpy); @@ -2883,7 +2866,6 @@ static int __init d40_dmaengine_init(struct d40_base *base, dma_cap_zero(base->dma_both.cap_mask); dma_cap_set(DMA_SLAVE, base->dma_both.cap_mask); dma_cap_set(DMA_MEMCPY, base->dma_both.cap_mask); - dma_cap_set(DMA_SG, base->dma_both.cap_mask); dma_cap_set(DMA_CYCLIC, base->dma_slave.cap_mask); d40_ops_init(base, &base->dma_both); diff --git 
a/drivers/dma/xgene-dma.c b/drivers/dma/xgene-dma.c index 8b693b712d0f..8648d2394ab6 100644 --- a/drivers/dma/xgene-dma.c +++ b/drivers/dma/xgene-dma.c @@ -425,48 +425,6 @@ static void xgene_dma_init_desc(struct xgene_dma_desc_hw *desc, XGENE_DMA_DESC_HOENQ_NUM_POS); } -static void xgene_dma_prep_cpy_desc(struct xgene_dma_chan *chan, - struct xgene_dma_desc_sw *desc_sw, - dma_addr_t dst, dma_addr_t src, - size_t len) -{ - struct xgene_dma_desc_hw *desc1, *desc2; - int i; - - /* Get 1st descriptor */ - desc1 = &desc_sw->desc1; - xgene_dma_init_desc(desc1, chan->tx_ring.dst_ring_num); - - /* Set destination address */ - desc1->m2 |= cpu_to_le64(XGENE_DMA_DESC_DR_BIT); - desc1->m3 |= cpu_to_le64(dst); - - /* Set 1st source address */ - xgene_dma_set_src_buffer(&desc1->m1, &len, &src); - - if (!len) - return; - - /* - * We need to split this source buffer, - * and need to use 2nd descriptor - */ - desc2 = &desc_sw->desc2; - desc1->m0 |= cpu_to_le64(XGENE_DMA_DESC_NV_BIT); - - /* Set 2nd to 5th source address */ - for (i = 0; i < 4 && len; i++) - xgene_dma_set_src_buffer(xgene_dma_lookup_ext8(desc2, i), - &len, &src); - - /* Invalidate unused source address field */ - for (; i < 4; i++) - xgene_dma_invalidate_buffer(xgene_dma_lookup_ext8(desc2, i)); - - /* Updated flag that we have prepared 64B descriptor */ - desc_sw->flags |= XGENE_DMA_FLAG_64B_DESC; -} - static void xgene_dma_prep_xor_desc(struct xgene_dma_chan *chan, struct xgene_dma_desc_sw *desc_sw, dma_addr_t *dst, dma_addr_t *src, @@ -891,114 +849,6 @@ static void xgene_dma_free_chan_resources(struct dma_chan *dchan) chan->desc_pool = NULL; } -static struct dma_async_tx_descriptor *xgene_dma_prep_sg( - struct dma_chan *dchan, struct scatterlist *dst_sg, - u32 dst_nents, struct scatterlist *src_sg, - u32 src_nents, unsigned long flags) -{ - struct xgene_dma_desc_sw *first = NULL, *new = NULL; - struct xgene_dma_chan *chan; - size_t dst_avail, src_avail; - dma_addr_t dst, src; - size_t len; - - if (unlikely(!dchan)) - return NULL; - - if (unlikely(!dst_nents || !src_nents)) - return NULL; - - if (unlikely(!dst_sg || !src_sg)) - return NULL; - - chan = to_dma_chan(dchan); - - /* Get prepared for the loop */ - dst_avail = sg_dma_len(dst_sg); - src_avail = sg_dma_len(src_sg); - dst_nents--; - src_nents--; - - /* Run until we are out of scatterlist entries */ - while (true) { - /* Create the largest transaction possible */ - len = min_t(size_t, src_avail, dst_avail); - len = min_t(size_t, len, XGENE_DMA_MAX_64B_DESC_BYTE_CNT); - if (len == 0) - goto fetch; - - dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - dst_avail; - src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - src_avail; - - /* Allocate the link descriptor from DMA pool */ - new = xgene_dma_alloc_descriptor(chan); - if (!new) - goto fail; - - /* Prepare DMA descriptor */ - xgene_dma_prep_cpy_desc(chan, new, dst, src, len); - - if (!first) - first = new; - - new->tx.cookie = 0; - async_tx_ack(&new->tx); - - /* update metadata */ - dst_avail -= len; - src_avail -= len; - - /* Insert the link descriptor to the LD ring */ - list_add_tail(&new->node, &first->tx_list); - -fetch: - /* fetch the next dst scatterlist entry */ - if (dst_avail == 0) { - /* no more entries: we're done */ - if (dst_nents == 0) - break; - - /* fetch the next entry: if there are no more: done */ - dst_sg = sg_next(dst_sg); - if (!dst_sg) - break; - - dst_nents--; - dst_avail = sg_dma_len(dst_sg); - } - - /* fetch the next src scatterlist entry */ - if (src_avail == 0) { - /* no more entries: we're done */ - 
if (src_nents == 0) - break; - - /* fetch the next entry: if there are no more: done */ - src_sg = sg_next(src_sg); - if (!src_sg) - break; - - src_nents--; - src_avail = sg_dma_len(src_sg); - } - } - - if (!new) - return NULL; - - new->tx.flags = flags; /* client is in control of this ack */ - new->tx.cookie = -EBUSY; - list_splice(&first->tx_list, &new->tx_list); - - return &new->tx; -fail: - if (!first) - return NULL; - - xgene_dma_free_desc_list(chan, &first->tx_list); - return NULL; -} - static struct dma_async_tx_descriptor *xgene_dma_prep_xor( struct dma_chan *dchan, dma_addr_t dst, dma_addr_t *src, u32 src_cnt, size_t len, unsigned long flags) @@ -1653,7 +1503,6 @@ static void xgene_dma_set_caps(struct xgene_dma_chan *chan, dma_cap_zero(dma_dev->cap_mask); /* Set DMA device capability */ - dma_cap_set(DMA_SG, dma_dev->cap_mask); /* Basically here, the X-Gene SoC DMA engine channel 0 supports XOR * and channel 1 supports XOR, PQ both. First thing here is we have @@ -1679,7 +1528,6 @@ static void xgene_dma_set_caps(struct xgene_dma_chan *chan, dma_dev->device_free_chan_resources = xgene_dma_free_chan_resources; dma_dev->device_issue_pending = xgene_dma_issue_pending; dma_dev->device_tx_status = xgene_dma_tx_status; - dma_dev->device_prep_dma_sg = xgene_dma_prep_sg; if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { dma_dev->device_prep_dma_xor = xgene_dma_prep_xor; @@ -1731,8 +1579,7 @@ static int xgene_dma_async_register(struct xgene_dma *pdma, int id) /* DMA capability info */ dev_info(pdma->dev, - "%s: CAPABILITY ( %s%s%s)\n", dma_chan_name(&chan->dma_chan), - dma_has_cap(DMA_SG, dma_dev->cap_mask) ? "SGCPY " : "", + "%s: CAPABILITY ( %s%s)\n", dma_chan_name(&chan->dma_chan), dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "XOR " : "", dma_has_cap(DMA_PQ, dma_dev->cap_mask) ? 
"PQ " : ""); diff --git a/drivers/dma/xilinx/zynqmp_dma.c b/drivers/dma/xilinx/zynqmp_dma.c index 47f64192d2fd..1ee1241ca797 100644 --- a/drivers/dma/xilinx/zynqmp_dma.c +++ b/drivers/dma/xilinx/zynqmp_dma.c @@ -829,98 +829,6 @@ static struct dma_async_tx_descriptor *zynqmp_dma_prep_memcpy( return &first->async_tx; } -/** - * zynqmp_dma_prep_slave_sg - prepare descriptors for a memory sg transaction - * @dchan: DMA channel - * @dst_sg: Destination scatter list - * @dst_sg_len: Number of entries in destination scatter list - * @src_sg: Source scatter list - * @src_sg_len: Number of entries in source scatter list - * @flags: transfer ack flags - * - * Return: Async transaction descriptor on success and NULL on failure - */ -static struct dma_async_tx_descriptor *zynqmp_dma_prep_sg( - struct dma_chan *dchan, struct scatterlist *dst_sg, - unsigned int dst_sg_len, struct scatterlist *src_sg, - unsigned int src_sg_len, unsigned long flags) -{ - struct zynqmp_dma_desc_sw *new, *first = NULL; - struct zynqmp_dma_chan *chan = to_chan(dchan); - void *desc = NULL, *prev = NULL; - size_t len, dst_avail, src_avail; - dma_addr_t dma_dst, dma_src; - u32 desc_cnt = 0, i; - struct scatterlist *sg; - - for_each_sg(src_sg, sg, src_sg_len, i) - desc_cnt += DIV_ROUND_UP(sg_dma_len(sg), - ZYNQMP_DMA_MAX_TRANS_LEN); - - spin_lock_bh(&chan->lock); - if (desc_cnt > chan->desc_free_cnt) { - spin_unlock_bh(&chan->lock); - dev_dbg(chan->dev, "chan %p descs are not available\n", chan); - return NULL; - } - chan->desc_free_cnt = chan->desc_free_cnt - desc_cnt; - spin_unlock_bh(&chan->lock); - - dst_avail = sg_dma_len(dst_sg); - src_avail = sg_dma_len(src_sg); - - /* Run until we are out of scatterlist entries */ - while (true) { - /* Allocate and populate the descriptor */ - new = zynqmp_dma_get_descriptor(chan); - desc = (struct zynqmp_dma_desc_ll *)new->src_v; - len = min_t(size_t, src_avail, dst_avail); - len = min_t(size_t, len, ZYNQMP_DMA_MAX_TRANS_LEN); - if (len == 0) - goto fetch; - dma_dst = sg_dma_address(dst_sg) + sg_dma_len(dst_sg) - - dst_avail; - dma_src = sg_dma_address(src_sg) + sg_dma_len(src_sg) - - src_avail; - - zynqmp_dma_config_sg_ll_desc(chan, desc, dma_src, dma_dst, - len, prev); - prev = desc; - dst_avail -= len; - src_avail -= len; - - if (!first) - first = new; - else - list_add_tail(&new->node, &first->tx_list); -fetch: - /* Fetch the next dst scatterlist entry */ - if (dst_avail == 0) { - if (dst_sg_len == 0) - break; - dst_sg = sg_next(dst_sg); - if (dst_sg == NULL) - break; - dst_sg_len--; - dst_avail = sg_dma_len(dst_sg); - } - /* Fetch the next src scatterlist entry */ - if (src_avail == 0) { - if (src_sg_len == 0) - break; - src_sg = sg_next(src_sg); - if (src_sg == NULL) - break; - src_sg_len--; - src_avail = sg_dma_len(src_sg); - } - } - - zynqmp_dma_desc_config_eod(chan, desc); - first->async_tx.flags = flags; - return &first->async_tx; -} - /** * zynqmp_dma_chan_remove - Channel remove function * @chan: ZynqMP DMA channel pointer @@ -1064,11 +972,9 @@ static int zynqmp_dma_probe(struct platform_device *pdev) INIT_LIST_HEAD(&zdev->common.channels); dma_set_mask(&pdev->dev, DMA_BIT_MASK(44)); - dma_cap_set(DMA_SG, zdev->common.cap_mask); dma_cap_set(DMA_MEMCPY, zdev->common.cap_mask); p = &zdev->common; - p->device_prep_dma_sg = zynqmp_dma_prep_sg; p->device_prep_dma_memcpy = zynqmp_dma_prep_memcpy; p->device_terminate_all = zynqmp_dma_device_terminate_all; p->device_issue_pending = zynqmp_dma_issue_pending; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 
533680860865..64fbd380c430 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -68,7 +68,6 @@ enum dma_transaction_type { DMA_MEMSET, DMA_MEMSET_SG, DMA_INTERRUPT, - DMA_SG, DMA_PRIVATE, DMA_ASYNC_TX, DMA_SLAVE, @@ -771,11 +770,6 @@ struct dma_device { unsigned int nents, int value, unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)( struct dma_chan *chan, unsigned long flags); - struct dma_async_tx_descriptor *(*device_prep_dma_sg)( - struct dma_chan *chan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, - unsigned long flags); struct dma_async_tx_descriptor *(*device_prep_slave_sg)( struct dma_chan *chan, struct scatterlist *sgl, @@ -905,19 +899,6 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_memcpy( len, flags); } -static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_sg( - struct dma_chan *chan, - struct scatterlist *dst_sg, unsigned int dst_nents, - struct scatterlist *src_sg, unsigned int src_nents, - unsigned long flags) -{ - if (!chan || !chan->device || !chan->device->device_prep_dma_sg) - return NULL; - - return chan->device->device_prep_dma_sg(chan, dst_sg, dst_nents, - src_sg, src_nents, flags); -} - /** * dmaengine_terminate_all() - Terminate all active DMA transfers * @chan: The channel for which to terminate the transfers -- cgit v1.2.3 From 6f7473c524cc4f875dcd9397ec9a6ec039bd08b6 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 10 Aug 2017 14:53:52 +0200 Subject: crypto: hash - add crypto_(un)register_ahashes() There are already helpers to (un)register multiple normal and AEAD algos. Add one for ahashes too. Signed-off-by: Lars Persson Signed-off-by: Rabin Vincent Signed-off-by: Herbert Xu --- crypto/ahash.c | 29 +++++++++++++++++++++++++++++ include/crypto/internal/hash.h | 2 ++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/crypto/ahash.c b/crypto/ahash.c index 826cd7ab4d4a..5e8666e6ccae 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -588,6 +588,35 @@ int crypto_unregister_ahash(struct ahash_alg *alg) } EXPORT_SYMBOL_GPL(crypto_unregister_ahash); +int crypto_register_ahashes(struct ahash_alg *algs, int count) +{ + int i, ret; + + for (i = 0; i < count; i++) { + ret = crypto_register_ahash(&algs[i]); + if (ret) + goto err; + } + + return 0; + +err: + for (--i; i >= 0; --i) + crypto_unregister_ahash(&algs[i]); + + return ret; +} +EXPORT_SYMBOL_GPL(crypto_register_ahashes); + +void crypto_unregister_ahashes(struct ahash_alg *algs, int count) +{ + int i; + + for (i = count - 1; i >= 0; --i) + crypto_unregister_ahash(&algs[i]); +} +EXPORT_SYMBOL_GPL(crypto_unregister_ahashes); + int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst) { diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index f6d9af3efa45..f0b44c16e88f 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -76,6 +76,8 @@ static inline int crypto_ahash_walk_last(struct crypto_hash_walk *walk) int crypto_register_ahash(struct ahash_alg *alg); int crypto_unregister_ahash(struct ahash_alg *alg); +int crypto_register_ahashes(struct ahash_alg *algs, int count); +void crypto_unregister_ahashes(struct ahash_alg *algs, int count); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); void ahash_free_instance(struct crypto_instance *inst); -- cgit v1.2.3 From 67dfabe3a01be6b0cc4c0056aa546f0279ed6279 Mon Sep 17 00:00:00 2001 
From: Sebastian Reichel Date: Mon, 21 Aug 2017 16:54:01 +0200 Subject: mfd: da9052: Add register details for TSI Add register details and channel definitions for using the TSI registers in the hwmon driver. Signed-off-by: Sebastian Reichel Signed-off-by: Lee Jones --- include/linux/mfd/da9052/da9052.h | 6 ++++++ include/linux/mfd/da9052/reg.h | 11 ++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mfd/da9052/da9052.h b/include/linux/mfd/da9052/da9052.h index ce9230af09c2..ae5b663836d0 100644 --- a/include/linux/mfd/da9052/da9052.h +++ b/include/linux/mfd/da9052/da9052.h @@ -45,6 +45,12 @@ #define DA9052_ADC_TJUNC 8 #define DA9052_ADC_VBBAT 9 +/* TSI channel has its own 4 channel mux */ +#define DA9052_ADC_TSI_XP 70 +#define DA9052_ADC_TSI_XN 71 +#define DA9052_ADC_TSI_YP 72 +#define DA9052_ADC_TSI_YN 73 + #define DA9052_IRQ_DCIN 0 #define DA9052_IRQ_VBUS 1 #define DA9052_IRQ_DCINREM 2 diff --git a/include/linux/mfd/da9052/reg.h b/include/linux/mfd/da9052/reg.h index 5010f978725c..76780ea8849c 100644 --- a/include/linux/mfd/da9052/reg.h +++ b/include/linux/mfd/da9052/reg.h @@ -690,7 +690,10 @@ /* TSI CONTROL REGISTER B BITS */ #define DA9052_TSICONTB_ADCREF 0X80 #define DA9052_TSICONTB_TSIMAN 0X40 -#define DA9052_TSICONTB_TSIMUX 0X30 +#define DA9052_TSICONTB_TSIMUX_XP 0X00 +#define DA9052_TSICONTB_TSIMUX_YP 0X10 +#define DA9052_TSICONTB_TSIMUX_XN 0X20 +#define DA9052_TSICONTB_TSIMUX_YN 0X30 #define DA9052_TSICONTB_TSISEL3 0X08 #define DA9052_TSICONTB_TSISEL2 0X04 #define DA9052_TSICONTB_TSISEL1 0X02 @@ -705,8 +708,14 @@ /* TSI CO-ORDINATE LSB RESULT REGISTER BITS */ #define DA9052_TSILSB_PENDOWN 0X40 #define DA9052_TSILSB_TSIZL 0X30 +#define DA9052_TSILSB_TSIZL_SHIFT 4 +#define DA9052_TSILSB_TSIZL_BITS 2 #define DA9052_TSILSB_TSIYL 0X0C +#define DA9052_TSILSB_TSIYL_SHIFT 2 +#define DA9052_TSILSB_TSIYL_BITS 2 #define DA9052_TSILSB_TSIXL 0X03 +#define DA9052_TSILSB_TSIXL_SHIFT 0 +#define DA9052_TSILSB_TSIXL_BITS 2 /* TSI Z MEASUREMENT MSB RESULT REGISTER BIT */ #define DA9052_TSIZMSB_TSIZM 0XFF -- cgit v1.2.3 From 25a8ef26fd47e4b60cfae094a43ad9bfc49e676b Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Fri, 18 Aug 2017 16:49:51 +0300 Subject: drm/dp: Add defines for DP SDP types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add defines for the secondary data packet (SDP) types from the spec. These are the DP specific ones, and in addition HDMI infoframe types (see enum hdmi_infoframe_type) are also valid SDP types.
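For illustration only (not part of this patch), a driver might tag an SDP header with one of these defines roughly as follows; the helper name is invented, and only the HB0/HB1 fields of the edp_sdp_header layout shown below are assumed:

	/* hypothetical sketch: tag a VSC SDP header with its type */
	static void example_fill_vsc_sdp_header(struct edp_sdp_header *sdp)
	{
		sdp->HB0 = 0;			/* Secondary Data Packet ID */
		sdp->HB1 = DP_SDP_VSC;		/* Secondary Data Packet Type (DP 1.2) */
	}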
v2: Add more SDP types v3: Note the DP version that added each SDP type (Rodrigo) Cc: dri-devel@lists.freedesktop.org Reviewed-by: Rodrigo Vivi Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20170818134958.15502-2-ville.syrjala@linux.intel.com --- include/drm/drm_dp_helper.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index b17476a6909c..2c412a15cfa1 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -871,6 +871,18 @@ void drm_dp_link_train_channel_eq_delay(const u8 dpcd[DP_RECEIVER_CAP_SIZE]); u8 drm_dp_link_rate_to_bw_code(int link_rate); int drm_dp_bw_code_to_link_rate(u8 link_bw); +#define DP_SDP_AUDIO_TIMESTAMP 0x01 +#define DP_SDP_AUDIO_STREAM 0x02 +#define DP_SDP_EXTENSION 0x04 /* DP 1.1 */ +#define DP_SDP_AUDIO_COPYMANAGEMENT 0x05 /* DP 1.2 */ +#define DP_SDP_ISRC 0x06 /* DP 1.2 */ +#define DP_SDP_VSC 0x07 /* DP 1.2 */ +#define DP_SDP_CAMERA_GENERIC(i) (0x08 + (i)) /* 0-7, DP 1.3 */ +#define DP_SDP_PPS 0x10 /* DP 1.4 */ +#define DP_SDP_VSC_EXT_VESA 0x20 /* DP 1.4 */ +#define DP_SDP_VSC_EXT_CEA 0x21 /* DP 1.4 */ +/* 0x80+ CEA-861 infoframe types */ + struct edp_sdp_header { u8 HB0; /* Secondary Data Packet ID */ u8 HB1; /* Secondary Data Packet Type */ -- cgit v1.2.3 From a9467d954226f1a513cfe789a3a39d8fc73b5d16 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 21 Aug 2017 19:00:15 +0800 Subject: iommu/mediatek: Move MTK_M4U_TO_LARB/PORT into mtk_iommu.c The definitions of MTK_M4U_TO_LARB and MTK_M4U_TO_PORT are shared by all the gen2 M4U HWs. Thus, move them out of mt8173-larb-port.h and put them into the c file. Suggested-by: Honghui Zhang Signed-off-by: Yong Wu Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 8 +++++++- include/dt-bindings/memory/mt8173-larb-port.h | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 91c6d367ab35..4db6c8f66b0c 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include "mtk_iommu.h" @@ -93,6 +92,13 @@ #define MTK_PROTECT_PA_ALIGN 128 +/* + * Get the local arbiter ID and the portid within the larb arbiter + * from mtk_m4u_id which is defined by MTK_M4U_ID. + */ +#define MTK_M4U_TO_LARB(id) (((id) >> 5) & 0x7) +#define MTK_M4U_TO_PORT(id) ((id) & 0x1f) + struct mtk_iommu_domain { spinlock_t pgtlock; /* lock for page table */ diff --git a/include/dt-bindings/memory/mt8173-larb-port.h b/include/dt-bindings/memory/mt8173-larb-port.h index 5fef5d1f8f82..111b4b0ec85a 100644 --- a/include/dt-bindings/memory/mt8173-larb-port.h +++ b/include/dt-bindings/memory/mt8173-larb-port.h @@ -15,10 +15,6 @@ #define __DTS_IOMMU_PORT_MT8173_H #define MTK_M4U_ID(larb, port) (((larb) << 5) | (port)) -/* Local arbiter ID */ -#define MTK_M4U_TO_LARB(id) (((id) >> 5) & 0x7) -/* PortID within the local arbiter */ -#define MTK_M4U_TO_PORT(id) ((id) & 0x1f) #define M4U_LARB0_ID 0 #define M4U_LARB1_ID 1 -- cgit v1.2.3 From e6dec92308628cff5f1f8bd1bcdf87581c9dc676 Mon Sep 17 00:00:00 2001 From: Yong Wu Date: Mon, 21 Aug 2017 19:00:16 +0800 Subject: iommu/mediatek: Add mt2712 IOMMU support The M4U IP blocks in mt2712 are MTK's generation2 M4U, which use the ARM Short-descriptor format like mt8173, and most of the HW registers are the same. The difference is that there are 2 M4U HWs in mt2712 while there's only one in mt8173 (see the sketch below).
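For illustration only (not part of this patch), the per-SoC dispatch boils down to reading the OF match data at probe time; the probe function name here is invented, while the enum and shift values come from the diff below:

	/* hypothetical sketch: key per-SoC register layout off OF match data */
	static int example_m4u_probe(struct platform_device *pdev)
	{
		enum mtk_iommu_plat plat;

		plat = (enum mtk_iommu_plat)of_device_get_match_data(&pdev->dev);
		/* mt2712 keeps TF_PROTECT_SEL at bit 4, mt8173 at bit 5 */
		dev_dbg(&pdev->dev, "TF_PROTECT_SEL shift: %d\n",
			plat == M4U_MT2712 ? 4 : 5);
		return 0;
	}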
The purpose of the 2 M4U HWs is to balance the bandwidth. Normally if there are 2 M4U HWs, there should be 2 iommu domains, each M4U has an iommu domain. Signed-off-by: Yong Wu Signed-off-by: Joerg Roedel --- drivers/iommu/mtk_iommu.c | 71 ++++++++++++++++++++++------------------------ drivers/iommu/mtk_iommu.h | 7 +++++ drivers/memory/mtk-smi.c | 54 +++++++++++++++++++++++++++++++++-- include/soc/mediatek/smi.h | 2 +- 4 files changed, 93 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 4db6c8f66b0c..df23e0201336 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -53,7 +53,11 @@ #define REG_MMU_CTRL_REG 0x110 #define F_MMU_PREFETCH_RT_REPLACE_MOD BIT(4) -#define F_MMU_TF_PROTECT_SEL(prot) (((prot) & 0x3) << 5) +#define F_MMU_TF_PROTECT_SEL_SHIFT(data) \ + ((data)->m4u_plat == M4U_MT2712 ? 4 : 5) +/* It's named by F_MMU_TF_PROT_SEL in mt2712. */ +#define F_MMU_TF_PROTECT_SEL(prot, data) \ + (((prot) & 0x3) << F_MMU_TF_PROTECT_SEL_SHIFT(data)) #define REG_MMU_IVRP_PADDR 0x114 #define F_MMU_IVRP_PA_SET(pa, ext) (((pa) >> 1) | ((!!(ext)) << 31)) @@ -96,7 +100,7 @@ * Get the local arbiter ID and the portid within the larb arbiter * from mtk_m4u_id which is defined by MTK_M4U_ID. */ -#define MTK_M4U_TO_LARB(id) (((id) >> 5) & 0x7) +#define MTK_M4U_TO_LARB(id) (((id) >> 5) & 0xf) #define MTK_M4U_TO_PORT(id) ((id) & 0x1f) struct mtk_iommu_domain { @@ -307,10 +311,6 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, data->m4u_dom = NULL; return ret; } - } else if (data->m4u_dom != dom) { - /* All the client devices should be in the same m4u domain */ - dev_err(dev, "try to attach into the error iommu domain\n"); - return -EPERM; } mtk_iommu_config(data, dev, true); @@ -470,8 +470,9 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data) return ret; } - regval = F_MMU_PREFETCH_RT_REPLACE_MOD | - F_MMU_TF_PROTECT_SEL(2); + regval = F_MMU_TF_PROTECT_SEL(2, data); + if (data->m4u_plat == M4U_MT8173) + regval |= F_MMU_PREFETCH_RT_REPLACE_MOD; writel_relaxed(regval, data->base + REG_MMU_CTRL_REG); regval = F_L2_MULIT_HIT_EN | @@ -493,9 +494,11 @@ static int mtk_iommu_hw_init(const struct mtk_iommu_data *data) writel_relaxed(F_MMU_IVRP_PA_SET(data->protect_base, data->enable_4GB), data->base + REG_MMU_IVRP_PADDR); - writel_relaxed(0, data->base + REG_MMU_DCM_DIS); - writel_relaxed(0, data->base + REG_MMU_STANDARD_AXI_MODE); + + /* It's MISC control register whose default value is ok except mt8173.*/ + if (data->m4u_plat == M4U_MT8173) + writel_relaxed(0, data->base + REG_MMU_STANDARD_AXI_MODE); if (devm_request_irq(data->dev, data->irq, mtk_iommu_isr, 0, dev_name(data->dev), (void *)data)) { @@ -527,6 +530,7 @@ static int mtk_iommu_probe(struct platform_device *pdev) if (!data) return -ENOMEM; data->dev = dev; + data->m4u_plat = (enum mtk_iommu_plat)of_device_get_match_data(dev); /* Protect memory. 
HW will access here while translation fault.*/ protect = devm_kzalloc(dev, MTK_PROTECT_PA_ALIGN * 2, GFP_KERNEL); @@ -560,6 +564,7 @@ static int mtk_iommu_probe(struct platform_device *pdev) for (i = 0; i < larb_nr; i++) { struct device_node *larbnode; struct platform_device *plarbdev; + u32 id; larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i); if (!larbnode) @@ -568,17 +573,14 @@ static int mtk_iommu_probe(struct platform_device *pdev) if (!of_device_is_available(larbnode)) continue; + ret = of_property_read_u32(larbnode, "mediatek,larb-id", &id); + if (ret)/* The id is consecutive if there is no this property */ + id = i; + plarbdev = of_find_device_by_node(larbnode); - if (!plarbdev) { - plarbdev = of_platform_device_create( - larbnode, NULL, - platform_bus_type.dev_root); - if (!plarbdev) { - of_node_put(larbnode); - return -EPROBE_DEFER; - } - } - data->smi_imu.larb_imu[i].dev = &plarbdev->dev; + if (!plarbdev) + return -EPROBE_DEFER; + data->smi_imu.larb_imu[id].dev = &plarbdev->dev; component_match_add_release(dev, &match, release_of, compare_of, larbnode); @@ -646,8 +648,6 @@ static int __maybe_unused mtk_iommu_resume(struct device *dev) struct mtk_iommu_suspend_reg *reg = &data->reg; void __iomem *base = data->base; - writel_relaxed(data->m4u_dom->cfg.arm_v7s_cfg.ttbr[0], - base + REG_MMU_PT_BASE_ADDR); writel_relaxed(reg->standard_axi_mode, base + REG_MMU_STANDARD_AXI_MODE); writel_relaxed(reg->dcm_dis, base + REG_MMU_DCM_DIS); @@ -656,15 +656,19 @@ static int __maybe_unused mtk_iommu_resume(struct device *dev) writel_relaxed(reg->int_main_control, base + REG_MMU_INT_MAIN_CONTROL); writel_relaxed(F_MMU_IVRP_PA_SET(data->protect_base, data->enable_4GB), base + REG_MMU_IVRP_PADDR); + if (data->m4u_dom) + writel(data->m4u_dom->cfg.arm_v7s_cfg.ttbr[0], + base + REG_MMU_PT_BASE_ADDR); return 0; } -const struct dev_pm_ops mtk_iommu_pm_ops = { +static const struct dev_pm_ops mtk_iommu_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(mtk_iommu_suspend, mtk_iommu_resume) }; static const struct of_device_id mtk_iommu_of_ids[] = { - { .compatible = "mediatek,mt8173-m4u", }, + { .compatible = "mediatek,mt2712-m4u", .data = (void *)M4U_MT2712}, + { .compatible = "mediatek,mt8173-m4u", .data = (void *)M4U_MT8173}, {} }; @@ -673,27 +677,20 @@ static struct platform_driver mtk_iommu_driver = { .remove = mtk_iommu_remove, .driver = { .name = "mtk-iommu", - .of_match_table = mtk_iommu_of_ids, + .of_match_table = of_match_ptr(mtk_iommu_of_ids), .pm = &mtk_iommu_pm_ops, } }; -static int mtk_iommu_init_fn(struct device_node *np) +static int __init mtk_iommu_init(void) { int ret; - struct platform_device *pdev; - - pdev = of_platform_device_create(np, NULL, platform_bus_type.dev_root); - if (!pdev) - return -ENOMEM; ret = platform_driver_register(&mtk_iommu_driver); - if (ret) { - pr_err("%s: Failed to register driver\n", __func__); - return ret; - } + if (ret != 0) + pr_err("Failed to register MTK IOMMU driver\n"); - return 0; + return ret; } -IOMMU_OF_DECLARE(mtkm4u, "mediatek,mt8173-m4u", mtk_iommu_init_fn); +subsys_initcall(mtk_iommu_init) diff --git a/drivers/iommu/mtk_iommu.h b/drivers/iommu/mtk_iommu.h index c06cc91b5d9a..462e593b7d71 100644 --- a/drivers/iommu/mtk_iommu.h +++ b/drivers/iommu/mtk_iommu.h @@ -34,6 +34,12 @@ struct mtk_iommu_suspend_reg { u32 int_main_control; }; +enum mtk_iommu_plat { + M4U_MT2701, + M4U_MT2712, + M4U_MT8173, +}; + struct mtk_iommu_domain; struct mtk_iommu_data { @@ -50,6 +56,7 @@ struct mtk_iommu_data { bool tlb_flush_active; struct iommu_device iommu; + enum 
mtk_iommu_plat m4u_plat; }; static inline int compare_of(struct device *dev, void *data) diff --git a/drivers/memory/mtk-smi.c b/drivers/memory/mtk-smi.c index 13f8c45dbf0d..8ffe3216d092 100644 --- a/drivers/memory/mtk-smi.c +++ b/drivers/memory/mtk-smi.c @@ -23,7 +23,10 @@ #include #include +/* mt8173 */ #define SMI_LARB_MMU_EN 0xf00 + +/* mt2701 */ #define REG_SMI_SECUR_CON_BASE 0x5c0 /* every register control 8 port, register offset 0x4 */ @@ -41,6 +44,10 @@ /* mt2701 domain should be set to 3 */ #define SMI_SECUR_CON_VAL_DOMAIN(id) (0x3 << ((((id) & 0x7) << 2) + 1)) +/* mt2712 */ +#define SMI_LARB_NONSEC_CON(id) (0x380 + ((id) * 4)) +#define F_MMU_EN BIT(0) + struct mtk_smi_larb_gen { bool need_larbid; int port_in_larb[MTK_LARB_NR_MAX + 1]; @@ -149,6 +156,15 @@ mtk_smi_larb_bind(struct device *dev, struct device *master, void *data) struct mtk_smi_iommu *smi_iommu = data; unsigned int i; + if (larb->larb_gen->need_larbid) { + larb->mmu = &smi_iommu->larb_imu[larb->larbid].mmu; + return 0; + } + + /* + * If there is no larbid property, Loop to find the corresponding + * iommu information. + */ for (i = 0; i < smi_iommu->larb_nr; i++) { if (dev == smi_iommu->larb_imu[i].dev) { /* The 'mmu' may be updated in iommu-attach/detach. */ @@ -159,13 +175,32 @@ mtk_smi_larb_bind(struct device *dev, struct device *master, void *data) return -ENODEV; } -static void mtk_smi_larb_config_port(struct device *dev) +static void mtk_smi_larb_config_port_mt2712(struct device *dev) { struct mtk_smi_larb *larb = dev_get_drvdata(dev); + u32 reg; + int i; - writel(*larb->mmu, larb->base + SMI_LARB_MMU_EN); + /* + * larb 8/9 is the bdpsys larb, the iommu_en is enabled defaultly. + * Don't need to set it again. + */ + if (larb->larbid == 8 || larb->larbid == 9) + return; + + for_each_set_bit(i, (unsigned long *)larb->mmu, 32) { + reg = readl_relaxed(larb->base + SMI_LARB_NONSEC_CON(i)); + reg |= F_MMU_EN; + writel(reg, larb->base + SMI_LARB_NONSEC_CON(i)); + } } +static void mtk_smi_larb_config_port_mt8173(struct device *dev) +{ + struct mtk_smi_larb *larb = dev_get_drvdata(dev); + + writel(*larb->mmu, larb->base + SMI_LARB_MMU_EN); +} static void mtk_smi_larb_config_port_gen1(struct device *dev) { @@ -211,7 +246,7 @@ static const struct component_ops mtk_smi_larb_component_ops = { static const struct mtk_smi_larb_gen mtk_smi_larb_mt8173 = { /* mt8173 do not need the port in larb */ - .config_port = mtk_smi_larb_config_port, + .config_port = mtk_smi_larb_config_port_mt8173, }; static const struct mtk_smi_larb_gen mtk_smi_larb_mt2701 = { @@ -223,6 +258,11 @@ static const struct mtk_smi_larb_gen mtk_smi_larb_mt2701 = { .config_port = mtk_smi_larb_config_port_gen1, }; +static const struct mtk_smi_larb_gen mtk_smi_larb_mt2712 = { + .need_larbid = true, + .config_port = mtk_smi_larb_config_port_mt2712, +}; + static const struct of_device_id mtk_smi_larb_of_ids[] = { { .compatible = "mediatek,mt8173-smi-larb", @@ -232,6 +272,10 @@ static const struct of_device_id mtk_smi_larb_of_ids[] = { .compatible = "mediatek,mt2701-smi-larb", .data = &mtk_smi_larb_mt2701 }, + { + .compatible = "mediatek,mt2712-smi-larb", + .data = &mtk_smi_larb_mt2712 + }, {} }; @@ -318,6 +362,10 @@ static const struct of_device_id mtk_smi_common_of_ids[] = { .compatible = "mediatek,mt2701-smi-common", .data = (void *)MTK_SMI_GEN1 }, + { + .compatible = "mediatek,mt2712-smi-common", + .data = (void *)MTK_SMI_GEN2 + }, {} }; diff --git a/include/soc/mediatek/smi.h b/include/soc/mediatek/smi.h index 8893c5eacd07..5201e9022c86 100644 --- 
a/include/soc/mediatek/smi.h +++ b/include/soc/mediatek/smi.h @@ -19,7 +19,7 @@ #ifdef CONFIG_MTK_SMI -#define MTK_LARB_NR_MAX 8 +#define MTK_LARB_NR_MAX 16 #define MTK_SMI_MMU_EN(port) BIT(port) -- cgit v1.2.3 From 18c90df9f2c00cb35ab8ba747aa0f742ee6bbf6a Mon Sep 17 00:00:00 2001 From: Romain Perier Date: Tue, 22 Aug 2017 13:46:59 +0200 Subject: mlx5: Replace PCI pool old API The PCI pool API is deprecated. This commit replaces the old PCI pool API with the appropriate functions from the DMA pool API. Signed-off-by: Romain Perier Reviewed-by: Peter Senna Tschudin Acked-by: Doug Ledford Tested-by: Doug Ledford Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 11 ++++++----- include/linux/mlx5/driver.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index f5a2c605749f..7a40e8790f75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1095,7 +1095,7 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, if (!mailbox) return ERR_PTR(-ENOMEM); - mailbox->buf = pci_pool_zalloc(dev->cmd.pool, flags, + mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags, &mailbox->dma); if (!mailbox->buf) { mlx5_core_dbg(dev, "failed allocation\n"); @@ -1110,7 +1110,7 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, static void free_cmd_box(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *mailbox) { - pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } @@ -1759,7 +1759,8 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) return -EINVAL; } - cmd->pool = pci_pool_create("mlx5_cmd", dev->pdev, size, align, 0); + cmd->pool = dma_pool_create("mlx5_cmd", &dev->pdev->dev, size, align, + 0); if (!cmd->pool) return -ENOMEM; @@ -1849,7 +1850,7 @@ err_free_page: free_cmd_page(dev, cmd); err_free_pool: - pci_pool_destroy(cmd->pool); + dma_pool_destroy(cmd->pool); return err; } @@ -1863,6 +1864,6 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) destroy_workqueue(cmd->wq); destroy_msg_cache(dev); free_cmd_page(dev, cmd); - pci_pool_destroy(cmd->pool); + dma_pool_destroy(cmd->pool); } EXPORT_SYMBOL(mlx5_cmd_cleanup); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 099fe311f272..db40bc4055c7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -299,7 +299,7 @@ struct mlx5_cmd { struct semaphore pages_sem; int mode; struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; - struct pci_pool *pool; + struct dma_pool *pool; struct mlx5_cmd_debug dbg; struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES]; int checksum_disabled; -- cgit v1.2.3 From c5cff8561d2d0006e972bd114afd51f082fee77c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 21 Aug 2017 09:47:10 -0700 Subject: ipv6: add rcu grace period before freeing fib6_node We currently keep rt->rt6i_node pointing to the fib6_node for the route, and some functions use this pointer to dereference the fib6_node from the rt structure, e.g. rt6_check(). However, as neither a refcount nor the rcu lock is taken when dereferencing rt->rt6i_node, this could cause crashes: rt->rt6i_node could be set to NULL by another CPU doing a route deletion (the pattern is sketched below).
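For illustration only (not part of the patch), the RCU discipline being applied here is the usual writer/reader pairing; the names mirror the patch, but the writer side is schematic:

	/* writer: unlink the node, then free it only after a grace period */
	rcu_assign_pointer(rt->rt6i_node, NULL);
	call_rcu(&fn->rcu, node_free_rcu);

	/* reader: dereference rt->rt6i_node only inside a read-side section */
	rcu_read_lock();
	fn = rcu_dereference(rt->rt6i_node);
	if (fn)
		cookie = fn->fn_sernum;	/* fn stays valid until rcu_read_unlock() */
	rcu_read_unlock();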
This patch introduces an rcu grace period before freeing fib6_node and makes sure the functions that dereference it take rcu_read_lock(). Note: there is no "Fixes" tag because this bug was there in a very early stage. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 30 +++++++++++++++++++++++++++++- net/ipv6/ip6_fib.c | 20 ++++++++++++++++---- net/ipv6/route.c | 14 +++++++++++--- 3 files changed, 56 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1a88008cc6f5..e9c59db92942 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -70,6 +70,7 @@ struct fib6_node { __u16 fn_flags; int fn_sernum; struct rt6_info *rr_ptr; + struct rcu_head rcu; }; #ifndef CONFIG_IPV6_SUBTREES @@ -167,13 +168,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) rt0->rt6i_flags |= RTF_EXPIRES; } +/* Function to safely get fn->sernum for passed in rt + * and store result in passed in cookie. + * Return true if we can get cookie safely + * Return false if not + */ +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, + u32 *cookie) +{ + struct fib6_node *fn; + bool status = false; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + + if (fn) { + *cookie = fn->fn_sernum; + status = true; + } + + rcu_read_unlock(); + return status; +} + static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + u32 cookie = 0; + if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + rt6_get_cookie_safe(rt, &cookie); + + return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5cc0ea038198..a5ebf86f6be8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) return fn; } -static void node_free(struct fib6_node *fn) +static void node_free_immediate(struct fib6_node *fn) +{ + kmem_cache_free(fib6_node_kmem, fn); +} + +static void node_free_rcu(struct rcu_head *head) { + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); + kmem_cache_free(fib6_node_kmem, fn); } +static void node_free(struct fib6_node *fn) +{ + call_rcu(&fn->rcu, node_free_rcu); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -601,9 +613,9 @@ insert_above: if (!in || !ln) { if (in) - node_free(in); + node_free_immediate(in); if (ln) - node_free(ln); + node_free_immediate(ln); return ERR_PTR(-ENOMEM); } @@ -1038,7 +1050,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. 
*/ - node_free(sfn); + node_free_immediate(sfn); err = PTR_ERR(sn); goto failure; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 94d6a13d47f0..a9d3564caf49 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1289,7 +1289,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + u32 rt_cookie; + + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -1357,8 +1359,14 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) ip6_del_rt(rt); - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { - rt->rt6i_node->fn_sernum = -1; + } else { + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + rcu_read_unlock(); } } } -- cgit v1.2.3 From 16570d3da0938e0c46c31e5f97c9c8452025d2e7 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Fri, 4 Aug 2017 13:52:20 -0700 Subject: IB/hfi1: Remove pmtu from the QP structure The pmtu field doens't have be stored in the QP structure as it can easily be calculated when needed. Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 3 +-- include/rdma/rdmavt_qp.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 1878a97364aa..eb0c3d60c584 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1243,7 +1243,6 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_PATH_MTU) { qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu); - qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); qp->log_pmtu = ilog2(qp->pmtu); } @@ -1366,7 +1365,7 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->qp_state = qp->state; attr->cur_qp_state = attr->qp_state; - attr->path_mtu = qp->path_mtu; + attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); attr->path_mig_state = qp->s_mig_state; attr->qkey = qp->qkey; attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 07e2fffa6de6..8fbafb0ce674 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -277,7 +277,6 @@ struct rvt_qp { unsigned long timeout_jiffies; /* computed from timeout */ - enum ib_mtu path_mtu; int srate_mbps; /* s_srate (below) converted to Mbit/s */ pid_t pid; /* pid for user mode QPs */ u32 remote_qpn; -- cgit v1.2.3 From 13c19222889daf91da36b7fb63b5d5d9ce89b377 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:53:51 -0700 Subject: IB/rdmavt, hfi1, qib: Modify check_ah() to account for extended LIDs rvt_check_ah() delegates lid verification to underlying driver. 
Underlying driver uses different conditions to check for dlid depending on whether the device supports extended LIDs Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/common.h | 9 --------- drivers/infiniband/hw/hfi1/mad.c | 5 +++-- drivers/infiniband/hw/hfi1/verbs.c | 5 +++++ drivers/infiniband/hw/qib/qib_verbs.c | 9 +++++++++ drivers/infiniband/sw/rdmavt/ah.c | 10 ---------- drivers/infiniband/sw/rdmavt/qp.c | 29 +++++++++++++++++++++++------ include/rdma/opa_addr.h | 18 ++++++++++++++++++ 7 files changed, 58 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index ba9ab971ced9..aa416ef93c1a 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -333,15 +333,6 @@ struct diag_pkt { #define DEFAULT_P_KEY LIM_MGMT_P_KEY -/** - * 0xF8 - 4 bits of multicast range and 1 bit for collective range - * Example: For 24 bit LID space, - * Multicast range: 0xF00000 to 0xF7FFFF - * Collective range: 0xF80000 to 0xFFFFFE - */ -#define HFI1_MCAST_NR 0x4 /* Number of top bits set */ -#define HFI1_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ - #define HFI1_PSM_IOC_BASE_SEQ 0x0 static inline __u64 rhf_to_cpu(const __le32 *rbuf) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index cd1f6f841f34..21fadb4b510c 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -46,6 +46,7 @@ */ #include +#include #define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) @@ -905,8 +906,8 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->buffer_units = cpu_to_be32(buffer_units); pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); - pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7) - << 3 | (HFI1_MCAST_NR & 0x7)); + pi->collectivemask_multicastmask = ((OPA_COLLECTIVE_NR & 0x7) + << 3 | (OPA_MCAST_NR & 0x7)); /* HFI supports a replay buffer 128 LTPs in size */ pi->replay_depth.buffer = 0x80; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c88c03c11555..97ca42beb023 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "hfi.h" #include "common.h" @@ -1461,6 +1462,10 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) struct hfi1_devdata *dd; u8 sc5; + if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + /* test the mapping for validity */ ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ac42dce7e281..9d92aeb8d9a1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1341,6 +1341,15 @@ int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) if (rdma_ah_get_sl(ah_attr) > 15) return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) == 0) + return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE) && + rdma_ah_get_dlid(ah_attr) != + be16_to_cpu(IB_LID_PERMISSIVE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + return 0; } diff 
--git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index a96d4aa80ae8..ba3639a0d77c 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -66,8 +66,6 @@ int rvt_check_ah(struct ib_device *ibdev, int port_num = rdma_ah_get_port_num(ah_attr); struct ib_port_attr port_attr; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); - u32 dlid = rdma_ah_get_dlid(ah_attr); u8 ah_flags = rdma_ah_get_ah_flags(ah_attr); u8 static_rate = rdma_ah_get_static_rate(ah_attr); @@ -83,14 +81,6 @@ int rvt_check_ah(struct ib_device *ibdev, if ((ah_flags & IB_AH_GRH) && rdma_ah_read_grh(ah_attr)->sgid_index >= port_attr.gid_tbl_len) return -EINVAL; - if (link != IB_LINK_LAYER_ETHERNET) { - if (dlid == 0) - return -EINVAL; - if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) && - dlid != be16_to_cpu(IB_LID_PERMISSIVE) && - !(ah_flags & IB_AH_GRH)) - return -EINVAL; - } if (rdi->driver_f.check_ah) return rdi->driver_f.check_ah(ibdev, ah_attr); return 0; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index eb0c3d60c584..6f6525d24a2f 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "qp.h" #include "vt.h" #include "trace.h" @@ -1066,6 +1067,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mig = 0; int pmtu = 0; /* for gcc warning only */ enum rdma_link_layer link; + int opa_ah; link = rdma_port_get_link_layer(ibqp->device, qp->port_num); @@ -1076,6 +1078,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, link)) @@ -1086,17 +1089,31 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto inval; if (attr_mask & IB_QP_AV) { - if (rdma_ah_get_dlid(&attr->ah_attr) >= - be16_to_cpu(IB_MULTICAST_LID_BASE)) - goto inval; + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr)) goto inval; } if (attr_mask & IB_QP_ALT_PATH) { - if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= - be16_to_cpu(IB_MULTICAST_LID_BASE)) - goto inval; + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) goto inval; if (attr->alt_pkey_index >= rvt_get_npkeys(rdi)) diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 9b5e642cf550..8d3ad4ecbea1 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -48,10 +48,21 @@ #ifndef OPA_ADDR_H #define OPA_ADDR_H +#include + #define OPA_SPECIAL_OUI (0x00066AULL) #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) #define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ ? 
0 : x) +/** + * 0xF8 - 4 bits of multicast range and 1 bit for collective range + * Example: For 24 bit LID space, + * Multicast range: 0xF00000 to 0xF7FFFF + * Collective range: 0xF80000 to 0xFFFFFE + */ +#define OPA_MCAST_NR 0x4 /* Number of top bits set */ +#define OPA_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ + /** * ib_is_opa_gid: Returns true if the top 24 bits of the gid * contains the OPA_STL_OUI identifier. This identifies that * @@ -95,4 +106,11 @@ static inline bool opa_is_extended_lid(u32 dlid, u32 slid) else return false; } + +/* Get multicast lid base */ +static inline u32 opa_get_mcast_base(u32 nr_top_bits) +{ + return (be32_to_cpu(OPA_LID_PERMISSIVE) << (32 - nr_top_bits)); +} + #endif /* OPA_ADDR_H */ -- cgit v1.2.3 From 72c07e2b671eda1cf3e8ebabc664f542f673b997 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:53:58 -0700 Subject: IB/hfi1: Add support to receive 16B bypass packets We introduce a struct hfi1_16b_header to support 16B headers. 16B bypass packets are received by the driver and processed similarly to 9B packets. Add basic support to handle 16B packets. Reviewed-by: Dennis Dalessandro Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 6 ++ drivers/infiniband/hw/hfi1/common.h | 1 + drivers/infiniband/hw/hfi1/driver.c | 127 ++++++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/hfi.h | 131 ++++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/rc.c | 2 +- drivers/infiniband/hw/hfi1/uc.c | 2 +- drivers/infiniband/hw/hfi1/ud.c | 4 +- drivers/infiniband/hw/hfi1/verbs.c | 17 +++-- drivers/infiniband/hw/hfi1/verbs.h | 13 ++++ drivers/infiniband/hw/hfi1/vnic.h | 15 ---- drivers/infiniband/hw/hfi1/vnic_main.c | 4 +- include/rdma/opa_vnic.h | 3 - 12 files changed, 274 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 1446c16bc8a8..ee1324cce25a 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14468,6 +14468,7 @@ void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) static void init_rxe(struct hfi1_devdata *dd) { struct rsm_map_table *rmt; + u64 val; /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); @@ -14492,6 +14493,11 @@ static void init_rxe(struct hfi1_devdata *dd) * (64 bytes). Max_Payload_Size is possibly modified upward in * tune_pcie_caps() which is called after this routine. */ + + /* Have 16 bytes (4DW) of bypass header available in header queue */ + val = read_csr(dd, RCV_BYPASS); + val |= (4ull << 16); + write_csr(dd, RCV_BYPASS, val); } static void init_other(struct hfi1_devdata *dd) diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index aa416ef93c1a..3e27794ec750 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -327,6 +327,7 @@ struct diag_pkt { /* misc.
*/ #define SC15_PACKET 0xF #define SIZE_OF_CRC 1 +#define SIZE_OF_LT 1 #define LIM_MGMT_P_KEY 0x7FFF #define FULL_MGMT_P_KEY 0xFFFF diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 14f2a00c13c2..5280d82c344e 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -237,6 +237,13 @@ static inline struct ib_header *hfi1_get_msgheader(struct hfi1_devdata *dd, return (struct ib_header *)hfi1_get_header(dd, rhf_addr); } +static inline struct hfi1_16b_header + *hfi1_get_16B_header(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + return (struct hfi1_16b_header *)hfi1_get_header(dd, rhf_addr); +} + /* * Validate and encode the a given RcvArray Buffer size. * The function will check whether the given size falls within @@ -925,6 +932,11 @@ static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd, packet->rhf_addr); sc = hfi1_9B_get_sc5(hdr, packet->rhf); + } else if (etype == RHF_RCV_TYPE_BYPASS) { + struct hfi1_16b_header *hdr = hfi1_get_16B_header( + packet->rcd->dd, + packet->rhf_addr); + sc = hfi1_16B_get_sc(hdr); } if (sc != SC15_PACKET) { int hwstate = driver_lstate(rcd->ppd); @@ -1386,9 +1398,14 @@ static int hfi1_setup_9B_packet(struct hfi1_packet *packet) } /* Query commonly used fields from packet header */ + packet->payload = packet->ebuf; packet->opcode = ib_bth_get_opcode(packet->ohdr); packet->slid = ib_get_slid(hdr); packet->dlid = ib_get_dlid(hdr); + if (unlikely((packet->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (packet->dlid != be16_to_cpu(IB_LID_PERMISSIVE)))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + be16_to_cpu(IB_MULTICAST_LID_BASE); packet->sl = ib_get_sl(hdr); packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf); packet->pad = ib_bth_get_pad(packet->ohdr); @@ -1402,6 +1419,73 @@ drop: return -EINVAL; } +static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) +{ + /* + * Bypass packets have a different header/payload split + * compared to an IB packet. + * Current split is set such that 16 bytes of the actual + * header is in the header buffer and the remining is in + * the eager buffer. We chose 16 since hfi1 driver only + * supports 16B bypass packets and we will be able to + * receive the entire LRH with such a split. 
+ */ + + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u8 l4; + u8 grh_len; + + packet->hdr = (struct hfi1_16b_header *) + hfi1_get_16B_header(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; + + l4 = hfi1_16B_get_l4(packet->hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) { + grh_len = 0; + packet->ohdr = packet->ebuf; + packet->grh = NULL; + } else if (l4 == OPA_16B_L4_IB_GLOBAL) { + u32 vtf; + + grh_len = sizeof(struct ib_grh); + packet->ohdr = packet->ebuf + grh_len; + packet->grh = packet->ebuf; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->hlen = hdr_len_by_opcode[packet->opcode] + 8 + grh_len; + packet->payload = packet->ebuf + packet->hlen - (4 * sizeof(u32)); + packet->slid = hfi1_16B_get_slid(packet->hdr); + packet->dlid = hfi1_16B_get_dlid(packet->hdr); + if (unlikely(hfi1_is_16B_mcast(packet->dlid))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), + 16B); + packet->sc = hfi1_16B_get_sc(packet->hdr); + packet->sl = ibp->sc_to_sl[packet->sc]; + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + packet->extra_byte = SIZE_OF_LT; + packet->fecn = hfi1_16B_get_fecn(packet->hdr); + packet->becn = hfi1_16B_get_becn(packet->hdr); + + return 0; +drop: + hfi1_cdbg(PKT, "%s: packet dropped\n", __func__); + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + void handle_eflags(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; @@ -1464,8 +1548,8 @@ static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet) if (packet->rcd->is_vnic) return true; - if ((HFI1_GET_L2_TYPE(packet->ebuf) == OPA_VNIC_L2_TYPE) && - (HFI1_GET_L4_TYPE(packet->ebuf) == OPA_VNIC_L4_ETHR)) + if ((hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE) && + (hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR)) return true; return false; @@ -1475,25 +1559,38 @@ int process_receive_bypass(struct hfi1_packet *packet) { struct hfi1_devdata *dd = packet->rcd->dd; - if (unlikely(rhf_err_flags(packet->rhf))) { - handle_eflags(packet); - } else if (hfi1_is_vnic_packet(packet)) { + if (hfi1_is_vnic_packet(packet)) { hfi1_vnic_bypass_rcv(packet); return RHF_RCV_CONTINUE; } - dd_dev_err(dd, "Unsupported bypass packet. Dropping\n"); - incr_cntr64(&dd->sw_rcv_bypass_packet_errors); - if (!(dd->err_info_rcvport.status_and_code & OPA_EI_STATUS_SMASK)) { - u64 *flits = packet->ebuf; + if (hfi1_setup_bypass_packet(packet)) + return RHF_RCV_CONTINUE; + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } - if (flits && !(packet->rhf & RHF_LEN_ERR)) { - dd->err_info_rcvport.packet_flit1 = flits[0]; - dd->err_info_rcvport.packet_flit2 = - packet->tlen > sizeof(flits[0]) ? flits[1] : 0; + if (hfi1_16B_get_l2(packet->hdr) == 0x2) { + hfi1_16B_rcv(packet); + } else { + dd_dev_err(dd, + "Bypass packets other than 16B are not supported in normal operation. 
Dropping\n"); + incr_cntr64(&dd->sw_rcv_bypass_packet_errors); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + u64 *flits = packet->ebuf; + + if (flits && !(packet->rhf & RHF_LEN_ERR)) { + dd->err_info_rcvport.packet_flit1 = flits[0]; + dd->err_info_rcvport.packet_flit2 = + packet->tlen > sizeof(flits[0]) ? + flits[1] : 0; + } + dd->err_info_rcvport.status_and_code |= + (OPA_EI_STATUS_SMASK | BAD_L2_ERR); } - dd->err_info_rcvport.status_and_code |= - (OPA_EI_STATUS_SMASK | BAD_L2_ERR); } return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index fa9160f68bb7..dbbad760cad4 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +326,7 @@ struct hfi1_ctxtdata { struct hfi1_packet { void *ebuf; void *hdr; + void *payload; struct hfi1_ctxtdata *rcd; __le32 *rhf_addr; struct rvt_qp *qp; @@ -351,6 +353,83 @@ struct hfi1_packet { bool fecn; }; +/* + * OPA 16B Header + */ +#define OPA_16B_L4_MASK 0xFFull +#define OPA_16B_SC_MASK 0x1F00000ull +#define OPA_16B_SC_SHIFT 20 +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_SHIFT 20 +#define OPA_16B_DLID_HIGH_SHIFT 12 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_SLID_SHIFT 20 +#define OPA_16B_SLID_HIGH_SHIFT 8 +#define OPA_16B_BECN_MASK 0x80000000ull +#define OPA_16B_BECN_SHIFT 31 +#define OPA_16B_FECN_MASK 0x10000000ull +#define OPA_16B_FECN_SHIFT 28 +#define OPA_16B_L2_MASK 0x60000000ull +#define OPA_16B_L2_SHIFT 29 + +/* + * OPA 16B L2/L4 Encodings + */ +#define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_IB_LOCAL 0x09 +#define OPA_16B_L4_IB_GLOBAL 0x0A +#define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR + +static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) +{ + return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); +} + +static inline u8 hfi1_16B_get_sc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_SC_MASK) >> OPA_16B_SC_SHIFT); +} + +static inline u32 hfi1_16B_get_dlid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[1] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_DLID_MASK) >> + OPA_16B_DLID_HIGH_SHIFT) << OPA_16B_DLID_SHIFT)); +} + +static inline u32 hfi1_16B_get_slid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[0] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_SLID_MASK) >> + OPA_16B_SLID_HIGH_SHIFT) << OPA_16B_SLID_SHIFT)); +} + +static inline u8 hfi1_16B_get_becn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[0] & OPA_16B_BECN_MASK) >> OPA_16B_BECN_SHIFT); +} + +static inline u8 hfi1_16B_get_fecn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_FECN_MASK) >> OPA_16B_FECN_SHIFT); +} + +static inline u8 hfi1_16B_get_l2(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_L2_MASK) >> OPA_16B_L2_SHIFT); +} + +/* + * BTH + */ +#define OPA_16B_BTH_PAD_MASK 7 +static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + OPA_16B_BTH_PAD_MASK); +} + struct rvt_sge_state; /* @@ -2084,11 +2163,55 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); /* * hfi1_check_mcast- Check if the given lid is - * in the IB multicast range. + * in the OPA multicast range. + * + * The LID might either reside in ah.dlid or might be + * in the GRH of the address handle as DGID if extended + * addresses are in use. 
*/ -static inline bool hfi1_check_mcast(u16 lid) +static inline bool hfi1_check_mcast(u32 lid) +{ + return ((lid >= opa_get_mcast_base(OPA_MCAST_NR)) && + (lid != be32_to_cpu(OPA_LID_PERMISSIVE))); +} + +#define opa_get_lid(lid, format) \ + __opa_get_lid(lid, OPA_PORT_PACKET_FORMAT_##format) + +/* Convert a lid to a specific lid space */ +static inline u32 __opa_get_lid(u32 lid, u8 format) +{ + bool is_mcast = hfi1_check_mcast(lid); + + switch (format) { + case OPA_PORT_PACKET_FORMAT_8B: + case OPA_PORT_PACKET_FORMAT_10B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF0000); + return lid & 0xFFFFF; + case OPA_PORT_PACKET_FORMAT_16B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF00000); + return lid & 0xFFFFFF; + case OPA_PORT_PACKET_FORMAT_9B: + if (is_mcast) + return (lid - + opa_get_mcast_base(OPA_MCAST_NR) + + be16_to_cpu(IB_MULTICAST_LID_BASE)); + else + return lid & 0xFFFF; + default: + return lid; + } +} + +/* Return true if the given lid is the OPA 16B multicast range */ +static inline bool hfi1_is_16B_mcast(u32 lid) { - return ((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (lid != be16_to_cpu(IB_LID_PERMISSIVE))); + return ((lid >= + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && + (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); } #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index baa67bf0772b..cf74a56e20e5 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1916,7 +1916,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void hfi1_rc_rcv(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = rcd_to_iport(rcd); diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 76c2451a53d7..366f7b9517fe 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -297,7 +297,7 @@ bail_no_tx: void hfi1_uc_rcv(struct hfi1_packet *packet) { struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct ib_other_headers *ohdr = packet->ohdr; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 6bf7a1b08491..dcf8c14c6d0e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -667,11 +667,10 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct ib_header *hdr = packet->hdr; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); - u32 bth1; u8 sl_from_sc; u8 extra_bytes = packet->pad; u8 opcode = packet->opcode; @@ -679,7 +678,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) u32 dlid = packet->dlid; u32 slid = packet->slid; - bth1 = be32_to_cpu(ohdr->bth[1]); qkey = ib_get_qkey(ohdr); src_qp = ib_get_sqpn(ohdr); pkey = ib_bth_get_pkey(ohdr); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 97ca42beb023..ebddab1a06f4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -571,7 +571,7 @@ static inline void hfi1_handle_packet(struct 
hfi1_packet *packet, goto drop; mcast = rvt_mcast_find(&ibp->rvp, &packet->grh->dgid, - packet->dlid); + opa_get_lid(packet->dlid, 9B)); if (!mcast) goto drop; list_for_each_entry_rcu(p, &mcast->qp_list, list) { @@ -627,14 +627,17 @@ drop: void hfi1_ib_rcv(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; - bool is_mcast = false; - if (unlikely(hfi1_check_mcast(packet->dlid))) - is_mcast = true; + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +void hfi1_16B_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; - trace_input_ibhdr(rcd->dd, packet, - !!(packet->rhf & RHF_DC_INFO_SMASK)); - hfi1_handle_packet(packet, is_mcast); + trace_input_ibhdr(rcd->dd, packet, false); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); } /* diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 34267c7ef85a..590aab270f98 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -104,6 +104,17 @@ enum { HFI1_HAS_GRH = (1 << 0), }; +struct hfi1_16b_header { + u32 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + struct hfi1_ahg_info { u32 ahgdesc[2]; u16 tx_flags; @@ -378,6 +389,8 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *); void hfi1_ib_rcv(struct hfi1_packet *packet); +void hfi1_16B_rcv(struct hfi1_packet *packet); + unsigned hfi1_get_npkeys(struct hfi1_devdata *); int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h index eec7c1424991..5ae781514e32 100644 --- a/drivers/infiniband/hw/hfi1/vnic.h +++ b/drivers/infiniband/hw/hfi1/vnic.h @@ -54,21 +54,6 @@ #define HFI1_VNIC_MAX_TXQ 16 #define HFI1_VNIC_MAX_PAD 12 -/* L2 header definitions */ -#define HFI1_L2_TYPE_OFFSET 0x7 -#define HFI1_L2_TYPE_SHFT 0x5 -#define HFI1_L2_TYPE_MASK 0x3 - -#define HFI1_GET_L2_TYPE(hdr) \ - ((*((u8 *)(hdr) + HFI1_L2_TYPE_OFFSET) >> HFI1_L2_TYPE_SHFT) & \ - HFI1_L2_TYPE_MASK) - -/* L4 type definitions */ -#define HFI1_L4_TYPE_OFFSET 8 - -#define HFI1_GET_L4_TYPE(data) \ - (*((u8 *)(data) + HFI1_L4_TYPE_OFFSET)) - /* L4 header definitions */ #define HFI1_VNIC_L4_HDR_OFFSET OPA_VNIC_L2_HDR_LEN diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 2917a238a343..f419cbb05928 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -564,8 +564,8 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) int l4_type, vesw_id = -1; u8 q_idx; - l4_type = HFI1_GET_L4_TYPE(packet->ebuf); - if (likely(l4_type == OPA_VNIC_L4_ETHR)) { + l4_type = hfi1_16B_get_l4(packet->ebuf); + if (likely(l4_type == OPA_16B_L4_ETHR)) { vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id); diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index 39d6890616a6..0c07a70bd7f6 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -54,9 +54,6 @@ #include -/* VNIC uses 16B header format */ -#define OPA_VNIC_L2_TYPE 0x2 - /* 16 header bytes + 2 reserved bytes */ #define OPA_VNIC_L2_HDR_LEN (16 + 2) -- cgit v1.2.3 From d98bb7f7e6fa29d45008370084d5cabac7ac69ed Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:54:16 -0700 Subject: IB/hfi1: Determine 9B/16B L2 header type based on Address handle 
When address handle attributes are initialized, the LIDs are transformed to be in the 32 bit LID space. When constructing the header, hfi1 driver will look at the LID to determine the packet header to be created. Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/sa_query.c | 21 +++++--- drivers/infiniband/core/uverbs_cmd.c | 3 ++ drivers/infiniband/hw/hfi1/hfi.h | 92 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/qp.c | 28 +++++++++++ drivers/infiniband/hw/hfi1/verbs.c | 12 +++++ drivers/infiniband/hw/hfi1/verbs.h | 1 + include/rdma/ib_verbs.h | 15 ++++++ include/rdma/opa_addr.h | 4 +- 8 files changed, 168 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index da29e2863c84..0179b21bad34 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "sa.h" #include "core_priv.h" @@ -1239,6 +1240,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(ah_attr, true); + rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & get_src_path_mask(device, port_num)); @@ -2288,12 +2294,15 @@ static void update_sm_ah(struct work_struct *work) rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); if (port_attr.grh_required) { - rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); - - rdma_ah_set_subnet_prefix(&ah_attr, - cpu_to_be64(port_attr.subnet_prefix)); - rdma_ah_set_interface_id(&ah_attr, - cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { + rdma_ah_set_make_grd(&ah_attr, true); + } else { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 7ea5a3bb5a04..dc7d773a96ec 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2009,6 +2009,7 @@ static int modify_qp(struct ib_uverbs_file *file, rdma_ah_set_static_rate(&attr->ah_attr, cmd->base.dest.static_rate); rdma_ah_set_port_num(&attr->ah_attr, cmd->base.dest.port_num); + rdma_ah_set_make_grd(&attr->ah_attr, false); attr->alt_ah_attr.type = rdma_ah_find_type(qp->device, cmd->base.dest.port_num); @@ -2032,6 +2033,7 @@ static int modify_qp(struct ib_uverbs_file *file, cmd->base.alt_dest.static_rate); rdma_ah_set_port_num(&attr->alt_ah_attr, cmd->base.alt_dest.port_num); + rdma_ah_set_make_grd(&attr->alt_ah_attr, false); ret = ib_modify_qp_with_udata(qp, attr, modify_qp_mask(qp->qp_type, @@ -2584,6 +2586,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, } attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); rdma_ah_set_dlid(&attr, cmd.attr.dlid); rdma_ah_set_sl(&attr, cmd.attr.sl); rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); diff --git 
a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index ee19660ca2fa..cec9590870ba 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -70,6 +70,7 @@ #include #include #include +#include #include "chip_registers.h" #include "common.h" @@ -353,6 +354,10 @@ struct hfi1_packet { bool fecn; }; +/* Packet types */ +#define HFI1_PKT_TYPE_9B 0 +#define HFI1_PKT_TYPE_16B 1 + /* * OPA 16B Header */ @@ -2170,6 +2175,31 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) #define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) +static inline void hfi1_update_ah_attr(struct ib_device *ibdev, + struct rdma_ah_attr *attr) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid = rdma_ah_get_dlid(attr); + + /* + * Kernel clients may not have setup GRH information + * Set that here. + */ + ibp = to_iport(ibdev, rdma_ah_get_port_num(attr)); + ppd = ppd_from_ibp(ibp); + if ((((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (!(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))) || + (rdma_ah_get_make_grd(attr))) { + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_interface_id(attr, OPA_MAKE_ID(dlid)); + rdma_ah_set_subnet_prefix(attr, ibp->rvp.gid_prefix); + } +} + /* * hfi1_check_mcast- Check if the given lid is * in the OPA multicast range. @@ -2223,4 +2253,66 @@ static inline bool hfi1_is_16B_mcast(u32 lid) opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); } + +static inline void hfi1_make_opa_lid(struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + u32 dlid = rdma_ah_get_dlid(attr); + + /* Modify ah_attr.dlid to be in the 32 bit LID space. + * This is how the address will be laid out: + * Assuming MCAST_NR to be 4, + * 32 bit permissive LID = 0xFFFFFFFF + * Multicast LID range = 0xFFFFFFFE to 0xF0000000 + * Unicast LID range = 0xEFFFFFFF to 1 + * Invalid LID = 0 + */ + if (ib_is_opa_gid(&grh->dgid)) + dlid = opa_get_lid_from_gid(&grh->dgid); + else if ((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE))) + dlid = dlid - be16_to_cpu(IB_MULTICAST_LID_BASE) + + opa_get_mcast_base(OPA_MCAST_NR); + else if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) + dlid = be32_to_cpu(OPA_LID_PERMISSIVE); + + rdma_ah_set_dlid(attr, dlid); +} + +static inline u8 hfi1_get_packet_type(u32 lid) +{ + /* 9B if lid > 0xF0000000 */ + if (lid >= opa_get_mcast_base(OPA_MCAST_NR)) + return HFI1_PKT_TYPE_9B; + + /* 16B if lid > 0xC000 */ + if (lid >= opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 9B)) + return HFI1_PKT_TYPE_16B; + + return HFI1_PKT_TYPE_9B; +} + +static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) +{ + /* + * If there was an incoming 16B packet with permissive + * LIDs, OPA GIDs would have been programmed when those + * packets were received. A 16B packet will have to + * be sent in response to that packet. Return a 16B + * header type if that's the case. + */ + if (rdma_ah_get_dlid(attr) == be32_to_cpu(OPA_LID_PERMISSIVE)) + return (ib_is_opa_gid(&rdma_ah_read_grh(attr)->dgid)) ? 
+ HFI1_PKT_TYPE_16B : HFI1_PKT_TYPE_9B; + + /* + * Return a 16B header type if either the the destination + * or source lid is extended. + */ + if (hfi1_get_packet_type(rdma_ah_get_dlid(attr)) == HFI1_PKT_TYPE_16B) + return HFI1_PKT_TYPE_16B; + + return hfi1_get_packet_type(lid); +} #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index b801d8469956..0fca6dfe8d9f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -232,6 +232,31 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, return 0; } +/* + * qp_set_16b - Set the hdr_type based on whether the slid or the + * dlid in the connection is extended. Only applicable for RC and UC + * QPs. UD QPs determine this on the fly from the ah in the wqe + */ +static inline void qp_set_16b(struct rvt_qp *qp) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct hfi1_qp_priv *priv = qp->priv; + + /* Update ah_attr to account for extended LIDs */ + hfi1_update_ah_attr(qp->ibqp.device, &qp->remote_ah_attr); + + /* Create 32 bit LIDs */ + hfi1_make_opa_lid(&qp->remote_ah_attr); + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) + return; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, &qp->remote_ah_attr); +} + void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -242,6 +267,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); } if (attr_mask & IB_QP_PATH_MIG_STATE && @@ -251,6 +277,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); } } @@ -751,6 +778,7 @@ void hfi1_migrate_qp(struct rvt_qp *qp) qp->s_flags |= RVT_S_AHG_CLEAR; priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + qp_set_16b(qp); ev.device = qp->ibqp.device; ev.element.qp = &qp->ibqp; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 0b1556fed47e..18b27276f202 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1421,6 +1421,15 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + /* + * sm_lid of 0xFFFF needs special handling so that it can + * be differentiated from a permissve LID of 0xFFFF. 
+ * We set the grh_required flag here so the SA can program + * the DGID in the address handle appropriately + */ + if (props->sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)) + props->grh_required = true; + return 0; } @@ -1528,6 +1537,7 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, struct hfi1_pportdata *ppd; struct hfi1_devdata *dd; u8 sc5; + struct rdma_ah_attr *attr = &ah->attr; /* * Do not trust reading anything from rvt_ah at this point as it is not @@ -1537,6 +1547,8 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)]; + hfi1_update_ah_attr(ibdev, attr); + hfi1_make_opa_lid(attr); dd = dd_from_ppd(ppd); ah->vl = sc_to_vlt(dd, sc5); if (ah->vl < num_vls || ah->vl == 15) diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 68577a0c922b..d3dd0c01b8f6 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -147,6 +147,7 @@ struct hfi1_qp_priv { u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; struct rvt_qp *owner; + u8 hdr_type; /* 9B or 16B */ }; /* diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 70a183179224..8f263930c56f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -864,6 +864,7 @@ struct roce_ah_attr { struct opa_ah_attr { u32 dlid; u8 src_path_bits; + bool make_grd; }; struct rdma_ah_attr { @@ -3625,6 +3626,20 @@ static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) return 0; } +static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, + bool make_grd) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.make_grd = make_grd; +} + +static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.make_grd; + return false; +} + static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num) { attr->port_num = port_num; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 8d3ad4ecbea1..9ae126fb8648 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -71,7 +71,7 @@ * * @gid: The Global identifier */ -static inline bool ib_is_opa_gid(union ib_gid *gid) +static inline bool ib_is_opa_gid(const union ib_gid *gid) { return ((be64_to_cpu(gid->global.interface_id) >> 40) == OPA_SPECIAL_OUI); @@ -84,7 +84,7 @@ static inline bool ib_is_opa_gid(union ib_gid *gid) * * @gid: The Global identifier */ -static inline u32 opa_get_lid_from_gid(union ib_gid *gid) +static inline u32 opa_get_lid_from_gid(const union ib_gid *gid) { return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; } -- cgit v1.2.3 From 88733e3b845024cb2324a68469a4a25fdd9c0a6c Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:54:23 -0700 Subject: IB/hfi1: Add 16B UD support Add 16B bypass packet support for UD traffic types. 
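In sketch form, the change routes UD header construction through a per-packet-type table; the snippet below is condensed from the ud.c hunks further down (the surrounding setup in hfi1_make_ud_req() is elided):

/* We support only two types - 9B and 16B for now */
static const hfi1_make_req hfi1_make_ud_req_tbl[2] = {
	[HFI1_PKT_TYPE_9B]  = &hfi1_make_ud_req_9B,
	[HFI1_PKT_TYPE_16B] = &hfi1_make_ud_req_16B
};

/* in hfi1_make_ud_req(), once the AH is resolved: */
priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr);
/* ... */
hfi1_make_ud_req_tbl[priv->hdr_type](qp, ps, qp->s_wqe);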
Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 35 +-- drivers/infiniband/hw/hfi1/hfi.h | 117 +++++++++- drivers/infiniband/hw/hfi1/mad.c | 8 +- drivers/infiniband/hw/hfi1/ruc.c | 4 +- drivers/infiniband/hw/hfi1/ud.c | 421 +++++++++++++++++++++++++++--------- drivers/infiniband/hw/hfi1/verbs.h | 2 +- include/rdma/opa_addr.h | 1 + 7 files changed, 457 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index ae6a90d2a31c..fc7085d6cf3f 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -437,23 +437,33 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, bool do_cnp) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct ib_header *hdr = pkt->hdr; struct ib_other_headers *ohdr = pkt->ohdr; struct ib_grh *grh = pkt->grh; u32 rqpn = 0, bth1; - u16 rlid, dlid = ib_get_dlid(hdr); - u8 sc, svc_type; + u16 pkey, rlid, dlid = ib_get_dlid(pkt->hdr); + u8 hdr_type, sc, svc_type; bool is_mcast = false; + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + is_mcast = hfi1_is_16B_mcast(dlid); + pkey = hfi1_16B_get_pkey(pkt->hdr); + sc = hfi1_16B_get_sc(pkt->hdr); + hdr_type = HFI1_PKT_TYPE_16B; + } else { + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); + pkey = ib_bth_get_pkey(ohdr); + sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf); + hdr_type = HFI1_PKT_TYPE_9B; + } + switch (qp->ibqp.qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - rlid = ib_get_slid(hdr); - rqpn = ib_get_sqpn(ohdr); + rlid = ib_get_slid(pkt->hdr); + rqpn = ib_get_sqpn(pkt->ohdr); svc_type = IB_CC_SVCTYPE_UD; - is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); break; case IB_QPT_UC: rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); @@ -469,14 +479,11 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, return; } - sc = hfi1_9B_get_sc5(hdr, pkt->rhf); - bth1 = be32_to_cpu(ohdr->bth[1]); - if (do_cnp && (bth1 & IB_FECN_SMASK)) { - u16 pkey = ib_bth_get_pkey(ohdr); - - return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh); - } + /* Call appropriate CNP handler */ + if (do_cnp && (bth1 & IB_FECN_SMASK)) + hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey, + dlid, rlid, sc, grh); if (!is_mcast && (bth1 & IB_BECN_SMASK)) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index cec9590870ba..7e21192da8e1 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -831,6 +831,10 @@ struct hfi1_pportdata { typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); typedef void (*opcode_handler)(struct hfi1_packet *packet); +typedef void (*hfi1_make_req)(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + /* return values for the RHF receive functions */ #define RHF_RCV_CONTINUE 0 /* keep going */ @@ -1373,6 +1377,13 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd); extern const struct pci_device_id hfi1_pci_tbl[]; +void hfi1_make_ud_req_9B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, 
+ struct rvt_swqe *wqe); /* receive packet handler dispositions */ #define RCV_PKT_OK 0x0 /* keep going */ @@ -1507,6 +1518,18 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh); +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); +typedef void (*hfi1_handle_cnp)(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &return_cnp, + [HFI1_PKT_TYPE_16B] = &return_cnp_16B +}; #define PKEY_CHECK_INVALID -1 int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, u8 sc5, int8_t s_pkey_index); @@ -1747,12 +1770,22 @@ static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, bool do_cnp) { struct ib_other_headers *ohdr = pkt->ohdr; - u32 bth1; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (IB_BECN_SMASK | IB_FECN_SMASK))) { + u32 bth1; + bool becn = false; + bool fecn = false; + + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + fecn = hfi1_16B_get_fecn(pkt->hdr); + becn = hfi1_16B_get_becn(pkt->hdr); + } else { + bth1 = be32_to_cpu(ohdr->bth[1]); + fecn = bth1 & IB_FECN_SMASK; + becn = bth1 & IB_BECN_SMASK; + } + if (unlikely(fecn || becn)) { hfi1_process_ecn_slowpath(qp, pkt, do_cnp); - return !!(bth1 & IB_FECN_SMASK); + return fecn; } return false; } @@ -2315,4 +2348,80 @@ static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) return hfi1_get_packet_type(lid); } + +static inline void hfi1_make_ext_grh(struct hfi1_packet *packet, + struct ib_grh *grh, u32 slid, + u32 dlid) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (!ibp) + return; + + grh->hop_limit = 1; + grh->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + if (slid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B)) + grh->sgid.global.interface_id = + OPA_MAKE_ID(be32_to_cpu(OPA_LID_PERMISSIVE)); + else + grh->sgid.global.interface_id = OPA_MAKE_ID(slid); + + /* + * Upper layers (like mad) may compare the dgid in the + * wc that is obtained here with the sgid_index in + * the wr. Since sgid_index in wr is always 0 for + * extended lids, set the dgid here to the default + * IB gid. 
+ */ + grh->dgid.global.subnet_prefix = ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); +} + +static inline int hfi1_get_16b_padding(u32 hdr_size, u32 payload) +{ + return -(hdr_size + payload + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) & 0x7; +} + +static inline void hfi1_make_ib_hdr(struct ib_header *hdr, + u16 lrh0, u16 len, + u16 dlid, u16 slid) +{ + hdr->lrh[0] = cpu_to_be16(lrh0); + hdr->lrh[1] = cpu_to_be16(dlid); + hdr->lrh[2] = cpu_to_be16(len); + hdr->lrh[3] = cpu_to_be16(slid); +} + +static inline void hfi1_make_16b_hdr(struct hfi1_16b_header *hdr, + u32 slid, u32 dlid, + u16 len, u16 pkey, + u8 becn, u8 fecn, u8 l4, + u8 sc) +{ + u32 lrh0 = 0; + u32 lrh1 = 0x40000000; + u32 lrh2 = 0; + u32 lrh3 = 0; + + lrh0 = (lrh0 & ~OPA_16B_BECN_MASK) | (becn << OPA_16B_BECN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LEN_MASK) | (len << OPA_16B_LEN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LID_MASK) | (slid & OPA_16B_LID_MASK); + lrh1 = (lrh1 & ~OPA_16B_FECN_MASK) | (fecn << OPA_16B_FECN_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_SC_MASK) | (sc << OPA_16B_SC_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_LID_MASK) | (dlid & OPA_16B_LID_MASK); + lrh2 = (lrh2 & ~OPA_16B_SLID_MASK) | + ((slid >> OPA_16B_SLID_SHIFT) << OPA_16B_SLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_DLID_MASK) | + ((dlid >> OPA_16B_DLID_SHIFT) << OPA_16B_DLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_PKEY_MASK) | (pkey << OPA_16B_PKEY_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_L4_MASK) | l4; + + hdr->lrh[0] = lrh0; + hdr->lrh[1] = lrh1; + hdr->lrh[2] = lrh2; + hdr->lrh[3] = lrh3; +} #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 21fadb4b510c..1509bc6b76d8 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -373,12 +373,10 @@ static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid) * Send a bad P_Key trap (ch. 14.3.8). */ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, - u32 qp1, u32 qp2, u16 lid1, u16 lid2) + u32 qp1, u32 qp2, u32 lid1, u32 lid2) { struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - u32 _lid1 = lid1; - u32 _lid2 = lid2; ibp->rvp.n_pkt_drops++; ibp->rvp.pkey_violations++; @@ -389,8 +387,8 @@ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, return; /* Send violation trap */ - trap->data.ntc_257_258.lid1 = cpu_to_be32(_lid1); - trap->data.ntc_257_258.lid2 = cpu_to_be32(_lid2); + trap->data.ntc_257_258.lid1 = cpu_to_be32(lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(lid2); trap->data.ntc_257_258.key = cpu_to_be32(key); trap->data.ntc_257_258.sl = sl << 3; trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index d252f8f2207a..6839bfae933c 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -649,7 +649,7 @@ done: * @ibp: a pointer to the IB port * @hdr: a pointer to the GRH header being constructed * @grh: the global route address to send to - * @hwords: the number of 32 bit words of header being sent + * @hwords: size of header after grh being sent in dwords * @nwords: the number of 32 bit words of data being sent * * Return the size of the header in 32 bit words. 
@@ -661,7 +661,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | (grh->flow_label << IB_GRH_FLOW_SHIFT)); - hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + hdr->paylen = cpu_to_be16((hwords + nwords) << 2); /* next_hdr is defined by C8-7 in ch. 8.4.1 */ hdr->next_hdr = IB_GRH_NEXT_HDR; hdr->hop_limit = grh->hop_limit; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index b708376b67da..2ba74fdd6f15 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -53,6 +53,12 @@ #include "verbs_txreq.h" #include "qp.h" +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_req hfi1_make_ud_req_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ud_req_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ud_req_16B +}; + /** * ud_loopback - handle send on loopback QPs * @sqp: the sending QP @@ -67,6 +73,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) { struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); struct hfi1_pportdata *ppd; + struct hfi1_qp_priv *priv = sqp->priv; struct rvt_qp *qp; struct rdma_ah_attr *ah_attr; unsigned long flags; @@ -102,7 +109,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num > 1) { u16 pkey; - u16 slid; + u32 slid; u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); @@ -176,9 +183,33 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { struct ib_grh grh; - const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); + struct ib_global_route grd = *(rdma_ah_read_grh(ah_attr)); + + /* + * For loopback packets with extended LIDs, the + * sgid_index in the GRH is 0 and the dgid is + * OPA GID of the sender. While creating a response + * to the loopback packet, IB core creates the new + * sgid_index from the DGID and that will be the + * OPA_GID_INDEX. The new dgid is from the sgid + * index and that will be in the IB GID format. + * + * We now have a case where the sent packet had a + * different sgid_index and dgid compared to the + * one that was received in response. + * + * Fix this inconsistency. 
+ */ + if (priv->hdr_type == HFI1_PKT_TYPE_16B) { + if (grd.sgid_index == 0) + grd.sgid_index = OPA_GID_INDEX; - hfi1_make_grh(ibp, &grh, grd, 0, 0); + if (ib_is_opa_gid(&grd.dgid)) + grd.dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); + } + + hfi1_make_grh(ibp, &grh, &grd, 0, 0); hfi1_copy_sge(&qp->r_sge, &grh, sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; @@ -235,7 +266,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.pkey_index = 0; } wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1)); + ((1 << ppd->lmc) - 1)); /* Check for loopback when the port lid is not set */ if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); @@ -252,6 +283,183 @@ drop: rcu_read_unlock(); } +static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u16 *pkey, u32 extra_bytes, bool bypass) +{ + u32 bth0; + struct hfi1_ibport *ibp; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else { + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } + + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + else + *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + if (!bypass) + bth0 |= *pkey; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); +} + +void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + u32 nwords, extra_bytes; + u16 len, slid, dlid, pkey; + u16 lrh0 = 0; + u8 sc5; + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct ib_grh *grh; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + + extra_bytes = -wqe->length & 3; + nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, + rdma_ah_read_grh(ah_attr), + qp->s_hdrwords - 2, nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 
13.5.3.1) */ + priv->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + priv->s_sc = sc5; + } + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, false); + len = qp->s_hdrwords + nwords; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, len, dlid, slid); +} + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid, slid, nwords, extra_bytes; + u16 len, pkey; + u8 l4, sc5; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + qp->s_hdrwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + + /* SW provides space for CRC and LT for bypass packets. */ + extra_bytes = hfi1_get_16b_padding((qp->s_hdrwords << 2), + wqe->length); + nwords = ((wqe->length + extra_bytes + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + + if ((rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = rdma_ah_retrieve_grh(ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) { + dd_dev_warn(ppd->dd, "Bad sgid_index. 
sgid_index: %d\n", + grd->sgid_index); + grd->sgid_index = 0; + } + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, grd, + qp->s_hdrwords - 4, nwords); + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } else { + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + if (qp->ibqp.qp_type == IB_QPT_SMI) + priv->s_sc = 0xf; + else + priv->s_sc = sc5; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 16B); + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + /* Convert dwords to flits */ + len = (qp->s_hdrwords + nwords) >> 1; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_16B; + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, dlid, len, pkey, 0, 0, l4, priv->s_sc); +} + /** * hfi1_make_ud_req - construct a UD request packet * @qp: the QP @@ -263,18 +471,12 @@ drop: int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; - struct ib_other_headers *ohdr; struct rdma_ah_attr *ah_attr; struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; struct rvt_swqe *wqe; - u32 nwords; - u32 extra_bytes; - u32 bth0; - u16 lrh0; - u16 lid; int next_cur; - u8 sc5; + u32 lid; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -311,13 +513,14 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; - if (rdma_ah_get_dlid(ah_attr) < be16_to_cpu(IB_MULTICAST_LID_BASE) || - rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) { + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); + if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || + (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { lid = rdma_ah_get_dlid(ah_attr) & ~((1 << ppd->lmc) - 1); if (unlikely(!loopback && - (lid == ppd->lid || - (lid == be16_to_cpu(IB_LID_PERMISSIVE) && - qp->ibqp.qp_type == IB_QPT_GSI)))) { + ((lid == ppd->lid) || + ((lid == be32_to_cpu(OPA_LID_PERMISSIVE)) && + (qp->ibqp.qp_type == IB_QPT_GSI))))) { unsigned long tflags = ps->flags; /* * If DMAs are in progress, we can't generate @@ -341,11 +544,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } qp->s_cur = next_cur; - extra_bytes = -wqe->length & 3; - nwords = (wqe->length + extra_bytes) >> 2; - - /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ - qp->s_hdrwords = 7; ps->s_txreq->s_cur_size = wqe->length; ps->s_txreq->ss = &qp->s_sge; qp->s_srate = rdma_ah_get_static_rate(ah_attr); @@ -356,78 +554,12 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_sge.num_sge = wqe->wr.num_sge; qp->s_sge.total_len = wqe->length; - if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { - /* Header size in 32-bit words. */ - qp->s_hdrwords += - hfi1_make_grh(ibp, - &ps->s_txreq->phdr.hdr.ibh.u.l.grh, - rdma_ah_read_grh(ah_attr), - qp->s_hdrwords, nwords); - lrh0 = HFI1_LRH_GRH; - ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; - /* - * Don't worry about sending to locally attached multicast - * QPs. It is unspecified by the spec. what happens. - */ - } else { - /* Header size in 32-bit words. 
*/ - lrh0 = HFI1_LRH_BTH; - ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; - } - if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { - qp->s_hdrwords++; - ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; - bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; - } else { - bth0 = IB_OPCODE_UD_SEND_ONLY << 24; - } - sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; - lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; - if (qp->ibqp.qp_type == IB_QPT_SMI) { - lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ - priv->s_sc = 0xf; - } else { - lrh0 |= (sc5 & 0xf) << 12; - priv->s_sc = sc5; - } + /* Make the appropriate header */ + hfi1_make_ud_req_tbl[priv->hdr_type](qp, ps, qp->s_wqe); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); ps->s_txreq->sde = priv->s_sde; priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); ps->s_txreq->psc = priv->s_sendcontext; - ps->s_txreq->phdr.hdr.ibh.lrh[0] = cpu_to_be16(lrh0); - ps->s_txreq->phdr.hdr.ibh.lrh[1] = - cpu_to_be16(rdma_ah_get_dlid(ah_attr)); - ps->s_txreq->phdr.hdr.ibh.lrh[2] = - cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - if (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) { - ps->s_txreq->phdr.hdr.ibh.lrh[3] = IB_LID_PERMISSIVE; - } else { - lid = ppd->lid; - if (lid) { - lid |= rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1); - ps->s_txreq->phdr.hdr.ibh.lrh[3] = cpu_to_be16(lid); - } else { - ps->s_txreq->phdr.hdr.ibh.lrh[3] = IB_LID_PERMISSIVE; - } - } - if (wqe->wr.send_flags & IB_SEND_SOLICITED) - bth0 |= IB_BTH_SOLICITED; - bth0 |= extra_bytes << 20; - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); - else - bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); - ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); - /* - * Qkeys with the high order bit set mean use the - * qkey from the QP context instead of the WR (see 10.2.5). - */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.remote_qkey); - ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); /* disarm any ahg */ priv->s_ahg->ahgcount = 0; priv->s_ahg->ahgidx = 0; @@ -497,6 +629,64 @@ int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) return -1; } +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 7; + u16 len; + u8 l4; + struct hfi1_16b_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nwords; + + /* Populate length */ + nwords = ((hfi1_get_16b_padding(hwords << 2, 0) + + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16((hwords - 4 + nwords) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + /* BIT 16 to 19 is TVER. 
Bit 20 to 22 is pad cnt */ + bth0 = (IB_OPCODE_CNP << 24) | (1 << 16) | + (hfi1_get_16b_padding(hwords << 2, 0) << 20); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn); + ohdr->bth[2] = 0; /* PSN 0 */ + + /* Convert dwords to flits */ + len = (hwords + nwords) >> 1; + hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5); + + plen = 2 /* PBC */ + hwords + nwords; + pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh) @@ -535,11 +725,7 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT)); ohdr->bth[2] = 0; /* PSN 0 */ - hdr.lrh[0] = cpu_to_be16(lrh0); - hdr.lrh[1] = cpu_to_be16(dlid); - hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); - hdr.lrh[3] = cpu_to_be16(slid); - + hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid); plen = 2 /* PBC */ + hwords; pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); vl = sc_to_vlt(ppd->dd, sc5); @@ -672,18 +858,33 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; - u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); + u8 sc5 = packet->sc; u8 sl_from_sc; - u8 extra_bytes = packet->pad; u8 opcode = packet->opcode; u8 sl = packet->sl; u32 dlid = packet->dlid; u32 slid = packet->slid; + u8 extra_bytes; + bool dlid_is_permissive; + bool slid_is_permissive; + extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); qkey = ib_get_qkey(ohdr); src_qp = ib_get_sqpn(ohdr); - pkey = ib_bth_get_pkey(ohdr); - extra_bytes += (SIZE_OF_CRC << 2); + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + u32 permissive_lid = + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + + pkey = hfi1_16B_get_pkey(packet->hdr); + dlid_is_permissive = (dlid == permissive_lid); + slid_is_permissive = (slid == permissive_lid); + } else { + hdr = packet->hdr; + pkey = ib_bth_get_pkey(ohdr); + dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); + slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); + } sl_from_sc = ibp->sc_to_sl[sc5]; process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); @@ -701,8 +902,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). 
*/ if (qp->ibqp.qp_num) { - if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE)) + if (unlikely(dlid_is_permissive || slid_is_permissive)) goto drop; if (qp->ibqp.qp_num > 1) { if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { @@ -740,8 +940,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (tlen > 2048) goto drop; - if ((hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE) && + if ((dlid_is_permissive || slid_is_permissive) && smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) goto drop; @@ -794,7 +993,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh, + hfi1_copy_sge(&qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { + struct ib_grh grh; + /* + * Assuming we only created 16B on the send side + * if we want to use large LIDs, since GRH was stripped + * out when creating 16B, add back the GRH here. + */ + hfi1_make_ext_grh(packet, &grh, slid, dlid); + hfi1_copy_sge(&qp->r_sge, &grh, sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { @@ -827,14 +1037,15 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } else { wc.pkey_index = 0; } - + if (slid_is_permissive) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); wc.slid = slid; wc.sl = sl_from_sc; /* * Save the LMC lower bits if the destination LID is a unicast LID. */ - wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : + wc.dlid_path_bits = hfi1_check_mcast(dlid) ? 0 : dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d3dd0c01b8f6..4928ee4f92c1 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -259,7 +259,7 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) * This must be called with s_lock held. */ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, - u32 qp1, u32 qp2, u16 lid1, u16 lid2); + u32 qp1, u32 qp2, u32 lid1, u32 lid2); void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); void hfi1_node_desc_chg(struct hfi1_ibport *ibp); diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 9ae126fb8648..e6e90f18e6d5 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -54,6 +54,7 @@ #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) #define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ ? 0 : x) +#define OPA_GID_INDEX 0x1 /** * 0xF8 - 4 bits of multicast range and 1 bit for collective range * Example: For 24 bit LID space, -- cgit v1.2.3 From 51e658f5dd362cc8666f3f5ec1986660e3e51047 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Fri, 4 Aug 2017 13:54:35 -0700 Subject: IB/rdmavt, hfi1, qib: Enhance rdmavt and hfi1 to use 32 bit lids Increase lid used in hfi1 driver to 32 bits. qib continues to use 16 bit lids. 
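For orientation, here is a minimal sketch (names invented for illustration, not part of the patch) of the conversion rule the wider LIDs rely on: any 32-bit OPA LID at or above the 16-bit IB multicast base (0xC000) has no IB unicast representation and collapses to 0, exactly as the OPA_TO_IB_UCAST_LID() macro in opa_addr.h does.

```c
#include <linux/types.h>

/* illustrative helper, mirroring OPA_TO_IB_UCAST_LID() */
static inline u16 opa_lid_to_ib_ucast_lid(u32 lid)
{
	/* extended LIDs cannot be expressed in a 16-bit 9B LRH */
	return (lid >= 0xC000) ? 0 : (u16)lid;
}
```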
Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 12 +++-- drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/mad.c | 91 ++++++++++++++++++++++++++++++++----- drivers/infiniband/hw/hfi1/verbs.c | 23 +--------- drivers/infiniband/hw/hfi1/verbs.h | 2 - drivers/infiniband/hw/qib/qib_mad.c | 4 +- include/rdma/rdma_vt.h | 2 +- 7 files changed, 93 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ee1324cce25a..cbda37386982 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -10067,10 +10067,16 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) struct hfi1_devdata *dd = ppd->dd; u32 mask = ~((1U << ppd->lmc) - 1); u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + u32 lid; + /* + * Program 0 in CSR if port lid is extended. This prevents + * 9B packets being sent out for large lids. + */ + lid = (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) ? 0 : ppd->lid; c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); - c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + c1 |= ((lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); @@ -10081,7 +10087,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) */ sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << SEND_CTXT_CHECK_SLID_MASK_SHIFT) | - (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + (((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << SEND_CTXT_CHECK_SLID_VALUE_SHIFT); for (i = 0; i < dd->chip_send_contexts; i++) { @@ -10091,7 +10097,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) } /* Now we have to do the same thing for the sdma engines */ - sdma_update_lmc(dd, mask, ppd->lid); + sdma_update_lmc(dd, mask, lid); } static const char *state_completed_string(u32 completed) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index b07f42cfa5bf..52cae1146b80 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -718,7 +718,7 @@ struct hfi1_pportdata { u32 ibmaxlen; u32 current_egress_rate; /* units [10^6 bits/sec] */ /* LID programmed for this instance */ - u16 lid; + u32 lid; /* list of pkeys programmed; 0 if not set */ u16 pkeys[MAX_PKEY_VALUES]; u16 link_width_supported; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 1509bc6b76d8..cdcb4d021480 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -234,6 +234,61 @@ static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, spin_unlock_irqrestore(&ibp->rvp.lock, flags); } +static void hfi1_update_sm_ah_attr(struct hfi1_ibport *ibp, + struct rdma_ah_attr *attr, u32 dlid) +{ + rdma_ah_set_dlid(attr, dlid); + rdma_ah_set_port_num(attr, ppd_from_ibp(ibp)->port); + if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + grh->sgid_index = 0; + grh->hop_limit = 1; + grh->dgid.global.subnet_prefix = + ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = OPA_MAKE_ID(dlid); + } +} + +static int hfi1_modify_qp0_ah(struct hfi1_ibport *ibp, + struct rvt_ah *ah, u32 dlid) +{ + struct 
rdma_ah_attr attr; + struct rvt_qp *qp0; + int ret = -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.type = ah->ibah.type; + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ret = rdma_modify_ah(&ah->ibah, &attr); + rcu_read_unlock(); + return ret; +} + +static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid) +{ + struct rdma_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ppd(ppd); + u8 port_num = ppd->port; + + memset(&attr, 0, sizeof(attr)); + attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = rdma_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) { struct ib_mad_send_buf *send_buf; @@ -1283,8 +1338,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct hfi1_ibport *ibp; u8 clientrereg; unsigned long flags; - u32 smlid, opa_lid; /* tmp vars to hold LID values */ - u16 lid; + u32 smlid; + u32 lid; u8 ls_old, ls_new, ps_new; u8 vls; u8 msl; @@ -1301,22 +1356,20 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - opa_lid = be32_to_cpu(pi->lid); - if (opa_lid & 0xFFFF0000) { - pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid); + lid = be32_to_cpu(pi->lid); + if (lid & 0xFF000000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", lid); smp->status |= IB_SMP_INVALID_FIELD; goto get_only; } - lid = (u16)(opa_lid & 0x0000FFFF); smlid = be32_to_cpu(pi->sm_lid); - if (smlid & 0xFFFF0000) { + if (smlid & 0xFF000000) { pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); smp->status |= IB_SMP_INVALID_FIELD; goto get_only; } - smlid &= 0x0000FFFF; clientrereg = (pi->clientrereg_subnettimeout & OPA_PI_MASK_CLIENT_REREGISTER); @@ -1331,12 +1384,16 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ls_old = driver_lstate(ppd); ibp->rvp.mkey = pi->mkey; - ibp->rvp.gid_prefix = pi->subnet_prefix; + if (ibp->rvp.gid_prefix != pi->subnet_prefix) { + ibp->rvp.gid_prefix = pi->subnet_prefix; + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); /* Must be a valid unicast LID address. */ if ((lid == 0 && ls_old > IB_PORT_INIT) || - lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + (hfi1_is_16B_mcast(lid))) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", lid); @@ -1349,6 +1406,16 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); event.event = IB_EVENT_LID_CHANGE; ib_dispatch_event(&event); + + if (HFI1_PORT_GUID_INDEX + 1 < HFI1_GUIDS_PER_PORT) { + /* Manufacture GID from LID to support extended + * addresses + */ + ppd->guids[HFI1_PORT_GUID_INDEX + 1] = + be64_to_cpu(OPA_MAKE_ID(lid)); + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } } msl = pi->smsl & OPA_PI_MASK_SMSL; @@ -1359,7 +1426,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* Must be a valid unicast LID address. 
*/ if ((smlid == 0 && ls_old > IB_PORT_INIT) || - smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + (hfi1_is_16B_mcast(smlid))) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { @@ -1367,7 +1434,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, spin_lock_irqsave(&ibp->rvp.lock, flags); if (ibp->rvp.sm_ah) { if (smlid != ibp->rvp.sm_lid) - rdma_ah_set_dlid(&ibp->rvp.sm_ah->attr, smlid); + hfi1_modify_qp0_ah(ibp, ibp->rvp.sm_ah, smlid); if (msl != ibp->rvp.sm_sl) rdma_ah_set_sl(&ibp->rvp.sm_ah->attr, msl); } diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 18b27276f202..83565e5f46d0 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1394,7 +1394,7 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); struct hfi1_devdata *dd = dd_from_dev(verbs_dev); struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; - u16 lid = ppd->lid; + u32 lid = ppd->lid; /* props being zeroed by the caller, avoid zeroing it here */ props->lid = lid ? lid : 0; @@ -1555,27 +1555,6 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); } -struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) -{ - struct rdma_ah_attr attr; - struct ib_ah *ah = ERR_PTR(-EINVAL); - struct rvt_qp *qp0; - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - struct hfi1_devdata *dd = dd_from_ppd(ppd); - u8 port_num = ppd->port; - - memset(&attr, 0, sizeof(attr)); - attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); - rdma_ah_set_dlid(&attr, dlid); - rdma_ah_set_port_num(&attr, ppd_from_ibp(ibp)->port); - rcu_read_lock(); - qp0 = rcu_dereference(ibp->rvp.qp[0]); - if (qp0) - ah = rdma_create_ah(qp0->ibqp.pd, &attr); - rcu_read_unlock(); - return ah; -} - /** * hfi1_get_npkeys - return the size of the PKEY table for context 0 * @dd: the hfi1_ib device diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 4928ee4f92c1..ab1618e32d9c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -334,8 +334,6 @@ void hfi1_rc_hdrerr( u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); -struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid); - void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah); void hfi1_ud_rcv(struct hfi1_packet *packet); diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 6b9b43b944e3..549c71971a1f 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -105,7 +105,7 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { struct ib_ah *ah; - ah = qib_create_qp0_ah(ibp, ibp->rvp.sm_lid); + ah = qib_create_qp0_ah(ibp, (u16)ibp->rvp.sm_lid); if (IS_ERR(ah)) ret = PTR_ERR(ah); else { @@ -496,7 +496,7 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, pip->mkey = ibp->rvp.mkey; pip->gid_prefix = ibp->rvp.gid_prefix; pip->lid = cpu_to_be16(ppd->lid); - pip->sm_lid = cpu_to_be16(ibp->rvp.sm_lid); + pip->sm_lid = cpu_to_be16((u16)ibp->rvp.sm_lid); pip->cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); /* pip->diag_code; */ pip->mkey_lease_period = 
cpu_to_be16(ibp->rvp.mkey_lease_period); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index fdfac0fd2f82..1d94f3c264ba 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -91,7 +91,7 @@ struct rvt_ibport { __be16 pma_counter_select[5]; u16 pma_tag; u16 mkey_lease_period; - u16 sm_lid; + u32 sm_lid; u8 sm_sl; u8 mkeyprot; u8 subnet_timeout; -- cgit v1.2.3 From ec0d8b8a63ee760bca1bccc6769d6210e05ded29 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Sun, 13 Aug 2017 08:08:46 -0700 Subject: IB/hfi1: Stricter bounds checking of MAD trap index The size computed by the macro is valid, but this change makes it explicit and less ambiguous. Bounds-check the trap type for better security. Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 13 ++++++++++++- include/rdma/rdma_vt.h | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 37b19bfae02a..661ba707fc60 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -151,13 +151,24 @@ static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, unsigned long flags; unsigned long timeout; int found = 0; + unsigned int queue_id; + static int trap_count; + + queue_id = trap->data.generic_type & 0x0F; + if (queue_id >= RVT_MAX_TRAP_LISTS) { + trap_count++; + pr_err_ratelimited("hfi1: Invalid trap 0x%0x dropped. Total dropped: %d\n", + trap->data.generic_type, trap_count); + kfree(trap); + return NULL; + } /* * Since the retry (handle timeout) does not remove a trap request * from the list, all we have to do is compare the node. */ spin_lock_irqsave(&ibp->rvp.lock, flags); - trap_list = &ibp->rvp.trap_lists[trap->data.generic_type & 0x0F]; + trap_list = &ibp->rvp.trap_lists[queue_id]; list_for_each_entry(node, &trap_list->list, list) { if (node == trap) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 1d94f3c264ba..1ba84a78f1c5 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -64,7 +64,7 @@ #define RVT_MAX_PKEY_VALUES 16 #define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */ -#define RVT_MAX_TRAP_LISTS ((IB_NOTICE_TYPE_INFO & 0x0F) + 1) +#define RVT_MAX_TRAP_LISTS 5 /*((IB_NOTICE_TYPE_INFO & 0x0F) + 1)*/ #define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */ struct trap_list { -- cgit v1.2.3 From e3bf14bdc17a8e917f337760cc7cacf3232d7dbc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Aug 2017 14:57:39 -0600 Subject: rdma: Autoload netlink client modules If a message comes in and we do not have the client in the table, then try to load the module supplying that client using MODULE_ALIAS to find it. This duplicates the scheme seen in other netlink muxes (e.g. nfnetlink). 
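As a rough usage sketch (hypothetical client module, not part of this patch; it assumes the rdma_nl_register()/rdma_nl_unregister() interface referenced in the hunks below), a client pairs its registration with the new alias so request_module() can find it:

```c
#include <linux/module.h>
#include <rdma/rdma_netlink.h>

/* hypothetical callback table; real clients fill in .doit/.dump */
static const struct rdma_nl_cbs demo_cb_table[] = {
	[0] = { .dump = NULL },
};

static int __init demo_init(void)
{
	rdma_nl_register(RDMA_NL_NLDEV, demo_cb_table);
	return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	rdma_nl_unregister(RDMA_NL_NLDEV);
}
module_exit(demo_exit);

/* second argument must match the uapi enum value; BUILD_BUG_ON checks it */
MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5);
```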
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 2 ++ drivers/infiniband/core/device.c | 2 ++ drivers/infiniband/core/iwcm.c | 2 ++ drivers/infiniband/core/netlink.c | 9 +++++++++ drivers/infiniband/core/nldev.c | 3 +++ include/rdma/rdma_netlink.h | 12 ++++++++++++ 6 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d8edd8b11561..b76de2e2b209 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4537,5 +4537,7 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); + module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 91d7cea1a0b9..fc6be1175183 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1252,5 +1252,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); + module_init(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index e33528e102f8..fcf42f6bb82a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1200,5 +1200,7 @@ static void __exit iw_cm_cleanup(void) iwpm_exit(RDMA_NL_IWCM); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + module_init(iw_cm_init); module_exit(iw_cm_cleanup); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index f782697cf4d8..e685148dd3e6 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -84,6 +84,15 @@ static bool is_nl_valid(unsigned int type, unsigned int op) return false; cb_table = rdma_nl_types[type].cb_table; +#ifdef CONFIG_MODULES + if (!cb_table) { + mutex_unlock(&rdma_nl_mutex); + request_module("rdma-netlink-subsys-%d", type); + mutex_lock(&rdma_nl_mutex); + cb_table = rdma_nl_types[type].cb_table; + } +#endif + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; return true; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 474022274e09..3ba24c428c3b 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -30,6 +30,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include #include #include @@ -320,3 +321,5 @@ void __exit nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index e25bf1988846..2d878596b1e0 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -17,6 +17,18 @@ enum rdma_nl_flags { RDMA_NL_ADMIN_PERM = 1 << 0, }; +/* Define this module as providing netlink services for NETLINK_RDMA, with + * index _index. Since the client indexes were set up in a uapi header as an + * enum and we do not want to change that, the user must supply the expanded + * constant as well and the compiler checks they are the same. + */ +#define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \ + static inline void __chk_##_index(void) \ + { \ + BUILD_BUG_ON(_index != _val); \ + } \ + MODULE_ALIAS("rdma-netlink-subsys-" __stringify(_val)) + /** * Register client in RDMA netlink. 
* @index: Index of the added client -- cgit v1.2.3 From 111993692741a7044e6c01b428cecf1071de3d0b Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Mon, 21 Aug 2017 23:33:49 -0700 Subject: tcp: Remove the unused parameter for tcp_try_fastopen. Signed-off-by: Tonghao Zhang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- net/ipv4/tcp_fastopen.c | 6 ++---- net/ipv4/tcp_input.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index afdab3781425..a995004ae946 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1533,8 +1533,7 @@ int tcp_fastopen_reset_cipher(void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst); + struct tcp_fastopen_cookie *foc); void tcp_fastopen_init_key_once(bool publish); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ce9c7fef200f..e3c33220c418 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -171,7 +171,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, - struct dst_entry *dst, struct request_sock *req) { struct tcp_sock *tp; @@ -278,8 +277,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk) */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc, - struct dst_entry *dst) + struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; @@ -312,7 +310,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, * data in SYN_RECV state. */ fastopen: - child = tcp_fastopen_create_child(sk, skb, dst, req); + child = tcp_fastopen_create_child(sk, skb, req); if (child) { foc->len = -1; NET_INC_STATS(sock_net(sk), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab908949ee95..d3421ee9a10a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6150,7 +6150,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init_rwin(req, sk, dst); if (!want_cookie) { tcp_reqsk_record_syn(sk, req, skb); - fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); + fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc); } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, -- cgit v1.2.3 From 84e54fe0a5eaed696dee4019c396f8396f5a908b Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 22 Aug 2017 09:40:28 -0700 Subject: gre: introduce native tunnel support for ERSPAN The patch adds ERSPAN type II tunnel support. The implementation is based on the draft at [1]. One of the purposes is for a Linux box to be able to receive ERSPAN monitoring traffic sent from the Cisco switch, by creating an ERSPAN tunnel device. In addition, the patch adds ERSPAN TX, so a Linux virtual switch can redirect monitored traffic to the ERSPAN tunnel device. The traffic will be encapsulated into ERSPAN and sent out. 
The implementation reuses the tunnel key as the ERSPAN session ID, and the field 'erspan' as the ERSPAN Index field: ./ip link add dev ers11 type erspan seq key 100 erspan 123 \ local 172.16.1.200 remote 172.16.1.100 To use the above device as an ERSPAN receiver, configure the Nexus 5000 switch as below: monitor session 100 type erspan-source erspan-id 123 vrf default destination ip 172.16.1.200 source interface Ethernet1/11 both source interface Ethernet1/12 both no shut monitor erspan origin ip-address 172.16.1.100 global [1] https://tools.ietf.org/html/draft-foschiano-erspan-01 [2] iproute2 patch: http://marc.info/?l=linux-netdev&m=150306086924951&w=2 [3] test script: http://marc.info/?l=linux-netdev&m=150231021807304&w=2 Signed-off-by: William Tu Signed-off-by: Meenakshi Vohra Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Signed-off-by: David S. Miller --- include/net/erspan.h | 61 ++++++++++ include/net/ip_tunnels.h | 3 + include/uapi/linux/if_ether.h | 1 + include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_gre.c | 269 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 include/net/erspan.h (limited to 'include') diff --git a/include/net/erspan.h b/include/net/erspan.h new file mode 100644 index 000000000000..ca94fc86865e --- /dev/null +++ b/include/net/erspan.h @@ -0,0 +1,61 @@ +#ifndef __LINUX_ERSPAN_H +#define __LINUX_ERSPAN_H + +/* + * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |0|0|0|1|0|00000|000000000|00000| Protocol Type for ERSPAN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Sequence Number (increments per packet per session) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Note that in the above GRE header [RFC1701] out of the C, R, K, S, + * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The + * other fields are set to zero, so only a sequence number follows. 
+ * + * ERSPAN Type II header (8 octets [42:49]) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Ver | VLAN | COS | En|T| Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Reserved | Index | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB + */ + +#define ERSPAN_VERSION 0x1 + +#define VER_MASK 0xf000 +#define VLAN_MASK 0x0fff +#define COS_MASK 0xe000 +#define EN_MASK 0x1800 +#define T_MASK 0x0400 +#define ID_MASK 0x03ff +#define INDEX_MASK 0xfffff + +enum erspan_encap_type { + ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ + ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ + ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ + ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag preserved in frame */ +}; + +struct erspan_metadata { + __be32 index; /* type II */ +}; + +struct erspanhdr { + __be16 ver_vlan; +#define VER_OFFSET 12 + __be16 session_id; +#define COS_OFFSET 13 +#define EN_OFFSET 11 +#define T_OFFSET 10 + struct erspan_metadata md; +}; + +#endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 520809912f03..625c29329372 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -115,6 +115,9 @@ struct ip_tunnel { u32 o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ + /* This field is used only by ERSPAN */ + u32 index; /* ERSPAN type II index */ + struct dst_cache dst_cache; struct ip_tunnel_parm parms; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 5bc9bfd816b7..efeb1190c2ca 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -66,6 +66,7 @@ #define ETH_P_ATALK 0x809B /* Appletalk DDP */ #define ETH_P_AARP 0x80F3 /* Appletalk AARP */ #define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */ #define ETH_P_IPX 0x8137 /* IPX over DIX */ #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ #define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. 
See 802.3 31B */ diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 6792d1967d31..2e520883c054 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -134,6 +134,7 @@ enum { IFLA_GRE_COLLECT_METADATA, IFLA_GRE_IGNORE_DF, IFLA_GRE_FWMARK, + IFLA_GRE_ERSPAN_INDEX, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7a7829e839c2..6e8a62289e03 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -48,6 +48,7 @@ #include #include #include +#include /* Problems & solutions @@ -115,6 +116,7 @@ static int ipgre_tunnel_init(struct net_device *dev); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; +static unsigned int erspan_net_id __read_mostly; static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) @@ -246,6 +248,56 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, + int gre_hdr_len) +{ + struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_net *itn; + struct ip_tunnel *tunnel; + struct erspanhdr *ershdr; + const struct iphdr *iph; + __be32 session_id; + __be32 index; + int len; + + itn = net_generic(net, erspan_net_id); + iph = ip_hdr(skb); + len = gre_hdr_len + sizeof(*ershdr); + + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + iph = ip_hdr(skb); + ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + + /* The original GRE header does not have a key field, + * so use the ERSPAN 10-bit session ID as the key. + */ + session_id = cpu_to_be32(ntohs(ershdr->session_id)); + tpi->key = session_id; + index = ershdr->md.index; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + + if (tunnel) { + if (__iptunnel_pull_header(skb, + gre_hdr_len + sizeof(*ershdr), + htons(ETH_P_TEB), + false, false) < 0) + goto drop; + + tunnel->index = ntohl(index); + skb_reset_mac_header(skb); + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + return PACKET_RCVD; + } +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { @@ -328,6 +380,11 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) + return 0; + } + if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; @@ -503,6 +560,81 @@ free_skb: return NETDEV_TX_OK; } +static inline u8 tos_to_cos(u8 tos) +{ + u8 dscp, cos; + + dscp = tos >> 2; + cos = dscp >> 3; + return cos; +} + +static void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, bool truncate) +{ + struct iphdr *iphdr = ip_hdr(skb); + struct ethhdr *eth = eth_hdr(skb); + enum erspan_encap_type enc_type; + struct erspanhdr *ershdr; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + + enc_type = ERSPAN_ENCAP_NOVLAN; + + /* If the mirrored packet has a vlan tag, extract the tci and + * preserve the vlan header in the mirrored frame. 
+ */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + enc_type = ERSPAN_ENCAP_INFRAME; + } + + skb_push(skb, sizeof(*ershdr)); + ershdr = (struct erspanhdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr)); + + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION << VER_OFFSET)); + ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | + (enc_type << EN_OFFSET & EN_MASK) | + ((truncate << T_OFFSET) & T_MASK)); + ershdr->md.index = htonl(index & INDEX_MASK); +} + +static netdev_tx_t erspan_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + bool truncate = false; + + if (gre_handle_offloads(skb, false)) + goto free_skb; + + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; + + if (skb->len > dev->mtu) { + pskb_trim(skb, dev->mtu); + truncate = true; + } + + /* Push ERSPAN header */ + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + tunnel->parms.o_flags &= ~TUNNEL_KEY; + __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); + return NETDEV_TX_OK; + +free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -828,6 +960,39 @@ out: return ipgre_tunnel_validate(tb, data, extack); } +static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret; + + if (!data) + return 0; + + ret = ipgre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* The ERSPAN Session ID is only 10 bits. Since we reuse + * the 32-bit key field as the ID, check its range. 
+ */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + return 0; +} + static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], @@ -892,6 +1057,13 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + return 0; } @@ -949,6 +1121,36 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; +static int erspan_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen; + + tunnel->tun_hlen = 8; + tunnel->parms.iph.protocol = IPPROTO_GRE; + t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; + dev->features |= GRE_FEATURES; + dev->hw_features |= GRE_FEATURES; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + return ip_tunnel_init(dev); +} + +static const struct net_device_ops erspan_netdev_ops = { + .ndo_init = erspan_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = erspan_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, +}; + static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); @@ -1041,6 +1243,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1083,12 +1287,25 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + if (t->index) + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + return 0; nla_put_failure: return -EMSGSIZE; } +static void erspan_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->netdev_ops = &erspan_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + ip_tunnel_setup(dev, erspan_net_id); +} + static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, @@ -1107,6 +1324,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -1139,6 +1357,21 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; +static struct rtnl_link_ops erspan_link_ops __read_mostly = { + .kind = "erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = erspan_setup, + .validate = erspan_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, + .get_link_net = ip_tunnel_get_link_net, +}; + struct 
net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { @@ -1202,6 +1435,26 @@ static struct pernet_operations ipgre_tap_net_ops = { .size = sizeof(struct ip_tunnel_net), }; +static int __net_init erspan_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, erspan_net_id, + &erspan_link_ops, "erspan0"); +} + +static void __net_exit erspan_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); + + ip_tunnel_delete_net(itn, &erspan_link_ops); +} + +static struct pernet_operations erspan_net_ops = { + .init = erspan_init_net, + .exit = erspan_exit_net, + .id = &erspan_net_id, + .size = sizeof(struct ip_tunnel_net), +}; + static int __init ipgre_init(void) { int err; @@ -1216,6 +1469,10 @@ static int __init ipgre_init(void) if (err < 0) goto pnet_tap_faied; + err = register_pernet_device(&erspan_net_ops); + if (err < 0) + goto pnet_erspan_failed; + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); @@ -1230,13 +1487,21 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&erspan_link_ops); + if (err < 0) + goto erspan_link_failed; + return 0; +erspan_link_failed: + rtnl_link_unregister(&ipgre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: + unregister_pernet_device(&erspan_net_ops); +pnet_erspan_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_faied: unregister_pernet_device(&ipgre_net_ops); @@ -1247,9 +1512,11 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); + rtnl_link_unregister(&erspan_link_ops); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); + unregister_pernet_device(&erspan_net_ops); } module_init(ipgre_init); @@ -1257,5 +1524,7 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); +MODULE_ALIAS_RTNL_LINK("erspan"); MODULE_ALIAS_NETDEV("gre0"); MODULE_ALIAS_NETDEV("gretap0"); +MODULE_ALIAS_NETDEV("erspan0"); -- cgit v1.2.3 From 5cdcf4c6a638591ec0e98c57404a19e7f9997567 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 28 Jul 2017 13:56:06 +0200 Subject: ANDROID: binder: add padding to binder_fd_array_object. binder_fd_array_object starts with a 4-byte header, followed by a few fields that are 8 bytes when ANDROID_BINDER_IPC_32BIT=N. This can cause alignment issues in a 64-bit kernel with a 32-bit userspace, as on x86_32 an 8-byte primitive may be aligned to a 4-byte address. Pad with a __u32 to fix this. 
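To see the layout problem concretely, here is a small userspace sketch (illustrative types only, not the real uapi struct): the i386 ABI aligns a __u64 to 4 bytes, while x86_64 aligns it to 8, so without the explicit pad the two ABIs disagree on every offset after the header.

```c
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct hdr { uint32_t type; };		/* stand-in for binder_object_header */

struct fd_array_unpadded {
	struct hdr hdr;
	uint64_t num_fds;		/* offset 4 on x86_32, 8 on x86_64 */
};

struct fd_array_padded {
	struct hdr hdr;
	uint32_t pad;			/* forces offset 8 on both ABIs */
	uint64_t num_fds;
};

int main(void)
{
	printf("unpadded num_fds at %zu, padded num_fds at %zu\n",
	       offsetof(struct fd_array_unpadded, num_fds),
	       offsetof(struct fd_array_padded, num_fds));
	return 0;
}
```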
Signed-off-by: Martijn Coenen Cc: stable # 4.11+ Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 51f891fb1b18..7668b5791c91 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -132,6 +132,7 @@ enum { /* struct binder_fd_array_object - object describing an array of fds in a buffer * @hdr: common header structure + * @pad: padding to ensure correct alignment * @num_fds: number of file descriptors in the buffer * @parent: index in offset array to buffer holding the fd array * @parent_offset: start offset of fd array in the buffer @@ -152,6 +153,7 @@ enum { */ struct binder_fd_array_object { struct binder_object_header hdr; + __u32 pad; binder_size_t num_fds; binder_size_t parent; binder_size_t parent_offset; -- cgit v1.2.3 From 3946d18765be0b0fe278bea4f87a54cf13858019 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 14 Aug 2017 21:59:55 -0700 Subject: gpio: add gpio_add_lookup_tables() to add several tables at once When converting a legacy board to use the gpiod API, there might be several lookup tables in the board file; let's provide a way to register them all at once. Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Dmitry Torokhov Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 17 +++++++++++++++++ include/linux/gpio/machine.h | 3 +++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 18bba1748a35..e452768f316d 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3010,6 +3010,23 @@ void gpiod_set_raw_array_value_cansleep(unsigned int array_size, } EXPORT_SYMBOL_GPL(gpiod_set_raw_array_value_cansleep); +/** + * gpiod_add_lookup_tables() - register GPIO device consumers + * @tables: list of tables of consumers to register + * @n: number of tables in the list + */ +void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) +{ + unsigned int i; + + mutex_lock(&gpio_lookup_lock); + + for (i = 0; i < n; i++) + list_add_tail(&tables[i]->list, &gpio_lookup_list); + + mutex_unlock(&gpio_lookup_lock); +} + /** * gpiod_set_array_value_cansleep() - assign values to an array of GPIOs * @array_size: number of elements in the descriptor / value arrays diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 6e76b16fcade..ba4ccfd900f9 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -60,11 +60,14 @@ struct gpiod_lookup_table { #ifdef CONFIG_GPIOLIB void gpiod_add_lookup_table(struct gpiod_lookup_table *table); +void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n); void gpiod_remove_lookup_table(struct gpiod_lookup_table *table); #else static inline void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {} static inline +void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {} +static inline void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {} #endif -- cgit v1.2.3 From 0edc23ea2692fe75e941ec00867e661eb15f67fa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Dec 2016 17:01:52 +0000 Subject: irqchip/gic-v3: Add VLPI/DirectLPI discovery Add helper functions that probe for VLPI and DirectLPI properties. 
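The heart of the discovery is an AND across all redistributors: a capability is advertised system-wide only if every redistributor reports it. A condensed sketch (TYPER bit values taken from the patch; the helper name is invented):

```c
#include <linux/types.h>

#define GICR_TYPER_VLPIS	(1U << 1)
#define GICR_TYPER_DirectLPIS	(1U << 3)

/* both flags start out true; one non-supporting redistributor clears them */
static void fold_rdist_caps(u64 typer, bool *has_vlpis, bool *has_direct_lpi)
{
	*has_vlpis &= !!(typer & GICR_TYPER_VLPIS);
	*has_direct_lpi &= !!(typer & GICR_TYPER_DirectLPIS);
}
```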
Reviewed-by: Eric Auger Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 24 +++++++++++++++++++++++- include/linux/irqchip/arm-gic-v3.h | 3 +++ 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index fba5f668be59..65fabd5f2ec6 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Copyright (C) 2013-2017 ARM Limited, All Rights Reserved. * Author: Marc Zyngier * * This program is free software; you can redistribute it and/or modify @@ -504,6 +504,24 @@ static int gic_populate_rdist(void) return -ENODEV; } +static int __gic_update_vlpi_properties(struct redist_region *region, + void __iomem *ptr) +{ + u64 typer = gic_read_typer(ptr + GICR_TYPER); + gic_data.rdists.has_vlpis &= !!(typer & GICR_TYPER_VLPIS); + gic_data.rdists.has_direct_lpi &= !!(typer & GICR_TYPER_DirectLPIS); + + return 1; +} + +static void gic_update_vlpi_properties(void) +{ + gic_iterate_rdists(__gic_update_vlpi_properties); + pr_info("%sVLPI support, %sdirect LPI support\n", + !gic_data.rdists.has_vlpis ? "no " : "", + !gic_data.rdists.has_direct_lpi ? "no " : ""); +} + static void gic_cpu_sys_reg_init(void) { /* @@ -968,6 +986,8 @@ static int __init gic_init_bases(void __iomem *dist_base, gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops, &gic_data); gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); + gic_data.rdists.has_vlpis = true; + gic_data.rdists.has_direct_lpi = true; if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { err = -ENOMEM; @@ -976,6 +996,8 @@ static int __init gic_init_bases(void __iomem *dist_base, set_handle_irq(gic_handle_irq); + gic_update_vlpi_properties(); + if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) its_init(handle, &gic_data.rdists, gic_data.domain); diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 6a1f87ff94e2..20a553423ac7 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -204,6 +204,7 @@ #define GICR_TYPER_PLPIS (1U << 0) #define GICR_TYPER_VLPIS (1U << 1) +#define GICR_TYPER_DirectLPIS (1U << 3) #define GICR_TYPER_LAST (1U << 4) #define GIC_V3_REDIST_SIZE 0x20000 @@ -487,6 +488,8 @@ struct rdists { struct page *prop_page; int id_bits; u64 flags; + bool has_vlpis; + bool has_direct_lpi; }; struct irq_domain; -- cgit v1.2.3 From 3dfa576bfb453482314b596931a59a4951428058 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Dec 2016 17:25:54 +0000 Subject: irqchip/gic-v3-its: Add probing for VLPI properties Add the probing code for the ITS VLPI support. This includes configuring the ITS number if not supporting the single VMOVP command feature. 
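An ITS without the single-VMOVP feature must be given a unique number out of a system-wide list of at most 16 entries. A condensed sketch of that allocation, mirroring its_compute_its_list_map() in the hunk below but omitting the GITS_CTLR programming and read-back:

```c
#include <linux/bitops.h>
#include <linux/errno.h>

#define ITS_LIST_MAX 16

static unsigned long its_list_map;

static int alloc_its_number(void)
{
	int nr = find_first_zero_bit(&its_list_map, ITS_LIST_MAX);

	if (nr >= ITS_LIST_MAX)
		return -EINVAL;		/* no ITSList slot left */
	if (test_and_set_bit(nr, &its_list_map))
		return -EINVAL;		/* duplicate entry */
	return nr;
}
```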
Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 71 +++++++++++++++++++++++++++++++++++--- include/linux/irqchip/arm-gic-v3.h | 5 +++ 2 files changed, 72 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 2232250f1442..89da961e949e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -101,6 +101,7 @@ struct its_node { u32 ite_size; u32 device_ids; int numa_node; + bool is_v4; }; #define ITS_ITT_ALIGN SZ_256 @@ -133,6 +134,14 @@ static DEFINE_SPINLOCK(its_lock); static struct rdists *gic_rdists; static struct irq_domain *its_parent; +/* + * We have a maximum number of 16 ITSs in the whole system if we're + * using the ITSList mechanism + */ +#define ITS_LIST_MAX 16 + +static unsigned long its_list_map; + #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) @@ -1679,13 +1688,51 @@ static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) return 0; } +static int __init its_compute_its_list_map(struct resource *res, + void __iomem *its_base) +{ + int its_number; + u32 ctlr; + + /* + * This is assumed to be done early enough that we're + * guaranteed to be single-threaded, hence no + * locking. Should this change, we should address + * this. + */ + its_number = find_first_zero_bit(&its_list_map, ITS_LIST_MAX); + if (its_number >= ITS_LIST_MAX) { + pr_err("ITS@%pa: No ITSList entry available!\n", + &res->start); + return -EINVAL; + } + + ctlr = readl_relaxed(its_base + GITS_CTLR); + ctlr &= ~GITS_CTLR_ITS_NUMBER; + ctlr |= its_number << GITS_CTLR_ITS_NUMBER_SHIFT; + writel_relaxed(ctlr, its_base + GITS_CTLR); + ctlr = readl_relaxed(its_base + GITS_CTLR); + if ((ctlr & GITS_CTLR_ITS_NUMBER) != (its_number << GITS_CTLR_ITS_NUMBER_SHIFT)) { + its_number = ctlr & GITS_CTLR_ITS_NUMBER; + its_number >>= GITS_CTLR_ITS_NUMBER_SHIFT; + } + + if (test_and_set_bit(its_number, &its_list_map)) { + pr_err("ITS@%pa: Duplicate ITSList entry %d\n", + &res->start, its_number); + return -EINVAL; + } + + return its_number; +} + static int __init its_probe_one(struct resource *res, struct fwnode_handle *handle, int numa_node) { struct its_node *its; void __iomem *its_base; - u32 val; - u64 baser, tmp; + u32 val, ctlr; + u64 baser, tmp, typer; int err; its_base = ioremap(res->start, resource_size(res)); @@ -1718,9 +1765,24 @@ static int __init its_probe_one(struct resource *res, raw_spin_lock_init(&its->lock); INIT_LIST_HEAD(&its->entry); INIT_LIST_HEAD(&its->its_device_list); + typer = gic_read_typer(its_base + GITS_TYPER); its->base = its_base; its->phys_base = res->start; - its->ite_size = ((gic_read_typer(its_base + GITS_TYPER) >> 4) & 0xf) + 1; + its->ite_size = GITS_TYPER_ITT_ENTRY_SIZE(typer); + its->is_v4 = !!(typer & GITS_TYPER_VLPIS); + if (its->is_v4) { + if (!(typer & GITS_TYPER_VMOVP)) { + err = its_compute_its_list_map(res, its_base); + if (err < 0) + goto out_free_its; + + pr_info("ITS@%pa: Using ITS number %d\n", + &res->start, err); + } else { + pr_info("ITS@%pa: Single VMOVP capable\n", &res->start); + } + } + its->numa_node = numa_node; its->cmd_base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, @@ -1767,7 +1829,8 @@ static int __init its_probe_one(struct resource *res, } gits_write_cwriter(0, its->base + GITS_CWRITER); - writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); + ctlr = readl_relaxed(its->base + GITS_CTLR); + 
writel_relaxed(ctlr | GITS_CTLR_ENABLE, its->base + GITS_CTLR); err = its_init_domain(handle, its); if (err) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 20a553423ac7..af8c55105fc2 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -235,15 +235,20 @@ #define GITS_TRANSLATER 0x10040 #define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ITS_NUMBER_SHIFT 4 +#define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) #define GITS_CTLR_QUIESCENT (1U << 31) #define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_VLPIS (1UL << 1) #define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 +#define GITS_TYPER_ITT_ENTRY_SIZE(r) ((((r) >> GITS_TYPER_ITT_ENTRY_SIZE_SHIFT) & 0x1f) + 1) #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) #define GITS_TYPER_HWCOLLCNT_SHIFT 24 +#define GITS_TYPER_VMOVP (1ULL << 37) #define GITS_IIDR_REV_SHIFT 12 #define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) -- cgit v1.2.3 From 5158bd5597e4c0939db3ecefbcbf492c425e611c Mon Sep 17 00:00:00 2001 From: Jean-Louis Thekekara Date: Thu, 29 Jun 2017 19:08:30 +0200 Subject: mtd: nand: remove hard-coded NAND ids length This commit removes the hard-coded '8' used when looping over the struct nand_chip.id.data array. NAND_MAX_ID_LEN was introduced by Artem Bityutskiy in 53552d22bfe1f to define the ID length in the nand_flash_ids[] list. This commit unifies ID length handling in the NAND base driver. Signed-off-by: Jean-Louis Thekekara Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 4 ++-- include/linux/mtd/nand.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index abcb15fba484..ab2804379b0d 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3886,7 +3886,7 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); /* Read entire ID string */ - for (i = 0; i < 8; i++) + for (i = 0; i < ARRAY_SIZE(chip->id.data); i++) id_data[i] = chip->read_byte(mtd); if (id_data[0] != maf_id || id_data[1] != dev_id) { @@ -3895,7 +3895,7 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) return -ENODEV; } - chip->id.len = nand_id_len(id_data, 8); + chip->id.len = nand_id_len(id_data, ARRAY_SIZE(chip->id.data)); /* Try to identify manufacturer */ manufacturer = nand_get_manufacturer(maf_id); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 8a6522e82f03..297684013977 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -439,14 +439,16 @@ struct nand_jedec_params { __le16 crc; } __packed; +/* The maximum expected count of bytes in the NAND ID sequence */ +#define NAND_MAX_ID_LEN 8 + /** * struct nand_id - NAND id structure - * @data: buffer containing the id bytes. Currently 8 bytes large, but can - * be extended if required. + * @data: buffer containing the id bytes. * @len: ID length. 
*/ struct nand_id { - u8 data[8]; + u8 data[NAND_MAX_ID_LEN]; int len; }; @@ -1018,8 +1020,6 @@ static inline void *nand_get_manufacturer_data(struct nand_chip *chip) #define NAND_MFR_ATO 0x9b #define NAND_MFR_WINBOND 0xef -/* The maximum expected count of bytes in the NAND ID sequence */ -#define NAND_MAX_ID_LEN 8 /* * A helper for defining older NAND chips where the second ID byte fully -- cgit v1.2.3 From e59ad6ff83b444517067046bd837595a1ab84900 Mon Sep 17 00:00:00 2001 From: Andrea Adami Date: Mon, 14 Aug 2017 22:48:33 +0200 Subject: mtd: nand: sharpsl: Add partition parsers platform data With the introduction of the sharpslpart partition parser we can now read the partition offsets from NAND: we specify the list of parsers as platform data, with the cmdlinepart and ofpart parsers first, allowing them to override the partition table written in NAND. This is done in the board files using this driver. Thus, we need to extend sharpsl_nand_platform_data to carry the list of partition parsers. Signed-off-by: Andrea Adami Signed-off-by: Boris Brezillon --- include/linux/mtd/sharpsl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h index 65e91d0fa981..6381a7d6c289 100644 --- a/include/linux/mtd/sharpsl.h +++ b/include/linux/mtd/sharpsl.h @@ -17,4 +17,5 @@ struct sharpsl_nand_platform_data { const struct mtd_ooblayout_ops *ecc_layout; struct mtd_partition *partitions; unsigned int nr_partitions; + const char *const *part_parsers; }; -- cgit v1.2.3 From 827dba9d626325a65727c4074913b2d3c7e7ea4d Mon Sep 17 00:00:00 2001 From: Andrea Adami Date: Mon, 14 Aug 2017 22:48:34 +0200 Subject: mfd: tmio: Add partition parsers platform data With the introduction of the sharpslpart partition parser we can now read the partition offsets from NAND: we specify the list of parsers as platform data, with the cmdlinepart and ofpart parsers first, allowing them to override the partition table written in NAND. This is done in the board files using this driver, as sketched below. Thus, we need to extend tmio_nand_data to carry the list of partition parsers. 
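A board-file sketch of the wiring these two commits enable (hypothetical names, shown for the sharpsl variant; the tmio hunk below is analogous). Listing cmdlinepart and ofpart ahead of sharpslpart lets them override the on-flash table:

```c
#include <linux/mtd/sharpsl.h>

static const char * const board_nand_part_parsers[] = {
	"cmdlinepart", "ofpart", "sharpslpart", NULL
};

static struct sharpsl_nand_platform_data board_nand_pdata = {
	/* .ecc_layout, .partitions, .nr_partitions as before */
	.part_parsers	= board_nand_part_parsers,
};
```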
Signed-off-by: Andrea Adami Acked-by: Lee Jones Acked-by: Wolfram Sang Signed-off-by: Boris Brezillon --- include/linux/mfd/tmio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 26e8f8c0a6db..357b6cfdf34a 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -139,6 +139,7 @@ struct tmio_nand_data { struct nand_bbt_descr *badblock_pattern; struct mtd_partition *partition; unsigned int num_partitions; + const char *const *part_parsers; }; #define FBIO_TMIO_ACC_WRITE 0x7C639300 -- cgit v1.2.3 From c2ee070fb00365d7841f6661dcdc7fbe6620bdf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:31 +0200 Subject: block: cache the partition index in struct block_device Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 1 + include/linux/fs.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9941dc8342df..d29d1c70f833 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1451,6 +1451,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_partno = partno; if (!partno) { ret = -ENXIO; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..706dd3a972d2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -427,6 +427,7 @@ struct block_device { #endif struct block_device * bd_contains; unsigned bd_block_size; + u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; -- cgit v1.2.3 From 74d46992e0d9dee7f1f376de0d56d31614c8a17a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:32 +0200 Subject: block: replace bi_bdev with a gendisk pointer and partitions index This way we don't need a block_device structure to submit I/O. The block_device has different lifetime rules from the gendisk and request_queue and is usually only available when the block device node is open. Other callers need to explicitly create one (e.g. the lightnvm passthrough code, or the new nvme multipathing code). For the actual I/O path all that we need is the gendisk, which exists once per block device. But given that the block layer also does partition remapping, we additionally need a partition index, which is used for said remapping in generic_make_request. Note that all the block drivers generally want request_queue or sometimes the gendisk, so this removes a layer of indirection all over the stack. 
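In the new scheme a submitter points the bio at a gendisk plus partition number instead of a block_device. A minimal sketch of the pattern (helper name invented here; it assumes the bi_disk/bi_partno fields this patch introduces and the bd_partno cache from the previous patch):

```c
#include <linux/bio.h>
#include <linux/fs.h>

/* replaces the old "bio->bi_bdev = bdev;" idiom */
static inline void point_bio_at_bdev(struct bio *bio,
				     struct block_device *bdev)
{
	bio->bi_disk = bdev->bd_disk;
	bio->bi_partno = bdev->bd_partno;
}
```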
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- arch/powerpc/sysdev/axonram.c | 2 +- block/bio-integrity.c | 18 +++---- block/bio.c | 10 ++-- block/blk-core.c | 100 ++++++++++++++++++------------------ block/blk-flush.c | 2 +- block/blk-lib.c | 8 +-- block/blk-merge.c | 2 +- block/blk-zoned.c | 4 +- drivers/block/brd.c | 5 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 2 +- drivers/block/drbd/drbd_int.h | 4 +- drivers/block/drbd/drbd_receiver.c | 4 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/floppy.c | 2 +- drivers/block/pktcdvd.c | 11 ++-- drivers/block/xen-blkback/blkback.c | 4 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/io.c | 2 +- drivers/md/bcache/journal.c | 6 +-- drivers/md/bcache/request.c | 16 +++--- drivers/md/bcache/super.c | 6 +-- drivers/md/bcache/writeback.c | 5 +- drivers/md/dm-bio-record.h | 9 ++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-cache-target.c | 4 +- drivers/md/dm-crypt.c | 4 +- drivers/md/dm-delay.c | 4 +- drivers/md/dm-era-target.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/dm-integrity.c | 11 ++-- drivers/md/dm-io.c | 2 +- drivers/md/dm-linear.c | 2 +- drivers/md/dm-log-writes.c | 8 +-- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 12 ++--- drivers/md/dm-snap.c | 16 +++--- drivers/md/dm-stripe.c | 10 ++-- drivers/md/dm-switch.c | 2 +- drivers/md/dm-thin.c | 6 +-- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-zoned-metadata.c | 6 +-- drivers/md/dm-zoned-target.c | 4 +- drivers/md/dm.c | 10 ++-- drivers/md/faulty.c | 4 +- drivers/md/linear.c | 6 +-- drivers/md/md.c | 10 ++-- drivers/md/md.h | 9 +++- drivers/md/multipath.c | 8 +-- drivers/md/raid0.c | 7 ++- drivers/md/raid1.c | 34 ++++++------ drivers/md/raid10.c | 50 +++++++++--------- drivers/md/raid5-cache.c | 6 +-- drivers/md/raid5-ppl.c | 6 +-- drivers/md/raid5.c | 12 ++--- drivers/nvdimm/nd.h | 4 +- drivers/nvme/host/core.c | 11 +--- drivers/nvme/host/lightnvm.c | 15 +----- drivers/nvme/target/io-cmd.c | 6 +-- drivers/s390/block/dcssblk.c | 4 +- drivers/s390/block/xpram.c | 2 +- drivers/target/target_core_iblock.c | 4 +- fs/block_dev.c | 4 +- fs/btrfs/check-integrity.c | 12 ++--- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 6 +-- fs/btrfs/raid56.c | 8 +-- fs/btrfs/scrub.c | 12 ++--- fs/btrfs/volumes.c | 2 +- fs/buffer.c | 4 +- fs/crypto/bio.c | 2 +- fs/direct-io.c | 8 +-- fs/exofs/ore.c | 2 +- fs/ext4/page-io.c | 4 +- fs/ext4/readpage.c | 2 +- fs/f2fs/data.c | 5 +- fs/f2fs/segment.c | 2 +- fs/gfs2/lops.c | 2 +- fs/gfs2/meta_io.c | 2 +- fs/gfs2/ops_fstype.c | 2 +- fs/hfsplus/wrapper.c | 2 +- fs/iomap.c | 4 +- fs/jfs/jfs_logmgr.c | 4 +- fs/jfs/jfs_metapage.c | 4 +- fs/mpage.c | 2 +- fs/nfs/blocklayout/blocklayout.c | 2 +- fs/nilfs2/segbuf.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/bio.h | 18 +++++++ include/linux/blk_types.h | 3 +- include/trace/events/bcache.h | 6 +-- include/trace/events/block.h | 16 +++--- include/trace/events/f2fs.h | 2 +- kernel/power/swap.c | 5 +- kernel/trace/blktrace.c | 2 +- mm/page_io.c | 17 +++--- 99 files changed, 358 insertions(+), 357 deletions(-) (limited to 'include') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 2799706106c6..1e15deacccaf 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -110,7 +110,7 @@ axon_ram_irq_handler(int irq, void *dev) static blk_qc_t axon_ram_make_request(struct request_queue *queue, struct bio 
*bio) { - struct axon_ram_bank *bank = bio->bi_bdev->bd_disk->private_data; + struct axon_ram_bank *bank = bio->bi_disk->private_data; unsigned long phys_mem, phys_end; void *user_mem; struct bio_vec vec; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5fa9a740fd99..fc71e6172869 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -146,7 +146,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + bvec_gap_to_prev(bio->bi_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -190,7 +190,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; @@ -199,7 +199,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; + iter.disk_name = bio->bi_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = prot_buf; @@ -236,8 +236,8 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi; - struct request_queue *q; + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct request_queue *q = bio->bi_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -245,11 +245,9 @@ bool bio_integrity_prep(struct bio *bio) unsigned int intervals; blk_status_t status; - bi = bdev_get_integrity(bio->bi_bdev); if (!bi) return true; - q = bdev_get_queue(bio->bi_bdev); if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return true; @@ -354,7 +352,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); struct bvec_iter iter = bio->bi_iter; /* @@ -411,7 +409,7 @@ bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -428,7 +426,7 @@ EXPORT_SYMBOL(bio_integrity_advance); void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } diff --git a/block/bio.c b/block/bio.c index ecd1a9c7a301..6745759028da 100644 --- a/block/bio.c +++ b/block/bio.c @@ -593,10 +593,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* - * most users will be overriding ->bi_bdev with a new target, + * most users 
will be overriding ->bi_disk with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_bdev = bio_src->bi_bdev; + bio->bi_disk = bio_src->bi_disk; bio_set_flag(bio, BIO_CLONED); bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; @@ -681,7 +681,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; - bio->bi_bdev = bio_src->bi_bdev; + bio->bi_disk = bio_src->bi_disk; bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; @@ -1830,8 +1830,8 @@ again: goto again; } - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, + if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_disk->queue, bio, blk_status_to_errno(bio->bi_status)); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } diff --git a/block/blk-core.c b/block/blk-core.c index d579501f24ba..fc1af9097dff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1910,40 +1910,15 @@ out_unlock: return BLK_QC_T_NONE; } -/* - * If bio->bi_dev is a partition, remap the location - */ -static inline void blk_partition_remap(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - - /* - * Zone reset does not include bi_size so bio_sectors() is always 0. - * Include a test for the reset op code and perform the remap if needed. - */ - if (bdev != bdev->bd_contains && - (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) { - struct hd_struct *p = bdev->bd_part; - - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_bdev = bdev->bd_contains; - - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_iter.bi_sector - p->start_sect); - } -} - static void handle_bad_sector(struct bio *bio) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", - bdevname(bio->bi_bdev, b), - bio->bi_opf, + bio_devname(bio, b), bio->bi_opf, (unsigned long long)bio_end_sector(bio), - (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); + (long long)get_capacity(bio->bi_disk)); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1981,6 +1956,38 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ +/* + * Remap block n of partition p to block n+start(p) of the disk. + */ +static inline int blk_partition_remap(struct bio *bio) +{ + struct hd_struct *p; + int ret = 0; + + /* + * Zone reset does not include bi_size so bio_sectors() is always 0. + * Include a test for the reset op code and perform the remap if needed. + */ + if (!bio->bi_partno || + (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) + return 0; + + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); + } else { + printk("%s: fail for partition %d\n", __func__, bio->bi_partno); + ret = -EIO; + } + rcu_read_unlock(); + + return ret; +} + /* * Check whether this bio extends beyond the end of the device. 
*/ @@ -1992,7 +1999,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) return 0; /* Test device or partition size, when known. */ - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (maxsector) { sector_t sector = bio->bi_iter.bi_sector; @@ -2017,20 +2024,18 @@ generic_make_request_checks(struct bio *bio) int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; - struct hd_struct *part; might_sleep(); if (bio_check_eod(bio, nr_sectors)) goto end_io; - q = bdev_get_queue(bio->bi_bdev); + q = bio->bi_disk->queue; if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", - bdevname(bio->bi_bdev, b), - (long long) bio->bi_iter.bi_sector); + bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); goto end_io; } @@ -2042,17 +2047,11 @@ generic_make_request_checks(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; - part = bio->bi_bdev->bd_part; - if (should_fail_request(part, bio->bi_iter.bi_size) || - should_fail_request(&part_to_disk(part)->part0, - bio->bi_iter.bi_size)) + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) goto end_io; - /* - * If this device has partitions, remap block n - * of partition p to block n+start(p) of the disk. - */ - blk_partition_remap(bio); + if (blk_partition_remap(bio)) + goto end_io; if (bio_check_eod(bio, nr_sectors)) goto end_io; @@ -2081,16 +2080,16 @@ generic_make_request_checks(struct bio *bio) goto not_supported; break; case REQ_OP_WRITE_SAME: - if (!bdev_write_same(bio->bi_bdev)) + if (!q->limits.max_write_same_sectors) goto not_supported; break; case REQ_OP_ZONE_REPORT: case REQ_OP_ZONE_RESET: - if (!bdev_is_zoned(bio->bi_bdev)) + if (!blk_queue_is_zoned(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: - if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; default: @@ -2197,7 +2196,7 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct request_queue *q = bio->bi_disk->queue; if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) { struct bio_list lower, same; @@ -2215,7 +2214,7 @@ blk_qc_t generic_make_request(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bdev_get_queue(bio->bi_bdev)) + if (q == bio->bi_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -2258,7 +2257,7 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = bdev_logical_block_size(bio->bi_bdev) >> 9; + count = queue_logical_block_size(bio->bi_disk->queue) >> 9; else count = bio_sectors(bio); @@ -2275,8 +2274,7 @@ blk_qc_t submit_bio(struct bio *bio) current->comm, task_pid_nr(current), op_is_write(bio_op(bio)) ?
"WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, - bdevname(bio->bi_bdev, b), - count); + bio_devname(bio, b), count); } } @@ -3049,8 +3047,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE diff --git a/block/blk-flush.c b/block/blk-flush.c index ed5fe322abba..83b7d5b41c79 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -525,7 +525,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); diff --git a/block/blk-lib.c b/block/blk-lib.c index 3fe0aec90597..e01adb5145b3 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -77,7 +77,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; @@ -168,7 +168,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, while (nr_sects) { bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; @@ -241,7 +241,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, while (nr_sects) { bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; @@ -323,7 +323,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 05f116bfb99d..aa524cad5bea 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -786,7 +786,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) + if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3bd15d8095b1..ff57fb51b338 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -116,7 +116,7 @@ int blkdev_report_zones(struct block_device *bdev, if (!bio) return -ENOMEM; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blk_zone_start(q, sector); bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); @@ -234,7 +234,7 @@ int blkdev_reset_zones(struct block_device *bdev, bio = bio_alloc(gfp_mask, 0); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); ret = submit_bio_wait(bio); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 104b71c0490d..006e1cb7e6f0 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -294,14 +294,13 @@ out: static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) { 
- struct block_device *bdev = bio->bi_bdev; - struct brd_device *brd = bdev->bd_disk->private_data; + struct brd_device *brd = bio->bi_disk->private_data; struct bio_vec bvec; sector_t sector; struct bvec_iter iter; sector = bio->bi_iter.bi_sector; - if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) + if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) goto io_error; bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index e02c45cd3c5a..5f0eaee8c8a7 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -151,7 +151,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, op_flags |= REQ_SYNC; bio = bio_alloc_drbd(GFP_NOIO); - bio->bi_bdev = bdev->md_bdev; + bio_set_dev(bio, bdev->md_bdev); bio->bi_iter.bi_sector = sector; err = -EIO; if (bio_add_page(bio, device->md_io.page, size, 0) != size) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 809fd245c3dc..bd97908c766f 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1019,7 +1019,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; - bio->bi_bdev = device->ldev->md_bdev; + bio_set_dev(bio, device->ldev->md_bdev); bio->bi_iter.bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, * according to api. Do we want to assert that? */ diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d17b6e6393c7..819f9d0bc875 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1628,8 +1628,8 @@ static inline void drbd_generic_make_request(struct drbd_device *device, int fault_type, struct bio *bio) { __release(local); - if (!bio->bi_bdev) { - drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + if (!bio->bi_disk) { + drbd_err(device, "drbd_generic_make_request: bio->bi_disk == NULL\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index c7e95e6380fb..ece6e5d7dc3f 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1265,7 +1265,7 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont octx->device = device; octx->ctx = ctx; - bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(bio, device->ldev->backing_bdev); bio->bi_private = octx; bio->bi_end_io = one_flush_endio; bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -1548,7 +1548,7 @@ next_bio: } /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; - bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(bio, device->ldev->backing_bdev); bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8d6b5d137b5e..447c975f5481 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1179,7 +1179,7 @@ drbd_submit_req_private_bio(struct drbd_request *req) else type = DRBD_FAULT_DT_RD; - bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(bio, device->ldev->backing_bdev); /* State may have changed since we grabbed our reference on the * ->ldev member. Double check, and short-circuit to endio. 
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1d8726a8df34..c268d886c4f0 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1513,7 +1513,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) drbd_al_begin_io(device, &req->i); drbd_req_make_private_bio(req, req->master_bio); - req->private_bio->bi_bdev = device->ldev->backing_bdev; + bio_set_dev(req->private_bio, device->ldev->backing_bdev); generic_make_request(req->private_bio); return 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 9c00f29e40c1..60c086a53609 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4134,7 +4134,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) cbdata.drive = drive; bio_init(&bio, &bio_vec, 1); - bio.bi_bdev = bdev; + bio_set_dev(&bio, bdev); bio_add_page(&bio, page, size, 0); bio.bi_iter.bi_sector = 0; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 6b8b097abbb9..67974796c350 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1028,7 +1028,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) bio = pkt->r_bios[f]; bio_reset(bio); bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); - bio->bi_bdev = pd->bdev; + bio_set_dev(bio, pd->bdev); bio->bi_end_io = pkt_end_io_read; bio->bi_private = pkt; @@ -1122,7 +1122,7 @@ static int pkt_start_recovery(struct packet_data *pkt) pkt->sector = new_sector; bio_reset(pkt->bio); - pkt->bio->bi_bdev = pd->bdev; + bio_set_dev(pkt->bio, pd->bdev); bio_set_op_attrs(pkt->bio, REQ_OP_WRITE, 0); pkt->bio->bi_iter.bi_sector = new_sector; pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE; @@ -1267,7 +1267,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) bio_reset(pkt->w_bio); pkt->w_bio->bi_iter.bi_sector = pkt->sector; - pkt->w_bio->bi_bdev = pd->bdev; + bio_set_dev(pkt->w_bio, pd->bdev); pkt->w_bio->bi_end_io = pkt_end_io_packet_write; pkt->w_bio->bi_private = pkt; @@ -2314,7 +2314,7 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) psd->pd = pd; psd->bio = bio; - cloned_bio->bi_bdev = pd->bdev; + bio_set_dev(cloned_bio, pd->bdev); cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; pd->stats.secs_r += bio_sectors(bio); @@ -2415,8 +2415,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - pr_err("%s incorrect request queue\n", - bdevname(bio->bi_bdev, b)); + pr_err("%s incorrect request queue\n", bio_devname(bio, b)); goto end_io; } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 5f3a813e7ae0..987d665e82de 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1363,7 +1363,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, goto fail_put_bio; biolist[nbio++] = bio; - bio->bi_bdev = preq.bdev; + bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; @@ -1382,7 +1382,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, goto fail_put_bio; biolist[nbio++] = bio; - bio->bi_bdev = preq.bdev; + bio_set_dev(bio, preq.bdev); bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio_set_op_attrs(bio, operation, operation_flags); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index
35a5a7210e51..61076eda2e6d 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -49,7 +49,7 @@ void bch_btree_verify(struct btree *b) v->keys.ops = b->keys.ops; bio = bch_bbio_alloc(b->c); - bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio_set_dev(bio, PTR_CACHE(b->c, &b->key, 0)->bdev); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; bio->bi_opf = REQ_OP_READ | REQ_META; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 6a9b85095e7b..7e871bdc0097 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -34,7 +34,7 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) struct bbio *b = container_of(bio, struct bbio, bio); bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); - bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + bio_set_dev(bio, PTR_CACHE(c, &b->key, 0)->bdev); b->submit_time_us = local_clock_us(); closure_bio_submit(bio, bio->bi_private); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 0352d05e495c..7e1d1c3ba33a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -53,7 +53,7 @@ reread: left = ca->sb.bucket_size - offset; bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = len << 9; bio->bi_end_io = journal_read_endio; @@ -452,7 +452,7 @@ static void do_journal_discard(struct cache *ca) bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; @@ -623,7 +623,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_iter.bi_sector = PTR_OFFSET(k, i); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 72eb97176403..0e1463d0c334 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -607,7 +607,7 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - struct request_queue *q = bdev_get_queue(s->orig_bio->bi_bdev); + struct request_queue *q = s->orig_bio->bi_disk->queue; generic_end_io_acct(q, bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); @@ -735,7 +735,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; - s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); @@ -794,7 +794,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, !(bio->bi_opf & REQ_META) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, - bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); + get_capacity(bio->bi_disk) - bio_end_sector(bio)); s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); @@ -820,7 +820,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_submit; cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; - cache_bio->bi_bdev = miss->bi_bdev; + bio_copy_dev(cache_bio, miss); cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; 
cache_bio->bi_end_io = request_endio; @@ -919,7 +919,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, dc->disk.bio_split); - flush->bi_bdev = bio->bi_bdev; + bio_copy_dev(flush, bio); flush->bi_end_io = request_endio; flush->bi_private = cl; flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; @@ -956,13 +956,13 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); - bio->bi_bdev = dc->bdev; + bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { @@ -1072,7 +1072,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q, { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct bcache_device *d = bio->bi_disk->private_data; int rw = bio_data_dir(bio); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8352fad765f6..974d832e54a6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -257,7 +257,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) closure_init(cl, parent); bio_reset(bio); - bio->bi_bdev = dc->bdev; + bio_set_dev(bio, dc->bdev); bio->bi_end_io = write_bdev_super_endio; bio->bi_private = dc; @@ -303,7 +303,7 @@ void bcache_write_super(struct cache_set *c) SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); bio_reset(bio); - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_end_io = write_super_endio; bio->bi_private = ca; @@ -508,7 +508,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_init_stack(cl); bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; - bio->bi_bdev = ca->bdev; + bio_set_dev(bio, ca->bdev); bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = prio_endio; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 42c66e76f05e..c49022a8dc9d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -181,7 +181,7 @@ static void write_dirty(struct closure *cl) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); io->bio.bi_iter.bi_sector = KEY_START(&w->key); - io->bio.bi_bdev = io->dc->bdev; + bio_set_dev(&io->bio, io->dc->bdev); io->bio.bi_end_io = dirty_endio; closure_bio_submit(&io->bio, cl); @@ -250,8 +250,7 @@ static void read_dirty(struct cached_dev *dc) dirty_init(w); bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - io->bio.bi_bdev = PTR_CACHE(dc->disk.c, - &w->key, 0)->bdev; + bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); io->bio.bi_end_io = read_dirty_endio; if (bio_alloc_pages(&io->bio, GFP_KERNEL)) diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index dd3646111561..c82578af56a5 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -18,21 +18,24 @@ */ struct dm_bio_details { - struct block_device *bi_bdev; + struct gendisk *bi_disk; + u8 bi_partno; unsigned long bi_flags; struct bvec_iter bi_iter; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_bdev = bio->bi_bdev; + bd->bi_disk = 
bio->bi_disk; + bd->bi_partno = bio->bi_partno; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_bdev = bd->bi_bdev; + bio->bi_disk = bd->bi_disk; + bio->bi_partno = bd->bi_partno; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 44f4a8ac95bd..9601225e0ae9 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -616,7 +616,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector, bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); b->bio.bi_iter.bi_sector = sector; - b->bio.bi_bdev = b->c->bdev; + bio_set_dev(&b->bio, b->c->bdev); b->bio.bi_end_io = inline_endio; /* * Use of .bi_private isn't a problem here because diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c5ea03fc7ee1..dcac25c2be7a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -833,7 +833,7 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) *--------------------------------------------------------------*/ static void remap_to_origin(struct cache *cache, struct bio *bio) { - bio->bi_bdev = cache->origin_dev->bdev; + bio_set_dev(bio, cache->origin_dev->bdev); } static void remap_to_cache(struct cache *cache, struct bio *bio, @@ -842,7 +842,7 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, sector_t bi_sector = bio->bi_iter.bi_sector; sector_t block = from_cblock(cblock); - bio->bi_bdev = cache->cache_dev->bdev; + bio_set_dev(bio, cache->cache_dev->bdev); if (!block_size_is_power_of_two(cache)) bio->bi_iter.bi_sector = (block * cache->sectors_per_block) + diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 73c2e270cda6..ca99147208a9 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1544,7 +1544,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; - clone->bi_bdev = cc->dev->bdev; + bio_set_dev(clone, cc->dev->bdev); clone->bi_opf = io->base_bio->bi_opf; } @@ -2793,7 +2793,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) */ if (unlikely(bio->bi_opf & REQ_PREFLUSH || bio_op(bio) == REQ_OP_DISCARD)) { - bio->bi_bdev = cc->dev->bdev; + bio_set_dev(bio, cc->dev->bdev); if (bio_sectors(bio)) bio->bi_iter.bi_sector = cc->start + dm_target_offset(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index ae3158795d26..2209a9700acd 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -282,7 +282,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) struct delay_c *dc = ti->private; if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { - bio->bi_bdev = dc->dev_write->bdev; + bio_set_dev(bio, dc->dev_write->bdev); if (bio_sectors(bio)) bio->bi_iter.bi_sector = dc->start_write + dm_target_offset(ti, bio->bi_iter.bi_sector); @@ -290,7 +290,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, dc->write_delay, bio); } - bio->bi_bdev = dc->dev_read->bdev; + bio_set_dev(bio, dc->dev_read->bdev); bio->bi_iter.bi_sector = dc->start_read + dm_target_offset(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index e7ba89f98d8d..ba84b8d62cd0 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1192,7 +1192,7 @@ static dm_block_t get_block(struct era *era, struct bio *bio) 
static void remap_to_origin(struct era *era, struct bio *bio) { - bio->bi_bdev = era->origin_dev->bdev; + bio_set_dev(bio, era->origin_dev->bdev); } /*---------------------------------------------------------------- diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index e2c7234931bc..7146c2d9762d 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -274,7 +274,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) { struct flakey_c *fc = ti->private; - bio->bi_bdev = fc->dev->bdev; + bio_set_dev(bio, fc->dev->bdev); if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3acce09bba35..27c0f223f8ea 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -250,7 +250,8 @@ struct dm_integrity_io { struct completion *completion; - struct block_device *orig_bi_bdev; + struct gendisk *orig_bi_disk; + u8 orig_bi_partno; bio_end_io_t *orig_bi_end_io; struct bio_integrity_payload *orig_bi_integrity; struct bvec_iter orig_bi_iter; @@ -1164,7 +1165,8 @@ static void integrity_end_io(struct bio *bio) struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); bio->bi_iter = dio->orig_bi_iter; - bio->bi_bdev = dio->orig_bi_bdev; + bio->bi_disk = dio->orig_bi_disk; + bio->bi_partno = dio->orig_bi_partno; if (dio->orig_bi_integrity) { bio->bi_integrity = dio->orig_bi_integrity; bio->bi_opf |= REQ_INTEGRITY; @@ -1681,8 +1683,9 @@ sleep: dio->orig_bi_iter = bio->bi_iter; - dio->orig_bi_bdev = bio->bi_bdev; - bio->bi_bdev = ic->dev->bdev; + dio->orig_bi_disk = bio->bi_disk; + dio->orig_bi_partno = bio->bi_partno; + bio_set_dev(bio, ic->dev->bdev); dio->orig_bi_integrity = bio_integrity(bio); bio->bi_integrity = NULL; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 25039607f3cb..b4357ed4d541 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -347,7 +347,7 @@ static void do_region(int op, int op_flags, unsigned region, bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); bio->bi_iter.bi_sector = where->sector + (where->count - remaining); - bio->bi_bdev = where->bdev; + bio_set_dev(bio, where->bdev); bio->bi_end_io = endio; bio_set_op_attrs(bio, op, op_flags); store_io_and_region_in_bio(bio, io, region); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 41971a090e34..405eca206d67 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -88,7 +88,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; - bio->bi_bdev = lc->dev->bdev; + bio_set_dev(bio, lc->dev->bdev); if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index a1da0eb58a93..534a254eb977 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -198,7 +198,7 @@ static int write_metadata(struct log_writes_c *lc, void *entry, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -263,7 +263,7 @@ static int log_one_block(struct log_writes_c *lc, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); 
bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -285,7 +285,7 @@ static int log_one_block(struct log_writes_c *lc, } bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; - bio->bi_bdev = lc->logdev->bdev; + bio_set_dev(bio, lc->logdev->bdev); bio->bi_end_io = log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -539,7 +539,7 @@ static void normal_map_bio(struct dm_target *ti, struct bio *bio) { struct log_writes_c *lc = ti->private; - bio->bi_bdev = lc->dev->bdev; + bio_set_dev(bio, lc->dev->bdev); } static int log_writes_map(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 0e8ab5bb3575..573046bd5c46 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -566,7 +566,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m mpio->nr_bytes = nr_bytes; bio->bi_status = 0; - bio->bi_bdev = pgpath->path.dev->bdev; + bio_set_dev(bio, pgpath->path.dev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; if (pgpath->pg->ps.type->start_io) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index a4fbd911d566..c0b82136b2d1 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; - /* if details->bi_bdev == NULL, details were not saved */ + /* if details->bi_disk == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -464,7 +464,7 @@ static sector_t map_sector(struct mirror *m, struct bio *bio) static void map_bio(struct mirror *m, struct bio *bio) { - bio->bi_bdev = m->dev->bdev; + bio_set_dev(bio, m->dev->bdev); bio->bi_iter.bi_sector = map_sector(m, bio); } @@ -1199,7 +1199,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ @@ -1266,7 +1266,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, goto out; if (unlikely(*error)) { - if (!bio_record->details.bi_bdev) { + if (!bio_record->details.bi_disk) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1291,7 +1291,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, bd = &bio_record->details; dm_bio_restore(bd, bio); - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; bio->bi_status = 0; queue_bio(ms, bio, rw); @@ -1301,7 +1301,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, } out: - bio_record->details.bi_bdev = NULL; + bio_record->details.bi_disk = NULL; return DM_ENDIO_DONE; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 1ba41048b438..1113b42e1eda 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1663,7 +1663,7 @@ __find_pending_exception(struct dm_snapshot *s, static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); bio->bi_iter.bi_sector = chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + (chunk - e->old_chunk)) + @@ -1681,7 +1681,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) init_tracked_chunk(bio); if (bio->bi_opf & REQ_PREFLUSH) { - 
bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); return DM_MAPIO_REMAPPED; } @@ -1769,7 +1769,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) goto out; } } else { - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); track_chunk(s, bio, chunk); } @@ -1802,9 +1802,9 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) if (bio->bi_opf & REQ_PREFLUSH) { if (!dm_bio_get_target_bio_nr(bio)) - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); else - bio->bi_bdev = s->cow->bdev; + bio_set_dev(bio, s->cow->bdev); return DM_MAPIO_REMAPPED; } @@ -1824,7 +1824,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk >= s->first_merging_chunk && chunk < (s->first_merging_chunk + s->num_merging_chunks)) { - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); bio_list_add(&s->bios_queued_during_merge, bio); r = DM_MAPIO_SUBMITTED; goto out_unlock; @@ -1838,7 +1838,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) } redirect_to_origin: - bio->bi_bdev = s->origin->bdev; + bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); @@ -2285,7 +2285,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) struct dm_origin *o = ti->private; unsigned available_sectors; - bio->bi_bdev = o->dev->bdev; + bio_set_dev(bio, o->dev->bdev); if (unlikely(bio->bi_opf & REQ_PREFLUSH)) return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a0375530b07f..ab50d7c4377f 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -270,7 +270,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { - bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio_set_dev(bio, sc->stripe[target_stripe].dev->bdev); bio->bi_iter.bi_sector = begin + sc->stripe[target_stripe].physical_start; bio->bi_iter.bi_size = to_bytes(end - begin); @@ -291,7 +291,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) if (bio->bi_opf & REQ_PREFLUSH) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); - bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; + bio_set_dev(bio, sc->stripe[target_bio_nr].dev->bdev); return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || @@ -306,7 +306,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) &stripe, &bio->bi_iter.bi_sector); bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; - bio->bi_bdev = sc->stripe[stripe].dev->bdev; + bio_set_dev(bio, sc->stripe[stripe].dev->bdev); return DM_MAPIO_REMAPPED; } @@ -430,9 +430,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, return DM_ENDIO_DONE; memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", - MAJOR(disk_devt(bio->bi_bdev->bd_disk)), - MINOR(disk_devt(bio->bi_bdev->bd_disk))); + sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); /* * Test to see which stripe drive triggered the event diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 871c18fe000d..2dcea4c56f37 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -322,7 +322,7 @@ static int switch_map(struct dm_target *ti, struct bio *bio) sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); unsigned path_nr = switch_get_path_nr(sctx, offset); - bio->bi_bdev = 
sctx->path_list[path_nr].dmdev->bdev; + bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev); bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 9dec2f8cc739..69d88aee3055 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -679,7 +679,7 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) struct pool *pool = tc->pool; sector_t bi_sector = bio->bi_iter.bi_sector; - bio->bi_bdev = tc->pool_dev->bdev; + bio_set_dev(bio, tc->pool_dev->bdev); if (block_size_is_power_of_two(pool)) bio->bi_iter.bi_sector = (block << pool->sectors_per_block_shift) | @@ -691,7 +691,7 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) static void remap_to_origin(struct thin_c *tc, struct bio *bio) { - bio->bi_bdev = tc->origin_dev->bdev; + bio_set_dev(bio, tc->origin_dev->bdev); } static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) @@ -3313,7 +3313,7 @@ static int pool_map(struct dm_target *ti, struct bio *bio) * As this is a singleton target, ti->begin is always zero. */ spin_lock_irqsave(&pool->lock, flags); - bio->bi_bdev = pt->data_dev->bdev; + bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; spin_unlock_irqrestore(&pool->lock, flags); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b46705ebf01f..1c5b6185c79d 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -637,7 +637,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) struct dm_verity *v = ti->private; struct dm_verity_io *io; - bio->bi_bdev = v->data_dev->bdev; + bio_set_dev(bio, v->data_dev->bdev); bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index a4fa2ada6883..70485de37b66 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -409,7 +409,7 @@ static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd, } bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); @@ -564,7 +564,7 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, set_bit(DMZ_META_WRITING, &mblk->state); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); @@ -586,7 +586,7 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, return -ENOMEM; bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio->bi_bdev = zmd->dev->bdev; + bio_set_dev(bio, zmd->dev->bdev); bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index b08bbbd4d902..b87c1741da4b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -238,7 +238,7 @@ static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone, struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx)); /* Setup and submit the BIO */ - bio->bi_bdev = dmz->dev->bdev; + bio_set_dev(bio, 
dmz->dev->bdev); bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); atomic_inc(&bioctx->ref); generic_make_request(bio); @@ -586,7 +586,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)), (unsigned int)dmz_bio_blocks(bio)); - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE) return DM_MAPIO_REMAPPED; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8612a2d1ccd9..b28b9ce8f4ff 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -851,10 +851,10 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + !bio->bi_disk->queue->limits.max_write_same_sectors) disable_write_same(md); if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + !bio->bi_disk->queue->limits.max_write_zeroes_sectors) disable_write_zeroes(md); } @@ -1215,8 +1215,8 @@ static void __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector); + trace_block_bio_remap(clone->bi_disk->queue, clone, + bio_dev(tio->io->bio), sector); generic_make_request(clone); break; case DM_MAPIO_KILL: @@ -1796,7 +1796,7 @@ static struct mapped_device *alloc_dev(int minor) goto bad; bio_init(&md->flush_bio, NULL, 0); - md->flush_bio.bi_bdev = md->bdev; + bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; dm_stats_init(&md->stats); diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 06a64d5d8c6c..38264b38420f 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -216,12 +216,12 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) if (failit) { struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - b->bi_bdev = conf->rdev->bdev; + bio_set_dev(b, conf->rdev->bdev); b->bi_private = bio; b->bi_end_io = faulty_fail; bio = b; } else - bio->bi_bdev = conf->rdev->bdev; + bio_set_dev(bio, conf->rdev->bdev); generic_make_request(bio); return true; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5f1eb9189542..c464fb48039a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -275,17 +275,17 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio = split; } - bio->bi_bdev = tmp_dev->rdev->bdev; + bio_set_dev(bio, tmp_dev->rdev->bdev); bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + !blk_queue_discard(bio->bi_disk->queue))) { /* Just ignore it */ bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + trace_block_bio_remap(bio->bi_disk->queue, bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index c99634612fc4..0afdc1bfd7cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -422,7 +422,7 @@ static void submit_flushes(struct work_struct *ws) bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); bi->bi_end_io = md_end_flush; bi->bi_private = rdev; - bi->bi_bdev = rdev->bdev; + bio_set_dev(bi, rdev->bdev); bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 
atomic_inc(&mddev->flush_pending); submit_bio(bi); @@ -772,7 +772,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); - bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; + bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev); bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; @@ -803,8 +803,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; - bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? - rdev->meta_bdev : rdev->bdev; + if (metadata_op && rdev->meta_bdev) + bio_set_dev(bio, rdev->meta_bdev); + else + bio_set_dev(bio, rdev->bdev); bio_set_op_attrs(bio, op, op_flags); if (metadata_op) bio->bi_iter.bi_sector = sector + rdev->sb_start; diff --git a/drivers/md/md.h b/drivers/md/md.h index 09db03455801..c0d436fb88f0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -509,6 +509,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); } +static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bio->bi_disk->sync_io); +} + struct md_personality { char *name; @@ -721,14 +726,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + !bio->bi_disk->queue->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + !bio->bi_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 23a162ba6c56..b68e0666b9b0 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -134,7 +134,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) __bio_clone_fast(&mp_bh->bio, bio); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - mp_bh->bio.bi_bdev = multipath->rdev->bdev; + bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; @@ -345,17 +345,17 @@ static void multipathd(struct md_thread *thread) if ((mp_bh->path = multipath_map (conf))<0) { pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", - bdevname(bio->bi_bdev,b), + bio_devname(bio, b), (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { pr_err("multipath: %s: redirecting sector %llu to another IO path\n", - bdevname(bio->bi_bdev,b), + bio_devname(bio, b), (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); bio->bi_iter.bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; - bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; + bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 94d9ae9b0fd0..05a4521b832f 100644 --- a/drivers/md/raid0.c +++ 
b/drivers/md/raid0.c @@ -588,14 +588,13 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) zone = find_zone(mddev->private, §or); tmp_dev = map_sector(mddev, zone, sector, §or); - bio->bi_bdev = tmp_dev->bdev; + bio_set_dev(bio, tmp_dev->bdev); bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, disk_devt(mddev->gendisk), - bio_sector); + trace_block_bio_remap(bio->bi_disk->queue, bio, + disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f50958ded9f0..baf5e358d22a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -786,13 +786,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void *)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1273,7 +1273,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; + bio_set_dev(read_bio, mirror->rdev->bdev); read_bio->bi_end_io = raid1_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &mirror->rdev->flags) && @@ -1282,9 +1282,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), - read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); + trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, + disk_devt(mddev->gendisk), r1_bio->sector); generic_make_request(read_bio); } @@ -1496,7 +1495,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_iter.bi_sector = (r1_bio->sector + conf->mirrors[i].rdev->data_offset); - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + bio_set_dev(mbio, conf->mirrors[i].rdev->bdev); mbio->bi_end_io = raid1_end_write_request; mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && @@ -1508,11 +1507,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + trace_block_bio_remap(mbio->bi_disk->queue, mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void*)conf->mirrors[i].rdev; + mbio->bi_disk = (void *)conf->mirrors[i].rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) @@ -1990,8 +1989,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) * Don't fail devices as that won't really help. 
*/ pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), + mdname(mddev), bio_devname(bio, b), (unsigned long long)r1_bio->sector); for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; @@ -2082,7 +2080,7 @@ static void process_checks(struct r1bio *r1_bio) b->bi_status = status; b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; - b->bi_bdev = conf->mirrors[i].rdev->bdev; + bio_set_dev(b, conf->mirrors[i].rdev->bdev); b->bi_end_io = end_sync_read; rp->raid_bio = r1_bio; b->bi_private = rp; @@ -2350,7 +2348,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; - wbio->bi_bdev = rdev->bdev; + bio_set_dev(wbio, rdev->bdev); if (submit_bio_wait(wbio) < 0) /* failure! */ @@ -2440,7 +2438,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct mddev *mddev = conf->mddev; struct bio *bio; struct md_rdev *rdev; - dev_t bio_dev; sector_t bio_sector; clear_bit(R1BIO_ReadError, &r1_bio->state); @@ -2454,7 +2451,6 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) */ bio = r1_bio->bios[r1_bio->read_disk]; - bio_dev = bio->bi_bdev->bd_dev; bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; bio_put(bio); r1_bio->bios[r1_bio->read_disk] = NULL; @@ -2727,7 +2723,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io) { atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; } @@ -2853,7 +2849,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { read_targets--; - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); @@ -2862,7 +2858,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f55d4cc085f6..d1f948e371e0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -901,13 +901,13 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void*)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1085,13 +1085,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; + struct md_rdev *rdev = (void*)bio->bi_disk; bio->bi_next = NULL; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); if 
(test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + !blk_queue_discard(bio->bi_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1200,7 +1200,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); - read_bio->bi_bdev = rdev->bdev; + bio_set_dev(read_bio, rdev->bdev); read_bio->bi_end_io = raid10_end_read_request; bio_set_op_attrs(read_bio, op, do_sync); if (test_bit(FailFast, &rdev->flags) && @@ -1209,7 +1209,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r10_bio->sector); generic_make_request(read_bio); @@ -1249,7 +1249,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); - mbio->bi_bdev = rdev->bdev; + bio_set_dev(mbio, rdev->bdev); mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); if (!replacement && test_bit(FailFast, @@ -1259,11 +1259,11 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + trace_block_bio_remap(mbio->bi_disk->queue, mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_bdev = (void *)rdev; + mbio->bi_disk = (void *)rdev; atomic_inc(&r10_bio->remaining); @@ -2094,7 +2094,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; - tbio->bi_bdev = conf->mirrors[d].rdev->bdev; + bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); generic_make_request(tbio); } @@ -2552,7 +2552,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + choose_data_offset(r10_bio, rdev); - wbio->bi_bdev = rdev->bdev; + bio_set_dev(wbio, rdev->bdev); bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (submit_bio_wait(wbio) < 0) @@ -2575,7 +2575,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - dev_t bio_dev; sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just @@ -2587,7 +2586,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_dev = bio->bi_bdev->bd_dev; bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; @@ -2950,7 +2948,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_bdev set, + * have bi_end_io, bi_sector, bi_disk set, * and bi_private set to the r10bio. 
* For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. @@ -3095,7 +3093,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, from_addr = r10_bio->devs[j].addr; bio->bi_iter.bi_sector = from_addr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); atomic_inc(&rdev->nr_pending); /* and we write to 'i' (if not in_sync) */ @@ -3117,7 +3115,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + mrdev->data_offset; - bio->bi_bdev = mrdev->bdev; + bio_set_dev(bio, mrdev->bdev); atomic_inc(&r10_bio->remaining); } else r10_bio->devs[1].bio->bi_end_io = NULL; @@ -3143,7 +3141,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_iter.bi_sector = to_addr + mreplace->data_offset; - bio->bi_bdev = mreplace->bdev; + bio_set_dev(bio, mreplace->bdev); atomic_inc(&r10_bio->remaining); break; } @@ -3289,7 +3287,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); count++; rdev = rcu_dereference(conf->mirrors[d].replacement); @@ -3311,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; + bio_set_dev(bio, rdev->bdev); count++; rcu_read_unlock(); } @@ -3367,7 +3365,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { - md_sync_acct(bio->bi_bdev, nr_sectors); + md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; generic_make_request(bio); } @@ -4383,7 +4381,7 @@ read_more: read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); - read_bio->bi_bdev = rdev->bdev; + bio_set_dev(read_bio, rdev->bdev); read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + rdev->data_offset); read_bio->bi_private = r10_bio; @@ -4417,7 +4415,7 @@ read_more: if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; - b->bi_bdev = rdev2->bdev; + bio_set_dev(b, rdev2->bdev); b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; b->bi_end_io = end_reshape_write; @@ -4449,7 +4447,7 @@ read_more: r10_bio->sectors = nr_sectors; /* Now submit the read */ - md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); + md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; generic_make_request(read_bio); @@ -4511,7 +4509,7 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) } atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - md_sync_acct(b->bi_bdev, r10_bio->sectors); + md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; generic_make_request(b); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..f253a9c583c1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -728,7 +728,7 @@ static struct bio *r5l_bio_alloc(struct r5l_log *log) struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - bio->bi_bdev = 
log->rdev->bdev; + bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; return bio; @@ -1291,7 +1291,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) if (!do_flush) return; bio_reset(&log->flush_bio); - log->flush_bio.bi_bdev = log->rdev->bdev; + bio_set_dev(&log->flush_bio, log->rdev->bdev); log->flush_bio.bi_end_io = r5l_log_flush_endio; log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); @@ -1669,7 +1669,7 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, sector_t offset) { bio_reset(ctx->ra_bio); - ctx->ra_bio->bi_bdev = log->rdev->bdev; + bio_set_dev(ctx->ra_bio, log->rdev->bdev); bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 44ad5baf3206..1e237c40d6fa 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -415,7 +415,7 @@ static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", __func__, io->seq, bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector, - bdevname(bio->bi_bdev, b)); + bio_devname(bio, b)); submit_bio(bio); } @@ -453,7 +453,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; - bio->bi_bdev = log->rdev->bdev; + bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->rdev->ppl.sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); @@ -468,7 +468,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, ppl_conf->bs); bio->bi_opf = prev->bi_opf; - bio->bi_bdev = prev->bi_bdev; + bio_copy_dev(bio, prev); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d687aeb1b538..3ae8bbceb6c4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1096,7 +1096,7 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bi->bi_bdev = rdev->bdev; + bio_set_dev(bi, rdev->bdev); bio_set_op_attrs(bi, op, op_flags); bi->bi_end_io = op_is_write(op) ? 
raid5_end_write_request @@ -1145,7 +1145,7 @@ again: set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + trace_block_bio_remap(bi->bi_disk->queue, bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); if (should_defer && op_is_write(op)) @@ -1160,7 +1160,7 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - rbi->bi_bdev = rrdev->bdev; + bio_set_dev(rbi, rrdev->bdev); bio_set_op_attrs(rbi, op, op_flags); BUG_ON(!op_is_write(op)); rbi->bi_end_io = raid5_end_write_request; @@ -1193,7 +1193,7 @@ again: if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + trace_block_bio_remap(rbi->bi_disk->queue, rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); if (should_defer && op_is_write(op)) @@ -5233,7 +5233,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; - align_bi->bi_bdev = rdev->bdev; + bio_set_dev(align_bi, rdev->bdev); bio_clear_flag(align_bi, BIO_SEG_VALID); if (is_badblock(rdev, align_bi->bi_iter.bi_sector, @@ -5255,7 +5255,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + trace_block_bio_remap(align_bi->bi_disk->queue, align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); generic_make_request(align_bi); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 73062da3177f..a87f793f2945 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -390,7 +390,7 @@ int nd_region_activate(struct nd_region *nd_region); void __nd_iostat_start(struct bio *bio, unsigned long *start); static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) { - struct gendisk *disk = bio->bi_bdev->bd_disk; + struct gendisk *disk = bio->bi_disk; if (!blk_queue_io_stat(disk->queue)) return false; @@ -402,7 +402,7 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) } static inline void nd_iostat_end(struct bio *bio, unsigned long start) { - struct gendisk *disk = bio->bi_bdev->bd_disk; + struct gendisk *disk = bio->bi_disk; generic_end_io_acct(disk->queue, bio_data_dir(bio), &disk->part0, start); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c49f1f8b2e57..f03452db7938 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -613,11 +613,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, if (!disk) goto submit; - bio->bi_bdev = bdget_disk(disk, 0); - if (!bio->bi_bdev) { - ret = -ENODEV; - goto out_unmap; - } + bio->bi_disk = disk; if (meta_buffer && meta_len) { struct bio_integrity_payload *bip; @@ -668,11 +664,8 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, out_free_meta: kfree(meta); out_unmap: - if (bio) { - if (disk && bio->bi_bdev) - bdput(bio->bi_bdev); + if (bio) blk_rq_unmap_user(bio); - } out: blk_mq_free_request(req); return ret; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index be8541335e31..c1a28569e843 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -643,17 +643,9 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - if (!disk) - goto submit; - - bio->bi_bdev = bdget_disk(disk, 0); - if (!bio->bi_bdev) { - ret 
= -ENODEV; - goto err_meta; - } + bio->bi_disk = disk; } -submit: blk_execute_rq(q, NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) @@ -673,11 +665,8 @@ err_meta: if (meta_buf && meta_len) dma_pool_free(dev->dma_pool, metadata, metadata_dma); err_map: - if (bio) { - if (disk && bio->bi_bdev) - bdput(bio->bi_bdev); + if (bio) blk_rq_unmap_user(bio); - } err_ppa: if (ppa_buf && ppa_len) dma_pool_free(dev->dma_pool, ppa_list, ppa_dma); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 3b4d47a6abdb..0d4c23dc4532 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -68,7 +68,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) nvmet_inline_bio_init(req); bio = &req->inline_bio; - bio->bi_bdev = req->ns->bdev; + bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; @@ -80,7 +80,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) struct bio *prev = bio; bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); - bio->bi_bdev = req->ns->bdev; + bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio_set_op_attrs(bio, op, op_flags); @@ -104,7 +104,7 @@ static void nvmet_execute_flush(struct nvmet_req *req) nvmet_inline_bio_init(req); bio = &req->inline_bio; - bio->bi_bdev = req->ns->bdev; + bio_set_dev(bio, req->ns->bdev); bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 68bae4f6bd88..7abb240847c0 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -856,14 +856,14 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio); bytes_done = 0; - dev_info = bio->bi_bdev->bd_disk->private_data; + dev_info = bio->bi_disk->private_data; if (dev_info == NULL) goto fail; if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) { + if (bio_end_sector(bio) > get_capacity(bio->bi_disk)) { /* Request beyond end of DCSS segment. 
*/ goto fail; } diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index a48f0d40c1d2..571a0709e1e5 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -183,7 +183,7 @@ static unsigned long xpram_highest_page_index(void) */ static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio) { - xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; + xpram_device_t *xdev = bio->bi_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; unsigned int index; diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index ee7c7fa55dad..07c814c42648 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -338,7 +338,7 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num, int op, return NULL; } - bio->bi_bdev = ib_dev->ibd_bd; + bio_set_dev(bio, ib_dev->ibd_bd); bio->bi_private = cmd; bio->bi_end_io = &iblock_bio_done; bio->bi_iter.bi_sector = lba; @@ -395,7 +395,7 @@ iblock_execute_sync_cache(struct se_cmd *cmd) bio = bio_alloc(GFP_KERNEL, 0); bio->bi_end_io = iblock_end_io_flush; - bio->bi_bdev = ib_dev->ibd_bd; + bio_set_dev(bio, ib_dev->ibd_bd); bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; if (!immed) bio->bi_private = cmd; diff --git a/fs/block_dev.c b/fs/block_dev.c index d29d1c70f833..bb715b2fcfb8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -223,7 +223,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } bio_init(&bio, vecs, nr_pages); - bio.bi_bdev = bdev; + bio_set_dev(&bio, bdev); bio.bi_iter.bi_sector = pos >> 9; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; @@ -362,7 +362,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) blk_start_plug(&plug); for (;;) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9d3854839038..fb07e3c22b9a 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1635,7 +1635,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; bio = btrfs_io_bio_alloc(num_pages - i); - bio->bi_bdev = block_ctx->dev->bdev; + bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2819,10 +2819,10 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, - dev_bytenr, bio->bi_bdev); + dev_bytenr, bio->bi_disk); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2851,8 +2851,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & 
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); + pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_disk); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..0640c27e63e9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3499,7 +3499,7 @@ static void write_dev_flush(struct btrfs_device *device) bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; + bio_set_dev(bio, device->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0aff9b278c19..42b12a85ab49 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2033,7 +2033,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_put(bio); return -EIO; } - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); @@ -2335,7 +2335,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio->bi_bdev = fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, fs_info->fs_devices->latest_bdev); bio->bi_iter.bi_size = 0; bio->bi_private = data; @@ -2675,7 +2675,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 208638384cd2..d268cb633735 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1090,7 +1090,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && stripe->dev->bdev && !last->bi_status && - last->bi_bdev == stripe->dev->bdev) { + last->bi_disk == stripe->dev->bdev->bd_disk && + last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1100,7 +1101,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; - bio->bi_bdev = stripe->dev->bdev; + bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1347,7 +1348,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && - bio->bi_bdev == stripe->dev->bdev) { + bio->bi_disk == stripe->dev->bdev->bd_disk && + bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6f1e4c984b94..b0b71e8e4c36 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1738,7 +1738,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page->dev->bdev; + bio_set_dev(bio, page->dev->bdev); bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && 
scrub_is_page_on_raid56(page)) { @@ -1826,7 +1826,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, } bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page_bad->dev->bdev; + bio_set_dev(bio, page_bad->dev->bdev); bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1921,7 +1921,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->status = 0; @@ -1964,7 +1964,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); + WARN_ON(!sbio->bio->bi_disk); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which @@ -2321,7 +2321,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->status = 0; @@ -4627,7 +4627,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e8b9a269fdde..f9f0f474a64f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6188,7 +6188,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, rcu_read_unlock(); } #endif - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..50e51a67dc78 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3057,7 +3057,7 @@ void guard_bio_eod(int op, struct bio *bio) struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; @@ -3116,7 +3116,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, } bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 6181e9526860..483784d5eb73 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -115,7 +115,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, err = -ENOMEM; goto errout; } - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); diff --git a/fs/direct-io.c b/fs/direct-io.c index 08cf27811e5a..5fa2211e49ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -111,7 +111,7 @@ struct dio { int op; int op_flags; blk_qc_t bio_cookie; - struct block_device *bio_bdev; + struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ @@ -377,7 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ bio = bio_alloc(GFP_KERNEL, nr_vecs); 
- bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) @@ -412,7 +412,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_bdev = bio->bi_bdev; + dio->bio_disk = bio->bi_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); @@ -458,7 +458,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8bb72807e70d..3c6a9c156b7a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -869,7 +869,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - bio->bi_bdev = NULL; + bio->bi_disk = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index c2fce4478cca..55ad7dd149d0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -300,7 +300,7 @@ static void ext4_end_bio(struct bio *bio) char b[BDEVNAME_SIZE]; if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bdevname(bio->bi_bdev, b), + bio_devname(bio, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { @@ -375,7 +375,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, return -ENOMEM; wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 40a5497b0f60..04c90643af7a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -254,7 +254,7 @@ int ext4_mpage_readpages(struct address_space *mapping, fscrypt_release_ctx(ctx); goto set_error_page; } - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87c1f4150c64..a791aac4c5af 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -142,7 +142,7 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); } return bdev; @@ -161,7 +161,8 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static bool __same_bdev(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { - return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; + struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); + return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f964b68718c1..6f8fc4a6e701 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -447,7 +447,7 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, int ret; bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); 
bio_put(bio); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9edd177..720c19ada0f9 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -265,7 +265,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f879..39433a173baa 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -221,7 +221,7 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], bio = bio_alloc(GFP_NOIO, num); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d34b74..8155e16076e1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -242,7 +242,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e254fa0f0697..10032b919a85 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,7 +65,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) diff --git a/fs/iomap.c b/fs/iomap.c index 039266128b7f..77be8850997b 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -805,7 +805,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_private = dio; @@ -884,7 +884,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return 0; bio = bio_alloc(GFP_KERNEL, nr_pages); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_write_hint = dio->iocb->ki_hint; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21f0e9eecd4..0e5d412c0b01 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1995,7 +1995,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2139,7 +2139,7 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 65120a471729..1c4b9ad4d7ab 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -430,7 +430,7 @@ static int metapage_writepage(struct page 
*page, struct writeback_control *wbc) len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -510,7 +510,7 @@ static int metapage_readpage(struct file *fp, struct page *page) submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; diff --git a/fs/mpage.c b/fs/mpage.c index 2e4c41ccb5c9..37bb77c1302c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -83,7 +83,7 @@ mpage_alloc(struct block_device *bdev, } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; } return bio; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d8863a804b15..995d707537da 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -130,7 +130,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, if (bio) { bio->bi_iter.bi_sector = disk_sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_end_io = end_io; bio->bi_private = par; } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e73c86d9855c..6c5009cc4e6f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -400,7 +400,7 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { - bio->bi_bdev = nilfs->ns_bdev; + bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = start << (nilfs->ns_blocksize_bits - 9); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..6aea15746a56 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -554,7 +554,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... 
*/ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio->bi_bdev = reg->hr_bdev; + bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; bio_set_op_attrs(bio, op, op_flags); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6bf120bb1a17..c8ca03a5a08f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -517,7 +517,7 @@ xfs_init_bio_from_bh( struct buffer_head *bh) { bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); } static struct xfs_ioend * diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 72f038492ba8..b1c9711e79a4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1281,7 +1281,7 @@ next_chunk: nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = bp->b_target->bt_bdev; + bio_set_dev(bio, bp->b_target->bt_bdev); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; diff --git a/include/linux/bio.h b/include/linux/bio.h index 9276788a9b24..a8fe7935332f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -494,6 +494,24 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +#define bio_set_dev(bio, bdev) \ +do { \ + (bio)->bi_disk = (bdev)->bd_disk; \ + (bio)->bi_partno = (bdev)->bd_partno; \ +} while (0) + +#define bio_copy_dev(dst, src) \ +do { \ + (dst)->bi_disk = (src)->bi_disk; \ + (dst)->bi_partno = (src)->bi_partno; \ +} while (0) + +#define bio_dev(bio) \ + disk_devt((bio)->bi_disk) + +#define bio_devname(bio, buf) \ + __bdevname(bio_dev(bio), (buf)) + #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d2eb87c84d82..a2d2aa709cef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -48,7 +48,8 @@ struct blk_issue_stat { */ struct bio { struct bio *bi_next; /* request queue link */ - struct block_device *bi_bdev; + struct gendisk *bi_disk; + u8 bi_partno; blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. 
Use diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index df3e9ae5ad8d..daf749138ff8 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(bcache_request, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->orig_major = d->disk->major; __entry->orig_minor = d->disk->first_minor; __entry->sector = bio->bi_iter.bi_sector; @@ -98,7 +98,7 @@ DECLARE_EVENT_CLASS(bcache_bio, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -133,7 +133,7 @@ TRACE_EVENT(bcache_read, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d0dbe60d8a6d..f815aaaef755 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -236,8 +236,7 @@ TRACE_EVENT(block_bio_bounce, ), TP_fast_assign( - __entry->dev = bio->bi_bdev ? - bio->bi_bdev->bd_dev : 0; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -274,7 +273,7 @@ TRACE_EVENT(block_bio_complete, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; @@ -302,7 +301,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -369,7 +368,7 @@ TRACE_EVENT(block_bio_queue, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -397,7 +396,8 @@ DECLARE_EVENT_CLASS(block_get_rq, ), TP_fast_assign( - __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->dev = bio ? bio_dev(bio) : 0; + __entry->dev = bio_dev(bio); __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? 
bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, @@ -532,7 +532,7 @@ TRACE_EVENT(block_split, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); @@ -573,7 +573,7 @@ TRACE_EVENT(block_bio_remap, ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; + __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 6f77a2755abb..bc4dd7837e4c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->target = bio->bi_bdev->bd_dev; + __entry->target = bio_dev(bio); __entry->op = bio_op(bio); __entry->op_flags = bio->bi_opf; __entry->type = type; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 57d22571f306..d7cdc426ee38 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio) if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } @@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = hib_resume_bdev; + bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7724de18d2fe..2a685b45b73b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -963,7 +963,7 @@ static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, diff --git a/mm/page_io.c b/mm/page_io.c index b6c4ac388209..9cf1bc751d79 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -31,7 +31,10 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio = bio_alloc(gfp_flags, 1); if (bio) { - bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); + struct block_device *bdev; + + bio->bi_iter.bi_sector = map_swap_page(page, &bdev); + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_end_io = end_io; @@ -57,8 +60,7 @@ void end_swap_bio_write(struct bio *bio) */ set_page_dirty(page); pr_alert("Write-error on swap-device (%u:%u:%llu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } @@ -123,8 +125,7 @@ static void end_swap_bio_read(struct bio *bio) SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@ -338,7 +339,7 @@ int swap_readpage(struct page *page, bool do_poll) int ret = 0; struct swap_info_struct *sis = page_swap_info(page); blk_qc_t 
qc; - struct block_device *bdev; + struct gendisk *disk; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -377,7 +378,7 @@ int swap_readpage(struct page *page, bool do_poll) ret = -ENOMEM; goto out; } - bdev = bio->bi_bdev; + disk = bio->bi_disk; bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); @@ -388,7 +389,7 @@ int swap_readpage(struct page *page, bool do_poll) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_mq_poll(bdev_get_queue(bdev), qc)) + if (!blk_mq_poll(disk->queue, qc)) break; } __set_current_state(TASK_RUNNING); -- cgit v1.2.3
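A minimal usage sketch (not from any patch in this series) of the helpers introduced above: after the conversion, a driver fills in the bio's destination with bio_set_dev() and derives the dev_t and name with bio_dev()/bio_devname() instead of touching bi_bdev. Here "struct my_dev" and my_dev_submit_page() are hypothetical stand-ins for a real driver's context and entry point.

/*
 * Illustrative only: "struct my_dev" is a made-up driver context with a
 * struct block_device *bdev member pointing at the target device.
 */
static void my_dev_submit_page(struct my_dev *dev, struct page *page,
			       sector_t sector)
{
	char name[BDEVNAME_SIZE];
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio_set_dev(bio, dev->bdev);	/* sets bi_disk and bi_partno */
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	/* device number and name now come from the bio helpers */
	pr_debug("%s (%u:%u): sector %llu\n", bio_devname(bio, name),
		 MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
		 (unsigned long long)bio->bi_iter.bi_sector);

	submit_bio(bio);
}

The same substitution pattern runs through all of the hunks above: stores to bi_bdev become bio_set_dev(), bdev_get_queue(bio->bi_bdev) becomes bio->bi_disk->queue, and bdevname(bio->bi_bdev, b) becomes bio_devname(bio, b).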
From 1e6ec9ea89d30739b9447c1860fcb07fc29f3aef Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 23 Aug 2017 14:54:59 -0700 Subject: Revert "loop: support 4k physical blocksize" There's some stuff still up in the air, let's not get stuck with a subpar ABI. I'll follow up with something better for 4.14. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 42 ++++++------------------------------------ drivers/block/loop.h | 1 - include/uapi/linux/loop.h | 3 --- 3 files changed, 6 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ef8334949b42..f321b96405f5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -221,8 +221,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) } static int -figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, - loff_t logical_blocksize) +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; @@ -234,12 +233,6 @@ figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit, lo->lo_offset = offset; if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; - if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) { - lo->lo_logical_blocksize = logical_blocksize; - blk_queue_physical_block_size(lo->lo_queue, lo->lo_blocksize); - blk_queue_logical_block_size(lo->lo_queue, - lo->lo_logical_blocksize); - } set_capacity(lo->lo_disk, x); bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); /* let user-space know about the new size */ @@ -820,7 +813,6 @@ static void loop_config_discard(struct loop_device *lo) struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; - int lo_bits = 9; /* * We use punch hole to reclaim the free space used by the @@ -840,11 +832,9 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - if (lo->lo_flags & LO_FLAGS_BLOCKSIZE) - lo_bits = blksize_bits(lo->lo_logical_blocksize); - blk_queue_max_discard_sectors(q, UINT_MAX >> lo_bits); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> lo_bits); + blk_queue_max_discard_sectors(q, UINT_MAX >> 9); + blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } @@ -938,7 +928,6 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->use_dio = false; lo->lo_blocksize = lo_blocksize; - lo->lo_logical_blocksize = 512; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; @@ -1104,7 +1093,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); - int lo_flags = lo->lo_flags; if
(lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && @@ -1137,26 +1125,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (err) goto exit; - if (info->lo_flags & LO_FLAGS_BLOCKSIZE) { - if (!(lo->lo_flags & LO_FLAGS_BLOCKSIZE)) - lo->lo_logical_blocksize = 512; - lo->lo_flags |= LO_FLAGS_BLOCKSIZE; - if (LO_INFO_BLOCKSIZE(info) != 512 && - LO_INFO_BLOCKSIZE(info) != 1024 && - LO_INFO_BLOCKSIZE(info) != 2048 && - LO_INFO_BLOCKSIZE(info) != 4096) - return -EINVAL; - if (LO_INFO_BLOCKSIZE(info) > lo->lo_blocksize) - return -EINVAL; - } - if (lo->lo_offset != info->lo_offset || - lo->lo_sizelimit != info->lo_sizelimit || - lo->lo_flags != lo_flags || - ((lo->lo_flags & LO_FLAGS_BLOCKSIZE) && - lo->lo_logical_blocksize != LO_INFO_BLOCKSIZE(info))) { - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit, - LO_INFO_BLOCKSIZE(info))) { + lo->lo_sizelimit != info->lo_sizelimit) { + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { err = -EFBIG; goto exit; } @@ -1348,8 +1319,7 @@ static int loop_set_capacity(struct loop_device *lo) if (unlikely(lo->lo_state != Lo_bound)) return -ENXIO; - return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit, - lo->lo_logical_blocksize); + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); } static int loop_set_dio(struct loop_device *lo, unsigned long arg) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 2c096b9a17b8..fecd3f97ef8c 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -49,7 +49,6 @@ struct loop_device { struct file * lo_backing_file; struct block_device *lo_device; unsigned lo_blocksize; - unsigned lo_logical_blocksize; void *key_data; gfp_t old_gfp_mask; diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index a3960f98679c..c8125ec1f4f2 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -22,7 +22,6 @@ enum { LO_FLAGS_AUTOCLEAR = 4, LO_FLAGS_PARTSCAN = 8, LO_FLAGS_DIRECT_IO = 16, - LO_FLAGS_BLOCKSIZE = 32, }; #include <asm/posix_types.h> /* for __kernel_old_dev_t */ @@ -60,8 +59,6 @@ struct loop_info64 { __u64 lo_init[2]; }; -#define LO_INFO_BLOCKSIZE(l) (l)->lo_init[0] - /* * Loop filter types */ -- cgit v1.2.3 From 69a6beab085264cdcdb9e3160b1a2ae61b2efde1 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Mon, 14 Aug 2017 12:26:34 +0200 Subject: clk: msm8996-gcc: add missing smmu clks This patch adds missing LPASS smmu clks which are required by the audio driver.
Signed-off-by: Srinivas Kandagatla Signed-off-by: Stephen Boyd --- drivers/clk/qcom/gcc-msm8996.c | 28 ++++++++++++++++++++++++++++ include/dt-bindings/clock/qcom,gcc-msm8996.h | 2 ++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/clk/qcom/gcc-msm8996.c b/drivers/clk/qcom/gcc-msm8996.c index 8abc200d4fd3..7ddec886fcd3 100644 --- a/drivers/clk/qcom/gcc-msm8996.c +++ b/drivers/clk/qcom/gcc-msm8996.c @@ -2730,6 +2730,32 @@ static struct clk_fixed_factor ufs_rx_cfg_clk_src = { }, }; +static struct clk_branch gcc_hlos1_vote_lpass_core_smmu_clk = { + .halt_reg = 0x7d010, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x7d010, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "hlos1_vote_lpass_core_smmu_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + +static struct clk_branch gcc_hlos1_vote_lpass_adsp_smmu_clk = { + .halt_reg = 0x7d014, + .halt_check = BRANCH_HALT_VOTED, + .clkr = { + .enable_reg = 0x7d014, + .enable_mask = BIT(0), + .hw.init = &(struct clk_init_data){ + .name = "hlos1_vote_lpass_adsp_smmu_clk", + .ops = &clk_branch2_ops, + }, + }, +}; + static struct clk_branch gcc_ufs_rx_cfg_clk = { .halt_reg = 0x75014, .clkr = { @@ -3307,6 +3333,8 @@ static struct clk_regmap *gcc_msm8996_clocks[] = { [GCC_UFS_AHB_CLK] = &gcc_ufs_ahb_clk.clkr, [GCC_UFS_TX_CFG_CLK] = &gcc_ufs_tx_cfg_clk.clkr, [GCC_UFS_RX_CFG_CLK] = &gcc_ufs_rx_cfg_clk.clkr, + [GCC_HLOS1_VOTE_LPASS_CORE_SMMU_CLK] = &gcc_hlos1_vote_lpass_core_smmu_clk.clkr, + [GCC_HLOS1_VOTE_LPASS_ADSP_SMMU_CLK] = &gcc_hlos1_vote_lpass_adsp_smmu_clk.clkr, [GCC_UFS_TX_SYMBOL_0_CLK] = &gcc_ufs_tx_symbol_0_clk.clkr, [GCC_UFS_RX_SYMBOL_0_CLK] = &gcc_ufs_rx_symbol_0_clk.clkr, [GCC_UFS_RX_SYMBOL_1_CLK] = &gcc_ufs_rx_symbol_1_clk.clkr, diff --git a/include/dt-bindings/clock/qcom,gcc-msm8996.h b/include/dt-bindings/clock/qcom,gcc-msm8996.h index 1f5c42254798..75b07cf5eed0 100644 --- a/include/dt-bindings/clock/qcom,gcc-msm8996.h +++ b/include/dt-bindings/clock/qcom,gcc-msm8996.h @@ -233,6 +233,8 @@ #define GCC_PCIE_CLKREF_CLK 216 #define GCC_RX2_USB2_CLKREF_CLK 217 #define GCC_RX1_USB2_CLKREF_CLK 218 +#define GCC_HLOS1_VOTE_LPASS_CORE_SMMU_CLK 219 +#define GCC_HLOS1_VOTE_LPASS_ADSP_SMMU_CLK 220 #define GCC_SYSTEM_NOC_BCR 0 #define GCC_CONFIG_NOC_BCR 1 -- cgit v1.2.3 From 143c97cc652949893c8056c679012f0aeccb80e5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Aug 2017 18:16:11 -0700 Subject: Revert "pty: fix the cached path of the pty slave file descriptor in the master" This reverts commit c8c03f1858331e85d397bacccd34ef409aae993c. It turns out that while fixing the ptmx file descriptor to have the correct 'struct path' to the associated slave pty is a really good thing, it breaks some user space tools for a very annoying reason. The problem is that /dev/ptmx and its associated slave pty (/dev/pts/X) are on different mounts. That was what caused us to have the wrong path in the first place (we would mix up the vfsmount of the 'ptmx' node, with the dentry of the pty slave node), but it also means that now while we use the right vfsmount, having the pty master open also keeps the pts mount busy. And it turns out that that makes 'pbuilder' very unhappy, as noted by Stefan Lippers-Hollmann: "This patch introduces a regression for me when using pbuilder 0.228.7[2] (a helper to build Debian packages in a chroot and to create and update its chroots) when trying to umount /dev/ptmx (inside the chroot) on Debian/ unstable (full log and pbuilder configuration file[3] attached). [...]
Setting up build-essential (12.3) ... Processing triggers for libc-bin (2.24-15) ... I: unmounting dev/ptmx filesystem W: Could not unmount dev/ptmx: umount: /var/cache/pbuilder/build/1340/dev/ptmx: target is busy (In some cases useful info about processes that use the device is found by lsof(8) or fuser(1).)" apparently pbuilder tries to unmount the /dev/pts filesystem while still holding at least one master node open, which is arguably not very nice, but we don't break user space even when fixing other bugs. So this commit has to be reverted. I'll try to figure out a way to avoid caching the path to the slave pty in the master pty. The only thing that actually wants that slave pty path is the "TIOCGPTPEER" ioctl, and I think we could just recreate the path at that time. Reported-by: Stefan Lippers-Hollmann Cc: Eric W Biederman Cc: Christian Brauner Cc: Al Viro Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 ++----- fs/devpts/inode.c | 4 +--- include/linux/devpts_fs.h | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 1fc80ea87c13..284749fb0f6b 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,7 +793,6 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; - struct vfsmount *mnt; int retval; int index; @@ -806,7 +805,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp, &mnt); + fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -850,7 +849,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = mnt; + pts_path->mnt = filp->f_path.mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -867,7 +866,6 @@ err_path_put: path_put(pts_path); kfree(pts_path); err_release: - mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -876,7 +874,6 @@ out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); - mntput(mnt); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 44dfbca9306f..108df2e3602c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) +struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; struct path path; @@ -142,7 +142,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) path = filp->f_path; path_get(&path); - *ptsmnt = NULL; /* Has the devpts filesystem already been found? 
*/ sb = path.mnt->mnt_sb; @@ -166,7 +165,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); - *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 7883e901f65c..277ab9af9ac2 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); +struct pts_fs_info *devpts_acquire(struct file *); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); -- cgit v1.2.3 From c4335fdd38227788178953c101b77180504d7ea0 Mon Sep 17 00:00:00 2001 From: gengdongjiu Date: Thu, 17 Aug 2017 20:07:18 +0800 Subject: ACPI: APEI: fix the wrong iteration of generic error status block The revision 0x300 generic error data entry is different from the old version, but currently iterating through the GHES estatus blocks does not take this difference into account. This will lead to a failure to get the right data entry if the GHES has a revision 0x300 error data entry. Update the GHES estatus iteration macro to properly increment using acpi_hest_get_next(), and correct the iteration termination condition because the status block data length only includes the error data length. Convert the CPER estatus checking and printing iteration logic to use the same macro. Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/apei-internal.h | 5 ----- drivers/firmware/efi/cper.c | 12 ++---------- include/acpi/ghes.h | 5 +++++ 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index 6e9f14c0a71b..cb4126051f62 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -120,11 +120,6 @@ int apei_exec_collect_resources(struct apei_exec_context *ctx, struct dentry; struct dentry *apei_get_debugfs_dir(void); -#define apei_estatus_for_each_section(estatus, section) \ - for (section = (struct acpi_hest_generic_data *)(estatus + 1); \ - (void *)section - (void *)estatus < estatus->data_length; \ - section = (void *)(section+1) + section->error_data_length) - static inline u32 cper_estatus_len(struct acpi_hest_generic_status *estatus) { if (estatus->raw_data_length) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 48a8f69da42a..bf3672a81e49 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -606,7 +606,6 @@ void cper_estatus_print(const char *pfx, const struct acpi_hest_generic_status *estatus) { struct acpi_hest_generic_data *gdata; - unsigned int data_len; int sec_no = 0; char newpfx[64]; __u16 severity; @@ -617,14 +616,10 @@ void cper_estatus_print(const char *pfx, "It has been corrected by h/w " "and requires no further action"); printk("%s""event severity: %s\n", pfx, cper_severity_str(severity)); - data_len = estatus->data_length; - gdata = (struct acpi_hest_generic_data *)(estatus + 1); snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP); - while (data_len >= acpi_hest_get_size(gdata)) { + apei_estatus_for_each_section(estatus, gdata) { cper_estatus_print_section(newpfx, gdata, sec_no); - data_len -= acpi_hest_get_record_size(gdata); - gdata = acpi_hest_get_next(gdata); sec_no++; } }
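For illustration, a minimal sketch of a consumer walking a status block with the reworked macro; walk_estatus() and its pr_info() reporting are invented for this example, and only the GHES helpers introduced or kept by this patch are assumed:

#include <acpi/ghes.h>
#include <linux/printk.h>

/* Hypothetical walker: visit every generic error data entry, letting
 * acpi_hest_get_next() pick the right stride for both revision 0x300
 * and older entries. */
static void walk_estatus(struct acpi_hest_generic_status *estatus)
{
	struct acpi_hest_generic_data *gdata;
	int sec_no = 0;

	apei_estatus_for_each_section(estatus, gdata)
		pr_info("section %d: error data length %u\n",
			sec_no++, acpi_hest_get_error_length(gdata));
}

@@ -653,15 +648,12 @@ int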
cper_estatus_check(const struct acpi_hest_generic_status *estatus) if (rc) return rc; data_len = estatus->data_length; - gdata = (struct acpi_hest_generic_data *)(estatus + 1); - while (data_len >= acpi_hest_get_size(gdata)) { + apei_estatus_for_each_section(estatus, gdata) { gedata_len = acpi_hest_get_error_length(gdata); if (gedata_len > data_len - acpi_hest_get_size(gdata)) return -EINVAL; - data_len -= acpi_hest_get_record_size(gdata); - gdata = acpi_hest_get_next(gdata); } if (data_len) return -EINVAL; diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 9f26e01186ae..9061c5c743b3 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -113,6 +113,11 @@ static inline void *acpi_hest_get_next(struct acpi_hest_generic_data *gdata) return (void *)(gdata) + acpi_hest_get_record_size(gdata); } +#define apei_estatus_for_each_section(estatus, section) \ + for (section = (struct acpi_hest_generic_data *)(estatus + 1); \ + (void *)section - (void *)(estatus + 1) < estatus->data_length; \ + section = acpi_hest_get_next(section)) + int ghes_notify_sea(void); #endif /* GHES_H */ -- cgit v1.2.3 From 98aaa913b4ed250324429f0a9e6d5f77a3b5276c Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Tue, 22 Aug 2017 17:08:48 -0400 Subject: tcp: Extend SOF_TIMESTAMPING_RX_SOFTWARE to TCP recvmsg When SOF_TIMESTAMPING_RX_SOFTWARE is enabled for tcp sockets, return the timestamp corresponding to the highest sequence number data returned. Previously, skb->tstamp was overwritten when a TCP packet was placed in the out of order queue. While the packet is in the ooo queue, save the timestamp in the TCP_SKB_CB. This space is shared with the gso_* options which are only used on the tx path, and a previously unused 4 byte hole. When skbs are coalesced either in the sk_receive_queue or the out_of_order_queue, always choose the timestamp of the appended skb to maintain the invariant of returning the timestamp of the last byte in the recvmsg buffer. Signed-off-by: Mike Maloney Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/tcp.h | 9 +++++++- net/ipv4/tcp.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 35 ++++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 108 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a995004ae946..c614ff135b66 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -774,6 +774,12 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; + + /* Used to stash the receive timestamp while this skb is in the + * out of order queue, as skb->tstamp is overwritten by the + * rbnode. + */ + ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
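From user space the new behaviour can be observed with a sketch along these lines (untested, error handling elided; header availability varies by libc, and read_sw_rx_tstamp() is an invented name):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>
#include <linux/errqueue.h>

static void read_sw_rx_tstamp(int fd)
{
	int val = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	char data[2048], ctrl[CMSG_SPACE(sizeof(struct scm_timestamping))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;

	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
	if (recvmsg(fd, &msg, 0) <= 0)
		return;
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_TIMESTAMPING) {
			struct scm_timestamping ts;

			/* ts.ts[0] is the software timestamp of the last
			 * byte returned by this recvmsg() call */
			memcpy(&ts, CMSG_DATA(cm), sizeof(ts));
			printf("rx sw tstamp: %lld.%09ld\n",
			       (long long)ts.ts[0].tv_sec, ts.ts[0].tv_nsec);
		}
	}
}

@@ -790,7 +796,8 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked?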
*/ - unused:6; + has_rxtstamp:1, /* SKB has a RX timestamp */ + unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d25e3bcca66b..0cce4472b4a1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,6 +269,7 @@ #include #include #include +#include #include #include @@ -1695,6 +1696,61 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +static void tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping *tss) +{ + if (skb->tstamp) + tss->ts[0] = ktime_to_timespec(skb->tstamp); + else + tss->ts[0] = (struct timespec) {0}; + + if (skb_hwtstamps(skb)->hwtstamp) + tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp); + else + tss->ts[2] = (struct timespec) {0}; +} + +/* Similar to __sock_recv_timestamp, but does not require an skb */ +void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + struct scm_timestamping *tss) +{ + struct timeval tv; + bool has_timestamping = false; + + if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { + if (sock_flag(sk, SOCK_RCVTSTAMP)) { + if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS, + sizeof(tss->ts[0]), &tss->ts[0]); + } else { + tv.tv_sec = tss->ts[0].tv_sec; + tv.tv_usec = tss->ts[0].tv_nsec / 1000; + + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, + sizeof(tv), &tv); + } + } + + if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) + has_timestamping = true; + else + tss->ts[0] = (struct timespec) {0}; + } + + if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { + if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) + has_timestamping = true; + else + tss->ts[2] = (struct timespec) {0}; + } + + if (has_timestamping) { + tss->ts[1] = (struct timespec) {0}; + put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING, + sizeof(*tss), tss); + } +} + /* * This routine copies from a sock struct into the user buffer. * @@ -1716,6 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; + struct scm_timestamping tss; + bool has_tss = false; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -1911,6 +1969,10 @@ skip_copy: if (used + offset < skb->len) continue; + if (TCP_SKB_CB(skb)->has_rxtstamp) { + tcp_update_recv_tstamps(skb, &tss); + has_tss = true; + } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) @@ -1929,6 +1991,9 @@ skip_copy: * on connected socket. I was just happy when found this 8) --ANK */ + if (has_tss) + tcp_recv_timestamp(msg, sk, &tss); + /* Clean up data we have read: This will do ACK frames. 
*/ tcp_cleanup_rbuf(sk, copied); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d3421ee9a10a..568ccfd6dd37 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4246,9 +4246,15 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +enum tcp_queue { + OOO_QUEUE, + RCV_QUEUE, +}; + /** * tcp_try_coalesce - try to merge skb to prior one * @sk: socket + * @dest: destination queue * @to: prior buffer * @from: buffer to add in queue * @fragstolen: pointer to boolean @@ -4260,6 +4266,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) * Returns true if caller should free @from instead of queueing it */ static bool tcp_try_coalesce(struct sock *sk, + enum tcp_queue dest, struct sk_buff *to, struct sk_buff *from, bool *fragstolen) @@ -4281,6 +4288,15 @@ static bool tcp_try_coalesce(struct sock *sk, TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + + if (TCP_SKB_CB(from)->has_rxtstamp) { + TCP_SKB_CB(to)->has_rxtstamp = true; + if (dest == OOO_QUEUE) + TCP_SKB_CB(to)->swtstamp = TCP_SKB_CB(from)->swtstamp; + else + to->tstamp = from->tstamp; + } + return true; } @@ -4315,6 +4331,9 @@ static void tcp_ofo_queue(struct sock *sk) } p = rb_next(p); rb_erase(&skb->rbnode, &tp->out_of_order_queue); + /* Replace tstamp which was stomped by rbnode */ + if (TCP_SKB_CB(skb)->has_rxtstamp) + skb->tstamp = TCP_SKB_CB(skb)->swtstamp; if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); @@ -4326,7 +4345,8 @@ static void tcp_ofo_queue(struct sock *sk) TCP_SKB_CB(skb)->end_seq); tail = skb_peek_tail(&sk->sk_receive_queue); - eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); + eaten = tail && tcp_try_coalesce(sk, RCV_QUEUE, + tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) @@ -4380,6 +4400,10 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } + /* Stash tstamp to avoid being stomped on by rbnode */ + if (TCP_SKB_CB(skb)->has_rxtstamp) + TCP_SKB_CB(skb)->swtstamp = skb->tstamp; + inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4405,7 +4429,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) /* In the typical case, we are adding an skb to the end of the list. * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ - if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { + if (tcp_try_coalesce(sk, OOO_QUEUE, tp->ooo_last_skb, + skb, &fragstolen)) { coalesce_done: tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); @@ -4455,7 +4480,8 @@ coalesce_done: __kfree_skb(skb1); goto merge_right; } - } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + } else if (tcp_try_coalesce(sk, OOO_QUEUE, skb1, + skb, &fragstolen)) { goto coalesce_done; } p = &parent->rb_right; @@ -4506,7 +4532,8 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int __skb_pull(skb, hdrlen); eaten = (tail && - tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; + tcp_try_coalesce(sk, RCV_QUEUE, tail, + skb, fragstolen)) ? 
1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { __skb_queue_tail(&sk->sk_receive_queue, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5af8b809dfbc..a63486afa7a7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1637,6 +1637,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d79a1af3252e..abba3bc2a3d9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1394,6 +1394,8 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } static int tcp_v6_rcv(struct sk_buff *skb) -- cgit v1.2.3 From cd0a137acbb66208368353723f5f1480995cf1c4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 15:12:14 -0700 Subject: net: core: Specify skb_pad()/skb_put_padto() SKB freeing Rename skb_pad() into __skb_pad() and make it take a third argument: free_on_error, which controls whether kfree_skb() should be called or not; skb_pad() makes direct use of it and passes true to preserve its existing behavior. Do exactly the same thing with __skb_put_padto() and skb_put_padto(). Suggested-by: David Miller Signed-off-by: Florian Fainelli Reviewed-by: Woojung Huh Signed-off-by: David S. Miller --- include/linux/skbuff.h | 41 +++++++++++++++++++++++++++++++++++++---- net/core/skbuff.c | 13 ++++++++----- 2 files changed, 45 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbe29b6c9bd6..d67a8182e5eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -973,7 +973,23 @@ int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); -int skb_pad(struct sk_buff *skb, int pad); +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. + */ +static inline int skb_pad(struct sk_buff *skb, int pad) +{ + return __skb_pad(skb, pad, true); +} #define dev_kfree_skb(a) consume_skb(a)
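As a usage illustration, a transmit path that wants to requeue rather than lose a packet can now opt out of the implicit free; the hypothetical_xmit() and hypothetical_hw_queue() names below are invented, and ETH_ZLEN stands in for whatever minimum frame length the hardware requires:

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static netdev_tx_t hypothetical_hw_queue(struct sk_buff *skb,
					 struct net_device *dev); /* invented hardware hook */

static netdev_tx_t hypothetical_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	/* Pad to the minimum frame size, but keep the skb alive on
	 * failure so it is safe to return NETDEV_TX_BUSY and retry. */
	if (__skb_put_padto(skb, ETH_ZLEN, false))
		return NETDEV_TX_BUSY;

	return hypothetical_hw_queue(skb, dev);
}

int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, @@ -2825,25 +2841,42 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad * @len: minimal length + * @free_on_error: free buffer on error * * Pads up a buffer to ensure the trailing bytes exist and are * blanked. If the buffer already contains sufficient data it * is untouched. Otherwise it is extended. Returns zero on - * success. The skb is freed on error. + * success. The skb is freed on error if @free_on_error is true.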
*/ -static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, + bool free_on_error) { unsigned int size = skb->len; if (unlikely(size < len)) { len -= size; - if (skb_pad(skb, len)) + if (__skb_pad(skb, len, free_on_error)) return -ENOMEM; __skb_put(skb, len); } return 0; } +/** + * skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error. + */ +static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) +{ + return __skb_put_padto(skb, len, true); +} + static inline int skb_add_data(struct sk_buff *skb, struct iov_iter *from, int copy) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f990eb8b30a9..e07556606284 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1363,18 +1363,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, EXPORT_SYMBOL(skb_copy_expand); /** - * skb_pad - zero pad the tail of an skb + * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad + * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return error in out of memory cases. The skb is freed on error. + * May return error in out of memory cases. The skb is freed on error + * if @free_on_error is true. */ -int skb_pad(struct sk_buff *skb, int pad) +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; @@ -1403,10 +1405,11 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; free_skb: - kfree_skb(skb); + if (free_on_error) + kfree_skb(skb); return err; } -EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer -- cgit v1.2.3 From f9cbe9a556afca9e82df9aebe4412d93769566b5 Mon Sep 17 00:00:00 2001 From: Antoine Ténart Date: Wed, 23 Aug 2017 09:46:54 +0200 Subject: net: define the TSO header size in net/tso.h The TSO header size was defined separately in many drivers. Factorize the code and define it once in net/tso.h.
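A short sketch of how a converted driver might use the now-shared define when carving out per-queue TSO header storage; the hypothetical_txq structure and allocation function are invented for this example:

#include <linux/dma-mapping.h>
#include <net/tso.h>

struct hypothetical_txq {	/* stand-in for a driver's queue state */
	char *tso_hdrs;
	dma_addr_t tso_hdrs_dma;
};

static int hypothetical_alloc_tso_hdrs(struct device *dev,
					struct hypothetical_txq *q,
					int num_desc)
{
	/* One TSO_HEADER_SIZE slot per descriptor, as the converted
	 * drivers keep for their software TSO paths. */
	q->tso_hdrs = dma_alloc_coherent(dev, num_desc * TSO_HEADER_SIZE,
					 &q->tso_hdrs_dma, GFP_KERNEL);
	return q->tso_hdrs ? 0 : -ENOMEM;
}

Signed-off-by: Antoine Tenart Signed-off-by: David S.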
Miller --- drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 1 - drivers/net/ethernet/freescale/fec_main.c | 1 - drivers/net/ethernet/marvell/mv643xx_eth.c | 2 -- drivers/net/ethernet/marvell/mvneta.c | 3 --- include/net/tso.h | 2 ++ 5 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h index 57858522c33c..67d1a3230773 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h @@ -277,7 +277,6 @@ struct snd_queue { u16 xdp_free_cnt; bool is_xdp; -#define TSO_HEADER_SIZE 128 /* For TSO segment's header */ char *tso_hdrs; dma_addr_t tso_hdrs_phys; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index df09b254553d..56f56d6ada9c 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -226,7 +226,6 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address"); #define COPYBREAK_DEFAULT 256 -#define TSO_HEADER_SIZE 128 /* Max number of allowed TCP segments for software TSO */ #define FEC_MAX_TSO_SEGS 100 #define FEC_MAX_SKB_DESCS (FEC_MAX_TSO_SEGS * 2 + MAX_SKB_FRAGS) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 9c94ea9b2b80..fb2d533ae4ef 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -183,8 +183,6 @@ static char mv643xx_eth_driver_version[] = "1.4"; #define DEFAULT_TX_QUEUE_SIZE 512 #define SKB_DMA_REALIGN ((PAGE_SIZE - NET_SKB_PAD) % SMP_CACHE_BYTES) -#define TSO_HEADER_SIZE 128 - /* Max number of allowed TCP segments for software TSO */ #define MV643XX_MAX_TSO_SEGS 100 #define MV643XX_MAX_SKB_DESCS (MV643XX_MAX_TSO_SEGS * 2 + MAX_SKB_FRAGS) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 0aab74c2a209..35ff1ecfcff0 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -281,9 +281,6 @@ */ #define MVNETA_RSS_LU_TABLE_SIZE 1 -/* TSO header size */ -#define TSO_HEADER_SIZE 128 - /* Max number of Rx descriptors */ #define MVNETA_MAX_RXD 128 diff --git a/include/net/tso.h b/include/net/tso.h index b7be852bfe9d..9a56c39e6d0a 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -3,6 +3,8 @@ #include +#define TSO_HEADER_SIZE 128 + struct tso_t { int next_frag_idx; void *data; -- cgit v1.2.3 From e457d86ada27cbd2f46ded75d4b4bc06e26d0e2e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 23 Aug 2017 10:08:19 +0200 Subject: net: sched: add couple of goto_chain helpers Add helpers to find out if a gact instance is a goto_chain termination action and to get the chain index.
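For context, a classifier offload path could use the new helpers roughly as follows; hypothetical_parse_action() and the two hardware hooks are invented for this sketch:

#include <linux/errno.h>
#include <net/tc_act/tc_gact.h>

static int hypothetical_hw_drop(void);		/* invented hook */
static int hypothetical_hw_goto_chain(u32 chain_index); /* invented hook */

static int hypothetical_parse_action(const struct tc_action *a)
{
	if (is_tcf_gact_shot(a))
		return hypothetical_hw_drop();
	if (is_tcf_gact_goto_chain(a))		/* new helper */
		return hypothetical_hw_goto_chain(tcf_gact_goto_chain_index(a));
	return -EOPNOTSUPP;
}

Signed-off-by: Jiri Pirko Signed-off-by: David S.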
Miller --- include/net/tc_act/tc_gact.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index d576374c4d6f..41afe1ce7b16 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -15,7 +15,8 @@ struct tcf_gact { }; #define to_gact(a) ((struct tcf_gact *)a) -static inline bool __is_tcf_gact_act(const struct tc_action *a, int act) +static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, + bool is_ext) { #ifdef CONFIG_NET_CLS_ACT struct tcf_gact *gact; @@ -24,7 +25,8 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act) return false; gact = to_gact(a); - if (gact->tcf_action == act) + if ((!is_ext && gact->tcf_action == act) || + (is_ext && TC_ACT_EXT_CMP(gact->tcf_action, act))) return true; #endif @@ -33,12 +35,22 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act) static inline bool is_tcf_gact_shot(const struct tc_action *a) { - return __is_tcf_gact_act(a, TC_ACT_SHOT); + return __is_tcf_gact_act(a, TC_ACT_SHOT, false); } static inline bool is_tcf_gact_trap(const struct tc_action *a) { - return __is_tcf_gact_act(a, TC_ACT_TRAP); + return __is_tcf_gact_act(a, TC_ACT_TRAP, false); +} + +static inline bool is_tcf_gact_goto_chain(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_GOTO_CHAIN, true); +} + +static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a) +{ + return a->goto_chain->index; } #endif /* __NET_TC_GACT_H */ -- cgit v1.2.3 From 1b688a19a92223cf2d1892c9d05d64dc397b33e3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 23 Aug 2017 15:10:50 +0100 Subject: bpf/verifier: remove varlen_map_value_access flag The optimisation it does is broken when the 'new' register value has a variable offset and the 'old' was constant. I broke it with my pointer types unification (see Fixes tag below), before which the 'new' value would have type PTR_TO_MAP_VALUE_ADJ and would thus not compare equal; other changes in that patch mean that its original behaviour (ignore min/max values) cannot be restored. Tests on a sample set of cilium programs show no change in count of processed instructions. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 1 - kernel/bpf/verifier.c | 41 ++++++++++++----------------------------- 2 files changed, 12 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 91d07efed2ba..d8f131a36fd0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -125,7 +125,6 @@ struct bpf_verifier_env { u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool seen_direct_write; - bool varlen_map_value_access; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fdbaa6086559..711bdbd22cea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -832,11 +832,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (log_level) print_verifier_state(state); - /* If the offset is variable, we will need to be stricter in state - * pruning from now on. 
- */ - if (!tnum_is_const(reg->var_off)) - env->varlen_map_value_access = true; /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -3247,9 +3242,8 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, - bool varlen_map_access, struct idpair *idmap) +static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, + struct idpair *idmap) { if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ @@ -3281,22 +3275,14 @@ static bool regsafe(struct bpf_reg_state *rold, tnum_is_unknown(rold->var_off); } case PTR_TO_MAP_VALUE: - if (varlen_map_access) { - /* If the new min/max/var_off satisfy the old ones and - * everything else matches, we are OK. - * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && - range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); - } else { - /* If the ranges/var_off were not the same, but - * everything else was and we didn't do a variable - * access into a map then we are a-ok. - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0; - } + /* If the new min/max/var_off satisfy the old ones and + * everything else matches, we are OK. + * We don't care about the 'id' value, because nothing + * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_VALUE_OR_NULL: /* a PTR_TO_MAP_VALUE could be safe to use as a * PTR_TO_MAP_VALUE_OR_NULL into the same map. @@ -3380,7 +3366,6 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { - bool varlen_map_access = env->varlen_map_value_access; struct idpair *idmap; bool ret = false; int i; @@ -3391,8 +3376,7 @@ static bool states_equal(struct bpf_verifier_env *env, return false; for (i = 0; i < MAX_BPF_REG; i++) { - if (!regsafe(&old->regs[i], &cur->regs[i], varlen_map_access, - idmap)) + if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) goto out_free; } @@ -3412,7 +3396,7 @@ static bool states_equal(struct bpf_verifier_env *env, continue; if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], &cur->spilled_regs[i / BPF_REG_SIZE], - varlen_map_access, idmap)) + idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -3555,7 +3539,6 @@ static int do_check(struct bpf_verifier_env *env) init_reg_state(regs); state->parent = NULL; insn_idx = 0; - env->varlen_map_value_access = false; for (;;) { struct bpf_insn *insn; u8 class; -- cgit v1.2.3 From 8e9cd9ce90d48369b2c5ddd79fe3d4a4cb1ccb56 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 23 Aug 2017 15:11:21 +0100 Subject: bpf/verifier: document liveness analysis The liveness tracking algorithm is quite subtle; add comments to explain it. Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 13 +++++++++++++ kernel/bpf/verifier.c | 28 +++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d8f131a36fd0..b8d200f60a40 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -21,6 +21,19 @@ */ #define BPF_MAX_VAR_SIZ INT_MAX +/* Liveness marks, used for registers and spilled-regs (in stack slots). + * Read marks propagate upwards until they find a write mark; they record that + * "one of this state's descendants read this reg" (and therefore the reg is + * relevant for states_equal() checks). + * Write marks collect downwards and do not propagate; they record that "the + * straight-line code that reached this state (from its parent) wrote this reg" + * (and therefore that reads propagated from this state or its descendants + * should not propagate to its parent). + * A state with a write mark can receive read marks; it just won't propagate + * them to its parent, since the write mark is a property, not of the state, + * but of the link between it and its parent. See mark_reg_read() and + * mark_stack_slot_read() in kernel/bpf/verifier.c. + */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 711bdbd22cea..d690c7dd1f1a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3417,6 +3417,12 @@ out_free: return ret; } +/* A write screens off any subsequent reads; but write marks come from the + * straight-line code between a state and its parent. When we arrive at a + * jump target (in the first iteration of the propagate_liveness() loop), + * we didn't arrive by the straight-line code, so read marks in state must + * propagate to parent regardless of state's write marks. + */ static bool do_propagate_liveness(const struct bpf_verifier_state *state, struct bpf_verifier_state *parent) { @@ -3457,6 +3463,15 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, return touched; } +/* "parent" is "a state from which we reach the current state", but initially + * it is not the state->parent (i.e. "the state whose straight-line code leads + * to the current state"), instead it is the state that happened to arrive at + * a (prunable) equivalent of the current state. See comment above + * do_propagate_liveness() for consequences of this. + * This function is just a more efficient way of calling mark_reg_read() or + * mark_stack_slot_read() on each reg in "parent" that is read in "state", + * though it requires that parent != state->parent in the call arguments. + */ static void propagate_liveness(const struct bpf_verifier_state *state, struct bpf_verifier_state *parent) { @@ -3485,6 +3500,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. + * If we have any write marks in env->cur_state, they + * will prevent corresponding reads in the continuation + * from reaching our parent (an explored_state). Our + * own state will get the read marks recorded, but + * they'll be immediately forgotten as we're pruning + * this state and will pop a new one. 
*/ propagate_liveness(&sl->state, &env->cur_state); return 1; @@ -3508,7 +3529,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ env->cur_state.parent = &new_sl->state; - /* clear liveness marks in current state */ + /* clear write marks in current state: the writes we did are not writes + * our child did, so they don't screen off its reads from us. + * (There are no read marks in current state, because reads always mark + * their parent and current state never has children yet. Only + * explored_states can get read marks.) + */ for (i = 0; i < BPF_REG_FP; i++) env->cur_state.regs[i].live = REG_LIVE_NONE; for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) -- cgit v1.2.3 From ecda85e70277ef24e44a1f6bc00243cebd19f985 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 16 Aug 2017 19:31:57 +0200 Subject: x86/lguest: Remove lguest support Lguest seems to be rather unused these days. It has seen only patches ensuring it still builds the last two years and its official state is "Odd Fixes". Remove it in order to be able to clean up the paravirt code. Signed-off-by: Juergen Gross Acked-by: Rusty Russell Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: lguest@lists.ozlabs.org Cc: rusty@rustcorp.com.au Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/20170816173157.8633-3-jgross@suse.com Signed-off-by: Ingo Molnar --- MAINTAINERS | 11 - arch/x86/Kbuild | 3 - arch/x86/Kconfig | 2 - arch/x86/include/asm/lguest.h | 91 - arch/x86/include/asm/lguest_hcall.h | 74 - arch/x86/include/asm/processor.h | 2 +- arch/x86/include/uapi/asm/bootparam.h | 2 +- arch/x86/kernel/asm-offsets_32.c | 20 - arch/x86/kernel/head_32.S | 2 - arch/x86/kernel/platform-quirks.c | 1 - arch/x86/kvm/Kconfig | 1 - arch/x86/lguest/Kconfig | 14 - arch/x86/lguest/Makefile | 2 - arch/x86/lguest/boot.c | 1558 --------------- arch/x86/lguest/head_32.S | 192 -- drivers/Makefile | 1 - drivers/block/Kconfig | 2 +- drivers/char/Kconfig | 2 +- drivers/char/virtio_console.c | 2 +- drivers/lguest/Kconfig | 13 - drivers/lguest/Makefile | 26 - drivers/lguest/README | 47 - drivers/lguest/core.c | 398 ---- drivers/lguest/hypercalls.c | 304 --- drivers/lguest/interrupts_and_traps.c | 706 ------- drivers/lguest/lg.h | 258 --- drivers/lguest/lguest_user.c | 446 ----- drivers/lguest/page_tables.c | 1239 ------------ drivers/lguest/segments.c | 228 --- drivers/lguest/x86/core.c | 724 ------- drivers/lguest/x86/switcher_32.S | 388 ---- drivers/net/Kconfig | 2 +- drivers/tty/hvc/Kconfig | 2 +- drivers/virtio/Kconfig | 4 +- include/linux/lguest.h | 73 - include/linux/lguest_launcher.h | 44 - include/uapi/linux/virtio_ring.h | 4 +- tools/Makefile | 11 +- tools/lguest/.gitignore | 2 - tools/lguest/Makefile | 14 - tools/lguest/extract | 58 - tools/lguest/lguest.c | 3420 --------------------------------- tools/lguest/lguest.txt | 125 -- 43 files changed, 16 insertions(+), 10502 deletions(-) delete mode 100644 arch/x86/include/asm/lguest.h delete mode 100644 arch/x86/include/asm/lguest_hcall.h delete mode 100644 arch/x86/lguest/Kconfig delete mode 100644 arch/x86/lguest/Makefile delete mode 100644 arch/x86/lguest/boot.c delete mode 100644 arch/x86/lguest/head_32.S delete mode 100644 drivers/lguest/Kconfig delete mode 100644 drivers/lguest/Makefile delete mode 100644 drivers/lguest/README delete mode 100644 drivers/lguest/core.c delete mode 100644 drivers/lguest/hypercalls.c delete 
mode 100644 drivers/lguest/interrupts_and_traps.c delete mode 100644 drivers/lguest/lg.h delete mode 100644 drivers/lguest/lguest_user.c delete mode 100644 drivers/lguest/page_tables.c delete mode 100644 drivers/lguest/segments.c delete mode 100644 drivers/lguest/x86/core.c delete mode 100644 drivers/lguest/x86/switcher_32.S delete mode 100644 include/linux/lguest.h delete mode 100644 include/linux/lguest_launcher.h delete mode 100644 tools/lguest/.gitignore delete mode 100644 tools/lguest/Makefile delete mode 100644 tools/lguest/extract delete mode 100644 tools/lguest/lguest.c delete mode 100644 tools/lguest/lguest.txt (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 84d6a8277cbd..6c8b66d2adcb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7640,17 +7640,6 @@ T: git git://linuxtv.org/mkrufky/tuners.git S: Maintained F: drivers/media/dvb-frontends/lgdt3305.* -LGUEST -M: Rusty Russell -L: lguest@lists.ozlabs.org -W: http://lguest.ozlabs.org/ -S: Odd Fixes -F: arch/x86/include/asm/lguest*.h -F: arch/x86/lguest/ -F: drivers/lguest/ -F: include/linux/lguest*.h -F: tools/lguest/ - LIBATA PATA ARASAN COMPACT FLASH CONTROLLER M: Viresh Kumar L: linux-ide@vger.kernel.org diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..f65a804b86f0 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -10,9 +10,6 @@ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ - obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9b302121584d..651021713385 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -777,8 +777,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. 
*/ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. 
- * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0b03d655db7c..abc99b9c7ffd 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -662,7 +662,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..66b8f93333d1 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -201,7 +201,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. 
Both domU and dom0 type of guests are diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..710edab9e644 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -4,9 +4,6 @@ #include -#include -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include @@ -62,23 +59,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 0332664eb158..29da9599fec0 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -155,7 +155,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +164,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..502a77d0adb0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -16,7 +16,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2688c7dc5323..3ea624452f93 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -89,6 +89,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. 
- - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for struct machine_ops */ -#include -#include -#include - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... 
LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. 
:*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. 
One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the Host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries?
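The descriptor-splitting trick that lguest_write_idt_entry() uses above is easy to show on its own. A minimal stand-alone sketch; demo_hcall and the descriptor value are invented for the example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for a hypercall that takes 32-bit arguments. */
static void demo_hcall(unsigned long nr, uint32_t lo, uint32_t hi)
{
	printf("hcall %lu: lo=%#x hi=%#x\n", nr, lo, hi);
}

int main(void)
{
	uint64_t gate = 0x00008e0000100068ULL; /* made-up 8-byte descriptor */
	uint32_t desc[2];

	/* View the 8-byte descriptor as two 32-bit words; memcpy sidesteps
	 * the strict-aliasing trouble the kernel's cast shrugs off. */
	memcpy(desc, &gate, sizeof(desc));
	demo_hcall(32 /* entry number */, desc[0], desc[1]);
	return 0;
}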
- */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered through the - * kernel code indicate that this was used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by Intel, AMD and others. - * As you might imagine, after a decade and a half of this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. - */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow the kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed.
But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. */ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". 
We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. - * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! - * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. 
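The 10/10/12 split in the first diagram is easy to verify in isolation. A minimal sketch (the address is arbitrary; nothing here is kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t vaddr = 0xC0101234;              /* arbitrary example address */

	uint32_t pgd_idx = vaddr >> 22;            /* top 10 bits: directory index */
	uint32_t pte_idx = (vaddr >> 12) & 0x3FF;  /* next 10 bits: table index */
	uint32_t offset  = vaddr & 0xFFF;          /* low 12 bits: byte in page */

	printf("pgd %u, pte %u, offset %#x\n", pgd_idx, pte_idx, offset);
	return 0;
}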
The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand over a 64-bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32-byte-aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. - */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
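The "weird half-set entry" danger mentioned above is easiest to see with the two 32-bit halves written out explicitly. A sketch of the classic safe ordering (clear, then high, then low), with a hypothetical demo_pte type and a plain compiler barrier standing in for the kernel's smp_wmb():

#include <stdint.h>
#include <stdio.h>

struct demo_pte { volatile uint32_t low, high; };

#define wbarrier() __asm__ __volatile__("" ::: "memory")

/*
 * Update a 64-bit entry using 32-bit stores without ever exposing a
 * half-old/half-new entry: the present bit lives in 'low', so clearing
 * it first makes the whole entry invisible while 'high' is swapped.
 */
static void demo_set_pte(struct demo_pte *pte, uint64_t val)
{
	pte->low = 0;              /* not present: walkers ignore the entry */
	wbarrier();
	pte->high = (uint32_t)(val >> 32);
	wbarrier();
	pte->low = (uint32_t)val;  /* publish the whole new entry at once */
}

int main(void)
{
	struct demo_pte pte = { 0, 0 };
	demo_set_pte(&pte, 0x0000000300001025ULL); /* arbitrary bits */
	printf("high=%#x low=%#x\n", pte.high, pte.low);
	return 0;
}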
- * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking through routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host which interrupts we want blocked using the - * lguest_data.blocked_interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. */ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called.
*/ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. 
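The retry loop in lguest_clock_read() above is a general pattern for reading a multi-word value that someone else may update concurrently. A self-contained model, with a plain compiler barrier standing in for rmb() and demo_* names invented for the sketch:

#include <stdint.h>
#include <stdio.h>

struct demo_time { volatile long sec; volatile long nsec; };

#define rbarrier() __asm__ __volatile__("" ::: "memory")

/* Retry whenever the seconds word changed underneath us, so we never
 * pair an old 'sec' with a new 'nsec' (or vice versa). */
static uint64_t demo_read_clock(const struct demo_time *t)
{
	long sec, nsec;

	do {
		sec = t->sec;
		rbarrier();
		nsec = t->nsec;
		rbarrier();
	} while (t->sec != sec);

	return (uint64_t)sec * 1000000000ULL + (uint64_t)nsec;
}

int main(void)
{
	struct demo_time t = { 99, 999999999 };
	printf("%llu\n", (unsigned long long)demo_read_clock(&t));
	return 0;
}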
*/ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. - */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. */ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. 
- */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there are any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this is to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicking. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. */ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access offset 'off' in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find.
- */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. */ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. 
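Before the patching details below, the plain indirection being optimized can be modelled as an ordinary table of function pointers that gets retargeted at boot. A hypothetical miniature (demo_ops and both sets of implementations are invented for illustration):

#include <stdio.h>

/* A miniature "pv_ops": sensitive operations become indirect calls. */
struct demo_ops {
	unsigned long (*save_fl)(void);
	void (*irq_disable)(void);
};

static unsigned long native_save_fl_demo(void) { return 512; }
static void native_irq_disable_demo(void) { puts("cli"); }

static unsigned long guest_irq_enabled = 512; /* models a lguest_data field */
static unsigned long guest_save_fl_demo(void) { return guest_irq_enabled; }
static void guest_irq_disable_demo(void) { guest_irq_enabled = 0; }

static struct demo_ops ops = { native_save_fl_demo, native_irq_disable_demo };

int main(void)
{
	/* "Booting as a guest": repoint the table, as lguest_init() does. */
	ops.save_fl = guest_save_fl_demo;
	ops.irq_disable = guest_irq_disable_demo;

	ops.irq_disable();                   /* indirect call: the slow part */
	printf("flags: %lu\n", ops.save_fl());
	return 0;
}

The indirect calls through 'ops' are what the patching machinery below replaces with the template instructions, where they fit.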
- * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save flags. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves.
- */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well.
- */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made the LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem. - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do it, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include -#include -#include -#include -#include -#include - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! - * - * The .section line puts this code in .init.text so it will be discarded after - * boot.
- */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; set up by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe it in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. - */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts.
Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. 
- */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/drivers/Makefile b/drivers/Makefile index dfdcda00bfe3..d90fdc413648 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -125,7 +125,6 @@ obj-$(CONFIG_ACCESSIBILITY) += accessibility/ obj-$(CONFIG_ISDN) += isdn/ obj-$(CONFIG_EDAC) += edac/ obj-$(CONFIG_EISA) += eisa/ -obj-y += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ obj-y += mmc/ diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 8ddc98279c8f..80aaf3420e12 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -470,7 +470,7 @@ config VIRTIO_BLK depends on VIRTIO ---help--- This is the virtual block driver for virtio. It can be used with - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. + QEMU based VMMs (like KVM or Xen). Say Y or M. config VIRTIO_BLK_SCSI bool "SCSI passthrough request for the Virtio block driver" diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index ccd239ab879f..623714344600 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -161,7 +161,7 @@ config VIRTIO_CONSOLE depends on VIRTIO && TTY select HVC_DRIVER help - Virtio console for use with lguest and other hypervisors. + Virtio console for use with hypervisors. Also serves as a general-purpose serial device for data transfer between the guest and host. Character devices at diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ad843eb02ae7..4d229dde6522 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1130,7 +1130,7 @@ static const struct file_operations port_fops = { * We turn the characters into a scatter-gather list, add it to the * output queue and then kick the Host. Then we sit here waiting for * it to finish: inefficient in theory, but in practice - * implementations will do it immediately (lguest's Launcher does). + * implementations will do it immediately. */ static int put_chars(u32 vtermno, const char *buf, int count) { diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig deleted file mode 100644 index 169172d2ba05..000000000000 --- a/drivers/lguest/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config LGUEST - tristate "Linux hypervisor example code" - depends on X86_32 && EVENTFD && TTY && PCI_DIRECT - select HVC_DRIVER - ---help--- - This is a very simple module which allows you to run - multiple instances of the same Linux kernel, using the - "lguest" command found in the tools/lguest directory. - - Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See tools/lguest/lguest.txt. - - If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile deleted file mode 100644 index 16f52ee73994..000000000000 --- a/drivers/lguest/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -# Host requires the other files, which can be a module. -obj-$(CONFIG_LGUEST) += lg.o -lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ - segments.o lguest_user.o - -lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o - -Preparation Preparation!: PREFIX=P -Guest: PREFIX=G -Drivers: PREFIX=D -Launcher: PREFIX=L -Host: PREFIX=H -Switcher: PREFIX=S -Mastery: PREFIX=M -Beer: - @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" -Preparation Preparation! 
Guest Drivers Launcher Host Switcher Mastery: - @sh ../../tools/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` -Puppy: - @clear - @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" - @sleep 2; clear; printf "\n\n Sit!\n\n"; sleep 1; clear - @printf " __ \n ()'\`; \n /\\|\` \n / | \n(/_)_|_ \n" - @sleep 2; clear; printf "\n\n Stand!\n\n"; sleep 1; clear - @printf " __ \n ()'\`; \n /\\|\` \n /._.= \n /| / \n(_\_)_ \n" - @sleep 2; clear; printf "\n\n Good puppy!\n\n"; sleep 1; clear diff --git a/drivers/lguest/README b/drivers/lguest/README deleted file mode 100644 index b7db39a64c66..000000000000 --- a/drivers/lguest/README +++ /dev/null @@ -1,47 +0,0 @@ -Welcome, friend reader, to lguest. - -Lguest is an adventure, with you, the reader, as Hero. I can't think of many -5000-line projects which offer both such capability and glimpses of future -potential; it is an exciting time to be delving into the source! - -But be warned; this is an arduous journey of several hours or more! And as we -know, all true Heroes are driven by a Noble Goal. Thus I offer a Beer (or -equivalent) to anyone I meet who has completed this documentation. - -So get comfortable and keep your wits about you (both quick and humorous). -Along your way to the Noble Goal, you will also gain masterly insight into -lguest, and hypervisors and x86 virtualization in general. - -Our Quest is in seven parts: (best read with C highlighting turned on) - -I) Preparation - - In which our potential hero is flown quickly over the landscape for a - taste of its scope. Suitable for the armchair coders and other such - persons of faint constitution. - -II) Guest - - Where we encounter the first tantalising wisps of code, and come to - understand the details of the life of a Guest kernel. - -III) Drivers - - Whereby the Guest finds its voice and becomes useful, and our - understanding of the Guest is completed. - -IV) Launcher - - Where we trace back to the creation of the Guest, and thus begin our - understanding of the Host. - -V) Host - - Where we master the Host code, through a long and tortuous journey. - Indeed, it is here that our hero is tested in the Bit of Despair. - -VI) Switcher - - Where our understanding of the intertwined nature of Guests and Hosts - is completed. - -VII) Mastery - - Where our fully fledged hero grapples with the Great Question: - "What next?" - -make Preparation! -Rusty Russell. diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c deleted file mode 100644 index 395ed1961dbf..000000000000 --- a/drivers/lguest/core.c +++ /dev/null @@ -1,398 +0,0 @@ -/*P:400 - * This contains run_guest() which actually calls into the Host<->Guest - * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "lg.h" - -unsigned long switcher_addr; -struct page **lg_switcher_pages; -static struct vm_struct *switcher_text_vma; -static struct vm_struct *switcher_stacks_vma; - -/* This One Big lock protects all inter-guest data structures. */ -DEFINE_MUTEX(lguest_lock); - -/*H:010 - * We need to set up the Switcher at a high virtual address.
Remember the - * Switcher is a few hundred bytes of assembler code which actually changes the - * CPU to run the Guest, and then changes back to the Host when a trap or - * interrupt happens. - * - * The Switcher code must be at the same virtual address in the Guest as the - * Host since it will be running as the switchover occurs. - * - * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. - */ -static __init int map_switcher(void) -{ - int i, err; - - /* - * Map the Switcher in to high memory. - * - * It turns out that if we choose the address 0xFFC00000 (4MB under the - * top virtual address), it makes setting up the page tables really - * easy. - */ - - /* We assume Switcher text fits into a single page. */ - if (end_switcher_text - start_switcher_text > PAGE_SIZE) { - printk(KERN_ERR "lguest: switcher text too large (%zu)\n", - end_switcher_text - start_switcher_text); - return -EINVAL; - } - - /* - * We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. - */ - lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0]) - * TOTAL_SWITCHER_PAGES, - GFP_KERNEL); - if (!lg_switcher_pages) { - err = -ENOMEM; - goto out; - } - - /* - * Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. - */ - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!lg_switcher_pages[i]) { - err = -ENOMEM; - goto free_some_pages; - } - } - - /* - * Copy in the compiled-in Switcher code (from x86/switcher_32.S). - * It goes in the first page, which we map in momentarily. - */ - memcpy(kmap(lg_switcher_pages[0]), start_switcher_text, - end_switcher_text - start_switcher_text); - kunmap(lg_switcher_pages[0]); - - /* - * We place the Switcher underneath the fixmap area, which is the - * highest virtual address we can get. This is important, since we - * tell the Guest it can't access this memory, so we want its ceiling - * as high as possible. - */ - switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE; - - /* - * Now we reserve the "virtual memory area"s we want. We might - * not get them in theory, but in practice it's worked so far. - * - * We want the switcher text to be read-only and executable, and - * the stacks to be read-write and non-executable. - */ - switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD, - switcher_addr, - switcher_addr + PAGE_SIZE); - - if (!switcher_text_vma) { - err = -ENOMEM; - printk("lguest: could not map switcher pages high\n"); - goto free_pages; - } - - switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE, - VM_ALLOC|VM_NO_GUARD, - switcher_addr + PAGE_SIZE, - switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE); - if (!switcher_stacks_vma) { - err = -ENOMEM; - printk("lguest: could not map switcher pages high\n"); - goto free_text_vma; - } - - /* - * This code actually sets up the pages we've allocated to appear at - * switcher_addr. map_vm_area() takes the vma we allocated above, the - * kind of pages we're mapping (kernel text pages and kernel writable - * pages respectively), and a pointer to our array of struct pages. 
-	 */
-	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
-	if (err) {
-		printk("lguest: text map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
-			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
-	if (err) {
-		printk("lguest: stacks map_vm_area failed: %i\n", err);
-		goto free_vmas;
-	}
-
-	/*
-	 * Now the Switcher is mapped at the right address, we can't fail!
-	 */
-	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_text_vma->addr);
-	/* And we succeeded... */
-	return 0;
-
-free_vmas:
-	/* Undoes map_vm_area and __get_vm_area */
-	vunmap(switcher_stacks_vma->addr);
-free_text_vma:
-	vunmap(switcher_text_vma->addr);
-free_pages:
-	i = TOTAL_SWITCHER_PAGES;
-free_some_pages:
-	for (--i; i >= 0; i--)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-out:
-	return err;
-}
-/*:*/
-
-/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
-static void unmap_switcher(void)
-{
-	unsigned int i;
-
-	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_text_vma->addr);
-	vunmap(switcher_stacks_vma->addr);
-	/* Now we just need to free the pages we copied the switcher into */
-	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
-		__free_pages(lg_switcher_pages[i], 0);
-	kfree(lg_switcher_pages);
-}
-
-/*H:032
- * Dealing With Guest Memory.
- *
- * Before we go too much further into the Host, we need to grok the routines
- * we use to deal with Guest memory.
- *
- * When the Guest gives us (what it thinks is) a physical address, we can use
- * the normal copy_from_user() & copy_to_user() on the corresponding place in
- * the memory region allocated by the Launcher.
- *
- * But we can't trust the Guest: it might be trying to access the Launcher
- * code.  We have to check that the range is below the pfn_limit the Launcher
- * gave us.  We have to make sure that addr + len doesn't give us a false
- * positive by overflowing, too.
- */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len)
-{
-	return addr+len <= lg->pfn_limit * PAGE_SIZE && (addr+len >= addr);
-}
-
-/*
- * This routine copies memory from the Guest.  Here we can see how useful the
- * kill_lguest() routine we met in the Launcher can be: we return a random
- * value (all zeroes) instead of needing to return an error.
- */
-void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_from_user(b, cpu->lg->mem_base + addr, bytes) != 0) {
-		/* copy_from_user should do this, but as we rely on it... */
-		memset(b, 0, bytes);
-		kill_guest(cpu, "bad read address %#lx len %u", addr, bytes);
-	}
-}
-
-/* This is the write (copy into Guest) version. */
-void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
-	       unsigned bytes)
-{
-	if (!lguest_address_ok(cpu->lg, addr, bytes)
-	    || copy_to_user(cpu->lg->mem_base + addr, b, bytes) != 0)
-		kill_guest(cpu, "bad write address %#lx len %u", addr, bytes);
-}
-/*:*/
-
-/*H:030
- * Let's jump straight to the main loop which runs the Guest.
- * Remember, this is called by the Launcher reading /dev/lguest, and we keep
- * going around and around until something interesting happens.
- */
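The overflow check in lguest_address_ok() is subtle enough to deserve a standalone
illustration. Here is a rough userspace sketch of the same test (the names and the
page-size constant are illustrative, not the driver's): a naive "addr + len <= limit"
comparison can be defeated by wraparound, so the sum has to be compared against
addr as well.

#include <stdbool.h>
#include <stdint.h>

#define GUEST_PAGE_SIZE 4096UL

static bool range_ok(uint64_t pfn_limit, uint64_t addr, uint64_t len)
{
        /* Reject wraparound first: if addr + len overflows, the
         * "<= limit" test below could pass by accident. */
        if (addr + len < addr)
                return false;
        return addr + len <= pfn_limit * GUEST_PAGE_SIZE;
}

With a 32-bit addr and a hostile len of ~0UL, the sum wraps to a small number; the
first test is what catches it.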
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
-{
-	/* If the launcher asked for a register with LHREQ_GETREG */
-	if (cpu->reg_read) {
-		if (put_user(*cpu->reg_read, user))
-			return -EFAULT;
-		cpu->reg_read = NULL;
-		return sizeof(*cpu->reg_read);
-	}
-
-	/* We stop running once the Guest is dead. */
-	while (!cpu->lg->dead) {
-		unsigned int irq;
-		bool more;
-
-		/* First we run any hypercalls the Guest wants done. */
-		if (cpu->hcall)
-			do_hypercalls(cpu);
-
-		/* Do we have to tell the Launcher about a trap? */
-		if (cpu->pending.trap) {
-			if (copy_to_user(user, &cpu->pending,
-					 sizeof(cpu->pending)))
-				return -EFAULT;
-			return sizeof(cpu->pending);
-		}
-
-		/*
-		 * All long-lived kernel loops need to check with this horrible
-		 * thing called the freezer.  If the Host is trying to suspend,
-		 * it stops us.
-		 */
-		try_to_freeze();
-
-		/* Check for signals */
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-
-		/*
-		 * Check if there are any interrupts which can be delivered now:
-		 * if so, this sets up the handler to be executed when we next
-		 * run the Guest.
-		 */
-		irq = interrupt_pending(cpu, &more);
-		if (irq < LGUEST_IRQS)
-			try_deliver_interrupt(cpu, irq, more);
-
-		/*
-		 * Just make absolutely sure the Guest is still alive.  One of
-		 * those hypercalls could have been fatal, for example.
-		 */
-		if (cpu->lg->dead)
-			break;
-
-		/*
-		 * If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer will wake us.
-		 */
-		if (cpu->halted) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			/*
-			 * Just before we sleep, make sure no interrupt snuck in
-			 * which we should be delivering.
-			 */
-			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
-				set_current_state(TASK_RUNNING);
-			else
-				schedule();
-			continue;
-		}
-
-		/*
-		 * OK, now we're ready to jump into the Guest.  First we put up
-		 * the "Do Not Disturb" sign:
-		 */
-		local_irq_disable();
-
-		/* Actually run the Guest until something happens. */
-		lguest_arch_run_guest(cpu);
-
-		/* Now we're ready to be interrupted or moved to other CPUs */
-		local_irq_enable();
-
-		/* Now we deal with whatever happened to the Guest. */
-		lguest_arch_handle_trap(cpu);
-	}
-
-	/* Special case: Guest is 'dead' but wants a reboot. */
-	if (cpu->lg->dead == ERR_PTR(-ERESTART))
-		return -ERESTART;
-
-	/* The Guest is dead => "No such file or directory" */
-	return -ENOENT;
-}
-
-/*H:000
- * Welcome to the Host!
- *
- * By this point your brain has been tickled by the Guest code and numbed by
- * the Launcher code; prepare for it to be stretched by the Host code.  This is
- * the heart.  Let's begin at the initialization routine for the Host's lg
- * module.
- */
-static int __init init(void)
-{
-	int err;
-
-	/* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
-	if (get_kernel_rpl() != 0) {
-		printk("lguest is afraid of being a guest\n");
-		return -EPERM;
-	}
-
-	/* First we put the Switcher up in very high virtual memory. */
-	err = map_switcher();
-	if (err)
-		goto out;
-
-	/* We might need to reserve an interrupt vector. */
-	err = init_interrupts();
-	if (err)
-		goto unmap;
-
-	/* /dev/lguest needs to be registered. */
-	err = lguest_device_init();
-	if (err)
-		goto free_interrupts;
-
-	/* Finally we do some architecture-specific setup. */
-	lguest_arch_host_init();
-
-	/* All good! */
-	return 0;
-
-free_interrupts:
-	free_interrupts();
-unmap:
-	unmap_switcher();
-out:
-	return err;
-}
-
-/* Cleaning up is just the same code, backwards.  With a little French.
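init() above is a textbook instance of the kernel's goto-based unwind idiom: each
failure label undoes exactly the steps that already succeeded, in reverse order,
so there is one success path and one exit path. A minimal sketch of the pattern,
with placeholder setup/teardown functions that stand in for map_switcher() and
friends:

/* setup_a/setup_b/teardown_a are placeholders, not driver functions. */
static int setup_a(void) { return 0; }
static int setup_b(void) { return 0; }
static void teardown_a(void) { }

static int example_init(void)
{
        int err;

        err = setup_a();
        if (err)
                goto out;

        err = setup_b();
        if (err)
                goto undo_a;            /* unwind in reverse order */

        return 0;                       /* every step succeeded */

undo_a:
        teardown_a();
out:
        return err;
}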
*/ -static void __exit fini(void) -{ - lguest_device_remove(); - free_interrupts(); - unmap_switcher(); - - lguest_arch_host_fini(); -} -/*:*/ - -/* - * The Host side of lguest can be a module. This is a nice way for people to - * play with it. - */ -module_init(init); -module_exit(fini); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell "); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c deleted file mode 100644 index 601f81c04873..000000000000 --- a/drivers/lguest/hypercalls.c +++ /dev/null @@ -1,304 +0,0 @@ -/*P:500 - * Just as userspace programs request kernel operations through a system - * call, the Guest requests Host operations through a "hypercall". You might - * notice this nomenclature doesn't really follow any logic, but the name has - * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. -:*/ - -/* Copyright (C) 2006 Rusty Russell IBM Corporation - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -*/ -#include -#include -#include -#include -#include -#include -#include "lg.h" - -/*H:120 - * This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. - */ -static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) -{ - switch (args->arg0) { - case LHCALL_FLUSH_ASYNC: - /* - * This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. - */ - break; - case LHCALL_SEND_INTERRUPTS: - /* - * This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. - */ - break; - case LHCALL_LGUEST_INIT: - /* - * You can't get here unless you're already initialized. Don't - * do that. - */ - kill_guest(cpu, "already have lguest_data"); - break; - case LHCALL_SHUTDOWN: { - char msg[128]; - /* - * Shutdown is such a trivial hypercall that we do it in five - * lines right here. - * - * If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. - */ - __lgread(cpu, msg, args->arg1, sizeof(msg)); - msg[sizeof(msg)-1] = '\0'; - kill_guest(cpu, "CRASH: %s", msg); - if (args->arg2 == LGUEST_SHUTDOWN_RESTART) - cpu->lg->dead = ERR_PTR(-ERESTART); - break; - } - case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the argument: */ - if (args->arg1) - guest_pagetable_clear_all(cpu); - else - guest_pagetable_flush_user(cpu); - break; - - /* - * All these calls simply pass the arguments through to the right - * routines. 
- */ - case LHCALL_NEW_PGTABLE: - guest_new_pagetable(cpu, args->arg1); - break; - case LHCALL_SET_STACK: - guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); - break; - case LHCALL_SET_PTE: -#ifdef CONFIG_X86_PAE - guest_set_pte(cpu, args->arg1, args->arg2, - __pte(args->arg3 | (u64)args->arg4 << 32)); -#else - guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); -#endif - break; - case LHCALL_SET_PGD: - guest_set_pgd(cpu->lg, args->arg1, args->arg2); - break; -#ifdef CONFIG_X86_PAE - case LHCALL_SET_PMD: - guest_set_pmd(cpu->lg, args->arg1, args->arg2); - break; -#endif - case LHCALL_SET_CLOCKEVENT: - guest_set_clockevent(cpu, args->arg1); - break; - case LHCALL_HALT: - /* Similarly, this sets the halted flag for run_guest(). */ - cpu->halted = 1; - break; - default: - /* It should be an architecture-specific hypercall. */ - if (lguest_arch_do_hcall(cpu, args)) - kill_guest(cpu, "Bad hypercall %li\n", args->arg0); - } -} - -/*H:124 - * Asynchronous hypercalls are easy: we just look in the array in the - * Guest's "struct lguest_data" to see if any new ones are marked "ready". - * - * We are careful to do these in order: obviously we respect the order the - * Guest put them in the ring, but we also promise the Guest that they will - * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). - */ -static void do_async_hcalls(struct lg_cpu *cpu) -{ - unsigned int i; - u8 st[LHCALL_RING_SIZE]; - - /* For simplicity, we copy the entire call status array in at once. */ - if (copy_from_user(&st, &cpu->lg->lguest_data->hcall_status, sizeof(st))) - return; - - /* We process "struct lguest_data"s hcalls[] ring once. */ - for (i = 0; i < ARRAY_SIZE(st); i++) { - struct hcall_args args; - /* - * We remember where we were up to from last time. This makes - * sure that the hypercalls are done in the order the Guest - * places them in the ring. - */ - unsigned int n = cpu->next_hcall; - - /* 0xFF means there's no call here (yet). */ - if (st[n] == 0xFF) - break; - - /* - * OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. - */ - if (++cpu->next_hcall == LHCALL_RING_SIZE) - cpu->next_hcall = 0; - - /* - * Copy the hypercall arguments into a local copy of the - * hcall_args struct. - */ - if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], - sizeof(struct hcall_args))) { - kill_guest(cpu, "Fetching async hypercalls"); - break; - } - - /* Do the hypercall, same as a normal one. */ - do_hcall(cpu, &args); - - /* Mark the hypercall done. */ - if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n])) { - kill_guest(cpu, "Writing result for async hypercall"); - break; - } - - /* - * Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. - */ - if (cpu->pending.trap) - break; - } -} - -/* - * Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: - */ -static void initialize(struct lg_cpu *cpu) -{ - /* - * You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. 
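The 0xFF-means-empty convention and the persistent cursor are the whole of the
async ring protocol, so a compact sketch may help. This is an illustrative
reconstruction of the scan in do_async_hcalls(), not driver code; RING_SIZE and
the types are stand-ins:

#include <stdint.h>

#define RING_SIZE 64

struct ring {
        uint8_t status[RING_SIZE];
        unsigned int next;      /* where we left off last time */
};

static void process_ring(struct ring *r, void (*handle)(unsigned int slot))
{
        unsigned int i;

        /* At most one full pass per call, like do_async_hcalls(). */
        for (i = 0; i < RING_SIZE; i++) {
                unsigned int n = r->next;

                if (r->status[n] == 0xFF)       /* free slot: no more work */
                        break;

                if (++r->next == RING_SIZE)     /* advance and wrap cursor */
                        r->next = 0;

                handle(n);
                r->status[n] = 0xFF;            /* mark the slot done */
        }
}

Because the cursor survives between calls, entries are always consumed in the
order the Guest queued them, which is exactly the ordering promise made above.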
-	 */
-	if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
-		kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
-		return;
-	}
-
-	if (lguest_arch_init_hypercalls(cpu))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * The Guest tells us where we're not to deliver interrupts by putting
-	 * the instruction address into "struct lguest_data".
-	 */
-	if (get_user(cpu->lg->noirq_iret, &cpu->lg->lguest_data->noirq_iret))
-		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
-
-	/*
-	 * We write the current time into the Guest's data page once so it can
-	 * set its clock.
-	 */
-	write_timestamp(cpu);
-
-	/* page_tables.c will also do some setup. */
-	page_table_guest_data_init(cpu);
-
-	/*
-	 * This is the one case where the above accesses might have been the
-	 * first write to a Guest page.  This may have caused a copy-on-write
-	 * fault, but the old page might be (read-only) in the Guest
-	 * pagetable.
-	 */
-	guest_pagetable_clear_all(cpu);
-}
-/*:*/
-
-/*M:013
- * If a Guest reads from a page (so creates a mapping) that it has never
- * written to, and then the Launcher writes to it (ie. the output of a virtual
- * device), the Guest will still see the old page.  In practice, this never
- * happens: why would the Guest read a page which it has never written to?  But
- * a similar scenario might one day bite us, so it's worth mentioning.
- *
- * Note that if we used a shared anonymous mapping in the Launcher instead of
- * mapping /dev/zero private, we wouldn't worry about copy-on-write.  And we
- * need that to switch the Launcher to processes (away from threads) anyway.
-:*/
-
-/*H:100
- * Hypercalls
- *
- * Remember from the Guest, hypercalls come in two flavors: normal and
- * asynchronous.  This file handles both types.
- */
-void do_hypercalls(struct lg_cpu *cpu)
-{
-	/* Not initialized yet?  This hypercall must do it. */
-	if (unlikely(!cpu->lg->lguest_data)) {
-		/* Set up the "struct lguest_data" */
-		initialize(cpu);
-		/* Hcall is done. */
-		cpu->hcall = NULL;
-		return;
-	}
-
-	/*
-	 * The Guest has initialized.
-	 *
-	 * Look in the hypercall ring for the async hypercalls:
-	 */
-	do_async_hcalls(cpu);
-
-	/*
-	 * If we stopped reading the hypercall ring because the Guest did a
-	 * NOTIFY to the Launcher, we want to return now.  Otherwise we do
-	 * the hypercall.
-	 */
-	if (!cpu->pending.trap) {
-		do_hcall(cpu, cpu->hcall);
-		/*
-		 * Tricky point: we reset the hcall pointer to mark the
-		 * hypercall as "done".  We use the hcall pointer rather than
-		 * the trap number to indicate a hypercall is pending.
-		 * Normally it doesn't matter: the Guest will run again and
-		 * update the trap number before we come back here.
-		 *
-		 * However, if we are signalled or the Guest sends I/O to the
-		 * Launcher, the run_guest() loop will exit without running the
-		 * Guest.  When it comes back it would try to re-run the
-		 * hypercall.  Finding that bug sucked.
-		 */
-		cpu->hcall = NULL;
-	}
-}
-
-/*
- * This routine supplies the Guest with time: it's used for wallclock time at
- * initial boot and as a rough time source if the TSC isn't available.
- */ -void write_timestamp(struct lg_cpu *cpu) -{ - struct timespec now; - ktime_get_real_ts(&now); - if (copy_to_user(&cpu->lg->lguest_data->time, - &now, sizeof(struct timespec))) - kill_guest(cpu, "Writing timestamp"); -} diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c deleted file mode 100644 index 67392b6ab845..000000000000 --- a/drivers/lguest/interrupts_and_traps.c +++ /dev/null @@ -1,706 +0,0 @@ -/*P:800 - * Interrupts (traps) are complicated enough to earn their own file. - * There are three classes of interrupts: - * - * 1) Real hardware interrupts which occur while we're running the Guest, - * 2) Interrupts for virtual devices attached to the Guest, and - * 3) Traps and faults from the Guest. - * - * Real hardware interrupts must be delivered to the Host, not the Guest. - * Virtual interrupts must be delivered to the Guest, but we make them look - * just like real hardware would deliver them. Traps from the Guest can be set - * up to go directly back into the Guest, but sometimes the Host wants to see - * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. -:*/ -#include -#include -#include -#include -#include "lg.h" - -/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ -static unsigned int syscall_vector = IA32_SYSCALL_VECTOR; -module_param(syscall_vector, uint, 0444); - -/* The address of the interrupt handler is split into two bits: */ -static unsigned long idt_address(u32 lo, u32 hi) -{ - return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); -} - -/* - * The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. - */ -static int idt_type(u32 lo, u32 hi) -{ - return (hi >> 8) & 0xF; -} - -/* An IDT entry can't be used unless the "present" bit is set. */ -static bool idt_present(u32 lo, u32 hi) -{ - return (hi & 0x8000); -} - -/* - * We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. - */ -static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) -{ - /* Stack grows upwards: move stack then write value. */ - *gstack -= 4; - lgwrite(cpu, *gstack, u32, val); -} - -/*H:210 - * The push_guest_interrupt_stack() routine saves Guest state on the stack for - * an interrupt or trap. The mechanics of delivering traps and interrupts to - * the Guest are the same, except some traps have an "error code" which gets - * pushed onto the stack as well: the caller tells us if this is one. - * - * We set up the stack just like the CPU does for a real interrupt, so it's - * identical for the Guest (and the standard "iret" instruction will undo - * it). - */ -static void push_guest_interrupt_stack(struct lg_cpu *cpu, bool has_err) -{ - unsigned long gstack, origstack; - u32 eflags, ss, irq_enable; - unsigned long virtstack; - - /* - * There are two cases for interrupts: one where the Guest is already - * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. - */ - if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* - * The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment. 
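push_guest_stack() above is easier to see with concrete memory. Here is a
userspace sketch of building the same three-word interrupt frame into a plain
byte buffer that stands in for Guest physical memory (illustrative, not part
of the driver):

#include <stdint.h>
#include <string.h>

static void push32(uint8_t *mem, uint32_t *sp, uint32_t val)
{
        *sp -= 4;                       /* move the pointer first... */
        memcpy(mem + *sp, &val, 4);     /* ...then store the value */
}

static void push_frame(uint8_t *mem, uint32_t *sp,
                       uint32_t eflags, uint32_t cs, uint32_t eip)
{
        /* Same order the CPU uses: eflags, then cs, then eip. */
        push32(mem, sp, eflags);
        push32(mem, sp, cs);
        push32(mem, sp, eip);
}

After push_frame(), *sp has dropped by 12 and points at the saved eip, which is
what the Guest's "iret" will pop first.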
- */ - virtstack = cpu->esp1; - ss = cpu->ss1; - - origstack = gstack = guest_pa(cpu, virtstack); - /* - * We push the old stack segment and pointer onto the new - * stack: when the Guest does an "iret" back from the interrupt - * handler the CPU will notice they're dropping privilege - * levels and expect these here. - */ - push_guest_stack(cpu, &gstack, cpu->regs->ss); - push_guest_stack(cpu, &gstack, cpu->regs->esp); - } else { - /* We're staying on the same Guest (kernel) stack. */ - virtstack = cpu->regs->esp; - ss = cpu->regs->ss; - - origstack = gstack = guest_pa(cpu, virtstack); - } - - /* - * Remember that we never let the Guest actually disable interrupts, so - * the "Interrupt Flag" bit is always set. We copy that bit from the - * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". - */ - eflags = cpu->regs->eflags; - if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 - && !(irq_enable & X86_EFLAGS_IF)) - eflags &= ~X86_EFLAGS_IF; - - /* - * An interrupt is expected to push three things on the stack: the old - * "eflags" word, the old code segment, and the old instruction - * pointer. - */ - push_guest_stack(cpu, &gstack, eflags); - push_guest_stack(cpu, &gstack, cpu->regs->cs); - push_guest_stack(cpu, &gstack, cpu->regs->eip); - - /* For the six traps which supply an error code, we push that, too. */ - if (has_err) - push_guest_stack(cpu, &gstack, cpu->regs->errcode); - - /* Adjust the stack pointer and stack segment. */ - cpu->regs->ss = ss; - cpu->regs->esp = virtstack + (gstack - origstack); -} - -/* - * This actually makes the Guest start executing the given interrupt/trap - * handler. - * - * "lo" and "hi" are the two parts of the Interrupt Descriptor Table for this - * interrupt or trap. It's split into two parts for traditional reasons: gcc - * on i386 used to be frightened by 64 bit numbers. - */ -static void guest_run_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi) -{ - /* If we're already in the kernel, we don't change stacks. */ - if ((cpu->regs->ss&0x3) != GUEST_PL) - cpu->regs->ss = cpu->esp1; - - /* - * Set the code segment and the address to execute. - */ - cpu->regs->cs = (__KERNEL_CS|GUEST_PL); - cpu->regs->eip = idt_address(lo, hi); - - /* - * Trapping always clears these flags: - * TF: Trap flag - * VM: Virtual 8086 mode - * RF: Resume - * NT: Nested task. - */ - cpu->regs->eflags &= - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); - - /* - * There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. - */ - if (idt_type(lo, hi) == 0xE) - if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Disabling interrupts"); -} - -/* This restores the eflags word which was pushed on the stack by a trap */ -static void restore_eflags(struct lg_cpu *cpu) -{ - /* This is the physical address of the stack. */ - unsigned long stack_pa = guest_pa(cpu, cpu->regs->esp); - - /* - * Stack looks like this: - * Address Contents - * esp EIP - * esp + 4 CS - * esp + 8 EFLAGS - */ - cpu->regs->eflags = lgread(cpu, stack_pa + 8, u32); - cpu->regs->eflags &= - ~(X86_EFLAGS_TF|X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT); -} - -/*H:205 - * Virtual Interrupts. - * - * interrupt_pending() returns the first pending interrupt which isn't blocked - * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. 
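The pending-minus-blocked selection in interrupt_pending() reads naturally with
a plain 64-bit mask instead of the kernel's bitmap helpers. A sketch under that
simplifying assumption (at most 64 IRQs, unlike the driver's DECLARE_BITMAP):

#include <stdbool.h>
#include <stdint.h>

#define NO_IRQ 64

static unsigned int first_pending(uint64_t pending, uint64_t blocked,
                                  bool *more)
{
        uint64_t deliverable = pending & ~blocked;
        unsigned int irq;

        if (!deliverable) {
                *more = false;
                return NO_IRQ;          /* nothing we're allowed to deliver */
        }

        /* __builtin_ctzll() gives the index of the lowest set bit. */
        irq = __builtin_ctzll(deliverable);

        /* Clearing the lowest bit tells us if anything else is waiting. */
        *more = (deliverable & (deliverable - 1)) != 0;
        return irq;
}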
- */ -unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) -{ - unsigned int irq; - DECLARE_BITMAP(blk, LGUEST_IRQS); - - /* If the Guest hasn't even initialized yet, we can do nothing. */ - if (!cpu->lg->lguest_data) - return LGUEST_IRQS; - - /* - * Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". - */ - if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, - sizeof(blk))) - return LGUEST_IRQS; - bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); - - /* Find the first interrupt. */ - irq = find_first_bit(blk, LGUEST_IRQS); - *more = find_next_bit(blk, LGUEST_IRQS, irq+1); - - return irq; -} - -/* - * This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). - */ -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) -{ - struct desc_struct *idt; - - BUG_ON(irq >= LGUEST_IRQS); - - /* If they're halted, interrupts restart them. */ - if (cpu->halted) { - /* Re-enable interrupts. */ - if (put_user(X86_EFLAGS_IF, &cpu->lg->lguest_data->irq_enabled)) - kill_guest(cpu, "Re-enabling interrupts"); - cpu->halted = 0; - } else { - /* Otherwise we check if they have interrupts disabled. */ - u32 irq_enabled; - if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) - irq_enabled = 0; - if (!irq_enabled) { - /* Make sure they know an IRQ is pending. */ - put_user(X86_EFLAGS_IF, - &cpu->lg->lguest_data->irq_pending); - return; - } - } - - /* - * Look at the IDT entry the Guest gave us for this interrupt. The - * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. - */ - idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; - /* If they don't have a handler (yet?), we just ignore it */ - if (idt_present(idt->a, idt->b)) { - /* OK, mark it no longer pending and deliver it. */ - clear_bit(irq, cpu->irqs_pending); - - /* - * They may be about to iret, where they asked us never to - * deliver interrupts. In this case, we can emulate that iret - * then immediately deliver the interrupt. This is basically - * a noop: the iret would pop the interrupt frame and restore - * eflags, and then we'd set it up again. So just restore the - * eflags word and jump straight to the handler in this case. - * - * Denys Vlasenko points out that this isn't quite right: if - * the iret was returning to userspace, then that interrupt - * would reset the stack pointer (which the Guest told us - * about via LHCALL_SET_STACK). But unless the Guest is being - * *really* weird, that will be the same as the current stack - * anyway. - */ - if (cpu->regs->eip == cpu->lg->noirq_iret) { - restore_eflags(cpu); - } else { - /* - * set_guest_interrupt() takes a flag to say whether - * this interrupt pushes an error code onto the stack - * as well: virtual interrupts never do. - */ - push_guest_interrupt_stack(cpu, false); - } - /* Actually make Guest cpu jump to handler. */ - guest_run_interrupt(cpu, idt->a, idt->b); - } - - /* - * Every time we deliver an interrupt, we update the timestamp in the - * Guest's lguest_data struct. It would be better for the Guest if we - * did this more often, but it can actually be quite slow: doing it - * here is a compromise which means at least it gets updated every - * timer interrupt. - */ - write_timestamp(cpu); - - /* - * If there are no other interrupts we want to deliver, clear - * the pending flag. 
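Since the two 32-bit descriptor words (idt->a and idt->b) appear throughout this
file, here are the earlier decoding helpers gathered into one standalone sketch,
in plain C with illustrative types:

#include <stdbool.h>
#include <stdint.h>

/* An i386 gate splits the handler address across the low and high words. */
static uint32_t gate_address(uint32_t lo, uint32_t hi)
{
        return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
}

static unsigned int gate_type(uint32_t hi)
{
        return (hi >> 8) & 0xF;         /* 0xE = interrupt gate, 0xF = trap gate */
}

static bool gate_present(uint32_t hi)
{
        return hi & 0x8000;             /* "present" bit */
}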
- */ - if (!more) - put_user(0, &cpu->lg->lguest_data->irq_pending); -} - -/* And this is the routine when we want to set an interrupt for the Guest. */ -void set_interrupt(struct lg_cpu *cpu, unsigned int irq) -{ - /* - * Next time the Guest runs, the core code will see if it can deliver - * this interrupt. - */ - set_bit(irq, cpu->irqs_pending); - - /* - * Make sure it sees it; it might be asleep (eg. halted), or running - * the Guest right now, in which case kick_process() will knock it out. - */ - if (!wake_up_process(cpu->tsk)) - kick_process(cpu->tsk); -} -/*:*/ - -/* - * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent - * me a patch, so we support that too. It'd be a big step for lguest if half - * the Plan 9 user base were to start using it. - * - * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. - */ -bool could_be_syscall(unsigned int num) -{ - /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ - return num == IA32_SYSCALL_VECTOR || num == syscall_vector; -} - -/* The syscall vector it wants must be unused by Host. */ -bool check_syscall_vector(struct lguest *lg) -{ - u32 vector; - - if (get_user(vector, &lg->lguest_data->syscall_vec)) - return false; - - return could_be_syscall(vector); -} - -int init_interrupts(void) -{ - /* If they want some strange system call vector, reserve it now */ - if (syscall_vector != IA32_SYSCALL_VECTOR) { - if (test_bit(syscall_vector, used_vectors) || - vector_used_by_percpu_irq(syscall_vector)) { - printk(KERN_ERR "lg: couldn't reserve syscall %u\n", - syscall_vector); - return -EBUSY; - } - set_bit(syscall_vector, used_vectors); - } - - return 0; -} - -void free_interrupts(void) -{ - if (syscall_vector != IA32_SYSCALL_VECTOR) - clear_bit(syscall_vector, used_vectors); -} - -/*H:220 - * Now we've got the routines to deliver interrupts, delivering traps like - * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: - */ -static bool has_err(unsigned int trap) -{ - return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); -} - -/* deliver_trap() returns true if it could deliver the trap. */ -bool deliver_trap(struct lg_cpu *cpu, unsigned int num) -{ - /* - * Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. - */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) - return false; - - /* - * Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. - */ - if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) - return false; - push_guest_interrupt_stack(cpu, has_err(num)); - guest_run_interrupt(cpu, cpu->arch.idt[num].a, - cpu->arch.idt[num].b); - return true; -} - -/*H:250 - * Here's the hard part: returning to the Host every time a trap happens - * and then calling deliver_trap() and re-entering the Guest is slow. - * Particularly because Guest userspace system calls are traps (usually trap - * 128). - * - * So we'd like to set up the IDT to tell the CPU to deliver traps directly - * into the Guest. This is possible, but the complexities cause the size of - * this file to double! However, 150 lines of code is worth writing for taking - * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all - * the other hypervisors would beat it up at lunchtime. - * - * This routine indicates if a particular trap number could be delivered - * directly. 
- *
- * Unfortunately, Linux 4.6 started using an interrupt gate instead of a
- * trap gate for syscalls, so this trick is ineffective.  See Mastery for
- * how we could do this anyway...
- */
-static bool direct_trap(unsigned int num)
-{
-	/*
-	 * Hardware interrupts don't go to the Guest at all (except system
-	 * call).
-	 */
-	if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
-		return false;
-
-	/*
-	 * The Host needs to see page faults (for shadow paging and to save the
-	 * fault address), general protection faults (in/out emulation) and
-	 * device not available (TS handling) and of course, the hypercall trap.
-	 */
-	return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
-}
-/*:*/
-
-/*M:005
- * The Guest has the ability to turn its interrupt gates into trap gates,
- * if it is careful.  The Host will let trap gates go directly to the
- * Guest, but the Guest needs the interrupts atomically disabled for an
- * interrupt gate.  The Host could provide a mechanism to register more
- * "no-interrupt" regions, and the Guest could point the trap gate at
- * instructions within that region, where it can safely disable interrupts.
- */
-
-/*M:006
- * The Guests do not use the sysenter (fast system call) instruction,
- * because it's hardcoded to enter privilege level 0 and so can't go direct.
- * It's about twice as fast as the older "int 0x80" system call, so it might
- * still be worthwhile to handle it in the Switcher and lcall down to the
- * Guest.  The sysenter semantics are hairy though: search for that keyword in
- * entry.S
-:*/
-
-/*H:260
- * When we make traps go directly into the Guest, we need to make sure
- * the kernel stack is valid (ie. mapped in the page tables).  Otherwise, the
- * CPU trying to deliver the trap will fault while trying to push the interrupt
- * words on the stack: this is called a double fault, and it forces us to kill
- * the Guest.
- *
- * Which is deeply unfair, because (literally!) it wasn't the Guest's fault.
- */
-void pin_stack_pages(struct lg_cpu *cpu)
-{
-	unsigned int i;
-
-	/*
-	 * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
-	 * two pages of stack space.
-	 */
-	for (i = 0; i < cpu->lg->stack_pages; i++)
-		/*
-		 * The stack grows *upwards*, so the address we're given is the
-		 * start of the page after the kernel stack.  Subtract one to
-		 * get back onto the first stack page, and keep subtracting to
-		 * get to the rest of the stack pages.
-		 */
-		pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
-}
-
-/*
- * Direct traps also mean that we need to know whenever the Guest wants to use
- * a different kernel stack, so we can change the guest TSS to use that
- * stack.  The TSS entries expect a virtual address, so unlike most addresses
- * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
- * physical.
- *
- * In Linux each process has its own kernel stack, so this happens a lot: we
- * change stacks on each context switch.
- */
-void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
-{
-	/*
-	 * You're not allowed a stack segment with privilege level 0: bad Guest!
-	 */
-	if ((seg & 0x3) != GUEST_PL)
-		kill_guest(cpu, "bad stack segment %i", seg);
-	/* We only expect one or two stack pages.
*/ - if (pages > 2) - kill_guest(cpu, "bad stack pages %u", pages); - /* Save where the stack is, and how many pages */ - cpu->ss1 = seg; - cpu->esp1 = esp; - cpu->lg->stack_pages = pages; - /* Make sure the new stack pages are mapped */ - pin_stack_pages(cpu); -} - -/* - * All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. - */ - -/*H:235 - * This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": - */ -static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, - unsigned int num, u32 lo, u32 hi) -{ - u8 type = idt_type(lo, hi); - - /* We zero-out a not-present entry */ - if (!idt_present(lo, hi)) { - trap->a = trap->b = 0; - return; - } - - /* We only support interrupt and trap gates. */ - if (type != 0xE && type != 0xF) - kill_guest(cpu, "bad IDT type %i", type); - - /* - * We only copy the handler address, present bit, privilege level and - * type. The privilege level controls where the trap can be triggered - * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. - */ - trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); - trap->b = (hi&0xFFFFEF00); -} - -/*H:230 - * While we're here, dealing with delivering traps and interrupts to the - * Guest, we might as well complete the picture: how the Guest tells us where - * it wants them to go. This would be simple, except making traps fast - * requires some tricks. - * - * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. - */ -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) -{ - /* - * Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. - */ - if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) - return; - - /* - * Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. - */ - cpu->changed |= CHANGED_IDT; - - /* Check that the Guest doesn't try to step outside the bounds. */ - if (num >= ARRAY_SIZE(cpu->arch.idt)) - kill_guest(cpu, "Setting idt entry %u", num); - else - set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); -} - -/* - * The default entry for each interrupt points into the Switcher routines which - * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. - */ -static void default_idt_entry(struct desc_struct *idt, - int trap, - const unsigned long handler, - const struct desc_struct *base) -{ - /* A present interrupt gate. */ - u32 flags = 0x8e00; - - /* - * Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. - */ - if (trap == LGUEST_TRAP_ENTRY) - flags |= (GUEST_PL << 13); - else if (base) - /* - * Copy privilege level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. - */ - flags |= (base->b & 0x6000); - - /* Now pack it into the IDT entry in its weird format. */ - idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); - idt->b = (handler&0xFFFF0000) | flags; -} - -/* When the Guest first starts, we put default entries into the IDT. 
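default_idt_entry() above packs in the opposite direction from the decoding
helpers. A small sketch of the same packing with placeholder names (the kernel
has its own pack_gate(); this one is purely illustrative):

#include <stdint.h>

struct gate {
        uint32_t a, b;
};

static void pack_gate_words(struct gate *g, uint16_t selector,
                            uint32_t handler, uint32_t flags)
{
        /* Low word: selector in the top half, low 16 handler bits below. */
        g->a = ((uint32_t)selector << 16) | (handler & 0x0000FFFF);
        /* High word: high 16 handler bits plus type/present/privilege flags. */
        g->b = (handler & 0xFFFF0000) | flags;
}

For a present (0x8000) interrupt gate of type 0xE, flags would be 0x8E00, which
is exactly the value default_idt_entry() starts from.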
*/ -void setup_default_idt_entries(struct lguest_ro_state *state, - const unsigned long *def) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) - default_idt_entry(&state->guest_idt[i], i, def[i], NULL); -} - -/*H:240 - * We don't use the IDT entries in the "struct lguest" directly, instead - * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. - */ -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, - const unsigned long *def) -{ - unsigned int i; - - /* - * We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. - */ - for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { - const struct desc_struct *gidt = &cpu->arch.idt[i]; - - /* If no Guest can ever override this trap, leave it alone. */ - if (!direct_trap(i)) - continue; - - /* - * Only trap gates (type 15) can go direct to the Guest. - * Interrupt gates (type 14) disable interrupts as they are - * entered, which we never let the Guest do. Not present - * entries (type 0x0) also can't go direct, of course. - * - * If it can't go direct, we still need to copy the priv. level: - * they might want to give userspace access to a software - * interrupt. - */ - if (idt_type(gidt->a, gidt->b) == 0xF) - idt[i] = *gidt; - else - default_idt_entry(&idt[i], i, def[i], gidt); - } -} - -/*H:200 - * The Guest Clock. - * - * There are two sources of virtual interrupts. We saw one in lguest_user.c: - * the Launcher sending interrupts for virtual devices. The other is the Guest - * timer interrupt. - * - * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to - * the next timer interrupt (in nanoseconds). We use the high-resolution timer - * infrastructure to set a callback at that time. - * - * 0 means "turn off the clock". - */ -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) -{ - ktime_t expires; - - if (unlikely(delta == 0)) { - /* Clock event device is shutting down. */ - hrtimer_cancel(&cpu->hrt); - return; - } - - /* - * We use wallclock time here, so the Guest might not be running for - * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. - */ - expires = ktime_add_ns(ktime_get_real(), delta); - hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); -} - -/* This is the function called when the Guest's timer expires. */ -static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) -{ - struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); - - /* Remember the first interrupt is the timer interrupt. */ - set_interrupt(cpu, 0); - return HRTIMER_NORESTART; -} - -/* This sets up the timer for this Guest. */ -void init_clockdev(struct lg_cpu *cpu) -{ - hrtimer_init(&cpu->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); - cpu->hrt.function = clockdev_fn; -} diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h deleted file mode 100644 index 2356a2318034..000000000000 --- a/drivers/lguest/lg.h +++ /dev/null @@ -1,258 +0,0 @@ -#ifndef _LGUEST_H -#define _LGUEST_H - -#ifndef __ASSEMBLY__ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct pgdir { - unsigned long gpgdir; - bool switcher_mapped; - int last_host_cpu; - pgd_t *pgdir; -}; - -/* We have two pages shared with guests, per cpu. 
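The Guest clock pattern above, condensed: one hrtimer armed at an absolute time,
a one-shot callback, and 0 meaning "cancel". This sketch reuses only the hrtimer
calls already shown in the code above; "my_dev" is a stand-in for the per-cpu
structure:

#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/ktime.h>

struct my_dev {
        struct hrtimer hrt;
};

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
        struct my_dev *dev = container_of(timer, struct my_dev, hrt);

        /* ... raise the virtual interrupt for dev here ... */
        (void)dev;
        return HRTIMER_NORESTART;       /* one-shot: don't re-arm */
}

static void my_set_clockevent(struct my_dev *dev, unsigned long delta)
{
        if (!delta) {
                hrtimer_cancel(&dev->hrt);      /* 0 means "turn it off" */
                return;
        }
        hrtimer_start(&dev->hrt, ktime_add_ns(ktime_get_real(), delta),
                      HRTIMER_MODE_ABS);
}

static void my_init_clockdev(struct my_dev *dev)
{
        hrtimer_init(&dev->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
        dev->hrt.function = my_timer_fn;
}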
- */
-struct lguest_pages {
-	/* This is the stack page mapped rw in guest */
-	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
-	struct lguest_regs regs;
-
-	/* This is the host state & guest descriptor page, ro in guest */
-	struct lguest_ro_state state;
-} __attribute__((aligned(PAGE_SIZE)));
-
-#define CHANGED_IDT		1
-#define CHANGED_GDT		2
-#define CHANGED_GDT_TLS		4 /* Actually a subset of CHANGED_GDT */
-#define CHANGED_ALL		3
-
-struct lg_cpu {
-	unsigned int id;
-	struct lguest *lg;
-	struct task_struct *tsk;
-	struct mm_struct *mm;	/* == tsk->mm, but that becomes NULL on exit */
-
-	u32 cr2;
-	u32 esp1;
-	u16 ss1;
-
-	/* Bitmap of what has changed: see CHANGED_* above. */
-	int changed;
-
-	/* Pending operation. */
-	struct lguest_pending pending;
-
-	unsigned long *reg_read; /* register from LHREQ_GETREG */
-
-	/* At end of a page shared mapped over lguest_pages in guest. */
-	unsigned long regs_page;
-	struct lguest_regs *regs;
-
-	struct lguest_pages *last_pages;
-
-	/* Initialization mode: linear map everything. */
-	bool linear_pages;
-	int cpu_pgd; /* Which pgd this cpu is currently using */
-
-	/* If a hypercall was asked for, this points to the arguments. */
-	struct hcall_args *hcall;
-	u32 next_hcall;
-
-	/* Virtual clock device */
-	struct hrtimer hrt;
-
-	/* Did the Guest tell us to halt? */
-	int halted;
-
-	/* Pending virtual interrupts */
-	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
-
-	struct lg_cpu_arch arch;
-};
-
-/* The private info the thread maintains about the guest. */
-struct lguest {
-	struct lguest_data __user *lguest_data;
-	struct lg_cpu cpus[NR_CPUS];
-	unsigned int nr_cpus;
-
-	/* Valid guest memory pages must be < this. */
-	u32 pfn_limit;
-
-	/* Device memory is >= pfn_limit and < device_limit. */
-	u32 device_limit;
-
-	/*
-	 * This provides the offset to the base of guest-physical memory in the
-	 * Launcher.
-	 */
-	void __user *mem_base;
-	unsigned long kernel_address;
-
-	struct pgdir pgdirs[4];
-
-	unsigned long noirq_iret;
-
-	unsigned int stack_pages;
-	u32 tsc_khz;
-
-	/* Dead? */
-	const char *dead;
-};
-
-extern struct mutex lguest_lock;
-
-/* core.c: */
-bool lguest_address_ok(const struct lguest *lg,
-		       unsigned long addr, unsigned long len);
-void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
-void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
-extern struct page **lg_switcher_pages;
-
-/*H:035
- * Using memory-copy operations like that is usually inconvenient, so we
- * have the following helper macros which read and write a specific type (often
- * an unsigned long).
- *
- * This reads into a variable of the given type then returns that.
- */
-#define lgread(cpu, addr, type)						\
-	({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
-
-/* This checks that the variable is of the given type, then writes it out. */
-#define lgwrite(cpu, addr, type, val)					\
-	do {								\
-		typecheck(type, val);					\
-		__lgwrite((cpu), (addr), &(val), sizeof(val));		\
-	} while(0)
-/* (end of memory access helper routines) :*/
-
-int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
-
-/*
- * Helper macros to obtain the first 12 or the last 20 bits, this is only the
- * first step in the migration to the kernel types.  pte_pfn is already defined
- * in the kernel.
- */ -#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) -#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) -#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) -#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) - -/* interrupts_and_traps.c: */ -unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); -void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); -void set_interrupt(struct lg_cpu *cpu, unsigned int irq); -bool deliver_trap(struct lg_cpu *cpu, unsigned int num); -void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, - u32 low, u32 hi); -void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages); -void pin_stack_pages(struct lg_cpu *cpu); -void setup_default_idt_entries(struct lguest_ro_state *state, - const unsigned long *def); -void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, - const unsigned long *def); -void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); -bool send_notify_to_eventfd(struct lg_cpu *cpu); -void init_clockdev(struct lg_cpu *cpu); -bool check_syscall_vector(struct lguest *lg); -bool could_be_syscall(unsigned int num); -int init_interrupts(void); -void free_interrupts(void); - -/* segments.c: */ -void setup_default_gdt_entries(struct lguest_ro_state *state); -void setup_guest_gdt(struct lg_cpu *cpu); -void load_guest_gdt_entry(struct lg_cpu *cpu, unsigned int i, - u32 low, u32 hi); -void guest_load_tls(struct lg_cpu *cpu, unsigned long tls_array); -void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt); -void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); - -/* page_tables.c: */ -int init_guest_pagetable(struct lguest *lg); -void free_guest_pagetable(struct lguest *lg); -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); -void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); -#ifdef CONFIG_X86_PAE -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); -#endif -void guest_pagetable_clear_all(struct lg_cpu *cpu); -void guest_pagetable_flush_user(struct lg_cpu *cpu); -void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, - unsigned long vaddr, pte_t val); -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); -bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode, - unsigned long *iomem); -void pin_page(struct lg_cpu *cpu, unsigned long vaddr); -bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); -void page_table_guest_data_init(struct lg_cpu *cpu); - -/* /core.c: */ -void lguest_arch_host_init(void); -void lguest_arch_host_fini(void); -void lguest_arch_run_guest(struct lg_cpu *cpu); -void lguest_arch_handle_trap(struct lg_cpu *cpu); -int lguest_arch_init_hypercalls(struct lg_cpu *cpu); -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); -unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); - -/* /switcher.S: */ -extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; - -/* lguest_user.c: */ -int lguest_device_init(void); -void lguest_device_remove(void); - -/* hypercalls.c: */ -void do_hypercalls(struct lg_cpu *cpu); -void write_timestamp(struct lg_cpu *cpu); - -/*L:035 - * Let's step aside for the moment, to study one important routine that's used - * widely in the Host code. 
- * - * There are many cases where the Guest can do something invalid, like pass crap - * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite - * acceptable to simply terminate the Guest and give the Launcher a nicely - * formatted reason. It's also simpler for the Guest itself, which doesn't - * need to check most hypercalls for "success"; if you're still running, it - * succeeded. - * - * Once this is called, the Guest will never run again, so most Host code can - * call this then continue as if nothing had happened. This means many - * functions don't have to explicitly return an error code, which keeps the - * code simple. - * - * It also means that this can be called more than once: only the first one is - * remembered. The only trick is that we still need to kill the Guest even if - * we can't allocate memory to store the reason. Linux has a neat way of - * packing error codes into invalid pointers, so we use that here. - * - * Like any macro which uses an "if", it is safely wrapped in a run-once "do { - * } while(0)". - */ -#define kill_guest(cpu, fmt...) \ -do { \ - if (!(cpu)->lg->dead) { \ - (cpu)->lg->dead = kasprintf(GFP_ATOMIC, fmt); \ - if (!(cpu)->lg->dead) \ - (cpu)->lg->dead = ERR_PTR(-ENOMEM); \ - } \ -} while(0) -/* (End of aside) :*/ - -#endif /* __ASSEMBLY__ */ -#endif /* _LGUEST_H */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c deleted file mode 100644 index 1a6787bc9386..000000000000 --- a/drivers/lguest/lguest_user.c +++ /dev/null @@ -1,446 +0,0 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace - * launcher controls and communicates with the Guest. For example, - * the first write will tell us the Guest's memory layout and entry - * point. A read will run the Guest until something happens, such as - * a signal or the Guest accessing a device. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include "lg.h" - -/*L:052 - The Launcher can get the registers, and also set some of them. -*/ -static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long which; - - /* We re-use the ptrace structure to specify which register to read. */ - if (get_user(which, input) != 0) - return -EFAULT; - - /* - * We set up the cpu register pointer, and their next read will - * actually get the value (instead of running the guest). - * - * The last argument 'true' says we can access any register. - */ - cpu->reg_read = lguest_arch_regptr(cpu, which, true); - if (!cpu->reg_read) - return -ENOENT; - - /* And because this is a write() call, we return the length used. */ - return sizeof(unsigned long) * 2; -} - -static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long which, value, *reg; - - /* We re-use the ptrace structure to specify which register to read. */ - if (get_user(which, input) != 0) - return -EFAULT; - input++; - if (get_user(value, input) != 0) - return -EFAULT; - - /* The last argument 'false' means we can't access all registers. */ - reg = lguest_arch_regptr(cpu, which, false); - if (!reg) - return -ENOENT; - - *reg = value; - - /* And because this is a write() call, we return the length used. */ - return sizeof(unsigned long) * 3; -} - -/*L:050 - * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. 
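kill_guest() leans on the linux/err.h trick of storing a small negative errno in
an otherwise-invalid pointer. Here is that idiom in isolation, a rough sketch
with a file-scope variable standing in for lg->dead (names are illustrative):

#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>

static const char *dead;        /* NULL = alive; string or ERR_PTR = dead */

static void mark_dead(const char *reason)
{
        if (dead)
                return;                         /* only the first reason sticks */
        dead = kasprintf(GFP_ATOMIC, "%s", reason);
        if (!dead)
                dead = ERR_PTR(-ENOMEM);        /* no memory for a message */
}

static void report_dead(void)
{
        if (IS_ERR(dead))
                pr_err("died: errno %ld\n", PTR_ERR(dead));
        else if (dead)
                pr_err("died: %s\n", dead);
}

IS_ERR() is what lets close() below safely kfree() the string case while passing
the packed-errno case straight back to the Launcher.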
- */ -static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long irq; - - if (get_user(irq, input) != 0) - return -EFAULT; - if (irq >= LGUEST_IRQS) - return -EINVAL; - - /* - * Next time the Guest runs, the core code will see if it can deliver - * this interrupt. - */ - set_interrupt(cpu, irq); - return 0; -} - -/*L:053 - * Deliver a trap: this is used by the Launcher if it can't emulate - * an instruction. - */ -static int trap(struct lg_cpu *cpu, const unsigned long __user *input) -{ - unsigned long trapnum; - - if (get_user(trapnum, input) != 0) - return -EFAULT; - - if (!deliver_trap(cpu, trapnum)) - return -EINVAL; - - return 0; -} - -/*L:040 - * Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. - */ -static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) -{ - struct lguest *lg = file->private_data; - struct lg_cpu *cpu; - unsigned int cpu_id = *o; - - /* You must write LHREQ_INITIALIZE first! */ - if (!lg) - return -EINVAL; - - /* Watch out for arbitrary vcpu indexes! */ - if (cpu_id >= lg->nr_cpus) - return -EINVAL; - - cpu = &lg->cpus[cpu_id]; - - /* If you're not the task which owns the Guest, go away. */ - if (current != cpu->tsk) - return -EPERM; - - /* If the Guest is already dead, we indicate why */ - if (lg->dead) { - size_t len; - - /* lg->dead either contains an error code, or a string. */ - if (IS_ERR(lg->dead)) - return PTR_ERR(lg->dead); - - /* We can only return as much as the buffer they read with. */ - len = min(size, strlen(lg->dead)+1); - if (copy_to_user(user, lg->dead, len) != 0) - return -EFAULT; - return len; - } - - /* - * If we returned from read() last time because the Guest sent I/O, - * clear the flag. - */ - if (cpu->pending.trap) - cpu->pending.trap = 0; - - /* Run the Guest until something interesting happens. */ - return run_guest(cpu, (unsigned long __user *)user); -} - -/*L:025 - * This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. - */ -static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) -{ - /* We have a limited number of CPUs in the lguest struct. */ - if (id >= ARRAY_SIZE(cpu->lg->cpus)) - return -EINVAL; - - /* Set up this CPU's id, and pointer back to the lguest struct. */ - cpu->id = id; - cpu->lg = container_of(cpu, struct lguest, cpus[id]); - cpu->lg->nr_cpus++; - - /* Each CPU has a timer it can set. */ - init_clockdev(cpu); - - /* - * We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. - */ - cpu->regs_page = get_zeroed_page(GFP_KERNEL); - if (!cpu->regs_page) - return -ENOMEM; - - /* We actually put the registers at the end of the page. */ - cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - - /* - * Now we initialize the Guest's registers, handing it the start - * address. - */ - lguest_arch_setup_regs(cpu, start_ip); - - /* - * We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). - */ - cpu->tsk = current; - - /* - * We need to keep a pointer to the Launcher's memory map, because if - * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. - */ - cpu->mm = get_task_mm(cpu->tsk); - - /* - * We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. 
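Seen from the Launcher's side, the protocol above is just a write() whose first
unsigned long is the request code. A hedged userspace sketch of sending an
interrupt (LHREQ_IRQ comes from the linux/lguest_launcher.h uapi header of this
era; error handling is trimmed):

#include <unistd.h>
#include <linux/lguest_launcher.h>

static int send_guest_irq(int lguest_fd, unsigned long irq)
{
        /* First word is the request, second is its argument. */
        unsigned long buf[] = { LHREQ_IRQ, irq };

        if (write(lguest_fd, buf, sizeof(buf)) < 0)
                return -1;
        return 0;
}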
- */ - cpu->last_pages = NULL; - - /* No error == success. */ - return 0; -} - -/*L:020 - * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in - * addition to the LHREQ_INITIALIZE value). These are: - * - * base: The start of the Guest-physical memory inside the Launcher memory. - * - * pfnlimit: The highest (Guest-physical) page number the Guest should be - * allowed to access. The Guest memory lives inside the Launcher, so it sets - * this to ensure the Guest can only reach its own memory. - * - * start: The first instruction to execute ("eip" in x86-speak). - */ -static int initialize(struct file *file, const unsigned long __user *input) -{ - /* "struct lguest" contains all we (the Host) know about a Guest. */ - struct lguest *lg; - int err; - unsigned long args[4]; - - /* - * We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. - */ - mutex_lock(&lguest_lock); - /* You can't initialize twice! Close the device and start again... */ - if (file->private_data) { - err = -EBUSY; - goto unlock; - } - - if (copy_from_user(args, input, sizeof(args)) != 0) { - err = -EFAULT; - goto unlock; - } - - lg = kzalloc(sizeof(*lg), GFP_KERNEL); - if (!lg) { - err = -ENOMEM; - goto unlock; - } - - /* Populate the easy fields of our "struct lguest" */ - lg->mem_base = (void __user *)args[0]; - lg->pfn_limit = args[1]; - lg->device_limit = args[3]; - - /* This is the first cpu (cpu 0) and it will start booting at args[2] */ - err = lg_cpu_start(&lg->cpus[0], 0, args[2]); - if (err) - goto free_lg; - - /* - * Initialize the Guest's shadow page tables. This allocates - * memory, so can fail. - */ - err = init_guest_pagetable(lg); - if (err) - goto free_regs; - - /* We keep our "struct lguest" in the file's private_data. */ - file->private_data = lg; - - mutex_unlock(&lguest_lock); - - /* And because this is a write() call, we return the length used. */ - return sizeof(args); - -free_regs: - /* FIXME: This should be in free_vcpu */ - free_page(lg->cpus[0].regs_page); -free_lg: - kfree(lg); -unlock: - mutex_unlock(&lguest_lock); - return err; -} - -/*L:010 - * The first operation the Launcher does must be a write. All writes - * start with an unsigned long number: for the first write this must be - * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts or set up receipt of notifications. - * - * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0 since we only - * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. - */ -static ssize_t write(struct file *file, const char __user *in, - size_t size, loff_t *off) -{ - /* - * Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. - */ - struct lguest *lg = file->private_data; - const unsigned long __user *input = (const unsigned long __user *)in; - unsigned long req; - struct lg_cpu *uninitialized_var(cpu); - unsigned int cpu_id = *off; - - /* The first value tells us what this request is. */ - if (get_user(req, input) != 0) - return -EFAULT; - input++; - - /* If you haven't initialized, you must do that first. */ - if (req != LHREQ_INITIALIZE) { - if (!lg || (cpu_id >= lg->nr_cpus)) - return -EINVAL; - cpu = &lg->cpus[cpu_id]; - - /* Once the Guest is dead, you can only read() why it died. 
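
For concreteness, here is what that first write() looks like from the Launcher's side. Note that although the comment above still says "3 pointer sized values", initialize() actually copies four longs after the request code, device_limit being the fourth. The request value is taken to be 0 as in linux/lguest_launcher.h's enum; treat that, and all the addresses, as illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define LHREQ_INITIALIZE 0UL	/* first value of the launcher enum */

int main(void)
{
	unsigned long args[5] = {
		LHREQ_INITIALIZE,
		0x8000000,	/* base: guest memory inside our mapping */
		0x20000,	/* pfnlimit: highest guest-physical page */
		0x100000,	/* start: first instruction ("eip") */
		0x21000,	/* device_limit: end of the MMIO region */
	};
	int fd = open("/dev/lguest", O_RDWR);

	if (fd < 0 || write(fd, args, sizeof(args)) < 0)
		perror("lguest initialize");
	return 0;
}

One quirk visible in initialize() above: it returns sizeof() its own four-element array, so the Launcher gets back a byte count one long shorter than it wrote.
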
*/ - if (lg->dead) - return -ENOENT; - } - - switch (req) { - case LHREQ_INITIALIZE: - return initialize(file, input); - case LHREQ_IRQ: - return user_send_irq(cpu, input); - case LHREQ_GETREG: - return getreg_setup(cpu, input); - case LHREQ_SETREG: - return setreg(cpu, input); - case LHREQ_TRAP: - return trap(cpu, input); - default: - return -EINVAL; - } -} - -static int open(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - - return 0; -} - -/*L:060 - * The final piece of interface code is the close() routine. It reverses - * everything done in initialize(). This is usually called because the - * Launcher exited. - * - * Note that the close routine returns 0 or a negative error number: it can't - * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. -:*/ -static int close(struct inode *inode, struct file *file) -{ - struct lguest *lg = file->private_data; - unsigned int i; - - /* If we never successfully initialized, there's nothing to clean up */ - if (!lg) - return 0; - - /* - * We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. - */ - mutex_lock(&lguest_lock); - - /* Free up the shadow page tables for the Guest. */ - free_guest_pagetable(lg); - - for (i = 0; i < lg->nr_cpus; i++) { - /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ - hrtimer_cancel(&lg->cpus[i].hrt); - /* We can free up the register page we allocated. */ - free_page(lg->cpus[i].regs_page); - /* - * Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. - */ - mmput(lg->cpus[i].mm); - } - - /* - * If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). - */ - if (!IS_ERR(lg->dead)) - kfree(lg->dead); - /* Free the memory allocated to the lguest_struct */ - kfree(lg); - /* Release lock and exit. */ - mutex_unlock(&lguest_lock); - - return 0; -} - -/*L:000 - * Welcome to our journey through the Launcher! - * - * The Launcher is the Host userspace program which sets up, runs and services - * the Guest. In fact, many comments in the Drivers which refer to "the Host" - * doing things are inaccurate: the Launcher does all the device handling for - * the Guest, but the Guest can't know that. - * - * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we - * shall see more of that later. - * - * We begin our understanding with the Host kernel interface which the Launcher - * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: - */ -static const struct file_operations lguest_fops = { - .owner = THIS_MODULE, - .open = open, - .release = close, - .write = write, - .read = read, - .llseek = default_llseek, -}; -/*:*/ - -/* - * This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). 
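
Since the file ends by calling itself a textbook example, here is that textbook shape in isolation: a minimal module registering a dynamic-minor misc device. This sketch uses the module_misc_device() convenience macro for brevity; lguest itself, as below, spells out the init/exit pair by hand.

#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.llseek	= default_llseek,
};

static struct miscdevice demo_dev = {
	.minor	= MISC_DYNAMIC_MINOR,	/* let the core pick a minor */
	.name	= "demo",		/* appears as /dev/demo */
	.fops	= &demo_fops,
};

module_misc_device(demo_dev);
MODULE_LICENSE("GPL");
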
- */ -static struct miscdevice lguest_dev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lguest", - .fops = &lguest_fops, -}; - -int __init lguest_device_init(void) -{ - return misc_register(&lguest_dev); -} - -void __exit lguest_device_remove(void) -{ - misc_deregister(&lguest_dev); -} diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c deleted file mode 100644 index 0bc127e9f16a..000000000000 --- a/drivers/lguest/page_tables.c +++ /dev/null @@ -1,1239 +0,0 @@ -/*P:700 - * The pagetable code, on the other hand, still shows the scars of - * previous encounters. It's functional, and as neat as it can be in the - * circumstances, but be wary, for these things are subtle and break easily. - * The Guest provides a virtual to physical mapping, but we can neither trust - * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. -:*/ - -/* Copyright (C) Rusty Russell IBM Corporation 2013. - * GPL v2 and any later version */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "lg.h" - -/*M:008 - * We hold reference to pages, which prevents them from being swapped. - * It'd be nice to have a callback in the "struct mm_struct" when Linux wants - * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. -:*/ - -/*H:300 - * The Page Table Code - * - * We use two-level page tables for the Guest, or three-level with PAE. If - * you're not entirely comfortable with virtual addresses, physical addresses - * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page - * Table Handling" (with diagrams!). - * - * The Guest keeps page tables, but we maintain the actual ones here: these are - * called "shadow" page tables. Which is a very Guest-centric name: these are - * the real page tables the CPU uses, although we keep them up to date to - * reflect the Guest's. (See what I mean about weird naming? Since when do - * shadows reflect anything?) - * - * Anyway, this is the most complicated part of the Host code. There are seven - * parts to this: - * (i) Looking up a page table entry when the Guest faults, - * (ii) Making sure the Guest stack is mapped, - * (iii) Setting up a page table entry when the Guest tells us one has changed, - * (iv) Switching page tables, - * (v) Flushing (throwing away) page tables, - * (vi) Mapping the Switcher when the Guest is about to run, - * (vii) Setting up the page tables initially. -:*/ - -/* - * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) - * or 512 PTE entries with PAE (2MB). - */ -#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) - -/* - * For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. - */ -#ifdef CONFIG_X86_PAE -#define CHECK_GPGD_MASK _PAGE_PRESENT -#else -#define CHECK_GPGD_MASK _PAGE_TABLE -#endif - -/*H:320 - * The page table code is curly enough to need helper functions to keep it - * clear and clean. The kernel itself provides many of them; one advantage - * of insisting that the Guest and Host use the same CONFIG_X86_PAE setting. - * - * There are two functions which return pointers to the shadow (aka "real") - * page tables. - * - * spgd_addr() takes the virtual address and returns a pointer to the top-level - * page directory entry (PGD) for that address. 
Since we keep track of several - * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). - */ -static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) -{ - unsigned int index = pgd_index(vaddr); - - /* Return a pointer index'th pgd entry for the i'th page table. */ - return &cpu->lg->pgdirs[i].pgdir[index]; -} - -#ifdef CONFIG_X86_PAE -/* - * This routine then takes the PGD entry given above, which contains the - * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. - */ -static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) -{ - unsigned int index = pmd_index(vaddr); - pmd_t *page; - - /* You should never call this if the PGD entry wasn't valid */ - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); - page = __va(pgd_pfn(spgd) << PAGE_SHIFT); - - return &page[index]; -} -#endif - -/* - * This routine then takes the page directory entry returned above, which - * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. - */ -static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) -{ -#ifdef CONFIG_X86_PAE - pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); - pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); - - /* You should never call this if the PMD entry wasn't valid */ - BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); -#else - pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); - /* You should never call this if the PGD entry wasn't valid */ - BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); -#endif - - return &page[pte_index(vaddr)]; -} - -/* - * These functions are just like the above, except they access the Guest - * page tables. Hence they return a Guest address. - */ -static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) -{ - unsigned int index = vaddr >> (PGDIR_SHIFT); - return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); -} - -#ifdef CONFIG_X86_PAE -/* Follow the PGD to the PMD. */ -static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) -{ - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + pmd_index(vaddr) * sizeof(pmd_t); -} - -/* Follow the PMD to the PTE. */ -static unsigned long gpte_addr(struct lg_cpu *cpu, - pmd_t gpmd, unsigned long vaddr) -{ - unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; - - BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); - return gpage + pte_index(vaddr) * sizeof(pte_t); -} -#else -/* Follow the PGD to the PTE (no mid-level for !PAE). */ -static unsigned long gpte_addr(struct lg_cpu *cpu, - pgd_t gpgd, unsigned long vaddr) -{ - unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; - - BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + pte_index(vaddr) * sizeof(pte_t); -} -#endif -/*:*/ - -/*M:007 - * get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). -:*/ - -/*H:350 - * This routine takes a page number given by the Guest and converts it to - * an actual, physical page number. It can fail for several reasons: the - * virtual address might not be mapped by the Launcher, the write flag is set - * and the page is read-only, or the write flag was set and the page was - * shared so had to be copied, but we ran out of memory. - * - * This holds a reference to the page, so release_pte() is careful to put that - * back. 
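
The index arithmetic these helpers rely on is easier to see with concrete numbers. Here is a user-space illustration of the classic two-level 32-bit split (10 bits of PGD index, 10 bits of PTE index, 12 bits of page offset); these are plain C stand-ins, not the kernel's macros.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PGDIR_SHIFT	22
#define PTRS_PER_PTE	1024

static unsigned pgd_index(unsigned long vaddr)
{
	return vaddr >> PGDIR_SHIFT;
}

static unsigned pte_index(unsigned long vaddr)
{
	return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

int main(void)
{
	unsigned long vaddr = 0xC0101234UL;

	/* Prints: pgd 768, pte 257, offset 0x234 */
	printf("vaddr %#lx -> pgd %u, pte %u, offset %#lx\n",
	       vaddr, pgd_index(vaddr), pte_index(vaddr),
	       vaddr & ((1UL << PAGE_SHIFT) - 1));
	return 0;
}
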
- */ -static unsigned long get_pfn(unsigned long virtpfn, int write) -{ - struct page *page; - - /* gup me one page at this address please! */ - if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1) - return page_to_pfn(page); - - /* This value indicates failure. */ - return -1UL; -} - -/*H:340 - * Converting a Guest page table entry to a shadow (ie. real) page table - * entry can be a little tricky. The flags are (almost) the same, but the - * Guest PTE contains a virtual page number: the CPU needs the real page - * number. - */ -static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) -{ - unsigned long pfn, base, flags; - - /* - * The Guest sets the global flag, because it thinks that it is using - * PGE. We only told it to use PGE so it would tell us whether it was - * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. - */ - flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); - - /* The Guest's pages are offset inside the Launcher. */ - base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - - /* - * We need a temporary "unsigned long" variable to hold the answer from - * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't - * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. - */ - pfn = get_pfn(base + pte_pfn(gpte), write); - if (pfn == -1UL) { - kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* - * When we destroy the Guest, we'll go through the shadow page - * tables and release_pte() them. Make sure we don't think - * this one is valid! - */ - flags = 0; - } - /* Now we assemble our shadow PTE from the page number and flags. */ - return pfn_pte(pfn, __pgprot(flags)); -} - -/*H:460 And to complete the chain, release_pte() looks like this: */ -static void release_pte(pte_t pte) -{ - /* - * Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. - */ - if (pte_flags(pte) & _PAGE_PRESENT) - put_page(pte_page(pte)); -} -/*:*/ - -static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte) -{ - /* We don't handle large pages. */ - if (pte_flags(gpte) & _PAGE_PSE) - return false; - - return (pte_pfn(gpte) >= cpu->lg->pfn_limit - && pte_pfn(gpte) < cpu->lg->device_limit); -} - -static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) -{ - if ((pte_flags(gpte) & _PAGE_PSE) || - pte_pfn(gpte) >= cpu->lg->pfn_limit) { - kill_guest(cpu, "bad page table entry"); - return false; - } - return true; -} - -static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) -{ - if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || - (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) { - kill_guest(cpu, "bad page directory entry"); - return false; - } - return true; -} - -#ifdef CONFIG_X86_PAE -static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) -{ - if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || - (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) { - kill_guest(cpu, "bad page middle directory entry"); - return false; - } - return true; -} -#endif - -/*H:331 - * This is the core routine to walk the shadow page tables and find the page - * table entry for a specific address. - * - * If allocate is set, then we allocate any missing levels, setting the flags - * on the new page directory and mid-level directories using the arguments - * (which are copied from the Guest's page table entries). 
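
The flag surgery in gpte_to_spte() fits in a few lines once the PTE layout is spelled out. In the sketch below the x86 bit values are the real ones, but lookup_real_pfn() is a made-up stand-in for the get_pfn()/mem_base dance.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define _PAGE_PRESENT	0x001
#define _PAGE_RW	0x002
#define _PAGE_GLOBAL	0x100

/* Stand-in: pretend guest page N lives at host page N + 0x8000. */
static uint32_t lookup_real_pfn(uint32_t guest_pfn)
{
	return guest_pfn + 0x8000;
}

static uint32_t gpte_to_spte(uint32_t gpte)
{
	/* Keep the low flag bits, but throw the global bit away... */
	uint32_t flags = (gpte & 0xfff) & ~_PAGE_GLOBAL;
	/* ...and swap the guest's page number for the real one. */
	uint32_t pfn = lookup_real_pfn(gpte >> PAGE_SHIFT);

	return (pfn << PAGE_SHIFT) | flags;
}

int main(void)
{
	uint32_t gpte = (7 << PAGE_SHIFT)
			| _PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL;

	/* Prints: gpte 0x7103 -> spte 0x8007003 */
	printf("gpte %#x -> spte %#x\n", gpte, gpte_to_spte(gpte));
	return 0;
}
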
- */ -static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, - int pgd_flags, int pmd_flags) -{ - pgd_t *spgd; - /* Mid level for PAE. */ -#ifdef CONFIG_X86_PAE - pmd_t *spmd; -#endif - - /* Get top level entry. */ - spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); - if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage; - - /* If they didn't want us to allocate anything, stop. */ - if (!allocate) - return NULL; - - ptepage = get_zeroed_page(GFP_KERNEL); - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. - */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pte page"); - return NULL; - } - /* - * And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. - */ - set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags)); - } - - /* - * Intel's Physical Address Extension actually uses three levels of - * page tables, so we need to look in the mid-level. - */ -#ifdef CONFIG_X86_PAE - /* Now look at the mid-level shadow entry. */ - spmd = spmd_addr(cpu, *spgd, vaddr); - - if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { - /* No shadow entry: allocate a new shadow PTE page. */ - unsigned long ptepage; - - /* If they didn't want us to allocate anything, stop. */ - if (!allocate) - return NULL; - - ptepage = get_zeroed_page(GFP_KERNEL); - - /* - * This is not really the Guest's fault, but killing it is - * simple for this corner case. - */ - if (!ptepage) { - kill_guest(cpu, "out of memory allocating pmd page"); - return NULL; - } - - /* - * And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. - */ - set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags)); - } -#endif - - /* Get the pointer to the shadow PTE entry we're going to set. */ - return spte_addr(cpu, *spgd, vaddr); -} - -/*H:330 - * (i) Looking up a page table entry when the Guest faults. - * - * We saw this call in run_guest(): when we see a page fault in the Guest, we - * come here. That's because we only set up the shadow page tables lazily as - * they're needed, so we get page faults all the time and quietly fix them up - * and return to the Guest without it knowing. - * - * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. - * - * There's a corner case: they're trying to access memory between - * pfn_limit and device_limit, which is I/O memory. In this case, we - * return false and set @iomem to the physical address, so the the - * Launcher can handle the instruction manually. - */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode, - unsigned long *iomem) -{ - unsigned long gpte_ptr; - pte_t gpte; - pte_t *spte; - pmd_t gpmd; - pgd_t gpgd; - - *iomem = 0; - - /* We never demand page the Switcher, so trying is a mistake. */ - if (vaddr >= switcher_addr) - return false; - - /* First step: get the top-level Guest page table entry. */ - if (unlikely(cpu->linear_pages)) { - /* Faking up a linear mapping. */ - gpgd = __pgd(CHECK_GPGD_MASK); - } else { - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - return false; - - /* - * This kills the Guest if it has weird flags or tries to - * refer to a "physical" address outside the bounds. 
- */ - if (!check_gpgd(cpu, gpgd)) - return false; - } - - /* This "mid-level" entry is only used for non-linear, PAE mode. */ - gpmd = __pmd(_PAGE_TABLE); - -#ifdef CONFIG_X86_PAE - if (likely(!cpu->linear_pages)) { - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* Middle level not present? We can't map it in. */ - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - return false; - - /* - * This kills the Guest if it has weird flags or tries to - * refer to a "physical" address outside the bounds. - */ - if (!check_gpmd(cpu, gpmd)) - return false; - } - - /* - * OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. - */ - gpte_ptr = gpte_addr(cpu, gpmd, vaddr); -#else - /* - * OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. - */ - gpte_ptr = gpte_addr(cpu, gpgd, vaddr); -#endif - - if (unlikely(cpu->linear_pages)) { - /* Linear? Make up a PTE which points to same page. */ - gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); - } else { - /* Read the actual PTE value. */ - gpte = lgread(cpu, gpte_ptr, pte_t); - } - - /* If this page isn't in the Guest page tables, we can't page it in. */ - if (!(pte_flags(gpte) & _PAGE_PRESENT)) - return false; - - /* - * Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). - */ - if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) - return false; - - /* User access to a kernel-only page? (bit 3 == user access) */ - if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) - return false; - - /* If they're accessing io memory, we expect a fault. */ - if (gpte_in_iomem(cpu, gpte)) { - *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); - return false; - } - - /* - * Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). - */ - if (!check_gpte(cpu, gpte)) - return false; - - /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ - gpte = pte_mkyoung(gpte); - if (errcode & 2) - gpte = pte_mkdirty(gpte); - - /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd)); - if (!spte) - return false; - - /* - * If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. - */ - release_pte(*spte); - - /* - * If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). - */ - if (pte_dirty(gpte)) - *spte = gpte_to_spte(cpu, gpte, 1); - else - /* - * If this is a read, don't set the "writable" bit in the page - * table entry, even if the Guest says it's writable. That way - * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. - */ - set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - - /* - * Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. - */ - if (likely(!cpu->linear_pages)) - lgwrite(cpu, gpte_ptr, pte_t, gpte); - - /* - * The fault is fixed, the page table is populated, the mapping - * manipulated, the result returned and the code complete. A small - * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. - */ - return true; -} - -/*H:360 - * (ii) Making sure the Guest stack is mapped. 
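
demand_page() keys off the x86 page-fault error code, whose bottom bits its two permission tests above pick apart. Decoded in isolation:

#include <stdio.h>

static void decode_fault(int errcode)
{
	printf("%s %s, page %s\n",
	       (errcode & 4) ? "user" : "kernel",	/* bit 2 */
	       (errcode & 2) ? "write" : "read",	/* bit 1 */
	       (errcode & 1) ? "protection violation"	/* bit 0 */
			     : "not present");
}

int main(void)
{
	decode_fault(2);	/* kernel write to an unmapped page */
	decode_fault(7);	/* user write hitting a protection fault */
	return 0;
}

This is also why pin_page() below passes a hard-wired error code of 2: it wants the "kernel write to a missing page" path.
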
- * - * Remember that direct traps into the Guest need a mapped Guest kernel stack. - * pin_stack_pages() calls us here: we could simply call demand_page(), but as - * we've seen that logic is quite long, and usually the stack pages are already - * mapped, so it's overkill. - * - * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? - */ -static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) -{ - pte_t *spte; - unsigned long flags; - - /* You can't put your stack in the Switcher! */ - if (vaddr >= switcher_addr) - return false; - - /* If there's no shadow PTE, it's not writable. */ - spte = find_spte(cpu, vaddr, false, 0, 0); - if (!spte) - return false; - - /* - * Check the flags on the pte entry itself: it must be present and - * writable. - */ - flags = pte_flags(*spte); - return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); -} - -/* - * So, when pin_stack_pages() asks us to pin a page, we check if it's already - * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). - */ -void pin_page(struct lg_cpu *cpu, unsigned long vaddr) -{ - unsigned long iomem; - - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem)) - kill_guest(cpu, "bad stack page %#lx", vaddr); -} -/*:*/ - -#ifdef CONFIG_X86_PAE -static void release_pmd(pmd_t *spmd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pmd_flags(*spmd) & _PAGE_PRESENT) { - unsigned int i; - pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); - /* For each entry in the page, we might need to release it. */ - for (i = 0; i < PTRS_PER_PTE; i++) - release_pte(ptepage[i]); - /* Now we can free the page of PTEs */ - free_page((long)ptepage); - /* And zero out the PMD entry so we never release it twice. */ - set_pmd(spmd, __pmd(0)); - } -} - -static void release_pgd(pgd_t *spgd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pgd_flags(*spgd) & _PAGE_PRESENT) { - unsigned int i; - pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - - for (i = 0; i < PTRS_PER_PMD; i++) - release_pmd(&pmdpage[i]); - - /* Now we can free the page of PMDs */ - free_page((long)pmdpage); - /* And zero out the PGD entry so we never release it twice. */ - set_pgd(spgd, __pgd(0)); - } -} - -#else /* !CONFIG_X86_PAE */ -/*H:450 - * If we chase down the release_pgd() code, the non-PAE version looks like - * this. The PAE version is almost identical, but instead of calling - * release_pte it calls release_pmd(), which looks much like this. - */ -static void release_pgd(pgd_t *spgd) -{ - /* If the entry's not present, there's nothing to release. */ - if (pgd_flags(*spgd) & _PAGE_PRESENT) { - unsigned int i; - /* - * Converting the pfn to find the actual PTE page is easy: turn - * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). - */ - pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* For each entry in the page, we might need to release it. */ - for (i = 0; i < PTRS_PER_PTE; i++) - release_pte(ptepage[i]); - /* Now we can free the page of PTEs */ - free_page((long)ptepage); - /* And zero out the PGD entry so we never release it twice. */ - *spgd = __pgd(0); - } -} -#endif - -/*H:445 - * We saw flush_user_mappings() twice: once from the flush_user_mappings() - * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 
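
The walk-free-zero pattern shared by release_pgd() and release_pmd() looks like this with the kernel specifics stripped away: a user-space model of one level releasing the level below it.

#include <stdlib.h>

#define ENTRIES 1024

struct level { struct level *entry[ENTRIES]; };

static void release_entry(struct level **slot)
{
	if (*slot) {		/* the "present" check */
		free(*slot);	/* free the page of lower entries */
		*slot = NULL;	/* zero it: never release twice */
	}
}

int main(void)
{
	struct level *top = calloc(1, sizeof(*top));
	int i;

	top->entry[0] = calloc(1, sizeof(struct level));
	for (i = 0; i < ENTRIES; i++)
		release_entry(&top->entry[i]);
	free(top);
	return 0;
}

The zeroing step is the load-bearing one: it is what lets the teardown paths call the release functions again without double-freeing.
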
- * It simply releases every PTE page from 0 up to the Guest's kernel address. - */ -static void flush_user_mappings(struct lguest *lg, int idx) -{ - unsigned int i; - /* Release every pgd entry up to the kernel's address. */ - for (i = 0; i < pgd_index(lg->kernel_address); i++) - release_pgd(lg->pgdirs[idx].pgdir + i); -} - -/*H:440 - * (v) Flushing (throwing away) page tables, - * - * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. - */ -void guest_pagetable_flush_user(struct lg_cpu *cpu) -{ - /* Drop the userspace part of the current page table. */ - flush_user_mappings(cpu->lg, cpu->cpu_pgd); -} -/*:*/ - -/* We walk down the guest page tables to get a guest-physical address */ -bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr) -{ - pgd_t gpgd; - pte_t gpte; -#ifdef CONFIG_X86_PAE - pmd_t gpmd; -#endif - - /* Still not set up? Just map 1:1. */ - if (unlikely(cpu->linear_pages)) { - *paddr = vaddr; - return true; - } - - /* First step: get the top-level Guest page table entry. */ - gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); - /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) - goto fail; - -#ifdef CONFIG_X86_PAE - gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) - goto fail; - gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); -#else - gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); -#endif - if (!(pte_flags(gpte) & _PAGE_PRESENT)) - goto fail; - - *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); - return true; - -fail: - *paddr = -1UL; - return false; -} - -/* - * This is the version we normally use: kills the Guest if it uses a - * bad address - */ -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) -{ - unsigned long paddr; - - if (!__guest_pa(cpu, vaddr, &paddr)) - kill_guest(cpu, "Bad address %#lx", vaddr); - return paddr; -} - -/* - * We keep several page tables. This is a simple routine to find the page - * table (if any) corresponding to this top-level address the Guest has given - * us. - */ -static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) -{ - unsigned int i; - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable) - break; - return i; -} - -/*H:435 - * And this is us, creating the new page directory. If we really do - * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. - */ -static unsigned int new_pgdir(struct lg_cpu *cpu, - unsigned long gpgdir, - int *blank_pgdir) -{ - unsigned int next; - - /* - * We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. - */ - next = prandom_u32() % ARRAY_SIZE(cpu->lg->pgdirs); - /* If it's never been allocated at all before, try now. */ - if (!cpu->lg->pgdirs[next].pgdir) { - cpu->lg->pgdirs[next].pgdir = - (pgd_t *)get_zeroed_page(GFP_KERNEL); - /* If the allocation fails, just keep using the one we have */ - if (!cpu->lg->pgdirs[next].pgdir) - next = cpu->cpu_pgd; - else { - /* - * This is a blank page, so there are no kernel - * mappings: caller must map the stack! - */ - *blank_pgdir = 1; - } - } - /* Record which Guest toplevel this shadows. */ - cpu->lg->pgdirs[next].gpgdir = gpgdir; - /* Release all the non-kernel mappings. */ - flush_user_mappings(cpu->lg, next); - - /* This hasn't run on any CPU at all. 
*/ - cpu->lg->pgdirs[next].last_host_cpu = -1; - - return next; -} - -/*H:501 - * We do need the Switcher code mapped at all times, so we allocate that - * part of the Guest page table here. We map the Switcher code immediately, - * but defer mapping of the guest register page and IDT/LDT etc page until - * just before we run the guest in map_switcher_in_guest(). - * - * We *could* do this setup in map_switcher_in_guest(), but at that point - * we've interrupts disabled, and allocating pages like that is fraught: we - * can't sleep if we need to free up some memory. - */ -static bool allocate_switcher_mapping(struct lg_cpu *cpu) -{ - int i; - - for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { - pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true, - CHECK_GPGD_MASK, _PAGE_TABLE); - if (!pte) - return false; - - /* - * Map the switcher page if not already there. It might - * already be there because we call allocate_switcher_mapping() - * in guest_set_pgd() just in case it did discard our Switcher - * mapping, but it probably didn't. - */ - if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) { - /* Get a reference to the Switcher page. */ - get_page(lg_switcher_pages[0]); - /* Create a read-only, exectuable, kernel-style PTE */ - set_pte(pte, - mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX)); - } - } - cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true; - return true; -} - -/*H:470 - * Finally, a routine which throws away everything: all PGD entries in all - * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. - */ -static void release_all_pagetables(struct lguest *lg) -{ - unsigned int i, j; - - /* Every shadow pagetable this Guest has */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) { - if (!lg->pgdirs[i].pgdir) - continue; - - /* Every PGD entry. */ - for (j = 0; j < PTRS_PER_PGD; j++) - release_pgd(lg->pgdirs[i].pgdir + j); - lg->pgdirs[i].switcher_mapped = false; - lg->pgdirs[i].last_host_cpu = -1; - } -} - -/* - * We also throw away everything when a Guest tells us it's changed a kernel - * mapping. Since kernel mappings are in every page table, it's easiest to - * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. - */ -void guest_pagetable_clear_all(struct lg_cpu *cpu) -{ - release_all_pagetables(cpu->lg); - /* We need the Guest kernel stack mapped again. */ - pin_stack_pages(cpu); - /* And we need Switcher allocated. */ - if (!allocate_switcher_mapping(cpu)) - kill_guest(cpu, "Cannot populate switcher mapping"); -} - -/*H:430 - * (iv) Switching page tables - * - * Now we've seen all the page table setting and manipulation, let's see - * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. - */ -void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) -{ - int newpgdir, repin = 0; - - /* - * The very first time they call this, we're actually running without - * any page tables; we've been making it up. Throw them away now. - */ - if (unlikely(cpu->linear_pages)) { - release_all_pagetables(cpu->lg); - cpu->linear_pages = false; - /* Force allocation of a new pgdir. */ - newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); - } else { - /* Look to see if we have this one already. */ - newpgdir = find_pgdir(cpu->lg, pgtable); - } - - /* - * If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. 
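
new_pgdir()'s replacement policy deserves a note: it evicts a random shadow table rather than tracking recency, trading a little hit rate for zero bookkeeping. The toy cache below models just that choice (all names invented):

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 4

static unsigned long slot_key[NSLOTS];	/* 0 = never allocated */

static int lookup_or_evict(unsigned long key)
{
	int i, victim;

	for (i = 0; i < NSLOTS; i++)
		if (slot_key[i] == key)
			return i;	/* what find_pgdir() finds */

	victim = rand() % NSLOTS;	/* cf. prandom_u32() above */
	slot_key[victim] = key;		/* mug the existing occupant */
	return victim;
}

int main(void)
{
	printf("0x1000 -> slot %d\n", lookup_or_evict(0x1000));
	printf("0x1000 -> slot %d (hit)\n", lookup_or_evict(0x1000));
	printf("0x2000 -> slot %d\n", lookup_or_evict(0x2000));
	return 0;
}
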
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/*
-	 * If it was completely blank, we map in the Guest kernel stack and
-	 * the Switcher.
-	 */
-	if (repin)
-		pin_stack_pages(cpu);
-
-	if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
-		if (!allocate_switcher_mapping(cpu))
-			kill_guest(cpu, "Cannot populate switcher mapping");
-	}
-}
-/*:*/
-
-/*M:009
- * Since we throw away all mappings when a kernel mapping changes, our
- * performance sucks for guests using highmem.  In fact, a guest with
- * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
- * usually slower than a Guest with less memory.
- *
- * This, of course, cannot be fixed.  It would take some kind of... well, I
- * don't know, but the term "puissant code-fu" comes to mind.
-:*/
-
-/*H:420
- * This is the routine which actually sets the page table entry for the
- * "idx"'th shadow page table.
- *
- * Normally, we can just throw out the old entry and replace it with 0: if they
- * use it demand_page() will put the new entry in.  We need to do this anyway:
- * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
- * is read from, and _PAGE_DIRTY when it's written to.
- *
- * But Avi Kivity pointed out that most Operating Systems (Linux included) set
- * these bits on PTEs immediately anyway.  This is done to save the CPU from
- * having to update them, but it helps us the same way: if they set
- * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
- * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
- */
-static void __guest_set_pte(struct lg_cpu *cpu, int idx,
-			    unsigned long vaddr, pte_t gpte)
-{
-	/* Look up the matching shadow page directory entry. */
-	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
-#ifdef CONFIG_X86_PAE
-	pmd_t *spmd;
-#endif
-
-	/* If the top level isn't present, there's no entry to update. */
-	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
-#ifdef CONFIG_X86_PAE
-		spmd = spmd_addr(cpu, *spgd, vaddr);
-		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
-#endif
-			/* Otherwise, start by releasing the existing entry. */
-			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
-			release_pte(*spte);
-
-			/*
-			 * If they're setting this entry as dirty or accessed,
-			 * we might as well put that entry they've given us in
-			 * now.  This shaves 10% off a copy-on-write
-			 * micro-benchmark.
-			 */
-			if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
-			    && !gpte_in_iomem(cpu, gpte)) {
-				if (!check_gpte(cpu, gpte))
-					return;
-				set_pte(spte,
-					gpte_to_spte(cpu, gpte,
-						pte_flags(gpte) & _PAGE_DIRTY));
-			} else {
-				/*
-				 * Otherwise kill it and we can demand_page()
-				 * it in later.
-				 */
-				set_pte(spte, __pte(0));
-			}
-#ifdef CONFIG_X86_PAE
-		}
-#endif
-	}
-}
-
-/*H:410
- * Updating a PTE entry is a little trickier.
- *
- * We keep track of several different page tables (the Guest uses one for each
- * process, so it makes sense to cache at least a few).  Each of these has
- * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
- * all processes.  So when the page table above that address changes, we update
- * all the page tables, not just the current one.  This is rare.
- *
- * The benefit is that when we have to track a new page table, we can keep all
- * the kernel mappings.  This speeds up context switch immensely.
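
The kernel/user split that guest_set_pte() is about to apply can be modelled in a dozen lines: updates above the boundary fan out to every cached table, the rest touch only the matching one. A toy model; the real boundary is lg->kernel_address.

#include <stdio.h>

#define NTABLES		4
#define KERNEL_BASE	0xC0000000UL

/* Per table: one "user half" slot and one "kernel half" slot. */
static unsigned long user_half[NTABLES], kernel_half[NTABLES];

static void set_entry(int which, unsigned long vaddr, unsigned long val)
{
	int i;

	if (vaddr >= KERNEL_BASE) {
		/* Kernel mappings are identical everywhere: update all. */
		for (i = 0; i < NTABLES; i++)
			kernel_half[i] = val;
	} else {
		user_half[which] = val;
	}
}

int main(void)
{
	set_entry(2, 0xC0100000UL, 1);	/* kernel: lands in every table */
	set_entry(2, 0x08048000UL, 2);	/* user: lands only in table 2 */
	printf("table 0: kernel=%lu user=%lu\n",
	       kernel_half[0], user_half[0]);
	return 0;
}
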
- */
-void guest_set_pte(struct lg_cpu *cpu,
-		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
-{
-	/* We don't let you remap the Switcher; we need it to get back! */
-	if (vaddr >= switcher_addr) {
-		kill_guest(cpu, "attempt to set pte into Switcher pages");
-		return;
-	}
-
-	/*
-	 * Kernel mappings must be changed on all top levels.  Slow, but doesn't
-	 * happen often.
-	 */
-	if (vaddr >= cpu->lg->kernel_address) {
-		unsigned int i;
-		for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
-			if (cpu->lg->pgdirs[i].pgdir)
-				__guest_set_pte(cpu, i, vaddr, gpte);
-	} else {
-		/* Is this page table one we have a shadow for? */
-		int pgdir = find_pgdir(cpu->lg, gpgdir);
-		if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs))
-			/* If so, do the update. */
-			__guest_set_pte(cpu, pgdir, vaddr, gpte);
-	}
-}
-
-/*H:400
- * (iii) Setting up a page table entry when the Guest tells us one has changed.
- *
- * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
- * with the other side of page tables while we're here: what happens when the
- * Guest asks for a page table to be updated?
- *
- * We already saw that demand_page() will fill in the shadow page tables when
- * needed, so we can simply remove shadow page table entries whenever the Guest
- * tells us they've changed.  When the Guest tries to use the new entry it will
- * fault and demand_page() will fix it up.
- *
- * So with that in mind here's our code to update a (top-level) PGD entry:
- */
-void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
-{
-	int pgdir;
-
-	if (idx > PTRS_PER_PGD) {
-		kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
-			   idx, PTRS_PER_PGD);
-		return;
-	}
-
-	/* If they're talking about a page table we have a shadow for... */
-	pgdir = find_pgdir(lg, gpgdir);
-	if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
-		/* ... throw it away. */
-		release_pgd(lg->pgdirs[pgdir].pgdir + idx);
-		/* That might have been the Switcher mapping, remap it. */
-		if (!allocate_switcher_mapping(&lg->cpus[0])) {
-			kill_guest(&lg->cpus[0],
-				   "Cannot populate switcher mapping");
-		}
-		lg->pgdirs[pgdir].last_host_cpu = -1;
-	}
-}
-
-#ifdef CONFIG_X86_PAE
-/* For setting a mid-level, we just throw everything away.  It's easy. */
-void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
-{
-	guest_pagetable_clear_all(&lg->cpus[0]);
-}
-#endif
-
-/*H:500
- * (vii) Setting up the page tables initially.
- *
- * When a Guest is first created, we initialize a shadow page table which
- * we will populate on future faults.  The Guest doesn't have any actual
- * pagetables yet, so we set linear_pages to tell demand_page() to fake it
- * for the moment.
- *
- * We do need the Switcher to be mapped at all times, so we allocate that
- * part of the Guest page table here.
- */
-int init_guest_pagetable(struct lguest *lg)
-{
-	struct lg_cpu *cpu = &lg->cpus[0];
-	int allocated = 0;
-
-	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
-	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
-	if (!allocated)
-		return -ENOMEM;
-
-	/* We start with a linear mapping until the Guest provides its own. */
-	cpu->linear_pages = true;
-
-	/* Allocate the page tables for the Switcher. */
-	if (!allocate_switcher_mapping(cpu)) {
-		release_all_pagetables(lg);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
-void page_table_guest_data_init(struct lg_cpu *cpu)
-{
-	/*
-	 * We tell the Guest that it can't use the virtual addresses
-	 * used by the Switcher.
This trick is equivalent to 4GB - - * switcher_addr. - */ - u32 top = ~switcher_addr + 1; - - /* We get the kernel address: above this is all kernel memory. */ - if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) - /* - * We tell the Guest that it can't use the top virtual - * addresses (used by the Switcher). - */ - || put_user(top, &cpu->lg->lguest_data->reserve_mem)) { - kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - return; - } - - /* - * In flush_user_mappings() we loop from 0 to - * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. - */ - if (cpu->lg->kernel_address >= switcher_addr) - kill_guest(cpu, "bad kernel address %#lx", - cpu->lg->kernel_address); -} - -/* When a Guest dies, our cleanup is fairly simple. */ -void free_guest_pagetable(struct lguest *lg) -{ - unsigned int i; - - /* Throw away all page table pages. */ - release_all_pagetables(lg); - /* Now free the top levels: free_page() can handle 0 just fine. */ - for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - free_page((long)lg->pgdirs[i].pgdir); -} - -/*H:481 - * This clears the Switcher mappings for cpu #i. - */ -static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i) -{ - unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2; - pte_t *pte; - - /* Clear the mappings for both pages. */ - pte = find_spte(cpu, base, false, 0, 0); - release_pte(*pte); - set_pte(pte, __pte(0)); - - pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0); - release_pte(*pte); - set_pte(pte, __pte(0)); -} - -/*H:480 - * (vi) Mapping the Switcher when the Guest is about to run. - * - * The Switcher and the two pages for this CPU need to be visible in the Guest - * (and not the pages for other CPUs). - * - * The pages for the pagetables have all been allocated before: we just need - * to make sure the actual PTEs are up-to-date for the CPU we're about to run - * on. - */ -void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) -{ - unsigned long base; - struct page *percpu_switcher_page, *regs_page; - pte_t *pte; - struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd]; - - /* Switcher page should always be mapped by now! */ - BUG_ON(!pgdir->switcher_mapped); - - /* - * Remember that we have two pages for each Host CPU, so we can run a - * Guest on each CPU without them interfering. We need to make sure - * those pages are mapped correctly in the Guest, but since we usually - * run on the same CPU, we cache that, and only update the mappings - * when we move. - */ - if (pgdir->last_host_cpu == raw_smp_processor_id()) - return; - - /* -1 means unknown so we remove everything. */ - if (pgdir->last_host_cpu == -1) { - unsigned int i; - for_each_possible_cpu(i) - remove_switcher_percpu_map(cpu, i); - } else { - /* We know exactly what CPU mapping to remove. */ - remove_switcher_percpu_map(cpu, pgdir->last_host_cpu); - } - - /* - * When we're running the Guest, we want the Guest's "regs" page to - * appear where the first Switcher page for this CPU is. This is an - * optimization: when the Switcher saves the Guest registers, it saves - * them into the first page of this CPU's "struct lguest_pages": if we - * make sure the Guest's register page is already mapped there, we - * don't have to copy them out again. - */ - /* Find the shadow PTE for this regs page. 
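
As an aside, the "~switcher_addr + 1" expression in page_table_guest_data_init() above is just two's-complement negation: on a 32-bit unsigned value it computes 4GB minus switcher_addr exactly. A two-line demonstration (the example address is illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t switcher_addr = 0xFFC00000u;	/* example placement */
	uint32_t top = ~switcher_addr + 1;

	/* 0x100000000 - 0xFFC00000 = 0x400000: the top 4MB reserved. */
	printf("reserve_mem = %#x (%u bytes)\n", top, top);
	return 0;
}
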
 */
-	base = switcher_addr + PAGE_SIZE
-	       + raw_smp_processor_id() * sizeof(struct lguest_pages);
-	pte = find_spte(cpu, base, false, 0, 0);
-	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
-	get_page(regs_page);
-	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
-
-	/*
-	 * We map the second page of the struct lguest_pages read-only in
-	 * the Guest: the IDT, GDT and other things it's not supposed to
-	 * change.
-	 */
-	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
-	percpu_switcher_page
-		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
-	get_page(percpu_switcher_page);
-	set_pte(pte, mk_pte(percpu_switcher_page,
-			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
-
-	pgdir->last_host_cpu = raw_smp_processor_id();
-}
-
-/*H:490
- * We've made it through the page table code.  Perhaps our tired brains are
- * still processing the details, or perhaps we're simply glad it's over.
- *
- * If nothing else, note that all this complexity of keeping the shadow page
- * tables in sync with the Guest's page tables is for one reason: for most
- * Guests this page table dance determines how bad performance will be.  This
- * is why Xen uses exotic direct Guest pagetable manipulation, and why both
- * Intel and AMD have implemented shadow page table support directly into
- * hardware.
- *
- * There is just one file remaining in the Host.
- */
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
deleted file mode 100644
index c4fb424dfddb..000000000000
--- a/drivers/lguest/segments.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*P:600
- * The x86 architecture has segments, which involve a table of descriptors
- * which can be used to do funky things with virtual address interpretation.
- * We originally used segments so the Guest couldn't alter the
- * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
- * for userspace per-thread segments, but trim again on userspace->kernel
- * transitions...  This nightmarish creation was contained within this file,
- * where we knew not to tread without heavy armament and a change of underwear.
- *
- * In these modern times, the segment handling code consists of simple sanity
- * checks, and the worst you'll experience reading this code is butterfly-rash
- * from frolicking through its parklike serenity.
-:*/
-#include "lg.h"
-
-/*H:600
- * Segments & The Global Descriptor Table
- *
- * (That title sounds like a bad Nerdcore group.  Not to suggest that there are
- * any good Nerdcore groups, but in high school a friend of mine had a band
- * called Joe Fish and the Chips, so there are definitely worse band names).
- *
- * To refresh: the GDT is a table of 8-byte values describing segments.  Once
- * set up, these segments can be loaded into one of the 6 "segment registers".
- *
- * GDT entries are passed around as "struct desc_struct"s, which like IDT
- * entries are split into two 32-bit members, "a" and "b".  One day, someone
- * will clean that up, and be declared a Hero.  (No pressure, I'm just saying).
- *
- * Anyway, the GDT entry contains a base (the start address of the segment), a
- * limit (the size of the segment - 1), and some flags.  Sounds simple, and it
- * would be, except those zany Intel engineers decided that it was too boring
- * to put the base at one end, the limit at the other, and the flags in
- * between.
They decided to shotgun the bits at random throughout the 8 bytes, - * like so: - * - * 0 16 40 48 52 56 63 - * [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ] - * mit ags part 2 - * part 2 - * - * As a result, this file contains a certain amount of magic numeracy. Let's - * begin. - */ - -/* - * There are several entries we don't let the Guest set. The TSS entry is the - * "Task State Segment" which controls all kinds of delicate things. The - * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. - */ -static bool ignored_gdt(unsigned int num) -{ - return (num == GDT_ENTRY_TSS - || num == GDT_ENTRY_LGUEST_CS - || num == GDT_ENTRY_LGUEST_DS - || num == GDT_ENTRY_DOUBLEFAULT_TSS); -} - -/*H:630 - * Once the Guest gave us new GDT entries, we fix them up a little. We - * don't care if they're invalid: the worst that can happen is a General - * Protection Fault in the Switcher when it restores a Guest segment register - * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". - */ -static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) -{ - unsigned int i; - - for (i = start; i < end; i++) { - /* - * We never copy these ones to real GDT, so we don't care what - * they say - */ - if (ignored_gdt(i)) - continue; - - /* - * Segment descriptors contain a privilege level: the Guest is - * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. - */ - if (cpu->arch.gdt[i].dpl == 0) - cpu->arch.gdt[i].dpl |= GUEST_PL; - - /* - * Each descriptor has an "accessed" bit. If we don't set it - * now, the CPU will try to set it when the Guest first loads - * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. - */ - cpu->arch.gdt[i].type |= 0x1; - } -} - -/*H:610 - * Like the IDT, we never simply use the GDT the Guest gives us. We keep - * a GDT for each CPU, and copy across the Guest's entries each time we want to - * run the Guest on that CPU. - * - * This routine is called at boot or modprobe time for each CPU to set up the - * constant GDT entries: the ones which are the same no matter what Guest we're - * running. - */ -void setup_default_gdt_entries(struct lguest_ro_state *state) -{ - struct desc_struct *gdt = state->guest_gdt; - unsigned long tss = (unsigned long)&state->guest_tss; - - /* The Switcher segments are full 0-4G segments, privilege level 0 */ - gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - - /* - * The TSS segment refers to the TSS entry for this particular CPU. - */ - gdt[GDT_ENTRY_TSS].a = 0; - gdt[GDT_ENTRY_TSS].b = 0; - - gdt[GDT_ENTRY_TSS].limit0 = 0x67; - gdt[GDT_ENTRY_TSS].base0 = tss & 0xFFFF; - gdt[GDT_ENTRY_TSS].base1 = (tss >> 16) & 0xFF; - gdt[GDT_ENTRY_TSS].base2 = tss >> 24; - gdt[GDT_ENTRY_TSS].type = 0x9; /* 32-bit TSS (available) */ - gdt[GDT_ENTRY_TSS].p = 0x1; /* Entry is present */ - gdt[GDT_ENTRY_TSS].dpl = 0x0; /* Privilege level 0 */ - gdt[GDT_ENTRY_TSS].s = 0x0; /* system segment */ - -} - -/* - * This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). - */ -void setup_guest_gdt(struct lg_cpu *cpu) -{ - /* - * Start with full 0-4G segments...except the Guest is allowed to use - * them, so set the privilege level appropriately in the flags. 
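
To make the promised "magic numeracy" concrete, here is an assembler for that scattered layout, following the diagram above. The helper name is invented; the bit positions are the architectural ones.

#include <stdint.h>
#include <stdio.h>

static uint64_t make_descriptor(uint32_t base, uint32_t limit, uint16_t flags)
{
	uint64_t d = 0;

	d |= limit & 0xFFFF;				/* limit, bits 0-15 */
	d |= (uint64_t)(base & 0xFFFFFF) << 16;		/* base, bits 16-39 */
	d |= (uint64_t)(flags & 0xFF) << 40;		/* type/dpl/present */
	d |= (uint64_t)((limit >> 16) & 0xF) << 48;	/* limit, bits 48-51 */
	d |= (uint64_t)((flags >> 8) & 0xF) << 52;	/* gran./size flags */
	d |= (uint64_t)((base >> 24) & 0xFF) << 56;	/* base, bits 56-63 */
	return d;
}

int main(void)
{
	/*
	 * A flat 0-4GB executable segment with the granularity bit set:
	 * this reproduces the 0x00cf9b00/0x0000ffff word pair behind
	 * lguest's FULL_EXEC_SEGMENT.
	 */
	printf("descriptor = %#018llx\n",
	       (unsigned long long)make_descriptor(0, 0xFFFFF, 0xC9B));
	return 0;
}
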
- */
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].dpl |= GUEST_PL;
-	cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].dpl |= GUEST_PL;
-}
-
-/*H:650
- * An optimization of copy_gdt(), for just the three "thread-local storage"
- * entries.
- */
-void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++)
-		gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:640
- * When the Guest is run on a different CPU, or the GDT entries have changed,
- * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
- * GDT.
- */
-void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
-{
-	unsigned int i;
-
-	/*
-	 * The default entries from setup_default_gdt_entries() are not
-	 * replaced.  See ignored_gdt() above.
-	 */
-	for (i = 0; i < GDT_ENTRIES; i++)
-		if (!ignored_gdt(i))
-			gdt[i] = cpu->arch.gdt[i];
-}
-
-/*H:620
- * This is where the Guest asks us to load a new GDT entry
- * (LHCALL_LOAD_GDT_ENTRY).  We tweak the entry and copy it in.
- */
-void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
-{
-	/*
-	 * We assume the Guest has the same number of GDT entries as the
-	 * Host, otherwise we'd have to dynamically allocate the Guest GDT.
-	 */
-	if (num >= ARRAY_SIZE(cpu->arch.gdt)) {
-		kill_guest(cpu, "too many gdt entries %i", num);
-		return;
-	}
-
-	/* Set it up, then fix it. */
-	cpu->arch.gdt[num].a = lo;
-	cpu->arch.gdt[num].b = hi;
-	fixup_gdt_table(cpu, num, num+1);
-	/*
-	 * Mark that the GDT changed so the core knows it has to copy it again,
-	 * even if the Guest is run on the same CPU.
-	 */
-	cpu->changed |= CHANGED_GDT;
-}
-
-/*
- * This is the fast-track version for just changing the three TLS entries.
- * Remember that this happens on every context switch, so it's worth
- * optimizing.  But wouldn't it be neater to have a single hypercall to cover
- * both cases?
- */
-void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
-{
-	struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
-
-	__lgread(cpu, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
-	fixup_gdt_table(cpu, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
-	/* Note that just the TLS entries have changed. */
-	cpu->changed |= CHANGED_GDT_TLS;
-}
-
-/*H:660
- * With this, we have finished the Host.
- *
- * Five of the seven parts of our task are complete.  You have made it through
- * the Bit of Despair (I think that's somewhere in the page table code,
- * myself).
- *
- * Next, we examine "make Switcher".  It's short, but intense.
- */
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
deleted file mode 100644
index b4f79b923aea..000000000000
--- a/drivers/lguest/x86/core.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright (C) 2006, Rusty Russell IBM Corporation.
- * Copyright (C) 2007, Jes Sorensen SGI.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ -/*P:450 - * This file contains the x86-specific lguest code. It used to be all - * mixed in with drivers/lguest/core.c but several foolhardy code slashers - * wrestled most of the dependencies out to here in preparation for porting - * lguest to other architectures (see what I mean by foolhardy?). - * - * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. -:*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../lg.h" - -static int cpu_had_pge; - -static struct { - unsigned long offset; - unsigned short segment; -} lguest_entry; - -/* Offset from where switcher.S was compiled to where we've copied it */ -static unsigned long switcher_offset(void) -{ - return switcher_addr - (unsigned long)start_switcher_text; -} - -/* This cpu's struct lguest_pages (after the Switcher text page) */ -static struct lguest_pages *lguest_pages(unsigned int cpu) -{ - return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]); -} - -static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu); - -/*S:010 - * We approach the Switcher. - * - * Remember that each CPU has two pages which are visible to the Guest when it - * runs on that CPU. This has to contain the state for that Guest: we copy the - * state in just before we run the Guest. - * - * Each Guest has "changed" flags which indicate what has changed in the Guest - * since it last ran. We saw this set in interrupts_and_traps.c and - * segments.c. - */ -static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) -{ - /* - * Copying all this data can be quite expensive. We usually run the - * same Guest we ran last time (and that Guest hasn't run anywhere else - * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. - */ - if (__this_cpu_read(lg_last_cpu) != cpu || cpu->last_pages != pages) { - __this_cpu_write(lg_last_cpu, cpu); - cpu->last_pages = pages; - cpu->changed = CHANGED_ALL; - } - - /* - * These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. - */ - pages->state.host_cr3 = __pa(current->mm->pgd); - /* - * Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). - */ - map_switcher_in_guest(cpu, pages); - /* - * Set up the two "TSS" members which tell the CPU what stack to use - * for traps which do directly into the Guest (ie. traps at privilege - * level 1). - */ - pages->state.guest_tss.sp1 = cpu->esp1; - pages->state.guest_tss.ss1 = cpu->ss1; - - /* Copy direct-to-Guest trap entries. */ - if (cpu->changed & CHANGED_IDT) - copy_traps(cpu, pages->state.guest_idt, default_idt_entries); - - /* Copy all GDT entries which the Guest can change. */ - if (cpu->changed & CHANGED_GDT) - copy_gdt(cpu, pages->state.guest_gdt); - /* If only the TLS entries have changed, copy them. */ - else if (cpu->changed & CHANGED_GDT_TLS) - copy_gdt_tls(cpu, pages->state.guest_gdt); - - /* Mark the Guest as unchanged for next time. */ - cpu->changed = 0; -} - -/* Finally: the code to actually call into the Switcher to run the Guest. 
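
The change-flag caching in copy_in_guest_info() is a pattern worth isolating: remember where the state last ran, and if that answer changes, assume everything is stale. A reduced model (names invented):

#include <stdio.h>

#define CHANGED_GDT	1
#define CHANGED_IDT	2
#define CHANGED_ALL	(CHANGED_GDT | CHANGED_IDT)

struct vcpu { int changed; int last_cpu; };

static void copy_in(struct vcpu *v, int this_cpu)
{
	if (v->last_cpu != this_cpu) {
		v->last_cpu = this_cpu;
		v->changed = CHANGED_ALL;	/* pretend it all changed */
	}
	if (v->changed & CHANGED_GDT)
		printf("cpu%d: copying GDT\n", this_cpu);
	if (v->changed & CHANGED_IDT)
		printf("cpu%d: copying IDT\n", this_cpu);
	v->changed = 0;		/* mark unchanged for next time */
}

int main(void)
{
	struct vcpu v = { .changed = 0, .last_cpu = -1 };

	copy_in(&v, 0);		/* first run: copies both */
	copy_in(&v, 0);		/* same CPU, nothing dirty: copies none */
	copy_in(&v, 1);		/* migrated: copies both again */
	return 0;
}
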
*/ -static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) -{ - /* This is a dummy value we need for GCC's sake. */ - unsigned int clobber; - - /* - * Copy the guest-specific information into this CPU's "struct - * lguest_pages". - */ - copy_in_guest_info(cpu, pages); - - /* - * Set the trap number to 256 (impossible value). If we fault while - * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. - */ - cpu->regs->trapnum = 256; - - /* - * Now: we push the "eflags" register on the stack, then do an "lcall". - * This is how we change from using the kernel code segment to using - * the dedicated lguest code segment, as well as jumping into the - * Switcher. - * - * The lcall also pushes the old code segment (KERNEL_CS) onto the - * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... - */ - asm volatile("pushf; lcall *%4" - /* - * This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. - */ - : "=a"(clobber), "=b"(clobber) - /* - * %eax contains the pages pointer. ("0" refers to the - * 0-th argument above, ie "a"). %ebx contains the - * physical address of the Guest's top-level page - * directory. - */ - : "0"(pages), - "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)), - "m"(lguest_entry) - /* - * We tell gcc that all these registers could change, - * which means we don't have to save and restore them in - * the Switcher. - */ - : "memory", "%edx", "%ecx", "%edi", "%esi"); -} -/*:*/ - -unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) -{ - switch (reg_off) { - case offsetof(struct pt_regs, bx): - return &cpu->regs->ebx; - case offsetof(struct pt_regs, cx): - return &cpu->regs->ecx; - case offsetof(struct pt_regs, dx): - return &cpu->regs->edx; - case offsetof(struct pt_regs, si): - return &cpu->regs->esi; - case offsetof(struct pt_regs, di): - return &cpu->regs->edi; - case offsetof(struct pt_regs, bp): - return &cpu->regs->ebp; - case offsetof(struct pt_regs, ax): - return &cpu->regs->eax; - case offsetof(struct pt_regs, ip): - return &cpu->regs->eip; - case offsetof(struct pt_regs, sp): - return &cpu->regs->esp; - } - - /* Launcher can read these, but we don't allow any setting. */ - if (any) { - switch (reg_off) { - case offsetof(struct pt_regs, ds): - return &cpu->regs->ds; - case offsetof(struct pt_regs, es): - return &cpu->regs->es; - case offsetof(struct pt_regs, fs): - return &cpu->regs->fs; - case offsetof(struct pt_regs, gs): - return &cpu->regs->gs; - case offsetof(struct pt_regs, cs): - return &cpu->regs->cs; - case offsetof(struct pt_regs, flags): - return &cpu->regs->eflags; - case offsetof(struct pt_regs, ss): - return &cpu->regs->ss; - } - } - - return NULL; -} - -/*M:002 - * There are hooks in the scheduler which we can register to tell when we - * get kicked off the CPU (preempt_notifier_register()). This would allow us - * to lazily disable SYSENTER which would regain some performance, and should - * also simplify copy_in_guest_info(). Note that we'd still need to restore - * things when we exit to Launcher userspace, but that's fairly easy. - * - * We could also try using these hooks for PGE, but that might be too expensive. - * - * The hooks were designed for KVM, but we can also put them to good use. -:*/ - -/*H:040 - * This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. 
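
The lguest_arch_regptr() switch above uses a tidy idiom: the Launcher names a register by its ptrace-style byte offset into struct pt_regs, and a switch on offsetof() translates that into a pointer within the hypervisor's own register layout. A self-contained sketch of the idiom, with both structures shrunk to three invented fields:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for the kernel's pt_regs and lguest's own register layout. */
struct pt_regs { unsigned long bx, cx, ip; };
struct lg_regs { unsigned long eip, ecx, ebx; };

/* Map a pt_regs byte offset onto our own layout, regptr()-style. */
static unsigned long *regptr(struct lg_regs *r, size_t off)
{
	switch (off) {
	case offsetof(struct pt_regs, bx): return &r->ebx;
	case offsetof(struct pt_regs, cx): return &r->ecx;
	case offsetof(struct pt_regs, ip): return &r->eip;
	}
	return NULL; /* unknown or forbidden register */
}

int main(void)
{
	struct lg_regs r = { .eip = 0x1000 };
	unsigned long *p = regptr(&r, offsetof(struct pt_regs, ip));
	if (p)
		printf("ip = %#lx\n", *p);
	return 0;
}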
- */ -void lguest_arch_run_guest(struct lg_cpu *cpu) -{ - /* - * SYSENTER is an optimized way of doing system calls. We can't allow - * it because it always jumps to privilege level 0. A normal Guest - * won't try it because we don't advertise it in CPUID, but a malicious - * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. - */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - - /* - * Now we actually run the Guest. It will return when something - * interesting happens, and we can examine its registers to see what it - * was doing. - */ - run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - - /* - * Note that the "regs" structure contains two extra entries which are - * not really registers: a trap number which says what interrupt or - * trap made the switcher code come back, and an error code which some - * traps set. - */ - - /* Restore SYSENTER if it's supposed to be on. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - - /* - * If the Guest page faulted, then the cr2 register will tell us the - * bad virtual address. We have to grab this now, because once we - * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. - */ - if (cpu->regs->trapnum == 14) - cpu->arch.last_pagefault = read_cr2(); - /* - * Similarly, if we took a trap because the Guest used the FPU, - * we have to restore the FPU it expects to see. - * fpu__restore() may sleep and we may even move off to - * a different CPU. So all the critical stuff should be done - * before this. - */ - else if (cpu->regs->trapnum == 7 && !fpregs_active()) - fpu__restore(&current->thread.fpu); -} - -/*H:130 - * Now we've examined the hypercall code; our Guest can make requests. - * Our Guest is usually so well behaved; it never tries to do things it isn't - * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual - * infrastructure isn't quite complete, because it doesn't contain replacements - * for the Intel I/O instructions. As a result, the Guest sometimes fumbles - * across one during the boot process as it probes for various things which are - * usually attached to a PC. - * - * When the Guest uses one of these instructions, we get a trap (General - * Protection Fault) and come here. We queue this to be sent out to the - * Launcher to handle. - */ - -/* - * The eip contains the *virtual* address of the Guest's instruction: - * we copy the instruction here so the Launcher doesn't have to walk - * the page tables to decode it. We handle the case (eg. in a kernel - * module) where the instruction is over two pages, and the pages are - * virtually but not physically contiguous. - * - * The longest possible x86 instruction is 15 bytes, but we don't handle - * anything that strange. - */ -static void copy_from_guest(struct lg_cpu *cpu, - void *dst, unsigned long vaddr, size_t len) -{ - size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); - unsigned long paddr; - - BUG_ON(len > PAGE_SIZE); - - /* If it goes over a page, copy in two parts. */ - if (len > to_page_end) { - /* But make sure the next page is mapped! */ - if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) - copy_from_guest(cpu, dst + to_page_end, - vaddr + to_page_end, - len - to_page_end); - else - /* Otherwise fill with zeroes.
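
copy_from_guest() splits any copy that crosses a page boundary, because the two halves may live on unrelated physical pages and the second may not be mapped at all. A userspace sketch of the same split logic; the flat guest_mem array is a stand-in for the real guest-physical translation and mapping check:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Toy "guest memory": one flat buffer standing in for mapped guest pages. */
static unsigned char guest_mem[4 * PAGE_SIZE];

/*
 * Copy len bytes starting at guest address addr, splitting at the page
 * boundary exactly as copy_from_guest() above does, so each half can be
 * translated (and validated) separately.
 */
static void copy_split(void *dst, unsigned long addr, size_t len)
{
	size_t to_page_end = PAGE_SIZE - (addr % PAGE_SIZE);

	if (len > to_page_end) {
		/* Tail first: its mapping may differ from the head's. */
		copy_split((char *)dst + to_page_end, addr + to_page_end,
			   len - to_page_end);
		len = to_page_end;
	}
	memcpy(dst, &guest_mem[addr], len);
}

int main(void)
{
	unsigned char insn[15]; /* longest possible x86 instruction */
	copy_split(insn, PAGE_SIZE - 4, sizeof(insn)); /* straddles a page */
	printf("copied %zu bytes across a page boundary\n", sizeof(insn));
	return 0;
}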
*/ - memset(dst + to_page_end, 0, len - to_page_end); - len = to_page_end; - } - - /* This will kill the guest if it isn't mapped, but that - * shouldn't happen. */ - __lgread(cpu, dst, guest_pa(cpu, vaddr), len); -} - - -static void setup_emulate_insn(struct lg_cpu *cpu) -{ - cpu->pending.trap = 13; - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, - sizeof(cpu->pending.insn)); -} - -static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr) -{ - cpu->pending.trap = 14; - cpu->pending.addr = iomem_addr; - copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, - sizeof(cpu->pending.insn)); -} - -/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ -void lguest_arch_handle_trap(struct lg_cpu *cpu) -{ - unsigned long iomem_addr; - - switch (cpu->regs->trapnum) { - case 13: /* We've intercepted a General Protection Fault. */ - /* Hand to Launcher to emulate those pesky IN and OUT insns */ - if (cpu->regs->errcode == 0) { - setup_emulate_insn(cpu); - return; - } - break; - case 14: /* We've intercepted a Page Fault. */ - /* - * The Guest accessed a virtual address that wasn't mapped. - * This happens a lot: we don't actually set up most of the page - * tables for the Guest at all when we start: as it runs it asks - * for more and more, and we set them up as required. In this - * case, we don't even tell the Guest that the fault happened. - * - * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. - */ - if (demand_page(cpu, cpu->arch.last_pagefault, - cpu->regs->errcode, &iomem_addr)) - return; - - /* Was this an access to memory mapped IO? */ - if (iomem_addr) { - /* Tell Launcher, let it handle it. */ - setup_iomem_insn(cpu, iomem_addr); - return; - } - - /* - * OK, it's really not there (or not OK): the Guest needs to - * know. We write out the cr2 value so it knows where the - * fault occurred. - * - * Note that if the Guest were really messed up, this could - * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL - */ - if (cpu->lg->lguest_data && - put_user(cpu->arch.last_pagefault, - &cpu->lg->lguest_data->cr2)) - kill_guest(cpu, "Writing cr2"); - break; - case 7: /* We've intercepted a Device Not Available fault. */ - /* No special handling is needed here. */ - break; - case 32 ... 255: - /* This might be a syscall. */ - if (could_be_syscall(cpu->regs->trapnum)) - break; - - /* - * Other values mean a real interrupt occurred, in which case - * the Host handler has already been run. We just do a - * friendly check if another process should now be run, then - * return to run the Guest again. - */ - cond_resched(); - return; - case LGUEST_TRAP_ENTRY: - /* - * Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. - */ - cpu->hcall = (struct hcall_args *)cpu->regs; - return; - } - - /* We didn't handle the trap, so it needs to go to the Guest. */ - if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* - * If the Guest doesn't have a handler (either it hasn't - * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. - */ - kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", - cpu->regs->trapnum, cpu->regs->eip, - cpu->regs->trapnum == 14 ? 
cpu->arch.last_pagefault - : cpu->regs->errcode); -} - -/* - * Now we can look at each of the routines this calls, in increasing order of - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), - * deliver_trap() and demand_page(). After all those, we'll be ready to - * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. -:*/ -static void adjust_pge(void *on) -{ - if (on) - cr4_set_bits(X86_CR4_PGE); - else - cr4_clear_bits(X86_CR4_PGE); -} - -/*H:020 - * Now the Switcher is mapped and everything else is ready, we need to do - * some more i386-specific initialization. - */ -void __init lguest_arch_host_init(void) -{ - int i; - - /* - * Most of the x86/switcher_32.S doesn't care that it's been moved; on - * Intel, jumps are relative, and it doesn't access any references to - * external code or data. - * - * The only exception is the interrupt handlers in switcher.S: their - * addresses are placed in a table (default_idt_entries), so we need to - * update the table with the new addresses. switcher_offset() is a - * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. - */ - for (i = 0; i < IDT_ENTRIES; i++) - default_idt_entries[i] += switcher_offset(); - - /* - * Set up the Switcher's per-cpu areas. - * - * Each CPU gets two pages of its own within the high-mapped region - * (aka. "struct lguest_pages"). Much of this can be initialized now, - * but some depends on what Guest we are running (which is set up in - * copy_in_guest_info()). - */ - for_each_possible_cpu(i) { - /* lguest_pages() returns this CPU's two pages. */ - struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code neater. */ - struct lguest_ro_state *state = &pages->state; - - /* - * The Global Descriptor Table: the Host has a different one - * for each CPU. We keep a descriptor for the GDT which says - * where it is and how big it is (the "size" is actually the - * offset of the last byte, not the length, hence the "-1"). - */ - state->host_gdt_desc.size = GDT_SIZE-1; - state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); - - /* - * All CPUs on the Host use the same Interrupt Descriptor - * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. - */ - store_idt(&state->host_idt_desc); - - /* - * The descriptors for the Guest's GDT and IDT can be filled - * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. - */ - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; - state->guest_idt_desc.address = (long)&state->guest_idt; - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; - state->guest_gdt_desc.address = (long)&state->guest_gdt; - - /* - * We know where we want the stack to be when the Guest enters - * the Switcher: in pages->regs. The stack grows downwards, so - * we start it at the end of that structure. - */ - state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* - * And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. - */ - state->guest_tss.ss0 = LGUEST_DS; - - /* - * x86 can have a fine-grained bitmap which indicates what I/O - * ports the process can use. We set it to the end of our - * structure, meaning "none". - */ - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - - /* - * Some GDT entries are the same across all Guests, so we can - * set them up now.
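
The "-1" in those descriptor sizes deserves a note: the operand of lgdt/lidt holds the offset of the table's last byte, not the table's length. A small sketch of that convention, with the operand layout written out by hand (the struct name is invented for illustration):

#include <stdint.h>
#include <stdio.h>

/* The lgdt/lidt operand: a 16-bit limit and a 32-bit base, packed. */
struct desc_ptr {
	uint16_t size;     /* offset of the LAST byte, i.e. bytes - 1 */
	uint32_t address;
} __attribute__((packed));

int main(void)
{
	uint64_t gdt[8];   /* 8 descriptors of 8 bytes each */
	struct desc_ptr d = {
		.size    = sizeof(gdt) - 1, /* hence the "-1" above */
		.address = (uint32_t)(uintptr_t)gdt,
	};
	printf("limit=%u covers %zu bytes\n", (unsigned)d.size, sizeof(gdt));
	return 0;
}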
- */ - setup_default_gdt_entries(state); - /* Most IDT entries are the same for all Guests, too.*/ - setup_default_idt_entries(state, default_idt_entries); - - /* - * The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. - */ - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - } - - /* - * In the Switcher, we want the %cs segment register to use the - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so - * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. - */ - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); - lguest_entry.segment = LGUEST_CS; - - /* - * Finally, we need to turn off "Page Global Enable". PGE is an - * optimization where page table entries are specially marked to show - * they never change. The Host kernel marks all the kernel pages this - * way because it's always present, even when userspace is running. - * - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we - * switch to the Guest kernel. If you don't disable this on all CPUs, - * you'll get really weird bugs that you'll chase for two days. - * - * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticeably. - */ - - /* - * We don't need the complexity of CPUs coming and going while we're - * doing this. - */ - get_online_cpus(); - if (boot_cpu_has(X86_FEATURE_PGE)) { /* We have a broader idea of "global". */ - /* Remember that this was originally set (for cleanup). */ - cpu_had_pge = 1; - /* - * adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). - */ - on_each_cpu(adjust_pge, (void *)0, 1); - /* Turn off the feature in the global feature set. */ - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); - } - put_online_cpus(); -} -/*:*/ - -void __exit lguest_arch_host_fini(void) -{ - /* If we had PGE before we started, turn it back on now. */ - get_online_cpus(); - if (cpu_had_pge) { - set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); - /* adjust_pge's argument "1" means set PGE. */ - on_each_cpu(adjust_pge, (void *)1, 1); - } - put_online_cpus(); -} - - -/*H:122 The i386-specific hypercalls simply farm out to the right functions. */ -int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args) -{ - switch (args->arg0) { - case LHCALL_LOAD_GDT_ENTRY: - load_guest_gdt_entry(cpu, args->arg1, args->arg2, args->arg3); - break; - case LHCALL_LOAD_IDT_ENTRY: - load_guest_idt_entry(cpu, args->arg1, args->arg2, args->arg3); - break; - case LHCALL_LOAD_TLS: - guest_load_tls(cpu, args->arg1); - break; - default: - /* Bad Guest. Bad! */ - return -EIO; - } - return 0; -} - -/*H:126 i386-specific hypercall initialization: */ -int lguest_arch_init_hypercalls(struct lg_cpu *cpu) -{ - u32 tsc_speed; - - /* - * The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. - */ - if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, - sizeof(*cpu->lg->lguest_data))) - return -EFAULT; - - /* - * Having checked it, we simply set lg->lguest_data to point straight - * into the Launcher's memory at the right place and then use - * copy_to_user/from_user from now on, instead of lgread/write. I put - * this in to show that I'm not immune to writing stupid - * optimizations.
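
lguest_arch_do_hcall() above is the entire hypercall ABI in miniature: one call number plus up to three arguments, farmed out by a switch. A minimal userspace sketch of the same dispatch shape; the HCALL_* numbers and the printf bodies are invented for illustration:

#include <errno.h>
#include <stdio.h>

/* Hypothetical hypercall numbers and argument block. */
enum { HCALL_LOAD_GDT_ENTRY = 1, HCALL_LOAD_TLS = 2 };

struct hcall_args { unsigned long arg0, arg1, arg2, arg3; };

/* Farm each call number out to its handler, do_hcall()-style. */
static int do_hcall(const struct hcall_args *args)
{
	switch (args->arg0) {
	case HCALL_LOAD_GDT_ENTRY:
		printf("load gdt entry %lu = %#lx:%#lx\n",
		       args->arg1, args->arg2, args->arg3);
		return 0;
	case HCALL_LOAD_TLS:
		printf("load tls array at %#lx\n", args->arg1);
		return 0;
	default:
		return -EIO;	/* Bad Guest. Bad! */
	}
}

int main(void)
{
	struct hcall_args a = { HCALL_LOAD_TLS, 0x8000, 0, 0 };
	return do_hcall(&a) ? 1 : 0;
}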
- */ - cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - - /* - * We insist that the Time Stamp Counter exist and doesn't change with - * cpu frequency. Some devious chip manufacturers decided that TSC - * changes could be handled in software. I decided that time going - * backwards might be good for benchmarks, but it's bad for users. - * - * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. - */ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) - tsc_speed = tsc_khz; - else - tsc_speed = 0; - if (put_user(tsc_speed, &cpu->lg->lguest_data->tsc_khz)) - return -EFAULT; - - /* The interrupt code might not like the system call vector. */ - if (!check_syscall_vector(cpu->lg)) - kill_guest(cpu, "bad syscall vector"); - - return 0; -} -/*:*/ - -/*L:030 - * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. - */ -void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) -{ - struct lguest_regs *regs = cpu->regs; - - /* - * There are four "segment" registers which the Guest needs to boot: - * The "code segment" register (cs) refers to the kernel code segment - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers - * refer to the kernel data segment __KERNEL_DS. - * - * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL). - */ - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; - regs->cs = __KERNEL_CS|GUEST_PL; - - /* - * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) - * is supposed to always be "1". Bit 9 (0x200) controls whether - * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. - */ - regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; - - /* - * The "Extended Instruction Pointer" register says where the Guest is - * running. - */ - regs->eip = start; - - /* - * %esi points to our boot information, at physical address 0, so don't - * touch it. - */ - - /* There are a couple of GDT entries the Guest expects at boot. */ - setup_guest_gdt(cpu); -} diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S deleted file mode 100644 index 40634b0db9f7..000000000000 --- a/drivers/lguest/x86/switcher_32.S +++ /dev/null @@ -1,388 +0,0 @@ -/*P:900 - * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride - * both the Host and Guest to do the low-level Guest<->Host switch. It is as - * simple as it can be made, but it's naturally very specific to x86. - * - * You have now completed Preparation. If this has whet your appetite; if you - * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". - :*/ - -/*M:012 - * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must - * gain at least 1% more performance. Since neither LOC nor performance can be - * measured beforehand, it generally means implementing a feature then deciding - * if it's worth it. And once it's implemented, who can say no? - * - * This is why I haven't implemented this idea myself. I want to, but I - * haven't. You could, though. - * - * The main place where lguest performance sucks is Guest page faulting. 
When - * a Guest userspace process hits an unmapped page we switch back to the Host, - * walk the page tables, find it's not mapped, switch back to the Guest page - * fault handler, which calls a hypercall to set the page table entry, then - * finally returns to userspace. That's two round-trips. - * - * If we had a small walker in the Switcher, we could quickly check the Guest - * page table and if the page isn't mapped, immediately reflect the fault back - * into the Guest. This means the Switcher would have to know the top of the - * Guest page table and the page fault handler address. - * - * For simplicity, the Guest should only handle the case where the privilege - * level of the fault is 3 and probably only not present or write faults. It - * should also detect recursive faults, and hand the original fault to the - * Host (which is actually really easy). - * - * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? -:*/ - -/*M:011 - * Lguest64 handles NMI. This gave me NMI envy (until I looked at their - * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. -:*/ - -/*S:100 - * Welcome to the Switcher itself! - * - * This file contains the low-level code which changes the CPU to run the Guest - * code, and returns to the Host when something happens. Understand this, and - * you understand the heart of our journey. - * - * Because this is in assembler rather than C, our tale switches from prose to - * verse. First I tried limericks: - * - * There once was an eax reg, - * To which our pointer was fed, - * It needed an add, - * Which asm-offsets.h had - * But this limerick is hurting my head. - * - * Next I tried haikus, but fitting the required reference to the seasons in - * every stanza was quickly becoming tiresome: - * - * The %eax reg - * Holds "struct lguest_pages" now: - * Cherry blossoms fall. - * - * Then I started with Heroic Verse, but the rhyming requirement leeched away - * the content density and led to some uniquely awful oblique rhymes: - * - * These constants are coming from struct offsets - * For use within the asm switcher text. - * - * Finally, I settled for something between heroic hexameter, and normal prose - * with inappropriate linebreaks. Anyway, it aint no Shakespeare. - */ - -// Not all kernel headers work from assembler -// But these ones are needed: the ENTRY() define -// And constants extracted from struct offsets -// To avoid magic numbers and breakage: -// Should they change the compiler can't save us -// Down here in the depths of assembler code. -#include -#include -#include -#include -#include - -// We mark the start of the code to copy -// It's placed in .text tho it's never run here -// You'll see the trick macro at the end -// Which interleaves data and text to effect. -.text -ENTRY(start_switcher_text) - -// When we reach switch_to_guest we have just left -// The safe and comforting shores of C code -// %eax has the "struct lguest_pages" to use -// Where we save state and still see it from the Guest -// And %ebx holds the Guest shadow pagetable: -// Once set we have truly left Host behind. -ENTRY(switch_to_guest) - // We told gcc all its regs could fade, - // Clobbered by our journey into the Guest - // We could have saved them, if we tried - // But time is our master and cycles count. 
- - // Segment registers must be saved for the Host - // We push them on the Host stack for later - pushl %es - pushl %ds - pushl %gs - pushl %fs - // But the compiler is fickle, and heeds - // No warning of %ebp clobbers - // When frame pointers are used. That register - // Must be saved and restored or chaos strikes. - pushl %ebp - // The Host's stack is done, now save it away - // In our "struct lguest_pages" at offset - // Distilled into asm-offsets.h - movl %esp, LGUEST_PAGES_host_sp(%eax) - - // All saved and there's now five steps before us: - // Stack, GDT, IDT, TSS - // Then last of all the page tables are flipped. - - // Yet beware that our stack pointer must be - // Always valid lest an NMI hits - // %edx does the duty here as we juggle - // %eax is lguest_pages: our stack lies within. - movl %eax, %edx - addl $LGUEST_PAGES_regs, %edx - movl %edx, %esp - - // The Guest's GDT we so carefully - // Placed in the "struct lguest_pages" before - lgdt LGUEST_PAGES_guest_gdt_desc(%eax) - - // The Guest's IDT we did partially - // Copy to "struct lguest_pages" as well. - lidt LGUEST_PAGES_guest_idt_desc(%eax) - - // The TSS entry which controls traps - // Must be loaded up with "ltr" now: - // The GDT entry that TSS uses - // Changes type when we load it: damn Intel! - // For after we switch over our page tables - // That entry will be read-only: we'd crash. - movl $(GDT_ENTRY_TSS*8), %edx - ltr %dx - - // Look back now, before we take this last step! - // The Host's TSS entry was also marked used; - // Let's clear it again for our return. - // The GDT descriptor of the Host - // Points to the table after two "size" bytes - movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx - // Clear "used" from type field (byte 5, bit 2) - andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) - - // Once our page table's switched, the Guest is live! - // The Host fades as we run this final step. - // Our "struct lguest_pages" is now read-only. - movl %ebx, %cr3 - - // The page table change did one tricky thing: - // The Guest's register page has been mapped - // Writable under our %esp (stack) -- - // We can simply pop off all Guest regs. - popl %eax - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %gs - popl %fs - popl %ds - popl %es - - // Near the base of the stack lurk two strange fields - // Which we fill as we exit the Guest - // These are the trap number and its error - // We can simply step past them on our way. - addl $8, %esp - - // The last five stack slots hold return address - // And everything needed to switch privilege - // From Switcher's level 0 to Guest's 1, - // And the stack where the Guest had last left it. - // Interrupts are turned back on: we are Guest. - iret - -// We tread two paths to switch back to the Host -// Yet both must save Guest state and restore Host -// So we put the routine in a macro. -#define SWITCH_TO_HOST \ - /* We save the Guest state: all registers first \ - * Laid out just as "struct lguest_regs" defines */ \ - pushl %es; \ - pushl %ds; \ - pushl %fs; \ - pushl %gs; \ - pushl %ebp; \ - pushl %edi; \ - pushl %esi; \ - pushl %edx; \ - pushl %ecx; \ - pushl %ebx; \ - pushl %eax; \ - /* Our stack and our code are using segments \ - * Set in the TSS and IDT \ - * Yet if we were to touch data we'd use \ - * Whatever data segment the Guest had. \ - * Load the lguest ds segment for now. */ \ - movl $(LGUEST_DS), %eax; \ - movl %eax, %ds; \ - /* So where are we? Which CPU, which struct? 
\ - * The stack is our clue: our TSS starts \ - * It at the end of "struct lguest_pages". \ - * Or we may have stumbled while restoring \ - * Our Guest segment regs while in switch_to_guest, \ - * The fault pushed atop that part-unwound stack. \ - * If we round the stack down to the page start \ - * We're at the start of "struct lguest_pages". */ \ - movl %esp, %eax; \ - andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ - /* Save our trap number: the switch will obscure it \ - * (In the Host the Guest regs are not mapped here) \ - * %ebx holds it safe for deliver_to_host */ \ - movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ - /* The Host GDT, IDT and stack! \ - * All these lie safely hidden from the Guest: \ - * We must return to the Host page tables \ - * (Hence that was saved in struct lguest_pages) */ \ - movl LGUEST_PAGES_host_cr3(%eax), %edx; \ - movl %edx, %cr3; \ - /* As before, when we looked back at the Host \ - * As we left and marked TSS unused \ - * So must we now for the Guest left behind. */ \ - andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \ - /* Switch to Host's GDT, IDT. */ \ - lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ - lidt LGUEST_PAGES_host_idt_desc(%eax); \ - /* Restore the Host's stack where its saved regs lie */ \ - movl LGUEST_PAGES_host_sp(%eax), %esp; \ - /* Last the TSS: our Host is returned */ \ - movl $(GDT_ENTRY_TSS*8), %edx; \ - ltr %dx; \ - /* Restore now the regs saved right at the first. */ \ - popl %ebp; \ - popl %fs; \ - popl %gs; \ - popl %ds; \ - popl %es - -// The first path is trod when the Guest has trapped: -// (Which trap it was has been pushed on the stack). -// We need only switch back, and the Host will decode -// Why we came home, and what needs to be done. -return_to_host: - SWITCH_TO_HOST - iret - -// We are led to the second path like so: -// An interrupt, with some cause external -// Has ajerked us rudely from the Guest's code -// Again we must return home to the Host -deliver_to_host: - SWITCH_TO_HOST - // But now we must go home via that place - // Where that interrupt was supposed to go - // Had we not been ensconced, running the Guest. - // Here we see the trickiness of run_guest_once(): - // The Host stack is formed like an interrupt - // With EIP, CS and EFLAGS layered. - // Interrupt handlers end with "iret" - // And that will take us home at long long last. - - // But first we must find the handler to call! - // The IDT descriptor for the Host - // Has two bytes for size, and four for address: - // %edx will hold it for us for now. - movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx - // We now know the table address we need, - // And saved the trap's number inside %ebx. - // Yet the pointer to the handler is smeared - // Across the bits of the table entry. - // What oracle can tell us how to extract - // From such a convoluted encoding? - // I consulted gcc, and it gave - // These instructions, which I gladly credit: - leal (%edx,%ebx,8), %eax - movzwl (%eax),%edx - movl 4(%eax), %eax - xorw %ax, %ax - orl %eax, %edx - // Now the address of the handler's in %edx - // We call it now: its "iret" drops us home. - jmp *%edx - -// Every interrupt can come to us here -// But we must truly tell each apart. -// They number two hundred and fifty six -// And each must land in a different spot, -// Push its number on stack, and join the stream.
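
The movzwl/xorw/orl dance in deliver_to_host reassembles a handler address that an x86-32 interrupt gate stores in two separate 16-bit halves. The same extraction in C, as a sketch (the field names are invented; the layout is the architectural one):

#include <stdint.h>
#include <stdio.h>

/* An x86-32 interrupt gate: the handler address is split in two halves. */
struct idt_gate {
	uint16_t offset_lo;   /* handler bits 0..15 */
	uint16_t selector;    /* code segment to use */
	uint16_t flags;       /* type, DPL, present bit */
	uint16_t offset_hi;   /* handler bits 16..31 */
};

/* Reassemble the address, as deliver_to_host does with shifts and ors. */
static uint32_t gate_address(const struct idt_gate *g)
{
	return ((uint32_t)g->offset_hi << 16) | g->offset_lo;
}

int main(void)
{
	struct idt_gate g = { .offset_lo = 0x5678, .offset_hi = 0x1234 };
	printf("handler at %#x\n", gate_address(&g)); /* 0x12345678 */
	return 0;
}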
- -// And worse, a mere six of the traps stand apart -// And push on their stack an addition: -// An error number, thirty two bits long -// So we punish the other two fifty -// And make them push a zero so they match. - -// Yet two fifty six entries is long -// And all will look most the same as the last -// So we create a macro which can make -// As many entries as we need to fill. - -// Note the change to .data then .text: -// We plant the address of each entry -// Into a (data) table for the Host -// To know where each Guest interrupt should go. -.macro IRQ_STUB N TARGET - .data; .long 1f; .text; 1: - // Trap eight, ten through fourteen and seventeen - // Supply an error number. Else zero. - .if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17) - pushl $0 - .endif - pushl $\N - jmp \TARGET - ALIGN -.endm - -// This macro creates numerous entries -// Using GAS macros which out-power C's. -.macro IRQ_STUBS FIRST LAST TARGET - irq=\FIRST - .rept \LAST-\FIRST+1 - IRQ_STUB irq \TARGET - irq=irq+1 - .endr -.endm - -// Here's the marker for our pointer table -// Laid in the data section just before -// Each macro places the address of code -// Forming an array: each one points to text -// Which handles interrupt in its turn. -.data -.global default_idt_entries -default_idt_entries: -.text - // The first two traps go straight back to the Host - IRQ_STUBS 0 1 return_to_host - // We'll say nothing, yet, about NMI - IRQ_STUB 2 handle_nmi - // Other traps also return to the Host - IRQ_STUBS 3 31 return_to_host - // All interrupts go via their handlers - IRQ_STUBS 32 127 deliver_to_host - // 'Cept system calls coming from userspace - // Are to go to the Guest, never the Host. - IRQ_STUB 128 return_to_host - IRQ_STUBS 129 255 deliver_to_host - -// The NMI, what a fabulous beast -// Which swoops in and stops us no matter that -// We're suspended between heaven and hell, -// (Or more likely between the Host and Guest) -// When in it comes! We are dazed and confused -// So we do the simplest thing which one can. -// Though we've pushed the trap number and zero -// We discard them, return, and hope we live. -handle_nmi: - addl $8, %esp - iret - -// We are done; all that's left is Mastery -// And "make Mastery" is a journey long -// Designed to make your fingers itch to code. - -// Here ends the text, the file and poem. -ENTRY(end_switcher_text) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 83a1616903f8..aba0d652095b 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -333,7 +333,7 @@ config VIRTIO_NET depends on VIRTIO ---help--- This is the virtual network driver for virtio. It can be used with - lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. + QEMU based VMMs (like KVM or Xen). Say Y or M. config NLMON tristate "Virtual netlink monitoring device" diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig index b8d5ea0ae26b..fec457edad14 100644 --- a/drivers/tty/hvc/Kconfig +++ b/drivers/tty/hvc/Kconfig @@ -4,7 +4,7 @@ config HVC_DRIVER bool help Generic "hypervisor virtual console" infrastructure for various - hypervisors (pSeries, iSeries, Xen, lguest). + hypervisors (pSeries, iSeries, Xen). It will automatically be selected if one of the back-end console drivers is selected. 
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 623f72334fa5..cff773f15b7e 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -2,8 +2,8 @@ config VIRTIO tristate ---help--- This option is selected by any driver which implements the virtio - bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_LGUEST, - CONFIG_RPMSG or CONFIG_S390_GUEST. + bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG + or CONFIG_S390_GUEST. menu "Virtio drivers" diff --git a/include/linux/lguest.h b/include/linux/lguest.h deleted file mode 100644 index 6db19f35f7c5..000000000000 --- a/include/linux/lguest.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Things the lguest guest needs to know. Note: like all lguest interfaces, - * this is subject to wild and random change between versions. - */ -#ifndef _LINUX_LGUEST_H -#define _LINUX_LGUEST_H - -#ifndef __ASSEMBLY__ -#include -#include -#include - -#define LG_CLOCK_MIN_DELTA 100UL -#define LG_CLOCK_MAX_DELTA ULONG_MAX - -/*G:031 - * The second method of communicating with the Host is via "struct - * lguest_data". Once the Guest's initialization hypercall tells the Host where - * this is, the Guest and Host both publish information in it. -:*/ -struct lguest_data { - /* - * 512 == enabled (same as eflags in normal hardware). The Guest - * changes interrupts so often that a hypercall is too slow. - */ - unsigned int irq_enabled; - /* Fine-grained interrupt disabling by the Guest */ - DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); - - /* - * The Host writes the virtual address of the last page fault here, - * which saves the Guest a hypercall. CR2 is the native register where - * this address would normally be found. - */ - unsigned long cr2; - - /* Wallclock time set by the Host. */ - struct timespec time; - - /* - * Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). - */ - int irq_pending; - - /* - * Async hypercall ring. Instead of directly making hypercalls, we can - * place them in here for processing the next time the Host wants. - * This batching can be quite efficient. - */ - - /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ - u8 hcall_status[LHCALL_RING_SIZE]; - /* The actual registers for the hypercalls. */ - struct hcall_args hcalls[LHCALL_RING_SIZE]; - -/* Fields initialized by the Host at boot: */ - /* Memory not to try to access */ - unsigned long reserve_mem; - /* KHz for the TSC clock. */ - u32 tsc_khz; - -/* Fields initialized by the Guest at boot: */ - /* Instruction to suppress interrupts even if enabled */ - unsigned long noirq_iret; - /* Address above which page tables are all identical. */ - unsigned long kernel_address; - /* The vector to try to use for system calls (0x40 or 0x80). */ - unsigned int syscall_vec; -}; -extern struct lguest_data lguest_data; -#endif /* __ASSEMBLY__ */ -#endif /* _LINUX_LGUEST_H */ diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h deleted file mode 100644 index acd5b12565cc..000000000000 --- a/include/linux/lguest_launcher.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _LINUX_LGUEST_LAUNCHER -#define _LINUX_LGUEST_LAUNCHER -/* Everything the "lguest" userspace program needs to know. */ -#include - -/*D:010 - * Drivers - * - * The Guest needs devices to do anything useful. Since we don't let it touch - * real devices (think of the damage it could do!) we provide virtual devices.
- * We emulate a PCI bus with virtio devices on it; we used to have our own - * lguest bus which was far simpler, but this tests the virtio 1.0 standard. - * - * Virtio devices are also used by kvm, so we can simply reuse their optimized - * device drivers. And one day when everyone uses virtio, my plan will be - * complete. Bwahahahah! - */ - -/* Write command first word is a request. */ -enum lguest_req -{ - LHREQ_INITIALIZE, /* + base, pfnlimit, start */ - LHREQ_GETDMA, /* No longer used */ - LHREQ_IRQ, /* + irq */ - LHREQ_BREAK, /* No longer used */ - LHREQ_EVENTFD, /* No longer used. */ - LHREQ_GETREG, /* + offset within struct pt_regs (then read value). */ - LHREQ_SETREG, /* + offset within struct pt_regs, value. */ - LHREQ_TRAP, /* + trap number to deliver to guest. */ -}; - -/* - * This is what read() of the lguest fd populates. trap == - * LGUEST_TRAP_ENTRY for an LHCALL_NOTIFY (addr is the - * argument), 14 for a page fault in the MMIO region (addr is - * the trap address, insn is the instruction), or 13 for a GPF - * (insn is the instruction). - */ -struct lguest_pending { - __u8 trap; - __u8 insn[7]; - __u32 addr; -}; -#endif /* _LINUX_LGUEST_LAUNCHER */ diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index c07295969b7e..6d5d5faa989b 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -1,7 +1,7 @@ #ifndef _UAPI_LINUX_VIRTIO_RING_H #define _UAPI_LINUX_VIRTIO_RING_H -/* An interface for efficient virtio implementation, currently for use by KVM - * and lguest, but hopefully others soon. Do NOT change this since it will +/* An interface for efficient virtio implementation, currently for use by KVM, + * but hopefully others soon. Do NOT change this since it will * break existing servers and clients. 
* * This header is BSD licensed so anyone can use the definitions to implement diff --git a/tools/Makefile b/tools/Makefile index 221e1ce78b06..a19b176b914b 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -18,7 +18,6 @@ help: @echo ' iio - IIO tools' @echo ' kvm_stat - top-like utility for displaying kvm statistics' @echo ' leds - LEDs tools' - @echo ' lguest - a minimal 32-bit x86 hypervisor' @echo ' liblockdep - user-space wrapper for kernel locking-validator' @echo ' net - misc networking tools' @echo ' perf - Linux performance measurement and analysis tool' @@ -90,7 +89,7 @@ freefall: FORCE kvm_stat: FORCE $(call descend,kvm/$@) -all: acpi cgroup cpupower gpio hv firewire lguest liblockdep \ +all: acpi cgroup cpupower gpio hv firewire liblockdep \ perf selftests turbostat usb \ virtio vm net x86_energy_perf_policy \ tmon freefall objtool kvm_stat @@ -101,7 +100,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install firewire_install gpio_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install objtool_install: +cgroup_install firewire_install gpio_install hv_install perf_install usb_install virtio_install vm_install net_install objtool_install: $(call descend,$(@:_install=),install) liblockdep_install: @@ -123,7 +122,7 @@ kvm_stat_install: $(call descend,kvm/$(@:_install=),install) install: acpi_install cgroup_install cpupower_install gpio_install \ - hv_install firewire_install lguest_install liblockdep_install \ + hv_install firewire_install liblockdep_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install vm_install net_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install @@ -134,7 +133,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean hv_clean firewire_clean lguest_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: +cgroup_clean hv_clean firewire_clean spi_clean usb_clean virtio_clean vm_clean net_clean iio_clean gpio_clean objtool_clean leds_clean: $(call descend,$(@:_clean=),clean) liblockdep_clean: @@ -168,7 +167,7 @@ freefall_clean: build_clean: $(call descend,build,clean) -clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \ +clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean spi_clean usb_clean virtio_clean \ vm_clean net_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean liblockdep_clean \ diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore deleted file mode 100644 index 8d9a8383a52e..000000000000 --- a/tools/lguest/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -lguest -include diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile deleted file mode 100644 index d04599a79802..000000000000 --- a/tools/lguest/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -# This creates the demonstration utility "lguest" which runs a Linux guest. 
-CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE -Iinclude - -all: lguest - -include/linux/virtio_types.h: ../../include/uapi/linux/virtio_types.h - mkdir -p include/linux 2>&1 || true - ln -sf ../../../../include/uapi/linux/virtio_types.h $@ - -lguest: include/linux/virtio_types.h - -clean: - rm -f lguest - rm -rf include diff --git a/tools/lguest/extract b/tools/lguest/extract deleted file mode 100644 index 7730bb6e4b94..000000000000 --- a/tools/lguest/extract +++ /dev/null @@ -1,58 +0,0 @@ -#! /bin/sh - -set -e - -PREFIX=$1 -shift - -trap 'rm -r $TMPDIR' 0 -TMPDIR=`mktemp -d` - -exec 3>/dev/null -for f; do - while IFS=" -" read -r LINE; do - case "$LINE" in - *$PREFIX:[0-9]*:\**) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exists prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3 - ;; - *$PREFIX:[0-9]*) - NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"` - if [ -f $TMPDIR/$NUM ]; then - echo "$TMPDIR/$NUM already exists prior to $f" - exit 1 - fi - exec 3>>$TMPDIR/$NUM - echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM - /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3 - ;; - *:\**) - /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3 - echo >&3 - exec 3>/dev/null - ;; - *) - /bin/echo "$LINE" >&3 - ;; - esac - done < $f - echo >&3 - exec 3>/dev/null -done - -LASTFILE="" -for f in $TMPDIR/*; do - if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then - LASTFILE=$(cat $TMPDIR/.$(basename $f) ) - echo "[ $LASTFILE ]" - fi - cat $f -done - diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c deleted file mode 100644 index 897cd6f3f687..000000000000 --- a/tools/lguest/lguest.c +++ /dev/null @@ -1,3420 +0,0 @@ -/*P:100 - * This is the Launcher code, a simple program which lays out the "physical" - * memory for the new Guest by mapping the kernel image and the virtual - * devices, then opens /dev/lguest to tell the kernel about the Guest and - * control it. -:*/ -#define _LARGEFILE64_SOURCE -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef VIRTIO_F_ANY_LAYOUT -#define VIRTIO_F_ANY_LAYOUT 27 -#endif - -/*L:110 - * We can ignore the 43 include files we need for this program, but I do want - * to draw attention to the use of kernel-style types. - * - * As Linus said, "C is a Spartan language, and so should your naming be." I - * like these abbreviations, so we define them here. Note that u64 is always - * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64.
- */ -typedef unsigned long long u64; -typedef uint32_t u32; -typedef uint16_t u16; -typedef uint8_t u8; -/*:*/ - -#define VIRTIO_CONFIG_NO_LEGACY -#define VIRTIO_PCI_NO_LEGACY -#define VIRTIO_BLK_NO_LEGACY -#define VIRTIO_NET_NO_LEGACY - -/* Use in-kernel ones, which defines VIRTIO_F_VERSION_1 */ -#include "../../include/uapi/linux/virtio_config.h" -#include "../../include/uapi/linux/virtio_net.h" -#include "../../include/uapi/linux/virtio_blk.h" -#include "../../include/uapi/linux/virtio_console.h" -#include "../../include/uapi/linux/virtio_rng.h" -#include -#include "../../include/uapi/linux/virtio_pci.h" -#include -#include "../../include/linux/lguest_launcher.h" - -#define BRIDGE_PFX "bridge:" -#ifndef SIOCBRADDIF -#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ -#endif -/* We can have up to 256 pages for devices. */ -#define DEVICE_PAGES 256 -/* This will occupy 3 pages: it must be a power of 2. */ -#define VIRTQUEUE_NUM 256 - -/*L:120 - * verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. - */ -static bool verbose; -#define verbose(args...) \ - do { if (verbose) printf(args); } while(0) -/*:*/ - -/* The pointer to the start of guest memory. */ -static void *guest_base; -/* The maximum guest physical address allowed, and maximum possible. */ -static unsigned long guest_limit, guest_max, guest_mmio; -/* The /dev/lguest file descriptor. */ -static int lguest_fd; - -/* a per-cpu variable indicating whose vcpu is currently running */ -static unsigned int __thread cpu_id; - -/* 5 bit device number in the PCI_CONFIG_ADDR => 32 only */ -#define MAX_PCI_DEVICES 32 - -/* This is our list of devices. */ -struct device_list { - /* Counter to assign interrupt numbers. */ - unsigned int next_irq; - - /* Counter to print out convenient device numbers. */ - unsigned int device_num; - - /* PCI devices. */ - struct device *pci[MAX_PCI_DEVICES]; -}; - -/* The list of Guest devices, based on command line arguments. */ -static struct device_list devices; - -/* - * Just like struct virtio_pci_cfg_cap in uapi/linux/virtio_pci.h, - * but uses a u32 explicitly for the data. - */ -struct virtio_pci_cfg_cap_u32 { - struct virtio_pci_cap cap; - u32 pci_cfg_data; /* Data for BAR access. */ -}; - -struct virtio_pci_mmio { - struct virtio_pci_common_cfg cfg; - u16 notify; - u8 isr; - u8 padding; - /* Device-specific configuration follows this. */ -}; - -/* This is the layout (little-endian) of the PCI config space. */ -struct pci_config { - u16 vendor_id, device_id; - u16 command, status; - u8 revid, prog_if, subclass, class; - u8 cacheline_size, lat_timer, header_type, bist; - u32 bar[6]; - u32 cardbus_cis_ptr; - u16 subsystem_vendor_id, subsystem_device_id; - u32 expansion_rom_addr; - u8 capabilities, reserved1[3]; - u32 reserved2; - u8 irq_line, irq_pin, min_grant, max_latency; - - /* Now, this is the linked capability list. */ - struct virtio_pci_cap common; - struct virtio_pci_notify_cap notify; - struct virtio_pci_cap isr; - struct virtio_pci_cap device; - struct virtio_pci_cfg_cap_u32 cfg_access; -}; - -/* The device structure describes a single device. */ -struct device { - /* The name of this device, for --verbose. */ - const char *name; - - /* Any queues attached to this device */ - struct virtqueue *vq; - - /* Is it operational */ - bool running; - - /* Has it written FEATURES_OK but not re-checked it? 
*/ - bool wrote_features_ok; - - /* PCI configuration */ - union { - struct pci_config config; - u32 config_words[sizeof(struct pci_config) / sizeof(u32)]; - }; - - /* Features we offer, and those accepted. */ - u64 features, features_accepted; - - /* Device-specific config hangs off the end of this. */ - struct virtio_pci_mmio *mmio; - - /* PCI MMIO resources (all in BAR0) */ - size_t mmio_size; - u32 mmio_addr; - - /* Device-specific data. */ - void *priv; -}; - -/* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue { - struct virtqueue *next; - - /* Which device owns me. */ - struct device *dev; - - /* Name for printing errors. */ - const char *name; - - /* The actual ring of buffers. */ - struct vring vring; - - /* The information about this virtqueue (we only use queue_size on) */ - struct virtio_pci_common_cfg pci_config; - - /* Last available index we saw. */ - u16 last_avail_idx; - - /* How many are used since we sent last irq? */ - unsigned int pending_used; - - /* Eventfd where Guest notifications arrive. */ - int eventfd; - - /* Function for the thread which is servicing this virtqueue. */ - void (*service)(struct virtqueue *vq); - pid_t thread; -}; - -/* Remember the arguments to the program so we can "reboot" */ -static char **main_args; - -/* The original tty settings to restore on exit. */ -static struct termios orig_term; - -/* - * We have to be careful with barriers: our devices are all run in separate - * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. - */ -#define wmb() __asm__ __volatile__("" : : : "memory") -#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") -#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory") - -/* Wrapper for the last available index. Makes it easier to change. */ -#define lg_last_avail(vq) ((vq)->last_avail_idx) - -/* - * The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. - */ -#define cpu_to_le16(v16) (v16) -#define cpu_to_le32(v32) (v32) -#define cpu_to_le64(v64) (v64) -#define le16_to_cpu(v16) (v16) -#define le32_to_cpu(v32) (v32) -#define le64_to_cpu(v64) (v64) - -/* - * A real device would ignore weird/non-compliant driver behaviour. We - * stop and flag it, to help debugging Linux problems. - */ -#define bad_driver(d, fmt, ...) \ - errx(1, "%s: bad driver: " fmt, (d)->name, ## __VA_ARGS__) -#define bad_driver_vq(vq, fmt, ...) \ - errx(1, "%s vq %s: bad driver: " fmt, (vq)->dev->name, \ - vq->name, ## __VA_ARGS__) - -/* Is this iovec empty? */ -static bool iov_empty(const struct iovec iov[], unsigned int num_iov) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) - if (iov[i].iov_len) - return false; - return true; -} - -/* Take len bytes from the front of this iovec. */ -static void iov_consume(struct device *d, - struct iovec iov[], unsigned num_iov, - void *dest, unsigned len) -{ - unsigned int i; - - for (i = 0; i < num_iov; i++) { - unsigned int used; - - used = iov[i].iov_len < len ? iov[i].iov_len : len; - if (dest) { - memcpy(dest, iov[i].iov_base, used); - dest += used; - } - iov[i].iov_base += used; - iov[i].iov_len -= used; - len -= used; - } - if (len != 0) - bad_driver(d, "iovec too short!"); -} - -/*L:100 - * The Launcher code itself takes us out into userspace, that scary place where - * pointers run wild and free! 
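
iov_consume() above peels bytes off the front of a descriptor chain's iovec, spanning element boundaries as needed. A self-contained userspace sketch of the same walk:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Peel len bytes off the front of an iovec array, iov_consume()-style. */
static void consume(struct iovec *iov, unsigned n, void *dst, size_t len)
{
	for (unsigned i = 0; i < n && len; i++) {
		size_t used = iov[i].iov_len < len ? iov[i].iov_len : len;
		if (dst) {
			memcpy(dst, iov[i].iov_base, used);
			dst = (char *)dst + used;
		}
		iov[i].iov_base = (char *)iov[i].iov_base + used;
		iov[i].iov_len -= used;
		len -= used;
	}
}

int main(void)
{
	char a[] = "hello ", b[] = "world";
	struct iovec iov[2] = {
		{ a, sizeof(a) - 1 }, { b, sizeof(b) - 1 },
	};
	char hdr[8] = { 0 };

	consume(iov, 2, hdr, 7); /* takes "hello w" across both elements */
	printf("consumed: %.7s, remaining: %zu bytes\n", hdr, iov[1].iov_len);
	return 0;
}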
Unfortunately, like most userspace programs, - * it's quite boring (which is why everyone likes to hack on the kernel!). - * Perhaps if you make up an Lguest Drinking Game at this point, it will get - * you through this section. Or, maybe not. - * - * The Launcher sets up a big chunk of memory to be the Guest's "physical" - * memory and stores it in "guest_base". In other words, Guest physical == - * Launcher virtual with an offset. - * - * This can be tough to get your head around, but usually it just means that we - * use these trivial conversion functions when the Guest gives us its - * "physical" addresses: - */ -static void *from_guest_phys(unsigned long addr) -{ - return guest_base + addr; -} - -static unsigned long to_guest_phys(const void *addr) -{ - return (addr - guest_base); -} - -/*L:130 - * Loading the Kernel. - * - * We start with a couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: - */ -static int open_or_die(const char *name, int flags) -{ - int fd = open(name, flags); - if (fd < 0) - err(1, "Failed to open %s", name); - return fd; -} - -/* map_zeroed_pages() takes a number of pages. */ -static void *map_zeroed_pages(unsigned int num) -{ - int fd = open_or_die("/dev/zero", O_RDONLY); - void *addr; - - /* - * We use a private mapping (ie. if we write to the page, it will be - * copied). We allocate an extra two pages PROT_NONE to act as guard - * pages against read/write attempts that exceed allocated space. - */ - addr = mmap(NULL, getpagesize() * (num+2), - PROT_NONE, MAP_PRIVATE, fd, 0); - - if (addr == MAP_FAILED) - err(1, "Mmapping %u pages of /dev/zero", num); - - if (mprotect(addr + getpagesize(), getpagesize() * num, - PROT_READ|PROT_WRITE) == -1) - err(1, "mprotect rw %u pages failed", num); - - /* - * One neat mmap feature is that you can close the fd, and it - * stays mapped. - */ - close(fd); - - /* Return address after PROT_NONE page */ - return addr + getpagesize(); -} - -/* Get some bytes which won't be mapped into the guest. */ -static unsigned long get_mmio_region(size_t size) -{ - unsigned long addr = guest_mmio; - size_t i; - - if (!size) - return addr; - - /* Size has to be a power of 2 (and multiple of 16) */ - for (i = 1; i < size; i <<= 1); - - guest_mmio += i; - - return addr; -} - -/* - * This routine is used to load the kernel or initrd. It tries mmap, but if - * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. - */ -static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) -{ - ssize_t r; - - /* - * We map writable even though some segments are marked read-only. - * The kernel really wants to be writable: it patches its own - * instructions. - * - * MAP_PRIVATE means that the page won't be copied until a write is - * done to it. This allows us to share untouched memory between - * Guests. - */ - if (mmap(addr, len, PROT_READ|PROT_WRITE, - MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) - return; - - /* pread does a seek and a read in one shot: saves a few lines. */ - r = pread(fd, addr, len, offset); - if (r != len) - err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); -} - -/* - * This routine takes an open vmlinux image, which is in ELF, and maps it into - * the Guest memory. ELF = Executable and Linkable Format, which is the format - * used by all modern binaries on Linux including the kernel. - * - * The ELF headers give *two* addresses: a physical address, and a virtual - * address.
We use the physical address; the Guest will map itself to the - * virtual address. - * - * We return the starting address. - */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) -{ - Elf32_Phdr phdr[ehdr->e_phnum]; - unsigned int i; - - /* - * Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. - */ - if (ehdr->e_type != ET_EXEC - || ehdr->e_machine != EM_386 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) - errx(1, "Malformed elf header"); - - /* - * An ELF executable contains an ELF header and a number of "program" - * headers which indicate which parts ("segments") of the program to - * load where. - */ - - /* We read in all the program headers at once: */ - if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) - err(1, "Seeking to program headers"); - if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) - err(1, "Reading program headers"); - - /* - * Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. - */ - for (i = 0; i < ehdr->e_phnum; i++) { - /* If this isn't a loadable segment, we ignore it */ - if (phdr[i].p_type != PT_LOAD) - continue; - - verbose("Section %i: size %i addr %p\n", - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); - - /* We map this section of the file at its physical address. */ - map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), - phdr[i].p_offset, phdr[i].p_filesz); - } - - /* The entry point is given in the ELF header. */ - return ehdr->e_entry; -} - -/*L:150 - * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed - * to jump into it and it will unpack itself. We used to have to perform some - * hairy magic because the unpacking code scared me. - * - * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote - * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! - */ -static unsigned long load_bzimage(int fd) -{ - struct boot_params boot; - int r; - /* Modern bzImages get loaded at 1M. */ - void *p = from_guest_phys(0x100000); - - /* - * Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/boot.txt) - */ - lseek(fd, 0, SEEK_SET); - read(fd, &boot, sizeof(boot)); - - /* Inside the setup_hdr, we expect the magic "HdrS" */ - if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) - errx(1, "This doesn't look like a bzImage to me"); - - /* Skip over the extra sectors of the header. */ - lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); - - /* Now read everything into memory. in nice big chunks. */ - while ((r = read(fd, p, 65536)) > 0) - p += r; - - /* Finally, code32_start tells us where to enter the kernel. */ - return boot.hdr.code32_start; -} - -/*L:140 - * Loading the kernel is easy when it's a "vmlinux", but most kernels - * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. - */ -static unsigned long load_kernel(int fd) -{ - Elf32_Ehdr hdr; - - /* Read in the first few bytes. */ - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) - err(1, "Reading kernel"); - - /* If it's an ELF file, it starts with "\177ELF" */ - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr); - - /* Otherwise we assume it's a bzImage, and try to load it. 
*/ - return load_bzimage(fd); -} - -/* - * This is a trivial little helper to align pages. Andi Kleen hated it because - * it calls getpagesize() twice: "it's dumb code." - * - * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. - */ -static inline unsigned long page_align(unsigned long addr) -{ - /* Add upwards and truncate downwards. */ - return ((addr + getpagesize()-1) & ~(getpagesize()-1)); -} - -/*L:180 - * An "initial ram disk" is a disk image loaded into memory along with the - * kernel which the kernel can use to boot from without needing any drivers. - * Most distributions now use this as standard: the initrd contains the code to - * load the appropriate driver modules for the current machine. - * - * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). - */ -static unsigned long load_initrd(const char *name, unsigned long mem) -{ - int ifd; - struct stat st; - unsigned long len; - - ifd = open_or_die(name, O_RDONLY); - /* fstat() is needed to get the file size. */ - if (fstat(ifd, &st) < 0) - err(1, "fstat() on initrd '%s'", name); - - /* - * We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. - */ - len = page_align(st.st_size); - map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* - * Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. - */ - close(ifd); - verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); - - /* We return the initrd size. */ - return len; -} -/*:*/ - -/* - * Simple routine to roll all the commandline arguments together with spaces - * between them. - */ -static void concat(char *dst, char *args[]) -{ - unsigned int i, len = 0; - - for (i = 0; args[i]; i++) { - if (i) { - strcat(dst+len, " "); - len++; - } - strcpy(dst+len, args[i]); - len += strlen(args[i]); - } - /* In case it's empty. */ - dst[len] = '\0'; -} - -/*L:185 - * This is where we actually tell the kernel to initialize the Guest. We - * saw the arguments it expects when we looked at initialize() in lguest_user.c: - * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. - */ -static void tell_kernel(unsigned long start) -{ - unsigned long args[] = { LHREQ_INITIALIZE, - (unsigned long)guest_base, - guest_limit / getpagesize(), start, - (guest_mmio+getpagesize()-1) / getpagesize() }; - verbose("Guest: %p - %p (%#lx, MMIO %#lx)\n", - guest_base, guest_base + guest_limit, - guest_limit, guest_mmio); - lguest_fd = open_or_die("/dev/lguest", O_RDWR); - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Writing to /dev/lguest"); -} -/*:*/ - -/*L:200 - * Device Handling. - * - * When the Guest gives us a buffer, it sends an array of addresses and sizes. - * We need to make sure it's not trying to reach into the Launcher itself, so - * we have a convenient routine which checks it and exits with an error message - * if something funny is going on: - */ -static void *_check_pointer(struct device *d, - unsigned long addr, unsigned int size, - unsigned int line) -{ - /* - * Check if the requested address and size exceeds the allocated memory, - * or addr + size wraps around. 
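-         * (A worked example, assuming a 32-bit Launcher: addr 0xFFFFF000
-         * with size 0x2000 gives addr + size == 0x1000, which is less than
-         * addr, so the wraparound half of the test catches it even when the
-         * sum sneaks back under guest_limit.)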
- */ - if ((addr + size) > guest_limit || (addr + size) < addr) - bad_driver(d, "%s:%i: Invalid address %#lx", - __FILE__, line, addr); - /* - * We return a pointer for the caller's convenience, now we know it's - * safe to use. - */ - return from_guest_phys(addr); -} -/* A macro which transparently hands the line number to the real function. */ -#define check_pointer(d,addr,size) _check_pointer(d, addr, size, __LINE__) - -/* - * Each buffer in the virtqueues is actually a chain of descriptors. This - * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. - */ -static unsigned next_desc(struct device *d, struct vring_desc *desc, - unsigned int i, unsigned int max) -{ - unsigned int next; - - /* If this descriptor says it doesn't chain, we're done. */ - if (!(desc[i].flags & VRING_DESC_F_NEXT)) - return max; - - /* Check they're not leading us off end of descriptors. */ - next = desc[i].next; - /* Make sure compiler knows to grab that: we don't want it changing! */ - wmb(); - - if (next >= max) - bad_driver(d, "Desc next is %u", next); - - return next; -} - -/* - * This actually sends the interrupt for this virtqueue, if we've used a - * buffer. - */ -static void trigger_irq(struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->dev->config.irq_line }; - - /* Don't inform them if nothing used. */ - if (!vq->pending_used) - return; - vq->pending_used = 0; - - /* - * 2.4.7.1: - * - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: - * The driver MUST set flags to 0 or 1. - */ - if (vq->vring.avail->flags > 1) - bad_driver_vq(vq, "avail->flags = %u\n", vq->vring.avail->flags); - - /* - * 2.4.7.2: - * - * If the VIRTIO_F_EVENT_IDX feature bit is not negotiated: - * - * - The device MUST ignore the used_event value. - * - After the device writes a descriptor index into the used ring: - * - If flags is 1, the device SHOULD NOT send an interrupt. - * - If flags is 0, the device MUST send an interrupt. - */ - if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { - return; - } - - /* - * 4.1.4.5.1: - * - * If MSI-X capability is disabled, the device MUST set the Queue - * Interrupt bit in ISR status before sending a virtqueue notification - * to the driver. - */ - vq->dev->mmio->isr = 0x1; - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(lguest_fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->dev->config.irq_line); -} - -/* - * This looks in the virtqueue for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. - * - * This function waits if necessary, and returns the descriptor number found. - */ -static unsigned wait_for_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) -{ - unsigned int i, head, max; - struct vring_desc *desc; - u16 last_avail = lg_last_avail(vq); - - /* - * 2.4.7.1: - * - * The driver MUST handle spurious interrupts from the device. - * - * That's why this is a while loop. - */ - - /* There's nothing available? */ - while (last_avail == vq->vring.avail->idx) { - u64 event; - - /* - * Since we're about to sleep, now is a good time to tell the - * Guest about what we've used up to now. - */ - trigger_irq(vq); - - /* OK, now we need to know about added descriptors. 
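-                 * Clearing VRING_USED_F_NO_NOTIFY asks the Guest to kick our
-                 * eventfd when it adds a buffer; the mb()-then-recheck just
-                 * below closes the race where it added one at the very moment
-                 * we asked.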
*/
-                vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
-
-                /*
-                 * They could have slipped one in as we were doing that: make
-                 * sure it's written, then check again.
-                 */
-                mb();
-                if (last_avail != vq->vring.avail->idx) {
-                        vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-                        break;
-                }
-
-                /* Nothing new?  Wait for eventfd to tell us they refilled. */
-                if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
-                        errx(1, "Event read failed?");
-
-                /* We don't need to be notified again. */
-                vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
-        }
-
-        /* Check it isn't doing very strange things with descriptor numbers. */
-        if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
-                bad_driver_vq(vq, "Guest moved used index from %u to %u",
-                              last_avail, vq->vring.avail->idx);
-
-        /*
-         * Make sure we read the descriptor number *after* we read the ring
-         * update; don't let the cpu or compiler change the order.
-         */
-        rmb();
-
-        /*
-         * Grab the next descriptor number they're advertising, and increment
-         * the index we've seen.
-         */
-        head = vq->vring.avail->ring[last_avail % vq->vring.num];
-        lg_last_avail(vq)++;
-
-        /* If their number is silly, that's a fatal mistake. */
-        if (head >= vq->vring.num)
-                bad_driver_vq(vq, "Guest says index %u is available", head);
-
-        /* When we start there are none of either input nor output. */
-        *out_num = *in_num = 0;
-
-        max = vq->vring.num;
-        desc = vq->vring.desc;
-        i = head;
-
-        /*
-         * We have to read the descriptor after we read the descriptor number,
-         * but there's a data dependency there so the CPU shouldn't reorder
-         * that: no rmb() required.
-         */
-
-        do {
-                /*
-                 * If this is an indirect entry, then this buffer contains a
-                 * descriptor table which we handle as if it's any normal
-                 * descriptor chain.
-                 */
-                if (desc[i].flags & VRING_DESC_F_INDIRECT) {
-                        /* 2.4.5.3.1:
-                         *
-                         *  The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-                         *  flag unless the VIRTIO_F_INDIRECT_DESC feature was
-                         *  negotiated.
-                         */
-                        if (!(vq->dev->features_accepted &
-                              (1<<VIRTIO_RING_F_INDIRECT_DESC)))
-                                bad_driver_vq(vq, "vq indirect not negotiated");
-
-                        /*
-                         * 2.4.5.3.1:
-                         *
-                         *   The driver MUST NOT set the VIRTQ_DESC_F_INDIRECT
-                         *   flag within an indirect descriptor (ie. only one
-                         *   table per descriptor).
-                         */
-                        if (desc != vq->vring.desc)
-                                bad_driver_vq(vq, "Indirect within indirect");
-
-                        /*
-                         * Proposed update VIRTIO-134 spells this out:
-                         *
-                         *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
-                         *   and VIRTQ_DESC_F_NEXT in flags.
-                         */
-                        if (desc[i].flags & VRING_DESC_F_NEXT)
-                                bad_driver_vq(vq, "indirect and next together");
-
-                        if (desc[i].len % sizeof(struct vring_desc))
-                                bad_driver_vq(vq,
-                                              "Invalid size for indirect table");
-                        /*
-                         * 2.4.5.3.2:
-                         *
-                         *   The device MUST ignore the write-only flag
-                         *   (flags&VIRTQ_DESC_F_WRITE) in the descriptor that
-                         *   refers to an indirect table.
-                         *
-                         * We ignore it here: :)
-                         */
-
-                        max = desc[i].len / sizeof(struct vring_desc);
-                        desc = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-                        i = 0;
-
-                        /* 2.4.5.3.1:
-                         *
-                         *   A driver MUST NOT create a descriptor chain longer
-                         *   than the Queue Size of the device.
-                         */
-                        if (max > vq->pci_config.queue_size)
-                                bad_driver_vq(vq,
-                                              "indirect has too many entries");
-                }
-
-                /* Grab the first descriptor, and check it's OK. */
-                iov[*out_num + *in_num].iov_len = desc[i].len;
-                iov[*out_num + *in_num].iov_base
-                        = check_pointer(vq->dev, desc[i].addr, desc[i].len);
-                /* If this is an input descriptor, increment that count. */
-                if (desc[i].flags & VRING_DESC_F_WRITE)
-                        (*in_num)++;
-                else {
-                        /*
-                         * If it's an output descriptor, they're all supposed
-                         * to come before any input descriptors.
-                         */
-                        if (*in_num)
-                                bad_driver_vq(vq,
-                                              "Descriptor has out after in");
-                        (*out_num)++;
-                }
-
-                /* If we've got too many, that implies a descriptor loop.
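-                 * (With vring.num descriptors, a chain whose next fields form
-                 * a cycle would otherwise spin forever; counting out_num +
-                 * in_num against max is what breaks the cycle.)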
*/ - if (*out_num + *in_num > max) - bad_driver_vq(vq, "Looped descriptor"); - } while ((i = next_desc(vq->dev, desc, i, max)) != max); - - return head; -} - -/* - * After we've used one of their buffers, we tell the Guest about it. Sometime - * later we'll want to send them an interrupt using trigger_irq(); note that - * wait_for_vq_desc() does that for us if it has to wait. - */ -static void add_used(struct virtqueue *vq, unsigned int head, int len) -{ - struct vring_used_elem *used; - - /* - * The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. - */ - used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; - used->id = head; - used->len = len; - /* Make sure buffer is written before we update index. */ - wmb(); - vq->vring.used->idx++; - vq->pending_used++; -} - -/* And here's the combo meal deal. Supersize me! */ -static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) -{ - add_used(vq, head, len); - trigger_irq(vq); -} - -/* - * The Console - * - * We associate some data with the console for our exit hack. - */ -struct console_abort { - /* How many times have they hit ^C? */ - int count; - /* When did they start? */ - struct timeval start; -}; - -/* This is the routine which handles console input (ie. stdin). */ -static void console_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num; - struct console_abort *abort = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* Make sure there's a descriptor available. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - bad_driver_vq(vq, "Output buffers in console in queue?"); - - /* Read into it. This is where we usually wait. */ - len = readv(STDIN_FILENO, iov, in_num); - if (len <= 0) { - /* Ran out of input? */ - warnx("Failed to get console input, ignoring console."); - /* - * For simplicity, dying threads kill the whole Launcher. So - * just nap here. - */ - for (;;) - pause(); - } - - /* Tell the Guest we used a buffer. */ - add_used_and_trigger(vq, head, len); - - /* - * Three ^C within one second? Exit. - * - * This is such a hack, but works surprisingly well. Each ^C has to - * be in a buffer by itself, so they can't be too fast. But we check - * that we get three within about a second, so they can't be too - * slow. - */ - if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { - abort->count = 0; - return; - } - - abort->count++; - if (abort->count == 1) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - /* Kill all Launcher processes with SIGINT, like normal ^C */ - if (now.tv_sec <= abort->start.tv_sec+1) - kill(0, SIGINT); - abort->count = 0; - } -} - -/* This is the routine which handles console output (ie. stdout). */ -static void console_output(struct virtqueue *vq) -{ - unsigned int head, out, in; - struct iovec iov[vq->vring.num]; - - /* We usually wait in here, for the Guest to give us something. */ - head = wait_for_vq_desc(vq, iov, &out, &in); - if (in) - bad_driver_vq(vq, "Input buffers in console output queue?"); - - /* writev can return a partial write, so we loop here. */ - while (!iov_empty(iov, out)) { - int len = writev(STDOUT_FILENO, iov, out); - if (len <= 0) { - warn("Write to stdout gave %i (%d)", len, errno); - break; - } - iov_consume(vq->dev, iov, out, NULL, len); - } - - /* - * We're finished with that buffer: if we're going to sleep, - * wait_for_vq_desc() will prod the Guest with an interrupt. 
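-         * (The used length is 0 because output buffers are read-only from
-         * our side; a non-zero len only makes sense for buffers the device
-         * wrote, as in console_input() and net_input().)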
- */
-        add_used(vq, head, 0);
-}
-
-/*
- * The Network
- *
- * Handling output for network is also simple: we get all the output buffers
- * and write them to /dev/net/tun.
- */
-struct net_info {
-        int tunfd;
-};
-
-static void net_output(struct virtqueue *vq)
-{
-        struct net_info *net_info = vq->dev->priv;
-        unsigned int head, out, in;
-        struct iovec iov[vq->vring.num];
-
-        /* We usually wait in here for the Guest to give us a packet. */
-        head = wait_for_vq_desc(vq, iov, &out, &in);
-        if (in)
-                bad_driver_vq(vq, "Input buffers in net output queue?");
-        /*
-         * Send the whole thing through to /dev/net/tun.  It expects the exact
-         * same format: what a coincidence!
-         */
-        if (writev(net_info->tunfd, iov, out) < 0)
-                warnx("Write to tun failed (%d)?", errno);
-
-        /*
-         * Done with that one; wait_for_vq_desc() will send the interrupt if
-         * all packets are processed.
-         */
-        add_used(vq, head, 0);
-}
-
-/*
- * Handling network input is a bit trickier, because I've tried to optimize it.
- *
- * First we have a helper routine which tells us if a read from this file
- * descriptor (ie. the /dev/net/tun device) will block:
- */
-static bool will_block(int fd)
-{
-        fd_set fdset;
-        struct timeval zero = { 0, 0 };
-        FD_ZERO(&fdset);
-        FD_SET(fd, &fdset);
-        return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
-}
-
-/*
- * This handles packets coming in from the tun device to our Guest.  Like all
- * service routines, it gets called again as soon as it returns, so you don't
- * see a while(1) loop here.
- */
-static void net_input(struct virtqueue *vq)
-{
-        int len;
-        unsigned int head, out, in;
-        struct iovec iov[vq->vring.num];
-        struct net_info *net_info = vq->dev->priv;
-
-        /*
-         * Get a descriptor to write an incoming packet into.  This will also
-         * send an interrupt if they're out of descriptors.
-         */
-        head = wait_for_vq_desc(vq, iov, &out, &in);
-        if (out)
-                bad_driver_vq(vq, "Output buffers in net input queue?");
-
-        /*
-         * If it looks like we'll block reading from the tun device, send them
-         * an interrupt.
-         */
-        if (vq->pending_used && will_block(net_info->tunfd))
-                trigger_irq(vq);
-
-        /*
-         * Read in the packet.  This is where we normally wait (when there's no
-         * incoming network traffic).
-         */
-        len = readv(net_info->tunfd, iov, in);
-        if (len <= 0)
-                warn("Failed to read from tun (%d).", errno);
-
-        /*
-         * Mark that packet buffer as used, but don't interrupt here.  We want
-         * to wait until we've done as much work as we can.
-         */
-        add_used(vq, head, len);
-}
-/*:*/
-
-/* This is the helper to create threads: run the service routine in a loop. */
-static int do_thread(void *_vq)
-{
-        struct virtqueue *vq = _vq;
-
-        for (;;)
-                vq->service(vq);
-        return 0;
-}
-
-/*
- * When a child dies, we kill our entire process group with SIGTERM.  This
- * also has the side effect that the shell restores the console for us!
- */
-static void kill_launcher(int signal)
-{
-        kill(0, SIGTERM);
-}
-
-static void reset_vq_pci_config(struct virtqueue *vq)
-{
-        vq->pci_config.queue_size = VIRTQUEUE_NUM;
-        vq->pci_config.queue_enable = 0;
-}
-
-static void reset_device(struct device *dev)
-{
-        struct virtqueue *vq;
-
-        verbose("Resetting device %s\n", dev->name);
-
-        /* Clear any features they've acked. */
-        dev->features_accepted = 0;
-
-        /* We're going to be explicitly killing threads, so ignore them. */
-        signal(SIGCHLD, SIG_IGN);
-
-        /*
-         * 4.1.4.3.1:
-         *
-         *   The device MUST present a 0 in queue_enable on reset.
-         *
-         * This means we set it here, and reset the saved ones in every vq.
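-         * (We get here both from cleanup_devices() at exit and from the
-         * Guest writing 0 to device_status, so this is the one place a
-         * device returns to its post-reset state.)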
- */ - dev->mmio->cfg.queue_enable = 0; - - /* Get rid of the virtqueue threads */ - for (vq = dev->vq; vq; vq = vq->next) { - vq->last_avail_idx = 0; - reset_vq_pci_config(vq); - if (vq->thread != (pid_t)-1) { - kill(vq->thread, SIGTERM); - waitpid(vq->thread, NULL, 0); - vq->thread = (pid_t)-1; - } - } - dev->running = false; - dev->wrote_features_ok = false; - - /* Now we care if threads die. */ - signal(SIGCHLD, (void *)kill_launcher); -} - -static void cleanup_devices(void) -{ - unsigned int i; - - for (i = 1; i < MAX_PCI_DEVICES; i++) { - struct device *d = devices.pci[i]; - if (!d) - continue; - reset_device(d); - } - - /* If we saved off the original terminal settings, restore them now. */ - if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} - -/*L:217 - * We do PCI. This is mainly done to let us test the kernel virtio PCI - * code. - */ - -/* Linux expects a PCI host bridge: ours is a dummy, and first on the bus. */ -static struct device pci_host_bridge; - -static void init_pci_host_bridge(void) -{ - pci_host_bridge.name = "PCI Host Bridge"; - pci_host_bridge.config.class = 0x06; /* bridge */ - pci_host_bridge.config.subclass = 0; /* host bridge */ - devices.pci[0] = &pci_host_bridge; -} - -/* The IO ports used to read the PCI config space. */ -#define PCI_CONFIG_ADDR 0xCF8 -#define PCI_CONFIG_DATA 0xCFC - -/* - * Not really portable, but does help readability: this is what the Guest - * writes to the PCI_CONFIG_ADDR IO port. - */ -union pci_config_addr { - struct { - unsigned mbz: 2; - unsigned offset: 6; - unsigned funcnum: 3; - unsigned devnum: 5; - unsigned busnum: 8; - unsigned reserved: 7; - unsigned enabled : 1; - } bits; - u32 val; -}; - -/* - * We cache what they wrote to the address port, so we know what they're - * talking about when they access the data port. - */ -static union pci_config_addr pci_config_addr; - -static struct device *find_pci_device(unsigned int index) -{ - return devices.pci[index]; -} - -/* PCI can do 1, 2 and 4 byte reads; we handle that here. */ -static void ioread(u16 off, u32 v, u32 mask, u32 *val) -{ - assert(off < 4); - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); - *val = (v >> (off * 8)) & mask; -} - -/* PCI can do 1, 2 and 4 byte writes; we handle that here. */ -static void iowrite(u16 off, u32 v, u32 mask, u32 *dst) -{ - assert(off < 4); - assert(mask == 0xFF || mask == 0xFFFF || mask == 0xFFFFFFFF); - *dst &= ~(mask << (off * 8)); - *dst |= (v & mask) << (off * 8); -} - -/* - * Where PCI_CONFIG_DATA accesses depends on the previous write to - * PCI_CONFIG_ADDR. - */ -static struct device *dev_and_reg(u32 *reg) -{ - if (!pci_config_addr.bits.enabled) - return NULL; - - if (pci_config_addr.bits.funcnum != 0) - return NULL; - - if (pci_config_addr.bits.busnum != 0) - return NULL; - - if (pci_config_addr.bits.offset * 4 >= sizeof(struct pci_config)) - return NULL; - - *reg = pci_config_addr.bits.offset; - return find_pci_device(pci_config_addr.bits.devnum); -} - -/* - * We can get invalid combinations of values while they're writing, so we - * only fault if they try to write with some invalid bar/offset/length. - */ -static bool valid_bar_access(struct device *d, - struct virtio_pci_cfg_cap_u32 *cfg_access) -{ - /* We only have 1 bar (BAR0) */ - if (cfg_access->cap.bar != 0) - return false; - - /* Check it's within BAR0. 
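-         * For example, with mmio_size 0x200 an access at offset 0x1FE with
-         * length 4 runs past the end, so we need both halves of the test:
-         * offset in range, and offset + length in range.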
*/ - if (cfg_access->cap.offset >= d->mmio_size - || cfg_access->cap.offset + cfg_access->cap.length > d->mmio_size) - return false; - - /* Check length is 1, 2 or 4. */ - if (cfg_access->cap.length != 1 - && cfg_access->cap.length != 2 - && cfg_access->cap.length != 4) - return false; - - /* - * 4.1.4.7.2: - * - * The driver MUST NOT write a cap.offset which is not a multiple of - * cap.length (ie. all accesses MUST be aligned). - */ - if (cfg_access->cap.offset % cfg_access->cap.length != 0) - return false; - - /* Return pointer into word in BAR0. */ - return true; -} - -/* Is this accessing the PCI config address port?. */ -static bool is_pci_addr_port(u16 port) -{ - return port >= PCI_CONFIG_ADDR && port < PCI_CONFIG_ADDR + 4; -} - -static bool pci_addr_iowrite(u16 port, u32 mask, u32 val) -{ - iowrite(port - PCI_CONFIG_ADDR, val, mask, - &pci_config_addr.val); - verbose("PCI%s: %#x/%x: bus %u dev %u func %u reg %u\n", - pci_config_addr.bits.enabled ? "" : " DISABLED", - val, mask, - pci_config_addr.bits.busnum, - pci_config_addr.bits.devnum, - pci_config_addr.bits.funcnum, - pci_config_addr.bits.offset); - return true; -} - -static void pci_addr_ioread(u16 port, u32 mask, u32 *val) -{ - ioread(port - PCI_CONFIG_ADDR, pci_config_addr.val, mask, val); -} - -/* Is this accessing the PCI config data port?. */ -static bool is_pci_data_port(u16 port) -{ - return port >= PCI_CONFIG_DATA && port < PCI_CONFIG_DATA + 4; -} - -static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask); - -static bool pci_data_iowrite(u16 port, u32 mask, u32 val) -{ - u32 reg, portoff; - struct device *d = dev_and_reg(®); - - /* Complain if they don't belong to a device. */ - if (!d) - return false; - - /* They can do 1 byte writes, etc. */ - portoff = port - PCI_CONFIG_DATA; - - /* - * PCI uses a weird way to determine the BAR size: the OS - * writes all 1's, and sees which ones stick. - */ - if (&d->config_words[reg] == &d->config.bar[0]) { - int i; - - iowrite(portoff, val, mask, &d->config.bar[0]); - for (i = 0; (1 << i) < d->mmio_size; i++) - d->config.bar[0] &= ~(1 << i); - return true; - } else if ((&d->config_words[reg] > &d->config.bar[0] - && &d->config_words[reg] <= &d->config.bar[6]) - || &d->config_words[reg] == &d->config.expansion_rom_addr) { - /* Allow writing to any other BAR, or expansion ROM */ - iowrite(portoff, val, mask, &d->config_words[reg]); - return true; - /* We let them override latency timer and cacheline size */ - } else if (&d->config_words[reg] == (void *)&d->config.cacheline_size) { - /* Only let them change the first two fields. */ - if (mask == 0xFFFFFFFF) - mask = 0xFFFF; - iowrite(portoff, val, mask, &d->config_words[reg]); - return true; - } else if (&d->config_words[reg] == (void *)&d->config.command - && mask == 0xFFFF) { - /* Ignore command writes. */ - return true; - } else if (&d->config_words[reg] - == (void *)&d->config.cfg_access.cap.bar - || &d->config_words[reg] - == &d->config.cfg_access.cap.length - || &d->config_words[reg] - == &d->config.cfg_access.cap.offset) { - - /* - * The VIRTIO_PCI_CAP_PCI_CFG capability - * provides a backdoor to access the MMIO - * regions without mapping them. Weird, but - * useful. 
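-                 * A driver drives it by writing cap.bar, cap.offset and
-                 * cap.length to pick a window, then reading or writing
-                 * pci_cfg_data; we turn that into an emulated MMIO access
-                 * below.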
- */ - iowrite(portoff, val, mask, &d->config_words[reg]); - return true; - } else if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { - u32 write_mask; - - /* - * 4.1.4.7.1: - * - * Upon detecting driver write access to pci_cfg_data, the - * device MUST execute a write access at offset cap.offset at - * BAR selected by cap.bar using the first cap.length bytes - * from pci_cfg_data. - */ - - /* Must be bar 0 */ - if (!valid_bar_access(d, &d->config.cfg_access)) - return false; - - iowrite(portoff, val, mask, &d->config.cfg_access.pci_cfg_data); - - /* - * Now emulate a write. The mask we use is set by - * len, *not* this write! - */ - write_mask = (1ULL<<(8*d->config.cfg_access.cap.length)) - 1; - verbose("Window writing %#x/%#x to bar %u, offset %u len %u\n", - d->config.cfg_access.pci_cfg_data, write_mask, - d->config.cfg_access.cap.bar, - d->config.cfg_access.cap.offset, - d->config.cfg_access.cap.length); - - emulate_mmio_write(d, d->config.cfg_access.cap.offset, - d->config.cfg_access.pci_cfg_data, - write_mask); - return true; - } - - /* - * 4.1.4.1: - * - * The driver MUST NOT write into any field of the capability - * structure, with the exception of those with cap_type - * VIRTIO_PCI_CAP_PCI_CFG... - */ - return false; -} - -static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask); - -static void pci_data_ioread(u16 port, u32 mask, u32 *val) -{ - u32 reg; - struct device *d = dev_and_reg(®); - - if (!d) - return; - - /* Read through the PCI MMIO access window is special */ - if (&d->config_words[reg] == &d->config.cfg_access.pci_cfg_data) { - u32 read_mask; - - /* - * 4.1.4.7.1: - * - * Upon detecting driver read access to pci_cfg_data, the - * device MUST execute a read access of length cap.length at - * offset cap.offset at BAR selected by cap.bar and store the - * first cap.length bytes in pci_cfg_data. - */ - /* Must be bar 0 */ - if (!valid_bar_access(d, &d->config.cfg_access)) - bad_driver(d, - "Invalid cfg_access to bar%u, offset %u len %u", - d->config.cfg_access.cap.bar, - d->config.cfg_access.cap.offset, - d->config.cfg_access.cap.length); - - /* - * Read into the window. The mask we use is set by - * len, *not* this read! - */ - read_mask = (1ULL<<(8*d->config.cfg_access.cap.length))-1; - d->config.cfg_access.pci_cfg_data - = emulate_mmio_read(d, - d->config.cfg_access.cap.offset, - read_mask); - verbose("Window read %#x/%#x from bar %u, offset %u len %u\n", - d->config.cfg_access.pci_cfg_data, read_mask, - d->config.cfg_access.cap.bar, - d->config.cfg_access.cap.offset, - d->config.cfg_access.cap.length); - } - ioread(port - PCI_CONFIG_DATA, d->config_words[reg], mask, val); -} - -/*L:216 - * This is where we emulate a handful of Guest instructions. It's ugly - * and we used to do it in the kernel but it grew over time. - */ - -/* - * We use the ptrace syscall's pt_regs struct to talk about registers - * to lguest: these macros convert the names to the offsets. 
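- * eg. getreg(eip) expands to
- * getreg_off(offsetof(struct user_regs_struct, eip)), which fetches the
- * register via LHREQ_GETREG on /dev/lguest.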
- */ -#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name)) -#define setreg(name, val) \ - setreg_off(offsetof(struct user_regs_struct, name), (val)) - -static u32 getreg_off(size_t offset) -{ - u32 r; - unsigned long args[] = { LHREQ_GETREG, offset }; - - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) - err(1, "Getting register %u", offset); - if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r)) - err(1, "Reading register %u", offset); - - return r; -} - -static void setreg_off(size_t offset, u32 val) -{ - unsigned long args[] = { LHREQ_SETREG, offset, val }; - - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) - err(1, "Setting register %u", offset); -} - -/* Get register by instruction encoding */ -static u32 getreg_num(unsigned regnum, u32 mask) -{ - /* 8 bit ops use regnums 4-7 for high parts of word */ - if (mask == 0xFF && (regnum & 0x4)) - return getreg_num(regnum & 0x3, 0xFFFF) >> 8; - - switch (regnum) { - case 0: return getreg(eax) & mask; - case 1: return getreg(ecx) & mask; - case 2: return getreg(edx) & mask; - case 3: return getreg(ebx) & mask; - case 4: return getreg(esp) & mask; - case 5: return getreg(ebp) & mask; - case 6: return getreg(esi) & mask; - case 7: return getreg(edi) & mask; - } - abort(); -} - -/* Set register by instruction encoding */ -static void setreg_num(unsigned regnum, u32 val, u32 mask) -{ - /* Don't try to set bits out of range */ - assert(~(val & ~mask)); - - /* 8 bit ops use regnums 4-7 for high parts of word */ - if (mask == 0xFF && (regnum & 0x4)) { - /* Construct the 16 bits we want. */ - val = (val << 8) | getreg_num(regnum & 0x3, 0xFF); - setreg_num(regnum & 0x3, val, 0xFFFF); - return; - } - - switch (regnum) { - case 0: setreg(eax, val | (getreg(eax) & ~mask)); return; - case 1: setreg(ecx, val | (getreg(ecx) & ~mask)); return; - case 2: setreg(edx, val | (getreg(edx) & ~mask)); return; - case 3: setreg(ebx, val | (getreg(ebx) & ~mask)); return; - case 4: setreg(esp, val | (getreg(esp) & ~mask)); return; - case 5: setreg(ebp, val | (getreg(ebp) & ~mask)); return; - case 6: setreg(esi, val | (getreg(esi) & ~mask)); return; - case 7: setreg(edi, val | (getreg(edi) & ~mask)); return; - } - abort(); -} - -/* Get bytes of displacement appended to instruction, from r/m encoding */ -static u32 insn_displacement_len(u8 mod_reg_rm) -{ - /* Switch on the mod bits */ - switch (mod_reg_rm >> 6) { - case 0: - /* If mod == 0, and r/m == 101, 16-bit displacement follows */ - if ((mod_reg_rm & 0x7) == 0x5) - return 2; - /* Normally, mod == 0 means no literal displacement */ - return 0; - case 1: - /* One byte displacement */ - return 1; - case 2: - /* Four byte displacement */ - return 4; - case 3: - /* Register mode */ - return 0; - } - abort(); -} - -static void emulate_insn(const u8 insn[]) -{ - unsigned long args[] = { LHREQ_TRAP, 13 }; - unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access; - unsigned int eax, port, mask; - /* - * Default is to return all-ones on IO port reads, which traditionally - * means "there's nothing there". - */ - u32 val = 0xFFFFFFFF; - - /* - * This must be the Guest kernel trying to do something, not userspace! - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((getreg(xcs) & 3) != 0x1) - goto no_emulate; - - /* Decoding x86 instructions is icky. */ - - /* - * Around 2.6.33, the kernel started using an emulation for the - * cmpxchg8b instruction in early boot on many configurations. 
This
- * code isn't paravirtualized, and it tries to disable interrupts.
- * Ignore it, which will Mostly Work.
- */
-        if (insn[insnlen] == 0xfa) {
-                /* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-                insnlen = 1;
-                goto skip_insn;
-        }
-
-        /*
-         * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-         */
-        if (insn[insnlen] == 0x66) {
-                small_operand = 1;
-                /* The instruction is 1 byte so far, read the next byte. */
-                insnlen = 1;
-        }
-
-        /* If the lower bit isn't set, it's a single byte access */
-        byte_access = !(insn[insnlen] & 1);
-
-        /*
-         * Now we can ignore the lower bit and decode the 4 opcodes
-         * we need to emulate.
-         */
-        switch (insn[insnlen] & 0xFE) {
-        case 0xE4: /* in     <next byte>,%al */
-                port = insn[insnlen+1];
-                insnlen += 2;
-                in = 1;
-                break;
-        case 0xEC: /* in     (%dx),%al */
-                port = getreg(edx) & 0xFFFF;
-                insnlen += 1;
-                in = 1;
-                break;
-        case 0xE6: /* out    %al,<next byte> */
-                port = insn[insnlen+1];
-                insnlen += 2;
-                break;
-        case 0xEE: /* out    %al,(%dx) */
-                port = getreg(edx) & 0xFFFF;
-                insnlen += 1;
-                break;
-        default:
-                /* OK, we don't know what this is, can't emulate. */
-                goto no_emulate;
-        }
-
-        /* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
-        if (byte_access)
-                mask = 0xFF;
-        else if (small_operand)
-                mask = 0xFFFF;
-        else
-                mask = 0xFFFFFFFF;
-
-        /*
-         * If it was an "IN" instruction, they expect the result to be read
-         * into %eax, so we change %eax.
-         */
-        eax = getreg(eax);
-
-        if (in) {
-                /* This is the PS/2 keyboard status; 1 means ready for output */
-                if (port == 0x64)
-                        val = 1;
-                else if (is_pci_addr_port(port))
-                        pci_addr_ioread(port, mask, &val);
-                else if (is_pci_data_port(port))
-                        pci_data_ioread(port, mask, &val);
-
-                /* Clear the bits we're about to read */
-                eax &= ~mask;
-                /* Copy bits in from val. */
-                eax |= val & mask;
-                /* Now update the register. */
-                setreg(eax, eax);
-        } else {
-                if (is_pci_addr_port(port)) {
-                        if (!pci_addr_iowrite(port, mask, eax))
-                                goto bad_io;
-                } else if (is_pci_data_port(port)) {
-                        if (!pci_data_iowrite(port, mask, eax))
-                                goto bad_io;
-                }
-                /* There are many other ports, eg. CMOS clock, serial
-                 * and parallel ports, so we ignore them all. */
-        }
-
-        verbose("IO %s of %x to %u: %#08x\n",
-                in ? "IN" : "OUT", mask, port, eax);
-skip_insn:
-        /* Finally, we've "done" the instruction, so move past it. */
-        setreg(eip, getreg(eip) + insnlen);
-        return;
-
-bad_io:
-        warnx("Attempt to %s port %u (%#x mask)",
-              in ? "read from" : "write to", port, mask);
-
-no_emulate:
-        /* Inject trap into Guest. */
-        if (write(lguest_fd, args, sizeof(args)) < 0)
-                err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
-}
-
-static struct device *find_mmio_region(unsigned long paddr, u32 *off)
-{
-        unsigned int i;
-
-        for (i = 1; i < MAX_PCI_DEVICES; i++) {
-                struct device *d = devices.pci[i];
-
-                if (!d)
-                        continue;
-                if (paddr < d->mmio_addr)
-                        continue;
-                if (paddr >= d->mmio_addr + d->mmio_size)
-                        continue;
-                *off = paddr - d->mmio_addr;
-                return d;
-        }
-        return NULL;
-}
-
-/* FIXME: Use vq array.
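- * (The walk below is O(n) in the queue number; the FIXME presumably
- * wants a flat array indexed by num.  With only a handful of vqs per
- * device it hardly matters.)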
*/
-static struct virtqueue *vq_by_num(struct device *d, u32 num)
-{
-        struct virtqueue *vq = d->vq;
-
-        while (num-- && vq)
-                vq = vq->next;
-
-        return vq;
-}
-
-static void save_vq_config(const struct virtio_pci_common_cfg *cfg,
-                           struct virtqueue *vq)
-{
-        vq->pci_config = *cfg;
-}
-
-static void restore_vq_config(struct virtio_pci_common_cfg *cfg,
-                              struct virtqueue *vq)
-{
-        /* Only restore the per-vq part */
-        size_t off = offsetof(struct virtio_pci_common_cfg, queue_size);
-
-        memcpy((void *)cfg + off, (void *)&vq->pci_config + off,
-               sizeof(*cfg) - off);
-}
-
-/*
- * 4.1.4.3.2:
- *
- *  The driver MUST configure the other virtqueue fields before
- *  enabling the virtqueue with queue_enable.
- *
- * When they enable the virtqueue, we check that their setup is valid.
- */
-static void check_virtqueue(struct device *d, struct virtqueue *vq)
-{
-        /* Because lguest is 32 bit, all the descriptor high bits must be 0 */
-        if (vq->pci_config.queue_desc_hi
-            || vq->pci_config.queue_avail_hi
-            || vq->pci_config.queue_used_hi)
-                bad_driver_vq(vq, "invalid 64-bit queue address");
-
-        /*
-         * 2.4.1:
-         *
-         *  The driver MUST ensure that the physical address of the first byte
-         *  of each virtqueue part is a multiple of the specified alignment
-         *  value in the above table.
-         */
-        if (vq->pci_config.queue_desc_lo % 16
-            || vq->pci_config.queue_avail_lo % 2
-            || vq->pci_config.queue_used_lo % 4)
-                bad_driver_vq(vq, "invalid alignment in queue addresses");
-
-        /* Initialize the virtqueue and check they're all in range. */
-        vq->vring.num = vq->pci_config.queue_size;
-        vq->vring.desc = check_pointer(vq->dev,
-                                       vq->pci_config.queue_desc_lo,
-                                       sizeof(*vq->vring.desc) * vq->vring.num);
-        vq->vring.avail = check_pointer(vq->dev,
-                                        vq->pci_config.queue_avail_lo,
-                                        sizeof(*vq->vring.avail)
-                                        + (sizeof(vq->vring.avail->ring[0])
-                                           * vq->vring.num));
-        vq->vring.used = check_pointer(vq->dev,
-                                       vq->pci_config.queue_used_lo,
-                                       sizeof(*vq->vring.used)
-                                       + (sizeof(vq->vring.used->ring[0])
-                                          * vq->vring.num));
-
-        /*
-         * 2.4.9.1:
-         *
-         *   The driver MUST initialize flags in the used ring to 0
-         *   when allocating the used ring.
-         */
-        if (vq->vring.used->flags != 0)
-                bad_driver_vq(vq, "invalid initial used.flags %#x",
-                              vq->vring.used->flags);
-}
-
-static void start_virtqueue(struct virtqueue *vq)
-{
-        /*
-         * Create stack for thread.  Since the stack grows downwards, we point
-         * the stack pointer to the end of this region.
-         */
-        char *stack = malloc(32768);
-
-        /* Create a zero-initialized eventfd. */
-        vq->eventfd = eventfd(0, 0);
-        if (vq->eventfd < 0)
-                err(1, "Creating eventfd");
-
-        /*
-         * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
-         * we get a signal if it dies.
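-         * (Without CLONE_VM the child would see a copy-on-write snapshot of
-         * Guest memory and its add_used() updates would be invisible to us;
-         * SIGCHLD is what lets kill_launcher() tear the whole group down.)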
- */ - vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); - if (vq->thread == (pid_t)-1) - err(1, "Creating clone"); -} - -static void start_virtqueues(struct device *d) -{ - struct virtqueue *vq; - - for (vq = d->vq; vq; vq = vq->next) { - if (vq->pci_config.queue_enable) - start_virtqueue(vq); - } -} - -static void emulate_mmio_write(struct device *d, u32 off, u32 val, u32 mask) -{ - struct virtqueue *vq; - - switch (off) { - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): - /* - * 4.1.4.3.1: - * - * The device MUST present the feature bits it is offering in - * device_feature, starting at bit device_feature_select ∗ 32 - * for any device_feature_select written by the driver - */ - if (val == 0) - d->mmio->cfg.device_feature = d->features; - else if (val == 1) - d->mmio->cfg.device_feature = (d->features >> 32); - else - d->mmio->cfg.device_feature = 0; - goto feature_write_through32; - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): - if (val > 1) - bad_driver(d, "Unexpected driver select %u", val); - goto feature_write_through32; - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): - if (d->mmio->cfg.guest_feature_select == 0) { - d->features_accepted &= ~((u64)0xFFFFFFFF); - d->features_accepted |= val; - } else { - assert(d->mmio->cfg.guest_feature_select == 1); - d->features_accepted &= 0xFFFFFFFF; - d->features_accepted |= ((u64)val) << 32; - } - /* - * 2.2.1: - * - * The driver MUST NOT accept a feature which the device did - * not offer - */ - if (d->features_accepted & ~d->features) - bad_driver(d, "over-accepted features %#llx of %#llx", - d->features_accepted, d->features); - goto feature_write_through32; - case offsetof(struct virtio_pci_mmio, cfg.device_status): { - u8 prev; - - verbose("%s: device status -> %#x\n", d->name, val); - /* - * 4.1.4.3.1: - * - * The device MUST reset when 0 is written to device_status, - * and present a 0 in device_status once that is done. - */ - if (val == 0) { - reset_device(d); - goto write_through8; - } - - /* 2.1.1: The driver MUST NOT clear a device status bit. */ - if (d->mmio->cfg.device_status & ~val) - bad_driver(d, "unset of device status bit %#x -> %#x", - d->mmio->cfg.device_status, val); - - /* - * 2.1.2: - * - * The device MUST NOT consume buffers or notify the driver - * before DRIVER_OK. - */ - if (val & VIRTIO_CONFIG_S_DRIVER_OK - && !(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) - start_virtqueues(d); - - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - * - Reset the device. - * - Set the ACKNOWLEDGE status bit: the guest OS has - * notice the device. - * - Set the DRIVER status bit: the guest OS knows how - * to drive the device. - * - Read device feature bits, and write the subset - * of feature bits understood by the OS and driver - * to the device. During this step the driver MAY - * read (but MUST NOT write) the device-specific - * configuration fields to check that it can - * support the device before accepting it. - * - Set the FEATURES_OK status bit. The driver - * MUST not accept new feature bits after this - * step. - * - Re-read device status to ensure the FEATURES_OK - * bit is still set: otherwise, the device does - * not support our subset of features and the - * device is unusable. - * - Perform device-specific setup, including - * discovery of virtqueues for the device, - * optional per-bus setup, reading and possibly - * writing the device’s virtio configuration - * space, and population of virtqueues. 
- * - Set the DRIVER_OK status bit. At this point the - * device is “live”. - */ - prev = 0; - switch (val & ~d->mmio->cfg.device_status) { - case VIRTIO_CONFIG_S_DRIVER_OK: - prev |= VIRTIO_CONFIG_S_FEATURES_OK; /* fall thru */ - case VIRTIO_CONFIG_S_FEATURES_OK: - prev |= VIRTIO_CONFIG_S_DRIVER; /* fall thru */ - case VIRTIO_CONFIG_S_DRIVER: - prev |= VIRTIO_CONFIG_S_ACKNOWLEDGE; /* fall thru */ - case VIRTIO_CONFIG_S_ACKNOWLEDGE: - break; - default: - bad_driver(d, "unknown device status bit %#x -> %#x", - d->mmio->cfg.device_status, val); - } - if (d->mmio->cfg.device_status != prev) - bad_driver(d, "unexpected status transition %#x -> %#x", - d->mmio->cfg.device_status, val); - - /* If they just wrote FEATURES_OK, we make sure they read */ - switch (val & ~d->mmio->cfg.device_status) { - case VIRTIO_CONFIG_S_FEATURES_OK: - d->wrote_features_ok = true; - break; - case VIRTIO_CONFIG_S_DRIVER_OK: - if (d->wrote_features_ok) - bad_driver(d, "did not re-read FEATURES_OK"); - break; - } - goto write_through8; - } - case offsetof(struct virtio_pci_mmio, cfg.queue_select): - vq = vq_by_num(d, val); - /* - * 4.1.4.3.1: - * - * The device MUST present a 0 in queue_size if the virtqueue - * corresponding to the current queue_select is unavailable. - */ - if (!vq) { - d->mmio->cfg.queue_size = 0; - goto write_through16; - } - /* Save registers for old vq, if it was a valid vq */ - if (d->mmio->cfg.queue_size) - save_vq_config(&d->mmio->cfg, - vq_by_num(d, d->mmio->cfg.queue_select)); - /* Restore the registers for the queue they asked for */ - restore_vq_config(&d->mmio->cfg, vq); - goto write_through16; - case offsetof(struct virtio_pci_mmio, cfg.queue_size): - /* - * 4.1.4.3.2: - * - * The driver MUST NOT write a value which is not a power of 2 - * to queue_size. - */ - if (val & (val-1)) - bad_driver(d, "invalid queue size %u", val); - if (d->mmio->cfg.queue_enable) - bad_driver(d, "changing queue size on live device"); - goto write_through16; - case offsetof(struct virtio_pci_mmio, cfg.queue_msix_vector): - bad_driver(d, "attempt to set MSIX vector to %u", val); - case offsetof(struct virtio_pci_mmio, cfg.queue_enable): { - struct virtqueue *vq = vq_by_num(d, d->mmio->cfg.queue_select); - - /* - * 4.1.4.3.2: - * - * The driver MUST NOT write a 0 to queue_enable. - */ - if (val != 1) - bad_driver(d, "setting queue_enable to %u", val); - - /* - * 3.1.1: - * - * 7. Perform device-specific setup, including discovery of - * virtqueues for the device, optional per-bus setup, - * reading and possibly writing the device’s virtio - * configuration space, and population of virtqueues. - * 8. Set the DRIVER_OK status bit. - * - * All our devices require all virtqueues to be enabled, so - * they should have done that before setting DRIVER_OK. 
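-                 * (start_virtqueues() runs when DRIVER_OK is set, and only
-                 * starts queues already marked queue_enable; a queue enabled
-                 * afterwards would never get its service thread.)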
- */ - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK) - bad_driver(d, "enabling vq after DRIVER_OK"); - - d->mmio->cfg.queue_enable = val; - save_vq_config(&d->mmio->cfg, vq); - check_virtqueue(d, vq); - goto write_through16; - } - case offsetof(struct virtio_pci_mmio, cfg.queue_notify_off): - bad_driver(d, "attempt to write to queue_notify_off"); - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_lo): - case offsetof(struct virtio_pci_mmio, cfg.queue_desc_hi): - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_lo): - case offsetof(struct virtio_pci_mmio, cfg.queue_avail_hi): - case offsetof(struct virtio_pci_mmio, cfg.queue_used_lo): - case offsetof(struct virtio_pci_mmio, cfg.queue_used_hi): - /* - * 4.1.4.3.2: - * - * The driver MUST configure the other virtqueue fields before - * enabling the virtqueue with queue_enable. - */ - if (d->mmio->cfg.queue_enable) - bad_driver(d, "changing queue on live device"); - - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - *... - * 5. Set the FEATURES_OK status bit. The driver MUST not - * accept new feature bits after this step. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK)) - bad_driver(d, "setting up vq before FEATURES_OK"); - - /* - * 6. Re-read device status to ensure the FEATURES_OK bit is - * still set... - */ - if (d->wrote_features_ok) - bad_driver(d, "didn't re-read FEATURES_OK before setup"); - - goto write_through32; - case offsetof(struct virtio_pci_mmio, notify): - vq = vq_by_num(d, val); - if (!vq) - bad_driver(d, "Invalid vq notification on %u", val); - /* Notify the process handling this vq by adding 1 to eventfd */ - write(vq->eventfd, "\1\0\0\0\0\0\0\0", 8); - goto write_through16; - case offsetof(struct virtio_pci_mmio, isr): - bad_driver(d, "Unexpected write to isr"); - /* Weird corner case: write to emerg_wr of console */ - case sizeof(struct virtio_pci_mmio) - + offsetof(struct virtio_console_config, emerg_wr): - if (strcmp(d->name, "console") == 0) { - char c = val; - write(STDOUT_FILENO, &c, 1); - goto write_through32; - } - /* Fall through... */ - default: - /* - * 4.1.4.3.2: - * - * The driver MUST NOT write to device_feature, num_queues, - * config_generation or queue_notify_off. - */ - bad_driver(d, "Unexpected write to offset %u", off); - } - -feature_write_through32: - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - *... - * - Set the DRIVER status bit: the guest OS knows how - * to drive the device. - * - Read device feature bits, and write the subset - * of feature bits understood by the OS and driver - * to the device. - *... - * - Set the FEATURES_OK status bit. The driver MUST not - * accept new feature bits after this step. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) - bad_driver(d, "feature write before VIRTIO_CONFIG_S_DRIVER"); - if (d->mmio->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK) - bad_driver(d, "feature write after VIRTIO_CONFIG_S_FEATURES_OK"); - - /* - * 4.1.3.1: - * - * The driver MUST access each field using the “natural” access - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for - * 16-bit fields and 8-bit accesses for 8-bit fields. 
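-         * So a 16-bit write to a 32-bit field such as queue_desc_lo shows up
-         * here with mask 0xFFFF instead of 0xFFFFFFFF, and the checks below
-         * report it via bad_driver().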
- */ -write_through32: - if (mask != 0xFFFFFFFF) { - bad_driver(d, "non-32-bit write to offset %u (%#x)", - off, getreg(eip)); - return; - } - memcpy((char *)d->mmio + off, &val, 4); - return; - -write_through16: - if (mask != 0xFFFF) - bad_driver(d, "non-16-bit write to offset %u (%#x)", - off, getreg(eip)); - memcpy((char *)d->mmio + off, &val, 2); - return; - -write_through8: - if (mask != 0xFF) - bad_driver(d, "non-8-bit write to offset %u (%#x)", - off, getreg(eip)); - memcpy((char *)d->mmio + off, &val, 1); - return; -} - -static u32 emulate_mmio_read(struct device *d, u32 off, u32 mask) -{ - u8 isr; - u32 val = 0; - - switch (off) { - case offsetof(struct virtio_pci_mmio, cfg.device_feature_select): - case offsetof(struct virtio_pci_mmio, cfg.device_feature): - case offsetof(struct virtio_pci_mmio, cfg.guest_feature_select): - case offsetof(struct virtio_pci_mmio, cfg.guest_feature): - /* - * 3.1.1: - * - * The driver MUST follow this sequence to initialize a device: - *... - * - Set the DRIVER status bit: the guest OS knows how - * to drive the device. - * - Read device feature bits, and write the subset - * of feature bits understood by the OS and driver - * to the device. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) - bad_driver(d, - "feature read before VIRTIO_CONFIG_S_DRIVER"); - goto read_through32; - case offsetof(struct virtio_pci_mmio, cfg.msix_config): - bad_driver(d, "read of msix_config"); - case offsetof(struct virtio_pci_mmio, cfg.num_queues): - goto read_through16; - case offsetof(struct virtio_pci_mmio, cfg.device_status): - /* As they did read, any write of FEATURES_OK is now fine. */ - d->wrote_features_ok = false; - goto read_through8; - case offsetof(struct virtio_pci_mmio, cfg.config_generation): - /* - * 4.1.4.3.1: - * - * The device MUST present a changed config_generation after - * the driver has read a device-specific configuration value - * which has changed since any part of the device-specific - * configuration was last read. - * - * This is simple: none of our devices change config, so this - * is always 0. - */ - goto read_through8; - case offsetof(struct virtio_pci_mmio, notify): - /* - * 3.1.1: - * - * The driver MUST NOT notify the device before setting - * DRIVER_OK. - */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK)) - bad_driver(d, "notify before VIRTIO_CONFIG_S_DRIVER_OK"); - goto read_through16; - case offsetof(struct virtio_pci_mmio, isr): - if (mask != 0xFF) - bad_driver(d, "non-8-bit read from offset %u (%#x)", - off, getreg(eip)); - isr = d->mmio->isr; - /* - * 4.1.4.5.1: - * - * The device MUST reset ISR status to 0 on driver read. - */ - d->mmio->isr = 0; - return isr; - case offsetof(struct virtio_pci_mmio, padding): - bad_driver(d, "read from padding (%#x)", getreg(eip)); - default: - /* Read from device config space, beware unaligned overflow */ - if (off > d->mmio_size - 4) - bad_driver(d, "read past end (%#x)", getreg(eip)); - - /* - * 3.1.1: - * The driver MUST follow this sequence to initialize a device: - *... - * 3. Set the DRIVER status bit: the guest OS knows how to - * drive the device. - * 4. Read device feature bits, and write the subset of - * feature bits understood by the OS and driver to the - * device. During this step the driver MAY read (but MUST NOT - * write) the device-specific configuration fields to check - * that it can support the device before accepting it. 
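-                 * So any config-space read before the DRIVER status bit is
-                 * set means the driver skipped a step, and we say so via
-                 * bad_driver().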
- */ - if (!(d->mmio->cfg.device_status & VIRTIO_CONFIG_S_DRIVER)) - bad_driver(d, - "config read before VIRTIO_CONFIG_S_DRIVER"); - - if (mask == 0xFFFFFFFF) - goto read_through32; - else if (mask == 0xFFFF) - goto read_through16; - else - goto read_through8; - } - - /* - * 4.1.3.1: - * - * The driver MUST access each field using the “natural” access - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses for - * 16-bit fields and 8-bit accesses for 8-bit fields. - */ -read_through32: - if (mask != 0xFFFFFFFF) - bad_driver(d, "non-32-bit read to offset %u (%#x)", - off, getreg(eip)); - memcpy(&val, (char *)d->mmio + off, 4); - return val; - -read_through16: - if (mask != 0xFFFF) - bad_driver(d, "non-16-bit read to offset %u (%#x)", - off, getreg(eip)); - memcpy(&val, (char *)d->mmio + off, 2); - return val; - -read_through8: - if (mask != 0xFF) - bad_driver(d, "non-8-bit read to offset %u (%#x)", - off, getreg(eip)); - memcpy(&val, (char *)d->mmio + off, 1); - return val; -} - -static void emulate_mmio(unsigned long paddr, const u8 *insn) -{ - u32 val, off, mask = 0xFFFFFFFF, insnlen = 0; - struct device *d = find_mmio_region(paddr, &off); - unsigned long args[] = { LHREQ_TRAP, 14 }; - - if (!d) { - warnx("MMIO touching %#08lx (not a device)", paddr); - goto reinject; - } - - /* Prefix makes it a 16 bit op */ - if (insn[0] == 0x66) { - mask = 0xFFFF; - insnlen++; - } - - /* iowrite */ - if (insn[insnlen] == 0x89) { - /* Next byte is r/m byte: bits 3-5 are register. */ - val = getreg_num((insn[insnlen+1] >> 3) & 0x7, mask); - emulate_mmio_write(d, off, val, mask); - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); - } else if (insn[insnlen] == 0x8b) { /* ioread */ - /* Next byte is r/m byte: bits 3-5 are register. */ - val = emulate_mmio_read(d, off, mask); - setreg_num((insn[insnlen+1] >> 3) & 0x7, val, mask); - insnlen += 2 + insn_displacement_len(insn[insnlen+1]); - } else if (insn[0] == 0x88) { /* 8-bit iowrite */ - mask = 0xff; - /* Next byte is r/m byte: bits 3-5 are register. */ - val = getreg_num((insn[1] >> 3) & 0x7, mask); - emulate_mmio_write(d, off, val, mask); - insnlen = 2 + insn_displacement_len(insn[1]); - } else if (insn[0] == 0x8a) { /* 8-bit ioread */ - mask = 0xff; - val = emulate_mmio_read(d, off, mask); - setreg_num((insn[1] >> 3) & 0x7, val, mask); - insnlen = 2 + insn_displacement_len(insn[1]); - } else { - warnx("Unknown MMIO instruction touching %#08lx:" - " %02x %02x %02x %02x at %u", - paddr, insn[0], insn[1], insn[2], insn[3], getreg(eip)); - reinject: - /* Inject trap into Guest. */ - if (write(lguest_fd, args, sizeof(args)) < 0) - err(1, "Reinjecting trap 14 for fault at %#x", - getreg(eip)); - return; - } - - /* Finally, we've "done" the instruction, so move past it. */ - setreg(eip, getreg(eip) + insnlen); -} - -/*L:190 - * Device Setup - * - * All devices need a descriptor so the Guest knows it exists, and a "struct - * device" so the Launcher can keep track of it. We have common helper - * routines to allocate and manage them. - */ -static void add_pci_virtqueue(struct device *dev, - void (*service)(struct virtqueue *), - const char *name) -{ - struct virtqueue **i, *vq = malloc(sizeof(*vq)); - - /* Initialize the virtqueue */ - vq->next = NULL; - vq->last_avail_idx = 0; - vq->dev = dev; - vq->name = name; - - /* - * This is the routine the service thread will run, and its Process ID - * once it's running. - */ - vq->service = service; - vq->thread = (pid_t)-1; - - /* Initialize the configuration. 
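-         * Every vq starts out with queue_size VIRTQUEUE_NUM and queue_enable
-         * 0, which is exactly what the Guest must see after a reset.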
*/ - reset_vq_pci_config(vq); - vq->pci_config.queue_notify_off = 0; - - /* Add one to the number of queues */ - vq->dev->mmio->cfg.num_queues++; - - /* - * Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. - */ - for (i = &dev->vq; *i; i = &(*i)->next); - *i = vq; -} - -/* The Guest accesses the feature bits via the PCI common config MMIO region */ -static void add_pci_feature(struct device *dev, unsigned bit) -{ - dev->features |= (1ULL << bit); -} - -/* For devices with no config. */ -static void no_device_config(struct device *dev) -{ - dev->mmio_addr = get_mmio_region(dev->mmio_size); - - dev->config.bar[0] = dev->mmio_addr; - /* Bottom 4 bits must be zero */ - assert(~(dev->config.bar[0] & 0xF)); -} - -/* This puts the device config into BAR0 */ -static void set_device_config(struct device *dev, const void *conf, size_t len) -{ - /* Set up BAR 0 */ - dev->mmio_size += len; - dev->mmio = realloc(dev->mmio, dev->mmio_size); - memcpy(dev->mmio + 1, conf, len); - - /* - * 4.1.4.6: - * - * The device MUST present at least one VIRTIO_PCI_CAP_DEVICE_CFG - * capability for any device type which has a device-specific - * configuration. - */ - /* Hook up device cfg */ - dev->config.cfg_access.cap.cap_next - = offsetof(struct pci_config, device); - - /* - * 4.1.4.6.1: - * - * The offset for the device-specific configuration MUST be 4-byte - * aligned. - */ - assert(dev->config.cfg_access.cap.cap_next % 4 == 0); - - /* Fix up device cfg field length. */ - dev->config.device.length = len; - - /* The rest is the same as the no-config case */ - no_device_config(dev); -} - -static void init_cap(struct virtio_pci_cap *cap, size_t caplen, int type, - size_t bar_offset, size_t bar_bytes, u8 next) -{ - cap->cap_vndr = PCI_CAP_ID_VNDR; - cap->cap_next = next; - cap->cap_len = caplen; - cap->cfg_type = type; - cap->bar = 0; - memset(cap->padding, 0, sizeof(cap->padding)); - cap->offset = bar_offset; - cap->length = bar_bytes; -} - -/* - * This sets up the pci_config structure, as defined in the virtio 1.0 - * standard (and PCI standard). - */ -static void init_pci_config(struct pci_config *pci, u16 type, - u8 class, u8 subclass) -{ - size_t bar_offset, bar_len; - - /* - * 4.1.4.4.1: - * - * The device MUST either present notify_off_multiplier as an even - * power of 2, or present notify_off_multiplier as 0. - * - * 2.1.2: - * - * The device MUST initialize device status to 0 upon reset. - */ - memset(pci, 0, sizeof(*pci)); - - /* 4.1.2.1: Devices MUST have the PCI Vendor ID 0x1AF4 */ - pci->vendor_id = 0x1AF4; - /* 4.1.2.1: ... PCI Device ID calculated by adding 0x1040 ... */ - pci->device_id = 0x1040 + type; - - /* - * PCI have specific codes for different types of devices. - * Linux doesn't care, but it's a good clue for people looking - * at the device. - */ - pci->class = class; - pci->subclass = subclass; - - /* - * 4.1.2.1: - * - * Non-transitional devices SHOULD have a PCI Revision ID of 1 or - * higher - */ - pci->revid = 1; - - /* - * 4.1.2.1: - * - * Non-transitional devices SHOULD have a PCI Subsystem Device ID of - * 0x40 or higher. - */ - pci->subsystem_device_id = 0x40; - - /* We use our dummy interrupt controller, and irq_line is the irq */ - pci->irq_line = devices.next_irq++; - pci->irq_pin = 0; - - /* Support for extended capabilities. */ - pci->status = (1 << 4); - - /* Link them in. */ - /* - * 4.1.4.3.1: - * - * The device MUST present at least one common configuration - * capability. 
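-         * The capability list is a chain of offsets within our config space:
-         * capabilities points at common, whose cap_next leads to notify,
-         * then isr, then cfg_access; set_device_config() sews the device
-         * cap in after that.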
- */ - pci->capabilities = offsetof(struct pci_config, common); - - /* 4.1.4.3.1 ... offset MUST be 4-byte aligned. */ - assert(pci->capabilities % 4 == 0); - - bar_offset = offsetof(struct virtio_pci_mmio, cfg); - bar_len = sizeof(((struct virtio_pci_mmio *)0)->cfg); - init_cap(&pci->common, sizeof(pci->common), VIRTIO_PCI_CAP_COMMON_CFG, - bar_offset, bar_len, - offsetof(struct pci_config, notify)); - - /* - * 4.1.4.4.1: - * - * The device MUST present at least one notification capability. - */ - bar_offset += bar_len; - bar_len = sizeof(((struct virtio_pci_mmio *)0)->notify); - - /* - * 4.1.4.4.1: - * - * The cap.offset MUST be 2-byte aligned. - */ - assert(pci->common.cap_next % 2 == 0); - - /* FIXME: Use a non-zero notify_off, for per-queue notification? */ - /* - * 4.1.4.4.1: - * - * The value cap.length presented by the device MUST be at least 2 and - * MUST be large enough to support queue notification offsets for all - * supported queues in all possible configurations. - */ - assert(bar_len >= 2); - - init_cap(&pci->notify.cap, sizeof(pci->notify), - VIRTIO_PCI_CAP_NOTIFY_CFG, - bar_offset, bar_len, - offsetof(struct pci_config, isr)); - - bar_offset += bar_len; - bar_len = sizeof(((struct virtio_pci_mmio *)0)->isr); - /* - * 4.1.4.5.1: - * - * The device MUST present at least one VIRTIO_PCI_CAP_ISR_CFG - * capability. - */ - init_cap(&pci->isr, sizeof(pci->isr), - VIRTIO_PCI_CAP_ISR_CFG, - bar_offset, bar_len, - offsetof(struct pci_config, cfg_access)); - - /* - * 4.1.4.7.1: - * - * The device MUST present at least one VIRTIO_PCI_CAP_PCI_CFG - * capability. - */ - /* This doesn't have any presence in the BAR */ - init_cap(&pci->cfg_access.cap, sizeof(pci->cfg_access), - VIRTIO_PCI_CAP_PCI_CFG, - 0, 0, 0); - - bar_offset += bar_len + sizeof(((struct virtio_pci_mmio *)0)->padding); - assert(bar_offset == sizeof(struct virtio_pci_mmio)); - - /* - * This gets sewn in and length set in set_device_config(). - * Some devices don't have a device configuration interface, so - * we never expose this if we don't call set_device_config(). - */ - init_cap(&pci->device, sizeof(pci->device), VIRTIO_PCI_CAP_DEVICE_CFG, - bar_offset, 0, 0); -} - -/* - * This routine does all the creation and setup of a new device, but we don't - * actually place the MMIO region until we know the size (if any) of the - * device-specific config. And we don't actually start the service threads - * until later. - * - * See what I mean about userspace being boring? - */ -static struct device *new_pci_device(const char *name, u16 type, - u8 class, u8 subclass) -{ - struct device *dev = malloc(sizeof(*dev)); - - /* Now we populate the fields one at a time. */ - dev->name = name; - dev->vq = NULL; - dev->running = false; - dev->wrote_features_ok = false; - dev->mmio_size = sizeof(struct virtio_pci_mmio); - dev->mmio = calloc(1, dev->mmio_size); - dev->features = (u64)1 << VIRTIO_F_VERSION_1; - dev->features_accepted = 0; - - if (devices.device_num + 1 >= MAX_PCI_DEVICES) - errx(1, "Can only handle 31 PCI devices"); - - init_pci_config(&dev->config, type, class, subclass); - assert(!devices.pci[devices.device_num+1]); - devices.pci[++devices.device_num] = dev; - - return dev; -} - -/* - * Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. - */ -static void setup_console(void) -{ - struct device *dev; - struct virtio_console_config conf; - - /* If we can save the initial standard input settings... 
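-         * (tcgetattr() fails when stdin isn't a terminal, eg. input piped
-         * from a file; then we skip the raw-mode dance and cleanup_devices()
-         * finds nothing to restore.)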
*/ - if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { - struct termios term = orig_term; - /* - * Then we turn off echo, line buffering and ^C etc: We want a - * raw input stream to the Guest. - */ - term.c_lflag &= ~(ISIG|ICANON|ECHO); - tcsetattr(STDIN_FILENO, TCSANOW, &term); - } - - dev = new_pci_device("console", VIRTIO_ID_CONSOLE, 0x07, 0x00); - - /* We store the console state in dev->priv, and initialize it. */ - dev->priv = malloc(sizeof(struct console_abort)); - ((struct console_abort *)dev->priv)->count = 0; - - /* - * The console needs two virtqueues: the input then the output. When - * they put something the input queue, we make sure we're listening to - * stdin. When they put something in the output queue, we write it to - * stdout. - */ - add_pci_virtqueue(dev, console_input, "input"); - add_pci_virtqueue(dev, console_output, "output"); - - /* We need a configuration area for the emerg_wr early writes. */ - add_pci_feature(dev, VIRTIO_CONSOLE_F_EMERG_WRITE); - set_device_config(dev, &conf, sizeof(conf)); - - verbose("device %u: console\n", devices.device_num); -} -/*:*/ - -/*M:010 - * Inter-guest networking is an interesting area. Simplest is to have a - * --sharenet= option which opens or creates a named pipe. This can be - * used to send packets to another guest in a 1:1 manner. - * - * More sophisticated is to use one of the tools developed for project like UML - * to do networking. - * - * Faster is to do virtio bonding in kernel. Doing this 1:1 would be - * completely generic ("here's my vring, attach to your vring") and would work - * for any traffic. Of course, namespace and permissions issues need to be - * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide - * multiple inter-guest channels behind one interface, although it would - * require some manner of hotplugging new virtio channels. - * - * Finally, we could use a virtio network switch in the kernel, ie. vhost. -:*/ - -static u32 str2ip(const char *ipaddr) -{ - unsigned int b[4]; - - if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) - errx(1, "Failed to parse IP address '%s'", ipaddr); - return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; -} - -static void str2mac(const char *macaddr, unsigned char mac[6]) -{ - unsigned int m[6]; - if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", - &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) - errx(1, "Failed to parse mac address '%s'", macaddr); - mac[0] = m[0]; - mac[1] = m[1]; - mac[2] = m[2]; - mac[3] = m[3]; - mac[4] = m[4]; - mac[5] = m[5]; -} - -/* - * This code is "adapted" from libbridge: it attaches the Host end of the - * network device to the bridge device specified by the command line. - * - * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. - */ -static void add_to_bridge(int fd, const char *if_name, const char *br_name) -{ - int ifidx; - struct ifreq ifr; - - if (!*br_name) - errx(1, "must specify bridge name"); - - ifidx = if_nametoindex(if_name); - if (!ifidx) - errx(1, "interface %s does not exist!", if_name); - - strncpy(ifr.ifr_name, br_name, IFNAMSIZ); - ifr.ifr_name[IFNAMSIZ-1] = '\0'; - ifr.ifr_ifindex = ifidx; - if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) - err(1, "can't add %s to bridge %s", if_name, br_name); -} - -/* - * This sets up the Host end of the network device with an IP address, brings - * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. 
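str2ip() above packs the dotted quad into a host-order u32 with the
first octet in the top byte. The same packing, checked in isolation
using the 192.168.19.1 address that appears later in this document:

  #include <stdint.h>
  #include <stdio.h>

  static uint32_t pack_ip(uint32_t b0, uint32_t b1, uint32_t b2, uint32_t b3)
  {
          /* Identical packing to str2ip() above. */
          return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
  }

  int main(void)
  {
          /* 192.168.19.1 -> 0xc0a81301 */
          printf("%#x\n", (unsigned)pack_ip(192, 168, 19, 1));
          return 0;
  }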
- */ -static void configure_device(int fd, const char *tapif, u32 ipaddr) -{ - struct ifreq ifr; - struct sockaddr_in sin; - - memset(&ifr, 0, sizeof(ifr)); - strcpy(ifr.ifr_name, tapif); - - /* Don't read these incantations. Just cut & paste them like I did! */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(ipaddr); - memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); - if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) - err(1, "Setting %s interface address", tapif); - ifr.ifr_flags = IFF_UP; - if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) - err(1, "Bringing interface %s up", tapif); -} - -static int get_tun_device(char tapif[IFNAMSIZ]) -{ - struct ifreq ifr; - int vnet_hdr_sz; - int netfd; - - /* Start with this zeroed. Messy but sure. */ - memset(&ifr, 0, sizeof(ifr)); - - /* - * We open the /dev/net/tun device and tell it we want a tap device. A - * tap device is like a tun device, only somehow different. To tell - * the truth, I completely blundered my way through this code, but it - * works now! - */ - netfd = open_or_die("/dev/net/tun", O_RDWR); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - strcpy(ifr.ifr_name, "tap%d"); - if (ioctl(netfd, TUNSETIFF, &ifr) != 0) - err(1, "configuring /dev/net/tun"); - - if (ioctl(netfd, TUNSETOFFLOAD, - TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) - err(1, "Could not set features for tun device"); - - /* - * We don't need checksums calculated for packets coming in this - * device: trust us! - */ - ioctl(netfd, TUNSETNOCSUM, 1); - - /* - * In virtio before 1.0 (aka legacy virtio), we added a 16-bit - * field at the end of the network header iff - * VIRTIO_NET_F_MRG_RXBUF was negotiated. For virtio 1.0, - * that became the norm, but we need to tell the tun device - * about our expanded header (which is called - * virtio_net_hdr_mrg_rxbuf in the legacy system). - */ - vnet_hdr_sz = sizeof(struct virtio_net_hdr_v1); - if (ioctl(netfd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0) - err(1, "Setting tun header size to %u", vnet_hdr_sz); - - memcpy(tapif, ifr.ifr_name, IFNAMSIZ); - return netfd; -} - -/*L:195 - * Our network is a Host<->Guest network. This can either use bridging or - * routing, but the principle is the same: it uses the "tun" device to inject - * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. - */ -static void setup_tun_net(char *arg) -{ - struct device *dev; - struct net_info *net_info = malloc(sizeof(*net_info)); - int ipfd; - u32 ip = INADDR_ANY; - bool bridging = false; - char tapif[IFNAMSIZ], *p; - struct virtio_net_config conf; - - net_info->tunfd = get_tun_device(tapif); - - /* First we create a new network device. */ - dev = new_pci_device("net", VIRTIO_ID_NET, 0x02, 0x00); - dev->priv = net_info; - - /* Network devices need a recv and a send queue, just like console. */ - add_pci_virtqueue(dev, net_input, "rx"); - add_pci_virtqueue(dev, net_output, "tx"); - - /* - * We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! - */ - ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); - if (ipfd < 0) - err(1, "opening IP socket"); - - /* If the command line was --tunnet=bridge: do bridging. 
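get_tun_device() above is most of what it takes to create a tap
interface. A stripped-down sketch without the offload and vnet-header
setup (it needs CAP_NET_ADMIN to actually run):

  #include <err.h>
  #include <fcntl.h>
  #include <linux/if.h>
  #include <linux/if_tun.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  int main(void)
  {
          struct ifreq ifr;
          int fd = open("/dev/net/tun", O_RDWR);

          if (fd < 0)
                  err(1, "opening /dev/net/tun");
          memset(&ifr, 0, sizeof(ifr));
          /* IFF_NO_PI: no packet-info prefix on reads and writes. */
          ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
          /* The kernel replaces %d with the first free number. */
          strcpy(ifr.ifr_name, "tap%d");
          if (ioctl(fd, TUNSETIFF, &ifr) != 0)
                  err(1, "TUNSETIFF");
          printf("created %s\n", ifr.ifr_name);
          close(fd);
          return 0;
  }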
*/ - if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { - arg += strlen(BRIDGE_PFX); - bridging = true; - } - - /* A mac address may follow the bridge name or IP address */ - p = strchr(arg, ':'); - if (p) { - str2mac(p+1, conf.mac); - add_pci_feature(dev, VIRTIO_NET_F_MAC); - *p = '\0'; - } - - /* arg is now either an IP address or a bridge name */ - if (bridging) - add_to_bridge(ipfd, tapif, arg); - else - ip = str2ip(arg); - - /* Set up the tun device. */ - configure_device(ipfd, tapif, ip); - - /* Expect Guest to handle everything except UFO */ - add_pci_feature(dev, VIRTIO_NET_F_CSUM); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_CSUM); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO4); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_TSO6); - add_pci_feature(dev, VIRTIO_NET_F_GUEST_ECN); - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO4); - add_pci_feature(dev, VIRTIO_NET_F_HOST_TSO6); - add_pci_feature(dev, VIRTIO_NET_F_HOST_ECN); - /* We handle indirect ring entries */ - add_pci_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); - set_device_config(dev, &conf, sizeof(conf)); - - /* We don't need the socket any more; setup is done. */ - close(ipfd); - - if (bridging) - verbose("device %u: tun %s attached to bridge: %s\n", - devices.device_num, tapif, arg); - else - verbose("device %u: tun %s: %s\n", - devices.device_num, tapif, arg); -} -/*:*/ - -/* This hangs off device->priv. */ -struct vblk_info { - /* The size of the file. */ - off64_t len; - - /* The file descriptor for the file. */ - int fd; - -}; - -/*L:210 - * The Disk - * - * The disk only has one virtqueue, so it only has one thread. It is really - * simple: the Guest asks for a block number and we read or write that position - * in the file. - * - * Before we serviced each virtqueue in a separate thread, that was unacceptably - * slow: the Guest waits until the read is finished before running anything - * else, even if it could have been doing useful work. - * - * We could have used async I/O, except it's reputed to suck so hard that - * characters actually go missing from your code when you try to use it. - */ -static void blk_request(struct virtqueue *vq) -{ - struct vblk_info *vblk = vq->dev->priv; - unsigned int head, out_num, in_num, wlen; - int ret, i; - u8 *in; - struct virtio_blk_outhdr out; - struct iovec iov[vq->vring.num]; - off64_t off; - - /* - * Get the next request, where we normally wait. It triggers the - * interrupt to acknowledge previously serviced requests (if any). - */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - - /* Copy the output header from the front of the iov (adjusts iov) */ - iov_consume(vq->dev, iov, out_num, &out, sizeof(out)); - - /* Find and trim end of iov input array, for our status byte. */ - in = NULL; - for (i = out_num + in_num - 1; i >= out_num; i--) { - if (iov[i].iov_len > 0) { - in = iov[i].iov_base + iov[i].iov_len - 1; - iov[i].iov_len--; - break; - } - } - if (!in) - bad_driver_vq(vq, "Bad virtblk cmd with no room for status"); - - /* - * For historical reasons, block operations are expressed in 512 byte - * "sectors". - */ - off = out.sector * 512; - - if (out.type & VIRTIO_BLK_T_OUT) { - /* - * Write - * - * Move to the right location in the block file. This can fail - * if they try to write past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out.sector); - - ret = writev(vblk->fd, iov, out_num); - verbose("WRITE to sector %llu: %i\n", out.sector, ret); - - /* - * Grr... 
Now we know how long the descriptor they sent was, we - * make sure they didn't try to write over the end of the block - * file (possibly extending it). - */ - if (ret > 0 && off + ret > vblk->len) { - /* Trim it back to the correct length */ - ftruncate64(vblk->fd, vblk->len); - /* Die, bad Guest, die. */ - bad_driver_vq(vq, "Write past end %llu+%u", off, ret); - } - - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else if (out.type & VIRTIO_BLK_T_FLUSH) { - /* Flush */ - ret = fdatasync(vblk->fd); - verbose("FLUSH fdatasync: %i\n", ret); - wlen = sizeof(*in); - *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); - } else { - /* - * Read - * - * Move to the right location in the block file. This can fail - * if they try to read past end. - */ - if (lseek64(vblk->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %llu", out.sector); - - ret = readv(vblk->fd, iov + out_num, in_num); - if (ret >= 0) { - wlen = sizeof(*in) + ret; - *in = VIRTIO_BLK_S_OK; - } else { - wlen = sizeof(*in); - *in = VIRTIO_BLK_S_IOERR; - } - } - - /* Finished that request. */ - add_used(vq, head, wlen); -} - -/*L:198 This actually sets up a virtual block device. */ -static void setup_block_file(const char *filename) -{ - struct device *dev; - struct vblk_info *vblk; - struct virtio_blk_config conf; - - /* Create the device. */ - dev = new_pci_device("block", VIRTIO_ID_BLOCK, 0x01, 0x80); - - /* The device has one virtqueue, where the Guest places requests. */ - add_pci_virtqueue(dev, blk_request, "request"); - - /* Allocate the room for our own bookkeeping */ - vblk = dev->priv = malloc(sizeof(*vblk)); - - /* First we open the file and store the length. */ - vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); - vblk->len = lseek64(vblk->fd, 0, SEEK_END); - - /* Tell Guest how many sectors this device has. */ - conf.capacity = cpu_to_le64(vblk->len / 512); - - /* - * Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. - */ - add_pci_feature(dev, VIRTIO_BLK_F_SEG_MAX); - conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); - - set_device_config(dev, &conf, sizeof(struct virtio_blk_config)); - - verbose("device %u: virtblock %llu sectors\n", - devices.device_num, le64_to_cpu(conf.capacity)); -} - -/*L:211 - * Our random number generator device reads from /dev/urandom into the Guest's - * input buffers. The usual case is that the Guest doesn't want random numbers - * and so has no buffers although /dev/urandom is still readable, whereas - * console is the reverse. - * - * The same logic applies, however. - */ -struct rng_info { - int rfd; -}; - -static void rng_input(struct virtqueue *vq) -{ - int len; - unsigned int head, in_num, out_num, totlen = 0; - struct rng_info *rng_info = vq->dev->priv; - struct iovec iov[vq->vring.num]; - - /* First we need a buffer from the Guests's virtqueue. */ - head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - if (out_num) - bad_driver_vq(vq, "Output buffers in rng?"); - - /* - * Just like the console write, we loop to cover the whole iovec. - * In this case, short reads actually happen quite a bit. - */ - while (!iov_empty(iov, in_num)) { - len = readv(rng_info->rfd, iov, in_num); - if (len <= 0) - err(1, "Read from /dev/urandom gave %i", len); - iov_consume(vq->dev, iov, in_num, NULL, len); - totlen += len; - } - - /* Tell the Guest about the new input. */ - add_used(vq, head, totlen); -} - -/*L:199 - * This creates a "hardware" random number device for the Guest. 
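The 512-byte sector convention appears twice above: blk_request()
multiplies out.sector by 512 to get a byte offset, and
setup_block_file() divides the file length by 512 for the capacity
field. The same arithmetic standalone, with a made-up 64 MiB backing
file:

  #include <stdint.h>
  #include <stdio.h>

  #define SECTOR_SHIFT 9          /* 512-byte virtio-blk sectors */

  int main(void)
  {
          uint64_t file_len = 64ULL << 20;
          uint64_t sector = 2048;

          printf("capacity: %llu sectors\n",
                 (unsigned long long)(file_len >> SECTOR_SHIFT));
          printf("sector %llu -> byte offset %llu\n",
                 (unsigned long long)sector,
                 (unsigned long long)(sector << SECTOR_SHIFT));
          return 0;
  }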
- */ -static void setup_rng(void) -{ - struct device *dev; - struct rng_info *rng_info = malloc(sizeof(*rng_info)); - - /* Our device's private info simply contains the /dev/urandom fd. */ - rng_info->rfd = open_or_die("/dev/urandom", O_RDONLY); - - /* Create the new device. */ - dev = new_pci_device("rng", VIRTIO_ID_RNG, 0xff, 0); - dev->priv = rng_info; - - /* The device has one virtqueue, where the Guest places inbufs. */ - add_pci_virtqueue(dev, rng_input, "input"); - - /* We don't have any configuration space */ - no_device_config(dev); - - verbose("device %u: rng\n", devices.device_num); -} -/* That's the end of device setup. */ - -/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ -static void __attribute__((noreturn)) restart_guest(void) -{ - unsigned int i; - - /* - * Since we don't track all open fds, we simply close everything beyond - * stderr. - */ - for (i = 3; i < FD_SETSIZE; i++) - close(i); - - /* Reset all the devices (kills all threads). */ - cleanup_devices(); - - execv(main_args[0], main_args); - err(1, "Could not exec %s", main_args[0]); -} - -/*L:220 - * Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. - */ -static void __attribute__((noreturn)) run_guest(void) -{ - for (;;) { - struct lguest_pending notify; - int readval; - - /* We read from the /dev/lguest device to run the Guest. */ - readval = pread(lguest_fd, ¬ify, sizeof(notify), cpu_id); - if (readval == sizeof(notify)) { - if (notify.trap == 13) { - verbose("Emulating instruction at %#x\n", - getreg(eip)); - emulate_insn(notify.insn); - } else if (notify.trap == 14) { - verbose("Emulating MMIO at %#x\n", - getreg(eip)); - emulate_mmio(notify.addr, notify.insn); - } else - errx(1, "Unknown trap %i addr %#08x\n", - notify.trap, notify.addr); - /* ENOENT means the Guest died. Reading tells us why. */ - } else if (errno == ENOENT) { - char reason[1024] = { 0 }; - pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); - errx(1, "%s", reason); - /* ERESTART means that we need to reboot the guest */ - } else if (errno == ERESTART) { - restart_guest(); - /* Anything else means a bug or incompatible change. */ - } else - err(1, "Running guest failed"); - } -} -/*L:240 - * This is the end of the Launcher. The good news: we are over halfway - * through! The bad news: the most fiendish part of the code still lies ahead - * of us. - * - * Are you ready? Take a deep breath and join me in the core of the Host, in - * "make Host". -:*/ - -static struct option opts[] = { - { "verbose", 0, NULL, 'v' }, - { "tunnet", 1, NULL, 't' }, - { "block", 1, NULL, 'b' }, - { "rng", 0, NULL, 'r' }, - { "initrd", 1, NULL, 'i' }, - { "username", 1, NULL, 'u' }, - { "chroot", 1, NULL, 'c' }, - { NULL }, -}; -static void usage(void) -{ - errx(1, "Usage: lguest [--verbose] " - "[--tunnet=(:|bridge::)\n" - "|--block=|--initrd=]...\n" - " vmlinux [args...]"); -} - -/*L:105 The main routine is where the real work begins: */ -int main(int argc, char *argv[]) -{ - /* Memory, code startpoint and size of the (optional) initrd. */ - unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries. */ - int i, c; - /* The boot information for the Guest. */ - struct boot_params *boot; - /* If they specify an initrd file to load. 
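rng_input() above loops because reads from /dev/urandom can
legitimately be short. The same retry pattern against a flat buffer
instead of an iovec:

  #include <err.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          unsigned char buf[32];
          size_t got = 0;
          int fd = open("/dev/urandom", O_RDONLY);

          if (fd < 0)
                  err(1, "open /dev/urandom");
          /* Keep reading until the buffer is full; short reads happen. */
          while (got < sizeof(buf)) {
                  ssize_t len = read(fd, buf + got, sizeof(buf) - got);
                  if (len <= 0)
                          err(1, "read from /dev/urandom gave %zd", len);
                  got += len;
          }
          printf("read %zu random bytes\n", got);
          close(fd);
          return 0;
  }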
*/ - const char *initrd_name = NULL; - - /* Password structure for initgroups/setres[gu]id */ - struct passwd *user_details = NULL; - - /* Directory to chroot to */ - char *chroot_path = NULL; - - /* Save the args: we "reboot" by execing ourselves again. */ - main_args = argv; - - /* - * First we initialize the device list. We remember next interrupt - * number to use for devices (1: remember that 0 is used by the timer). - */ - devices.next_irq = 1; - - /* We're CPU 0. In fact, that's the only CPU possible right now. */ - cpu_id = 0; - - /* - * We need to know how much memory so we can set up the device - * descriptor and memory pages for the devices as we parse the command - * line. So we quickly look through the arguments to find the amount - * of memory now. - */ - for (i = 1; i < argc; i++) { - if (argv[i][0] != '-') { - mem = atoi(argv[i]) * 1024 * 1024; - /* - * We start by mapping anonymous pages over all of - * guest-physical memory range. This fills it with 0, - * and ensures that the Guest won't be killed when it - * tries to access it. - */ - guest_base = map_zeroed_pages(mem / getpagesize() - + DEVICE_PAGES); - guest_limit = mem; - guest_max = guest_mmio = mem + DEVICE_PAGES*getpagesize(); - break; - } - } - - /* If we exit via err(), this kills all the threads, restores tty. */ - atexit(cleanup_devices); - - /* We always have a console device, and it's always device 1. */ - setup_console(); - - /* The options are fairly straight-forward */ - while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { - switch (c) { - case 'v': - verbose = true; - break; - case 't': - setup_tun_net(optarg); - break; - case 'b': - setup_block_file(optarg); - break; - case 'r': - setup_rng(); - break; - case 'i': - initrd_name = optarg; - break; - case 'u': - user_details = getpwnam(optarg); - if (!user_details) - err(1, "getpwnam failed, incorrect username?"); - break; - case 'c': - chroot_path = optarg; - break; - default: - warnx("Unknown argument %s", argv[optind]); - usage(); - } - } - /* - * After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. - */ - if (optind + 2 > argc) - usage(); - - verbose("Guest base is at %p\n", guest_base); - - /* Initialize the (fake) PCI host bridge device. */ - init_pci_host_bridge(); - - /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); - - /* Boot information is stashed at physical address 0 */ - boot = from_guest_phys(0); - - /* Map the initrd image if requested (at top of physical memory) */ - if (initrd_name) { - initrd_size = load_initrd(initrd_name, mem); - /* - * These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. - */ - boot->hdr.ramdisk_image = mem - initrd_size; - boot->hdr.ramdisk_size = initrd_size; - /* The bootloader type 0xFF means "unknown"; that's OK. */ - boot->hdr.type_of_loader = 0xFF; - } - - /* - * The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. - */ - boot->e820_entries = 1; - boot->e820_table[0] = ((struct e820_entry) { 0, mem, E820_TYPE_RAM }); - /* - * The boot header contains a command line pointer: we put the command - * line after the boot header. - */ - boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); - /* We use a simple helper to copy the arguments separated by spaces. 
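The initrd placement above is plain arithmetic: load_initrd() puts the
image flush against the top of guest memory, and the boot header
records where. With made-up sizes:

  #include <stdio.h>

  int main(void)
  {
          unsigned long mem = 64UL << 20;        /* a 64 MB guest */
          unsigned long initrd_size = 4UL << 20; /* a 4 MB initrd */

          /* The image lands at mem - initrd_size. */
          printf("ramdisk_image = %#lx\n", mem - initrd_size);
          printf("ramdisk_size  = %#lx\n", initrd_size);
          return 0;
  }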
*/ - concat((char *)(boot + 1), argv+optind+2); - - /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */ - boot->hdr.kernel_alignment = 0x1000000; - - /* Boot protocol version: 2.07 supports the fields for lguest. */ - boot->hdr.version = 0x207; - - /* X86_SUBARCH_LGUEST tells the Guest it's an lguest. */ - boot->hdr.hardware_subarch = X86_SUBARCH_LGUEST; - - /* Tell the entry path not to try to reload segment registers. */ - boot->hdr.loadflags |= KEEP_SEGMENTS; - - /* We don't support tboot: */ - boot->tboot_addr = 0; - - /* Ensure this is 0 to prevent APM from loading: */ - boot->apm_bios_info.version = 0; - - /* We tell the kernel to initialize the Guest. */ - tell_kernel(start); - - /* Ensure that we terminate if a device-servicing child dies. */ - signal(SIGCHLD, kill_launcher); - - /* If requested, chroot to a directory */ - if (chroot_path) { - if (chroot(chroot_path) != 0) - err(1, "chroot(\"%s\") failed", chroot_path); - - if (chdir("/") != 0) - err(1, "chdir(\"/\") failed"); - - verbose("chroot done\n"); - } - - /* If requested, drop privileges */ - if (user_details) { - uid_t u; - gid_t g; - - u = user_details->pw_uid; - g = user_details->pw_gid; - - if (initgroups(user_details->pw_name, g) != 0) - err(1, "initgroups failed"); - - if (setresgid(g, g, g) != 0) - err(1, "setresgid failed"); - - if (setresuid(u, u, u) != 0) - err(1, "setresuid failed"); - - verbose("Dropping privileges completed\n"); - } - - /* Finally, run the Guest. This doesn't return. */ - run_guest(); -} -/*:*/ - -/*M:999 - * Mastery is done: you now know everything I do. - * - * But surely you have seen code, features and bugs in your wanderings which - * you now yearn to attack? That is the real game, and I look forward to you - * patching and forking lguest into the Your-Name-Here-visor. - * - * Farewell, and good coding! - * Rusty Russell. - */ diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt deleted file mode 100644 index 06e1f4649511..000000000000 --- a/tools/lguest/lguest.txt +++ /dev/null @@ -1,125 +0,0 @@ - __ - (___()'`; Rusty's Remarkably Unreliable Guide to Lguest - /, /` - or, A Young Coder's Illustrated Hypervisor - \\"--\\ http://lguest.ozlabs.org - -Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, -for Linux developers and users to experiment with virtualization with the -minimum of complexity. Nonetheless, it should have sufficient features to -make it useful for specific tasks, and, of course, you are encouraged to fork -and enhance it (see drivers/lguest/README). - -Features: - -- Kernel module which runs in a normal kernel. -- Simple I/O model for communication. -- Simple program to create new guests. -- Logo contains cute puppies: http://lguest.ozlabs.org - -Developer features: - -- Fun to hack on. -- No ABI: being tied to a specific kernel anyway, you can change anything. -- Many opportunities for improvement or feature implementation. - -Running Lguest: - -- The easiest way to run lguest is to use same kernel as guest and host. - You can configure them differently, but usually it's easiest not to. 
- - You will need to configure your kernel with the following options: - - "Processor type and features": - "Paravirtualized guest support" = Y - "Lguest guest support" = Y - "High Memory Support" = off/4GB - "Alignment value to which kernel should be aligned" = 0x100000 - (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and - CONFIG_PHYSICAL_ALIGN=0x100000) - - "Device Drivers": - "Block devices" - "Virtio block driver" = M/Y - "Network device support" - "Universal TUN/TAP device driver support" = M/Y - "Virtio network driver" = M/Y - (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m) - - "Virtualization" - "Linux hypervisor example code" = M/Y - (CONFIG_LGUEST=m) - -- A tool called "lguest" is available in this directory: type "make" - to build it. If you didn't build your kernel in-tree, use "make - O=". - -- Create or find a root disk image. There are several useful ones - around, such as the xm-test tiny root image at - http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img - - For more serious work, I usually use a distribution ISO image and - install it under qemu, then make multiple copies: - - dd if=/dev/zero of=rootfile bs=1M count=2048 - qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d - - Make sure that you install a getty on /dev/hvc0 if you want to log in on the - console! - -- "modprobe lg" if you built it as a module. - -- Run an lguest as root: - - tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ - --block=rootfile root=/dev/vda - - Explanation: - 64: the amount of memory to use, in MB. - - vmlinux: the kernel image found in the top of your build directory. You - can also use a standard bzImage. - - --tunnet=192.168.19.1: configures a "tap" device for networking with this - IP address. - - --block=rootfile: a file or block device which becomes /dev/vda - inside the guest. - - root=/dev/vda: this (and anything else on the command line) are - kernel boot parameters. - -- Configuring networking. I usually have the host masquerade, using - "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 > - /proc/sys/net/ipv4/ip_forward". In this example, I would configure - eth0 inside the guest at 192.168.19.2. - - Another method is to bridge the tap device to an external interface - using --tunnet=bridge:, and perhaps run dhcp on the guest - to obtain an IP address. The bridge needs to be configured first: - this option simply adds the tap interface to it. - - A simple example on my system: - - ifconfig eth0 0.0.0.0 - brctl addbr lg0 - ifconfig lg0 up - brctl addif lg0 eth0 - dhclient lg0 - - Then use --tunnet=bridge:lg0 when launching the guest. - - See: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge - - for general information on how to get bridging to work. - -- Random number generation. Using the --rng option will provide a - /dev/hwrng in the guest that will read from the host's /dev/random. - Use this option in conjunction with rng-tools (see ../hw_random.txt) - to provide entropy to the guest kernel's /dev/random. - -There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest - -Good luck! -Rusty Russell rusty@rustcorp.com.au. -- cgit v1.2.3 From c84f5683f6e9fee78e054431d89121225ccb7464 Mon Sep 17 00:00:00 2001 From: Priit Laes Date: Wed, 23 Aug 2017 20:23:29 +0300 Subject: clk: sunxi-ng: Add sun4i/sun7i CCU driver Introduce a clock controller driver for sun4i A10 and sun7i A20 series SoCs. 
Signed-off-by: Priit Laes Signed-off-by: Maxime Ripard --- drivers/clk/sunxi-ng/Kconfig | 13 + drivers/clk/sunxi-ng/Makefile | 1 + drivers/clk/sunxi-ng/ccu-sun4i-a10.c | 1456 +++++++++++++++++++++++++++++ drivers/clk/sunxi-ng/ccu-sun4i-a10.h | 61 ++ include/dt-bindings/clock/sun4i-a10-ccu.h | 200 ++++ include/dt-bindings/clock/sun7i-a20-ccu.h | 53 ++ include/dt-bindings/reset/sun4i-a10-ccu.h | 69 ++ 7 files changed, 1853 insertions(+) create mode 100644 drivers/clk/sunxi-ng/ccu-sun4i-a10.c create mode 100644 drivers/clk/sunxi-ng/ccu-sun4i-a10.h create mode 100644 include/dt-bindings/clock/sun4i-a10-ccu.h create mode 100644 include/dt-bindings/clock/sun7i-a20-ccu.h create mode 100644 include/dt-bindings/reset/sun4i-a10-ccu.h (limited to 'include') diff --git a/drivers/clk/sunxi-ng/Kconfig b/drivers/clk/sunxi-ng/Kconfig index 7a360737fe3c..6427d0ebe2de 100644 --- a/drivers/clk/sunxi-ng/Kconfig +++ b/drivers/clk/sunxi-ng/Kconfig @@ -11,6 +11,19 @@ config SUN50I_A64_CCU default ARM64 && ARCH_SUNXI depends on (ARM64 && ARCH_SUNXI) || COMPILE_TEST +config SUN4I_A10_CCU + bool "Support for the Allwinner A10/A20 CCU" + select SUNXI_CCU_DIV + select SUNXI_CCU_MULT + select SUNXI_CCU_NK + select SUNXI_CCU_NKM + select SUNXI_CCU_NM + select SUNXI_CCU_MP + select SUNXI_CCU_PHASE + default MACH_SUN4I + default MACH_SUN7I + depends on MACH_SUN4I || MACH_SUN7I || COMPILE_TEST + config SUN5I_CCU bool "Support for the Allwinner sun5i family CCM" default MACH_SUN5I diff --git a/drivers/clk/sunxi-ng/Makefile b/drivers/clk/sunxi-ng/Makefile index 58741cd647c3..1fd8fe9c1e3e 100644 --- a/drivers/clk/sunxi-ng/Makefile +++ b/drivers/clk/sunxi-ng/Makefile @@ -19,6 +19,7 @@ lib-$(CONFIG_SUNXI_CCU) += ccu_mp.o # SoC support obj-$(CONFIG_SUN50I_A64_CCU) += ccu-sun50i-a64.o +obj-$(CONFIG_SUN4I_A10_CCU) += ccu-sun4i-a10.o obj-$(CONFIG_SUN5I_CCU) += ccu-sun5i.o obj-$(CONFIG_SUN6I_A31_CCU) += ccu-sun6i-a31.o obj-$(CONFIG_SUN8I_A23_CCU) += ccu-sun8i-a23.o diff --git a/drivers/clk/sunxi-ng/ccu-sun4i-a10.c b/drivers/clk/sunxi-ng/ccu-sun4i-a10.c new file mode 100644 index 000000000000..286b0049b7b6 --- /dev/null +++ b/drivers/clk/sunxi-ng/ccu-sun4i-a10.c @@ -0,0 +1,1456 @@ +/* + * Copyright (c) 2017 Priit Laes . + * Copyright (c) 2017 Maxime Ripard. + * Copyright (c) 2017 Jonathan Liu. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#include "ccu_common.h" +#include "ccu_reset.h" + +#include "ccu_div.h" +#include "ccu_gate.h" +#include "ccu_mp.h" +#include "ccu_mult.h" +#include "ccu_nk.h" +#include "ccu_nkm.h" +#include "ccu_nkmp.h" +#include "ccu_nm.h" +#include "ccu_phase.h" + +#include "ccu-sun4i-a10.h" + +static struct ccu_nkmp pll_core_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 5, 0), + .k = _SUNXI_CCU_MULT(4, 2), + .m = _SUNXI_CCU_DIV(0, 2), + .p = _SUNXI_CCU_DIV(16, 2), + .common = { + .reg = 0x000, + .hw.init = CLK_HW_INIT("pll-core", + "hosc", + &ccu_nkmp_ops, + 0), + }, +}; + +/* + * The Audio PLL is supposed to have 4 outputs: 3 fixed factors from + * the base (2x, 4x and 8x), and one variable divider (the one true + * pll audio). 
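pll-core above is an NKMP clock. The relation implemented by
ccu_nkmp_ops is rate = parent * N * K / (M * P); the factor values
below are made up for illustration, not read from real A10 registers:

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t nkmp_rate(uint64_t parent, unsigned n, unsigned k,
                            unsigned m, unsigned p)
  {
          return parent * n * k / (m * p);
  }

  int main(void)
  {
          /* 24 MHz hosc with N=32, K=1, M=1, P=1 -> 768 MHz. */
          printf("%llu Hz\n",
                 (unsigned long long)nkmp_rate(24000000, 32, 1, 1, 1));
          return 0;
  }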
+ * + * We don't have any need for the variable divider for now, so we just + * hardcode it to match with the clock names. + */ +#define SUN4I_PLL_AUDIO_REG 0x008 +static struct ccu_nm pll_audio_base_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 7, 0), + .m = _SUNXI_CCU_DIV_OFFSET(0, 5, 0), + .common = { + .reg = 0x008, + .hw.init = CLK_HW_INIT("pll-audio-base", + "hosc", + &ccu_nm_ops, + 0), + }, + +}; + +static struct ccu_mult pll_video0_clk = { + .enable = BIT(31), + .mult = _SUNXI_CCU_MULT_OFFSET_MIN_MAX(0, 7, 0, 9, 127), + .frac = _SUNXI_CCU_FRAC(BIT(15), BIT(14), + 270000000, 297000000), + .common = { + .reg = 0x010, + .features = (CCU_FEATURE_FRACTIONAL | + CCU_FEATURE_ALL_PREDIV), + .prediv = 8, + .hw.init = CLK_HW_INIT("pll-video0", + "hosc", + &ccu_mult_ops, + 0), + }, +}; + +static struct ccu_nkmp pll_ve_sun4i_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 5, 0), + .k = _SUNXI_CCU_MULT(4, 2), + .m = _SUNXI_CCU_DIV(0, 2), + .p = _SUNXI_CCU_DIV(16, 2), + .common = { + .reg = 0x018, + .hw.init = CLK_HW_INIT("pll-ve", + "hosc", + &ccu_nkmp_ops, + 0), + }, +}; + +static struct ccu_nk pll_ve_sun7i_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 5, 0), + .k = _SUNXI_CCU_MULT(4, 2), + .common = { + .reg = 0x018, + .hw.init = CLK_HW_INIT("pll-ve", + "hosc", + &ccu_nk_ops, + 0), + }, +}; + +static struct ccu_nk pll_ddr_base_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 5, 0), + .k = _SUNXI_CCU_MULT(4, 2), + .common = { + .reg = 0x020, + .hw.init = CLK_HW_INIT("pll-ddr-base", + "hosc", + &ccu_nk_ops, + 0), + }, +}; + +static SUNXI_CCU_M(pll_ddr_clk, "pll-ddr", "pll-ddr-base", 0x020, 0, 2, + CLK_IS_CRITICAL); + +static struct ccu_div pll_ddr_other_clk = { + .div = _SUNXI_CCU_DIV_FLAGS(16, 2, CLK_DIVIDER_POWER_OF_TWO), + .common = { + .reg = 0x020, + .hw.init = CLK_HW_INIT("pll-ddr-other", "pll-ddr-base", + &ccu_div_ops, + 0), + }, +}; + +static struct ccu_nk pll_periph_base_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 5, 0), + .k = _SUNXI_CCU_MULT(4, 2), + .common = { + .reg = 0x028, + .hw.init = CLK_HW_INIT("pll-periph-base", + "hosc", + &ccu_nk_ops, + 0), + }, +}; + +static CLK_FIXED_FACTOR(pll_periph_clk, "pll-periph", "pll-periph-base", + 2, 1, CLK_SET_RATE_PARENT); + +/* Not documented on A10 */ +static struct ccu_div pll_periph_sata_clk = { + .enable = BIT(14), + .div = _SUNXI_CCU_DIV(0, 2), + .fixed_post_div = 6, + .common = { + .reg = 0x028, + .features = CCU_FEATURE_FIXED_POSTDIV, + .hw.init = CLK_HW_INIT("pll-periph-sata", + "pll-periph-base", + &ccu_div_ops, 0), + }, +}; + +static struct ccu_mult pll_video1_clk = { + .enable = BIT(31), + .mult = _SUNXI_CCU_MULT_OFFSET_MIN_MAX(0, 7, 0, 9, 127), + .frac = _SUNXI_CCU_FRAC(BIT(15), BIT(14), + 270000000, 297000000), + .common = { + .reg = 0x030, + .features = (CCU_FEATURE_FRACTIONAL | + CCU_FEATURE_ALL_PREDIV), + .prediv = 8, + .hw.init = CLK_HW_INIT("pll-video1", + "hosc", + &ccu_mult_ops, + 0), + }, +}; + +/* Not present on A10 */ +static struct ccu_nk pll_gpu_clk = { + .enable = BIT(31), + .n = _SUNXI_CCU_MULT_OFFSET(8, 5, 0), + .k = _SUNXI_CCU_MULT(4, 2), + .common = { + .reg = 0x040, + .hw.init = CLK_HW_INIT("pll-gpu", + "hosc", + &ccu_nk_ops, + 0), + }, +}; + +static SUNXI_CCU_GATE(hosc_clk, "hosc", "osc24M", 0x050, BIT(0), 0); + +static const char *const cpu_parents[] = { "osc32k", "hosc", + "pll-core", "pll-periph" }; +static const struct ccu_mux_fixed_prediv cpu_predivs[] = { + { .index = 3, .div = 3, }, +}; + +#define SUN4I_AHB_REG 0x054 +static 
struct ccu_mux cpu_clk = { + .mux = { + .shift = 16, + .width = 2, + .fixed_predivs = cpu_predivs, + .n_predivs = ARRAY_SIZE(cpu_predivs), + }, + .common = { + .reg = 0x054, + .features = CCU_FEATURE_FIXED_PREDIV, + .hw.init = CLK_HW_INIT_PARENTS("cpu", + cpu_parents, + &ccu_mux_ops, + CLK_IS_CRITICAL), + } +}; + +static SUNXI_CCU_M(axi_clk, "axi", "cpu", 0x054, 0, 2, 0); + +static struct ccu_div ahb_sun4i_clk = { + .div = _SUNXI_CCU_DIV_FLAGS(4, 2, CLK_DIVIDER_POWER_OF_TWO), + .common = { + .reg = 0x054, + .hw.init = CLK_HW_INIT("ahb", "axi", &ccu_div_ops, 0), + }, +}; + +static const char *const ahb_sun7i_parents[] = { "axi", "pll-periph", + "pll-periph" }; +static const struct ccu_mux_fixed_prediv ahb_sun7i_predivs[] = { + { .index = 1, .div = 2, }, + { /* Sentinel */ }, +}; +static struct ccu_div ahb_sun7i_clk = { + .div = _SUNXI_CCU_DIV_FLAGS(4, 2, CLK_DIVIDER_POWER_OF_TWO), + .mux = { + .shift = 6, + .width = 2, + .fixed_predivs = ahb_sun7i_predivs, + .n_predivs = ARRAY_SIZE(ahb_sun7i_predivs), + }, + + .common = { + .reg = 0x054, + .hw.init = CLK_HW_INIT_PARENTS("ahb", + ahb_sun7i_parents, + &ccu_div_ops, + 0), + }, +}; + +static struct clk_div_table apb0_div_table[] = { + { .val = 0, .div = 2 }, + { .val = 1, .div = 2 }, + { .val = 2, .div = 4 }, + { .val = 3, .div = 8 }, + { /* Sentinel */ }, +}; +static SUNXI_CCU_DIV_TABLE(apb0_clk, "apb0", "ahb", + 0x054, 8, 2, apb0_div_table, 0); + +static const char *const apb1_parents[] = { "hosc", "pll-periph", "osc32k" }; +static SUNXI_CCU_MP_WITH_MUX(apb1_clk, "apb1", apb1_parents, 0x058, + 0, 5, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + 0); + +/* Not present on A20 */ +static SUNXI_CCU_GATE(axi_dram_clk, "axi-dram", "ahb", + 0x05c, BIT(31), 0); + +static SUNXI_CCU_GATE(ahb_otg_clk, "ahb-otg", "ahb", + 0x060, BIT(0), 0); +static SUNXI_CCU_GATE(ahb_ehci0_clk, "ahb-ehci0", "ahb", + 0x060, BIT(1), 0); +static SUNXI_CCU_GATE(ahb_ohci0_clk, "ahb-ohci0", "ahb", + 0x060, BIT(2), 0); +static SUNXI_CCU_GATE(ahb_ehci1_clk, "ahb-ehci1", "ahb", + 0x060, BIT(3), 0); +static SUNXI_CCU_GATE(ahb_ohci1_clk, "ahb-ohci1", "ahb", + 0x060, BIT(4), 0); +static SUNXI_CCU_GATE(ahb_ss_clk, "ahb-ss", "ahb", + 0x060, BIT(5), 0); +static SUNXI_CCU_GATE(ahb_dma_clk, "ahb-dma", "ahb", + 0x060, BIT(6), 0); +static SUNXI_CCU_GATE(ahb_bist_clk, "ahb-bist", "ahb", + 0x060, BIT(7), 0); +static SUNXI_CCU_GATE(ahb_mmc0_clk, "ahb-mmc0", "ahb", + 0x060, BIT(8), 0); +static SUNXI_CCU_GATE(ahb_mmc1_clk, "ahb-mmc1", "ahb", + 0x060, BIT(9), 0); +static SUNXI_CCU_GATE(ahb_mmc2_clk, "ahb-mmc2", "ahb", + 0x060, BIT(10), 0); +static SUNXI_CCU_GATE(ahb_mmc3_clk, "ahb-mmc3", "ahb", + 0x060, BIT(11), 0); +static SUNXI_CCU_GATE(ahb_ms_clk, "ahb-ms", "ahb", + 0x060, BIT(12), 0); +static SUNXI_CCU_GATE(ahb_nand_clk, "ahb-nand", "ahb", + 0x060, BIT(13), 0); +static SUNXI_CCU_GATE(ahb_sdram_clk, "ahb-sdram", "ahb", + 0x060, BIT(14), CLK_IS_CRITICAL); + +static SUNXI_CCU_GATE(ahb_ace_clk, "ahb-ace", "ahb", + 0x060, BIT(16), 0); +static SUNXI_CCU_GATE(ahb_emac_clk, "ahb-emac", "ahb", + 0x060, BIT(17), 0); +static SUNXI_CCU_GATE(ahb_ts_clk, "ahb-ts", "ahb", + 0x060, BIT(18), 0); +static SUNXI_CCU_GATE(ahb_spi0_clk, "ahb-spi0", "ahb", + 0x060, BIT(20), 0); +static SUNXI_CCU_GATE(ahb_spi1_clk, "ahb-spi1", "ahb", + 0x060, BIT(21), 0); +static SUNXI_CCU_GATE(ahb_spi2_clk, "ahb-spi2", "ahb", + 0x060, BIT(22), 0); +static SUNXI_CCU_GATE(ahb_spi3_clk, "ahb-spi3", "ahb", + 0x060, BIT(23), 0); +static SUNXI_CCU_GATE(ahb_pata_clk, "ahb-pata", "ahb", + 0x060, BIT(24), 0); +/* Not documented on A20 */ 
+static SUNXI_CCU_GATE(ahb_sata_clk, "ahb-sata", "ahb", + 0x060, BIT(25), 0); +/* Not present on A20 */ +static SUNXI_CCU_GATE(ahb_gps_clk, "ahb-gps", "ahb", + 0x060, BIT(26), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(ahb_hstimer_clk, "ahb-hstimer", "ahb", + 0x060, BIT(28), 0); + +static SUNXI_CCU_GATE(ahb_ve_clk, "ahb-ve", "ahb", + 0x064, BIT(0), 0); +static SUNXI_CCU_GATE(ahb_tvd_clk, "ahb-tvd", "ahb", + 0x064, BIT(1), 0); +static SUNXI_CCU_GATE(ahb_tve0_clk, "ahb-tve0", "ahb", + 0x064, BIT(2), 0); +static SUNXI_CCU_GATE(ahb_tve1_clk, "ahb-tve1", "ahb", + 0x064, BIT(3), 0); +static SUNXI_CCU_GATE(ahb_lcd0_clk, "ahb-lcd0", "ahb", + 0x064, BIT(4), 0); +static SUNXI_CCU_GATE(ahb_lcd1_clk, "ahb-lcd1", "ahb", + 0x064, BIT(5), 0); +static SUNXI_CCU_GATE(ahb_csi0_clk, "ahb-csi0", "ahb", + 0x064, BIT(8), 0); +static SUNXI_CCU_GATE(ahb_csi1_clk, "ahb-csi1", "ahb", + 0x064, BIT(9), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(ahb_hdmi1_clk, "ahb-hdmi1", "ahb", + 0x064, BIT(10), 0); +static SUNXI_CCU_GATE(ahb_hdmi0_clk, "ahb-hdmi0", "ahb", + 0x064, BIT(11), 0); +static SUNXI_CCU_GATE(ahb_de_be0_clk, "ahb-de-be0", "ahb", + 0x064, BIT(12), 0); +static SUNXI_CCU_GATE(ahb_de_be1_clk, "ahb-de-be1", "ahb", + 0x064, BIT(13), 0); +static SUNXI_CCU_GATE(ahb_de_fe0_clk, "ahb-de-fe0", "ahb", + 0x064, BIT(14), 0); +static SUNXI_CCU_GATE(ahb_de_fe1_clk, "ahb-de-fe1", "ahb", + 0x064, BIT(15), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(ahb_gmac_clk, "ahb-gmac", "ahb", + 0x064, BIT(17), 0); +static SUNXI_CCU_GATE(ahb_mp_clk, "ahb-mp", "ahb", + 0x064, BIT(18), 0); +static SUNXI_CCU_GATE(ahb_gpu_clk, "ahb-gpu", "ahb", + 0x064, BIT(20), 0); + +static SUNXI_CCU_GATE(apb0_codec_clk, "apb0-codec", "apb0", + 0x068, BIT(0), 0); +static SUNXI_CCU_GATE(apb0_spdif_clk, "apb0-spdif", "apb0", + 0x068, BIT(1), 0); +static SUNXI_CCU_GATE(apb0_ac97_clk, "apb0-ac97", "apb0", + 0x068, BIT(2), 0); +static SUNXI_CCU_GATE(apb0_i2s0_clk, "apb0-i2s0", "apb0", + 0x068, BIT(3), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(apb0_i2s1_clk, "apb0-i2s1", "apb0", + 0x068, BIT(4), 0); +static SUNXI_CCU_GATE(apb0_pio_clk, "apb0-pio", "apb0", + 0x068, BIT(5), 0); +static SUNXI_CCU_GATE(apb0_ir0_clk, "apb0-ir0", "apb0", + 0x068, BIT(6), 0); +static SUNXI_CCU_GATE(apb0_ir1_clk, "apb0-ir1", "apb0", + 0x068, BIT(7), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(apb0_i2s2_clk, "apb0-i2s2", "apb0", + 0x068, BIT(8), 0); +static SUNXI_CCU_GATE(apb0_keypad_clk, "apb0-keypad", "apb0", + 0x068, BIT(10), 0); + +static SUNXI_CCU_GATE(apb1_i2c0_clk, "apb1-i2c0", "apb1", + 0x06c, BIT(0), 0); +static SUNXI_CCU_GATE(apb1_i2c1_clk, "apb1-i2c1", "apb1", + 0x06c, BIT(1), 0); +static SUNXI_CCU_GATE(apb1_i2c2_clk, "apb1-i2c2", "apb1", + 0x06c, BIT(2), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(apb1_i2c3_clk, "apb1-i2c3", "apb1", + 0x06c, BIT(3), 0); +static SUNXI_CCU_GATE(apb1_can_clk, "apb1-can", "apb1", + 0x06c, BIT(4), 0); +static SUNXI_CCU_GATE(apb1_scr_clk, "apb1-scr", "apb1", + 0x06c, BIT(5), 0); +static SUNXI_CCU_GATE(apb1_ps20_clk, "apb1-ps20", "apb1", + 0x06c, BIT(6), 0); +static SUNXI_CCU_GATE(apb1_ps21_clk, "apb1-ps21", "apb1", + 0x06c, BIT(7), 0); +/* Not present on A10 */ +static SUNXI_CCU_GATE(apb1_i2c4_clk, "apb1-i2c4", "apb1", + 0x06c, BIT(15), 0); +static SUNXI_CCU_GATE(apb1_uart0_clk, "apb1-uart0", "apb1", + 0x06c, BIT(16), 0); +static SUNXI_CCU_GATE(apb1_uart1_clk, "apb1-uart1", "apb1", + 0x06c, BIT(17), 0); +static SUNXI_CCU_GATE(apb1_uart2_clk, "apb1-uart2", "apb1", + 0x06c, BIT(18), 0); +static 
SUNXI_CCU_GATE(apb1_uart3_clk, "apb1-uart3", "apb1", + 0x06c, BIT(19), 0); +static SUNXI_CCU_GATE(apb1_uart4_clk, "apb1-uart4", "apb1", + 0x06c, BIT(20), 0); +static SUNXI_CCU_GATE(apb1_uart5_clk, "apb1-uart5", "apb1", + 0x06c, BIT(21), 0); +static SUNXI_CCU_GATE(apb1_uart6_clk, "apb1-uart6", "apb1", + 0x06c, BIT(22), 0); +static SUNXI_CCU_GATE(apb1_uart7_clk, "apb1-uart7", "apb1", + 0x06c, BIT(23), 0); + +static const char *const mod0_default_parents[] = { "hosc", "pll-periph", + "pll-ddr-other" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(nand_clk, "nand", mod0_default_parents, 0x080, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* Undocumented on A10 */ +static SUNXI_CCU_MP_WITH_MUX_GATE(ms_clk, "ms", mod0_default_parents, 0x084, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc0_clk, "mmc0", mod0_default_parents, 0x088, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* MMC output and sample clocks are not present on A10 */ +static SUNXI_CCU_PHASE(mmc0_output_clk, "mmc0_output", "mmc0", + 0x088, 8, 3, 0); +static SUNXI_CCU_PHASE(mmc0_sample_clk, "mmc0_sample", "mmc0", + 0x088, 20, 3, 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc1_clk, "mmc1", mod0_default_parents, 0x08c, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* MMC output and sample clocks are not present on A10 */ +static SUNXI_CCU_PHASE(mmc1_output_clk, "mmc1_output", "mmc1", + 0x08c, 8, 3, 0); +static SUNXI_CCU_PHASE(mmc1_sample_clk, "mmc1_sample", "mmc1", + 0x08c, 20, 3, 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc2_clk, "mmc2", mod0_default_parents, 0x090, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* MMC output and sample clocks are not present on A10 */ +static SUNXI_CCU_PHASE(mmc2_output_clk, "mmc2_output", "mmc2", + 0x090, 8, 3, 0); +static SUNXI_CCU_PHASE(mmc2_sample_clk, "mmc2_sample", "mmc2", + 0x090, 20, 3, 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(mmc3_clk, "mmc3", mod0_default_parents, 0x094, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* MMC output and sample clocks are not present on A10 */ +static SUNXI_CCU_PHASE(mmc3_output_clk, "mmc3_output", "mmc3", + 0x094, 8, 3, 0); +static SUNXI_CCU_PHASE(mmc3_sample_clk, "mmc3_sample", "mmc3", + 0x094, 20, 3, 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(ts_clk, "ts", mod0_default_parents, 0x098, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(ss_clk, "ss", mod0_default_parents, 0x09c, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi0_clk, "spi0", mod0_default_parents, 0x0a0, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi1_clk, "spi1", mod0_default_parents, 0x0a4, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi2_clk, "spi2", mod0_default_parents, 0x0a8, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* Undocumented on A10 */ +static SUNXI_CCU_MP_WITH_MUX_GATE(pata_clk, "pata", mod0_default_parents, 0x0ac, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* TODO: Check whether A10 actually supports osc32k as 4th parent? 
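The mod0-style clocks above (nand, mmc, spi and friends) all share one
divider shape, rate = parent / 2^P / (M + 1), where M and P are the raw
register fields handled by ccu_mp_ops. A worked example with made-up
field values:

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t mp_rate(uint64_t parent, unsigned m, unsigned p)
  {
          return parent / (1ULL << p) / (m + 1);
  }

  int main(void)
  {
          /* 600 MHz pll-periph with P=1, M=5 -> 50 MHz for mmc. */
          printf("%llu Hz\n",
                 (unsigned long long)mp_rate(600000000, 5, 1));
          return 0;
  }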
*/ +static const char *const ir_parents_sun4i[] = { "hosc", "pll-periph", + "pll-ddr-other" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(ir0_sun4i_clk, "ir0", ir_parents_sun4i, 0x0b0, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(ir1_sun4i_clk, "ir1", ir_parents_sun4i, 0x0b4, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); +static const char *const ir_parents_sun7i[] = { "hosc", "pll-periph", + "pll-ddr-other", "osc32k" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(ir0_sun7i_clk, "ir0", ir_parents_sun7i, 0x0b0, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static SUNXI_CCU_MP_WITH_MUX_GATE(ir1_sun7i_clk, "ir1", ir_parents_sun7i, 0x0b4, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +static const char *const audio_parents[] = { "pll-audio-8x", "pll-audio-4x", + "pll-audio-2x", "pll-audio" }; +static SUNXI_CCU_MUX_WITH_GATE(i2s0_clk, "i2s0", audio_parents, + 0x0b8, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_MUX_WITH_GATE(ac97_clk, "ac97", audio_parents, + 0x0bc, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +/* Undocumented on A10 */ +static SUNXI_CCU_MUX_WITH_GATE(spdif_clk, "spdif", audio_parents, + 0x0c0, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static const char *const keypad_parents[] = { "hosc", "losc"}; +static const u8 keypad_table[] = { 0, 2 }; +static struct ccu_mp keypad_clk = { + .enable = BIT(31), + .m = _SUNXI_CCU_DIV(0, 5), + .p = _SUNXI_CCU_DIV(16, 2), + .mux = _SUNXI_CCU_MUX_TABLE(24, 2, keypad_table), + .common = { + .reg = 0x0c4, + .hw.init = CLK_HW_INIT_PARENTS("keypad", + keypad_parents, + &ccu_mp_ops, + 0), + }, +}; + +/* + * SATA supports external clock as parent via BIT(24) and is probably an + * optional crystal or oscillator that can be connected to the + * SATA-CLKM / SATA-CLKP pins. 
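keypad_clk above needs a mux table because its register encodes the two
parents as field values 0 and 2 rather than 0 and 1; the table is just
an index-to-field lookup:

  #include <stdio.h>

  /* Parent 0 ("hosc") selects field value 0, parent 1 ("losc")
   * selects field value 2, matching keypad_table[] above. */
  static const unsigned char keypad_table[] = { 0, 2 };

  int main(void)
  {
          unsigned int i;

          for (i = 0; i < sizeof(keypad_table); i++)
                  printf("parent %u -> mux field %u\n",
                         i, keypad_table[i]);
          return 0;
  }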
+ */ +static const char *const sata_parents[] = {"pll-periph-sata", "sata-ext"}; +static SUNXI_CCU_MUX_WITH_GATE(sata_clk, "sata", sata_parents, + 0x0c8, 24, 1, BIT(31), CLK_SET_RATE_PARENT); + + +static SUNXI_CCU_GATE(usb_ohci0_clk, "usb-ohci0", "pll-periph", + 0x0cc, BIT(6), 0); +static SUNXI_CCU_GATE(usb_ohci1_clk, "usb-ohci1", "pll-periph", + 0x0cc, BIT(7), 0); +static SUNXI_CCU_GATE(usb_phy_clk, "usb-phy", "pll-periph", + 0x0cc, BIT(8), 0); + +/* TODO: GPS CLK 0x0d0 */ + +static SUNXI_CCU_MP_WITH_MUX_GATE(spi3_clk, "spi3", mod0_default_parents, 0x0d4, + 0, 4, /* M */ + 16, 2, /* P */ + 24, 2, /* mux */ + BIT(31), /* gate */ + 0); + +/* Not present on A10 */ +static SUNXI_CCU_MUX_WITH_GATE(i2s1_clk, "i2s1", audio_parents, + 0x0d8, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +/* Not present on A10 */ +static SUNXI_CCU_MUX_WITH_GATE(i2s2_clk, "i2s2", audio_parents, + 0x0dc, 16, 2, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_GATE(dram_ve_clk, "dram-ve", "pll-ddr", + 0x100, BIT(0), 0); +static SUNXI_CCU_GATE(dram_csi0_clk, "dram-csi0", "pll-ddr", + 0x100, BIT(1), 0); +static SUNXI_CCU_GATE(dram_csi1_clk, "dram-csi1", "pll-ddr", + 0x100, BIT(2), 0); +static SUNXI_CCU_GATE(dram_ts_clk, "dram-ts", "pll-ddr", + 0x100, BIT(3), 0); +static SUNXI_CCU_GATE(dram_tvd_clk, "dram-tvd", "pll-ddr", + 0x100, BIT(4), 0); +static SUNXI_CCU_GATE(dram_tve0_clk, "dram-tve0", "pll-ddr", + 0x100, BIT(5), 0); +static SUNXI_CCU_GATE(dram_tve1_clk, "dram-tve1", "pll-ddr", + 0x100, BIT(6), 0); + +/* Clock seems to be critical only on sun4i */ +static SUNXI_CCU_GATE(dram_out_clk, "dram-out", "pll-ddr", + 0x100, BIT(15), CLK_IS_CRITICAL); +static SUNXI_CCU_GATE(dram_de_fe1_clk, "dram-de-fe1", "pll-ddr", + 0x100, BIT(24), 0); +static SUNXI_CCU_GATE(dram_de_fe0_clk, "dram-de-fe0", "pll-ddr", + 0x100, BIT(25), 0); +static SUNXI_CCU_GATE(dram_de_be0_clk, "dram-de-be0", "pll-ddr", + 0x100, BIT(26), 0); +static SUNXI_CCU_GATE(dram_de_be1_clk, "dram-de-be1", "pll-ddr", + 0x100, BIT(27), 0); +static SUNXI_CCU_GATE(dram_mp_clk, "dram-mp", "pll-ddr", + 0x100, BIT(28), 0); +static SUNXI_CCU_GATE(dram_ace_clk, "dram-ace", "pll-ddr", + 0x100, BIT(29), 0); + +static const char *const de_parents[] = { "pll-video0", "pll-video1", + "pll-ddr-other" }; +static SUNXI_CCU_M_WITH_MUX_GATE(de_be0_clk, "de-be0", de_parents, + 0x104, 0, 4, 24, 2, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(de_be1_clk, "de-be1", de_parents, + 0x108, 0, 4, 24, 2, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(de_fe0_clk, "de-fe0", de_parents, + 0x10c, 0, 4, 24, 2, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(de_fe1_clk, "de-fe1", de_parents, + 0x110, 0, 4, 24, 2, BIT(31), 0); + +/* Undocumented on A10 */ +static SUNXI_CCU_M_WITH_MUX_GATE(de_mp_clk, "de-mp", de_parents, + 0x114, 0, 4, 24, 2, BIT(31), 0); + +static const char *const disp_parents[] = { "pll-video0", "pll-video1", + "pll-video0-2x", "pll-video1-2x" }; +static SUNXI_CCU_MUX_WITH_GATE(tcon0_ch0_clk, "tcon0-ch0-sclk", disp_parents, + 0x118, 24, 2, BIT(31), CLK_SET_RATE_PARENT); +static SUNXI_CCU_MUX_WITH_GATE(tcon1_ch0_clk, "tcon1-ch0-sclk", disp_parents, + 0x11c, 24, 2, BIT(31), CLK_SET_RATE_PARENT); + +static const char *const csi_sclk_parents[] = { "pll-video0", "pll-ve", + "pll-ddr-other", "pll-periph" }; + +static SUNXI_CCU_M_WITH_MUX_GATE(csi_sclk_clk, "csi-sclk", + csi_sclk_parents, + 0x120, 0, 4, 24, 2, BIT(31), 0); + +/* TVD clock setup for A10 */ +static const char *const tvd_parents[] = { "pll-video0", "pll-video1" }; +static SUNXI_CCU_MUX_WITH_GATE(tvd_sun4i_clk, "tvd", 
tvd_parents, + 0x128, 24, 1, BIT(31), 0); + +/* TVD clock setup for A20 */ +static SUNXI_CCU_MP_WITH_MUX_GATE(tvd_sclk2_sun7i_clk, + "tvd-sclk2", tvd_parents, + 0x128, + 0, 4, /* M */ + 16, 4, /* P */ + 8, 1, /* mux */ + BIT(15), /* gate */ + 0); + +static SUNXI_CCU_M_WITH_GATE(tvd_sclk1_sun7i_clk, "tvd-sclk1", "tvd-sclk2", + 0x128, 0, 4, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(tcon0_ch1_sclk2_clk, "tcon0-ch1-sclk2", + disp_parents, + 0x12c, 0, 4, 24, 2, BIT(31), + CLK_SET_RATE_PARENT); + +static SUNXI_CCU_M_WITH_GATE(tcon0_ch1_clk, + "tcon0-ch1-sclk1", "tcon0-ch1-sclk2", + 0x12c, 11, 1, BIT(15), + CLK_SET_RATE_PARENT); + +static SUNXI_CCU_M_WITH_MUX_GATE(tcon1_ch1_sclk2_clk, "tcon1-ch1-sclk2", + disp_parents, + 0x130, 0, 4, 24, 2, BIT(31), + CLK_SET_RATE_PARENT); + +static SUNXI_CCU_M_WITH_GATE(tcon1_ch1_clk, + "tcon1-ch1-sclk1", "tcon1-ch1-sclk2", + 0x130, 11, 1, BIT(15), + CLK_SET_RATE_PARENT); + +static const char *const csi_parents[] = { "hosc", "pll-video0", "pll-video1", + "pll-video0-2x", "pll-video1-2x"}; +static const u8 csi_table[] = { 0, 1, 2, 5, 6}; +static SUNXI_CCU_M_WITH_MUX_TABLE_GATE(csi0_clk, "csi0", + csi_parents, csi_table, + 0x134, 0, 5, 24, 3, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_TABLE_GATE(csi1_clk, "csi1", + csi_parents, csi_table, + 0x138, 0, 5, 24, 3, BIT(31), 0); + +static SUNXI_CCU_M_WITH_GATE(ve_clk, "ve", "pll-ve", 0x13c, 16, 8, BIT(31), 0); + +static SUNXI_CCU_GATE(codec_clk, "codec", "pll-audio", + 0x140, BIT(31), CLK_SET_RATE_PARENT); + +static SUNXI_CCU_GATE(avs_clk, "avs", "hosc", 0x144, BIT(31), 0); + +static const char *const ace_parents[] = { "pll-ve", "pll-ddr-other" }; +static SUNXI_CCU_M_WITH_MUX_GATE(ace_clk, "ace", ace_parents, + 0x148, 0, 4, 24, 1, BIT(31), 0); + +static SUNXI_CCU_M_WITH_MUX_GATE(hdmi_clk, "hdmi", disp_parents, + 0x150, 0, 4, 24, 2, BIT(31), + CLK_SET_RATE_PARENT); + +static const char *const gpu_parents_sun4i[] = { "pll-video0", "pll-ve", + "pll-ddr-other", + "pll-video1" }; +static SUNXI_CCU_M_WITH_MUX_GATE(gpu_sun4i_clk, "gpu", gpu_parents_sun4i, + 0x154, 0, 4, 24, 2, BIT(31), + CLK_SET_RATE_PARENT); + +static const char *const gpu_parents_sun7i[] = { "pll-video0", "pll-ve", + "pll-ddr-other", "pll-video1", + "pll-gpu" }; +static const u8 gpu_table_sun7i[] = { 0, 1, 2, 3, 4 }; +static SUNXI_CCU_M_WITH_MUX_TABLE_GATE(gpu_sun7i_clk, "gpu", + gpu_parents_sun7i, gpu_table_sun7i, + 0x154, 0, 4, 24, 3, BIT(31), + CLK_SET_RATE_PARENT); + +static const char *const mbus_sun4i_parents[] = { "hosc", "pll-periph", + "pll-ddr-other" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(mbus_sun4i_clk, "mbus", mbus_sun4i_parents, + 0x15c, 0, 4, 16, 2, 24, 2, BIT(31), + 0); +static const char *const mbus_sun7i_parents[] = { "hosc", "pll-periph-base", + "pll-ddr-other" }; +static SUNXI_CCU_MP_WITH_MUX_GATE(mbus_sun7i_clk, "mbus", mbus_sun7i_parents, + 0x15c, 0, 4, 16, 2, 24, 2, BIT(31), + CLK_IS_CRITICAL); + +static SUNXI_CCU_GATE(hdmi1_slow_clk, "hdmi1-slow", "hosc", 0x178, BIT(31), 0); + +static const char *const hdmi1_parents[] = { "pll-video0", "pll-video1" }; +static const u8 hdmi1_table[] = { 0, 1}; +static SUNXI_CCU_M_WITH_MUX_TABLE_GATE(hdmi1_clk, "hdmi1", + hdmi1_parents, hdmi1_table, + 0x17c, 0, 4, 24, 2, BIT(31), + CLK_SET_RATE_PARENT); + +static const char *const out_parents[] = { "hosc", "osc32k", "hosc" }; +static const struct ccu_mux_fixed_prediv clk_out_predivs[] = { + { .index = 0, .div = 750, }, +}; + +static struct ccu_mp out_a_clk = { + .enable = BIT(31), + .m = _SUNXI_CCU_DIV(8, 5), + .p = _SUNXI_CCU_DIV(20, 2), + .mux = 
{ + .shift = 24, + .width = 2, + .fixed_predivs = clk_out_predivs, + .n_predivs = ARRAY_SIZE(clk_out_predivs), + }, + .common = { + .reg = 0x1f0, + .features = CCU_FEATURE_FIXED_PREDIV, + .hw.init = CLK_HW_INIT_PARENTS("out-a", + out_parents, + &ccu_mp_ops, + 0), + }, +}; +static struct ccu_mp out_b_clk = { + .enable = BIT(31), + .m = _SUNXI_CCU_DIV(8, 5), + .p = _SUNXI_CCU_DIV(20, 2), + .mux = { + .shift = 24, + .width = 2, + .fixed_predivs = clk_out_predivs, + .n_predivs = ARRAY_SIZE(clk_out_predivs), + }, + .common = { + .reg = 0x1f4, + .features = CCU_FEATURE_FIXED_PREDIV, + .hw.init = CLK_HW_INIT_PARENTS("out-b", + out_parents, + &ccu_mp_ops, + 0), + }, +}; + +static struct ccu_common *sun4i_sun7i_ccu_clks[] = { + &hosc_clk.common, + &pll_core_clk.common, + &pll_audio_base_clk.common, + &pll_video0_clk.common, + &pll_ve_sun4i_clk.common, + &pll_ve_sun7i_clk.common, + &pll_ddr_base_clk.common, + &pll_ddr_clk.common, + &pll_ddr_other_clk.common, + &pll_periph_base_clk.common, + &pll_periph_sata_clk.common, + &pll_video1_clk.common, + &pll_gpu_clk.common, + &cpu_clk.common, + &axi_clk.common, + &axi_dram_clk.common, + &ahb_sun4i_clk.common, + &ahb_sun7i_clk.common, + &apb0_clk.common, + &apb1_clk.common, + &ahb_otg_clk.common, + &ahb_ehci0_clk.common, + &ahb_ohci0_clk.common, + &ahb_ehci1_clk.common, + &ahb_ohci1_clk.common, + &ahb_ss_clk.common, + &ahb_dma_clk.common, + &ahb_bist_clk.common, + &ahb_mmc0_clk.common, + &ahb_mmc1_clk.common, + &ahb_mmc2_clk.common, + &ahb_mmc3_clk.common, + &ahb_ms_clk.common, + &ahb_nand_clk.common, + &ahb_sdram_clk.common, + &ahb_ace_clk.common, + &ahb_emac_clk.common, + &ahb_ts_clk.common, + &ahb_spi0_clk.common, + &ahb_spi1_clk.common, + &ahb_spi2_clk.common, + &ahb_spi3_clk.common, + &ahb_pata_clk.common, + &ahb_sata_clk.common, + &ahb_gps_clk.common, + &ahb_hstimer_clk.common, + &ahb_ve_clk.common, + &ahb_tvd_clk.common, + &ahb_tve0_clk.common, + &ahb_tve1_clk.common, + &ahb_lcd0_clk.common, + &ahb_lcd1_clk.common, + &ahb_csi0_clk.common, + &ahb_csi1_clk.common, + &ahb_hdmi1_clk.common, + &ahb_hdmi0_clk.common, + &ahb_de_be0_clk.common, + &ahb_de_be1_clk.common, + &ahb_de_fe0_clk.common, + &ahb_de_fe1_clk.common, + &ahb_gmac_clk.common, + &ahb_mp_clk.common, + &ahb_gpu_clk.common, + &apb0_codec_clk.common, + &apb0_spdif_clk.common, + &apb0_ac97_clk.common, + &apb0_i2s0_clk.common, + &apb0_i2s1_clk.common, + &apb0_pio_clk.common, + &apb0_ir0_clk.common, + &apb0_ir1_clk.common, + &apb0_i2s2_clk.common, + &apb0_keypad_clk.common, + &apb1_i2c0_clk.common, + &apb1_i2c1_clk.common, + &apb1_i2c2_clk.common, + &apb1_i2c3_clk.common, + &apb1_can_clk.common, + &apb1_scr_clk.common, + &apb1_ps20_clk.common, + &apb1_ps21_clk.common, + &apb1_i2c4_clk.common, + &apb1_uart0_clk.common, + &apb1_uart1_clk.common, + &apb1_uart2_clk.common, + &apb1_uart3_clk.common, + &apb1_uart4_clk.common, + &apb1_uart5_clk.common, + &apb1_uart6_clk.common, + &apb1_uart7_clk.common, + &nand_clk.common, + &ms_clk.common, + &mmc0_clk.common, + &mmc0_output_clk.common, + &mmc0_sample_clk.common, + &mmc1_clk.common, + &mmc1_output_clk.common, + &mmc1_sample_clk.common, + &mmc2_clk.common, + &mmc2_output_clk.common, + &mmc2_sample_clk.common, + &mmc3_clk.common, + &mmc3_output_clk.common, + &mmc3_sample_clk.common, + &ts_clk.common, + &ss_clk.common, + &spi0_clk.common, + &spi1_clk.common, + &spi2_clk.common, + &pata_clk.common, + &ir0_sun4i_clk.common, + &ir1_sun4i_clk.common, + &ir0_sun7i_clk.common, + &ir1_sun7i_clk.common, + &i2s0_clk.common, + &ac97_clk.common, + &spdif_clk.common, + 
&keypad_clk.common, + &sata_clk.common, + &usb_ohci0_clk.common, + &usb_ohci1_clk.common, + &usb_phy_clk.common, + &spi3_clk.common, + &i2s1_clk.common, + &i2s2_clk.common, + &dram_ve_clk.common, + &dram_csi0_clk.common, + &dram_csi1_clk.common, + &dram_ts_clk.common, + &dram_tvd_clk.common, + &dram_tve0_clk.common, + &dram_tve1_clk.common, + &dram_out_clk.common, + &dram_de_fe1_clk.common, + &dram_de_fe0_clk.common, + &dram_de_be0_clk.common, + &dram_de_be1_clk.common, + &dram_mp_clk.common, + &dram_ace_clk.common, + &de_be0_clk.common, + &de_be1_clk.common, + &de_fe0_clk.common, + &de_fe1_clk.common, + &de_mp_clk.common, + &tcon0_ch0_clk.common, + &tcon1_ch0_clk.common, + &csi_sclk_clk.common, + &tvd_sun4i_clk.common, + &tvd_sclk1_sun7i_clk.common, + &tvd_sclk2_sun7i_clk.common, + &tcon0_ch1_sclk2_clk.common, + &tcon0_ch1_clk.common, + &tcon1_ch1_sclk2_clk.common, + &tcon1_ch1_clk.common, + &csi0_clk.common, + &csi1_clk.common, + &ve_clk.common, + &codec_clk.common, + &avs_clk.common, + &ace_clk.common, + &hdmi_clk.common, + &gpu_sun4i_clk.common, + &gpu_sun7i_clk.common, + &mbus_sun4i_clk.common, + &mbus_sun7i_clk.common, + &hdmi1_slow_clk.common, + &hdmi1_clk.common, + &out_a_clk.common, + &out_b_clk.common +}; + +/* Post-divider for pll-audio is hardcoded to 4 */ +static CLK_FIXED_FACTOR(pll_audio_clk, "pll-audio", + "pll-audio-base", 4, 1, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_audio_2x_clk, "pll-audio-2x", + "pll-audio-base", 2, 1, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_audio_4x_clk, "pll-audio-4x", + "pll-audio-base", 1, 1, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_audio_8x_clk, "pll-audio-8x", + "pll-audio-base", 1, 2, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_video0_2x_clk, "pll-video0-2x", + "pll-video0", 1, 2, CLK_SET_RATE_PARENT); +static CLK_FIXED_FACTOR(pll_video1_2x_clk, "pll-video1-2x", + "pll-video1", 1, 2, CLK_SET_RATE_PARENT); + + +static struct clk_hw_onecell_data sun4i_a10_hw_clks = { + .hws = { + [CLK_HOSC] = &hosc_clk.common.hw, + [CLK_PLL_CORE] = &pll_core_clk.common.hw, + [CLK_PLL_AUDIO_BASE] = &pll_audio_base_clk.common.hw, + [CLK_PLL_AUDIO] = &pll_audio_clk.hw, + [CLK_PLL_AUDIO_2X] = &pll_audio_2x_clk.hw, + [CLK_PLL_AUDIO_4X] = &pll_audio_4x_clk.hw, + [CLK_PLL_AUDIO_8X] = &pll_audio_8x_clk.hw, + [CLK_PLL_VIDEO0] = &pll_video0_clk.common.hw, + [CLK_PLL_VIDEO0_2X] = &pll_video0_2x_clk.hw, + [CLK_PLL_VE] = &pll_ve_sun4i_clk.common.hw, + [CLK_PLL_DDR_BASE] = &pll_ddr_base_clk.common.hw, + [CLK_PLL_DDR] = &pll_ddr_clk.common.hw, + [CLK_PLL_DDR_OTHER] = &pll_ddr_other_clk.common.hw, + [CLK_PLL_PERIPH_BASE] = &pll_periph_base_clk.common.hw, + [CLK_PLL_PERIPH] = &pll_periph_clk.hw, + [CLK_PLL_PERIPH_SATA] = &pll_periph_sata_clk.common.hw, + [CLK_PLL_VIDEO1] = &pll_video1_clk.common.hw, + [CLK_PLL_VIDEO1_2X] = &pll_video1_2x_clk.hw, + [CLK_CPU] = &cpu_clk.common.hw, + [CLK_AXI] = &axi_clk.common.hw, + [CLK_AXI_DRAM] = &axi_dram_clk.common.hw, + [CLK_AHB] = &ahb_sun4i_clk.common.hw, + [CLK_APB0] = &apb0_clk.common.hw, + [CLK_APB1] = &apb1_clk.common.hw, + [CLK_AHB_OTG] = &ahb_otg_clk.common.hw, + [CLK_AHB_EHCI0] = &ahb_ehci0_clk.common.hw, + [CLK_AHB_OHCI0] = &ahb_ohci0_clk.common.hw, + [CLK_AHB_EHCI1] = &ahb_ehci1_clk.common.hw, + [CLK_AHB_OHCI1] = &ahb_ohci1_clk.common.hw, + [CLK_AHB_SS] = &ahb_ss_clk.common.hw, + [CLK_AHB_DMA] = &ahb_dma_clk.common.hw, + [CLK_AHB_BIST] = &ahb_bist_clk.common.hw, + [CLK_AHB_MMC0] = &ahb_mmc0_clk.common.hw, + [CLK_AHB_MMC1] = &ahb_mmc1_clk.common.hw, + [CLK_AHB_MMC2] = &ahb_mmc2_clk.common.hw, + 
[CLK_AHB_MMC3] = &ahb_mmc3_clk.common.hw, + [CLK_AHB_MS] = &ahb_ms_clk.common.hw, + [CLK_AHB_NAND] = &ahb_nand_clk.common.hw, + [CLK_AHB_SDRAM] = &ahb_sdram_clk.common.hw, + [CLK_AHB_ACE] = &ahb_ace_clk.common.hw, + [CLK_AHB_EMAC] = &ahb_emac_clk.common.hw, + [CLK_AHB_TS] = &ahb_ts_clk.common.hw, + [CLK_AHB_SPI0] = &ahb_spi0_clk.common.hw, + [CLK_AHB_SPI1] = &ahb_spi1_clk.common.hw, + [CLK_AHB_SPI2] = &ahb_spi2_clk.common.hw, + [CLK_AHB_SPI3] = &ahb_spi3_clk.common.hw, + [CLK_AHB_PATA] = &ahb_pata_clk.common.hw, + [CLK_AHB_SATA] = &ahb_sata_clk.common.hw, + [CLK_AHB_GPS] = &ahb_gps_clk.common.hw, + [CLK_AHB_VE] = &ahb_ve_clk.common.hw, + [CLK_AHB_TVD] = &ahb_tvd_clk.common.hw, + [CLK_AHB_TVE0] = &ahb_tve0_clk.common.hw, + [CLK_AHB_TVE1] = &ahb_tve1_clk.common.hw, + [CLK_AHB_LCD0] = &ahb_lcd0_clk.common.hw, + [CLK_AHB_LCD1] = &ahb_lcd1_clk.common.hw, + [CLK_AHB_CSI0] = &ahb_csi0_clk.common.hw, + [CLK_AHB_CSI1] = &ahb_csi1_clk.common.hw, + [CLK_AHB_HDMI0] = &ahb_hdmi0_clk.common.hw, + [CLK_AHB_DE_BE0] = &ahb_de_be0_clk.common.hw, + [CLK_AHB_DE_BE1] = &ahb_de_be1_clk.common.hw, + [CLK_AHB_DE_FE0] = &ahb_de_fe0_clk.common.hw, + [CLK_AHB_DE_FE1] = &ahb_de_fe1_clk.common.hw, + [CLK_AHB_MP] = &ahb_mp_clk.common.hw, + [CLK_AHB_GPU] = &ahb_gpu_clk.common.hw, + [CLK_APB0_CODEC] = &apb0_codec_clk.common.hw, + [CLK_APB0_SPDIF] = &apb0_spdif_clk.common.hw, + [CLK_APB0_AC97] = &apb0_ac97_clk.common.hw, + [CLK_APB0_I2S0] = &apb0_i2s0_clk.common.hw, + [CLK_APB0_PIO] = &apb0_pio_clk.common.hw, + [CLK_APB0_IR0] = &apb0_ir0_clk.common.hw, + [CLK_APB0_IR1] = &apb0_ir1_clk.common.hw, + [CLK_APB0_KEYPAD] = &apb0_keypad_clk.common.hw, + [CLK_APB1_I2C0] = &apb1_i2c0_clk.common.hw, + [CLK_APB1_I2C1] = &apb1_i2c1_clk.common.hw, + [CLK_APB1_I2C2] = &apb1_i2c2_clk.common.hw, + [CLK_APB1_CAN] = &apb1_can_clk.common.hw, + [CLK_APB1_SCR] = &apb1_scr_clk.common.hw, + [CLK_APB1_PS20] = &apb1_ps20_clk.common.hw, + [CLK_APB1_PS21] = &apb1_ps21_clk.common.hw, + [CLK_APB1_UART0] = &apb1_uart0_clk.common.hw, + [CLK_APB1_UART1] = &apb1_uart1_clk.common.hw, + [CLK_APB1_UART2] = &apb1_uart2_clk.common.hw, + [CLK_APB1_UART3] = &apb1_uart3_clk.common.hw, + [CLK_APB1_UART4] = &apb1_uart4_clk.common.hw, + [CLK_APB1_UART5] = &apb1_uart5_clk.common.hw, + [CLK_APB1_UART6] = &apb1_uart6_clk.common.hw, + [CLK_APB1_UART7] = &apb1_uart7_clk.common.hw, + [CLK_NAND] = &nand_clk.common.hw, + [CLK_MS] = &ms_clk.common.hw, + [CLK_MMC0] = &mmc0_clk.common.hw, + [CLK_MMC1] = &mmc1_clk.common.hw, + [CLK_MMC2] = &mmc2_clk.common.hw, + [CLK_MMC3] = &mmc3_clk.common.hw, + [CLK_TS] = &ts_clk.common.hw, + [CLK_SS] = &ss_clk.common.hw, + [CLK_SPI0] = &spi0_clk.common.hw, + [CLK_SPI1] = &spi1_clk.common.hw, + [CLK_SPI2] = &spi2_clk.common.hw, + [CLK_PATA] = &pata_clk.common.hw, + [CLK_IR0] = &ir0_sun4i_clk.common.hw, + [CLK_IR1] = &ir1_sun4i_clk.common.hw, + [CLK_I2S0] = &i2s0_clk.common.hw, + [CLK_AC97] = &ac97_clk.common.hw, + [CLK_SPDIF] = &spdif_clk.common.hw, + [CLK_KEYPAD] = &keypad_clk.common.hw, + [CLK_SATA] = &sata_clk.common.hw, + [CLK_USB_OHCI0] = &usb_ohci0_clk.common.hw, + [CLK_USB_OHCI1] = &usb_ohci1_clk.common.hw, + [CLK_USB_PHY] = &usb_phy_clk.common.hw, + /* CLK_GPS is unimplemented */ + [CLK_SPI3] = &spi3_clk.common.hw, + [CLK_DRAM_VE] = &dram_ve_clk.common.hw, + [CLK_DRAM_CSI0] = &dram_csi0_clk.common.hw, + [CLK_DRAM_CSI1] = &dram_csi1_clk.common.hw, + [CLK_DRAM_TS] = &dram_ts_clk.common.hw, + [CLK_DRAM_TVD] = &dram_tvd_clk.common.hw, + [CLK_DRAM_TVE0] = &dram_tve0_clk.common.hw, + [CLK_DRAM_TVE1] = &dram_tve1_clk.common.hw, + 
[CLK_DRAM_OUT] = &dram_out_clk.common.hw, + [CLK_DRAM_DE_FE1] = &dram_de_fe1_clk.common.hw, + [CLK_DRAM_DE_FE0] = &dram_de_fe0_clk.common.hw, + [CLK_DRAM_DE_BE0] = &dram_de_be0_clk.common.hw, + [CLK_DRAM_DE_BE1] = &dram_de_be1_clk.common.hw, + [CLK_DRAM_MP] = &dram_mp_clk.common.hw, + [CLK_DRAM_ACE] = &dram_ace_clk.common.hw, + [CLK_DE_BE0] = &de_be0_clk.common.hw, + [CLK_DE_BE1] = &de_be1_clk.common.hw, + [CLK_DE_FE0] = &de_fe0_clk.common.hw, + [CLK_DE_FE1] = &de_fe1_clk.common.hw, + [CLK_DE_MP] = &de_mp_clk.common.hw, + [CLK_TCON0_CH0] = &tcon0_ch0_clk.common.hw, + [CLK_TCON1_CH0] = &tcon1_ch0_clk.common.hw, + [CLK_CSI_SCLK] = &csi_sclk_clk.common.hw, + [CLK_TVD] = &tvd_sun4i_clk.common.hw, + [CLK_TCON0_CH1_SCLK2] = &tcon0_ch1_sclk2_clk.common.hw, + [CLK_TCON0_CH1] = &tcon0_ch1_clk.common.hw, + [CLK_TCON1_CH1_SCLK2] = &tcon1_ch1_sclk2_clk.common.hw, + [CLK_TCON1_CH1] = &tcon1_ch1_clk.common.hw, + [CLK_CSI0] = &csi0_clk.common.hw, + [CLK_CSI1] = &csi1_clk.common.hw, + [CLK_VE] = &ve_clk.common.hw, + [CLK_CODEC] = &codec_clk.common.hw, + [CLK_AVS] = &avs_clk.common.hw, + [CLK_ACE] = &ace_clk.common.hw, + [CLK_HDMI] = &hdmi_clk.common.hw, + [CLK_GPU] = &gpu_sun7i_clk.common.hw, + [CLK_MBUS] = &mbus_sun4i_clk.common.hw, + }, + .num = CLK_NUMBER_SUN4I, +}; +static struct clk_hw_onecell_data sun7i_a20_hw_clks = { + .hws = { + [CLK_HOSC] = &hosc_clk.common.hw, + [CLK_PLL_CORE] = &pll_core_clk.common.hw, + [CLK_PLL_AUDIO_BASE] = &pll_audio_base_clk.common.hw, + [CLK_PLL_AUDIO] = &pll_audio_clk.hw, + [CLK_PLL_AUDIO_2X] = &pll_audio_2x_clk.hw, + [CLK_PLL_AUDIO_4X] = &pll_audio_4x_clk.hw, + [CLK_PLL_AUDIO_8X] = &pll_audio_8x_clk.hw, + [CLK_PLL_VIDEO0] = &pll_video0_clk.common.hw, + [CLK_PLL_VIDEO0_2X] = &pll_video0_2x_clk.hw, + [CLK_PLL_VE] = &pll_ve_sun7i_clk.common.hw, + [CLK_PLL_DDR_BASE] = &pll_ddr_base_clk.common.hw, + [CLK_PLL_DDR] = &pll_ddr_clk.common.hw, + [CLK_PLL_DDR_OTHER] = &pll_ddr_other_clk.common.hw, + [CLK_PLL_PERIPH_BASE] = &pll_periph_base_clk.common.hw, + [CLK_PLL_PERIPH] = &pll_periph_clk.hw, + [CLK_PLL_PERIPH_SATA] = &pll_periph_sata_clk.common.hw, + [CLK_PLL_VIDEO1] = &pll_video1_clk.common.hw, + [CLK_PLL_VIDEO1_2X] = &pll_video1_2x_clk.hw, + [CLK_PLL_GPU] = &pll_gpu_clk.common.hw, + [CLK_CPU] = &cpu_clk.common.hw, + [CLK_AXI] = &axi_clk.common.hw, + [CLK_AHB] = &ahb_sun7i_clk.common.hw, + [CLK_APB0] = &apb0_clk.common.hw, + [CLK_APB1] = &apb1_clk.common.hw, + [CLK_AHB_OTG] = &ahb_otg_clk.common.hw, + [CLK_AHB_EHCI0] = &ahb_ehci0_clk.common.hw, + [CLK_AHB_OHCI0] = &ahb_ohci0_clk.common.hw, + [CLK_AHB_EHCI1] = &ahb_ehci1_clk.common.hw, + [CLK_AHB_OHCI1] = &ahb_ohci1_clk.common.hw, + [CLK_AHB_SS] = &ahb_ss_clk.common.hw, + [CLK_AHB_DMA] = &ahb_dma_clk.common.hw, + [CLK_AHB_BIST] = &ahb_bist_clk.common.hw, + [CLK_AHB_MMC0] = &ahb_mmc0_clk.common.hw, + [CLK_AHB_MMC1] = &ahb_mmc1_clk.common.hw, + [CLK_AHB_MMC2] = &ahb_mmc2_clk.common.hw, + [CLK_AHB_MMC3] = &ahb_mmc3_clk.common.hw, + [CLK_AHB_MS] = &ahb_ms_clk.common.hw, + [CLK_AHB_NAND] = &ahb_nand_clk.common.hw, + [CLK_AHB_SDRAM] = &ahb_sdram_clk.common.hw, + [CLK_AHB_ACE] = &ahb_ace_clk.common.hw, + [CLK_AHB_EMAC] = &ahb_emac_clk.common.hw, + [CLK_AHB_TS] = &ahb_ts_clk.common.hw, + [CLK_AHB_SPI0] = &ahb_spi0_clk.common.hw, + [CLK_AHB_SPI1] = &ahb_spi1_clk.common.hw, + [CLK_AHB_SPI2] = &ahb_spi2_clk.common.hw, + [CLK_AHB_SPI3] = &ahb_spi3_clk.common.hw, + [CLK_AHB_PATA] = &ahb_pata_clk.common.hw, + [CLK_AHB_SATA] = &ahb_sata_clk.common.hw, + [CLK_AHB_HSTIMER] = &ahb_hstimer_clk.common.hw, + [CLK_AHB_VE] = &ahb_ve_clk.common.hw, + 
[CLK_AHB_TVD] = &ahb_tvd_clk.common.hw, + [CLK_AHB_TVE0] = &ahb_tve0_clk.common.hw, + [CLK_AHB_TVE1] = &ahb_tve1_clk.common.hw, + [CLK_AHB_LCD0] = &ahb_lcd0_clk.common.hw, + [CLK_AHB_LCD1] = &ahb_lcd1_clk.common.hw, + [CLK_AHB_CSI0] = &ahb_csi0_clk.common.hw, + [CLK_AHB_CSI1] = &ahb_csi1_clk.common.hw, + [CLK_AHB_HDMI1] = &ahb_hdmi1_clk.common.hw, + [CLK_AHB_HDMI0] = &ahb_hdmi0_clk.common.hw, + [CLK_AHB_DE_BE0] = &ahb_de_be0_clk.common.hw, + [CLK_AHB_DE_BE1] = &ahb_de_be1_clk.common.hw, + [CLK_AHB_DE_FE0] = &ahb_de_fe0_clk.common.hw, + [CLK_AHB_DE_FE1] = &ahb_de_fe1_clk.common.hw, + [CLK_AHB_GMAC] = &ahb_gmac_clk.common.hw, + [CLK_AHB_MP] = &ahb_mp_clk.common.hw, + [CLK_AHB_GPU] = &ahb_gpu_clk.common.hw, + [CLK_APB0_CODEC] = &apb0_codec_clk.common.hw, + [CLK_APB0_SPDIF] = &apb0_spdif_clk.common.hw, + [CLK_APB0_AC97] = &apb0_ac97_clk.common.hw, + [CLK_APB0_I2S0] = &apb0_i2s0_clk.common.hw, + [CLK_APB0_I2S1] = &apb0_i2s1_clk.common.hw, + [CLK_APB0_PIO] = &apb0_pio_clk.common.hw, + [CLK_APB0_IR0] = &apb0_ir0_clk.common.hw, + [CLK_APB0_IR1] = &apb0_ir1_clk.common.hw, + [CLK_APB0_I2S2] = &apb0_i2s2_clk.common.hw, + [CLK_APB0_KEYPAD] = &apb0_keypad_clk.common.hw, + [CLK_APB1_I2C0] = &apb1_i2c0_clk.common.hw, + [CLK_APB1_I2C1] = &apb1_i2c1_clk.common.hw, + [CLK_APB1_I2C2] = &apb1_i2c2_clk.common.hw, + [CLK_APB1_I2C3] = &apb1_i2c3_clk.common.hw, + [CLK_APB1_CAN] = &apb1_can_clk.common.hw, + [CLK_APB1_SCR] = &apb1_scr_clk.common.hw, + [CLK_APB1_PS20] = &apb1_ps20_clk.common.hw, + [CLK_APB1_PS21] = &apb1_ps21_clk.common.hw, + [CLK_APB1_I2C4] = &apb1_i2c4_clk.common.hw, + [CLK_APB1_UART0] = &apb1_uart0_clk.common.hw, + [CLK_APB1_UART1] = &apb1_uart1_clk.common.hw, + [CLK_APB1_UART2] = &apb1_uart2_clk.common.hw, + [CLK_APB1_UART3] = &apb1_uart3_clk.common.hw, + [CLK_APB1_UART4] = &apb1_uart4_clk.common.hw, + [CLK_APB1_UART5] = &apb1_uart5_clk.common.hw, + [CLK_APB1_UART6] = &apb1_uart6_clk.common.hw, + [CLK_APB1_UART7] = &apb1_uart7_clk.common.hw, + [CLK_NAND] = &nand_clk.common.hw, + [CLK_MS] = &ms_clk.common.hw, + [CLK_MMC0] = &mmc0_clk.common.hw, + [CLK_MMC0_OUTPUT] = &mmc0_output_clk.common.hw, + [CLK_MMC0_SAMPLE] = &mmc0_sample_clk.common.hw, + [CLK_MMC1] = &mmc1_clk.common.hw, + [CLK_MMC1_OUTPUT] = &mmc1_output_clk.common.hw, + [CLK_MMC1_SAMPLE] = &mmc1_sample_clk.common.hw, + [CLK_MMC2] = &mmc2_clk.common.hw, + [CLK_MMC2_OUTPUT] = &mmc2_output_clk.common.hw, + [CLK_MMC2_SAMPLE] = &mmc2_sample_clk.common.hw, + [CLK_MMC3] = &mmc3_clk.common.hw, + [CLK_MMC3_OUTPUT] = &mmc3_output_clk.common.hw, + [CLK_MMC3_SAMPLE] = &mmc3_sample_clk.common.hw, + [CLK_TS] = &ts_clk.common.hw, + [CLK_SS] = &ss_clk.common.hw, + [CLK_SPI0] = &spi0_clk.common.hw, + [CLK_SPI1] = &spi1_clk.common.hw, + [CLK_SPI2] = &spi2_clk.common.hw, + [CLK_PATA] = &pata_clk.common.hw, + [CLK_IR0] = &ir0_sun7i_clk.common.hw, + [CLK_IR1] = &ir1_sun7i_clk.common.hw, + [CLK_I2S0] = &i2s0_clk.common.hw, + [CLK_AC97] = &ac97_clk.common.hw, + [CLK_SPDIF] = &spdif_clk.common.hw, + [CLK_KEYPAD] = &keypad_clk.common.hw, + [CLK_SATA] = &sata_clk.common.hw, + [CLK_USB_OHCI0] = &usb_ohci0_clk.common.hw, + [CLK_USB_OHCI1] = &usb_ohci1_clk.common.hw, + [CLK_USB_PHY] = &usb_phy_clk.common.hw, + /* CLK_GPS is unimplemented */ + [CLK_SPI3] = &spi3_clk.common.hw, + [CLK_I2S1] = &i2s1_clk.common.hw, + [CLK_I2S2] = &i2s2_clk.common.hw, + [CLK_DRAM_VE] = &dram_ve_clk.common.hw, + [CLK_DRAM_CSI0] = &dram_csi0_clk.common.hw, + [CLK_DRAM_CSI1] = &dram_csi1_clk.common.hw, + [CLK_DRAM_TS] = &dram_ts_clk.common.hw, + [CLK_DRAM_TVD] = &dram_tvd_clk.common.hw, + 
[CLK_DRAM_TVE0] = &dram_tve0_clk.common.hw, + [CLK_DRAM_TVE1] = &dram_tve1_clk.common.hw, + [CLK_DRAM_OUT] = &dram_out_clk.common.hw, + [CLK_DRAM_DE_FE1] = &dram_de_fe1_clk.common.hw, + [CLK_DRAM_DE_FE0] = &dram_de_fe0_clk.common.hw, + [CLK_DRAM_DE_BE0] = &dram_de_be0_clk.common.hw, + [CLK_DRAM_DE_BE1] = &dram_de_be1_clk.common.hw, + [CLK_DRAM_MP] = &dram_mp_clk.common.hw, + [CLK_DRAM_ACE] = &dram_ace_clk.common.hw, + [CLK_DE_BE0] = &de_be0_clk.common.hw, + [CLK_DE_BE1] = &de_be1_clk.common.hw, + [CLK_DE_FE0] = &de_fe0_clk.common.hw, + [CLK_DE_FE1] = &de_fe1_clk.common.hw, + [CLK_DE_MP] = &de_mp_clk.common.hw, + [CLK_TCON0_CH0] = &tcon0_ch0_clk.common.hw, + [CLK_TCON1_CH0] = &tcon1_ch0_clk.common.hw, + [CLK_CSI_SCLK] = &csi_sclk_clk.common.hw, + [CLK_TVD_SCLK2] = &tvd_sclk2_sun7i_clk.common.hw, + [CLK_TVD] = &tvd_sclk1_sun7i_clk.common.hw, + [CLK_TCON0_CH1_SCLK2] = &tcon0_ch1_sclk2_clk.common.hw, + [CLK_TCON0_CH1] = &tcon0_ch1_clk.common.hw, + [CLK_TCON1_CH1_SCLK2] = &tcon1_ch1_sclk2_clk.common.hw, + [CLK_TCON1_CH1] = &tcon1_ch1_clk.common.hw, + [CLK_CSI0] = &csi0_clk.common.hw, + [CLK_CSI1] = &csi1_clk.common.hw, + [CLK_VE] = &ve_clk.common.hw, + [CLK_CODEC] = &codec_clk.common.hw, + [CLK_AVS] = &avs_clk.common.hw, + [CLK_ACE] = &ace_clk.common.hw, + [CLK_HDMI] = &hdmi_clk.common.hw, + [CLK_GPU] = &gpu_sun7i_clk.common.hw, + [CLK_MBUS] = &mbus_sun7i_clk.common.hw, + [CLK_HDMI1_SLOW] = &hdmi1_slow_clk.common.hw, + [CLK_HDMI1] = &hdmi1_clk.common.hw, + [CLK_OUT_A] = &out_a_clk.common.hw, + [CLK_OUT_B] = &out_b_clk.common.hw, + }, + .num = CLK_NUMBER_SUN7I, +}; + +static struct ccu_reset_map sunxi_a10_a20_ccu_resets[] = { + [RST_USB_PHY0] = { 0x0cc, BIT(0) }, + [RST_USB_PHY1] = { 0x0cc, BIT(1) }, + [RST_USB_PHY2] = { 0x0cc, BIT(2) }, + [RST_GPS] = { 0x0d0, BIT(0) }, + [RST_DE_BE0] = { 0x104, BIT(30) }, + [RST_DE_BE1] = { 0x108, BIT(30) }, + [RST_DE_FE0] = { 0x10c, BIT(30) }, + [RST_DE_FE1] = { 0x110, BIT(30) }, + [RST_DE_MP] = { 0x114, BIT(30) }, + [RST_TVE0] = { 0x118, BIT(29) }, + [RST_TCON0] = { 0x118, BIT(30) }, + [RST_TVE1] = { 0x11c, BIT(29) }, + [RST_TCON1] = { 0x11c, BIT(30) }, + [RST_CSI0] = { 0x134, BIT(30) }, + [RST_CSI1] = { 0x138, BIT(30) }, + [RST_VE] = { 0x13c, BIT(0) }, + [RST_ACE] = { 0x148, BIT(16) }, + [RST_LVDS] = { 0x14c, BIT(0) }, + [RST_GPU] = { 0x154, BIT(30) }, + [RST_HDMI_H] = { 0x170, BIT(0) }, + [RST_HDMI_SYS] = { 0x170, BIT(1) }, + [RST_HDMI_AUDIO_DMA] = { 0x170, BIT(2) }, +}; + +static const struct sunxi_ccu_desc sun4i_a10_ccu_desc = { + .ccu_clks = sun4i_sun7i_ccu_clks, + .num_ccu_clks = ARRAY_SIZE(sun4i_sun7i_ccu_clks), + + .hw_clks = &sun4i_a10_hw_clks, + + .resets = sunxi_a10_a20_ccu_resets, + .num_resets = ARRAY_SIZE(sunxi_a10_a20_ccu_resets), +}; + +static const struct sunxi_ccu_desc sun7i_a20_ccu_desc = { + .ccu_clks = sun4i_sun7i_ccu_clks, + .num_ccu_clks = ARRAY_SIZE(sun4i_sun7i_ccu_clks), + + .hw_clks = &sun7i_a20_hw_clks, + + .resets = sunxi_a10_a20_ccu_resets, + .num_resets = ARRAY_SIZE(sunxi_a10_a20_ccu_resets), +}; + +static void __init sun4i_ccu_init(struct device_node *node, + const struct sunxi_ccu_desc *desc) +{ + void __iomem *reg; + u32 val; + + reg = of_io_request_and_map(node, 0, of_node_full_name(node)); + if (IS_ERR(reg)) { + pr_err("%s: Could not map the clock registers\n", + of_node_full_name(node)); + return; + } + + /* Force the PLL-Audio-1x divider to 4 */ + val = readl(reg + SUN4I_PLL_AUDIO_REG); + val &= ~GENMASK(29, 26); + writel(val | (4 << 26), reg + SUN4I_PLL_AUDIO_REG); + + /* + * Use the peripheral PLL6 as the AHB parent, 
instead of CPU / + * AXI which have rate changes due to cpufreq. + * + * This is especially a big deal for the HS timer whose parent + * clock is AHB. + * + * NB! These bits are undocumented in the A10 manual. + */ + val = readl(reg + SUN4I_AHB_REG); + val &= ~GENMASK(7, 6); + writel(val | (2 << 6), reg + SUN4I_AHB_REG); + + sunxi_ccu_probe(node, reg, desc); +} + +static void __init sun4i_a10_ccu_setup(struct device_node *node) +{ + sun4i_ccu_init(node, &sun4i_a10_ccu_desc); +} +CLK_OF_DECLARE(sun4i_a10_ccu, "allwinner,sun4i-a10-ccu", + sun4i_a10_ccu_setup); + +static void __init sun7i_a20_ccu_setup(struct device_node *node) +{ + sun4i_ccu_init(node, &sun7i_a20_ccu_desc); +} +CLK_OF_DECLARE(sun7i_a20_ccu, "allwinner,sun7i-a20-ccu", + sun7i_a20_ccu_setup); diff --git a/drivers/clk/sunxi-ng/ccu-sun4i-a10.h b/drivers/clk/sunxi-ng/ccu-sun4i-a10.h new file mode 100644 index 000000000000..c5947c7c050e --- /dev/null +++ b/drivers/clk/sunxi-ng/ccu-sun4i-a10.h @@ -0,0 +1,61 @@ +/* + * Copyright 2017 Priit Laes + * + * Priit Laes + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _CCU_SUN4I_A10_H_ +#define _CCU_SUN4I_A10_H_ + +#include <dt-bindings/clock/sun4i-a10-ccu.h> +#include <dt-bindings/clock/sun7i-a20-ccu.h> +#include <dt-bindings/reset/sun4i-a10-ccu.h> + +/* The HOSC is exported */ +#define CLK_PLL_CORE 2 +#define CLK_PLL_AUDIO_BASE 3 +#define CLK_PLL_AUDIO 4 +#define CLK_PLL_AUDIO_2X 5 +#define CLK_PLL_AUDIO_4X 6 +#define CLK_PLL_AUDIO_8X 7 +#define CLK_PLL_VIDEO0 8 +#define CLK_PLL_VIDEO0_2X 9 +#define CLK_PLL_VE 10 +#define CLK_PLL_DDR_BASE 11 +#define CLK_PLL_DDR 12 +#define CLK_PLL_DDR_OTHER 13 +#define CLK_PLL_PERIPH_BASE 14 +#define CLK_PLL_PERIPH 15 +#define CLK_PLL_PERIPH_SATA 16 +#define CLK_PLL_VIDEO1 17 +#define CLK_PLL_VIDEO1_2X 18 +#define CLK_PLL_GPU 19 + +/* The CPU clock is exported */ +#define CLK_AXI 21 +#define CLK_AXI_DRAM 22 +#define CLK_AHB 23 +#define CLK_APB0 24 +#define CLK_APB1 25 + +/* AHB gates are exported (23..68) */ +/* APB0 gates are exported (69..78) */ +/* APB1 gates are exported (79..95) */ +/* IP module clocks are exported (96..128) */ +/* DRAM gates are exported (129..142)*/ +/* Media (display engine clocks & etc) are exported (143..169) */ + +#define CLK_NUMBER_SUN4I (CLK_MBUS + 1) +#define CLK_NUMBER_SUN7I (CLK_OUT_B + 1) + +#endif /* _CCU_SUN4I_A10_H_ */ diff --git a/include/dt-bindings/clock/sun4i-a10-ccu.h b/include/dt-bindings/clock/sun4i-a10-ccu.h new file mode 100644 index 000000000000..c5a53f38d654 --- /dev/null +++ b/include/dt-bindings/clock/sun4i-a10-ccu.h @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2017 Priit Laes + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version.
+ * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DT_BINDINGS_CLK_SUN4I_A10_H_ +#define _DT_BINDINGS_CLK_SUN4I_A10_H_ + +#define CLK_HOSC 1 +#define CLK_CPU 20 + +/* AHB Gates */ +#define CLK_AHB_OTG 26 +#define CLK_AHB_EHCI0 27 +#define CLK_AHB_OHCI0 28 +#define CLK_AHB_EHCI1 29 +#define CLK_AHB_OHCI1 30 +#define CLK_AHB_SS 31 +#define CLK_AHB_DMA 32 +#define CLK_AHB_BIST 33 +#define CLK_AHB_MMC0 34 +#define CLK_AHB_MMC1 35 +#define CLK_AHB_MMC2 36 +#define CLK_AHB_MMC3 37 +#define CLK_AHB_MS 38 +#define CLK_AHB_NAND 39 +#define CLK_AHB_SDRAM 40 +#define CLK_AHB_ACE 41 +#define CLK_AHB_EMAC 42 +#define CLK_AHB_TS 43 +#define CLK_AHB_SPI0 44 +#define CLK_AHB_SPI1 45 +#define CLK_AHB_SPI2 46 +#define CLK_AHB_SPI3 47 +#define CLK_AHB_PATA 48 +#define CLK_AHB_SATA 49 +#define CLK_AHB_GPS 50 +#define CLK_AHB_HSTIMER 51 +#define CLK_AHB_VE 52 +#define CLK_AHB_TVD 53 +#define CLK_AHB_TVE0 54 +#define CLK_AHB_TVE1 55 +#define CLK_AHB_LCD0 56 +#define CLK_AHB_LCD1 57 +#define CLK_AHB_CSI0 58 +#define CLK_AHB_CSI1 59 +#define CLK_AHB_HDMI0 60 +#define CLK_AHB_HDMI1 61 +#define CLK_AHB_DE_BE0 62 +#define CLK_AHB_DE_BE1 63 +#define CLK_AHB_DE_FE0 64 +#define CLK_AHB_DE_FE1 65 +#define CLK_AHB_GMAC 66 +#define CLK_AHB_MP 67 +#define CLK_AHB_GPU 68 + +/* APB0 Gates */ +#define CLK_APB0_CODEC 69 +#define CLK_APB0_SPDIF 70 +#define CLK_APB0_I2S0 71 +#define CLK_APB0_AC97 72 +#define CLK_APB0_I2S1 73 +#define CLK_APB0_PIO 74 +#define CLK_APB0_IR0 75 +#define CLK_APB0_IR1 76 +#define CLK_APB0_I2S2 77 +#define CLK_APB0_KEYPAD 78 + +/* APB1 Gates */ +#define CLK_APB1_I2C0 79 +#define CLK_APB1_I2C1 80 +#define CLK_APB1_I2C2 81 +#define CLK_APB1_I2C3 82 +#define CLK_APB1_CAN 83 +#define CLK_APB1_SCR 84 +#define CLK_APB1_PS20 85 +#define CLK_APB1_PS21 86 +#define CLK_APB1_I2C4 87 +#define CLK_APB1_UART0 88 +#define CLK_APB1_UART1 89 +#define CLK_APB1_UART2 90 +#define CLK_APB1_UART3 91 +#define CLK_APB1_UART4 92 +#define CLK_APB1_UART5 93 +#define CLK_APB1_UART6 94 +#define CLK_APB1_UART7 95 + +/* IP clocks */ +#define CLK_NAND 96 +#define CLK_MS 97 +#define CLK_MMC0 98 +#define CLK_MMC0_OUTPUT 99 +#define CLK_MMC0_SAMPLE 100 +#define CLK_MMC1 101 +#define CLK_MMC1_OUTPUT 102 +#define CLK_MMC1_SAMPLE 103 +#define CLK_MMC2 104 +#define 
CLK_MMC2_OUTPUT 105 +#define CLK_MMC2_SAMPLE 106 +#define CLK_MMC3 107 +#define CLK_MMC3_OUTPUT 108 +#define CLK_MMC3_SAMPLE 109 +#define CLK_TS 110 +#define CLK_SS 111 +#define CLK_SPI0 112 +#define CLK_SPI1 113 +#define CLK_SPI2 114 +#define CLK_PATA 115 +#define CLK_IR0 116 +#define CLK_IR1 117 +#define CLK_I2S0 118 +#define CLK_AC97 119 +#define CLK_SPDIF 120 +#define CLK_KEYPAD 121 +#define CLK_SATA 122 +#define CLK_USB_OHCI0 123 +#define CLK_USB_OHCI1 124 +#define CLK_USB_PHY 125 +#define CLK_GPS 126 +#define CLK_SPI3 127 +#define CLK_I2S1 128 +#define CLK_I2S2 129 + +/* DRAM Gates */ +#define CLK_DRAM_VE 130 +#define CLK_DRAM_CSI0 131 +#define CLK_DRAM_CSI1 132 +#define CLK_DRAM_TS 133 +#define CLK_DRAM_TVD 134 +#define CLK_DRAM_TVE0 135 +#define CLK_DRAM_TVE1 136 +#define CLK_DRAM_OUT 137 +#define CLK_DRAM_DE_FE1 138 +#define CLK_DRAM_DE_FE0 139 +#define CLK_DRAM_DE_BE0 140 +#define CLK_DRAM_DE_BE1 141 +#define CLK_DRAM_MP 142 +#define CLK_DRAM_ACE 143 + +/* Display Engine Clocks */ +#define CLK_DE_BE0 144 +#define CLK_DE_BE1 145 +#define CLK_DE_FE0 146 +#define CLK_DE_FE1 147 +#define CLK_DE_MP 148 +#define CLK_TCON0_CH0 149 +#define CLK_TCON1_CH0 150 +#define CLK_CSI_SCLK 151 +#define CLK_TVD_SCLK2 152 +#define CLK_TVD 153 +#define CLK_TCON0_CH1_SCLK2 154 +#define CLK_TCON0_CH1 155 +#define CLK_TCON1_CH1_SCLK2 156 +#define CLK_TCON1_CH1 157 +#define CLK_CSI0 158 +#define CLK_CSI1 159 +#define CLK_CODEC 160 +#define CLK_VE 161 +#define CLK_AVS 162 +#define CLK_ACE 163 +#define CLK_HDMI 164 +#define CLK_GPU 165 + +#endif /* _DT_BINDINGS_CLK_SUN4I_A10_H_ */ diff --git a/include/dt-bindings/clock/sun7i-a20-ccu.h b/include/dt-bindings/clock/sun7i-a20-ccu.h new file mode 100644 index 000000000000..045a5178da0c --- /dev/null +++ b/include/dt-bindings/clock/sun7i-a20-ccu.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Priit Laes + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DT_BINDINGS_CLK_SUN7I_A20_H_ +#define _DT_BINDINGS_CLK_SUN7I_A20_H_ + +#include <dt-bindings/clock/sun4i-a10-ccu.h> + +#define CLK_MBUS 166 +#define CLK_HDMI1_SLOW 167 +#define CLK_HDMI1 168 +#define CLK_OUT_A 169 +#define CLK_OUT_B 170 + +#endif /* _DT_BINDINGS_CLK_SUN7I_A20_H_ */ diff --git a/include/dt-bindings/reset/sun4i-a10-ccu.h b/include/dt-bindings/reset/sun4i-a10-ccu.h new file mode 100644 index 000000000000..5f4480bedc8a --- /dev/null +++ b/include/dt-bindings/reset/sun4i-a10-ccu.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2017 Priit Laes + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +#ifndef _DT_BINDINGS_RST_SUN4I_A10_H +#define _DT_BINDINGS_RST_SUN4I_A10_H + +#define RST_USB_PHY0 1 +#define RST_USB_PHY1 2 +#define RST_USB_PHY2 3 +#define RST_GPS 4 +#define RST_DE_BE0 5 +#define RST_DE_BE1 6 +#define RST_DE_FE0 7 +#define RST_DE_FE1 8 +#define RST_DE_MP 9 +#define RST_TVE0 10 +#define RST_TCON0 11 +#define RST_TVE1 12 +#define RST_TCON1 13 +#define RST_CSI0 14 +#define RST_CSI1 15 +#define RST_VE 16 +#define RST_ACE 17 +#define RST_LVDS 18 +#define RST_GPU 19 +#define RST_HDMI_H 20 +#define RST_HDMI_SYS 21 +#define RST_HDMI_AUDIO_DMA 22 + +#endif /* _DT_BINDINGS_RST_SUN4I_A10_H */ -- cgit v1.2.3 From 581821ae7f7e2c4547945c65f1bcd357f5915aa5 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 16 Aug 2017 13:32:39 +0300 Subject: usb: chipidea: udc: Support SKB alignment quirk NVIDIA Tegra20 UDC can't cope with unaligned DMA and requires a USB gadget quirk that avoids SKB buffer alignment to be set in order to make the Ethernet Gadget work. Later Tegra generations do not require that quirk. Let's add a new platform data flag that allows the USB gadget quirk to be enabled for platforms that require it. Signed-off-by: Dmitry Osipenko Acked-by: Peter Chen Signed-off-by: Peter Chen --- drivers/usb/chipidea/udc.c | 3 +++ include/linux/usb/chipidea.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index 6502c13331e8..fe8a90543ea3 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1896,6 +1896,9 @@ static int udc_start(struct ci_hdrc *ci) ci->gadget.name = ci->platdata->name; ci->gadget.otg_caps = otg_caps; + if (ci->platdata->flags & CI_HDRC_REQUIRES_ALIGNED_DMA) + ci->gadget.quirk_avoids_skb_reserve = 1; + if (ci->is_otg && (otg_caps->hnp_support || otg_caps->srp_support || otg_caps->adp_support)) ci->gadget.is_otg = 1; diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h index c5fdfcf99828..d725cff7268d 100644 --- a/include/linux/usb/chipidea.h +++ b/include/linux/usb/chipidea.h @@ -58,6 +58,7 @@ struct ci_hdrc_platform_data { #define CI_HDRC_OVERRIDE_TX_BURST BIT(10) #define CI_HDRC_OVERRIDE_RX_BURST BIT(11) #define CI_HDRC_OVERRIDE_PHY_CONTROL BIT(12) /* Glue layer manages phy */ +#define CI_HDRC_REQUIRES_ALIGNED_DMA BIT(13) enum usb_dr_mode dr_mode; #define CI_HDRC_CONTROLLER_RESET_EVENT 0 #define CI_HDRC_CONTROLLER_STOPPED_EVENT 1 -- cgit v1.2.3 From 07533c6765de05199417ec73f9c2a495ddd29473 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 21 Aug 2017 17:54:21 +0300 Subject: net/mlx5: Remove a leftover unused variable mlx5_core_wq is no longer being used and should be removed from the code.
Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d26f18b39c4a..d5b6f6a9fcc5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -890,8 +890,6 @@ static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset) return buf->direct.buf + offset; } -extern struct workqueue_struct *mlx5_core_wq; - #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field -- cgit v1.2.3 From 667cb65ae5ad01523505d48d6cfd92bd1d3c9785 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 7 Aug 2017 11:14:11 +0300 Subject: net/mlx5: Don't store reserved part in FTEs and FGs The current code stores fte_match_param in the software representation of FTEs and FGs. fte_match_param contains a large reserved area at the bottom of the struct. Since downstream patches are going to hash this part, we would like to avoid doing so on a reserved part. Signed-off-by: Matan Barak Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 -------- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 16 ++++++++++++++-- include/linux/mlx5/device.h | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 16b32f31d691..e0d0efd903bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -263,7 +263,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(flow_context, in_flow_context, modify_header_id, fte->modify_id); in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, match_value); - memcpy(in_match_value, &fte->val, MLX5_ST_SZ_BYTES(fte_match_param)); + memcpy(in_match_value, &fte->val, sizeof(fte->val)); in_dests = MLX5_ADDR_OF(flow_context, in_flow_context, destination); if (fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index af245fcdba70..986c5e50e872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -371,21 +371,14 @@ static void del_rule(struct fs_node *node) struct mlx5_flow_table *ft; struct mlx5_flow_group *fg; struct fs_fte *fte; - u32 *match_value; int modify_mask; struct mlx5_core_dev *dev = get_dev(node); - int match_len = MLX5_ST_SZ_BYTES(fte_match_param); int err; bool update_fte = false; - match_value = kvzalloc(match_len, GFP_KERNEL); - if (!match_value) - return; - fs_get_obj(rule, node); fs_get_obj(fte, rule->node.parent); fs_get_obj(fg, fte->node.parent); - memcpy(match_value, fte->val, sizeof(fte->val)); fs_get_obj(ft, fg->node.parent); list_del(&rule->node.list); if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { @@ -415,7 +408,6 @@ out: "%s can't del rule fg id=%d fte_index=%d\n", __func__, fg->id, fte->index); } - kvfree(match_value); } static void del_fte(struct fs_node *node) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 5fbae885558f..bfbc081d17dc 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -144,10 +144,22 @@ struct mlx5_fc { struct mlx5_fc_cache cache ____cacheline_aligned_in_smp; }; +#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_600 +/* Calculate the fte_match_param length without the reserved length. + * Make sure the reserved field is the last. + */ +#define MLX5_ST_SZ_DW_MATCH_PARAM \ + ((MLX5_BYTE_OFF(fte_match_param, MLX5_FTE_MATCH_PARAM_RESERVED) / sizeof(u32)) + \ + BUILD_BUG_ON_ZERO(MLX5_ST_SZ_BYTES(fte_match_param) != \ + MLX5_FLD_SZ_BYTES(fte_match_param, \ + MLX5_FTE_MATCH_PARAM_RESERVED) +\ + MLX5_BYTE_OFF(fte_match_param, \ + MLX5_FTE_MATCH_PARAM_RESERVED))) + /* Type of children is mlx5_flow_rule */ struct fs_fte { struct fs_node node; - u32 val[MLX5_ST_SZ_DW(fte_match_param)]; + u32 val[MLX5_ST_SZ_DW_MATCH_PARAM]; u32 dests_size; u32 flow_tag; u32 index; @@ -175,7 +187,7 @@ struct mlx5_flow_namespace { struct mlx5_flow_group_mask { u8 match_criteria_enable; - u32 match_criteria[MLX5_ST_SZ_DW(fte_match_param)]; + u32 match_criteria[MLX5_ST_SZ_DW_MATCH_PARAM]; }; /* Type of children is fs_fte */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f31a0b5377e1..3c7442b56460 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -48,7 +48,7 @@ /* helper macros */ #define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0) #define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld) -#define __mlx5_bit_off(typ, fld) ((unsigned)(unsigned long)(&(__mlx5_nullp(typ)->fld))) +#define __mlx5_bit_off(typ, fld) (offsetof(struct mlx5_ifc_##typ##_bits, fld)) #define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32) #define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64) #define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f)) -- cgit v1.2.3 From 50b4d485528d1dbe0bd249f2073140e3444f4a7b Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Thu, 24 Aug 2017 01:57:56 +0200 Subject: bsg-lib: fix kernel panic resulting from missing allocation of reply-buffer Since we split the scsi_request out of struct request, bsg fails to provide a reply buffer for the drivers. This was done via the pointer for sense data, which is not preallocated anymore. Failing to allocate/assign it results in illegal dereferences because LLDs use this pointer unquestioned. An example panic on s390x, using the zFCP driver, looks like this (I had debugging on, otherwise NULL-pointer dereferences wouldn't even panic on s390x): Unable to handle kernel pointer dereference in virtual kernel address space Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6403 Fault in home space mode while using kernel ASCE.
AS:0000000001590007 R3:0000000000000024 Oops: 0038 ilc:2 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.12.0-bsg-regression+ #3 Hardware name: IBM 2964 N96 702 (z/VM 6.4.0) task: 0000000065cb0100 task.stack: 0000000065cb4000 Krnl PSW : 0704e00180000000 000003ff801e4156 (zfcp_fc_ct_els_job_handler+0x16/0x58 [zfcp]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000001 000000005fa9d0d0 000000005fa9d078 0000000000e16866 000003ff00000290 6b6b6b6b6b6b6b6b 0000000059f78f00 000000000000000f 00000000593a0958 00000000593a0958 0000000060d88800 000000005ddd4c38 0000000058b50100 07000000659cba08 000003ff801e8556 00000000659cb9a8 Krnl Code: 000003ff801e4146: e31020500004 lg %r1,80(%r2) 000003ff801e414c: 58402040 l %r4,64(%r2) #000003ff801e4150: e35020200004 lg %r5,32(%r2) >000003ff801e4156: 50405004 st %r4,4(%r5) 000003ff801e415a: e54c50080000 mvhi 8(%r5),0 000003ff801e4160: e33010280012 lt %r3,40(%r1) 000003ff801e4166: a718fffb lhi %r1,-5 000003ff801e416a: 1803 lr %r0,%r3 Call Trace: ([<000003ff801e8556>] zfcp_fsf_req_complete+0x726/0x768 [zfcp]) [<000003ff801ea82a>] zfcp_fsf_reqid_check+0x102/0x180 [zfcp] [<000003ff801eb980>] zfcp_qdio_int_resp+0x230/0x278 [zfcp] [<00000000009b91b6>] qdio_kick_handler+0x2ae/0x2c8 [<00000000009b9e3e>] __tiqdio_inbound_processing+0x406/0xc10 [<00000000001684c2>] tasklet_action+0x15a/0x1d8 [<0000000000bd28ec>] __do_softirq+0x3ec/0x848 [<00000000001675a4>] irq_exit+0x74/0xf8 [<000000000010dd6a>] do_IRQ+0xba/0xf0 [<0000000000bd19e8>] io_int_handler+0x104/0x2d4 [<00000000001033b6>] enabled_wait+0xb6/0x188 ([<000000000010339e>] enabled_wait+0x9e/0x188) [<000000000010396a>] arch_cpu_idle+0x32/0x50 [<0000000000bd0112>] default_idle_call+0x52/0x68 [<00000000001cd0fa>] do_idle+0x102/0x188 [<00000000001cd41e>] cpu_startup_entry+0x3e/0x48 [<0000000000118c64>] smp_start_secondary+0x11c/0x130 [<0000000000bd2016>] restart_int_handler+0x62/0x78 [<0000000000000000>] (null) INFO: lockdep is turned off. Last Breaking-Event-Address: [<000003ff801e41d6>] zfcp_fc_ct_job_handler+0x3e/0x48 [zfcp] Kernel panic - not syncing: Fatal exception in interrupt This patch moves bsg-lib to allocate and setup struct bsg_job ahead of time, including the allocation of a buffer for the reply-data. This means, struct bsg_job is not allocated separately anymore, but as part of struct request allocation - similar to struct scsi_cmd. Reflect this in the function names that used to handle creation/destruction of struct bsg_job. 
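For readers unfamiliar with the mechanism the fix builds on: the block layer can reserve a driver-private payload (PDU) behind every struct request. A driver sets q->cmd_size before initializing the queue, the extra bytes are then allocated in one piece with the request itself, and blk_mq_rq_to_pdu() returns a pointer to that area. A minimal sketch of the pattern, with illustrative names that are not part of this patch:

	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>

	/* Driver-private per-request state; lives directly behind struct request. */
	struct my_job {
		int status;
	};

	static void my_queue_setup(struct request_queue *q)
	{
		/*
		 * Reserve a PDU behind every request; the block layer
		 * allocates it together with the request, so it cannot
		 * be forgotten the way a separate allocation can.
		 */
		q->cmd_size = sizeof(struct my_job);
	}

	static void my_complete(struct request *rq)
	{
		struct my_job *job = blk_mq_rq_to_pdu(rq);	/* points just past rq */

		job->status = 0;
	}

This is the same trick struct scsi_cmnd plays, which is why the message above draws that comparison; after the fix, struct bsg_job is the PDU and the driver's dd_data area sits behind it at job + 1.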
Reported-by: Steffen Maier Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Benjamin Block Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request") Cc: #4.11+ Signed-off-by: Jens Axboe --- block/bsg-lib.c | 74 +++++++++++++++++++++++++++++-------------------- include/linux/blkdev.h | 1 - include/linux/bsg-lib.h | 2 ++ 3 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c4513b23f57a..dd56d7460cb9 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -29,26 +29,25 @@ #include /** - * bsg_destroy_job - routine to teardown/delete a bsg job + * bsg_teardown_job - routine to teardown a bsg job * @job: bsg_job that is to be torn down */ -static void bsg_destroy_job(struct kref *kref) +static void bsg_teardown_job(struct kref *kref) { struct bsg_job *job = container_of(kref, struct bsg_job, kref); struct request *rq = job->req; - blk_end_request_all(rq, BLK_STS_OK); - put_device(job->dev); /* release reference for the request */ kfree(job->request_payload.sg_list); kfree(job->reply_payload.sg_list); - kfree(job); + + blk_end_request_all(rq, BLK_STS_OK); } void bsg_job_put(struct bsg_job *job) { - kref_put(&job->kref, bsg_destroy_job); + kref_put(&job->kref, bsg_teardown_job); } EXPORT_SYMBOL_GPL(bsg_job_put); @@ -100,7 +99,7 @@ EXPORT_SYMBOL_GPL(bsg_job_done); */ static void bsg_softirq_done(struct request *rq) { - struct bsg_job *job = rq->special; + struct bsg_job *job = blk_mq_rq_to_pdu(rq); bsg_job_put(job); } @@ -122,33 +121,20 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) } /** - * bsg_create_job - create the bsg_job structure for the bsg request + * bsg_prepare_job - create the bsg_job structure for the bsg request * @dev: device that is being sent the bsg request * @req: BSG request that needs a job structure */ -static int bsg_create_job(struct device *dev, struct request *req) +static int bsg_prepare_job(struct device *dev, struct request *req) { struct request *rsp = req->next_rq; - struct request_queue *q = req->q; struct scsi_request *rq = scsi_req(req); - struct bsg_job *job; + struct bsg_job *job = blk_mq_rq_to_pdu(req); int ret; - BUG_ON(req->special); - - job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL); - if (!job) - return -ENOMEM; - - req->special = job; - job->req = req; - if (q->bsg_job_size) - job->dd_data = (void *)&job[1]; job->request = rq->cmd; job->request_len = rq->cmd_len; - job->reply = rq->sense; - job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer - * allocated */ + if (req->bio) { ret = bsg_map_buffer(&job->request_payload, req); if (ret) @@ -187,7 +173,6 @@ static void bsg_request_fn(struct request_queue *q) { struct device *dev = q->queuedata; struct request *req; - struct bsg_job *job; int ret; if (!get_device(dev)) @@ -199,7 +184,7 @@ static void bsg_request_fn(struct request_queue *q) break; spin_unlock_irq(q->queue_lock); - ret = bsg_create_job(dev, req); + ret = bsg_prepare_job(dev, req); if (ret) { scsi_req(req)->result = ret; blk_end_request_all(req, BLK_STS_OK); @@ -207,8 +192,7 @@ static void bsg_request_fn(struct request_queue *q) continue; } - job = req->special; - ret = q->bsg_job_fn(job); + ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req)); spin_lock_irq(q->queue_lock); if (ret) break; @@ -219,6 +203,35 @@ static void bsg_request_fn(struct request_queue *q) spin_lock_irq(q->queue_lock); } +static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) +{ + struct 
bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + + memset(job, 0, sizeof(*job)); + + scsi_req_init(sreq); + sreq->sense_len = SCSI_SENSE_BUFFERSIZE; + sreq->sense = kzalloc(sreq->sense_len, gfp); + if (!sreq->sense) + return -ENOMEM; + + job->req = req; + job->reply = sreq->sense; + job->reply_len = sreq->sense_len; + job->dd_data = job + 1; + + return 0; +} + +static void bsg_exit_rq(struct request_queue *q, struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + + kfree(sreq->sense); +} + /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to @@ -235,7 +248,9 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, q = blk_alloc_queue(GFP_KERNEL); if (!q) return ERR_PTR(-ENOMEM); - q->cmd_size = sizeof(struct scsi_request); + q->cmd_size = sizeof(struct bsg_job) + dd_job_size; + q->init_rq_fn = bsg_init_rq; + q->exit_rq_fn = bsg_exit_rq; q->request_fn = bsg_request_fn; ret = blk_init_allocated_queue(q); @@ -243,7 +258,6 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, goto out_cleanup_queue; q->queuedata = dev; - q->bsg_job_size = dd_job_size; q->bsg_job_fn = job_fn; queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..2a5d52fa90f5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -568,7 +568,6 @@ struct request_queue { #if defined(CONFIG_BLK_DEV_BSG) bsg_job_fn *bsg_job_fn; - int bsg_job_size; struct bsg_class_device bsg_dev; #endif diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index e34dde2da0ef..637a20cfb237 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -24,6 +24,7 @@ #define _BLK_BSG_ #include <linux/blkdev.h> +#include <scsi/scsi_request.h> struct request; struct device; @@ -37,6 +38,7 @@ struct bsg_buffer { }; struct bsg_job { + struct scsi_request sreq; struct device *dev; struct request *req; -- cgit v1.2.3 From ea369ea6d828930c0cf0cacf81f6fd9dff61992d Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 23 Aug 2017 02:33:04 +0200 Subject: rtc: remove .open() and .release() There are no drivers left using .open and .release. There is no good use case for them, as there is nothing the character device interface does that should not be done in the sysfs interface or the in-kernel interface. Remove those callbacks now to avoid future confusion. Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-dev.c | 20 ++++----------------- include/linux/rtc.h | 2 -- 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c index 794bc4fa4937..00efe24a6063 100644 --- a/drivers/rtc/rtc-dev.c +++ b/drivers/rtc/rtc-dev.c @@ -24,28 +24,19 @@ static dev_t rtc_devt; static int rtc_dev_open(struct inode *inode, struct file *file) { - int err; struct rtc_device *rtc = container_of(inode->i_cdev, struct rtc_device, char_dev); - const struct rtc_class_ops *ops = rtc->ops; if (test_and_set_bit_lock(RTC_DEV_BUSY, &rtc->flags)) return -EBUSY; file->private_data = rtc; - err = ops->open ? 
ops->open(rtc->dev.parent) : 0; - if (err == 0) { - spin_lock_irq(&rtc->irq_lock); - rtc->irq_data = 0; - spin_unlock_irq(&rtc->irq_lock); - - return 0; - } + spin_lock_irq(&rtc->irq_lock); + rtc->irq_data = 0; + spin_unlock_irq(&rtc->irq_lock); - /* something has gone wrong */ - clear_bit_unlock(RTC_DEV_BUSY, &rtc->flags); - return err; + return 0; } #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL @@ -438,9 +429,6 @@ static int rtc_dev_release(struct inode *inode, struct file *file) rtc_update_irq_enable(rtc, 0); rtc_irq_set_state(rtc, NULL, 0); - if (rtc->ops->release) - rtc->ops->release(rtc->dev.parent); - clear_bit_unlock(RTC_DEV_BUSY, &rtc->flags); return 0; } diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0a0f0d14a5fb..e6d0f9c1cafd 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -72,8 +72,6 @@ extern struct class *rtc_class; * issued through ioctl() ... */ struct rtc_class_ops { - int (*open)(struct device *); - void (*release)(struct device *); int (*ioctl)(struct device *, unsigned int, unsigned long); int (*read_time)(struct device *, struct rtc_time *); int (*set_time)(struct device *, struct rtc_time *); -- cgit v1.2.3 From ea5311c7e752dbec9bfbdd79992a8772b37f32fa Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 10 Aug 2017 10:54:31 -0600 Subject: PCI: Fix PCIe capability sizes PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 defines the size of the PCIe capability structure for v1 devices with link, but the vfio code also needs to size the capability for devices without link, such as Root Complex Integrated Endpoints. Create a separate define for this, ending the structure before the link fields. Additionally, this reveals that PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 is currently incorrect, ending the capability length before the v2 link fields. Rename this to specify an RC Integrated Endpoint (no link) capability length and move PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 to include the link fields as we have for the v1 version.
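To illustrate why the split matters to a consumer such as vfio, a sizing helper can now pick the right length from the capability version plus whether the device type implements the link registers. The helper below is hypothetical (only the four PCI_CAP_EXP_*_SIZEOF_* defines come from this patch), and it treats Root Complex Integrated Endpoints as the canonical linkless case; a fuller implementation would have to consider other linkless types as well:

	#include <linux/pci.h>

	/* Hypothetical: length of the PCIe capability as exposed by a device. */
	static int pcie_cap_length(struct pci_dev *pdev)
	{
		bool no_link = pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;

		if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) >= 2)
			return no_link ? PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 :
					 PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;

		return no_link ? PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 :
				 PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
	}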
Signed-off-by: Alex Williamson [bhelgaas: add "_" in "PCI_CAP_EXP_RC ENDPOINT_SIZEOF_V2 44"] Signed-off-by: Bjorn Helgaas Reviewed-by: Eric Auger --- include/uapi/linux/pci_regs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c22d3ebaca20..e185d2d39ea6 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -513,6 +513,7 @@ #define PCI_EXP_DEVSTA_URD 0x0008 /* Unsupported Request Detected */ #define PCI_EXP_DEVSTA_AUXPD 0x0010 /* AUX Power Detected */ #define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */ +#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 /* v1 endpoints without link end here */ #define PCI_EXP_LNKCAP 12 /* Link Capabilities */ #define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */ #define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */ @@ -556,7 +557,7 @@ #define PCI_EXP_LNKSTA_DLLLA 0x2000 /* Data Link Layer Link Active */ #define PCI_EXP_LNKSTA_LBMS 0x4000 /* Link Bandwidth Management Status */ #define PCI_EXP_LNKSTA_LABS 0x8000 /* Link Autonomous Bandwidth Status */ -#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 20 /* v1 endpoints end here */ +#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V1 20 /* v1 endpoints with link end here */ #define PCI_EXP_SLTCAP 20 /* Slot Capabilities */ #define PCI_EXP_SLTCAP_ABP 0x00000001 /* Attention Button Present */ #define PCI_EXP_SLTCAP_PCP 0x00000002 /* Power Controller Present */ @@ -639,7 +640,7 @@ #define PCI_EXP_DEVCTL2_OBFF_MSGB_EN 0x4000 /* Enable OBFF Message type B */ #define PCI_EXP_DEVCTL2_OBFF_WAKE_EN 0x6000 /* OBFF using WAKE# signaling */ #define PCI_EXP_DEVSTA2 42 /* Device Status 2 */ -#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints end here */ +#define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V2 44 /* v2 endpoints without link end here */ #define PCI_EXP_LNKCAP2 44 /* Link Capabilities 2 */ #define PCI_EXP_LNKCAP2_SLS_2_5GB 0x00000002 /* Supported Speed 2.5GT/s */ #define PCI_EXP_LNKCAP2_SLS_5_0GB 0x00000004 /* Supported Speed 5.0GT/s */ @@ -647,6 +648,7 @@ #define PCI_EXP_LNKCAP2_CROSSLINK 0x00000100 /* Crosslink supported */ #define PCI_EXP_LNKCTL2 48 /* Link Control 2 */ #define PCI_EXP_LNKSTA2 50 /* Link Status 2 */ +#define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2 52 /* v2 endpoints with link end here */ #define PCI_EXP_SLTCAP2 52 /* Slot Capabilities 2 */ #define PCI_EXP_SLTCTL2 56 /* Slot Control 2 */ #define PCI_EXP_SLTSTA2 58 /* Slot Status 2 */ -- cgit v1.2.3 From f20c4ea49ec4708de97248927ac6138c2d14eba9 Mon Sep 17 00:00:00 2001 From: Dongdong Liu Date: Sat, 19 Aug 2017 17:07:20 +0800 Subject: PCI/DPC: Add eDPC support Add eDPC support. Get and print the RP PIO error information when the trigger condition is RP PIO error. 
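As a quick orientation before the diff: the RP PIO log is self-describing. Bits 11:8 of the DPC capability register (the new PCI_EXP_DPC_RP_PIO_LOG_SIZE mask) give the log size in DWORDs, which the patch validates as 4..9. A hedged sketch of reading it, mirroring what dpc_rp_pio_get_info() does below (the helper name here is made up):

	#include <linux/pci.h>

	/* Illustrative only: RP PIO log size in DWORDs, or 0 if DPC is absent. */
	static u32 rp_pio_log_size(struct pci_dev *pdev)
	{
		int pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DPC);
		u16 cap;

		if (!pos)
			return 0;

		pci_read_config_word(pdev, pos + PCI_EXP_DPC_CAP, &cap);
		return (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8;
	}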
For more information on eDPC, please see PCI Express Base Specification Revision 3.1, section 6.2.10.3, or view the PCI-SIG eDPC ECN here: https://pcisig.com/sites/default/files/specification_documents/ECN_Enhanced_DPC_2012-11-19_final.pdf Signed-off-by: Dongdong Liu Signed-off-by: Bjorn Helgaas Reviewed-by: Keith Busch --- drivers/pci/pcie/pcie-dpc.c | 162 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/pci_regs.h | 10 +++ 2 files changed, 172 insertions(+) (limited to 'include') diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c index c39f32e42b4d..8428d7455e49 100644 --- a/drivers/pci/pcie/pcie-dpc.c +++ b/drivers/pci/pcie/pcie-dpc.c @@ -16,11 +16,55 @@ #include #include "../pci.h" +struct rp_pio_header_log_regs { + u32 dw0; + u32 dw1; + u32 dw2; + u32 dw3; +}; + +struct dpc_rp_pio_regs { + u32 status; + u32 mask; + u32 severity; + u32 syserror; + u32 exception; + + struct rp_pio_header_log_regs header_log; + u32 impspec_log; + u32 tlp_prefix_log[4]; + u32 log_size; + u16 first_error; +}; + struct dpc_dev { struct pcie_device *dev; struct work_struct work; int cap_pos; bool rp; + u32 rp_pio_status; +}; + +static const char * const rp_pio_error_string[] = { + "Configuration Request received UR Completion", /* Bit Position 0 */ + "Configuration Request received CA Completion", /* Bit Position 1 */ + "Configuration Request Completion Timeout", /* Bit Position 2 */ + NULL, + NULL, + NULL, + NULL, + NULL, + "I/O Request received UR Completion", /* Bit Position 8 */ + "I/O Request received CA Completion", /* Bit Position 9 */ + "I/O Request Completion Timeout", /* Bit Position 10 */ + NULL, + NULL, + NULL, + NULL, + NULL, + "Memory Request received UR Completion", /* Bit Position 16 */ + "Memory Request received CA Completion", /* Bit Position 17 */ + "Memory Request Completion Timeout", /* Bit Position 18 */ }; static int dpc_wait_rp_inactive(struct dpc_dev *dpc) @@ -79,10 +123,124 @@ static void interrupt_event_handler(struct work_struct *work) dpc_wait_link_inactive(pdev); if (dpc->rp && dpc_wait_rp_inactive(dpc)) return; + if (dpc->rp && dpc->rp_pio_status) { + pci_write_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_STATUS, + dpc->rp_pio_status); + dpc->rp_pio_status = 0; + } + pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS, PCI_EXP_DPC_STATUS_TRIGGER | PCI_EXP_DPC_STATUS_INTERRUPT); } +static void dpc_rp_pio_print_tlp_header(struct device *dev, + struct rp_pio_header_log_regs *t) +{ + dev_err(dev, "TLP Header: %#010x %#010x %#010x %#010x\n", + t->dw0, t->dw1, t->dw2, t->dw3); +} + +static void dpc_rp_pio_print_error(struct dpc_dev *dpc, + struct dpc_rp_pio_regs *rp_pio) +{ + struct device *dev = &dpc->dev->device; + int i; + u32 status; + + dev_err(dev, "rp_pio_status: %#010x, rp_pio_mask: %#010x\n", + rp_pio->status, rp_pio->mask); + + dev_err(dev, "RP PIO severity=%#010x, syserror=%#010x, exception=%#010x\n", + rp_pio->severity, rp_pio->syserror, rp_pio->exception); + + status = (rp_pio->status & ~rp_pio->mask); + + for (i = 0; i < ARRAY_SIZE(rp_pio_error_string); i++) { + if (!(status & (1 << i))) + continue; + + dev_err(dev, "[%2d] %s%s\n", i, rp_pio_error_string[i], + rp_pio->first_error == i ? 
" (First)" : ""); + } + + dpc_rp_pio_print_tlp_header(dev, &rp_pio->header_log); + if (rp_pio->log_size == 4) + return; + dev_err(dev, "RP PIO ImpSpec Log %#010x\n", rp_pio->impspec_log); + + for (i = 0; i < rp_pio->log_size - 5; i++) + dev_err(dev, "TLP Prefix Header: dw%d, %#010x\n", i, + rp_pio->tlp_prefix_log[i]); +} + +static void dpc_rp_pio_get_info(struct dpc_dev *dpc, + struct dpc_rp_pio_regs *rp_pio) +{ + struct pci_dev *pdev = dpc->dev->port; + struct device *dev = &dpc->dev->device; + int i; + u16 cap; + u16 status; + + pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_STATUS, + &rp_pio->status); + pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_MASK, + &rp_pio->mask); + + pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_SEVERITY, + &rp_pio->severity); + pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_SYSERROR, + &rp_pio->syserror); + pci_read_config_dword(pdev, dpc->cap_pos + PCI_EXP_DPC_RP_PIO_EXCEPTION, + &rp_pio->exception); + + /* Get First Error Pointer */ + pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_STATUS, &status); + rp_pio->first_error = (status & 0x1f00) >> 8; + + pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap); + rp_pio->log_size = (cap & PCI_EXP_DPC_RP_PIO_LOG_SIZE) >> 8; + if (rp_pio->log_size < 4 || rp_pio->log_size > 9) { + dev_err(dev, "RP PIO log size %u is invalid\n", + rp_pio->log_size); + return; + } + + pci_read_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG, + &rp_pio->header_log.dw0); + pci_read_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 4, + &rp_pio->header_log.dw1); + pci_read_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 8, + &rp_pio->header_log.dw2); + pci_read_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_HEADER_LOG + 12, + &rp_pio->header_log.dw3); + if (rp_pio->log_size == 4) + return; + + pci_read_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, + &rp_pio->impspec_log); + for (i = 0; i < rp_pio->log_size - 5; i++) + pci_read_config_dword(pdev, + dpc->cap_pos + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG, + &rp_pio->tlp_prefix_log[i]); +} + +static void dpc_process_rp_pio_error(struct dpc_dev *dpc) +{ + struct dpc_rp_pio_regs rp_pio_regs; + + dpc_rp_pio_get_info(dpc, &rp_pio_regs); + dpc_rp_pio_print_error(dpc, &rp_pio_regs); + + dpc->rp_pio_status = rp_pio_regs.status; +} + static irqreturn_t dpc_irq(int irq, void *context) { struct dpc_dev *dpc = (struct dpc_dev *)context; @@ -109,6 +267,10 @@ static irqreturn_t dpc_irq(int irq, void *context) (ext_reason == 0) ? "RP PIO error" : (ext_reason == 1) ? 
"software trigger" : "reserved error"); + /* show RP PIO error detail information */ + if (reason == 3 && ext_reason == 0) + dpc_process_rp_pio_error(dpc); + schedule_work(&dpc->work); } return IRQ_HANDLED; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c22d3ebaca20..1ce96275531c 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -967,6 +967,7 @@ #define PCI_EXP_DPC_CAP_RP_EXT 0x20 /* Root Port Extensions for DPC */ #define PCI_EXP_DPC_CAP_POISONED_TLP 0x40 /* Poisoned TLP Egress Blocking Supported */ #define PCI_EXP_DPC_CAP_SW_TRIGGER 0x80 /* Software Triggering Supported */ +#define PCI_EXP_DPC_RP_PIO_LOG_SIZE 0xF00 /* RP PIO log size */ #define PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000 /* ERR_COR signal on DL_Active supported */ #define PCI_EXP_DPC_CTL 6 /* DPC control */ @@ -980,6 +981,15 @@ #define PCI_EXP_DPC_SOURCE_ID 10 /* DPC Source Identifier */ +#define PCI_EXP_DPC_RP_PIO_STATUS 0x0C /* RP PIO Status */ +#define PCI_EXP_DPC_RP_PIO_MASK 0x10 /* RP PIO MASK */ +#define PCI_EXP_DPC_RP_PIO_SEVERITY 0x14 /* RP PIO Severity */ +#define PCI_EXP_DPC_RP_PIO_SYSERROR 0x18 /* RP PIO SysError */ +#define PCI_EXP_DPC_RP_PIO_EXCEPTION 0x1C /* RP PIO Exception */ +#define PCI_EXP_DPC_RP_PIO_HEADER_LOG 0x20 /* RP PIO Header Log */ +#define PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG 0x30 /* RP PIO ImpSpec Log */ +#define PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG 0x34 /* RP PIO TLP Prefix Log */ + /* Precision Time Measurement */ #define PCI_PTM_CAP 0x04 /* PTM Capability */ #define PCI_PTM_CAP_REQ 0x00000001 /* Requester capable */ -- cgit v1.2.3 From 1177009131bee310421f5c04c43d3777cbacbdc8 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:39:59 +0200 Subject: devlink: Add Ethernet header for dpipe This will be used by the IPv4 host table which will be introduced in the following patches. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 8 ++++++++ net/core/devlink.c | 17 +++++++++++++++++ 3 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index ed7687bbf5d0..ddb8b5227580 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -328,6 +328,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb, struct devlink_dpipe_action *action); int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match); +extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; #else diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b0e807ac53bb..a855f8dcc8ee 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -226,4 +226,12 @@ enum devlink_dpipe_action_type { DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY, }; +enum devlink_dpipe_field_ethernet_id { + DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, +}; + +enum devlink_dpipe_header_id { + DEVLINK_DPIPE_HEADER_ETHERNET, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index a0adfc31a3fe..4f6f3d601785 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -29,6 +29,23 @@ #define CREATE_TRACE_POINTS #include +static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { + { + .name = "destination_mac", + .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, + .bitwidth = 48, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ethernet = { + .name = "ethernet", + .id = DEVLINK_DPIPE_HEADER_ETHERNET, + .fields = devlink_dpipe_fields_ethernet, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), + .global = true, +}; +EXPORT_SYMBOL(devlink_dpipe_header_ethernet); + EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From 3fb886ecea93605a8ea14e258ff3158b8966781e Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:40:00 +0200 Subject: devlink: Add IPv4 header for dpipe This will be used by the IPv4 host table which will be introduced in the following patches. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index ddb8b5227580..8ff8a6f77f29 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -329,6 +329,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb, int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match); extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; +extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; #else diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a855f8dcc8ee..6c172548589d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -230,8 +230,13 @@ enum devlink_dpipe_field_ethernet_id { DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, }; +enum devlink_dpipe_field_ipv4_id { + DEVLINK_DPIPE_FIELD_IPV4_DST_IP, +}; + enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_ETHERNET, + DEVLINK_DPIPE_HEADER_IPV4, }; #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 4f6f3d601785..fcf5dd662882 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -46,6 +46,23 @@ struct devlink_dpipe_header devlink_dpipe_header_ethernet = { }; EXPORT_SYMBOL(devlink_dpipe_header_ethernet); +static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { + { + .name = "destination ip", + .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP, + .bitwidth = 32, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { + .name = "ipv4", + .id = DEVLINK_DPIPE_HEADER_IPV4, + .fields = devlink_dpipe_fields_ipv4, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), + .global = true, +}; +EXPORT_SYMBOL(devlink_dpipe_header_ipv4); + EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From ffd3cdccf214cf0df08856a6738544076c4cd548 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:40:02 +0200 Subject: devlink: Add support for dynamic table size Up until now the dpipe table's size was static and known at registration time. The host table does not have a constant size and is resized dynamically. In order to support this behavior the size is changed to be obtained dynamically via an op. This patch also adjusts the current dpipe table for the new API. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c | 13 +++++++++---- include/net/devlink.h | 7 +++---- net/core/devlink.c | 12 +++++++----- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index e9e1f7aeb04f..9eb265bd5430 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -301,24 +301,29 @@ static int mlxsw_sp_dpipe_table_erif_counters_update(void *priv, bool enable) return 0; } +static u64 mlxsw_sp_dpipe_table_erif_size_get(void *priv) +{ + struct mlxsw_sp *mlxsw_sp = priv; + + return MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); +} + static struct devlink_dpipe_table_ops mlxsw_sp_erif_ops = { .matches_dump = mlxsw_sp_dpipe_table_erif_matches_dump, .actions_dump = mlxsw_sp_dpipe_table_erif_actions_dump, .entries_dump = mlxsw_sp_dpipe_table_erif_entries_dump, .counters_set_update = mlxsw_sp_dpipe_table_erif_counters_update, + .size_get = mlxsw_sp_dpipe_table_erif_size_get, }; static int mlxsw_sp_dpipe_erif_table_init(struct mlxsw_sp *mlxsw_sp) { struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); - u64 table_size; - table_size = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); return devlink_dpipe_table_register(devlink, MLXSW_SP_DPIPE_TABLE_NAME_ERIF, &mlxsw_sp_erif_ops, - mlxsw_sp, table_size, - false); + mlxsw_sp, false); } static void mlxsw_sp_dpipe_erif_table_fini(struct mlxsw_sp *mlxsw_sp) diff --git a/include/net/devlink.h b/include/net/devlink.h index 8ff8a6f77f29..e96272b2cfec 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -178,7 +178,6 @@ struct devlink_dpipe_table_ops; * struct devlink_dpipe_table - table object * @priv: private * @name: table name - * @size: maximum number of entries * @counters_enabled: indicates if counters are active * @counter_control_extern: indicates if counter control is in dpipe or * external tool @@ -189,7 +188,6 @@ struct devlink_dpipe_table { void *priv; struct list_head list; const char *name; - u64 size; bool counters_enabled; bool counter_control_extern; struct devlink_dpipe_table_ops *table_ops; @@ -204,6 +202,7 @@ struct devlink_dpipe_table { * @counters_set_update - when changing the counter status hardware sync * maybe needed to allocate/free counter related * resources + * @size_get - get size */ struct devlink_dpipe_table_ops { int (*actions_dump)(void *priv, struct sk_buff *skb); @@ -211,6 +210,7 @@ struct devlink_dpipe_table_ops { int (*entries_dump)(void *priv, bool counters_enabled, struct devlink_dpipe_dump_ctx *dump_ctx); int (*counters_set_update)(void *priv, bool enable); + u64 (*size_get)(void *priv); }; /** @@ -311,8 +311,7 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index); int devlink_dpipe_table_register(struct devlink *devlink, const char *table_name, struct devlink_dpipe_table_ops *table_ops, - void *priv, u64 size, - bool counter_control_extern); + void *priv, bool counter_control_extern); void devlink_dpipe_table_unregister(struct devlink *devlink, const char *table_name); int devlink_dpipe_headers_register(struct devlink *devlink, diff --git a/net/core/devlink.c b/net/core/devlink.c index fcf5dd662882..87062ff36ab4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1647,13 +1647,15 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, struct devlink_dpipe_table *table) { struct nlattr *table_attr; + u64 table_size; + table_size 
= table->table_ops->size_get(table->priv); table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE); if (!table_attr) return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table->size, + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size, DEVLINK_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, @@ -2718,20 +2720,21 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); * @table_name: table name * @table_ops: table ops * @priv: priv - * @size: size * @counter_control_extern: external control for counters */ int devlink_dpipe_table_register(struct devlink *devlink, const char *table_name, struct devlink_dpipe_table_ops *table_ops, - void *priv, u64 size, - bool counter_control_extern) + void *priv, bool counter_control_extern) { struct devlink_dpipe_table *table; if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) return -EEXIST; + if (WARN_ON(!table_ops->size_get)) + return -EINVAL; + table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return -ENOMEM; @@ -2739,7 +2742,6 @@ int devlink_dpipe_table_register(struct devlink *devlink, table->name = table_name; table->table_ops = table_ops; table->priv = priv; - table->size = size; table->counter_control_extern = counter_control_extern; mutex_lock(&devlink_mutex); -- cgit v1.2.3 From 3580732448f128c39e7325912bc4368ade5dce7d Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:40:03 +0200 Subject: devlink: Move dpipe entry clear function into devlink The entry clear routine can be shared between the drivers, thus it is moved inside devlink. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_dpipe.c | 24 ++-------------------- include/net/devlink.h | 6 ++++++ net/core/devlink.c | 22 ++++++++++++++++++++ 3 files changed, 30 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c index 9eb265bd5430..1305df9c6bf9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c @@ -114,26 +114,6 @@ static int mlxsw_sp_dpipe_table_erif_matches_dump(void *priv, return devlink_dpipe_match_put(skb, &match); } -static void mlxsw_sp_erif_entry_clear(struct devlink_dpipe_entry *entry) -{ - unsigned int value_count, value_index; - struct devlink_dpipe_value *value; - - value = entry->action_values; - value_count = entry->action_values_count; - for (value_index = 0; value_index < value_count; value_index++) { - kfree(value[value_index].value); - kfree(value[value_index].mask); - } - - value = entry->match_values; - value_count = entry->match_values_count; - for (value_index = 0; value_index < value_count; value_index++) { - kfree(value[value_index].value); - kfree(value[value_index].mask); - } -} - static void mlxsw_sp_erif_match_action_prepare(struct devlink_dpipe_match *match, struct devlink_dpipe_action *action) @@ -270,12 +250,12 @@ start_again: goto start_again; rtnl_unlock(); - mlxsw_sp_erif_entry_clear(&entry); + devlink_dpipe_entry_clear(&entry); return 0; err_entry_append: err_entry_get: rtnl_unlock(); - mlxsw_sp_erif_entry_clear(&entry); + devlink_dpipe_entry_clear(&entry); return err; } diff --git a/include/net/devlink.h b/include/net/devlink.h index e96272b2cfec..047a0c54f652 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -323,6 +323,7 @@ int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx); int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, struct devlink_dpipe_entry *entry); int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx); +void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry); int devlink_dpipe_action_put(struct sk_buff *skb, struct devlink_dpipe_action *action); int devlink_dpipe_match_put(struct sk_buff *skb, @@ -448,6 +449,11 @@ devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) return 0; } +static inline void +devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) +{ +} + static inline int devlink_dpipe_action_put(struct sk_buff *skb, struct devlink_dpipe_action *action) diff --git a/net/core/devlink.c b/net/core/devlink.c index 87062ff36ab4..194708aa5f11 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1996,6 +1996,28 @@ int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) } EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); +void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) + +{ + unsigned int value_count, value_index; + struct devlink_dpipe_value *value; + + value = entry->action_values; + value_count = entry->action_values_count; + for (value_index = 0; value_index < value_count; value_index++) { + kfree(value[value_index].value); + kfree(value[value_index].mask); + } + + value = entry->match_values; + value_count = entry->match_values_count; + for (value_index = 0; value_index < value_count; value_index++) { + kfree(value[value_index].value); + kfree(value[value_index].mask); + } +} +EXPORT_SYMBOL(devlink_dpipe_entry_clear); + static int 
devlink_dpipe_entries_fill(struct genl_info *info, enum devlink_command cmd, int flags, struct devlink_dpipe_table *table) -- cgit v1.2.3 From 0d03510038bda70b5a4a252e8216822e6ce0cbdb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:02 +0200 Subject: netfilter: conntrack: compute l3proto nla size at compile time This avoids a pointer and allows the struct to be const later on. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 19 ++++++++----------- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 13 +++++++------ net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 14 ++++++++------ net/netfilter/nf_conntrack_netlink.c | 3 ++- net/netfilter/nf_conntrack_proto.c | 9 +++------ 5 files changed, 28 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 1b8de164d744..6a27ffea7480 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -20,6 +20,9 @@ struct nf_conntrack_l3proto { /* L3 Protocol Family number. ex) PF_INET */ u_int16_t l3proto; + /* size of tuple nlattr, fills a hole */ + u16 nla_size; + /* Protocol name */ const char *name; @@ -49,23 +52,17 @@ struct nf_conntrack_l3proto { int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); - - /* Called when netns wants to use connection tracking */ - int (*net_ns_get)(struct net *); - void (*net_ns_put)(struct net *); - - /* - * Calculate size of tuple nlattr - */ - int (*nlattr_tuple_size)(void); - int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; +#endif - size_t nla_size; + /* Called when netns wants to use connection tracking */ + int (*net_ns_get)(struct net *); + void (*net_ns_put)(struct net *); /* Module (if any) which this is connected to. 
*/ struct module *me; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index de5f0e6ddd1b..9fb8cb033d80 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -303,11 +303,6 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], return 0; } - -static int ipv4_nlattr_tuple_size(void) -{ - return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1); -} #endif static struct nf_sockopt_ops so_getorigdst = { @@ -365,9 +360,10 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .get_l4proto = ipv4_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, - .nlattr_tuple_size = ipv4_nlattr_tuple_size, .nlattr_to_tuple = ipv4_nlattr_to_tuple, .nla_policy = ipv4_nla_policy, + .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */ + NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */ #endif .net_ns_get = ipv4_hooks_register, .net_ns_put = ipv4_hooks_unregister, @@ -421,6 +417,11 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) need_conntrack(); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) != + nf_conntrack_l3proto_ipv4.nla_size)) + return -EINVAL; +#endif ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index ddef5ee9e0a8..6b4d59fd0214 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -308,11 +308,6 @@ static int ipv6_nlattr_to_tuple(struct nlattr *tb[], return 0; } - -static int ipv6_nlattr_tuple_size(void) -{ - return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1); -} #endif static int ipv6_hooks_register(struct net *net) @@ -360,9 +355,10 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .get_l4proto = ipv6_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv6_tuple_to_nlattr, - .nlattr_tuple_size = ipv6_nlattr_tuple_size, .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, + .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) + + NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])), #endif .net_ns_get = ipv6_hooks_register, .net_ns_put = ipv6_hooks_unregister, @@ -421,6 +417,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) need_conntrack(); +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) != + nf_conntrack_l3proto_ipv6.nla_size)) + return -EINVAL; +#endif + ret = nf_register_sockopt(&so_getorigdst6); if (ret < 0) { pr_err("Unable to register netfilter socket option\n"); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f4ca48817f66..b59a453a0fd8 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -540,7 +540,8 @@ static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) size_t len = 0; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - len += l3proto->nla_size; + len = l3proto->nla_size; + len *= 3u; /* ORIG, REPLY, MASTER */ l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); len += l4proto->nla_size; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 27810cf816a6..85104a27cc89 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ 
b/net/netfilter/nf_conntrack_proto.c @@ -214,10 +214,10 @@ int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) if (proto->l3proto >= NFPROTO_NUMPROTO) return -EBUSY; - - if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size) +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + if (proto->tuple_to_nlattr && proto->nla_size == 0) return -EINVAL; - +#endif mutex_lock(&nf_ct_proto_mutex); old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], lockdep_is_held(&nf_ct_proto_mutex)); @@ -226,9 +226,6 @@ int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) goto out_unlock; } - if (proto->nlattr_tuple_size) - proto->nla_size = 3 * proto->nlattr_tuple_size(); - rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); out_unlock: -- cgit v1.2.3 From a3134d537f8209f5b149d7ed9f287047158845f0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:03 +0200 Subject: netfilter: conntrack: remove protocol name from l3proto struct no need to waste storage for something that is only needed in one place and can be deduced from protocol number. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 3 --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 - net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 - net/netfilter/nf_conntrack_l3proto_generic.c | 1 - net/netfilter/nf_conntrack_standalone.c | 12 +++++++++++- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 6a27ffea7480..e31861e4fa6a 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -23,9 +23,6 @@ struct nf_conntrack_l3proto { /* size of tuple nlattr, fills a hole */ u16 nla_size; - /* Protocol name */ - const char *name; - /* * Try to fill in the third arg: nhoff is offset of l3 proto * hdr. Return true if possible. 
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 9fb8cb033d80..9f7ea862068c 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -353,7 +353,6 @@ static void ipv4_hooks_unregister(struct net *net) struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .l3proto = PF_INET, - .name = "ipv4", .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, .print_tuple = ipv4_print_tuple, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 6b4d59fd0214..91d37fbe28de 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -348,7 +348,6 @@ static void ipv6_hooks_unregister(struct net *net) struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .l3proto = PF_INET6, - .name = "ipv6", .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, .print_tuple = ipv6_print_tuple, diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index cf9ace70bece..0387971582bc 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -64,7 +64,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, - .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5b6c675d55b1..359d7e6a4503 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -198,6 +198,16 @@ ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) } #endif +static const char* l3proto_name(u16 proto) +{ + switch (proto) { + case AF_INET: return "ipv4"; + case AF_INET6: return "ipv6"; + } + + return "unknown"; +} + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -231,7 +241,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u %ld ", - l3proto->name, nf_ct_l3num(ct), + l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), l4proto->name, nf_ct_protonum(ct), nf_ct_expires(ct) / HZ); -- cgit v1.2.3 From 09ec82f5af99d1e35614eb0844b920fc335a313d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:04 +0200 Subject: netfilter: conntrack: remove protocol name from l4proto struct no need to waste storage for something that is only needed in one place and can be deduced from protocol number. 
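The deduction is a plain switch on the protocol number; condensed from the helper this patch adds to nf_conntrack_standalone.c:

static const char *l4proto_name(u16 proto)
{
	switch (proto) {
	case IPPROTO_ICMP: return "icmp";
	case IPPROTO_TCP: return "tcp";
	case IPPROTO_UDP: return "udp";
	/* dccp, gre, sctp and udplite are mapped the same way */
	}

	return "unknown";
}
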
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 3 --- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 1 - net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 1 - net/netfilter/nf_conntrack_proto.c | 8 ++++---- net/netfilter/nf_conntrack_proto_dccp.c | 2 -- net/netfilter/nf_conntrack_proto_generic.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_proto_sctp.c | 2 -- net/netfilter/nf_conntrack_proto_tcp.c | 2 -- net/netfilter/nf_conntrack_proto_udp.c | 4 ---- net/netfilter/nf_conntrack_standalone.c | 17 ++++++++++++++++- 11 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index b6e27cafb1d9..47c16bae5e00 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -108,9 +108,6 @@ struct nf_conntrack_l4proto { /* Return the per-net protocol part. */ struct nf_proto_net *(*get_net_proto)(struct net *net); - /* Protocol name */ - const char *name; - /* Module (if any) which this is connected to. */ struct module *me; }; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 73c591d8a9a8..fdbeb03e4600 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -362,7 +362,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, - .name = "icmp", .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .print_tuple = icmp_print_tuple, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index d5f028e33f65..805ab122767a 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -367,7 +367,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, - .name = "icmpv6", .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .print_tuple = icmpv6_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 85104a27cc89..0ecab7163d62 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -437,8 +437,8 @@ int nf_ct_l4proto_register(struct nf_conntrack_l4proto *l4proto[], } if (i != num_proto) { ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4; - pr_err("nf_conntrack_ipv%d: can't register %s%d proto.\n", - ver, l4proto[i]->name, ver); + pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n", + ver, l4proto[i]->l4proto); nf_ct_l4proto_unregister(l4proto, i); } return ret; @@ -458,8 +458,8 @@ int nf_ct_l4proto_pernet_register(struct net *net, break; } if (i != num_proto) { - pr_err("nf_conntrack_%s%d: pernet registration failed\n", - l4proto[i]->name, + pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n", + l4proto[i]->l4proto, l4proto[i]->l3proto == PF_INET6 ? 
6 : 4); nf_ct_l4proto_pernet_unregister(net, l4proto, i); } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 4707d997558a..a0492184a0a8 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -880,7 +880,6 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, - .name = "dccp", .pkt_to_tuple = dccp_pkt_to_tuple, .invert_tuple = dccp_invert_tuple, .new = dccp_new, @@ -916,7 +915,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .l3proto = AF_INET6, .l4proto = IPPROTO_DCCP, - .name = "dccp", .pkt_to_tuple = dccp_pkt_to_tuple, .invert_tuple = dccp_invert_tuple, .new = dccp_new, diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index d5868bad33a7..4fe8b3312823 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -187,7 +187,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = { .l3proto = PF_UNSPEC, .l4proto = 255, - .name = "unknown", .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, .print_tuple = generic_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 87bb40a3feb5..984bcfdbd4d7 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -364,7 +364,6 @@ static int gre_init_net(struct net *net, u_int16_t proto) static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_GRE, - .name = "gre", .pkt_to_tuple = gre_pkt_to_tuple, .invert_tuple = gre_invert_tuple, .print_tuple = gre_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 6eef29d2eec4..1d7a995ea049 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -791,7 +791,6 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, - .name = "sctp", .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, .print_tuple = sctp_print_tuple, @@ -828,7 +827,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_SCTP, - .name = "sctp", .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, .print_tuple = sctp_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 9758a7dfd83e..e3e59e3d0592 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1556,7 +1556,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, - .name = "tcp", .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, .print_tuple = tcp_print_tuple, @@ -1594,7 +1593,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, - .name = "tcp", .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, .print_tuple = tcp_print_tuple, diff --git 
a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index f6ebce6178ca..ec861a1169f1 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -313,7 +313,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_UDP, - .name = "udp", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, @@ -347,7 +346,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, - .name = "udplite", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, @@ -381,7 +379,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, - .name = "udp", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, @@ -415,7 +412,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = { .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, - .name = "udplite", .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 359d7e6a4503..b28f9e93f574 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -208,6 +208,21 @@ static const char* l3proto_name(u16 proto) return "unknown"; } +static const char* l4proto_name(u16 proto) +{ + switch (proto) { + case IPPROTO_ICMP: return "icmp"; + case IPPROTO_TCP: return "tcp"; + case IPPROTO_UDP: return "udp"; + case IPPROTO_DCCP: return "dccp"; + case IPPROTO_GRE: return "gre"; + case IPPROTO_SCTP: return "sctp"; + case IPPROTO_UDPLITE: return "udplite"; + } + + return "unknown"; +} + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -242,7 +257,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u %ld ", l3proto_name(l3proto->l3proto), nf_ct_l3num(ct), - l4proto->name, nf_ct_protonum(ct), + l4proto_name(l4proto->l4proto), nf_ct_protonum(ct), nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) -- cgit v1.2.3 From 036c69400a34cf8f6d39c88325dd510462809e7b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:05 +0200 Subject: netfilter: conntrack: reduce size of l4protocol trackers can use u16 for both, shrinks size by another 8 bytes. 
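Where the 8 bytes come from on an LP64 build: the old block put a size_t ahead of the two function pointers and an unsigned int after them, while the two u16 counters now share a single 4-byte slot. A stand-alone illustration with hypothetical stand-in structs (not kernel code):

#include <stdio.h>

struct before {	/* stand-in for the old ctnl_timeout block */
	size_t obj_size;
	void *nlattr_to_obj, *obj_to_nlattr;
	unsigned int nlattr_max;
	const void *nla_policy;
};

struct after {	/* stand-in for the new layout */
	void *nlattr_to_obj, *obj_to_nlattr;
	unsigned short obj_size, nlattr_max;
	const void *nla_policy;
};

int main(void)
{
	/* prints "before=40 after=32" on LP64: the 8 bytes saved */
	printf("before=%zu after=%zu\n",
	       sizeof(struct before), sizeof(struct after));
	return 0;
}
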
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 47c16bae5e00..15c58dd3f701 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -92,12 +92,12 @@ struct nf_conntrack_l4proto { #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) struct { - size_t obj_size; int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); - unsigned int nlattr_max; + u16 obj_size; + u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #endif -- cgit v1.2.3 From 91950833dd5a34ac6336aa88da6d43aaeb56ac6d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:06 +0200 Subject: netfilter: conntrack: place print_tuple in procfs part CONFIG_NF_CONNTRACK_PROCFS is deprecated, no need to use a function pointer in the trackers for this. Place the printf formatting in the one place that uses it. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 4 -- include/net/netfilter/nf_conntrack_l4proto.h | 4 -- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 8 ---- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 11 ----- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 8 ---- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 11 ----- net/netfilter/nf_conntrack_l3proto_generic.c | 6 --- net/netfilter/nf_conntrack_proto_dccp.c | 10 ----- net/netfilter/nf_conntrack_proto_generic.c | 7 ---- net/netfilter/nf_conntrack_proto_gre.c | 10 ----- net/netfilter/nf_conntrack_proto_sctp.c | 11 ----- net/netfilter/nf_conntrack_proto_tcp.c | 11 ----- net/netfilter/nf_conntrack_proto_udp.c | 13 ------ net/netfilter/nf_conntrack_standalone.c | 58 +++++++++++++++++++++++++- 14 files changed, 56 insertions(+), 116 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index e31861e4fa6a..dabb53b0913c 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -37,10 +37,6 @@ struct nf_conntrack_l3proto { bool (*invert_tuple)(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); - /* Print out the per-protocol part of the tuple. */ - void (*print_tuple)(struct seq_file *s, - const struct nf_conntrack_tuple *); - /* * Called before tracking. * *dataoff: offset of protocol header (TCP, UDP,...) in skb diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 15c58dd3f701..7e8da04a5eb6 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -61,10 +61,6 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* Print out the per-protocol part of the tuple. Return like seq_* */ - void (*print_tuple)(struct seq_file *s, - const struct nf_conntrack_tuple *); - /* Print out the private part of the conntrack. 
*/ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 9f7ea862068c..fe374da4bc13 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -63,13 +63,6 @@ static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void ipv4_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "src=%pI4 dst=%pI4 ", - &tuple->src.u3.ip, &tuple->dst.u3.ip); -} - static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -355,7 +348,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .l3proto = PF_INET, .pkt_to_tuple = ipv4_pkt_to_tuple, .invert_tuple = ipv4_invert_tuple, - .print_tuple = ipv4_print_tuple, .get_l4proto = ipv4_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv4_tuple_to_nlattr, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index fdbeb03e4600..434b4e20f6db 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -71,16 +71,6 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void icmp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - static unsigned int *icmp_get_timeouts(struct net *net) { return &icmp_pernet(net)->timeout; @@ -364,7 +354,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .l4proto = IPPROTO_ICMP, .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, - .print_tuple = icmp_print_tuple, .packet = icmp_packet, .get_timeouts = icmp_get_timeouts, .new = icmp_new, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 91d37fbe28de..fe01dc953c56 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -67,13 +67,6 @@ static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void ipv6_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "src=%pI6 dst=%pI6 ", - tuple->src.u3.ip6, tuple->dst.u3.ip6); -} - static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -350,7 +343,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .l3proto = PF_INET6, .pkt_to_tuple = ipv6_pkt_to_tuple, .invert_tuple = ipv6_invert_tuple, - .print_tuple = ipv6_print_tuple, .get_l4proto = ipv6_get_l4proto, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = ipv6_tuple_to_nlattr, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 805ab122767a..808f63e2e1ff 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -84,16 +84,6 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. 
*/ -static void icmpv6_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "type=%u code=%u id=%u ", - tuple->dst.u.icmp.type, - tuple->dst.u.icmp.code, - ntohs(tuple->src.u.icmp.id)); -} - static unsigned int *icmpv6_get_timeouts(struct net *net) { return &icmpv6_pernet(net)->timeout; @@ -369,7 +359,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = .l4proto = IPPROTO_ICMPV6, .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, - .print_tuple = icmpv6_print_tuple, .packet = icmpv6_packet, .get_timeouts = icmpv6_get_timeouts, .new = icmpv6_new, diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c index 0387971582bc..397e6911214f 100644 --- a/net/netfilter/nf_conntrack_l3proto_generic.c +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -49,11 +49,6 @@ static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static void generic_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ -} - static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, unsigned int *dataoff, u_int8_t *protonum) { @@ -66,7 +61,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { .l3proto = PF_UNSPEC, .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, - .print_tuple = generic_print_tuple, .get_l4proto = generic_get_l4proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index a0492184a0a8..d2df49ac390a 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -623,14 +623,6 @@ static bool dccp_can_early_drop(const struct nf_conn *ct) return false; } -static void dccp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - ntohs(tuple->dst.u.dccp.port)); -} - static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); @@ -887,7 +879,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, - .print_tuple = dccp_print_tuple, .print_conntrack = dccp_print_conntrack, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = dccp_to_nlattr, @@ -922,7 +913,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, - .print_tuple = dccp_print_tuple, .print_conntrack = dccp_print_conntrack, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = dccp_to_nlattr, diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 4fe8b3312823..2bc3d0c1a5bf 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -62,12 +62,6 @@ static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. 
*/ -static void generic_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ -} - static unsigned int *generic_get_timeouts(struct net *net) { return &(generic_pernet(net)->timeout); @@ -189,7 +183,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, .invert_tuple = generic_invert_tuple, - .print_tuple = generic_print_tuple, .packet = generic_packet, .get_timeouts = generic_get_timeouts, .new = generic_new, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 984bcfdbd4d7..cd28095dd7a4 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -224,15 +224,6 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } -/* print gre part of tuple */ -static void gre_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "srckey=0x%x dstkey=0x%x ", - ntohs(tuple->src.u.gre.key), - ntohs(tuple->dst.u.gre.key)); -} - /* print private data for conntrack */ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) { @@ -366,7 +357,6 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, .invert_tuple = gre_invert_tuple, - .print_tuple = gre_print_tuple, .print_conntrack = gre_print_conntrack, .get_timeouts = gre_get_timeouts, .packet = gre_packet, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 1d7a995ea049..da83b401be17 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -174,15 +174,6 @@ static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void sctp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.sctp.port), - ntohs(tuple->dst.u.sctp.port)); -} - /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { @@ -793,7 +784,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, - .print_tuple = sctp_print_tuple, .print_conntrack = sctp_print_conntrack, .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, @@ -829,7 +819,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, - .print_tuple = sctp_print_tuple, .print_conntrack = sctp_print_conntrack, .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index e3e59e3d0592..c868b36b8945 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -301,15 +301,6 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void tcp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.tcp.port), - ntohs(tuple->dst.u.tcp.port)); -} - /* Print out the private part of the conntrack. 
*/ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { @@ -1558,7 +1549,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, - .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, .get_timeouts = tcp_get_timeouts, @@ -1595,7 +1585,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, - .print_tuple = tcp_print_tuple, .print_conntrack = tcp_print_conntrack, .packet = tcp_packet, .get_timeouts = tcp_get_timeouts, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index ec861a1169f1..dcf3030d2226 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -63,15 +63,6 @@ static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -/* Print out the per-protocol part of the tuple. */ -static void udp_print_tuple(struct seq_file *s, - const struct nf_conntrack_tuple *tuple) -{ - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.udp.port), - ntohs(tuple->dst.u.udp.port)); -} - static unsigned int *udp_get_timeouts(struct net *net) { return udp_pernet(net)->timeouts; @@ -316,7 +307,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -349,7 +339,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -382,7 +371,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, @@ -415,7 +403,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, - .print_tuple = udp_print_tuple, .packet = udp_packet, .get_timeouts = udp_get_timeouts, .new = udp_new, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b28f9e93f574..9eb85858d764 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -41,8 +41,62 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { - l3proto->print_tuple(s, tuple); - l4proto->print_tuple(s, tuple); + switch (l3proto->l3proto) { + case NFPROTO_IPV4: + seq_printf(s, "src=%pI4 dst=%pI4 ", + &tuple->src.u3.ip, &tuple->dst.u3.ip); + break; + case NFPROTO_IPV6: + seq_printf(s, "src=%pI6 dst=%pI6 ", + tuple->src.u3.ip6, tuple->dst.u3.ip6); + break; + default: + break; + } + + switch (l4proto->l4proto) { + case IPPROTO_ICMP: + seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); + break; + case IPPROTO_TCP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + 
ntohs(tuple->dst.u.tcp.port)); + break; + case IPPROTO_UDPLITE: /* fallthrough */ + case IPPROTO_UDP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); + + break; + case IPPROTO_DCCP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.dccp.port), + ntohs(tuple->dst.u.dccp.port)); + break; + case IPPROTO_SCTP: + seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); + break; + case IPPROTO_ICMPV6: + seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); + break; + case IPPROTO_GRE: + seq_printf(s, "srckey=0x%x dstkey=0x%x ", + ntohs(tuple->src.u.gre.key), + ntohs(tuple->dst.u.gre.key)); + break; + default: + break; + } } EXPORT_SYMBOL_GPL(print_tuple); -- cgit v1.2.3 From ea48cc83cf612fddb4e8868369348b9f936cfedb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:07 +0200 Subject: netfilter: conntrack: print_conntrack only needed if CONFIG_NF_CONNTRACK_PROCFS Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 7 ++++--- net/netfilter/nf_conntrack_proto_dccp.c | 6 ++++++ net/netfilter/nf_conntrack_proto_gre.c | 4 ++++ net/netfilter/nf_conntrack_proto_sctp.c | 6 ++++++ net/netfilter/nf_conntrack_proto_tcp.c | 6 ++++++ 5 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 7e8da04a5eb6..4976ef92dc78 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -61,9 +61,6 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* Print out the private part of the conntrack. */ - void (*print_conntrack)(struct seq_file *s, struct nf_conn *); - /* Return the array of timeouts for this protocol. */ unsigned int *(*get_timeouts)(struct net *net); @@ -96,6 +93,10 @@ struct nf_conntrack_l4proto { u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; +#endif +#ifdef CONFIG_NF_CONNTRACK_PROCFS + /* Print out the private part of the conntrack. 
*/ + void (*print_conntrack)(struct seq_file *s, struct nf_conn *); #endif unsigned int *net_id; /* Init l4proto pernet data */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index d2df49ac390a..188347571fc7 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -623,10 +623,12 @@ static bool dccp_can_early_drop(const struct nf_conn *ct) return false; } +#ifdef CONFIG_NF_CONNTRACK_PROCFS static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); } +#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, @@ -879,7 +881,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, +#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = dccp_to_nlattr, .nlattr_size = dccp_nlattr_size, @@ -913,7 +917,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .get_timeouts = dccp_get_timeouts, .error = dccp_error, .can_early_drop = dccp_can_early_drop, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, +#endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = dccp_to_nlattr, .nlattr_size = dccp_nlattr_size, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index cd28095dd7a4..c0e3a23ac23a 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -224,6 +224,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, return true; } +#ifdef CONFIG_NF_CONNTRACK_PROCFS /* print private data for conntrack */ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) { @@ -231,6 +232,7 @@ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) (ct->proto.gre.timeout / HZ), (ct->proto.gre.stream_timeout / HZ)); } +#endif static unsigned int *gre_get_timeouts(struct net *net) { @@ -357,7 +359,9 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, .invert_tuple = gre_invert_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, +#endif .get_timeouts = gre_get_timeouts, .packet = gre_packet, .new = gre_new, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index da83b401be17..890b5c73368d 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -174,11 +174,13 @@ static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } +#ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. 
*/ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } +#endif #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ for ((offset) = (dataoff) + sizeof(struct sctphdr), (count) = 0; \ @@ -784,7 +786,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, +#endif .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, .new = sctp_new, @@ -819,7 +823,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .l4proto = IPPROTO_SCTP, .pkt_to_tuple = sctp_pkt_to_tuple, .invert_tuple = sctp_invert_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, +#endif .packet = sctp_packet, .get_timeouts = sctp_get_timeouts, .new = sctp_new, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index c868b36b8945..33c52d9ab2f5 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -301,11 +301,13 @@ static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } +#ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } +#endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { @@ -1549,7 +1551,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, +#endif .packet = tcp_packet, .get_timeouts = tcp_get_timeouts, .new = tcp_new, @@ -1585,7 +1589,9 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .l4proto = IPPROTO_TCP, .pkt_to_tuple = tcp_pkt_to_tuple, .invert_tuple = tcp_invert_tuple, +#ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, +#endif .packet = tcp_packet, .get_timeouts = tcp_get_timeouts, .new = tcp_new, -- cgit v1.2.3 From b3480fe059ac9121b5714205b4ddae14b59ef4be Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 12 Aug 2017 00:57:08 +0200 Subject: netfilter: conntrack: make protocol tracker pointers const Doesn't change generated code, but will make it easier to eventually make the actual trackers themselves const.
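[ Editor's sketch, not part of the patch: with the const conversion, callers can hold the looked-up tracker through a const-qualified pointer, so stray writes to the shared object fail at compile time. A minimal illustration using the lookup helper from the diffs below:

	const struct nf_conntrack_l4proto *l4proto;

	l4proto = __nf_ct_l4proto_find(NFPROTO_IPV4, IPPROTO_TCP);
	/* l4proto->packet = my_packet_fn;  -- would no longer compile */
]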
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 6 +++--- include/net/netfilter/nf_conntrack_l4proto.h | 4 ++-- include/net/netfilter/nf_conntrack_timeout.h | 2 +- net/netfilter/nf_conntrack_core.c | 12 ++++++------ net/netfilter/nf_conntrack_netlink.c | 22 +++++++++++----------- net/netfilter/nf_conntrack_proto.c | 20 ++++++++++---------- net/netfilter/nfnetlink_cttimeout.c | 14 +++++++------- net/netfilter/xt_CT.c | 2 +- net/openvswitch/conntrack.c | 4 ++-- 9 files changed, 43 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index dabb53b0913c..6269deecbee7 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -64,10 +64,10 @@ struct nf_conntrack_l3proto { extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO]; /* Protocol global registration. */ -int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto); -void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto); +int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto); +void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto); -struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); +const struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto); /* Existing built-in protocols */ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 4976ef92dc78..d4933d56809d 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -114,10 +114,10 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO 256 -struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u_int16_t l3proto, +const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); -struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto, +const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto); void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p); diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index b222957062b5..483d104fa254 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -16,7 +16,7 @@ struct ctnl_timeout { refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; __u16 l3num; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; char data[0]; }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f2f00eaf217d..c23df7c9cd59 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -250,8 +250,8 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; unsigned int protoff; u_int8_t protonum; int ret; @@ -400,7 +400,7 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; 
pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); @@ -694,7 +694,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && @@ -1344,10 +1344,10 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts; unsigned int dataoff; u_int8_t protonum; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b59a453a0fd8..de4053d84364 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -109,9 +109,9 @@ nla_put_failure: static int ctnetlink_dump_tuples(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; int ret; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; rcu_read_lock(); l3proto = __nf_ct_l3proto_find(tuple->src.l3num); @@ -163,7 +163,7 @@ nla_put_failure: static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; int ret; @@ -535,9 +535,9 @@ nla_put_failure: static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; - size_t len = 0; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + size_t len; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); len = l3proto->nla_size; @@ -936,8 +936,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { static int ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nf_conntrack_tuple *tuple) { + const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; - struct nf_conntrack_l4proto *l4proto; int ret = 0; ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy, @@ -1580,8 +1580,8 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) { const struct nlattr *attr = cda[CTA_PROTOINFO]; + const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTOINFO_MAX+1]; - struct nf_conntrack_l4proto *l4proto; int err = 0; err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy, @@ -2475,11 +2475,11 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { - int ret; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple m; struct nlattr *nest_parms; + int ret; memset(&m, 0xFF, sizeof(m)); memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 0ecab7163d62..b3e489c859ec 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ 
b/net/netfilter/nf_conntrack_proto.c @@ -65,7 +65,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, } #endif -struct nf_conntrack_l4proto * +const struct nf_conntrack_l4proto * __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) { if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL)) @@ -77,7 +77,7 @@ EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); /* this is guaranteed to always return a valid protocol helper, since * it falls back to generic_protocol */ -struct nf_conntrack_l3proto * +const struct nf_conntrack_l3proto * nf_ct_l3proto_find_get(u_int16_t l3proto) { struct nf_conntrack_l3proto *p; @@ -95,8 +95,8 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); int nf_ct_l3proto_try_module_get(unsigned short l3proto) { + const struct nf_conntrack_l3proto *p; int ret; - struct nf_conntrack_l3proto *p; retry: p = nf_ct_l3proto_find_get(l3proto); if (p == &nf_conntrack_l3proto_generic) { @@ -173,10 +173,10 @@ void nf_ct_netns_put(struct net *net, u8 nfproto) } EXPORT_SYMBOL_GPL(nf_ct_netns_put); -struct nf_conntrack_l4proto * +const struct nf_conntrack_l4proto * nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) { - struct nf_conntrack_l4proto *p; + const struct nf_conntrack_l4proto *p; rcu_read_lock(); p = __nf_ct_l4proto_find(l3num, l4num); @@ -196,18 +196,18 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); static int kill_l3proto(struct nf_conn *i, void *data) { - return nf_ct_l3num(i) == ((struct nf_conntrack_l3proto *)data)->l3proto; + return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto; } static int kill_l4proto(struct nf_conn *i, void *data) { - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; l4proto = data; return nf_ct_protonum(i) == l4proto->l4proto && nf_ct_l3num(i) == l4proto->l3proto; } -int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) +int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto) { int ret = 0; struct nf_conntrack_l3proto *old; @@ -235,7 +235,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); -void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) +void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto) { BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO); @@ -249,7 +249,7 @@ void nf_ct_l3proto_unregister(struct nf_conntrack_l3proto *proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_destroy(kill_l3proto, proto); + nf_ct_iterate_destroy(kill_l3proto, (void*)proto); } EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index fcabccc99f0d..32b1c0b44e79 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -75,7 +75,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, { __u16 l3num; __u8 l4num; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout, *matching = NULL; char *name; int ret; @@ -159,7 +159,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; - struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + const struct nf_conntrack_l4proto *l4proto = timeout->l4proto; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -364,10 +364,10 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { + const struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts; __u16 l3num; __u8 l4num; - struct nf_conntrack_l4proto *l4proto; - unsigned int *timeouts; int ret; if (!cda[CTA_TIMEOUT_L3PROTO] || @@ -454,11 +454,11 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { - __u16 l3num; - __u8 l4num; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l4proto *l4proto; struct sk_buff *skb2; int ret, err; + __u16 l3num; + __u8 l4num; if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 623ef37de886..5a152e2acfd5 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -121,9 +121,9 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT typeof(nf_ct_timeout_find_get_hook) timeout_find_get; + const struct nf_conntrack_l4proto *l4proto; struct ctnl_timeout *timeout; struct nf_conn_timeout *timeout_ext; - struct nf_conntrack_l4proto *l4proto; int ret = 0; u8 proto; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e3c4c6c3fef7..283363534647 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -579,8 +579,8 @@ static struct nf_conn * ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 l3num, struct sk_buff *skb, bool natted) { - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; -- cgit v1.2.3 From 2facaad6000f2322eb40ca379aced31c957f0a41 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 Aug 2017 12:33:08 +0200 Subject: xdp: make generic xdp redirect use tracepoint trace_xdp_redirect If the xdp_do_generic_redirect() call fails, it triggers the trace_xdp_exception tracepoint. It seems better to use the same tracepoint trace_xdp_redirect, as the native xdp_do_redirect{,_map} does. Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- net/core/dev.c | 4 ++-- net/core/filter.c | 35 +++++++++++++++++++++++------------ 3 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7015116331af..d29e58fde364 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -718,7 +718,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software.
*/ -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb); +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); diff --git a/net/core/dev.c b/net/core/dev.c index 40b28e417072..270b54754821 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3953,7 +3953,8 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb); + err = xdp_do_generic_redirect(skb->dev, skb, + xdp_prog); if (err) goto out_redir; /* fallthru to submit skb */ @@ -3966,7 +3967,6 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - trace_xdp_exception(skb->dev, xdp_prog, XDP_REDIRECT); kfree_skb(skb); return XDP_DROP; } diff --git a/net/core/filter.c b/net/core/filter.c index a5e5f31b41b9..a04680331033 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2553,26 +2553,37 @@ out: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb) +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - unsigned int len; u32 index = ri->ifindex; + struct net_device *fwd; + unsigned int len; + int err = 0; - dev = dev_get_by_index_rcu(dev_net(dev), index); + fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; - if (unlikely(!dev)) { - return -EINVAL; + if (unlikely(!fwd)) { + err = -EINVAL; + goto out; } - if (unlikely(!(dev->flags & IFF_UP))) - return -ENETDOWN; - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len > len) - return -E2BIG; + if (unlikely(!(fwd->flags & IFF_UP))) { + err = -ENETDOWN; + goto out; + } - skb->dev = dev; - return 0; + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) { + err = -EMSGSIZE; + goto out; + } + + skb->dev = fwd; +out: + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From a873585587205750e7accfb2c93c29239ffa6e09 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 Aug 2017 12:33:18 +0200 Subject: xdp: remove net_device names from xdp_redirect tracepoint There is too much overhead in the current trace_xdp_redirect tracepoint as it does strcpy and strlen on the net_device names. Besides, exposing the ifindex/index is actually the information that is needed in the tracepoint to diagnose issues. When a lookup fails (either ifindex or devmap index) then there is a need to say which to_index has issues. V2: Adjust args to be aligned with trace_xdp_exception. Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/trace/events/xdp.h | 24 ++++++++++++------------ net/core/filter.c | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 0e42e69f773b..cd37706c6f55 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -51,33 +51,33 @@ TRACE_EVENT(xdp_exception, TRACE_EVENT(xdp_redirect, - TP_PROTO(const struct net_device *from, - const struct net_device *to, - const struct bpf_prog *xdp, u32 act, int err), + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, u32 act, + int to_index, int err), - TP_ARGS(from, to, xdp, act, err), + TP_ARGS(dev, xdp, act, to_index, err), TP_STRUCT__entry( - __string(name_from, from->name) - __string(name_to, to->name) __array(u8, prog_tag, 8) __field(u32, act) + __field(int, ifindex) + __field(int, to_index) __field(int, err) ), TP_fast_assign( BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); - __assign_str(name_from, from->name); - __assign_str(name_to, to->name); - __entry->act = act; - __entry->err = err; + __entry->act = act; + __entry->ifindex = dev->ifindex; + __entry->to_index = to_index; + __entry->err = err; ), - TP_printk("prog=%s from=%s to=%s action=%s err=%d", + TP_printk("prog=%s action=%s ifindex=%d to_index=%d err=%d", __print_hex_str(__entry->prog_tag, 8), - __get_str(name_from), __get_str(name_to), __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->ifindex, __entry->to_index, __entry->err) ); #endif /* _TRACE_XDP_H */ diff --git a/net/core/filter.c b/net/core/filter.c index a04680331033..4bcd6baa80c9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2524,7 +2524,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (likely(!err)) ri->map_to_flush = map; out: - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + trace_xdp_redirect(dev, xdp_prog, XDP_REDIRECT, index, err); return err; } @@ -2548,7 +2548,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = __bpf_tx_xdp(fwd, NULL, xdp, 0); out: - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + trace_xdp_redirect(dev, xdp_prog, XDP_REDIRECT, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -2582,7 +2582,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; out: - trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT, err); + trace_xdp_redirect(dev, xdp_prog, XDP_REDIRECT, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From 315ec3990efd71f87e556cf7827a1ac2d565d5e8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 Aug 2017 12:33:23 +0200 Subject: xdp: get tracepoints xdp_exception and xdp_redirect in sync Remove the net_device string name from the xdp_exception tracepoint, like the xdp_redirect tracepoint. Align the TP_STRUCT to have common entries between these two tracepoints. Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/trace/events/xdp.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index cd37706c6f55..27cf2ef35f5f 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -31,22 +31,22 @@ TRACE_EVENT(xdp_exception, TP_ARGS(dev, xdp, act), TP_STRUCT__entry( - __string(name, dev->name) __array(u8, prog_tag, 8) __field(u32, act) + __field(int, ifindex) ), TP_fast_assign( BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); - __assign_str(name, dev->name); - __entry->act = act; + __entry->act = act; + __entry->ifindex = dev->ifindex; ), - TP_printk("prog=%s device=%s action=%s", + TP_printk("prog=%s action=%s ifindex=%d", __print_hex_str(__entry->prog_tag, 8), - __get_str(name), - __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB)) + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->ifindex) ); TRACE_EVENT(xdp_redirect, -- cgit v1.2.3 From 498ca3c82a7b11e152a46c253f6b2087c929ce00 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Wed, 23 Aug 2017 08:35:40 +0300 Subject: IB/core: Avoid accessing non-allocated memory when inferring port type Commit 44c58487d51a ("IB/core: Define 'ib' and 'roce' rdma_ah_attr types") introduced the concept of type in ah_attr: * During ib_register_device, each port is checked for its type which is stored in ib_device's port_immutable array. * During uverbs' modify_qp, the type is inferred using the port number in ib_uverbs_qp_dest struct (address vector) by accessing the relevant port_immutable array and the type is passed on to providers. IB spec (version 1.3) enforces a valid port value only in Reset to Init. During Init to RTR, the address vector must be valid but port number is not mentioned as a field in the address vector, so its value is not validated, which leads to accesses to non-allocated memory when inferring the port type. Save the real port number in ib_qp during modify to Init (when the comp_mask indicates that the port number is valid) and use this value to infer the port type. Avoid copying the address vector fields if the matching bit is not set in the attr_mask. Address vector can't be modified before the port, so no valid flow is affected.
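[ Editor's sketch, not part of the patch: condensed, the fix below amounts to inferring the address-vector type only when an address vector is really being set, and remembering the validated port on the QP:

	if (cmd->base.attr_mask & IB_QP_AV)
		attr->ah_attr.type = rdma_ah_find_type(qp->device,
						       cmd->base.dest.port_num);
	...
	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
	if (!ret && (attr_mask & IB_QP_PORT))
		qp->port = attr->port_num;	/* the real, validated port */
]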
Fixes: 44c58487d51a ('IB/core: Define 'ib' and 'roce' rdma_ah_attr types') Signed-off-by: Noa Osherovich Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 11 +++++++---- drivers/infiniband/core/verbs.c | 7 ++++++- include/rdma/ib_verbs.h | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 55822ae71955..739bd69ef1d4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1522,6 +1522,7 @@ static int create_qp(struct ib_uverbs_file *file, qp->qp_type = attr.qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); + qp->port = 0; if (attr.send_cq) atomic_inc(&attr.send_cq->usecnt); if (attr.recv_cq) @@ -1962,8 +1963,9 @@ static int modify_qp(struct ib_uverbs_file *file, attr->alt_timeout = cmd->base.alt_timeout; attr->rate_limit = cmd->rate_limit; - attr->ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); + if (cmd->base.attr_mask & IB_QP_AV) + attr->ah_attr.type = rdma_ah_find_type(qp->device, + cmd->base.dest.port_num); if (cmd->base.dest.is_global) { rdma_ah_set_grh(&attr->ah_attr, NULL, cmd->base.dest.flow_label, @@ -1981,8 +1983,9 @@ static int modify_qp(struct ib_uverbs_file *file, rdma_ah_set_port_num(&attr->ah_attr, cmd->base.dest.port_num); - attr->alt_ah_attr.type = rdma_ah_find_type(qp->device, - cmd->base.dest.port_num); + if (cmd->base.attr_mask & IB_QP_ALT_PATH) + attr->alt_ah_attr.type = + rdma_ah_find_type(qp->device, cmd->base.dest.port_num); if (cmd->base.alt_dest.is_global) { rdma_ah_set_grh(&attr->alt_ah_attr, NULL, cmd->base.alt_dest.flow_label, diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 7f8fe443df46..b456e3ca1876 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -838,6 +838,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, spin_lock_init(&qp->mr_lock); INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); + qp->port = 0; if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) return ib_create_xrc_qp(qp, qp_init_attr); @@ -1297,7 +1298,11 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, if (ret) return ret; } - return ib_security_modify_qp(qp, attr, attr_mask, udata); + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (!ret && (attr_mask & IB_QP_PORT)) + qp->port = attr->port_num; + + return ret; } EXPORT_SYMBOL(ib_modify_qp_with_udata); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..88c32aba32f7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1683,6 +1683,7 @@ struct ib_qp { enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; + u8 port; }; struct ib_mr { -- cgit v1.2.3 From 311fc65c9fb9c966bca8e6f3ff8132ce57344ab9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Aug 2017 15:13:29 -0500 Subject: pty: Repair TIOCGPTPEER The implementation of TIOCGPTPEER has two issues. When /dev/ptmx (as opposed to /dev/pts/ptmx) is opened, the wrong vfsmount is passed to dentry_open, which results in the kernel displaying the wrong pathname for the peer. The second is that simply by caching the vfsmount and dentry of the peer, it leaves them open in a way they were not previously, which, because of the increased reference counts, can cause unnecessary behaviour differences resulting in regressions.
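[ Editor's note, not part of the patch: the caching being removed is visible in the old ptmx_open code further below —

	pts_path->mnt = filp->f_path.mnt;
	pts_path->dentry = dentry;
	path_get(pts_path);	/* pins the mount and dentry for the pty's lifetime */

— and those pinned references are exactly what caused the behaviour differences. ]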
To fix these, move the ioctl into tty_io.c at a generic level, allowing the ioctl to have access to the struct file on which the ioctl is being called. This allows the path of the slave to be derived when opening the slave through TIOCGPTPEER instead of requiring that the path to the slave be cached, thus removing the need for caching the path. A new function devpts_ptmx_path is factored out of devpts_acquire and used to implement a function devpts_mntget. The new function devpts_mntget takes a filp to perform the lookup on and fsi so that it can confirm that the superblock that is found by devpts_ptmx_path is the proper superblock. v2: Lots of fixes to make the code actually work v3: Suggestions by Linus - Removed the unnecessary initialization of filp in ptm_open_peer - Simplified devpts_ptmx_path as gotos are no longer required [ This is the fix for the issue that was reverted in commit 143c97cc6529, but this time without breaking 'pbuilder' due to increased reference counts - Linus ] Fixes: 54ebbfb16034 ("tty: add TIOCGPTPEER ioctl") Reported-by: Christian Brauner Reported-and-tested-by: Stefan Lippers-Hollmann Signed-off-by: "Eric W. Biederman" Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 64 ++++++++++++++++++++-------------------- drivers/tty/tty_io.c | 3 +++ fs/devpts/inode.c | 65 +++++++++++++++++++++++++++++++++++------------ include/linux/devpts_fs.h | 10 ++++++++ 4 files changed, 89 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..a6d5164c33a9 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -69,13 +69,8 @@ static void pty_close(struct tty_struct *tty, struct file *filp) #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); - if (tty->link->driver_data) { - struct path *path = tty->link->driver_data; - - devpts_pty_kill(path->dentry); - path_put(path); - kfree(path); - } + if (tty->link->driver_data) + devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif @@ -607,25 +602,24 @@ static inline void legacy_pty_init(void) { } static struct cdev ptmx_cdev; /** - * pty_open_peer - open the peer of a pty - * @tty: the peer of the pty being opened + * ptm_open_peer - open the peer of a pty + * @master: the open struct file of the ptmx device node + * @tty: the master of the pty being opened + * @flags: the flags for open * - * Open the cached dentry in tty->link, providing a safe way for userspace - * to get the slave end of a pty (where they have the master fd and cannot - * access or trust the mount namespace /dev/pts was mounted inside). + * Provide a race free way for userspace to open the slave end of a pty + * (where they have the master fd and cannot access or trust the mount + * namespace /dev/pts was mounted inside).
*/ -static struct file *pty_open_peer(struct tty_struct *tty, int flags) -{ - if (tty->driver->subtype != PTY_TYPE_MASTER) - return ERR_PTR(-EIO); - return dentry_open(tty->link->driver_data, flags, current_cred()); -} - -static int pty_get_peer(struct tty_struct *tty, int flags) +int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { int fd = -1; - struct file *filp = NULL; + struct file *filp; int retval = -EINVAL; + struct path path; + + if (tty->driver != ptm_driver) + return -EIO; fd = get_unused_fd_flags(0); if (fd < 0) { @@ -633,7 +627,16 @@ static int pty_get_peer(struct tty_struct *tty, int flags) goto err; } - filp = pty_open_peer(tty, flags); + /* Compute the slave's path */ + path.mnt = devpts_mntget(master, tty->driver_data); + if (IS_ERR(path.mnt)) { + retval = PTR_ERR(path.mnt); + goto err_put; + } + path.dentry = tty->link->driver_data; + + filp = dentry_open(&path, flags, current_cred()); + mntput(path.mnt); if (IS_ERR(filp)) { retval = PTR_ERR(filp); goto err_put; @@ -662,8 +665,6 @@ static int pty_unix98_ioctl(struct tty_struct *tty, return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); - case TIOCGPTPEER: /* Open the other end */ - return pty_get_peer(tty, (int) arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } @@ -791,7 +792,6 @@ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; - struct path *pts_path; struct dentry *dentry; int retval; int index; @@ -845,26 +845,16 @@ static int ptmx_open(struct inode *inode, struct file *filp) retval = PTR_ERR(dentry); goto err_release; } - /* We need to cache a fake path for TIOCGPTPEER. */ - pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); - if (!pts_path) - goto err_release; - pts_path->mnt = filp->f_path.mnt; - pts_path->dentry = dentry; - path_get(pts_path); - tty->link->driver_data = pts_path; + tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) - goto err_path_put; + goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; -err_path_put: - path_put(pts_path); - kfree(pts_path); err_release: tty_unlock(tty); // This will also put-ref the fsi diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 974b13d24401..10c4038c0e8d 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2518,6 +2518,9 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCSSERIAL: tty_warn_deprecated_flags(p); break; + case TIOCGPTPEER: + /* Special because the struct file is needed */ + return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..7eae33ffa3fc 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,6 +133,50 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } +static int devpts_ptmx_path(struct path *path) +{ + struct super_block *sb; + int err; + + /* Has the devpts filesystem already been found? */ + if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC) + return 0; + + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(path); + if (err) + return err; + + /* Is the path the root of a devpts filesystem? 
*/ + sb = path->mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path->mnt->mnt_root != sb->s_root)) + return -ENODEV; + + return 0; +} + +struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) +{ + struct path path; + int err; + + path = filp->f_path; + path_get(&path); + + err = devpts_ptmx_path(&path); + dput(path.dentry); + if (err) { + mntput(path.mnt); + path.mnt = ERR_PTR(err); + } + if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { + mntput(path.mnt); + path.mnt = ERR_PTR(-ENODEV); + } + return path.mnt; +} + struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; @@ -143,27 +187,16 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); - /* Has the devpts filesystem already been found? */ - sb = path.mnt->mnt_sb; - if (sb->s_magic != DEVPTS_SUPER_MAGIC) { - /* Is a devpts filesystem at "pts" in the same directory? */ - err = path_pts(&path); - if (err) { - result = ERR_PTR(err); - goto out; - } - - /* Is the path the root of a devpts filesystem? */ - result = ERR_PTR(-ENODEV); - sb = path.mnt->mnt_sb; - if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || - (path.mnt->mnt_root != sb->s_root)) - goto out; + err = devpts_ptmx_path(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } /* * pty code needs to hold extra references in case of last /dev/tty close */ + sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); result = DEVPTS_SB(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac2..100cb4343763 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,6 +19,7 @@ struct pts_fs_info; +struct vfsmount *devpts_mntget(struct file *, struct pts_fs_info *); struct pts_fs_info *devpts_acquire(struct file *); void devpts_release(struct pts_fs_info *); @@ -32,6 +33,15 @@ void *devpts_get_priv(struct dentry *); /* unlink */ void devpts_pty_kill(struct dentry *); +/* in pty.c */ +int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags); + +#else +static inline int +ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) +{ + return -EIO; +} #endif -- cgit v1.2.3 From 55f2467cd717241dd941153d4db1e23c91aecf98 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 17 Aug 2017 15:50:35 +0300 Subject: RDMA/mlx4: Fix create qp command alignment Avoid extra padding by swapping the order of inl_recv_sz and reserved; otherwise the 'mlx4_ib_create_qp' structure might be larger than the legacy user input, leading to copying garbage data from the user-space buffer. Fixes: ea30b966f7dd ('IB/mlx4: Add inline-receive support') Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx4-abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index d915cab37ec3..21cce1a4c4dd 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -111,8 +111,8 @@ struct mlx4_ib_create_qp { __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; - __u32 inl_recv_sz; __u8 reserved; + __u32 inl_recv_sz; }; struct mlx4_ib_create_wq { -- cgit v1.2.3 From dcc9881e6767559c04faf15804ac145a2ea026cb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 17 Aug 2017 15:50:36 +0300 Subject: RDMA/(core, ulp): Convert register/unregister event handler to be void The functions ib_register_event_handler() and ib_unregister_event_handler() always returned success and they can't fail.
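[ Editor's sketch, not part of the patch: after the conversion, the register call can no longer fail, so caller error paths like

	if (ib_register_event_handler(&handler))	/* always 0: dead branch */
		goto err;

collapse to a bare

	ib_register_event_handler(&handler);

as the diffs below show for the cache, SA, uverbs, ipoib, iser, opa_vnic and srpt callers. ]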
Let's convert those functions to be void, remove redundant checks and clean up tons of goto statements. Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 23 ++++++++--------------- drivers/infiniband/core/device.c | 8 ++------ drivers/infiniband/core/sa_query.c | 3 +-- drivers/infiniband/core/uverbs_main.c | 13 +------------ drivers/infiniband/ulp/ipoib/ipoib_main.c | 10 +--------- drivers/infiniband/ulp/iser/iser_verbs.c | 6 ++---- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 7 +------ drivers/infiniband/ulp/srpt/ib_srpt.c | 5 ++--- include/rdma/ib_verbs.h | 4 ++-- 9 files changed, 20 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index efc94304dee3..77515638c55c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1199,30 +1199,23 @@ int ib_cache_setup_one(struct ib_device *device) device->cache.ports = kzalloc(sizeof(*device->cache.ports) * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); - if (!device->cache.ports) { - err = -ENOMEM; - goto out; - } + if (!device->cache.ports) + return -ENOMEM; err = gid_table_setup_one(device); - if (err) - goto out; + if (err) { + kfree(device->cache.ports); + device->cache.ports = NULL; + return err; + } for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) ib_cache_update(device, p + rdma_start_port(device), true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); - err = ib_register_event_handler(&device->cache.event_handler); - if (err) - goto err; - + ib_register_event_handler(&device->cache.event_handler); return 0; - -err: - gid_table_cleanup_one(device); -out: - return err; } void ib_cache_release_one(struct ib_device *device) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index fc6be1175183..ec4786777447 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -747,7 +747,7 @@ EXPORT_SYMBOL(ib_set_client_data); * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ -int ib_register_event_handler (struct ib_event_handler *event_handler) +void ib_register_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; @@ -755,8 +755,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler) list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_register_event_handler); @@ -767,15 +765,13 @@ EXPORT_SYMBOL(ib_register_event_handler); * Unregister an event handler registered with * ib_register_event_handler().
*/ -int ib_unregister_event_handler(struct ib_event_handler *event_handler) +void ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 0179b21bad34..ab5e1024fea9 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -2417,8 +2417,7 @@ static void ib_sa_add_one(struct ib_device *device) */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); - if (ib_register_event_handler(&sa_dev->event_handler)) - goto err; + ib_register_event_handler(&sa_dev->event_handler); for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5e530d2bee44..defeda33e27f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -595,7 +595,6 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file { struct ib_uverbs_async_event_file *ev_file; struct file *filp; - int ret; ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) @@ -621,21 +620,11 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, ib_dev, ib_uverbs_event_handler); - ret = ib_register_event_handler(&uverbs_file->event_handler); - if (ret) - goto err_put_file; - + ib_register_event_handler(&uverbs_file->event_handler); /* At that point async file stuff was fully set */ return filp; -err_put_file: - fput(filp); - kref_put(&uverbs_file->async_file->ref, - ib_uverbs_release_async_event_file); - uverbs_file->async_file = NULL; - return ERR_PTR(ret); - err_put_refs: kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ee9f5d281b37..344e8d3d47bd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2227,13 +2227,7 @@ static struct net_device *ipoib_add_port(const char *format, INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); - result = ib_register_event_handler(&priv->event_handler); - if (result < 0) { - printk(KERN_WARNING "%s: ib_register_event_handler failed for " - "port %d (ret = %d)\n", - hca->name, port, result); - goto event_failed; - } + ib_register_event_handler(&priv->event_handler); result = register_netdev(priv->dev); if (result) { @@ -2266,8 +2260,6 @@ register_failed: set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(priv->wq); - -event_failed: ipoib_dev_cleanup(priv->dev); device_init_failed: diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 26a004e97ae0..55a73b0ed4c6 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -106,9 +106,7 @@ static int iser_create_device_ib_res(struct iser_device *device) INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, iser_event_handler); - if (ib_register_event_handler(&device->event_handler)) - goto cq_err; - + 
ib_register_event_handler(&device->event_handler); return 0; cq_err: @@ -141,7 +139,7 @@ static void iser_free_device_ib_res(struct iser_device *device) comp->cq = NULL; } - (void)ib_unregister_event_handler(&device->event_handler); + ib_unregister_event_handler(&device->event_handler); ib_dealloc_pd(device->pd); kfree(device->comps); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 57b862b94dca..21f0b481edcc 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -954,12 +954,7 @@ static int vema_register(struct opa_vnic_ctrl_port *cport) INIT_IB_EVENT_HANDLER(&port->event_handler, cport->ibdev, opa_vnic_event); - ret = ib_register_event_handler(&port->event_handler); - if (ret) { - c_err("port %d: event handler register failed\n", i); - vema_unregister(cport); - return ret; - } + ib_register_event_handler(&port->event_handler); idr_init(&port->vport_idr); mutex_init(&port->lock); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 402275be0931..9e8e9220f816 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2238,7 +2238,7 @@ static int srpt_write_pending(struct se_cmd *se_cmd) cqe, first_wr); cqe = NULL; } - + ret = ib_post_send(ch->qp, first_wr, &bad_wr); if (ret) { pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", @@ -2530,8 +2530,7 @@ static void srpt_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, srpt_event_handler); - if (ib_register_event_handler(&sdev->event_handler)) - goto err_cm; + ib_register_event_handler(&sdev->event_handler); sdev->ioctx_ring = (struct srpt_recv_ioctx **) srpt_alloc_ioctx_ring(sdev, sdev->srq_size, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c155c105589d..e536a052e5dd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2413,8 +2413,8 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask, enum rdma_link_layer ll); -int ib_register_event_handler (struct ib_event_handler *event_handler); -int ib_unregister_event_handler(struct ib_event_handler *event_handler); +void ib_register_event_handler(struct ib_event_handler *event_handler); +void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); int ib_query_port(struct ib_device *device, -- cgit v1.2.3 From 78b57f9529225111b440e6e5150f52f5d44e3c60 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 17 Aug 2017 15:50:37 +0300 Subject: RDMA/core: Cleanup device capability enum Cleanup patch prior to exporting the ib_device_cap_flags to user space. In this patch, we are aligning the indentation and removing the IB_DEVICE_INIT_TYPE and IB_DEVICE_RESERVED fields, because they are not used in the kernel.
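[ Editor's note, not part of the patch: capability bits are consumed as a bitmask, e.g. (illustration only)

	if (device->attrs.device_cap_flags & IB_DEVICE_MEM_WINDOW)
		/* memory windows supported */;

which is why the removed flags keep their bit positions reserved in comments instead of being renumbered — renumbering would silently change the meaning of every higher bit once the enum is exported to user space. ]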
Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e536a052e5dd..355c7a328e0b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -170,7 +170,7 @@ enum ib_device_cap_flags { IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), IB_DEVICE_SHUTDOWN_PORT = (1 << 8), - IB_DEVICE_INIT_TYPE = (1 << 9), + /* Not in use, former INIT_TYPE = (1 << 9),*/ IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), @@ -185,7 +185,7 @@ enum ib_device_cap_flags { * which will always contain a usable lkey. */ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), - IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + /* Reserved, old SEND_W_INV = (1 << 16),*/ IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support @@ -220,7 +220,7 @@ enum ib_device_cap_flags { * of I/O operations with single completion queue managed * by hardware. */ - IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_CROSS_CHANNEL = (1 << 27), IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), -- cgit v1.2.3 From 078b3573030346df0cdc46d798c0f434dc53c2cc Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 17 Aug 2017 15:50:47 +0300 Subject: IB/mlx4: Fix struct mlx4_ib_create_wq alignment The mlx4 ABI defines structures to have an alignment of 64B. Fixes: 400b1ebcfe31 ("IB/mlx4: Add support for WQ related verbs") Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/qp.c | 9 ++++----- include/uapi/rdma/mlx4-abi.h | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 0d2923c2225f..c3958fcfed75 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1046,9 +1046,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (src == MLX4_IB_RWQ_SRC) { - if (ucmd.wq.comp_mask || ucmd.wq.reserved1 || - ucmd.wq.reserved[0] || ucmd.wq.reserved[1] || - ucmd.wq.reserved[2]) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] || + ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) { pr_debug("user command isn't supported\n"); err = -EOPNOTSUPP; goto err; @@ -4146,8 +4145,8 @@ struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, if (!(udata && pd->uobject)) return ERR_PTR(-EINVAL); - required_cmd_sz = offsetof(typeof(ucmd), reserved) + - sizeof(ucmd.reserved); + required_cmd_sz = offsetof(typeof(ucmd), comp_mask) + + sizeof(ucmd.comp_mask); if (udata->inlen < required_cmd_sz) { pr_debug("invalid inlen\n"); return ERR_PTR(-EINVAL); diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 21cce1a4c4dd..0e10102861b5 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -121,7 +121,6 @@ struct mlx4_ib_create_wq { __u8 log_range_size; __u8 reserved[3]; __u32 comp_mask; - __u32 reserved1; }; struct mlx4_ib_modify_wq { -- cgit v1.2.3 From b23673f86fd0d9ccbc088e88e29899b4d3f0f055 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 17 Aug 2017 15:50:48 +0300 Subject: IB/mlx4: Remove redundant attribute in mlx4_ib_create_qp_rss struct rx_key_len is not in use and needs to be removed.
Fixes: 3078f5f1bd8b ("IB/mlx4: Add support for RSS QP") Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx4-abi.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 0e10102861b5..c55f60e05f86 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -98,8 +98,7 @@ struct mlx4_ib_create_qp_rss { __u64 rx_hash_fields_mask; __u8 rx_hash_function; - __u8 rx_key_len; - __u8 reserved[6]; + __u8 reserved[7]; __u8 rx_hash_key[40]; __u32 comp_mask; __u32 reserved1; -- cgit v1.2.3 From 96dc3fc5f1d66b20cdf839d571c7b907e08d5d00 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Thu, 17 Aug 2017 15:52:28 +0300 Subject: IB/mlx5: Expose software parsing for Raw Ethernet QP Software parsing (SWP) is a feature that can be used to instruct the device to stop using its internal parser and to parse packets on the transmit path according to offsets set for each packet. Through this feature, the device allows the handling of checksum and LSO by the hardware according to the location of IP and TCP/UDP headers. Enable SW parsing on the Raw Ethernet send queue by default if the firmware supports it and report these capabilities to user space. Signed-off-by: Noa Osherovich Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 21 +++++++++++++++++++++ drivers/infiniband/hw/mlx5/qp.c | 3 +++ include/uapi/rdma/mlx5-abi.h | 17 +++++++++++++++++ 3 files changed, 41 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 762ef6bf219e..07789450ec42 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -811,6 +811,27 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), reserved, uhw->outlen)) resp.response_length += sizeof(resp.reserved); + if (field_avail(typeof(resp), sw_parsing_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.sw_parsing_caps); + if (MLX5_CAP_ETH(mdev, swp)) { + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING; + + if (MLX5_CAP_ETH(mdev, swp_csum)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_CSUM; + + if (MLX5_CAP_ETH(mdev, swp_lso)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_LSO; + + if (resp.sw_parsing_caps.sw_parsing_offloads) + resp.sw_parsing_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index bc49d14e0a00..656773196f27 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1088,6 +1088,9 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, swp)) + MLX5_SET(sqc, sqc, allow_swp, 1); wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 0b3d30837a9f..64d398e662cd 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -168,6 +168,22 @@ struct mlx5_packet_pacing_caps {
__u32 reserved; }; +enum mlx5_ib_sw_parsing_offloads { + MLX5_IB_SW_PARSING = 1 << 0, + MLX5_IB_SW_PARSING_CSUM = 1 << 1, + MLX5_IB_SW_PARSING_LSO = 1 << 2, +}; + +struct mlx5_ib_sw_parsing_caps { + __u32 sw_parsing_offloads; /* enum mlx5_ib_sw_parsing_offloads */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; +}; + struct mlx5_ib_query_device_resp { __u32 comp_mask; __u32 response_length; @@ -177,6 +193,7 @@ struct mlx5_ib_query_device_resp { struct mlx5_packet_pacing_caps packet_pacing_caps; __u32 mlx5_ib_support_multi_pkt_send_wqes; __u32 reserved; + struct mlx5_ib_sw_parsing_caps sw_parsing_caps; }; struct mlx5_ib_create_cq { -- cgit v1.2.3 From 8b7ff7f3b301de52924cb2cf3fed47b181893116 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Thu, 17 Aug 2017 15:52:29 +0300 Subject: IB/mlx5: Enable UMR for MRs created with reg_create This patch is the first step in decoupling UMR usage and allocation from the MR cache. The only functional change in this patch is to enable UMR for MRs created with reg_create. This change fixes a bug where ODP memory regions that were not allocated from the MR cache did not have UMR enabled. Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 33 ++++++++++++++------------------- include/linux/mlx5/driver.h | 2 +- 3 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7ac991070020..b3380e8beacf 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -503,7 +503,7 @@ struct mlx5_ib_mr { struct mlx5_shared_mr_info *smr_info; struct list_head list; int order; - int umred; + bool allocated_from_cache; int npages; struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index a0eb2f96179a..bc87016021e3 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -48,8 +48,7 @@ enum { #define MLX5_UMR_ALIGN 2048 static int clean_mr(struct mlx5_ib_mr *mr); -static int max_umr_order(struct mlx5_ib_dev *dev); -static int use_umr(struct mlx5_ib_dev *dev, int order); +static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) @@ -184,7 +183,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) break; } mr->order = ent->order; - mr->umred = 1; + mr->allocated_from_cache = 1; mr->dev = dev; MLX5_SET(mkc, mkc, free, 1); @@ -497,7 +496,7 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) int i; c = order2idx(dev, order); - last_umr_cache_entry = order2idx(dev, max_umr_order(dev)); + last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev)); if (c < 0 || c > last_umr_cache_entry) { mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); return NULL; @@ -677,12 +676,12 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); queue_work(cache->wq, &ent->work); - if (i > MAX_UMR_CACHE_ENTRY) { + if (i > MR_CACHE_LAST_STD_ENTRY) { mlx5_odp_init_mr_cache_entry(ent); continue; } - if (!use_umr(dev, ent->order)) + if (ent->order >
mr_cache_max_order(dev)) continue; ent->page = PAGE_SHIFT; @@ -819,18 +818,13 @@ static int get_octo_len(u64 addr, u64 len, int page_size) return (npages + 1) / 2; } -static int max_umr_order(struct mlx5_ib_dev *dev) +static int mr_cache_max_order(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return MAX_UMR_CACHE_ENTRY + 2; + return MR_CACHE_LAST_STD_ENTRY + 2; return MLX5_MAX_UMR_SHIFT; } -static int use_umr(struct mlx5_ib_dev *dev, int order) -{ - return order <= max_umr_order(dev); -} - static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, int access_flags, struct ib_umem **umem, int *npages, int *page_shift, int *ncont, @@ -1149,6 +1143,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, start_addr, virt_addr); MLX5_SET64(mkc, mkc, len, length); @@ -1231,7 +1226,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (err < 0) return ERR_PTR(err); - if (use_umr(dev, order)) { + if (order <= mr_cache_max_order(dev)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { @@ -1355,7 +1350,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * UMR can't be used - MKey needs to be replaced. */ - if (mr->umred) { + if (mr->allocated_from_cache) { err = unreg_umr(dev, mr); if (err) mlx5_ib_warn(dev, "Failed to unregister MR\n"); @@ -1373,7 +1368,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (IS_ERR(mr)) return PTR_ERR(mr); - mr->umred = 0; + mr->allocated_from_cache = 0; } else { /* * Send a UMR WQE @@ -1461,7 +1456,7 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) static int clean_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); - int umred = mr->umred; + int allocated_from_cache = mr->allocated_from_cache; int err; if (mr->sig) { @@ -1479,7 +1474,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) mlx5_free_priv_descs(mr); - if (!umred) { + if (!allocated_from_cache) { err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", @@ -1490,7 +1485,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) mlx5_mr_cache_free(dev, mr); } - if (!umred) + if (!allocated_from_cache) kfree(mr); return 0; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index db40bc4055c7..99d88624ad07 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1093,7 +1093,7 @@ enum { }; enum { - MAX_UMR_CACHE_ENTRY = 20, + MR_CACHE_LAST_STD_ENTRY = 20, MLX5_IMR_MTT_CACHE_ENTRY, MLX5_IMR_KSM_CACHE_ENTRY, MAX_MR_CACHE_ENTRIES -- cgit v1.2.3 From a550ddfc543e250798048cf4eabe721cd85ac724 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Aug 2017 15:52:33 +0300 Subject: IB/mlx5: Add support for multi underlay QP Set the underlay QPN as part of the flow rule when applicable. There is one root flow table in the NIC RX namespace and all the underlay QPs steer the traffic to this flow table. In order to prevent a QP from receiving traffic that is not targeted at its underlay QP, we need to set the underlay QP number as part of the steering match. Note: When multicast traffic is sent, the QPN filtering is done by the firmware at an early stage.
Adding the QPN match on the flow table entry is wrong as by that time the target QPN holds the multicast address (e.g. FF(s)) and it won't match. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 49 +++++++++++++++++++++++++++++++++------ include/linux/mlx5/mlx5_ifc.h | 8 +++++-- 2 files changed, 48 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 07789450ec42..8f7e46090f9a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2125,7 +2125,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, * it won't fall into the multicast flow steering table and this rule * could steal other multicast packets. */ -static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) +static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) { union ib_flow_spec *flow_spec; @@ -2337,10 +2337,31 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, return err ? ERR_PTR(err) : prio; } -static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst) +static void set_underlay_qp(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + u32 underlay_qpn) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + spec->match_criteria, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (underlay_qpn && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + ft_field_support.bth_dst_qp)) { + MLX5_SET(fte_match_set_misc, + misc_params_v, bth_dst_qp, underlay_qpn); + MLX5_SET(fte_match_set_misc, + misc_params_c, bth_dst_qp, 0xffffff); + } +} + +static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst, + u32 underlay_qpn) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; @@ -2376,6 +2397,9 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, ib_flow += ((union ib_flow_spec *)ib_flow)->size; } + if (!flow_is_multicast_only(flow_attr)) + set_underlay_qp(dev, spec, underlay_qpn); + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); if (is_drop) { flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; @@ -2415,6 +2439,14 @@ free: return err ? 
ERR_PTR(err) : handler; } +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0); +} + static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, @@ -2551,6 +2583,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; int err; + int underlay_qpn; if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); @@ -2591,8 +2624,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { - handler = create_flow_rule(dev, ft_prio, flow_attr, - dst); + underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? + mqp->underlay_qpn : 0; + handler = _create_flow_rule(dev, ft_prio, flow_attr, + dst, underlay_qpn); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5bae70eb25af..6563500c85de 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -295,8 +295,10 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_dport[0x1]; u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; + u8 reserved_at_40[0x1a]; + u8 bth_dst_qp[0x1]; - u8 reserved_at_40[0x40]; + u8 reserved_at_5b[0x25]; }; struct mlx5_ifc_flow_table_prop_layout_bits { @@ -432,7 +434,9 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_100[0xc]; u8 inner_ipv6_flow_label[0x14]; - u8 reserved_at_120[0xe0]; + u8 reserved_at_120[0x28]; + u8 bth_dst_qp[0x18]; + u8 reserved_at_160[0xa0]; }; struct mlx5_ifc_cmd_pas_bits { -- cgit v1.2.3 From 795b609c8b59f8f20fa9d72bf8b4ae3b8aa5582c Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 17 Aug 2017 15:52:34 +0300 Subject: IB/mlx5: Allow posting multi packet send WQEs if hardware supports Set the field to allow posting multi packet send WQEs if hardware supports this feature. This does not by itself make send WQEs multi packet; a send WQE is treated as multi packet only when it is prepared according to the multi packet send WQE format. User space shall use the MLX5_IB_ALLOW_MPW flag to check whether hardware supports MPW and allows MPW in the SQ context.
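A minimal sketch of the intended userspace check (hypothetical snippet; resp stands for the struct mlx5_ib_query_device_resp filled in through the uhw-extended query_device path):

	if (resp.mlx5_ib_support_multi_pkt_send_wqes & MLX5_IB_ALLOW_MPW) {
		/* hardware supports MPW and allows it in the SQ context,
		 * so send WQEs may be built in the multi packet format
		 */
	}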
Signed-off-by: Bodong Wang Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +++-- drivers/infiniband/hw/mlx5/qp.c | 2 ++ include/linux/mlx5/mlx5_ifc.h | 2 +- include/uapi/rdma/mlx5-abi.h | 5 +++++ 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8f7e46090f9a..ba0a97d6f677 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -802,8 +802,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, uhw->outlen)) { - resp.mlx5_ib_support_multi_pkt_send_wqes = - MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); + if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_IB_ALLOW_MPW; resp.response_length += sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 656773196f27..d6df88a78d5e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1083,6 +1083,8 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); MLX5_SET(sqc, sqc, flush_in_error_en, 1); + if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, 1); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6563500c85de..6865e60ba473 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2445,7 +2445,7 @@ struct mlx5_ifc_sqc_bits { u8 cd_master[0x1]; u8 fre[0x1]; u8 flush_in_error_en[0x1]; - u8 reserved_at_4[0x1]; + u8 allow_multi_pkt_send_wqe[0x1]; u8 min_wqe_inline_mode[0x3]; u8 state[0x4]; u8 reg_umr[0x1]; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 64d398e662cd..a61a512e5747 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -168,6 +168,11 @@ struct mlx5_packet_pacing_caps { __u32 reserved; }; +enum mlx5_ib_mpw_caps { + MPW_RESERVED = 1 << 0, + MLX5_IB_ALLOW_MPW = 1 << 1, +}; + enum mlx5_ib_sw_parsing_offloads { MLX5_IB_SW_PARSING = 1 << 0, MLX5_IB_SW_PARSING_CSUM = 1 << 1, -- cgit v1.2.3 From 050da902adde8faf6b1bef15ac4876ae145358f4 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 17 Aug 2017 15:52:35 +0300 Subject: IB/mlx5: Report mlx5 enhanced multi packet WQE capability Expose enhanced multi packet WQE capability to user space through query_device by uhw. 
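Continuing the hypothetical userspace check from the previous patch, the same response field now also carries the enhanced capability bit:

	if (resp.mlx5_ib_support_multi_pkt_send_wqes & MLX5_IB_SUPPORT_EMPW) {
		/* device supports the enhanced multi packet send WQE format */
	}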
Signed-off-by: Bodong Wang Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +++++ include/linux/mlx5/mlx5_ifc.h | 2 +- include/uapi/rdma/mlx5-abi.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ba0a97d6f677..62e6298810e7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -805,6 +805,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) resp.mlx5_ib_support_multi_pkt_send_wqes = MLX5_IB_ALLOW_MPW; + + if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes |= + MLX5_IB_SUPPORT_EMPW; + resp.response_length += sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6865e60ba473..4eff0b8a1482 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -604,7 +604,7 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 rss_ind_tbl_cap[0x4]; u8 reg_umr_sq[0x1]; u8 scatter_fcs[0x1]; - u8 reserved_at_1a[0x1]; + u8 enhanced_multi_pkt_send_wqe[0x1]; u8 tunnel_lso_const_out_ip_id[0x1]; u8 reserved_at_1c[0x2]; u8 tunnel_statless_gre[0x1]; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index a61a512e5747..1791bf123ba9 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -171,6 +171,7 @@ struct mlx5_packet_pacing_caps { enum mlx5_ib_mpw_caps { MPW_RESERVED = 1 << 0, MLX5_IB_ALLOW_MPW = 1 << 1, + MLX5_IB_SUPPORT_EMPW = 1 << 2, }; enum mlx5_ib_sw_parsing_offloads { MLX5_IB_SW_PARSING = 1 << 0, MLX5_IB_SW_PARSING_CSUM = 1 << 1, -- cgit v1.2.3 From 22b6722bfa591ba03d6a0c5521b600d4ab2d9a27 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 23 Aug 2017 09:55:41 +0200 Subject: ipv6: Add sysctl for per namespace flow label reflection Reflecting the IPv6 Flow Label at server nodes is useful in environments that employ multipath routing to load balance requests. As the "IPv6 Flow Label Reflection" standard draft [1] points out, ICMPv6 PTB error messages generated in response to downstream packets from the server can be routed by a load balancer back to the original server without looking at transport headers, provided the server applies flow label reflection. This enables Path MTU Discovery past the ECMP router in load-balanced or anycast environments where each server node is reachable by only one path. Introduce a sysctl to enable flow label reflection per net namespace for all newly created sockets. The same could previously be achieved only per socket, by setting the IPV6_FL_F_REFLECT flag via the IPV6_FLOWLABEL_MGR socket option. [1] https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 +++++++++ include/net/netns/ipv6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 4 files changed, 19 insertions(+) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 84c9b8cee780..6b0bc0f71534 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1350,6 +1350,15 @@ flowlabel_state_ranges - BOOLEAN FALSE: disabled Default: true +flowlabel_reflect - BOOLEAN + Automatically reflect the flow label.
Needed for Path MTU + Discovery to work with Equal Cost Multipath Routing in anycast + environments. See RFC 7690 and: + https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 + TRUE: enabled + FALSE: disabled + Default: FALSE + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 0e50bf3ed097..2544f9760a42 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 { int idgen_retries; int idgen_delay; int flowlabel_state_ranges; + int flowlabel_reflect; }; struct netns_ipv6 { diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3b58ee709f33..fe5262fd6aa5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -211,6 +211,7 @@ lookup_protocol: np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; np->autoflowlabel = ip6_default_np_autolabel(net); + np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 69c50e737c54..6fbf8ae5e52c 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -90,6 +90,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "flowlabel_reflect", + .data = &init_net.ipv6.sysctl.flowlabel_reflect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -149,6 +156,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; + ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From 790c6056686cc4dd5b149b330bbd5ae208d4d721 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 24 Aug 2017 18:10:46 -0700 Subject: devlink: Fix devlink_dpipe_table_register() stub signature. One too many arguments compared to the non-stub version. Reported-by: kbuild test robot Fixes: ffd3cdccf214 ("devlink: Add support for dynamic table size") Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 047a0c54f652..aaf7178127a2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -402,8 +402,7 @@ static inline int devlink_dpipe_table_register(struct devlink *devlink, const char *table_name, struct devlink_dpipe_table_ops *table_ops, - void *priv, u64 size, - bool counter_control_extern) + void *priv, bool counter_control_extern) { return 0; } -- cgit v1.2.3 From 29825717123fb9cfb9e709327d565c2f2fa89903 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 23 Aug 2017 09:58:28 +0200 Subject: net: Extend struct flowi6 with multipath hash Allow for functions that fill out the IPv6 flow info to also pass a hash computed over the skb contents. The hash value will drive the multipath routing decisions. This is intended for special treatment of ICMPv6 errors, where we would like to make a routing decision based on the flow identifying the offending IPv6 datagram that triggered the error, rather than the flow of the ICMP error itself. 
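For context, the follow-up patch seeds the new field when an ICMPv6 error is sent, roughly:

	fl6.mp_hash = rt6_multipath_hash(&fl6, skb);	/* hash over the offending packet's flow */

so that the ECMP route lookup for the error follows the same path as the flow that triggered it.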
Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- include/net/flow.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index f3dc61b29bb5..eb60cee30b44 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -149,6 +149,7 @@ struct flowi6 { #define fl6_ipsec_spi uli.spi #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key + __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowidn { -- cgit v1.2.3 From 23aebdacb05dab9efdf22b9e0413491cbd5f128f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 23 Aug 2017 09:58:29 +0200 Subject: ipv6: Compute multipath hash for ICMP errors from offending packet When forwarding or sending out an ICMPv6 error, look at the embedded packet that triggered the error and compute a flow hash over its headers. This lets us route the ICMP error together with the flow it belongs to when multipath (ECMP) routing is in use, which in turn makes Path MTU Discovery work in ECMP load-balanced or anycast setups (RFC 7690). Granted, end-hosts behind the ECMP router (aka servers) need to reflect the IPv6 Flow Label for PMTUD to work. The code is organized to be in parallel with ipv4 stack: ip_multipath_l3_keys -> ip6_multipath_l3_keys fib_multipath_hash -> rt6_multipath_hash Signed-off-by: Jakub Sitnicki Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + net/ipv6/icmp.c | 1 + net/ipv6/route.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 907d39a42f6b..882bc3c7ccde 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -115,6 +115,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4f82830fc068..dd7608cf1d72 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -519,6 +519,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); + fl6.mp_hash = rt6_multipath_hash(&fl6, skb); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9b02064c3335..6c4dd5796a31 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1214,6 +1214,54 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); +static void ip6_multipath_l3_keys(const struct sk_buff *skb, + struct flow_keys *keys) +{ + const struct ipv6hdr *outer_iph = ipv6_hdr(skb); + const struct ipv6hdr *key_iph = outer_iph; + const struct ipv6hdr *inner_iph; + const struct icmp6hdr *icmph; + struct ipv6hdr _inner_iph; + + if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) + goto out; + + icmph = icmp6_hdr(skb); + if (icmph->icmp6_type != ICMPV6_DEST_UNREACH && + icmph->icmp6_type != ICMPV6_PKT_TOOBIG && + icmph->icmp6_type != ICMPV6_TIME_EXCEED && + icmph->icmp6_type != ICMPV6_PARAMPROB) + goto out; + + inner_iph = skb_header_pointer(skb, + skb_transport_offset(skb) + sizeof(*icmph), + sizeof(_inner_iph), &_inner_iph); + if (!inner_iph) + goto out; +
key_iph = inner_iph; +out: + memset(keys, 0, sizeof(*keys)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->addrs.v6addrs.src = key_iph->saddr; + keys->addrs.v6addrs.dst = key_iph->daddr; + keys->tags.flow_label = ip6_flowinfo(key_iph); + keys->basic.ip_proto = key_iph->nexthdr; +} + +/* if skb is set it will be used and fl6 can be NULL */ +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) +{ + struct flow_keys hash_keys; + + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys); + return flow_hash_from_keys(&hash_keys); + } + + return get_hash_from_flowi6(fl6); +} + void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -1232,6 +1280,8 @@ void ip6_route_input(struct sk_buff *skb) tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) + fl6.mp_hash = rt6_multipath_hash(&fl6, skb); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -- cgit v1.2.3 From 2412e927604e0af7e17ae4b688b85a1e87e378fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Aug 2017 11:59:49 -0400 Subject: sunrpc: Const-ify instances of struct svc_xprt_ops Close an attack vector by moving the arrays of server-side transport methods to read-only memory. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 4 ++-- net/sunrpc/svcsock.c | 6 +++--- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index ddb7f94a9d06..6a2ad38f5458 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -31,7 +31,7 @@ struct svc_xprt_ops { struct svc_xprt_class { const char *xcl_name; struct module *xcl_owner; - struct svc_xprt_ops *xcl_ops; + const struct svc_xprt_ops *xcl_ops; struct list_head xcl_list; u32 xcl_max_payload; int xcl_ident; @@ -49,7 +49,7 @@ struct svc_xpt_user { struct svc_xprt { struct svc_xprt_class *xpt_class; - struct svc_xprt_ops *xpt_ops; + const struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; struct list_head xpt_list; struct list_head xpt_ready; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2b720fa35c4f..b434f524d677 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -687,7 +687,7 @@ static struct svc_xprt *svc_udp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags); } -static struct svc_xprt_ops svc_udp_ops = { +static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, @@ -1229,7 +1229,7 @@ static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) { } -static struct svc_xprt_ops svc_tcp_bc_ops = { +static const struct svc_xprt_ops svc_tcp_bc_ops = { .xpo_create = svc_bc_tcp_create, .xpo_detach = svc_bc_tcp_sock_detach, .xpo_free = svc_bc_sock_free, @@ -1263,7 +1263,7 @@ static void svc_cleanup_bc_xprt_sock(void) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -static struct svc_xprt_ops svc_tcp_ops = { +static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e660d4965b18..2aa8473dcc97 100644 --- 
a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -70,7 +70,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt); static int svc_rdma_secure_port(struct svc_rqst *); static void svc_rdma_kill_temp_xprt(struct svc_xprt *); -static struct svc_xprt_ops svc_rdma_ops = { +static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, @@ -98,7 +98,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, static void svc_rdma_bc_detach(struct svc_xprt *); static void svc_rdma_bc_free(struct svc_xprt *); -static struct svc_xprt_ops svc_rdma_bc_ops = { +static const struct svc_xprt_ops svc_rdma_bc_ops = { .xpo_create = svc_rdma_bc_create, .xpo_detach = svc_rdma_bc_detach, .xpo_free = svc_rdma_bc_free, -- cgit v1.2.3 From afea5657c20b11ec9f895ac5cc33b560fb1e0276 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 1 Aug 2017 12:00:06 -0400 Subject: sunrpc: Const-ify struct sv_serv_ops Close an attack vector by moving the arrays of per-server methods to read-only memory. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 10 +++++----- fs/nfsd/nfssvc.c | 2 +- include/linux/sunrpc/svc.h | 6 +++--- net/sunrpc/svc.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 726b6cecf430..b995bdc13976 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -396,7 +396,7 @@ out_rqst: return error; } -static struct svc_serv_ops lockd_sv_ops = { +static const struct svc_serv_ops lockd_sv_ops = { .svo_shutdown = svc_rpcb_cleanup, .svo_enqueue_xprt = svc_xprt_do_enqueue, }; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 34323877ec13..2cddf7f437e6 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -226,26 +226,26 @@ err_bind: return ret; } -static struct svc_serv_ops nfs40_cb_sv_ops = { +static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) -static struct svc_serv_ops nfs41_cb_sv_ops = { +static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; @@ -254,8 +254,8 @@ static struct svc_serv_ops *nfs4_cb_sv_ops[] = { static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + const struct svc_serv_ops *sv_ops; struct svc_serv *serv; - struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 063ae7de2c12..7e3af3ef0917 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -475,7 +475,7 @@ static int nfsd_get_default_max_blksize(void) return ret; } -static struct svc_serv_ops nfsd_thread_sv_ops = { +static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a3f8af9bd543..38f561b2dda3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -99,7 +99,7 @@ struct svc_serv { unsigned int sv_nrpools; /* number of thread pools */ struct svc_pool * sv_pools; /* array of thread pools */ - struct svc_serv_ops *sv_ops; /* server operations */ + const struct svc_serv_ops *sv_ops; /* server operations */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) struct list_head sv_cb_list; /* queue for callback requests * that arrive over the same @@ -465,7 +465,7 @@ int svc_rpcb_setup(struct svc_serv *serv, struct net *net); void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); int svc_bind(struct svc_serv *serv, struct net *net); struct svc_serv *svc_create(struct svc_program *, unsigned int, - struct svc_serv_ops *); + const struct svc_serv_ops *); struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, @@ -475,7 +475,7 @@ void svc_exit_thread(struct svc_rqst *); unsigned int svc_pool_map_get(void); void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - struct svc_serv_ops *); + const struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 85ce0db5b0a6..aa04666f929d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -421,7 +421,7 @@ __svc_init_bc(struct svc_serv *serv) */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int vers; @@ -486,7 +486,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { return __svc_create(prog, bufsize, /*npools*/1, ops); } @@ -494,7 +494,7 @@ EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - struct svc_serv_ops *ops) + const struct svc_serv_ops *ops) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); -- cgit v1.2.3 From 551143d8d954fe398324a5caa276f518466c428b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Aug 2017 21:12:28 -0700 Subject: net_sched: fix a refcount_t issue with noop_qdisc syzkaller reported a refcount_t warning [1] Issue here is that noop_qdisc refcnt was never really considered as a true refcount, since qdisc_destroy() found TCQ_F_BUILTIN set : if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt))) return; Meaning that all atomic_inc() we did on noop_qdisc.refcnt were not really needed, but harmless until refcount_t came. To fix this problem, we simply need to not increment noop_qdisc.refcnt, since we never decrement it. 
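Reduced to its essence, the pattern behind the warning quoted in [1] is (sketch):

	refcount_t r = REFCOUNT_INIT(0);	/* like noop_qdisc.refcnt, never raised above 0 */

	refcount_inc(&r);			/* WARN: refcount_t: increment on 0 */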
[1] refcount_t: increment on 0; use-after-free. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 21754 at lib/refcount.c:152 refcount_inc+0x47/0x50 lib/refcount.c:152 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 21754 Comm: syz-executor7 Not tainted 4.13.0-rc6+ #20 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846 RIP: 0010:refcount_inc+0x47/0x50 lib/refcount.c:152 RSP: 0018:ffff8801c43477a0 EFLAGS: 00010282 RAX: 000000000000002b RBX: ffffffff86093c14 RCX: 0000000000000000 RDX: 000000000000002b RSI: ffffffff8159314e RDI: ffffed0038868ee8 RBP: ffff8801c43477a8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff86093ac0 R13: 0000000000000001 R14: ffff8801d0f3bac0 R15: dffffc0000000000 attach_default_qdiscs net/sched/sch_generic.c:792 [inline] dev_activate+0x7d3/0xaa0 net/sched/sch_generic.c:833 __dev_open+0x227/0x330 net/core/dev.c:1380 __dev_change_flags+0x695/0x990 net/core/dev.c:6726 dev_change_flags+0x88/0x140 net/core/dev.c:6792 dev_ifsioc+0x5a6/0x930 net/core/dev_ioctl.c:256 dev_ioctl+0x2bc/0xf90 net/core/dev_ioctl.c:554 sock_do_ioctl+0x94/0xb0 net/socket.c:968 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 Fixes: 7b9364050246 ("net, sched: convert Qdisc.refcnt from atomic_t to refcount_t") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Reshetova, Elena Signed-off-by: David S. Miller --- include/net/sch_generic.h | 7 +++++++ net/sched/sch_api.c | 6 +++--- net/sched/sch_generic.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 67f815e5d525..c1109cdbbfa6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -101,6 +101,13 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; }; +static inline void qdisc_refcount_inc(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN) + return; + refcount_inc(&qdisc->refcnt); +} + static inline bool qdisc_is_running(const struct Qdisc *qdisc) { return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a3fa144b8648..4fb5a3222d0d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -836,7 +836,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); if (!ingress) qdisc_destroy(old); @@ -847,7 +847,7 @@ skip: notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && !new->ops->attach) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); dev->qdisc = new ? 
: &noop_qdisc; if (new && new->ops->attach) @@ -1256,7 +1256,7 @@ replay: if (q == p || (p && check_loop(q, p, 0))) return -ELOOP; - refcount_inc(&q->refcnt); + qdisc_refcount_inc(q); goto graft; } else { if (!q) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 57ba406f1437..4ba6da5fb254 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -785,7 +785,7 @@ static void attach_default_qdiscs(struct net_device *dev) dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; - refcount_inc(&dev->qdisc->refcnt); + qdisc_refcount_inc(dev->qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { -- cgit v1.2.3 From 3fd87127073292538047adf1c9c757e9cab0dd56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 24 Aug 2017 14:38:51 -0700 Subject: strparser: initialize all callbacks commit bbb03029a899 ("strparser: Generalize strparser") added more function pointers to 'struct strp_callbacks'; however, kcm_attach() was not updated to initialize them. This could cause the ->lock() and/or ->unlock() function pointers to be set to garbage values, causing a crash in strp_work(). Fix the bug by moving the callback structs into static memory, so unspecified members are zeroed. Also constify them while we're at it. This bug was found by syzkaller, which encountered the following splat: IP: 0x55 PGD 3b1ca067 P4D 3b1ca067 PUD 3b12f067 PMD 0 Oops: 0010 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 2 PID: 1194 Comm: kworker/u8:1 Not tainted 4.13.0-rc4-next-20170811 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: kstrp strp_work task: ffff88006bb0e480 task.stack: ffff88006bb10000 RIP: 0010:0x55 RSP: 0018:ffff88006bb17540 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff88006ce4bd60 RCX: 0000000000000000 RDX: 1ffff1000d9c97bd RSI: 0000000000000000 RDI: ffff88006ce4bc48 RBP: ffff88006bb17558 R08: ffffffff81467ab2 R09: 0000000000000000 R10: ffff88006bb17438 R11: ffff88006bb17940 R12: ffff88006ce4bc48 R13: ffff88003c683018 R14: ffff88006bb17980 R15: ffff88003c683000 FS: 0000000000000000(0000) GS:ffff88006de00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000055 CR3: 000000003c145000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2098 worker_thread+0x223/0x1860 kernel/workqueue.c:2233 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 Code: Bad RIP value. 
RIP: 0x55 RSP: ffff88006bb17540 CR2: 0000000000000055 ---[ end trace f0e4920047069cee ]--- Here is a C reproducer (requires CONFIG_BPF_SYSCALL=y and CONFIG_AF_KCM=y): #include <linux/bpf.h> #include <linux/kcm.h> #include <linux/types.h> #include <stdint.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <sys/syscall.h> #include <unistd.h> static const struct bpf_insn bpf_insns[3] = { { .code = 0xb7 }, /* BPF_MOV64_IMM(0, 0) */ { .code = 0x95 }, /* BPF_EXIT_INSN() */ }; static const union bpf_attr bpf_attr = { .prog_type = 1, .insn_cnt = 2, .insns = (uintptr_t)&bpf_insns, .license = (uintptr_t)"", }; int main(void) { int bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &bpf_attr, sizeof(bpf_attr)); int inet_fd = socket(AF_INET, SOCK_STREAM, 0); int kcm_fd = socket(AF_KCM, SOCK_DGRAM, 0); ioctl(kcm_fd, SIOCKCMATTACH, &(struct kcm_attach) { .fd = inet_fd, .bpf_fd = bpf_fd }); } Fixes: bbb03029a899 ("strparser: Generalize strparser") Cc: Dmitry Vyukov Cc: Tom Herbert Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- Documentation/networking/strparser.txt | 2 +- include/net/strparser.h | 2 +- kernel/bpf/sockmap.c | 10 +++++----- net/kcm/kcmsock.c | 11 +++++------ net/strparser/strparser.c | 2 +- 5 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/networking/strparser.txt b/Documentation/networking/strparser.txt index fe01302471ae..13081b3decef 100644 --- a/Documentation/networking/strparser.txt +++ b/Documentation/networking/strparser.txt @@ -35,7 +35,7 @@ Functions ========= strp_init(struct strparser *strp, struct sock *sk, - struct strp_callbacks *cb) + const struct strp_callbacks *cb) Called to initialize a stream parser. strp is a struct of type strparser that is allocated by the upper layer. sk is the TCP diff --git a/include/net/strparser.h b/include/net/strparser.h index 4fe966a0ad92..7dc131d62ad5 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -138,7 +138,7 @@ void strp_done(struct strparser *strp); void strp_stop(struct strparser *strp); void strp_check_rcv(struct strparser *strp); int strp_init(struct strparser *strp, struct sock *sk, - struct strp_callbacks *cb); + const struct strp_callbacks *cb); void strp_data_ready(struct strparser *strp); int strp_process(struct strparser *strp, struct sk_buff *orig_skb, unsigned int orig_offset, size_t orig_len, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 78b2bb9370ac..617c239590c2 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -368,12 +368,12 @@ static int smap_read_sock_done(struct strparser *strp, int err) static int smap_init_sock(struct smap_psock *psock, struct sock *sk) { - struct strp_callbacks cb; + static const struct strp_callbacks cb = { + .rcv_msg = smap_read_sock_strparser, + .parse_msg = smap_parse_func_strparser, + .read_sock_done = smap_read_sock_done, + }; - memset(&cb, 0, sizeof(cb)); - cb.rcv_msg = smap_read_sock_strparser; - cb.parse_msg = smap_parse_func_strparser; - cb.read_sock_done = smap_read_sock_done; return strp_init(&psock->strp, sk, &cb); } diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 88ce73288247..48e993b2dbcf 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1376,7 +1376,11 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; - struct strp_callbacks cb; + static const struct strp_callbacks cb = { + .rcv_msg = kcm_rcv_strparser, + .parse_msg = kcm_parse_func_strparser, + .read_sock_done = kcm_read_sock_done, + }; int err; csk = csock->sk; @@ -1391,11 +1395,6 @@ static int
kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - cb.rcv_msg = kcm_rcv_strparser; - cb.abort_parser = NULL; - cb.parse_msg = kcm_parse_func_strparser; - cb.read_sock_done = kcm_read_sock_done; - err = strp_init(&psock->strp, csk, &cb); if (err) { kmem_cache_free(kcm_psockp, psock); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 434aa6637a52..d4ea46a5f233 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -472,7 +472,7 @@ static void strp_sock_unlock(struct strparser *strp) } int strp_init(struct strparser *strp, struct sock *sk, - struct strp_callbacks *cb) + const struct strp_callbacks *cb) { if (!cb || !cb->rcv_msg || !cb->parse_msg) -- cgit v1.2.3 From 6ae5fa61d27dcb055f4198bcf6c8dbbf1bb33f52 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 16 Aug 2017 15:21:54 -0700 Subject: perf/x86: Fix data source decoding for Skylake Skylake changed the encoding of the PEBS data source field. Some combinations are not available anymore, but some new cases e.g. for L4 cache hit are added. Fix up the conversion table for Skylake, similar as had been done for Nehalem. On Skylake server the encoding for L4 actually means persistent memory. Handle this case too. To properly describe it in the abstracted perf format I had to add some new fields. Since a hit can have only one level add a new field that is an enumeration, not a bit field to describe the level. It can describe any level. Some numbers are also used to describe PMEM and LFB. Also add a new generic remote flag that can be combined with the generic level to signify a remote cache. And there is an extension field for the snoop indication to handle the Forward state. I didn't add a generic flag for hops because it's not needed for Skylake. I changed the existing encodings for older CPUs to also fill in the new level and remote fields. 
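A consumer-side sketch of decoding the new fields (data_src_val is a hypothetical sampled value; the field and constant names come from the uapi hunk below):

	union perf_mem_data_src dsrc = { .val = data_src_val };

	if (dsrc.mem_lvl_num == PERF_MEM_LVLNUM_PMEM)
		; /* load was satisfied from persistent memory */
	if (dsrc.mem_remote == PERF_MEM_REMOTE_REMOTE)
		; /* data came from a remote node or cache */
	if (dsrc.mem_snoopx & PERF_MEM_SNOOPX_FWD)
		; /* snoop resolved via the Forward state */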
Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/20170816222156.19953-3-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 ++ arch/x86/events/intel/ds.c | 51 ++++++++++++++++++++++++++--------------- arch/x86/events/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 30 ++++++++++++++++++++++-- 4 files changed, 64 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c3439a36dcf9..6f342001ec6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4208,6 +4208,8 @@ __init int intel_pmu_init(void) skl_format_attr); WARN_ON(!x86_pmu.format_attrs); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); break; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 3ccdf8cb4495..98e36e0c791c 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -49,34 +49,47 @@ union intel_x86_pebs_dse { */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 
miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2e9636e4068f..0f7dad8bd358 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -948,6 +948,8 @@ void intel_pmu_lbr_init_knl(void); void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 642db5fa3286..2a37ae925d85 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -954,14 +954,20 @@ union perf_mem_data_src { mem_snoop:5, /* snoop mode */ mem_lock:2, /* lock instr */ mem_dtlb:7, /* tlb access */ - mem_rsvd:31; + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_rsvd:24; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:31, + __u64 mem_rsvd:24, + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ mem_dtlb:7, /* tlb access */ mem_lock:2, /* lock instr */ mem_snoop:5, /* snoop mode */ @@ -998,6 +1004,22 @@ union perf_mem_data_src { #define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ #define PERF_MEM_LVL_SHIFT 5 +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0xa available */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + /* snoop mode */ #define PERF_MEM_SNOOP_NA 0x01 /* not available */ #define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ @@ -1006,6 +1028,10 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ #define PERF_MEM_SNOOP_SHIFT 19 +#define 
PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +/* 1 free */ +#define PERF_MEM_SNOOPX_SHIFT 37 + /* locked instruction */ #define PERF_MEM_LOCK_NA 0x01 /* not available */ #define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ -- cgit v1.2.3 From 0e709703af5bbc9ea6e75e1f99c2dd0dae261869 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Aug 2017 16:04:50 +0200 Subject: mm, locking/barriers: Clarify tlb_flush_pending() barriers Better document the ordering around tlb_flush_pending(). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 78 ++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dc1edec05a3f..57378c7cb5f8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -526,30 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, extern void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); -/* - * Memory barriers to keep this state in sync are graciously provided by - * the page table locks, outside of which no page table modifications happen. - * The barriers are used to ensure the order between tlb_flush_pending updates, - * which happen while the lock is not taken, and the PTE updates, which happen - * while the lock is taken, are serialized. - */ -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * Must be called with PTL held; such that our PTL acquire will have - * observed the store from set_tlb_flush_pending(). - */ - return atomic_read(&mm->tlb_flush_pending) > 0; -} - -/* - * Returns true if there are two above TLB batching threads in parallel. - */ -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); @@ -558,7 +534,6 @@ static inline void init_tlb_flush_pending(struct mm_struct *mm) static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); - /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which @@ -580,24 +555,61 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm) * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * - * So the =true store is constrained by the PTL unlock, and the =false - * store is constrained by the TLB invalidate. + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. */ } -/* Clearing is done after a TLB flush, which also provides a barrier. 
*/ static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* - * Guarantee that the tlb_flush_pending does not not leak into the - * critical section, since we must order the PTE change and changes to - * the pending TLB flush indication. We could have relied on TLB flush - * as a memory barrier, but this behavior is not clearly documented. + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. */ - smp_mb__before_atomic(); atomic_dec(&mm->tlb_flush_pending); } +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTL's release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. + */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. + * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + struct vm_fault; struct vm_special_mapping { -- cgit v1.2.3 From e6f3faa734a00c606b7b06c6b9f15e5627d3245b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Aug 2017 13:23:30 +0200 Subject: locking/lockdep: Fix workqueue crossrelease annotation The new completion/crossrelease annotations interact unfavourably with the extant flush_work()/flush_workqueue() annotations. The problem is that when a single work class does: wait_for_completion(&C) and complete(&C) in different executions, we'll build dependencies like: lock_map_acquire(W) complete_acquire(C) and lock_map_acquire(W) complete_release(C) which results in the dependency chain: W->C->W, which lockdep thinks spells deadlock, even though there is no deadlock potential since works are run concurrently. One possibility would be to change the work 'lock' to recursive-read, but that would mean hitting a lockdep limitation on recursive locks. Also, unconditionally switching to recursive-read here would fail to detect the actual deadlock on single-threaded workqueues, which do have a problem with this. For now, forcefully disregard these locks for crossrelease.
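As a minimal illustrative sketch (not part of this patch; my_work_fn() and i_am_the_waiter() are made-up names, and linux/workqueue.h plus linux/completion.h are assumed), the pattern that trips crossrelease is a single work class whose executions sit on both sides of the completion:

	static DECLARE_COMPLETION(done);

	static void my_work_fn(struct work_struct *work)
	{
		if (i_am_the_waiter(work))		/* one execution waits */
			wait_for_completion(&done);	/* A(W) ... WFC(C) */
		else					/* another completes */
			complete(&done);		/* A(W) ... C(C) */
	}

Both executions run with the work's lockdep map W acquired, so lockdep links W and C in both directions and reports the W->C->W chain above, although a multi-threaded workqueue runs the executions concurrently and cannot deadlock on this.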
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: byungchul.park@lge.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: oleg@redhat.com Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 4 ++-- include/linux/lockdep.h | 10 +++++---- kernel/locking/lockdep.c | 56 +++++++++++++++++++++++++++++++----------------- kernel/workqueue.c | 23 +++++++++++++++++++- 4 files changed, 66 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5fdd93bb9300..9bc050bc81b2 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -26,7 +26,7 @@ # define trace_hardirq_enter() \ do { \ current->hardirq_context++; \ - crossrelease_hist_start(XHLOCK_HARD); \ + crossrelease_hist_start(XHLOCK_HARD, 0);\ } while (0) # define trace_hardirq_exit() \ do { \ @@ -36,7 +36,7 @@ do { \ # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ - crossrelease_hist_start(XHLOCK_SOFT); \ + crossrelease_hist_start(XHLOCK_SOFT, 0);\ } while (0) # define lockdep_softirq_exit() \ do { \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fc827cab6d6e..78bb7133abed 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -18,6 +18,8 @@ extern int lock_stat; #define MAX_LOCKDEP_SUBCLASSES 8UL +#include + #ifdef CONFIG_LOCKDEP #include @@ -578,11 +580,11 @@ extern void lock_commit_crosslock(struct lockdep_map *lock); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), .cross = 0, } -extern void crossrelease_hist_start(enum xhlock_context_t c); +extern void crossrelease_hist_start(enum xhlock_context_t c, bool force); extern void crossrelease_hist_end(enum xhlock_context_t c); extern void lockdep_init_task(struct task_struct *task); extern void lockdep_free_task(struct task_struct *task); -#else +#else /* !CROSSRELEASE */ #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. @@ -591,11 +593,11 @@ extern void lockdep_free_task(struct task_struct *task); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } -static inline void crossrelease_hist_start(enum xhlock_context_t c) {} +static inline void crossrelease_hist_start(enum xhlock_context_t c, bool force) {} static inline void crossrelease_hist_end(enum xhlock_context_t c) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} -#endif +#endif /* CROSSRELEASE */ #ifdef CONFIG_LOCK_STAT diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 66011c9f5df3..f73ca595b81e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4629,7 +4629,7 @@ asmlinkage __visible void lockdep_sys_exit(void) * the index to point to the last entry, which is already invalid. 
*/ crossrelease_hist_end(XHLOCK_PROC); - crossrelease_hist_start(XHLOCK_PROC); + crossrelease_hist_start(XHLOCK_PROC, false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4725,25 +4725,25 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) /* * Lock history stacks; we have 3 nested lock history stacks: * - * Hard IRQ - * Soft IRQ - * History / Task + * HARD(IRQ) + * SOFT(IRQ) + * PROC(ess) * - * The thing is that once we complete a (Hard/Soft) IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. + * The thing is that once we complete a HARD/SOFT IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. So + * what we do is rewind the history buffer and erase all our knowledge of that + * temporal event. * - * So what we do is rewind the history buffer and erase all our knowledge of - * that temporal event. - */ - -/* - * We need this to annotate lock history boundaries. Take for instance - * workqueues; each work is independent of the last. The completion of a future - * work does not depend on the completion of a past work (in general). - * Therefore we must not carry that (lock) dependency across works. + * The PROCess one is special though; it is used to annotate independence + * inside a task. + * + * Take for instance workqueues; each work is independent of the last. The + * completion of a future work does not depend on the completion of a past work + * (in general). Therefore we must not carry that (lock) dependency across + * works. * * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an 'idle' state and future completions do not + * pattern, where they have an invariant state and future completions do not * depend on past completions. Its just that since they all have the 'same' * form -- the kthread does the same over and over -- it doesn't typically * matter. @@ -4751,15 +4751,31 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) * The same is true for system-calls, once a system call is completed (we've * returned to userspace) the next system call does not depend on the lock * history of the previous system call. + * + * The key property for independence, this invariant state, is that it must be + * a point where we hold no locks and have no history. Because if we were to + * hold locks, the restore at _end() would not necessarily recover its history + * entry. Similarly, independence per-definition means it does not depend on + * prior state. */ -void crossrelease_hist_start(enum xhlock_context_t c) +void crossrelease_hist_start(enum xhlock_context_t c, bool force) { struct task_struct *cur = current; - if (cur->xhlocks) { - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; + if (!cur->xhlocks) + return; + + /* + * We call this at an invariant point, no current state, no history.
+ */ + if (c == XHLOCK_PROC) { + /* verified the former, ensure the latter */ + WARN_ON_ONCE(!force && cur->lockdep_depth); + invalidate_xhlock(&xhlock(cur->xhlock_idx)); } + + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; } void crossrelease_hist_end(enum xhlock_context_t c) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ad214dc15a9..c0331891dec1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2093,7 +2093,28 @@ __acquires(&pool->lock) lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); - crossrelease_hist_start(XHLOCK_PROC); + /* + * Strictly speaking we should do start(PROC) without holding any + * locks, that is, before these two lock_map_acquire()'s. + * + * However, that would result in: + * + * A(W1) + * WFC(C) + * A(W1) + * C(C) + * + * Which would create W1->C->W1 dependencies, even though there is no + * actual deadlock possible. There are two solutions, using a + * read-recursive acquire on the work(queue) 'locks', but this will then + * hit the lockdep limitation on recursive locks, or simply discard + * these locks. + * + * AFAICT there is no possible deadlock scenario between the + * flush_work() and complete() primitives (except for single-threaded + * workqueues), so hiding them isn't a problem. + */ + crossrelease_hist_start(XHLOCK_PROC, true); trace_workqueue_execute_start(work); worker->current_func(work); /* -- cgit v1.2.3 From 3d52a7924172adf651eda8f1c95ff38146059307 Mon Sep 17 00:00:00 2001 From: Shreyas NC Date: Wed, 23 Aug 2017 19:33:49 +0530 Subject: ASoC: Intel: uapi: Add new tokens for module common data The module private data can be modelled independently of its instances so that it can be reused by the module instances. So move module data to the common manifest, which can be referenced by the module instances. This requires new tokens to be defined to accommodate these changes. The new tokens will specify buffer sizes, DSP cycles and respective indexes corresponding to the pcm params in the topology manifest so that the driver need not compute them. Signed-off-by: Shreyas NC Signed-off-by: Guneshwor Singh Acked-By: Vinod Koul Signed-off-by: Mark Brown --- include/uapi/sound/snd_sst_tokens.h | 92 ++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/snd_sst_tokens.h b/include/uapi/sound/snd_sst_tokens.h index dedb2056160d..f691e421f5e8 100644 --- a/include/uapi/sound/snd_sst_tokens.h +++ b/include/uapi/sound/snd_sst_tokens.h @@ -163,8 +163,71 @@ * * %SKL_TKN_U32_DMA_BUF_SIZE: DMA buffer size in millisec * + * %SKL_TKN_U32_PIPE_DIR: Specifies pipe direction. Can be + * playback/capture. + * + * %SKL_TKN_U32_NUM_CONFIGS: Number of pipe configs + * + * %SKL_TKN_U32_PATH_MEM_PGS: Size of memory (in pages) required for pipeline + * and its data + * + * %SKL_TKN_U32_PIPE_CONFIG_ID: Config id for the modules in the pipe + * and PCM params supported by that pipe + * config. This is used as an index to fill + * up the pipe config and module config + * structure. + * + * %SKL_TKN_U32_CFG_FREQ: + * %SKL_TKN_U8_CFG_CHAN: + * %SKL_TKN_U8_CFG_BPS: PCM params (freq, channels, bits per sample) + * supported for each of the pipe configs.
+ * + * %SKL_TKN_CFG_MOD_RES_ID: Module's resource index for each of the + * pipe config + * + * %SKL_TKN_CFG_MOD_FMT_ID: Module's interface index for each of the + * pipe config + * + * %SKL_TKN_U8_NUM_MOD: Number of modules in the manifest + * + * %SKL_TKN_MM_U8_MOD_IDX: Current index of the module in the manifest + * + * %SKL_TKN_MM_U8_NUM_RES: Number of resources for the module + * + * %SKL_TKN_MM_U8_NUM_INTF: Number of interfaces for the module + * + * %SKL_TKN_MM_U32_RES_ID: Resource index for the resource info to + * be filled into. + * A module can support multiple resource + * configurations and is represented as a + * resource table. This index is used to + * fill information into the appropriate index. + * + * %SKL_TKN_MM_U32_CPS: DSP cycles per second + * + * %SKL_TKN_MM_U32_DMA_SIZE: Allocated buffer size for gateway DMA + * + * %SKL_TKN_MM_U32_CPC: DSP cycles allocated per frame + * + * %SKL_TKN_MM_U32_RES_PIN_ID: Resource pin index in the module + * + * %SKL_TKN_MM_U32_INTF_PIN_ID: Interface index in the module + * + * %SKL_TKN_MM_U32_PIN_BUF: Buffer size of the module pin + * + * %SKL_TKN_MM_U32_FMT_ID: Format index for each of the interface/ + * format information to be filled into. + * + * %SKL_TKN_MM_U32_NUM_IN_FMT: Number of input formats + * %SKL_TKN_MM_U32_NUM_OUT_FMT: Number of output formats + * * module_id and loadable flags don't have tokens as these values will be * read from the DSP FW manifest + * + * Tokens defined can be used either in the manifest or widget private data. + * + * SKL_TKN_MM is used as a suffix for all tokens that represent + * module data in the manifest. */ enum SKL_TKNS { SKL_TKN_UUID = 1, @@ -218,7 +281,34 @@ enum SKL_TKNS { SKL_TKL_U32_D0I3_CAPS, /* Typo added at v4.10 */ SKL_TKN_U32_D0I3_CAPS = SKL_TKL_U32_D0I3_CAPS, SKL_TKN_U32_DMA_BUF_SIZE, - SKL_TKN_MAX = SKL_TKN_U32_DMA_BUF_SIZE, + + SKL_TKN_U32_PIPE_DIRECTION, + SKL_TKN_U32_PIPE_CONFIG_ID, + SKL_TKN_U32_NUM_CONFIGS, + SKL_TKN_U32_PATH_MEM_PGS, + + SKL_TKN_U32_CFG_FREQ, + SKL_TKN_U8_CFG_CHAN, + SKL_TKN_U8_CFG_BPS, + SKL_TKN_CFG_MOD_RES_ID, + SKL_TKN_CFG_MOD_FMT_ID, + SKL_TKN_U8_NUM_MOD, + + SKL_TKN_MM_U8_MOD_IDX, + SKL_TKN_MM_U8_NUM_RES, + SKL_TKN_MM_U8_NUM_INTF, + SKL_TKN_MM_U32_RES_ID, + SKL_TKN_MM_U32_CPS, + SKL_TKN_MM_U32_DMA_SIZE, + SKL_TKN_MM_U32_CPC, + SKL_TKN_MM_U32_RES_PIN_ID, + SKL_TKN_MM_U32_INTF_PIN_ID, + SKL_TKN_MM_U32_PIN_BUF, + SKL_TKN_MM_U32_FMT_ID, + SKL_TKN_MM_U32_NUM_IN_FMT, + SKL_TKN_MM_U32_NUM_OUT_FMT, + + SKL_TKN_MAX = SKL_TKN_MM_U32_NUM_OUT_FMT, }; #endif -- cgit v1.2.3 From b8972bf0521ca7ee3c8d27da0fd101fe648acfc2 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 25 Aug 2017 00:29:01 +0000 Subject: ASoC: remove duplicate definition of controls/num_controls snd_soc_component and snd_soc_component_driver both have controls/num_controls, but these are duplicated. Let's remove the duplicated definition.
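For illustration only (a sketch of the net effect, mirroring the soc-core.c hunk below rather than adding anything new): the probe path now reads the table straight from the driver instead of a component-local copy,

	if (component->driver->controls)
		snd_soc_add_component_controls(component,
					       component->driver->controls,
					       component->driver->num_controls);

leaving a single source of truth and no initialization-time copy to keep in sync.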
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 2 -- sound/soc/soc-core.c | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index de57a8ad057b..e4b4746032ba 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -862,8 +862,6 @@ struct snd_soc_component { /* Don't use these, use snd_soc_component_get_dapm() */ struct snd_soc_dapm_context dapm; - const struct snd_kcontrol_new *controls; - unsigned int num_controls; const struct snd_soc_dapm_widget *dapm_widgets; unsigned int num_dapm_widgets; const struct snd_soc_dapm_route *dapm_routes; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 3bb8c63564cf..1317706f8e69 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1495,9 +1495,10 @@ static int soc_probe_component(struct snd_soc_card *card, } } - if (component->controls) - snd_soc_add_component_controls(component, component->controls, - component->num_controls); + if (component->driver->controls) + snd_soc_add_component_controls(component, + component->driver->controls, + component->driver->num_controls); if (component->dapm_routes) snd_soc_dapm_add_routes(dapm, component->dapm_routes, component->num_dapm_routes); @@ -3184,8 +3185,6 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, if (driver->stream_event) dapm->stream_event = snd_soc_component_stream_event; - component->controls = driver->controls; - component->num_controls = driver->num_controls; component->dapm_widgets = driver->dapm_widgets; component->num_dapm_widgets = driver->num_dapm_widgets; component->dapm_routes = driver->dapm_routes; -- cgit v1.2.3 From 688d0ebf49a97aa6959aebec5d177a8b821f27d1 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 25 Aug 2017 00:29:20 +0000 Subject: ASoC: remove duplicate definition of dapm_widgets/num_dapm_widgets snd_soc_component and snd_soc_component_driver both have dapm_widgets/num_dapm_widgets, but these are duplicated. Let's remove the duplicated definition.
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 2 -- sound/soc/soc-core.c | 9 ++++----- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index e4b4746032ba..3e0070ed6883 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -862,8 +862,6 @@ struct snd_soc_component { /* Don't use these, use snd_soc_component_get_dapm() */ struct snd_soc_dapm_context dapm; - const struct snd_soc_dapm_widget *dapm_widgets; - unsigned int num_dapm_widgets; const struct snd_soc_dapm_route *dapm_routes; unsigned int num_dapm_routes; struct snd_soc_codec *codec; diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 1317706f8e69..3a3236dc996d 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1451,9 +1451,10 @@ static int soc_probe_component(struct snd_soc_card *card, soc_init_component_debugfs(component); - if (component->dapm_widgets) { - ret = snd_soc_dapm_new_controls(dapm, component->dapm_widgets, - component->num_dapm_widgets); + if (component->driver->dapm_widgets) { + ret = snd_soc_dapm_new_controls(dapm, + component->driver->dapm_widgets, + component->driver->num_dapm_widgets); if (ret != 0) { dev_err(component->dev, @@ -3185,8 +3186,6 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, if (driver->stream_event) dapm->stream_event = snd_soc_component_stream_event; - component->dapm_widgets = driver->dapm_widgets; - component->num_dapm_widgets = driver->num_dapm_widgets; component->dapm_routes = driver->dapm_routes; component->num_dapm_routes = driver->num_dapm_routes; -- cgit v1.2.3 From 6969b2bae6dbe7f277a7c018b66d2f3ad662129f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 25 Aug 2017 00:29:41 +0000 Subject: ASoC: remove duplicate definition of dapm_routes/num_dapm_routes snd_soc_component and snd_soc_component_driver both have dapm_routes/num_dapm_routes, but these are duplicated. Let's remove the duplicated definition.
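Together with the two preceding patches this removes all three mirrored tables; a hedged usage sketch follows (the drv local is shorthand here, not code from the patch; soc-core.c spells the accesses out):

	const struct snd_soc_component_driver *drv = component->driver;

	/* all three tables now come from the driver, never from copies */
	snd_soc_add_component_controls(component, drv->controls,
				       drv->num_controls);
	snd_soc_dapm_new_controls(dapm, drv->dapm_widgets,
				  drv->num_dapm_widgets);
	snd_soc_dapm_add_routes(dapm, drv->dapm_routes,
				drv->num_dapm_routes);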
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 2 -- sound/soc/soc-core.c | 10 ++++------ 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 3e0070ed6883..2bcb38045dc1 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -862,8 +862,6 @@ struct snd_soc_component { /* Don't use these, use snd_soc_component_get_dapm() */ struct snd_soc_dapm_context dapm; - const struct snd_soc_dapm_route *dapm_routes; - unsigned int num_dapm_routes; struct snd_soc_codec *codec; int (*probe)(struct snd_soc_component *); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 3a3236dc996d..b8f96a41b594 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1500,9 +1500,10 @@ static int soc_probe_component(struct snd_soc_card *card, snd_soc_add_component_controls(component, component->driver->controls, component->driver->num_controls); - if (component->dapm_routes) - snd_soc_dapm_add_routes(dapm, component->dapm_routes, - component->num_dapm_routes); + if (component->driver->dapm_routes) + snd_soc_dapm_add_routes(dapm, + component->driver->dapm_routes, + component->driver->num_dapm_routes); list_add(&dapm->list, &card->dapm_list); list_add(&component->card_list, &card->component_dev_list); @@ -3186,9 +3187,6 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, if (driver->stream_event) dapm->stream_event = snd_soc_component_stream_event; - component->dapm_routes = driver->dapm_routes; - component->num_dapm_routes = driver->num_dapm_routes; - INIT_LIST_HEAD(&component->dai_list); mutex_init(&component->io_mutex); -- cgit v1.2.3 From 5237e95f63761477b7ea45499d08b89383a77eab Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Jul 2017 18:29:27 +0100 Subject: dma-mapping: reduce dma_mapping_error inline bloat Thanks to the nested inlining, all drivers correctly calling dma_mapping_error() after mapping a page or single buffer generate two calls to get_arch_dma_ops() per callsite, which all adds up to a fair old chunk of useless code, e.g. ~3KB for an arm64 defconfig plus extras: text data bss dec hex filename 13051391 1503898 327768 14883057 e318f1 vmlinux.o.old 13050751 1503898 327768 14882417 e31671 vmlinux.o.new Give the compiler a hand by making it clear we want the same ops. Signed-off-by: Robin Murphy Reviewed-by: Marek Szyprowski Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 03c0196a6f24..66d8ea68f40b 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -565,10 +565,11 @@ static inline void dma_free_noncoherent(struct device *dev, size_t size, static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { - debug_dma_mapping_error(dev, dma_addr); + const struct dma_map_ops *ops = get_dma_ops(dev); - if (get_dma_ops(dev)->mapping_error) - return get_dma_ops(dev)->mapping_error(dev, dma_addr); + debug_dma_mapping_error(dev, dma_addr); + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); return 0; } -- cgit v1.2.3 From 30d6e0a4190d37740e9447e4e4815f06992dd8c3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 24 Aug 2017 09:31:05 +0200 Subject: futex: Remove duplicated code and fix undefined behaviour There is code duplicated over all architectures' headers for futex_atomic_op_inuser.
Namely op decoding, the access_ok check for uaddr, and comparison of the result. Remove this duplication and leave up to the arches only the needed assembly, which is now in arch_futex_atomic_op_inuser. This effectively distributes Will Deacon's arm64 fix for undefined behaviour reported by UBSAN to all architectures. The fix was done in commit 5f16a046f8e1 (arm64: futex: Fix undefined behaviour with FUTEX_OP_OPARG_SHIFT usage). Look there for an example dump. And as suggested by Thomas, check for negative oparg too, because it was also reported to cause an undefined behaviour report. Note that s390 removed the access_ok check in d12a29703 ("s390/uaccess: remove pointless access_ok() checks") as access_ok there returns true. We introduce it back to the helper for the sake of simplicity (it gets optimized away anyway). Signed-off-by: Jiri Slaby Signed-off-by: Thomas Gleixner Acked-by: Russell King Acked-by: Michael Ellerman (powerpc) Acked-by: Heiko Carstens [s390] Acked-by: Chris Metcalf [for tile] Reviewed-by: Darren Hart (VMware) Reviewed-by: Will Deacon [core/arm64] Cc: linux-mips@linux-mips.org Cc: Rich Felker Cc: linux-ia64@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: peterz@infradead.org Cc: Benjamin Herrenschmidt Cc: Max Filippov Cc: Paul Mackerras Cc: sparclinux@vger.kernel.org Cc: Jonas Bonn Cc: linux-s390@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: Yoshinori Sato Cc: linux-hexagon@vger.kernel.org Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Catalin Marinas Cc: Matt Turner Cc: linux-snps-arc@lists.infradead.org Cc: Fenghua Yu Cc: Arnd Bergmann Cc: linux-xtensa@linux-xtensa.org Cc: Stefan Kristiansson Cc: openrisc@lists.librecores.org Cc: Ivan Kokshaysky Cc: Stafford Horne Cc: linux-arm-kernel@lists.infradead.org Cc: Richard Henderson Cc: Chris Zankel Cc: Michal Simek Cc: Tony Luck Cc: linux-parisc@vger.kernel.org Cc: Vineet Gupta Cc: Ralf Baechle Cc: Richard Kuo Cc: linux-alpha@vger.kernel.org Cc: Martin Schwidefsky Cc: linuxppc-dev@lists.ozlabs.org Cc: "David S.
Miller" Link: http://lkml.kernel.org/r/20170824073105.3901-1-jslaby@suse.cz --- arch/alpha/include/asm/futex.h | 26 ++++--------------- arch/arc/include/asm/futex.h | 40 ++++------------------------- arch/arm/include/asm/futex.h | 26 +++---------------- arch/arm64/include/asm/futex.h | 26 +++---------------- arch/frv/include/asm/futex.h | 3 ++- arch/frv/kernel/futex.c | 27 +++----------------- arch/hexagon/include/asm/futex.h | 38 +++------------------------- arch/ia64/include/asm/futex.h | 25 +++---------------- arch/microblaze/include/asm/futex.h | 38 +++------------------------- arch/mips/include/asm/futex.h | 25 +++---------------- arch/openrisc/include/asm/futex.h | 39 +++-------------------------- arch/parisc/include/asm/futex.h | 26 +++---------------- arch/powerpc/include/asm/futex.h | 26 ++++--------------- arch/s390/include/asm/futex.h | 23 ++++------------- arch/sh/include/asm/futex.h | 26 +++---------------- arch/sparc/include/asm/futex_64.h | 26 ++++--------------- arch/tile/include/asm/futex.h | 40 ++++------------------------- arch/x86/include/asm/futex.h | 40 ++++------------------------- arch/xtensa/include/asm/futex.h | 27 ++++---------------- include/asm-generic/futex.h | 50 +++++++------------------------------ kernel/futex.c | 39 +++++++++++++++++++++++++++++ 21 files changed, 130 insertions(+), 506 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index fb01dfb760c2..05a70edd57b6 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 11e1b1f3acda..eb887dd13e74 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -73,20 +73,11 @@ #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - #ifndef CONFIG_ARC_HAS_LLSC preempt_disable(); /* to guarantee atomic r-m-w of futex op */ #endif @@ -118,30 +109,9 @@ static inline int 
futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 6795368ad023..cc414382dab4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #endif /* !SMP */ static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - #ifndef CONFIG_SMP preempt_disable(); #endif @@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f32b42e8725d..5bb2fd4674e7 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,20 +48,10 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (int)(encoded_op << 8) >> 20; - int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1U << (oparg & 0x1f); - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 2e1da71e27a4..ab346f5f8820 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,7 +7,8 @@ #include #include -extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); +extern 
int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr); static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index d155ca9e5098..37f7b2bf7f73 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; break; - } - } + if (!ret) + *oval = oldval; return ret; -} /* end futex_atomic_op_inuser() */ +} /* end arch_futex_atomic_op_inuser() */ diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 7e597f8434da..c607b77c8215 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -31,18 +31,9 @@ static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; pagefault_disable(); @@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index 76acbcd5c060..6d67dc1eaf2b 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -45,18 +45,9 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index 01848f056f43..a9dad9e5e132 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,18 +29,9 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 1de190bdfb9c..a9e61ea54ca9 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -83,18 +83,9 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h index 778087341977..8fed278a24b8 100644 --- a/arch/openrisc/include/asm/futex.h +++ b/arch/openrisc/include/asm/futex.h @@ -30,20 +30,10 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0ba14300cd8e..c601aab2fb36 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags) } static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { unsigned long int flags; - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr))) - return -EFAULT; - _futex_spin_lock_irqsave(uaddr, &flags); pagefault_disable(); @@ -85,17 +75,9 @@ out_pagefault_enable: pagefault_enable(); _futex_spin_unlock_irqrestore(uaddr, &flags); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index eaada6c92344..719ed9b61ea7 100644 --- a/arch/powerpc/include/asm/futex.h +++ 
b/arch/powerpc/include/asm/futex.h @@ -29,18 +29,10 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index a4811aa0304d..8f8eec9e1198 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -21,17 +21,12 @@ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, newval, ret; load_kernel_asce(); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; pagefault_disable(); switch (op) { @@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index d0078747d308..8f8cf941a8cd 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - u32 oparg = (encoded_op << 8) >> 20; - u32 cmparg = (encoded_op << 20) >> 20; u32 oldval, newval, prev; int ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); do { @@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case 
FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break; - case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break; - case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break; - case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; return ret; } diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 4e899b0dabf7..1cfd89d92208 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -29,22 +29,14 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - pagefault_disable(); switch (op) { @@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index e64a1b75fc38..83c1e639b411 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -106,12 +106,9 @@ lock = __atomic_hashed_lock((int __force *)uaddr) #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int uninitialized_var(val), ret; __futex_prolog(); @@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* The 32-bit futex code makes this assumption, so validate it here. 
*/ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (val == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (val != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (val < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (val >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (val <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (val > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = val; + return ret; } diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index b39531babec0..eaaf1ebcc7a4 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -44,18 +44,10 @@ : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; #if !XCHAL_HAVE_S32C1I return -ENOSYS; @@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (ret) - return ret; + if (!ret) + *oval = oldval; - switch (cmp) { - case FUTEX_OP_CMP_EQ: return (oldval == cmparg); - case FUTEX_OP_CMP_NE: return (oldval != cmparg); - case FUTEX_OP_CMP_LT: return (oldval < cmparg); - case FUTEX_OP_CMP_GE: return (oldval >= cmparg); - case FUTEX_OP_CMP_LE: return (oldval <= cmparg); - case FUTEX_OP_CMP_GT: return (oldval > cmparg); - } - - return -ENOSYS; + return ret; 
} static inline int diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index bf2d34c9d804..f0d8b1c51343 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -13,7 +13,7 @@ */ /** - * futex_atomic_op_inuser() - Atomic arithmetic operation with constant + * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant * argument and comparison of the previous * futex value with another constant. * @@ -25,18 +25,11 @@ * <0 - On error */ static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - preempt_disable(); pagefault_disable(); @@ -74,17 +67,9 @@ out_pagefault_enable: pagefault_enable(); preempt_enable(); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (ret == 0) + *oval = oldval; + return ret; } @@ -126,18 +111,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #else static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -153,17 +129,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/kernel/futex.c b/kernel/futex.c index 0939255fc750..3d38eaf05492 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1551,6 +1551,45 @@ out: return ret; } +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ + unsigned int op = (encoded_op & 0x70000000) >> 28; + unsigned int cmp = (encoded_op & 0x0f000000) >> 24; + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); + int oldval, ret; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { + if (oparg < 0 || oparg > 31) + return -EINVAL; + oparg = 1 << oparg; + } + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + if (ret) + return ret; + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + return oldval == cmparg; + case FUTEX_OP_CMP_NE: + return oldval != cmparg; + case FUTEX_OP_CMP_LT: + return oldval < cmparg; + case FUTEX_OP_CMP_GE: + return oldval >= cmparg; + case FUTEX_OP_CMP_LE: + return oldval <= cmparg; + case FUTEX_OP_CMP_GT: + return oldval > cmparg; + default: + return -ENOSYS; + } +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: -- cgit v1.2.3 From f52d53a9a0e08e3a003f0f3ff8e163125319af5a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 13:46:27 -0700 Subject: scsi: Remove an obsolete function declaration Commit e9c787e65c0c ("scsi: allocate scsi_cmnd structures as part of struct request") removed the scsi_get_command() function. Hence also remove the declaration of that function. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Martin K. 
Petersen --- include/scsi/scsi_cmnd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index a1266d318c85..f5afcff8d76f 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -158,7 +158,6 @@ static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) return *(struct scsi_driver **)cmd->request->rq_disk->private_data; } -extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, gfp_t); extern void scsi_put_command(struct scsi_cmnd *); extern void scsi_finish_command(struct scsi_cmnd *cmd); -- cgit v1.2.3 From c2e872a27c610a5e2a71444f86470f765ed9dfce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 13:46:28 -0700 Subject: scsi: Avoid sign extension of scsi_device.type This patch avoids that smatch reports the following: drivers/scsi/scsi_sysfs.c:506 scsi_bus_uevent() warn: argument 3 to %02x specifier has type 'char' drivers/scsi/scsi_sysfs.c:872 sdev_show_modalias() warn: argument 4 to %02x specifier has type 'char' Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- include/scsi/scsi_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 0979a5f3b69a..f054f3f43c75 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -111,7 +111,7 @@ struct scsi_device { unsigned sector_size; /* size in bytes */ void *hostdata; /* available to low-level driver */ - char type; + unsigned char type; char scsi_level; char inq_periph_qual; /* PQ from INQUIRY data */ struct mutex inquiry_mutex; -- cgit v1.2.3 From bed2213d01de474eb8a6f3891070eec6be6fe772 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 13:46:32 -0700 Subject: scsi: Use blk_mq_rq_to_pdu() to convert a request to a SCSI command pointer Since commit e9c787e65c0c ("scsi: allocate scsi_cmnd structures as part of struct request") struct request and struct scsi_cmnd are adjacent. This means that there is now an alternative to reading req->special to convert a pointer to a prepared request into a SCSI command pointer, namely by using blk_mq_rq_to_pdu(). Make this change where appropriate. Although this patch does not change any functionality, it slightly improves performance and slightly improves readability. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 18 +++++++++--------- include/scsi/scsi_tcq.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 01b2d2055edf..38942050b265 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -259,7 +259,7 @@ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) */ enum blk_eh_timer_return scsi_times_out(struct request *req) { - struct scsi_cmnd *scmd = req->special; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED; struct Scsi_Host *host = scmd->device->host; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9e8ce0d66c5a..0270b35f7680 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -627,7 +627,7 @@ static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd) static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes, unsigned int bidi_bytes) { - struct scsi_cmnd *cmd = req->special; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = cmd->device; struct request_queue *q = sdev->request_queue; @@ -1171,7 +1171,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) { - struct scsi_cmnd *cmd = req->special; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); /* * Passthrough requests may transfer data, in which case they must @@ -1202,7 +1202,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) */ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) { - struct scsi_cmnd *cmd = req->special; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); if (unlikely(sdev->handler && sdev->handler->prep_fn)) { int ret = sdev->handler->prep_fn(sdev, req); @@ -1217,7 +1217,7 @@ static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req) static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req) { - struct scsi_cmnd *cmd = req->special; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); if (!blk_rq_bytes(req)) cmd->sc_data_direction = DMA_NONE; @@ -1354,7 +1354,7 @@ out: static void scsi_unprep_fn(struct request_queue *q, struct request *req) { - scsi_uninit_cmd(req->special); + scsi_uninit_cmd(blk_mq_rq_to_pdu(req)); } /* @@ -1545,7 +1545,7 @@ static int scsi_lld_busy(struct request_queue *q) */ static void scsi_kill_request(struct request *req, struct request_queue *q) { - struct scsi_cmnd *cmd = req->special; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev; struct scsi_target *starget; struct Scsi_Host *shost; @@ -1576,7 +1576,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q) static void scsi_softirq_done(struct request *rq) { - struct scsi_cmnd *cmd = rq->special; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); unsigned long wait_for = (cmd->allowed + 1) * rq->timeout; int disposition; @@ -1764,8 +1764,8 @@ static void scsi_request_fn(struct request_queue *q) blk_start_request(req); spin_unlock_irq(q->queue_lock); - cmd = req->special; - if (unlikely(cmd == NULL)) { + cmd = blk_mq_rq_to_pdu(req); + if (cmd != req->special) { printk(KERN_CRIT "impossible request in %s.\n" "please mail a stack trace to " "linux-scsi@vger.kernel.org\n", diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h index 4416b1026189..5b416debf101 100644 --- 
a/include/scsi/scsi_tcq.h +++ b/include/scsi/scsi_tcq.h @@ -39,7 +39,7 @@ static inline struct scsi_cmnd *scsi_host_find_tag(struct Scsi_Host *shost, if (!req) return NULL; - return req->special; + return blk_mq_rq_to_pdu(req); } #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 85aa93be7558b009fd8b9f0171d8b8a2b44397f7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 25 Aug 2017 13:46:41 -0700 Subject: scsi: scsi_transport_srp: Suppress a W=1 compiler warning Avoid the following compiler warning when building with W=1: drivers/scsi/scsi_transport_srp.c:92:19: warning: comparison is always false due to limited range of data type [-Wtype-limits] Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_srp.c | 2 +- include/scsi/scsi_transport_srp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 698cc4681706..4f6f01cf9968 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -78,7 +78,7 @@ static inline struct srp_rport *shost_to_rport(struct Scsi_Host *shost) * parameters must be such that multipath can detect failed paths timely. * Hence do not allow all three parameters to be disabled simultaneously. */ -int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo) +int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, long dev_loss_tmo) { if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) return -EINVAL; diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index dd096330734e..56ae198acc73 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -111,7 +111,7 @@ extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, - int dev_loss_tmo); + long dev_loss_tmo); int srp_parse_tmo(int *tmo, const char *buf); extern int srp_reconnect_rport(struct srp_rport *rport); extern void srp_start_tl_fail_timers(struct srp_rport *rport); -- cgit v1.2.3 From cc199e78460565eeab0399875dbf9da8e2901c42 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 25 Aug 2017 13:57:02 +0200 Subject: scsi: libsas: move bus_reset_handler() to target_reset_handler() The bus reset handler is calling I_T Nexus reset, which logically is a target reset, as it needs to specify both the initiator and the target. So move it to the target reset handler. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K.
Petersen --- drivers/scsi/aic94xx/aic94xx_init.c | 2 +- drivers/scsi/hisi_sas/hisi_sas_main.c | 2 +- drivers/scsi/isci/init.c | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 12 ++++++------ drivers/scsi/mvsas/mv_init.c | 2 +- drivers/scsi/pm8001/pm8001_init.c | 2 +- include/scsi/libsas.h | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c index a240feee16e5..6c838865ac5a 100644 --- a/drivers/scsi/aic94xx/aic94xx_init.c +++ b/drivers/scsi/aic94xx/aic94xx_init.c @@ -70,7 +70,7 @@ static struct scsi_host_template aic94xx_sht = { .max_sectors = SCSI_DEFAULT_MAX_SECTORS, .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, - .eh_bus_reset_handler = sas_eh_bus_reset_handler, + .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, .ioctl = sas_ioctl, .track_queue_depth = 1, diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index bdef111434b8..16664f2e15fb 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1562,7 +1562,7 @@ static struct scsi_host_template _hisi_sas_sht = { .max_sectors = SCSI_DEFAULT_MAX_SECTORS, .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, - .eh_bus_reset_handler = sas_eh_bus_reset_handler, + .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, .ioctl = sas_ioctl, }; diff --git a/drivers/scsi/isci/init.c b/drivers/scsi/isci/init.c index 45371179ab87..922e3e56c90d 100644 --- a/drivers/scsi/isci/init.c +++ b/drivers/scsi/isci/init.c @@ -166,7 +166,7 @@ static struct scsi_host_template isci_sht = { .use_clustering = ENABLE_CLUSTERING, .eh_abort_handler = sas_eh_abort_handler, .eh_device_reset_handler = sas_eh_device_reset_handler, - .eh_bus_reset_handler = sas_eh_bus_reset_handler, + .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, .ioctl = sas_ioctl, .shost_attrs = isci_host_attrs, diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index fc90b8c65860..ea8ad06ff582 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -526,7 +526,7 @@ int sas_eh_device_reset_handler(struct scsi_cmnd *cmd) return FAILED; } -int sas_eh_bus_reset_handler(struct scsi_cmnd *cmd) +int sas_eh_target_reset_handler(struct scsi_cmnd *cmd) { int res; struct Scsi_Host *host = cmd->device->host; @@ -554,15 +554,15 @@ static int try_to_reset_cmd_device(struct scsi_cmnd *cmd) struct Scsi_Host *shost = cmd->device->host; if (!shost->hostt->eh_device_reset_handler) - goto try_bus_reset; + goto try_target_reset; res = shost->hostt->eh_device_reset_handler(cmd); if (res == SUCCESS) return res; -try_bus_reset: - if (shost->hostt->eh_bus_reset_handler) - return shost->hostt->eh_bus_reset_handler(cmd); +try_target_reset: + if (shost->hostt->eh_target_reset_handler) + return shost->hostt->eh_target_reset_handler(cmd); return FAILED; } @@ -993,6 +993,6 @@ EXPORT_SYMBOL_GPL(sas_bios_param); EXPORT_SYMBOL_GPL(sas_task_abort); EXPORT_SYMBOL_GPL(sas_phy_reset); EXPORT_SYMBOL_GPL(sas_eh_device_reset_handler); -EXPORT_SYMBOL_GPL(sas_eh_bus_reset_handler); +EXPORT_SYMBOL_GPL(sas_eh_target_reset_handler); EXPORT_SYMBOL_GPL(sas_target_destroy); EXPORT_SYMBOL_GPL(sas_ioctl); diff --git a/drivers/scsi/mvsas/mv_init.c b/drivers/scsi/mvsas/mv_init.c 
index f0a096a1e276..718c88de328b 100644 --- a/drivers/scsi/mvsas/mv_init.c +++ b/drivers/scsi/mvsas/mv_init.c @@ -61,7 +61,7 @@ static struct scsi_host_template mvs_sht = { .max_sectors = SCSI_DEFAULT_MAX_SECTORS, .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, - .eh_bus_reset_handler = sas_eh_bus_reset_handler, + .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, .ioctl = sas_ioctl, .shost_attrs = mvst_host_attrs, diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 2908881bad51..0e013f76b582 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -86,7 +86,7 @@ static struct scsi_host_template pm8001_sht = { .max_sectors = SCSI_DEFAULT_MAX_SECTORS, .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, - .eh_bus_reset_handler = sas_eh_bus_reset_handler, + .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, .ioctl = sas_ioctl, .shost_attrs = pm8001_host_attrs, diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index cfaeed256ab2..e7c012ce5ecd 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -714,7 +714,7 @@ void sas_init_dev(struct domain_device *); void sas_task_abort(struct sas_task *); int sas_eh_abort_handler(struct scsi_cmnd *cmd); int sas_eh_device_reset_handler(struct scsi_cmnd *cmd); -int sas_eh_bus_reset_handler(struct scsi_cmnd *cmd); +int sas_eh_target_reset_handler(struct scsi_cmnd *cmd); extern void sas_target_destroy(struct scsi_target *); extern int sas_slave_alloc(struct scsi_device *); -- cgit v1.2.3 From fbb113f773482496d601e0bd934e680b35876016 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Thu, 29 Jun 2017 15:12:24 +0200 Subject: i40e/i40evf: rename vf_offload_flags to vf_cap_flags in struct virtchnl_vf_resource The current name of vf_offload_flags indicates that the bitmap is limited to offload-related features. Make this more generic by renaming it to vf_cap_flags, which allows for other capabilities besides offloading to be added.
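For illustration only (a sketch, not part of this patch): after the rename, callers test capability bits against vf_cap_flags exactly as they previously did against vf_offload_flags; the bit definitions themselves are unchanged. The helper below is hypothetical and merely shows that pattern.

#include <linux/types.h>
#include <linux/avf/virtchnl.h>

/* Hypothetical helper, not added by this series: check one capability
 * bit on the renamed field. Only the field name differs from the
 * pre-rename code.
 */
static bool vf_supports_vlan(const struct virtchnl_vf_resource *vfres)
{
	return !!(vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN);
}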
Signed-off-by: Stefan Assmann Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 22 +++++++++++----------- drivers/net/ethernet/intel/i40evf/i40e_common.c | 2 +- drivers/net/ethernet/intel/i40evf/i40evf.h | 10 +++++----- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 12 ++++++------ include/linux/avf/virtchnl.h | 4 ++-- 5 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 3ef67dc094fc..057c77be96e4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1528,39 +1528,39 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) VIRTCHNL_VF_OFFLOAD_RSS_REG | VIRTCHNL_VF_OFFLOAD_VLAN; - vfres->vf_offload_flags = VIRTCHNL_VF_OFFLOAD_L2; + vfres->vf_cap_flags = VIRTCHNL_VF_OFFLOAD_L2; vsi = pf->vsi[vf->lan_vsi_idx]; if (!vsi->info.pvid) - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_VLAN; if (i40e_vf_client_capable(pf, vf->vf_id) && (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_IWARP)) { - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_IWARP; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_IWARP; set_bit(I40E_VF_STATE_IWARPENA, &vf->vf_states); } if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PF) { - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PF; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PF; } else { if ((pf->hw_features & I40E_HW_RSS_AQ_CAPABLE) && (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_AQ)) - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_RSS_AQ; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_AQ; else - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; } if (pf->hw_features & I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE) { if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2) - vfres->vf_offload_flags |= + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2; } if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ENCAP) - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_ENCAP; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ENCAP; if ((pf->hw_features & I40E_HW_OUTER_UDP_CSUM_CAPABLE) && (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM)) - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM; if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RX_POLLING) { if (pf->flags & I40E_FLAG_MFP_ENABLED) { @@ -1570,12 +1570,12 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) aq_ret = I40E_ERR_PARAM; goto err; } - vfres->vf_offload_flags |= VIRTCHNL_VF_OFFLOAD_RX_POLLING; + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RX_POLLING; } if (pf->hw_features & I40E_HW_WB_ON_ITR_CAPABLE) { if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) - vfres->vf_offload_flags |= + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_WB_ON_ITR; } diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c b/drivers/net/ethernet/intel/i40evf/i40e_common.c index 1dd1938f594f..d69c2e44cd1a 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_common.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c @@ -1104,7 +1104,7 @@ void i40e_vf_parse_hw_config(struct i40e_hw *hw, hw->dev_caps.num_rx_qp = msg->num_queue_pairs; hw->dev_caps.num_tx_qp = msg->num_queue_pairs; hw->dev_caps.num_msix_vectors_vf = msg->max_vectors; - hw->dev_caps.dcb = msg->vf_offload_flags & + 
hw->dev_caps.dcb = msg->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_L2; hw->dev_caps.fcoe = 0; for (i = 0; i < msg->num_vsis; i++) { diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index 7f905368fc93..d310544c6c6e 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -277,19 +277,19 @@ struct i40evf_adapter { enum virtchnl_link_speed link_speed; enum virtchnl_ops current_op; #define CLIENT_ALLOWED(_a) ((_a)->vf_res ? \ - (_a)->vf_res->vf_offload_flags & \ + (_a)->vf_res->vf_cap_flags & \ VIRTCHNL_VF_OFFLOAD_IWARP : \ 0) #define CLIENT_ENABLED(_a) ((_a)->cinst) /* RSS by the PF should be preferred over RSS via other methods. */ -#define RSS_PF(_a) ((_a)->vf_res->vf_offload_flags & \ +#define RSS_PF(_a) ((_a)->vf_res->vf_cap_flags & \ VIRTCHNL_VF_OFFLOAD_RSS_PF) -#define RSS_AQ(_a) ((_a)->vf_res->vf_offload_flags & \ +#define RSS_AQ(_a) ((_a)->vf_res->vf_cap_flags & \ VIRTCHNL_VF_OFFLOAD_RSS_AQ) -#define RSS_REG(_a) (!((_a)->vf_res->vf_offload_flags & \ +#define RSS_REG(_a) (!((_a)->vf_res->vf_cap_flags & \ (VIRTCHNL_VF_OFFLOAD_RSS_AQ | \ VIRTCHNL_VF_OFFLOAD_RSS_PF))) -#define VLAN_ALLOWED(_a) ((_a)->vf_res->vf_offload_flags & \ +#define VLAN_ALLOWED(_a) ((_a)->vf_res->vf_cap_flags & \ VIRTCHNL_VF_OFFLOAD_VLAN) struct virtchnl_vf_resource *vf_res; /* incl. all VSIs */ struct virtchnl_vsi_resource *vsi_res; /* our LAN VSI */ diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 8603911cc550..4a36c2ee3837 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -1418,7 +1418,7 @@ static int i40evf_init_rss(struct i40evf_adapter *adapter) if (!RSS_PF(adapter)) { /* Enable PCTYPES for RSS, TCP/UDP with IPv4/IPv6 */ - if (adapter->vf_res->vf_offload_flags & + if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2) adapter->hena = I40E_DEFAULT_RSS_HENA_EXPANDED; else @@ -2371,7 +2371,7 @@ static netdev_features_t i40evf_fix_features(struct net_device *netdev, struct i40evf_adapter *adapter = netdev_priv(netdev); features &= ~I40EVF_VLAN_FEATURES; - if (adapter->vf_res->vf_offload_flags & VIRTCHNL_VF_OFFLOAD_VLAN) + if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN) features |= I40EVF_VLAN_FEATURES; return features; } @@ -2458,7 +2458,7 @@ int i40evf_process_config(struct i40evf_adapter *adapter) /* advertise to stack only if offloads for encapsulated packets is * supported */ - if (vfres->vf_offload_flags & VIRTCHNL_VF_OFFLOAD_ENCAP) { + if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_ENCAP) { hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM | @@ -2468,7 +2468,7 @@ int i40evf_process_config(struct i40evf_adapter *adapter) NETIF_F_GSO_PARTIAL | 0; - if (!(vfres->vf_offload_flags & + if (!(vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM)) netdev->gso_partial_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; @@ -2496,7 +2496,7 @@ int i40evf_process_config(struct i40evf_adapter *adapter) adapter->vsi.work_limit = I40E_DEFAULT_IRQ_WORK; vsi->netdev = adapter->netdev; vsi->qs_handle = adapter->vsi_res->qset_handle; - if (vfres->vf_offload_flags & VIRTCHNL_VF_OFFLOAD_RSS_PF) { + if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_RSS_PF) { adapter->rss_key_size = vfres->rss_key_size; adapter->rss_lut_size = vfres->rss_lut_size; } else { @@ -2664,7 +2664,7 @@ static void i40evf_init_task(struct work_struct *work) if (err) goto err_sw_init; 
i40evf_map_rings_to_vectors(adapter); - if (adapter->vf_res->vf_offload_flags & + if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) adapter->flags |= I40EVF_FLAG_WB_ON_ITR_CAPABLE; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index c893b9520a67..becfca2ae94e 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -223,7 +223,7 @@ struct virtchnl_vsi_resource { VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); -/* VF offload flags +/* VF capability flags * VIRTCHNL_VF_OFFLOAD_L2 flag is inclusive of base mode L2 offloads including * TX/RX Checksum offloading and TSO for non-tunnelled packets. */ @@ -251,7 +251,7 @@ struct virtchnl_vf_resource { u16 max_vectors; u16 max_mtu; - u32 vf_offload_flags; + u32 vf_cap_flags; u32 rss_key_size; u32 rss_lut_size; -- cgit v1.2.3 From 32d99d0b670299720dd0db92a974c9612c230889 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Fri, 25 Aug 2017 09:56:44 +0200 Subject: ipv6: sr: add support for ip4ip6 encapsulation This patch enables the SRv6 encapsulation mode to carry an IPv4 payload. All the infrastructure was already present, I just had to add a parameter to seg6_do_srh_encap() to specify the inner packet protocol, and perform some additional checks. Usage example: ip route add 1.2.3.4 encap seg6 mode encap segs fc00::1,fc00::2 dev eth0 Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/seg6.h | 3 ++- net/ipv6/seg6_iptunnel.c | 47 +++++++++++++++++++++++++++++++++++++---------- net/ipv6/seg6_local.c | 2 +- 3 files changed, 40 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/seg6.h b/include/net/seg6.h index 5379f550f521..099bad59dc90 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -60,7 +60,8 @@ extern int seg6_local_init(void); extern void seg6_local_exit(void); extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); -extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); +extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto); extern int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 501233040570..5bec7817a7b9 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -91,7 +91,7 @@ static void set_tun_src(struct net *net, struct net_device *dev, } /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { struct net *net = dev_net(skb_dst(skb)->dev); struct ipv6hdr *hdr, *inner_hdr; @@ -116,15 +116,22 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) * hlim will be decremented in ip6_forward() afterwards and * decapsulation will overwrite inner hlim with outer hlim */ - ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), - ip6_flowlabel(inner_hdr)); - hdr->hop_limit = inner_hdr->hop_limit; + + if (skb->protocol == htons(ETH_P_IPV6)) { + ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), + ip6_flowlabel(inner_hdr)); + hdr->hop_limit = inner_hdr->hop_limit; + } else { + ip6_flow_hdr(hdr, 0, 0); + hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); + } + hdr->nexthdr = NEXTHDR_ROUTING; isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, osrh, hdrlen); - isrh->nexthdr = NEXTHDR_IPV6; + isrh->nexthdr = proto; hdr->daddr = 
isrh->segments[isrh->first_segment]; set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr); @@ -199,7 +206,7 @@ static int seg6_do_srh(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; - int err = 0; + int proto, err = 0; tinfo = seg6_encap_lwtunnel(dst->lwtstate); @@ -210,17 +217,31 @@ static int seg6_do_srh(struct sk_buff *skb) switch (tinfo->mode) { case SEG6_IPTUN_MODE_INLINE: + if (skb->protocol != htons(ETH_P_IPV6)) + return -EINVAL; + err = seg6_do_srh_inline(skb, tinfo->srh); + if (err) + return err; + skb_reset_inner_headers(skb); break; case SEG6_IPTUN_MODE_ENCAP: - err = seg6_do_srh_encap(skb, tinfo->srh); + if (skb->protocol == htons(ETH_P_IPV6)) + proto = IPPROTO_IPV6; + else if (skb->protocol == htons(ETH_P_IP)) + proto = IPPROTO_IPIP; + else + return -EINVAL; + + err = seg6_do_srh_encap(skb, tinfo->srh, proto); + if (err) + return err; + + skb->protocol = htons(ETH_P_IPV6); break; } - if (err) - return err; - ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); @@ -334,6 +355,9 @@ static int seg6_build_state(struct nlattr *nla, struct seg6_lwt *slwt; int err; + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla, seg6_iptunnel_policy, extack); @@ -356,6 +380,9 @@ static int seg6_build_state(struct nlattr *nla, switch (tuninfo->mode) { case SEG6_IPTUN_MODE_INLINE: + if (family != AF_INET6) + return -EINVAL; + break; case SEG6_IPTUN_MODE_ENCAP: break; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 147680e7a00c..609b94e970de 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -290,7 +290,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb, skb_reset_inner_headers(skb); skb->encapsulation = 1; - err = seg6_do_srh_encap(skb, slwt->srh); + err = seg6_do_srh_encap(skb, slwt->srh, IPPROTO_IPV6); if (err) goto drop; -- cgit v1.2.3 From 38ee7f2d47565689f35662d488d25e7afc43477d Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Fri, 25 Aug 2017 09:56:45 +0200 Subject: ipv6: sr: add support for encapsulation of L2 frames This patch implements the L2 frame encapsulation mechanism, referred to as T.Encaps.L2 in the SRv6 specifications [1]. A new type of SRv6 tunnel mode is added (SEG6_IPTUN_MODE_L2ENCAP). It only accepts packets with an existing MAC header (i.e., it will not work for locally generated packets). The resulting packet looks like IPv6 -> SRH -> Ethernet -> original L3 payload. The next header field of the SRH is set to NEXTHDR_NONE. [1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01 Signed-off-by: David Lebrun Signed-off-by: David S. 
Miller --- include/uapi/linux/seg6_iptunnel.h | 18 ++++++++++++++---- net/ipv6/seg6_iptunnel.c | 25 +++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index b6e5a0a1afd7..b23df9f58354 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -33,16 +33,26 @@ struct seg6_iptunnel_encap { enum { SEG6_IPTUN_MODE_INLINE, SEG6_IPTUN_MODE_ENCAP, + SEG6_IPTUN_MODE_L2ENCAP, }; #ifdef __KERNEL__ static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { - int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP); - - return ((tuninfo->srh->hdrlen + 1) << 3) + - (encap * sizeof(struct ipv6hdr)); + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; } #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 5bec7817a7b9..bd6cc688bd19 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -238,6 +238,22 @@ static int seg6_do_srh(struct sk_buff *skb) if (err) return err; + skb->protocol = htons(ETH_P_IPV6); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + if (!skb_mac_header_was_set(skb)) + return -EINVAL; + + if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0) + return -ENOMEM; + + skb_mac_header_rebuild(skb); + skb_push(skb, skb->mac_len); + + err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE); + if (err) + return err; + skb->protocol = htons(ETH_P_IPV6); break; } @@ -386,6 +402,8 @@ static int seg6_build_state(struct nlattr *nla, break; case SEG6_IPTUN_MODE_ENCAP: break; + case SEG6_IPTUN_MODE_L2ENCAP: + break; default: return -EINVAL; } @@ -409,8 +427,11 @@ static int seg6_build_state(struct nlattr *nla, memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); newts->type = LWTUNNEL_ENCAP_SEG6; - newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | - LWTUNNEL_STATE_INPUT_REDIRECT; + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + + if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + newts->headroom = seg6_lwt_headroom(tuninfo); *ts = newts; -- cgit v1.2.3 From ebfa00c5745660fe7f0a91eea88d4dff658486c4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2017 13:10:12 +0200 Subject: tcp: fix refcnt leak with ebpf congestion control There are a few bugs around refcnt handling in the new BPF congestion control setsockopt: - The new ca is assigned to icsk->icsk_ca_ops even in the case where we cannot get a reference on it. This would lead to a use-after-free, since that ca is going away soon. - The change-of-congestion-control case doesn't release the refcnt on the previous ca. - In the reinit case, we first leak a reference on the old ca, then we call tcp_reinit_congestion_control on the ca that we have just assigned, leading to deinitializing the wrong ca (->release of the new ca on the old ca's data) and releasing the refcount on the ca that we actually want to use. This is visible by building (for example) BIC as a module and setting net.ipv4.tcp_congestion_control=bic, and using tcp_cong_kern.c from samples/bpf. This patch fixes the refcount issues, and moves reinit back into tcp core to avoid passing a ca pointer back to BPF.
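For illustration, a condensed sketch of the corrected ordering in the !load case (the actual hunk in net/ipv4/tcp_cong.c appears below; the lookup of ca and the surrounding error handling are omitted, and sk, icsk, ca and reinit are assumed to be in scope as inside tcp_set_congestion_control()):

	const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;

	/* Take the reference on the new ops before publishing them, so a
	 * failed try_module_get() can no longer leave a dangling pointer.
	 */
	if (!try_module_get(ca->owner))
		return -EBUSY;

	if (reinit) {
		/* Replacing an initialized ca: tcp_reinit_congestion_control()
		 * cleans up and drops the reference on the old ops itself.
		 */
		tcp_reinit_congestion_control(sk, ca);
	} else {
		icsk->icsk_ca_ops = ca;		/* publish the new ops... */
		module_put(old_ca->owner);	/* ...then drop the old reference */
	}
	return 0;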
Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Sabrina Dubroca Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +--- net/core/filter.c | 7 ++----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 19 ++++++++++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ada65e767b28..f642a39f9eee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1004,9 +1004,7 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 8eb81e5fae08..169974998c76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; + bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false); - if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - /* replacing an existing ca */ - tcp_reinit_congestion_control(sk, - inet_csk(sk)->icsk_ca_ops); + ret = tcp_set_congestion_control(sk, name, false, reinit); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..a3e91b552edc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true); + err = tcp_set_congestion_control(sk, name, true, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..421ea1b918da 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -338,7 +338,7 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. 
*/ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) if (!ca) { err = -ENOENT; } else if (!load) { - icsk->icsk_ca_ops = ca; - if (!try_module_get(ca->owner)) + const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; + + if (try_module_get(ca->owner)) { + if (reinit) { + tcp_reinit_congestion_control(sk, ca); + } else { + icsk->icsk_ca_ops = ca; + module_put(old_ca->owner); + } + } else { err = -EBUSY; + } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; -- cgit v1.2.3 From 143976ce992fcf3bfc0f4d15d5726bb492dcf262 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 24 Aug 2017 16:51:29 -0700 Subject: net_sched: remove tc class reference counting For TC classes, their ->get() and ->put() are always paired, and the reference counting is completely useless, because: 1) For class modification and dumping paths, we already hold the RTNL lock, so all of these ->get(), ->change(), ->put() are atomic. 2) For filter binding/unbinding, we use a different reference counter than this one, and those paths hold the RTNL lock too. 3) For ->qlen_notify(), it is special because it is called on the ->enqueue() path, but we already hold the qdisc tree lock there, and we hold this tree lock when we graft or delete the class too, so it should not be gone or changed until we release the tree lock. Therefore, this patch removes ->get() and ->put(), but: 1) Adds a new ->find() to find the pointer to a class by classid, with no refcnt. 2) Moves the original destroy-on-last-refcnt logic into ->delete(), right after releasing the tree lock. This is fine because the class is already removed from the hash while holding the lock. For those qdiscs that also use ->put() as ->unbind(), just rename them to reflect this change. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- include/net/sch_generic.h | 3 +-- net/sched/cls_api.c | 17 ++++++----------- net/sched/sch_api.c | 21 ++++++++------------- net/sched/sch_atm.c | 30 ++++++++++++++++-------------- net/sched/sch_cbq.c | 41 ++++------------------------------------- net/sched/sch_drr.c | 30 +++++------------------------- net/sched/sch_dsmark.c | 17 ++++++++--------- net/sched/sch_fq_codel.c | 9 ++++----- net/sched/sch_hfsc.c | 32 +++++--------------------------- net/sched/sch_htb.c | 33 +++++++-------------------------- net/sched/sch_ingress.c | 20 +++++++++----------- net/sched/sch_mq.c | 9 ++------- net/sched/sch_mqprio.c | 9 ++------- net/sched/sch_multiq.c | 11 +++++------ net/sched/sch_netem.c | 9 ++------- net/sched/sch_prio.c | 11 +++++------ net/sched/sch_qfq.c | 30 +++++------------------------- net/sched/sch_red.c | 9 ++------- net/sched/sch_sfb.c | 9 ++++----- net/sched/sch_sfq.c | 9 ++++----- net/sched/sch_tbf.c | 9 ++------- 21 files changed, 106 insertions(+), 262 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1688f0f6c7ba..d817585aa5df 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -147,8 +147,7 @@ struct Qdisc_class_ops { void (*qlen_notify)(struct Qdisc *, unsigned long); /* Class manipulation routines */ - unsigned long (*get)(struct Qdisc *, u32 classid); - void (*put)(struct Qdisc *, unsigned long); + unsigned long (*find)(struct Qdisc *, u32 classid); int (*change)(struct Qdisc *, u32, u32, struct nlattr **, unsigned long *); int (*delete)(struct Qdisc *, unsigned long); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index eef6b077f30e..d470a4e2de58 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -586,7 +586,7 @@ replay: /* Do we search for filter, attached to class? */ if (TC_H_MIN(parent)) { - cl = cops->get(q, parent); + cl = cops->find(q, parent); if (cl == 0) return -ENOENT; } @@ -716,8 +716,6 @@ replay: errout: if (chain) tcf_chain_put(chain); - if (cl) - cops->put(q, cl); if (err == -EAGAIN) /* Replay the request. 
*/ goto replay; @@ -822,17 +820,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) goto out; cops = q->ops->cl_ops; if (!cops) - goto errout; + goto out; if (!cops->tcf_block) - goto errout; + goto out; if (TC_H_MIN(tcm->tcm_parent)) { - cl = cops->get(q, tcm->tcm_parent); + cl = cops->find(q, tcm->tcm_parent); if (cl == 0) - goto errout; + goto out; } block = cops->tcf_block(q, cl); if (!block) - goto errout; + goto out; index_start = cb->args[0]; index = 0; @@ -847,9 +845,6 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0] = index; -errout: - if (cl) - cops->put(q, cl); out: return skb->len; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3ef4eb578739..e7f8e4bfd4ec 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -153,7 +153,7 @@ int register_qdisc(struct Qdisc_ops *qops) if (qops->cl_ops) { const struct Qdisc_class_ops *cops = qops->cl_ops; - if (!(cops->get && cops->put && cops->walk && cops->leaf)) + if (!(cops->find && cops->walk && cops->leaf)) goto out_einval; if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) @@ -320,12 +320,11 @@ static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) if (cops == NULL) return NULL; - cl = cops->get(p, classid); + cl = cops->find(p, classid); if (cl == 0) return NULL; leaf = cops->leaf(p, cl); - cops->put(p, cl); return leaf; } @@ -756,9 +755,8 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { - cl = cops->get(sch, parentid); + cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); - cops->put(sch, cl); } sch->q.qlen -= n; sch->qstats.backlog -= len; @@ -955,11 +953,11 @@ skip: err = -EOPNOTSUPP; if (cops && cops->graft) { - unsigned long cl = cops->get(parent, classid); - if (cl) { + unsigned long cl = cops->find(parent, classid); + + if (cl) err = cops->graft(parent, cl, new, &old); - cops->put(parent, cl); - } else + else err = -ENOENT; } if (!err) @@ -1739,7 +1737,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, clid = TC_H_MAKE(qid, clid); if (clid) - cl = cops->get(q, clid); + cl = cops->find(q, clid); if (cl == 0) { err = -ENOENT; @@ -1773,9 +1771,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); out: - if (cl) - cops->put(q, cl); - return err; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 2732950766a9..c5fcdf1a58a0 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -108,23 +108,29 @@ static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) return flow ? 
flow->q : NULL; } -static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid) +static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid) { struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); struct atm_flow_data *flow; - pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); flow = lookup_flow(sch, classid); - if (flow) - flow->ref++; - pr_debug("atm_tc_get: flow %p\n", flow); + pr_debug("%s: flow %p\n", __func__, flow); return (unsigned long)flow; } static unsigned long atm_tc_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { - return atm_tc_get(sch, classid); + struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); + struct atm_flow_data *flow; + + pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); + flow = lookup_flow(sch, classid); + if (flow) + flow->ref++; + pr_debug("%s: flow %p\n", __func__, flow); + return (unsigned long)flow; } /* @@ -234,7 +240,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, excess = NULL; else { excess = (struct atm_flow_data *) - atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); + atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); if (!excess) return -ENOENT; } @@ -262,10 +268,9 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, for (i = 1; i < 0x8000; i++) { classid = TC_H_MAKE(sch->handle, 0x8000 | i); - cl = atm_tc_get(sch, classid); + cl = atm_tc_find(sch, classid); if (!cl) break; - atm_tc_put(sch, cl); } } pr_debug("atm_tc_change: new id %x\n", classid); @@ -305,8 +310,6 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, *arg = (unsigned long)flow; return 0; err_out: - if (excess) - atm_tc_put(sch, (unsigned long)excess); sockfd_put(sock); return error; } @@ -377,7 +380,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, result = TC_ACT_OK; /* be nice to gcc */ flow = NULL; if (TC_H_MAJ(skb->priority) != sch->handle || - !(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) { + !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) { struct tcf_proto *fl; list_for_each_entry(flow, &p->flows, list) { @@ -655,8 +658,7 @@ static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) static const struct Qdisc_class_ops atm_class_ops = { .graft = atm_tc_graft, .leaf = atm_tc_leaf, - .get = atm_tc_get, - .put = atm_tc_put, + .find = atm_tc_find, .change = atm_tc_change, .delete = atm_tc_delete, .walk = atm_tc_walk, diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1bdb0106f342..3ec8bec109bb 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -129,7 +129,6 @@ struct cbq_class { struct tcf_proto __rcu *filter_list; struct tcf_block *block; - int refcnt; int filters; struct cbq_class *defaults[TC_PRIO_MAX + 1]; @@ -1155,7 +1154,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) if (err < 0) goto put_rtab; - q->link.refcnt = 1; q->link.sibling = &q->link; q->link.common.classid = sch->handle; q->link.qdisc = sch; @@ -1388,16 +1386,11 @@ static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) cbq_deactivate_class(cl); } -static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +static unsigned long cbq_find(struct Qdisc *sch, u32 classid) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = cbq_class_lookup(q, classid); - if (cl) { - cl->refcnt++; - return (unsigned long)cl; - } - return 0; + return (unsigned long)cbq_class_lookup(q, 
classid); } static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) @@ -1443,25 +1436,6 @@ static void cbq_destroy(struct Qdisc *sch) qdisc_class_hash_destroy(&q->clhash); } -static void cbq_put(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - if (--cl->refcnt == 0) { -#ifdef CONFIG_NET_CLS_ACT - spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); - struct cbq_sched_data *q = qdisc_priv(sch); - - spin_lock_bh(root_lock); - if (q->rx_class == cl) - q->rx_class = NULL; - spin_unlock_bh(root_lock); -#endif - - cbq_destroy_class(sch, cl); - } -} - static int cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg) @@ -1608,7 +1582,6 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->R_tab = rtab; rtab = NULL; - cl->refcnt = 1; cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!cl->q) cl->q = &noop_qdisc; @@ -1689,12 +1662,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg) cbq_rmprio(q, cl); sch_tree_unlock(sch); - BUG_ON(--cl->refcnt == 0); - /* - * This shouldn't happen: we "hold" one cops->get() when called - * from tc_ctl_tclass; the destroy method is done from cops->put(). - */ - + cbq_destroy_class(sch, cl); return 0; } @@ -1760,8 +1728,7 @@ static const struct Qdisc_class_ops cbq_class_ops = { .graft = cbq_graft, .leaf = cbq_leaf, .qlen_notify = cbq_qlen_notify, - .get = cbq_get, - .put = cbq_put, + .find = cbq_find, .change = cbq_change_class, .delete = cbq_delete, .walk = cbq_walk, diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 1d2f6235dfcf..2d0e8d4bdc29 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -20,7 +20,6 @@ struct drr_class { struct Qdisc_class_common common; - unsigned int refcnt; unsigned int filter_cnt; struct gnet_stats_basic_packed bstats; @@ -111,7 +110,6 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - cl->refcnt = 1; cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -163,32 +161,15 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg) drr_purge_queue(cl); qdisc_class_hash_remove(&q->clhash, &cl->common); - BUG_ON(--cl->refcnt == 0); - /* - * This shouldn't happen: we "hold" one cops->get() when called - * from tc_ctl_tclass; the destroy method is done from cops->put(). 
- */ - sch_tree_unlock(sch); - return 0; -} - -static unsigned long drr_get_class(struct Qdisc *sch, u32 classid) -{ - struct drr_class *cl = drr_find_class(sch, classid); - if (cl != NULL) - cl->refcnt++; - - return (unsigned long)cl; + drr_destroy_class(sch, cl); + return 0; } -static void drr_put_class(struct Qdisc *sch, unsigned long arg) +static unsigned long drr_search_class(struct Qdisc *sch, u32 classid) { - struct drr_class *cl = (struct drr_class *)arg; - - if (--cl->refcnt == 0) - drr_destroy_class(sch, cl); + return (unsigned long)drr_find_class(sch, classid); } static struct tcf_block *drr_tcf_block(struct Qdisc *sch, unsigned long cl) @@ -478,8 +459,7 @@ static void drr_destroy_qdisc(struct Qdisc *sch) static const struct Qdisc_class_ops drr_class_ops = { .change = drr_change_class, .delete = drr_delete_class, - .get = drr_get_class, - .put = drr_put_class, + .find = drr_search_class, .tcf_block = drr_tcf_block, .bind_tcf = drr_bind_tcf, .unbind_tcf = drr_unbind_tcf, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 6d94fcc3592a..2836c80c7aa5 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -85,21 +85,21 @@ static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) return p->q; } -static unsigned long dsmark_get(struct Qdisc *sch, u32 classid) +static unsigned long dsmark_find(struct Qdisc *sch, u32 classid) { - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", - __func__, sch, qdisc_priv(sch), classid); - return TC_H_MIN(classid) + 1; } static unsigned long dsmark_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { - return dsmark_get(sch, classid); + pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", + __func__, sch, qdisc_priv(sch), classid); + + return dsmark_find(sch, classid); } -static void dsmark_put(struct Qdisc *sch, unsigned long cl) +static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl) { } @@ -469,14 +469,13 @@ nla_put_failure: static const struct Qdisc_class_ops dsmark_class_ops = { .graft = dsmark_graft, .leaf = dsmark_leaf, - .get = dsmark_get, - .put = dsmark_put, + .find = dsmark_find, .change = dsmark_change, .delete = dsmark_delete, .walk = dsmark_walk, .tcf_block = dsmark_tcf_block, .bind_tcf = dsmark_bind_filter, - .unbind_tcf = dsmark_put, + .unbind_tcf = dsmark_unbind_filter, .dump = dsmark_dump_class, }; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 337f2d6d81e4..7699b50688cd 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -579,7 +579,7 @@ static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg) return NULL; } -static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid) +static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid) { return 0; } @@ -592,7 +592,7 @@ static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, return 0; } -static void fq_codel_put(struct Qdisc *q, unsigned long cl) +static void fq_codel_unbind(struct Qdisc *q, unsigned long cl) { } @@ -683,11 +683,10 @@ static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) static const struct Qdisc_class_ops fq_codel_class_ops = { .leaf = fq_codel_leaf, - .get = fq_codel_get, - .put = fq_codel_put, + .find = fq_codel_find, .tcf_block = fq_codel_tcf_block, .bind_tcf = fq_codel_bind, - .unbind_tcf = fq_codel_put, + .unbind_tcf = fq_codel_unbind, .dump = fq_codel_dump_class, .dump_stats = fq_codel_dump_class_stats, .walk = fq_codel_walk, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 
15f09cb9f1ff..7c7820d0fdc7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -110,7 +110,6 @@ enum hfsc_class_flags { struct hfsc_class { struct Qdisc_class_common cl_common; - unsigned int refcnt; /* usage count */ struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; @@ -1045,7 +1044,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, 0); cl->cl_common.classid = classid; - cl->refcnt = 1; cl->sched = q; cl->cl_parent = parent; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -1101,13 +1099,9 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg) hfsc_purge_queue(sch, cl); qdisc_class_hash_remove(&q->clhash, &cl->cl_common); - BUG_ON(--cl->refcnt == 0); - /* - * This shouldn't happen: we "hold" one cops->get() when called - * from tc_ctl_tclass; the destroy method is done from cops->put(). - */ - sch_tree_unlock(sch); + + hfsc_destroy_class(sch, cl); return 0; } @@ -1208,23 +1202,9 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) } static unsigned long -hfsc_get_class(struct Qdisc *sch, u32 classid) -{ - struct hfsc_class *cl = hfsc_find_class(classid, sch); - - if (cl != NULL) - cl->refcnt++; - - return (unsigned long)cl; -} - -static void -hfsc_put_class(struct Qdisc *sch, unsigned long arg) +hfsc_search_class(struct Qdisc *sch, u32 classid) { - struct hfsc_class *cl = (struct hfsc_class *)arg; - - if (--cl->refcnt == 0) - hfsc_destroy_class(sch, cl); + return (unsigned long)hfsc_find_class(classid, sch); } static unsigned long @@ -1413,7 +1393,6 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) goto err_tcf; q->root.cl_common.classid = sch->handle; - q->root.refcnt = 1; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); @@ -1661,8 +1640,7 @@ static const struct Qdisc_class_ops hfsc_class_ops = { .graft = hfsc_graft_class, .leaf = hfsc_class_leaf, .qlen_notify = hfsc_qlen_notify, - .get = hfsc_get_class, - .put = hfsc_put_class, + .find = hfsc_search_class, .bind_tcf = hfsc_bind_tcf, .unbind_tcf = hfsc_unbind_tcf, .tcf_block = hfsc_tcf_block, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index dcf3c85e1f4f..f955b59d3c7c 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -107,7 +107,6 @@ struct htb_class { struct tcf_proto __rcu *filter_list; /* class attached filters */ struct tcf_block *block; int filter_cnt; - int refcnt; /* usage count of this class */ int level; /* our level (see above) */ unsigned int children; @@ -193,6 +192,10 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) return container_of(clc, struct htb_class, common); } +static unsigned long htb_search(struct Qdisc *sch, u32 handle) +{ + return (unsigned long)htb_find(handle, sch); +} /** * htb_classify - classify a packet into class * @@ -1189,14 +1192,6 @@ static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) htb_deactivate(qdisc_priv(sch), cl); } -static unsigned long htb_get(struct Qdisc *sch, u32 classid) -{ - struct htb_class *cl = htb_find(classid, sch); - if (cl) - cl->refcnt++; - return (unsigned long)cl; -} - static inline int htb_parent_last_child(struct htb_class *cl) { if (!cl->parent) @@ -1316,22 +1311,10 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) if (last_child) htb_parent_to_leaf(q, cl, new_q); - BUG_ON(--cl->refcnt == 0); - /* - * This shouldn't happen: we "hold" one cops->get() when called - * from tc_ctl_tclass; the destroy method is done from cops->put(). 
- */ - sch_tree_unlock(sch); - return 0; -} -static void htb_put(struct Qdisc *sch, unsigned long arg) -{ - struct htb_class *cl = (struct htb_class *)arg; - - if (--cl->refcnt == 0) - htb_destroy_class(sch, cl); + htb_destroy_class(sch, cl); + return 0; } static int htb_change_class(struct Qdisc *sch, u32 classid, @@ -1422,7 +1405,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } } - cl->refcnt = 1; cl->children = 0; INIT_LIST_HEAD(&cl->un.leaf.drop_list); RB_CLEAR_NODE(&cl->pq_node); @@ -1598,8 +1580,7 @@ static const struct Qdisc_class_ops htb_class_ops = { .graft = htb_graft, .leaf = htb_leaf, .qlen_notify = htb_qlen_notify, - .get = htb_get, - .put = htb_put, + .find = htb_search, .change = htb_change_class, .delete = htb_delete, .walk = htb_walk, diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index a15c543c3569..44de4ee51ce9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -27,7 +27,7 @@ static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) return NULL; } -static unsigned long ingress_get(struct Qdisc *sch, u32 classid) +static unsigned long ingress_find(struct Qdisc *sch, u32 classid) { return TC_H_MIN(classid) + 1; } @@ -35,10 +35,10 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid) static unsigned long ingress_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { - return ingress_get(sch, classid); + return ingress_find(sch, classid); } -static void ingress_put(struct Qdisc *sch, unsigned long cl) +static void ingress_unbind_filter(struct Qdisc *sch, unsigned long cl) { } @@ -94,12 +94,11 @@ nla_put_failure: static const struct Qdisc_class_ops ingress_class_ops = { .leaf = ingress_leaf, - .get = ingress_get, - .put = ingress_put, + .find = ingress_find, .walk = ingress_walk, .tcf_block = ingress_tcf_block, .bind_tcf = ingress_bind_filter, - .unbind_tcf = ingress_put, + .unbind_tcf = ingress_unbind_filter, }; static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { @@ -117,7 +116,7 @@ struct clsact_sched_data { struct tcf_block *egress_block; }; -static unsigned long clsact_get(struct Qdisc *sch, u32 classid) +static unsigned long clsact_find(struct Qdisc *sch, u32 classid) { switch (TC_H_MIN(classid)) { case TC_H_MIN(TC_H_MIN_INGRESS): @@ -131,7 +130,7 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid) static unsigned long clsact_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { - return clsact_get(sch, classid); + return clsact_find(sch, classid); } static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl) @@ -183,12 +182,11 @@ static void clsact_destroy(struct Qdisc *sch) static const struct Qdisc_class_ops clsact_class_ops = { .leaf = ingress_leaf, - .get = clsact_get, - .put = ingress_put, + .find = clsact_find, .walk = ingress_walk, .tcf_block = clsact_tcf_block, .bind_tcf = clsact_bind_filter, - .unbind_tcf = ingress_put, + .unbind_tcf = ingress_unbind_filter, }; static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index cadfdd4f1e52..f3a3e507422b 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -165,7 +165,7 @@ static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) return dev_queue->qdisc_sleeping; } -static unsigned long mq_get(struct Qdisc *sch, u32 classid) +static unsigned long mq_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); @@ -174,10 +174,6 @@ static unsigned long mq_get(struct Qdisc *sch, u32 
classid) return ntx; } -static void mq_put(struct Qdisc *sch, unsigned long cl) -{ -} - static int mq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { @@ -223,8 +219,7 @@ static const struct Qdisc_class_ops mq_class_ops = { .select_queue = mq_select_queue, .graft = mq_graft, .leaf = mq_leaf, - .get = mq_get, - .put = mq_put, + .find = mq_find, .walk = mq_walk, .dump = mq_dump_class, .dump_stats = mq_dump_class_stats, diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 2165a05994b7..6bcdfe6e7b63 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -277,7 +277,7 @@ static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) return dev_queue->qdisc_sleeping; } -static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) +static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); @@ -287,10 +287,6 @@ static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) return ntx; } -static void mqprio_put(struct Qdisc *sch, unsigned long cl) -{ -} - static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { @@ -403,8 +399,7 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) static const struct Qdisc_class_ops mqprio_class_ops = { .graft = mqprio_graft, .leaf = mqprio_leaf, - .get = mqprio_get, - .put = mqprio_put, + .find = mqprio_find, .walk = mqprio_walk, .dump = mqprio_dump_class, .dump_stats = mqprio_dump_class_stats, diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index f143b7bbaa0d..a5df979b6248 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -306,7 +306,7 @@ multiq_leaf(struct Qdisc *sch, unsigned long arg) return q->queues[band]; } -static unsigned long multiq_get(struct Qdisc *sch, u32 classid) +static unsigned long multiq_find(struct Qdisc *sch, u32 classid) { struct multiq_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); @@ -319,11 +319,11 @@ static unsigned long multiq_get(struct Qdisc *sch, u32 classid) static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { - return multiq_get(sch, classid); + return multiq_find(sch, classid); } -static void multiq_put(struct Qdisc *q, unsigned long cl) +static void multiq_unbind(struct Qdisc *q, unsigned long cl) { } @@ -385,12 +385,11 @@ static struct tcf_block *multiq_tcf_block(struct Qdisc *sch, unsigned long cl) static const struct Qdisc_class_ops multiq_class_ops = { .graft = multiq_graft, .leaf = multiq_leaf, - .get = multiq_get, - .put = multiq_put, + .find = multiq_find, .walk = multiq_walk, .tcf_block = multiq_tcf_block, .bind_tcf = multiq_bind, - .unbind_tcf = multiq_put, + .unbind_tcf = multiq_unbind, .dump = multiq_dump_class, .dump_stats = multiq_dump_class_stats, }; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1b3dd6190e93..cf5aad0aabfc 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1096,15 +1096,11 @@ static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) return q->qdisc; } -static unsigned long netem_get(struct Qdisc *sch, u32 classid) +static unsigned long netem_find(struct Qdisc *sch, u32 classid) { return 1; } -static void netem_put(struct Qdisc *sch, unsigned long arg) -{ -} - static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -1120,8 +1116,7 @@ static void netem_walk(struct Qdisc *sch, struct qdisc_walker 
*walker) static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, - .get = netem_get, - .put = netem_put, + .find = netem_find, .walk = netem_walk, .dump = netem_dump_class, }; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index e3e364cc9a70..f31b28f788c0 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -260,7 +260,7 @@ prio_leaf(struct Qdisc *sch, unsigned long arg) return q->queues[band]; } -static unsigned long prio_get(struct Qdisc *sch, u32 classid) +static unsigned long prio_find(struct Qdisc *sch, u32 classid) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); @@ -272,11 +272,11 @@ static unsigned long prio_get(struct Qdisc *sch, u32 classid) static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { - return prio_get(sch, classid); + return prio_find(sch, classid); } -static void prio_put(struct Qdisc *q, unsigned long cl) +static void prio_unbind(struct Qdisc *q, unsigned long cl) { } @@ -338,12 +338,11 @@ static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl) static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, - .get = prio_get, - .put = prio_put, + .find = prio_find, .walk = prio_walk, .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, - .unbind_tcf = prio_put, + .unbind_tcf = prio_unbind, .dump = prio_dump_class, .dump_stats = prio_dump_class_stats, }; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 9caa959f91e1..cd661a7f81e6 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -132,7 +132,6 @@ struct qfq_aggregate; struct qfq_class { struct Qdisc_class_common common; - unsigned int refcnt; unsigned int filter_cnt; struct gnet_stats_basic_packed bstats; @@ -477,7 +476,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - cl->refcnt = 1; cl->common.classid = classid; cl->deficit = lmax; @@ -555,32 +553,15 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) qfq_purge_queue(cl); qdisc_class_hash_remove(&q->clhash, &cl->common); - BUG_ON(--cl->refcnt == 0); - /* - * This shouldn't happen: we "hold" one cops->get() when called - * from tc_ctl_tclass; the destroy method is done from cops->put(). 
- */ - sch_tree_unlock(sch); - return 0; -} - -static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid) -{ - struct qfq_class *cl = qfq_find_class(sch, classid); - - if (cl != NULL) - cl->refcnt++; - return (unsigned long)cl; + qfq_destroy_class(sch, cl); + return 0; } -static void qfq_put_class(struct Qdisc *sch, unsigned long arg) +static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid) { - struct qfq_class *cl = (struct qfq_class *)arg; - - if (--cl->refcnt == 0) - qfq_destroy_class(sch, cl); + return (unsigned long)qfq_find_class(sch, classid); } static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl) @@ -1510,8 +1491,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) static const struct Qdisc_class_ops qfq_class_ops = { .change = qfq_change_class, .delete = qfq_delete_class, - .get = qfq_get_class, - .put = qfq_put_class, + .find = qfq_search_class, .tcf_block = qfq_tcf_block, .bind_tcf = qfq_bind_tcf, .unbind_tcf = qfq_unbind_tcf, diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 11292adce412..93b9d70a9b28 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -311,15 +311,11 @@ static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg) return q->qdisc; } -static unsigned long red_get(struct Qdisc *sch, u32 classid) +static unsigned long red_find(struct Qdisc *sch, u32 classid) { return 1; } -static void red_put(struct Qdisc *sch, unsigned long arg) -{ -} - static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -335,8 +331,7 @@ static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) static const struct Qdisc_class_ops red_class_ops = { .graft = red_graft, .leaf = red_leaf, - .get = red_get, - .put = red_put, + .find = red_find, .walk = red_walk, .dump = red_dump_class, }; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 11fb6ec878d6..cc39e170b4aa 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -632,12 +632,12 @@ static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) return q->qdisc; } -static unsigned long sfb_get(struct Qdisc *sch, u32 classid) +static unsigned long sfb_find(struct Qdisc *sch, u32 classid) { return 1; } -static void sfb_put(struct Qdisc *sch, unsigned long arg) +static void sfb_unbind(struct Qdisc *sch, unsigned long arg) { } @@ -683,14 +683,13 @@ static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, static const struct Qdisc_class_ops sfb_class_ops = { .graft = sfb_graft, .leaf = sfb_leaf, - .get = sfb_get, - .put = sfb_put, + .find = sfb_find, .change = sfb_change_class, .delete = sfb_delete, .walk = sfb_walk, .tcf_block = sfb_tcf_block, .bind_tcf = sfb_bind, - .unbind_tcf = sfb_put, + .unbind_tcf = sfb_unbind, .dump = sfb_dump_class, }; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 82469ef9655e..b46e0121dd1a 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -808,7 +808,7 @@ static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) return NULL; } -static unsigned long sfq_get(struct Qdisc *sch, u32 classid) +static unsigned long sfq_find(struct Qdisc *sch, u32 classid) { return 0; } @@ -821,7 +821,7 @@ static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, return 0; } -static void sfq_put(struct Qdisc *q, unsigned long cl) +static void sfq_unbind(struct Qdisc *q, unsigned long cl) { } @@ -885,11 +885,10 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) static const struct Qdisc_class_ops sfq_class_ops = { .leaf = 
sfq_leaf, - .get = sfq_get, - .put = sfq_put, + .find = sfq_find, .tcf_block = sfq_tcf_block, .bind_tcf = sfq_bind, - .unbind_tcf = sfq_put, + .unbind_tcf = sfq_unbind, .dump = sfq_dump_class, .dump_stats = sfq_dump_class_stats, .walk = sfq_walk, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b2e4b6ad241a..d5dba972ab06 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -510,15 +510,11 @@ static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) return q->qdisc; } -static unsigned long tbf_get(struct Qdisc *sch, u32 classid) +static unsigned long tbf_find(struct Qdisc *sch, u32 classid) { return 1; } -static void tbf_put(struct Qdisc *sch, unsigned long arg) -{ -} - static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { @@ -534,8 +530,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, - .get = tbf_get, - .put = tbf_put, + .find = tbf_find, .walk = tbf_walk, .dump = tbf_dump_class, }; -- cgit v1.2.3 From 3cd904ecbb5d0bcf36dfca7e726bdfd6d3644334 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 24 Aug 2017 16:51:30 -0700 Subject: net_sched: kill u32_node pointer in Qdisc It is ugly to hide a u32-filter-specific pointer inside Qdisc; it breaks the TC layering: 1. Qdisc is a generic representation and should not carry data specific to any filter type. 2. The Qdisc layer sits above the filter layer and should only save filters in the list of struct tcf_proto. This pointer is used as the head of the chain of u32 hash tables, that is struct tc_u_hnode, because the u32 filter is special: it allows multiple hash tables to be created within one qdisc and shared across multiple u32 filters. Instead of using this ugly pointer, we can save the head in a global hash table keyed by (dev ifindex, qdisc handle), and still treat it conceptually as a per-qdisc data structure. Of course, because of network namespaces, this key is not unique at all, but that is fine: we already have a pointer to the Qdisc in struct tc_u_common, so we can simply compare the pointers on a collision. This only affects slow paths and has no impact on the fast path, thanks to the ->tp_c pointer. Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 - net/sched/cls_u32.c | 57 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d817585aa5df..c30b634c5f82 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -75,7 +75,6 @@ struct Qdisc { struct hlist_node hash; u32 handle; u32 parent; - void *u32_node; struct netdev_queue *dev_queue; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index af22742d2847..99ea4c74dd5b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -40,6 +40,8 @@ #include #include #include +#include +#include #include #include #include @@ -92,6 +94,7 @@ struct tc_u_common { struct Qdisc *q; int refcnt; u32 hgenerator; + struct hlist_node hnode; struct rcu_head rcu; }; @@ -323,12 +326,40 @@ static u32 gen_new_htid(struct tc_u_common *tp_c) return i > 0 ?
(tp_c->hgenerator|0x800)<<20 : 0; } +static struct hlist_head *tc_u_common_hash; + +#define U32_HASH_SHIFT 10 +#define U32_HASH_SIZE (1 << U32_HASH_SHIFT) + +static unsigned int tc_u_hash(const struct tcf_proto *tp) +{ + struct net_device *dev = tp->q->dev_queue->dev; + u32 qhandle = tp->q->handle; + int ifindex = dev->ifindex; + + return hash_64((u64)ifindex << 32 | qhandle, U32_HASH_SHIFT); +} + +static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) +{ + struct tc_u_common *tc; + unsigned int h; + + h = tc_u_hash(tp); + hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { + if (tc->q == tp->q) + return tc; + } + return NULL; +} + static int u32_init(struct tcf_proto *tp) { struct tc_u_hnode *root_ht; struct tc_u_common *tp_c; + unsigned int h; - tp_c = tp->q->u32_node; + tp_c = tc_u_common_find(tp); root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); if (root_ht == NULL) @@ -345,7 +376,10 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; } tp_c->q = tp->q; - tp->q->u32_node = tp_c; + INIT_HLIST_NODE(&tp_c->hnode); + + h = tc_u_hash(tp); + hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); } tp_c->refcnt++; @@ -585,7 +619,7 @@ static void u32_destroy(struct tcf_proto *tp) if (--tp_c->refcnt == 0) { struct tc_u_hnode *ht; - tp->q->u32_node = NULL; + hlist_del(&tp_c->hnode); for (ht = rtnl_dereference(tp_c->hlist); ht; @@ -1213,6 +1247,8 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { static int __init init_u32(void) { + int i, ret; + pr_info("u32 classifier\n"); #ifdef CONFIG_CLS_U32_PERF pr_info(" Performance counters on\n"); @@ -1223,12 +1259,25 @@ static int __init init_u32(void) #ifdef CONFIG_NET_CLS_ACT pr_info(" Actions configured\n"); #endif - return register_tcf_proto_ops(&cls_u32_ops); + tc_u_common_hash = kvmalloc_array(U32_HASH_SIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!tc_u_common_hash) + return -ENOMEM; + + for (i = 0; i < U32_HASH_SIZE; i++) + INIT_HLIST_HEAD(&tc_u_common_hash[i]); + + ret = register_tcf_proto_ops(&cls_u32_ops); + if (ret) + kvfree(tc_u_common_hash); + return ret; } static void __exit exit_u32(void) { unregister_tcf_proto_ops(&cls_u32_ops); + kvfree(tc_u_common_hash); } module_init(init_u32) -- cgit v1.2.3 From 64f0f5d18a47c703c85576375cc010e83dac6a48 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 25 Aug 2017 14:31:01 +0200 Subject: udp6: set rx_dst_cookie on rx_dst updates Currently, in the udp6 code, the dst cookie is not initialized/updated together with the RX dst used by early demux. As a result, the dst_check() in the early_demux path always fails, the rx dst cache is always invalidated, and we gain little from the demux lookup. Fix it by adding a udp6-specific variant of sk_rx_dst_set() and using it to set the dst cookie whenever the dst entry actually changes. The issue has been there since the introduction of early demux for ipv6. Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Acked-by: Hannes Frederic Sowa Signed-off-by: Paolo Abeni Signed-off-by: David S.
Miller --- include/net/udp.h | 2 +- net/ipv4/udp.c | 4 +++- net/ipv6/udp.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index 586de4b811b5..626c2d8a70c5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -260,7 +260,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, } void udp_v4_early_demux(struct sk_buff *skb); -void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd1d044a7fa5..a6dc48d76a29 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1929,14 +1929,16 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = xchg(&sk->sk_rx_dst, dst); dst_release(old); + return old != dst; } + return false; } EXPORT_SYMBOL(udp_sk_rx_dst_set); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 20039c8501eb..d6886228e1d0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -768,6 +768,15 @@ start_lookup: return 0; } +static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ + if (udp_sk_rx_dst_set(sk, dst)) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + } +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -817,7 +826,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (unlikely(sk->sk_rx_dst != dst)) - udp_sk_rx_dst_set(sk, dst); + udp6_sk_rx_dst_set(sk, dst); ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); -- cgit v1.2.3 From ccc829ba3624beb9a703fc995d016b836d9eead8 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 25 Aug 2017 16:50:15 +0100 Subject: efi/libstub: Enable reset attack mitigation If a machine is reset while secrets are present in RAM, it may be possible for code executed after the reboot to extract those secrets from untouched memory. The Trusted Computing Group specified a mechanism for requesting that the firmware clear all RAM on reset before booting another OS. This is done by setting the MemoryOverwriteRequestControl variable at startup. If userspace can ensure that all secrets are removed as part of a controlled shutdown, it can reset this variable to 0 before triggering a hardware reboot. 
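As a rough illustration of the userspace side described above (not part of this patch): the variable can be cleared through efivarfs, where each variable file is a 4-byte attribute word followed by the variable data, and the file name is the variable name plus the vendor GUID defined below. The path assumes efivarfs is mounted in the usual place, and on recent kernels the file's immutable flag may need to be cleared first (chattr -i). A minimal sketch:

/* Hypothetical userspace sketch: clear MemoryOverwriteRequestControl
 * before a controlled reboot, once all secrets are wiped from RAM.
 * Path and attribute handling assume the usual efivarfs layout.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

int clear_morc(void)
{
	/* NV | BS | RT, matching the attributes set by the stub. */
	uint32_t attrs = 0x00000007;
	uint8_t buf[5];
	int fd, ret;

	memcpy(buf, &attrs, sizeof(attrs));
	buf[4] = 0;	/* 0 == do not overwrite memory on next reset */

	fd = open("/sys/firmware/efi/efivars/MemoryOverwriteRequestControl-"
		  "e20939be-32d4-41be-a150-897f85d49829", O_WRONLY);
	if (fd < 0)
		return -1;

	ret = write(fd, buf, sizeof(buf)) == sizeof(buf) ? 0 : -1;
	close(fd);
	return ret;
}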
Signed-off-by: Matthew Garrett Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20170825155019.6740-2-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 3 ++ drivers/firmware/efi/Kconfig | 10 ++++++ drivers/firmware/efi/libstub/Makefile | 1 + drivers/firmware/efi/libstub/arm-stub.c | 3 ++ drivers/firmware/efi/libstub/tpm.c | 58 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 7 ++++ 6 files changed, 82 insertions(+) create mode 100644 drivers/firmware/efi/libstub/tpm.c (limited to 'include') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index c3e869eaef0c..a1686f3dc295 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -997,6 +997,9 @@ struct boot_params *efi_main(struct efi_config *c, if (boot_params->secure_boot == efi_secureboot_mode_unset) boot_params->secure_boot = efi_get_secureboot(sys_table); + /* Ask the firmware to clear memory on unclean shutdown */ + efi_enable_reset_attack_mitigation(sys_table); + setup_graphics(boot_params); setup_efi_pci(boot_params); diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 394db40ed374..2b4c39fdfa91 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -151,6 +151,16 @@ config APPLE_PROPERTIES If unsure, say Y if you have a Mac. Otherwise N. +config RESET_ATTACK_MITIGATION + bool "Reset memory attack mitigation" + depends on EFI_STUB + help + Request that the firmware clear the contents of RAM after a reboot + using the TCG Platform Reset Attack Mitigation specification. This + protects against an attacker forcibly rebooting the system while it + still contains secrets in RAM, booting another OS and extracting the + secrets. + endmenu config UEFI_CPER diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index cf81e6cf5ae8..dedf9bde44db 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -30,6 +30,7 @@ OBJECT_FILES_NON_STANDARD := y KCOV_INSTRUMENT := n lib-y := efi-stub-helper.o gop.o secureboot.o +lib-$(CONFIG_RESET_ATTACK_MITIGATION) += tpm.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 arm-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c fdt_empty_tree.c fdt_sw.c sort.c diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 8181ac179d14..1cb2d1c070c3 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -192,6 +192,9 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_cmdline; } + /* Ask the firmware to clear memory on unclean shutdown */ + efi_enable_reset_attack_mitigation(sys_table); + secure_boot = efi_get_secureboot(sys_table); /* diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c new file mode 100644 index 000000000000..6224cdbc9669 --- /dev/null +++ b/drivers/firmware/efi/libstub/tpm.c @@ -0,0 +1,58 @@ +/* + * TPM handling. + * + * Copyright (C) 2016 CoreOS, Inc + * Copyright (C) 2017 Google, Inc. + * Matthew Garrett + * + * This file is part of the Linux kernel, and is made available under the + * terms of the GNU General Public License version 2. 
+ */ +#include +#include + +#include "efistub.h" + +static const efi_char16_t efi_MemoryOverWriteRequest_name[] = { + 'M', 'e', 'm', 'o', 'r', 'y', 'O', 'v', 'e', 'r', 'w', 'r', 'i', 't', + 'e', 'R', 'e', 'q', 'u', 'e', 's', 't', 'C', 'o', 'n', 't', 'r', 'o', + 'l', 0 +}; + +#define MEMORY_ONLY_RESET_CONTROL_GUID \ + EFI_GUID(0xe20939be, 0x32d4, 0x41be, 0xa1, 0x50, 0x89, 0x7f, 0x85, 0xd4, 0x98, 0x29) + +#define get_efi_var(name, vendor, ...) \ + efi_call_runtime(get_variable, \ + (efi_char16_t *)(name), (efi_guid_t *)(vendor), \ + __VA_ARGS__) + +#define set_efi_var(name, vendor, ...) \ + efi_call_runtime(set_variable, \ + (efi_char16_t *)(name), (efi_guid_t *)(vendor), \ + __VA_ARGS__) + +/* + * Enable reboot attack mitigation. This requests that the firmware clear the + * RAM on next reboot before proceeding with boot, ensuring that any secrets + * are cleared. If userland has ensured that all secrets have been removed + * from RAM before reboot it can simply reset this variable. + */ +void efi_enable_reset_attack_mitigation(efi_system_table_t *sys_table_arg) +{ + u8 val = 1; + efi_guid_t var_guid = MEMORY_ONLY_RESET_CONTROL_GUID; + efi_status_t status; + unsigned long datasize = 0; + + status = get_efi_var(efi_MemoryOverWriteRequest_name, &var_guid, + NULL, &datasize, NULL); + + if (status == EFI_NOT_FOUND) + return; + + set_efi_var(efi_MemoryOverWriteRequest_name, &var_guid, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, sizeof(val), &val); +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 4e47f78430be..c241acca0b15 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1504,6 +1504,13 @@ enum efi_secureboot_mode { }; enum efi_secureboot_mode efi_get_secureboot(efi_system_table_t *sys_table); +#ifdef CONFIG_RESET_ATTACK_MITIGATION +void efi_enable_reset_attack_mitigation(efi_system_table_t *sys_table_arg); +#else +static inline void +efi_enable_reset_attack_mitigation(efi_system_table_t *sys_table_arg) { } +#endif + /* * Arch code can implement the following three template macros, avoiding * reptition for the void/non-void return cases of {__,}efi_call_virt(): -- cgit v1.2.3 From c2ceb5fd4e921506e86208b82fca716a2c3aad59 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 25 Aug 2017 16:50:16 +0100 Subject: efi/random: Increase size of firmware supplied randomness The crng code requires at least 64 bytes (2 * CHACHA20_KEY_SIZE) to complete the fast boot-time init, so provide that many bytes when invoking UEFI protocols to seed the entropy pool. Also, add a notice so we can tell from the boot log when the seeding actually took place.
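For orientation, a standalone sketch (not part of the patch) of the sizing arithmetic used above: the stub allocates the 4-byte header of struct linux_efi_random_seed plus EFI_RANDOM_SEED_SIZE payload bytes in one pool allocation, and records the payload length in seed->size. The struct and macro are mirrored locally here rather than taken from kernel headers:

/* Standalone sketch mirroring the layout used above; the struct and
 * macro are copied for illustration, not included from kernel headers.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define EFI_RANDOM_SEED_SIZE 64U	/* 2 * CHACHA20_KEY_SIZE */

struct linux_efi_random_seed {
	uint32_t size;	/* payload length, set to EFI_RANDOM_SEED_SIZE */
	uint8_t bits[];	/* seed bytes later fed to the entropy pool */
};

int main(void)
{
	/* Header plus payload come from a single pool allocation. */
	size_t alloc = sizeof(struct linux_efi_random_seed) +
		       EFI_RANDOM_SEED_SIZE;

	printf("pool allocation: %zu bytes (4 header + 64 seed)\n", alloc);
	return 0;
}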
Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20170825155019.6740-3-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/efi.c | 3 ++- drivers/firmware/efi/libstub/random.c | 10 ++++------ include/linux/efi.h | 2 ++ 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index a32e1460ade8..c8a27a2c30c1 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -541,6 +541,7 @@ int __init efi_config_parse_tables(void *config_tables, int count, int sz, if (seed != NULL) { add_device_randomness(seed->bits, seed->size); early_memunmap(seed, sizeof(*seed) + size); + pr_notice("seeding entropy pool\n"); } else { pr_err("Could not map UEFI random seed!\n"); } @@ -900,7 +901,7 @@ static int update_efi_random_seed(struct notifier_block *nb, seed = memremap(efi.rng_seed, sizeof(*seed), MEMREMAP_WB); if (seed != NULL) { - size = min(seed->size, 32U); + size = min(seed->size, EFI_RANDOM_SEED_SIZE); memunmap(seed); } else { pr_err("Could not map UEFI random seed!\n"); diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index 7e72954d5860..e0e603a89aa9 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -145,8 +145,6 @@ efi_status_t efi_random_alloc(efi_system_table_t *sys_table_arg, return status; } -#define RANDOM_SEED_SIZE 32 - efi_status_t efi_random_get_seed(efi_system_table_t *sys_table_arg) { efi_guid_t rng_proto = EFI_RNG_PROTOCOL_GUID; @@ -162,25 +160,25 @@ efi_status_t efi_random_get_seed(efi_system_table_t *sys_table_arg) return status; status = efi_call_early(allocate_pool, EFI_RUNTIME_SERVICES_DATA, - sizeof(*seed) + RANDOM_SEED_SIZE, + sizeof(*seed) + EFI_RANDOM_SEED_SIZE, (void **)&seed); if (status != EFI_SUCCESS) return status; - status = rng->get_rng(rng, &rng_algo_raw, RANDOM_SEED_SIZE, + status = rng->get_rng(rng, &rng_algo_raw, EFI_RANDOM_SEED_SIZE, seed->bits); if (status == EFI_UNSUPPORTED) /* * Use whatever algorithm we have available if the raw algorithm * is not implemented. */ - status = rng->get_rng(rng, NULL, RANDOM_SEED_SIZE, + status = rng->get_rng(rng, NULL, EFI_RANDOM_SEED_SIZE, seed->bits); if (status != EFI_SUCCESS) goto err_freepool; - seed->size = RANDOM_SEED_SIZE; + seed->size = EFI_RANDOM_SEED_SIZE; status = efi_call_early(install_configuration_table, &rng_table_guid, seed); if (status != EFI_SUCCESS) diff --git a/include/linux/efi.h b/include/linux/efi.h index c241acca0b15..33d41df062bc 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1571,6 +1571,8 @@ efi_status_t efi_exit_boot_services(efi_system_table_t *sys_table, void *priv, efi_exit_boot_map_processing priv_func); +#define EFI_RANDOM_SEED_SIZE 64U + struct linux_efi_random_seed { u32 size; u8 bits[]; -- cgit v1.2.3 From 28e11b15b6606c3308f87f7c9c4c9e404eddde6d Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 20 Aug 2017 06:53:10 -0400 Subject: media: cec: replace pin->cur_value by adap->cec_pin_is_high The current CEC pin value (0 or 1) was part of the cec_pin struct, but that assumes that CEC pin monitoring can only be used with a driver that uses the low-level CEC pin framework. But hardware that has both a high-level API and can monitor the CEC pin at low-level at the same time does not need to depend on the cec pin framework. 
To support such devices remove the cur_value field from struct cec_pin and add a cec_pin_is_high field to cec_adapter. This also makes it possible to drop the '#ifdef CONFIG_CEC_PIN' in cec-api.c. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/cec-api.c | 6 ++---- drivers/media/cec/cec-core.c | 1 + drivers/media/cec/cec-pin.c | 5 ++--- include/media/cec-pin.h | 1 - include/media/cec.h | 1 + 5 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c index 87649b0c6381..a079f7fe018c 100644 --- a/drivers/media/cec/cec-api.c +++ b/drivers/media/cec/cec-api.c @@ -444,15 +444,13 @@ static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, if (mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt++; if (mode_follower == CEC_MODE_MONITOR_PIN) { -#ifdef CONFIG_CEC_PIN struct cec_event ev = { .flags = CEC_EVENT_FL_INITIAL_STATE, }; - ev.event = adap->pin->cur_value ? CEC_EVENT_PIN_CEC_HIGH : - CEC_EVENT_PIN_CEC_LOW; + ev.event = adap->cec_pin_is_high ? CEC_EVENT_PIN_CEC_HIGH : + CEC_EVENT_PIN_CEC_LOW; cec_queue_event_fh(fh, &ev, 0); -#endif adap->monitor_pin_cnt++; } if (mode_follower == CEC_MODE_EXCL_FOLLOWER || diff --git a/drivers/media/cec/cec-core.c b/drivers/media/cec/cec-core.c index e9db90997b0a..648136e552d5 100644 --- a/drivers/media/cec/cec-core.c +++ b/drivers/media/cec/cec-core.c @@ -227,6 +227,7 @@ struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops, return ERR_PTR(-ENOMEM); strlcpy(adap->name, name, sizeof(adap->name)); adap->phys_addr = CEC_PHYS_ADDR_INVALID; + adap->cec_pin_is_high = true; adap->log_addrs.cec_version = CEC_OP_CEC_VERSION_2_0; adap->log_addrs.vendor_id = CEC_VENDOR_ID_NONE; adap->capabilities = caps; diff --git a/drivers/media/cec/cec-pin.c b/drivers/media/cec/cec-pin.c index 970774de3dcd..c003b8eac617 100644 --- a/drivers/media/cec/cec-pin.c +++ b/drivers/media/cec/cec-pin.c @@ -88,10 +88,10 @@ static const struct cec_state states[CEC_PIN_STATES] = { static void cec_pin_update(struct cec_pin *pin, bool v, bool force) { - if (!force && v == pin->cur_value) + if (!force && v == pin->adap->cec_pin_is_high) return; - pin->cur_value = v; + pin->adap->cec_pin_is_high = v; if (atomic_read(&pin->work_pin_events) < CEC_NUM_PIN_EVENTS) { pin->work_pin_is_high[pin->work_pin_events_wr] = v; pin->work_pin_ts[pin->work_pin_events_wr] = ktime_get(); @@ -781,7 +781,6 @@ struct cec_adapter *cec_pin_allocate_adapter(const struct cec_pin_ops *pin_ops, if (pin == NULL) return ERR_PTR(-ENOMEM); pin->ops = pin_ops; - pin->cur_value = true; hrtimer_init(&pin->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); pin->timer.function = cec_pin_timer; init_waitqueue_head(&pin->kthread_waitq); diff --git a/include/media/cec-pin.h b/include/media/cec-pin.h index d28d07fa312e..f09cc9579d53 100644 --- a/include/media/cec-pin.h +++ b/include/media/cec-pin.h @@ -128,7 +128,6 @@ struct cec_pin { u16 la_mask; bool enabled; bool monitor_all; - bool cur_value; bool rx_eom; bool enable_irq_failed; enum cec_pin_state state; diff --git a/include/media/cec.h b/include/media/cec.h index 60b26fc18464..df6b3bd31284 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -180,6 +180,7 @@ struct cec_adapter { bool needs_hpd; bool is_configuring; bool is_configured; + bool cec_pin_is_high; u32 monitor_all_cnt; u32 monitor_pin_cnt; u32 follower_cnt; -- cgit v1.2.3 From 1526c704b3c32640b5e6cdc1662b0698603e9d4f Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 
14 Aug 2017 06:22:14 -0400 Subject: media: v4l: fwnode: The clock lane is the first lane in lane_polarities The clock lane is the first lane in the lane_polarities array. Reflect this consistently by putting the number of data lanes after the number of clock lanes. Fixes: 4ee236219f6d ("media: v4l2-fwnode: suppress a warning at OF parsing logic") Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-fwnode.c | 2 +- include/media/v4l2-fwnode.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index 3f8eed008700..df7169b5ed8c 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -48,7 +48,7 @@ static int v4l2_fwnode_endpoint_parse_csi2_bus(struct fwnode_handle *fwnode, rval = fwnode_property_read_u32_array(fwnode, "data-lanes", NULL, 0); if (rval > 0) { - u32 array[MAX_DATA_LANES + 1]; + u32 array[1 + MAX_DATA_LANES]; bus->num_data_lanes = min_t(int, MAX_DATA_LANES, rval); diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index cb34dcb0bb65..08e743fb7944 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -42,7 +42,7 @@ struct v4l2_fwnode_bus_mipi_csi2 { unsigned char data_lanes[MAX_DATA_LANES]; unsigned char clock_lane; unsigned short num_data_lanes; - bool lane_polarities[MAX_DATA_LANES + 1]; + bool lane_polarities[1 + MAX_DATA_LANES]; }; /** -- cgit v1.2.3 From ad3cdf3e1f37299fc1422f7828243635249a3dde Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 14 Aug 2017 06:43:07 -0400 Subject: media: v4l: fwnode: Use a less clash-prone name for MAX_DATA_LANES macro Avoid using a generic name such as MAX_DATA_LANES in a header file widely included in drivers. Instead, call it V4L2_FWNODE_CSI2_MAX_DATA_LANES. 
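As an aside, a short sketch (not part of either patch) of how a consumer would index lane_polarities[] under the layout these two commits settle on: element 0 is the clock lane, and data lane i follows at index 1 + i. The helper name is hypothetical:

/* Hypothetical consumer of the parsed CSI-2 endpoint; illustrates
 * the clock-lane-first layout of lane_polarities[].
 */
#include <linux/printk.h>
#include <media/v4l2-fwnode.h>

static void show_lane_polarities(const struct v4l2_fwnode_bus_mipi_csi2 *bus)
{
	unsigned int i;

	/* Element 0 is the clock lane. */
	pr_info("clock lane polarity: %d\n", bus->lane_polarities[0]);

	/* Data lane i follows at index 1 + i. */
	for (i = 0; i < bus->num_data_lanes; i++)
		pr_info("data lane %u polarity: %d\n", i,
			bus->lane_polarities[1 + i]);
}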
Fixes: 4ee236219f6d ("media: v4l2-fwnode: suppress a warning at OF parsing logic") Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-fwnode.c | 5 +++-- include/media/v4l2-fwnode.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/v4l2-fwnode.c b/drivers/media/v4l2-core/v4l2-fwnode.c index df7169b5ed8c..40b2fbfe8865 100644 --- a/drivers/media/v4l2-core/v4l2-fwnode.c +++ b/drivers/media/v4l2-core/v4l2-fwnode.c @@ -48,9 +48,10 @@ static int v4l2_fwnode_endpoint_parse_csi2_bus(struct fwnode_handle *fwnode, rval = fwnode_property_read_u32_array(fwnode, "data-lanes", NULL, 0); if (rval > 0) { - u32 array[1 + MAX_DATA_LANES]; + u32 array[1 + V4L2_FWNODE_CSI2_MAX_DATA_LANES]; - bus->num_data_lanes = min_t(int, MAX_DATA_LANES, rval); + bus->num_data_lanes = + min_t(int, V4L2_FWNODE_CSI2_MAX_DATA_LANES, rval); fwnode_property_read_u32_array(fwnode, "data-lanes", array, bus->num_data_lanes); diff --git a/include/media/v4l2-fwnode.h b/include/media/v4l2-fwnode.h index 08e743fb7944..7adec9851d9e 100644 --- a/include/media/v4l2-fwnode.h +++ b/include/media/v4l2-fwnode.h @@ -26,7 +26,7 @@ struct fwnode_handle; -#define MAX_DATA_LANES 4 +#define V4L2_FWNODE_CSI2_MAX_DATA_LANES 4 /** * struct v4l2_fwnode_bus_mipi_csi2 - MIPI CSI-2 bus data structure @@ -39,10 +39,10 @@ struct fwnode_handle; */ struct v4l2_fwnode_bus_mipi_csi2 { unsigned int flags; - unsigned char data_lanes[MAX_DATA_LANES]; + unsigned char data_lanes[V4L2_FWNODE_CSI2_MAX_DATA_LANES]; unsigned char clock_lane; unsigned short num_data_lanes; - bool lane_polarities[1 + MAX_DATA_LANES]; + bool lane_polarities[1 + V4L2_FWNODE_CSI2_MAX_DATA_LANES]; }; /** -- cgit v1.2.3 From 5b6f9abe5a49df81737fbbfba890ee5b093f8168 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Mon, 21 Aug 2017 07:34:10 -0400 Subject: media: vb2: add bidirectional flag in vb2_queue This change gives the v4l2 drivers a choice to change the default behavior of the v4l2-core DMA mapping direction from DMA_TO/FROM_DEVICE (depending on the buffer type, CAPTURE or OUTPUT) to DMA_BIDIRECTIONAL at queue_init time. The issue with the DMA mapping direction was initially found in the Venus encoder driver, where the hardware (firmware side) adds a few lines of padding at the bottom of the image buffer, with the consequence of triggering IOMMU protection faults. This will help support the venus encoder (and probably other drivers in the future) that want to map output-type buffers as read/write. Signed-off-by: Stanimir Varbanov Acked-by: Marek Szyprowski Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/videobuf2-core.c | 17 ++++++++--------- drivers/media/v4l2-core/videobuf2-dma-contig.c | 3 ++- drivers/media/v4l2-core/videobuf2-dma-sg.c | 6 ++++-- drivers/media/v4l2-core/videobuf2-vmalloc.c | 6 ++++-- include/media/videobuf2-core.h | 13 +++++++++++++ 5 files changed, 31 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/media/v4l2-core/videobuf2-core.c b/drivers/media/v4l2-core/videobuf2-core.c index 0924594989b4..cb115ba6a1d2 100644 --- a/drivers/media/v4l2-core/videobuf2-core.c +++ b/drivers/media/v4l2-core/videobuf2-core.c @@ -194,8 +194,6 @@ static void __enqueue_in_driver(struct vb2_buffer *vb); static int __vb2_buf_mem_alloc(struct vb2_buffer *vb) { struct vb2_queue *q = vb->vb2_queue; - enum dma_data_direction dma_dir = q->is_output ?
DMA_TO_DEVICE : DMA_FROM_DEVICE; void *mem_priv; int plane; int ret = -ENOMEM; @@ -209,7 +207,7 @@ static int __vb2_buf_mem_alloc(struct vb2_buffer *vb) mem_priv = call_ptr_memop(vb, alloc, q->alloc_devs[plane] ? : q->dev, - q->dma_attrs, size, dma_dir, q->gfp_flags); + q->dma_attrs, size, q->dma_dir, q->gfp_flags); if (IS_ERR_OR_NULL(mem_priv)) { if (mem_priv) ret = PTR_ERR(mem_priv); @@ -978,8 +976,6 @@ static int __prepare_userptr(struct vb2_buffer *vb, const void *pb) void *mem_priv; unsigned int plane; int ret = 0; - enum dma_data_direction dma_dir = - q->is_output ? DMA_TO_DEVICE : DMA_FROM_DEVICE; bool reacquired = vb->planes[0].mem_priv == NULL; memset(planes, 0, sizeof(planes[0]) * vb->num_planes); @@ -1030,7 +1026,7 @@ static int __prepare_userptr(struct vb2_buffer *vb, const void *pb) mem_priv = call_ptr_memop(vb, get_userptr, q->alloc_devs[plane] ? : q->dev, planes[plane].m.userptr, - planes[plane].length, dma_dir); + planes[plane].length, q->dma_dir); if (IS_ERR(mem_priv)) { dprintk(1, "failed acquiring userspace memory for plane %d\n", plane); @@ -1096,8 +1092,6 @@ static int __prepare_dmabuf(struct vb2_buffer *vb, const void *pb) void *mem_priv; unsigned int plane; int ret = 0; - enum dma_data_direction dma_dir = - q->is_output ? DMA_TO_DEVICE : DMA_FROM_DEVICE; bool reacquired = vb->planes[0].mem_priv == NULL; memset(planes, 0, sizeof(planes[0]) * vb->num_planes); @@ -1156,7 +1150,7 @@ static int __prepare_dmabuf(struct vb2_buffer *vb, const void *pb) /* Acquire each plane's memory */ mem_priv = call_ptr_memop(vb, attach_dmabuf, q->alloc_devs[plane] ? : q->dev, - dbuf, planes[plane].length, dma_dir); + dbuf, planes[plane].length, q->dma_dir); if (IS_ERR(mem_priv)) { dprintk(1, "failed to attach dmabuf\n"); ret = PTR_ERR(mem_priv); @@ -2003,6 +1997,11 @@ int vb2_core_queue_init(struct vb2_queue *q) if (q->buf_struct_size == 0) q->buf_struct_size = sizeof(struct vb2_buffer); + if (q->bidirectional) + q->dma_dir = DMA_BIDIRECTIONAL; + else + q->dma_dir = q->is_output ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; + return 0; } EXPORT_SYMBOL_GPL(vb2_core_queue_init); diff --git a/drivers/media/v4l2-core/videobuf2-dma-contig.c b/drivers/media/v4l2-core/videobuf2-dma-contig.c index 5b90a66b9e78..9f389f36566d 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf2-dma-contig.c @@ -508,7 +508,8 @@ static void *vb2_dc_get_userptr(struct device *dev, unsigned long vaddr, buf->dma_dir = dma_dir; offset = vaddr & ~PAGE_MASK; - vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE); + vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE || + dma_dir == DMA_BIDIRECTIONAL); if (IS_ERR(vec)) { ret = PTR_ERR(vec); goto fail_buf; diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index 54f33938d45b..6808231a6bdc 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c @@ -239,7 +239,8 @@ static void *vb2_dma_sg_get_userptr(struct device *dev, unsigned long vaddr, buf->offset = vaddr & ~PAGE_MASK; buf->size = size; buf->dma_sgt = &buf->sg_table; - vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE); + vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE || + dma_dir == DMA_BIDIRECTIONAL); if (IS_ERR(vec)) goto userptr_fail_pfnvec; buf->vec = vec; @@ -292,7 +293,8 @@ static void vb2_dma_sg_put_userptr(void *buf_priv) vm_unmap_ram(buf->vaddr, buf->num_pages); sg_free_table(buf->dma_sgt); while (--i >= 0) { - if (buf->dma_dir == DMA_FROM_DEVICE) + if (buf->dma_dir == DMA_FROM_DEVICE || + buf->dma_dir == DMA_BIDIRECTIONAL) set_page_dirty_lock(buf->pages[i]); } vb2_destroy_framevec(buf->vec); diff --git a/drivers/media/v4l2-core/videobuf2-vmalloc.c b/drivers/media/v4l2-core/videobuf2-vmalloc.c index 6bc130fe84f6..3a7c80cd1a17 100644 --- a/drivers/media/v4l2-core/videobuf2-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf2-vmalloc.c @@ -87,7 +87,8 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, unsigned long vaddr, buf->dma_dir = dma_dir; offset = vaddr & ~PAGE_MASK; buf->size = size; - vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE); + vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE || + dma_dir == DMA_BIDIRECTIONAL); if (IS_ERR(vec)) { ret = PTR_ERR(vec); goto fail_pfnvec_create; @@ -137,7 +138,8 @@ static void vb2_vmalloc_put_userptr(void *buf_priv) pages = frame_vector_pages(buf->vec); if (vaddr) vm_unmap_ram((void *)vaddr, n_pages); - if (buf->dma_dir == DMA_FROM_DEVICE) + if (buf->dma_dir == DMA_FROM_DEVICE || + buf->dma_dir == DMA_BIDIRECTIONAL) for (i = 0; i < n_pages; i++) set_page_dirty_lock(pages[i]); } else { diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h index cb97c224be73..ef9b64398c8c 100644 --- a/include/media/videobuf2-core.h +++ b/include/media/videobuf2-core.h @@ -427,6 +427,16 @@ struct vb2_buf_ops { * @dev: device to use for the default allocation context if the driver * doesn't fill in the @alloc_devs array. * @dma_attrs: DMA attributes to use for the DMA. + * @bidirectional: when this flag is set the DMA direction for the buffers of + * this queue will be overridden with DMA_BIDIRECTIONAL direction. 
+ * This is useful in cases where the hardware (firmware) writes to + * a buffer which is mapped as read (DMA_TO_DEVICE), or reads from + * buffer which is mapped for write (DMA_FROM_DEVICE) in order + * to satisfy some internal hardware restrictions or adds a padding + * needed by the processing algorithm. In case the DMA mapping is + * not bidirectional but the hardware (firmware) trying to access + * the buffer (in the opposite direction) this could lead to an + * IOMMU protection faults. * @fileio_read_once: report EOF after reading the first buffer * @fileio_write_immediately: queue buffer after each write() call * @allow_zero_bytesused: allow bytesused == 0 to be passed to the driver @@ -465,6 +475,7 @@ struct vb2_buf_ops { * Private elements (won't appear at the uAPI book): * @mmap_lock: private mutex used when buffers are allocated/freed/mmapped * @memory: current memory type used + * @dma_dir: DMA mapping direction. * @bufs: videobuf buffer structures * @num_buffers: number of allocated/used buffers * @queued_list: list of buffers currently queued from userspace @@ -495,6 +506,7 @@ struct vb2_queue { unsigned int io_modes; struct device *dev; unsigned long dma_attrs; + unsigned bidirectional:1; unsigned fileio_read_once:1; unsigned fileio_write_immediately:1; unsigned allow_zero_bytesused:1; @@ -516,6 +528,7 @@ struct vb2_queue { /* private: internal use only */ struct mutex mmap_lock; unsigned int memory; + enum dma_data_direction dma_dir; struct vb2_buffer *bufs[VB2_MAX_FRAME]; unsigned int num_buffers; -- cgit v1.2.3 From d1b3437ed780cd97b4b1300db96a4d8faae6fea1 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 8 Aug 2017 09:29:58 -0400 Subject: media: v4l: Add packed Bayer raw12 pixel formats These formats are compressed 12-bit raw bayer formats with four different pixel orders. They are similar to 10-bit variants. The formats added by this patch are V4L2_PIX_FMT_SBGGR12P V4L2_PIX_FMT_SGBRG12P V4L2_PIX_FMT_SGRBG12P V4L2_PIX_FMT_SRGGB12P Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/pixfmt-rgb.rst | 1 + Documentation/media/uapi/v4l/pixfmt-srggb12p.rst | 103 +++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ioctl.c | 12 ++- include/uapi/linux/videodev2.h | 5 ++ 4 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 Documentation/media/uapi/v4l/pixfmt-srggb12p.rst (limited to 'include') diff --git a/Documentation/media/uapi/v4l/pixfmt-rgb.rst b/Documentation/media/uapi/v4l/pixfmt-rgb.rst index b0f35136021e..4cc27195dc79 100644 --- a/Documentation/media/uapi/v4l/pixfmt-rgb.rst +++ b/Documentation/media/uapi/v4l/pixfmt-rgb.rst @@ -17,4 +17,5 @@ RGB Formats pixfmt-srggb10alaw8 pixfmt-srggb10dpcm8 pixfmt-srggb12 + pixfmt-srggb12p pixfmt-srggb16 diff --git a/Documentation/media/uapi/v4l/pixfmt-srggb12p.rst b/Documentation/media/uapi/v4l/pixfmt-srggb12p.rst new file mode 100644 index 000000000000..c0541e5acd01 --- /dev/null +++ b/Documentation/media/uapi/v4l/pixfmt-srggb12p.rst @@ -0,0 +1,103 @@ +.. -*- coding: utf-8; mode: rst -*- + +.. _V4L2-PIX-FMT-SRGGB12P: +.. _v4l2-pix-fmt-sbggr12p: +.. _v4l2-pix-fmt-sgbrg12p: +.. 
_v4l2-pix-fmt-sgrbg12p: + +******************************************************************************************************************************* +V4L2_PIX_FMT_SRGGB12P ('pRAA'), V4L2_PIX_FMT_SGRBG12P ('pgAA'), V4L2_PIX_FMT_SGBRG12P ('pGAA'), V4L2_PIX_FMT_SBGGR12P ('pBAA'), +******************************************************************************************************************************* + + +12-bit packed Bayer formats + + +Description +=========== + +These four pixel formats are packed raw sRGB / Bayer formats with 12 +bits per colour. Every two consecutive samples are packed into three +bytes. Each of the first two bytes contain the 8 high order bits of +the pixels, and the third byte contains the four least significants +bits of each pixel, in the same order. + +Each n-pixel row contains n/2 green samples and n/2 blue or red +samples, with alternating green-red and green-blue rows. They are +conventionally described as GRGR... BGBG..., RGRG... GBGB..., etc. +Below is an example of a small V4L2_PIX_FMT_SBGGR12P image: + +**Byte Order.** +Each cell is one byte. + + + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 2 1 1 1 1 1 1 + + + - .. row 1 + + - start + 0: + + - B\ :sub:`00high` + + - G\ :sub:`01high` + + - G\ :sub:`01low`\ (bits 7--4) B\ :sub:`00low`\ (bits 3--0) + + - B\ :sub:`02high` + + - G\ :sub:`03high` + + - G\ :sub:`03low`\ (bits 7--4) B\ :sub:`02low`\ (bits 3--0) + + - .. row 2 + + - start + 6: + + - G\ :sub:`10high` + + - R\ :sub:`11high` + + - R\ :sub:`11low`\ (bits 7--4) G\ :sub:`10low`\ (bits 3--0) + + - G\ :sub:`12high` + + - R\ :sub:`13high` + + - R\ :sub:`13low`\ (bits 3--2) G\ :sub:`12low`\ (bits 3--0) + + - .. row 3 + + - start + 12: + + - B\ :sub:`20high` + + - G\ :sub:`21high` + + - G\ :sub:`21low`\ (bits 7--4) B\ :sub:`20low`\ (bits 3--0) + + - B\ :sub:`22high` + + - G\ :sub:`23high` + + - G\ :sub:`23low`\ (bits 7--4) B\ :sub:`22low`\ (bits 3--0) + + - .. 
row 4 + + - start + 18: + + - G\ :sub:`30high` + + - R\ :sub:`31high` + + - R\ :sub:`31low`\ (bits 7--4) G\ :sub:`30low`\ (bits 3--0) + + - G\ :sub:`32high` + + - R\ :sub:`33high` + + - R\ :sub:`33low`\ (bits 3--2) G\ :sub:`32low`\ (bits 3--0) diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index cab63bb49c97..b60a6b0841d1 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1195,10 +1195,6 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_SGBRG10: descr = "10-bit Bayer GBGB/RGRG"; break; case V4L2_PIX_FMT_SGRBG10: descr = "10-bit Bayer GRGR/BGBG"; break; case V4L2_PIX_FMT_SRGGB10: descr = "10-bit Bayer RGRG/GBGB"; break; - case V4L2_PIX_FMT_SBGGR12: descr = "12-bit Bayer BGBG/GRGR"; break; - case V4L2_PIX_FMT_SGBRG12: descr = "12-bit Bayer GBGB/RGRG"; break; - case V4L2_PIX_FMT_SGRBG12: descr = "12-bit Bayer GRGR/BGBG"; break; - case V4L2_PIX_FMT_SRGGB12: descr = "12-bit Bayer RGRG/GBGB"; break; case V4L2_PIX_FMT_SBGGR10P: descr = "10-bit Bayer BGBG/GRGR Packed"; break; case V4L2_PIX_FMT_SGBRG10P: descr = "10-bit Bayer GBGB/RGRG Packed"; break; case V4L2_PIX_FMT_SGRBG10P: descr = "10-bit Bayer GRGR/BGBG Packed"; break; @@ -1211,6 +1207,14 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_SGBRG10DPCM8: descr = "8-bit Bayer GBGB/RGRG (DPCM)"; break; case V4L2_PIX_FMT_SGRBG10DPCM8: descr = "8-bit Bayer GRGR/BGBG (DPCM)"; break; case V4L2_PIX_FMT_SRGGB10DPCM8: descr = "8-bit Bayer RGRG/GBGB (DPCM)"; break; + case V4L2_PIX_FMT_SBGGR12: descr = "12-bit Bayer BGBG/GRGR"; break; + case V4L2_PIX_FMT_SGBRG12: descr = "12-bit Bayer GBGB/RGRG"; break; + case V4L2_PIX_FMT_SGRBG12: descr = "12-bit Bayer GRGR/BGBG"; break; + case V4L2_PIX_FMT_SRGGB12: descr = "12-bit Bayer RGRG/GBGB"; break; + case V4L2_PIX_FMT_SBGGR12P: descr = "12-bit Bayer BGBG/GRGR Packed"; break; + case V4L2_PIX_FMT_SGBRG12P: descr = "12-bit Bayer GBGB/RGRG Packed"; break; + case V4L2_PIX_FMT_SGRBG12P: descr = "12-bit Bayer GRGR/BGBG Packed"; break; + case V4L2_PIX_FMT_SRGGB12P: descr = "12-bit Bayer RGRG/GBGB Packed"; break; case V4L2_PIX_FMT_SBGGR16: descr = "16-bit Bayer BGBG/GRGR"; break; case V4L2_PIX_FMT_SGBRG16: descr = "16-bit Bayer GBGB/RGRG"; break; case V4L2_PIX_FMT_SGRBG16: descr = "16-bit Bayer GRGR/BGBG"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 45cf7359822c..185d6a0acc06 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -603,6 +603,11 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_SGBRG12 v4l2_fourcc('G', 'B', '1', '2') /* 12 GBGB.. RGRG.. */ #define V4L2_PIX_FMT_SGRBG12 v4l2_fourcc('B', 'A', '1', '2') /* 12 GRGR.. BGBG.. */ #define V4L2_PIX_FMT_SRGGB12 v4l2_fourcc('R', 'G', '1', '2') /* 12 RGRG.. GBGB.. */ + /* 12bit raw bayer packed, 6 bytes for every 4 pixels */ +#define V4L2_PIX_FMT_SBGGR12P v4l2_fourcc('p', 'B', 'C', 'C') +#define V4L2_PIX_FMT_SGBRG12P v4l2_fourcc('p', 'G', 'C', 'C') +#define V4L2_PIX_FMT_SGRBG12P v4l2_fourcc('p', 'g', 'C', 'C') +#define V4L2_PIX_FMT_SRGGB12P v4l2_fourcc('p', 'R', 'C', 'C') #define V4L2_PIX_FMT_SBGGR16 v4l2_fourcc('B', 'Y', 'R', '2') /* 16 BGBG.. GRGR.. */ #define V4L2_PIX_FMT_SGBRG16 v4l2_fourcc('G', 'B', '1', '6') /* 16 GBGB.. RGRG.. */ #define V4L2_PIX_FMT_SGRBG16 v4l2_fourcc('G', 'R', '1', '6') /* 16 GRGR.. BGBG.. 
*/ -- cgit v1.2.3 From 503dd28af108888c505e8d6a86f4acf5eb20f3b7 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 18 Jul 2017 09:26:59 -0400 Subject: media: v4l2-flash-led-class: Create separate sub-devices for indicators The V4L2 flash interface allows controlling multiple LEDs through a single sub-device if, and only if, these LEDs are of different types. This approach scales badly for flash controllers that drive multiple flash LEDs or for LED-specific associations. Essentially, the original assumption of a LED driver chip that drives a single flash LED and an indicator LED is no longer valid. Address the matter by registering one sub-device per LED. Signed-off-by: Sakari Ailus Reviewed-by: Jacek Anaszewski Acked-by: Pavel Machek Reviewed-by: Rui Miguel Silva (for greybus/light) Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/leds/leds-aat1290.c | 4 +- drivers/leds/leds-max77693.c | 4 +- drivers/media/v4l2-core/v4l2-flash-led-class.c | 113 +++++++++++++++---------- drivers/staging/greybus/light.c | 23 +++-- include/media/v4l2-flash-led-class.h | 40 ++++++--- 5 files changed, 117 insertions(+), 67 deletions(-) (limited to 'include') diff --git a/drivers/leds/leds-aat1290.c b/drivers/leds/leds-aat1290.c index a21e19297745..424898e6c69d 100644 --- a/drivers/leds/leds-aat1290.c +++ b/drivers/leds/leds-aat1290.c @@ -432,7 +432,7 @@ static void aat1290_init_v4l2_flash_config(struct aat1290_led *led, strlcpy(v4l2_sd_cfg->dev_name, led_cdev->name, sizeof(v4l2_sd_cfg->dev_name)); - s = &v4l2_sd_cfg->torch_intensity; + s = &v4l2_sd_cfg->intensity; s->min = led->mm_current_scale[0]; s->max = led_cfg->max_mm_current; s->step = 1; @@ -504,7 +504,7 @@ static int aat1290_led_probe(struct platform_device *pdev) /* Create V4L2 Flash subdev. */ led->v4l2_flash = v4l2_flash_init(dev, of_fwnode_handle(sub_node), - fled_cdev, NULL, &v4l2_flash_ops, + fled_cdev, &v4l2_flash_ops, &v4l2_sd_cfg); if (IS_ERR(led->v4l2_flash)) { ret = PTR_ERR(led->v4l2_flash); diff --git a/drivers/leds/leds-max77693.c b/drivers/leds/leds-max77693.c index 2d3062d53325..adf0f191f794 100644 --- a/drivers/leds/leds-max77693.c +++ b/drivers/leds/leds-max77693.c @@ -856,7 +856,7 @@ static void max77693_init_v4l2_flash_config(struct max77693_sub_led *sub_led, "%s %d-%04x", sub_led->fled_cdev.led_cdev.name, i2c_adapter_id(i2c->adapter), i2c->addr); - s = &v4l2_sd_cfg->torch_intensity; + s = &v4l2_sd_cfg->intensity; s->min = TORCH_IOUT_MIN; s->max = sub_led->fled_cdev.led_cdev.max_brightness * TORCH_IOUT_STEP; s->step = TORCH_IOUT_STEP; @@ -931,7 +931,7 @@ static int max77693_register_led(struct max77693_sub_led *sub_led, /* Register in the V4L2 subsystem. */ sub_led->v4l2_flash = v4l2_flash_init(dev, of_fwnode_handle(sub_node), - fled_cdev, NULL, &v4l2_flash_ops, + fled_cdev, &v4l2_flash_ops, &v4l2_sd_cfg); if (IS_ERR(sub_led->v4l2_flash)) { ret = PTR_ERR(sub_led->v4l2_flash); diff --git a/drivers/media/v4l2-core/v4l2-flash-led-class.c b/drivers/media/v4l2-core/v4l2-flash-led-class.c index aabc85dbb8b5..4ceef217de83 100644 --- a/drivers/media/v4l2-core/v4l2-flash-led-class.c +++ b/drivers/media/v4l2-core/v4l2-flash-led-class.c @@ -197,7 +197,7 @@ static int v4l2_flash_s_ctrl(struct v4l2_ctrl *c) { struct v4l2_flash *v4l2_flash = v4l2_ctrl_to_v4l2_flash(c); struct led_classdev_flash *fled_cdev = v4l2_flash->fled_cdev; - struct led_classdev *led_cdev = &fled_cdev->led_cdev; + struct led_classdev *led_cdev = fled_cdev ?
&fled_cdev->led_cdev : NULL; struct v4l2_ctrl **ctrls = v4l2_flash->ctrls; bool external_strobe; int ret = 0; @@ -299,10 +299,26 @@ static void __fill_ctrl_init_data(struct v4l2_flash *v4l2_flash, struct v4l2_flash_ctrl_data *ctrl_init_data) { struct led_classdev_flash *fled_cdev = v4l2_flash->fled_cdev; - struct led_classdev *led_cdev = &fled_cdev->led_cdev; + struct led_classdev *led_cdev = fled_cdev ? &fled_cdev->led_cdev : NULL; struct v4l2_ctrl_config *ctrl_cfg; u32 mask; + /* Init INDICATOR_INTENSITY ctrl data */ + if (v4l2_flash->iled_cdev) { + ctrl_init_data[INDICATOR_INTENSITY].cid = + V4L2_CID_FLASH_INDICATOR_INTENSITY; + ctrl_cfg = &ctrl_init_data[INDICATOR_INTENSITY].config; + __lfs_to_v4l2_ctrl_config(&flash_cfg->intensity, + ctrl_cfg); + ctrl_cfg->id = V4L2_CID_FLASH_INDICATOR_INTENSITY; + ctrl_cfg->min = 0; + ctrl_cfg->flags = V4L2_CTRL_FLAG_VOLATILE | + V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; + } + + if (!led_cdev || WARN_ON(!(led_cdev->flags & LED_DEV_CAP_FLASH))) + return; + /* Init FLASH_FAULT ctrl data */ if (flash_cfg->flash_faults) { ctrl_init_data[FLASH_FAULT].cid = V4L2_CID_FLASH_FAULT; @@ -330,27 +346,11 @@ static void __fill_ctrl_init_data(struct v4l2_flash *v4l2_flash, /* Init TORCH_INTENSITY ctrl data */ ctrl_init_data[TORCH_INTENSITY].cid = V4L2_CID_FLASH_TORCH_INTENSITY; ctrl_cfg = &ctrl_init_data[TORCH_INTENSITY].config; - __lfs_to_v4l2_ctrl_config(&flash_cfg->torch_intensity, ctrl_cfg); + __lfs_to_v4l2_ctrl_config(&flash_cfg->intensity, ctrl_cfg); ctrl_cfg->id = V4L2_CID_FLASH_TORCH_INTENSITY; ctrl_cfg->flags = V4L2_CTRL_FLAG_VOLATILE | V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; - /* Init INDICATOR_INTENSITY ctrl data */ - if (v4l2_flash->iled_cdev) { - ctrl_init_data[INDICATOR_INTENSITY].cid = - V4L2_CID_FLASH_INDICATOR_INTENSITY; - ctrl_cfg = &ctrl_init_data[INDICATOR_INTENSITY].config; - __lfs_to_v4l2_ctrl_config(&flash_cfg->indicator_intensity, - ctrl_cfg); - ctrl_cfg->id = V4L2_CID_FLASH_INDICATOR_INTENSITY; - ctrl_cfg->min = 0; - ctrl_cfg->flags = V4L2_CTRL_FLAG_VOLATILE | - V4L2_CTRL_FLAG_EXECUTE_ON_WRITE; - } - - if (!(led_cdev->flags & LED_DEV_CAP_FLASH)) - return; - /* Init FLASH_STROBE ctrl data */ ctrl_init_data[FLASH_STROBE].cid = V4L2_CID_FLASH_STROBE; ctrl_cfg = &ctrl_init_data[FLASH_STROBE].config; @@ -485,7 +485,9 @@ static int __sync_device_with_v4l2_controls(struct v4l2_flash *v4l2_flash) struct v4l2_ctrl **ctrls = v4l2_flash->ctrls; int ret = 0; - v4l2_flash_set_led_brightness(v4l2_flash, ctrls[TORCH_INTENSITY]); + if (ctrls[TORCH_INTENSITY]) + v4l2_flash_set_led_brightness(v4l2_flash, + ctrls[TORCH_INTENSITY]); if (ctrls[INDICATOR_INTENSITY]) v4l2_flash_set_led_brightness(v4l2_flash, @@ -527,19 +529,21 @@ static int v4l2_flash_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) { struct v4l2_flash *v4l2_flash = v4l2_subdev_to_v4l2_flash(sd); struct led_classdev_flash *fled_cdev = v4l2_flash->fled_cdev; - struct led_classdev *led_cdev = &fled_cdev->led_cdev; + struct led_classdev *led_cdev = fled_cdev ? 
&fled_cdev->led_cdev : NULL; struct led_classdev *led_cdev_ind = v4l2_flash->iled_cdev; int ret = 0; if (!v4l2_fh_is_singular(&fh->vfh)) return 0; - mutex_lock(&led_cdev->led_access); + if (led_cdev) { + mutex_lock(&led_cdev->led_access); - led_sysfs_disable(led_cdev); - led_trigger_remove(led_cdev); + led_sysfs_disable(led_cdev); + led_trigger_remove(led_cdev); - mutex_unlock(&led_cdev->led_access); + mutex_unlock(&led_cdev->led_access); + } if (led_cdev_ind) { mutex_lock(&led_cdev_ind->led_access); @@ -556,9 +560,11 @@ static int v4l2_flash_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) return 0; out_sync_device: - mutex_lock(&led_cdev->led_access); - led_sysfs_enable(led_cdev); - mutex_unlock(&led_cdev->led_access); + if (led_cdev) { + mutex_lock(&led_cdev->led_access); + led_sysfs_enable(led_cdev); + mutex_unlock(&led_cdev->led_access); + } if (led_cdev_ind) { mutex_lock(&led_cdev_ind->led_access); @@ -573,21 +579,24 @@ static int v4l2_flash_close(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh) { struct v4l2_flash *v4l2_flash = v4l2_subdev_to_v4l2_flash(sd); struct led_classdev_flash *fled_cdev = v4l2_flash->fled_cdev; - struct led_classdev *led_cdev = &fled_cdev->led_cdev; + struct led_classdev *led_cdev = fled_cdev ? &fled_cdev->led_cdev : NULL; struct led_classdev *led_cdev_ind = v4l2_flash->iled_cdev; int ret = 0; if (!v4l2_fh_is_singular(&fh->vfh)) return 0; - mutex_lock(&led_cdev->led_access); + if (led_cdev) { + mutex_lock(&led_cdev->led_access); - if (v4l2_flash->ctrls[STROBE_SOURCE]) - ret = v4l2_ctrl_s_ctrl(v4l2_flash->ctrls[STROBE_SOURCE], + if (v4l2_flash->ctrls[STROBE_SOURCE]) + ret = v4l2_ctrl_s_ctrl( + v4l2_flash->ctrls[STROBE_SOURCE], V4L2_FLASH_STROBE_SOURCE_SOFTWARE); - led_sysfs_enable(led_cdev); + led_sysfs_enable(led_cdev); - mutex_unlock(&led_cdev->led_access); + mutex_unlock(&led_cdev->led_access); + } if (led_cdev_ind) { mutex_lock(&led_cdev_ind->led_access); @@ -605,25 +614,19 @@ static const struct v4l2_subdev_internal_ops v4l2_flash_subdev_internal_ops = { static const struct v4l2_subdev_ops v4l2_flash_subdev_ops; -struct v4l2_flash *v4l2_flash_init( +static struct v4l2_flash *__v4l2_flash_init( struct device *dev, struct fwnode_handle *fwn, - struct led_classdev_flash *fled_cdev, - struct led_classdev *iled_cdev, - const struct v4l2_flash_ops *ops, - struct v4l2_flash_config *config) + struct led_classdev_flash *fled_cdev, struct led_classdev *iled_cdev, + const struct v4l2_flash_ops *ops, struct v4l2_flash_config *config) { struct v4l2_flash *v4l2_flash; - struct led_classdev *led_cdev; struct v4l2_subdev *sd; int ret; - if (!fled_cdev || !config) + if (!config) return ERR_PTR(-EINVAL); - led_cdev = &fled_cdev->led_cdev; - - v4l2_flash = devm_kzalloc(led_cdev->dev, sizeof(*v4l2_flash), - GFP_KERNEL); + v4l2_flash = devm_kzalloc(dev, sizeof(*v4l2_flash), GFP_KERNEL); if (!v4l2_flash) return ERR_PTR(-ENOMEM); @@ -632,7 +635,7 @@ struct v4l2_flash *v4l2_flash_init( v4l2_flash->iled_cdev = iled_cdev; v4l2_flash->ops = ops; sd->dev = dev; - sd->fwnode = fwn ? fwn : dev_fwnode(led_cdev->dev); + sd->fwnode = fwn ? 
fwn : dev_fwnode(dev); v4l2_subdev_init(sd, &v4l2_flash_subdev_ops); sd->internal_ops = &v4l2_flash_subdev_internal_ops; sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE; @@ -664,8 +667,26 @@ err_init_controls: return ERR_PTR(ret); } + +struct v4l2_flash *v4l2_flash_init( + struct device *dev, struct fwnode_handle *fwn, + struct led_classdev_flash *fled_cdev, + const struct v4l2_flash_ops *ops, + struct v4l2_flash_config *config) +{ + return __v4l2_flash_init(dev, fwn, fled_cdev, NULL, ops, config); +} EXPORT_SYMBOL_GPL(v4l2_flash_init); +struct v4l2_flash *v4l2_flash_indicator_init( + struct device *dev, struct fwnode_handle *fwn, + struct led_classdev *iled_cdev, + struct v4l2_flash_config *config) +{ + return __v4l2_flash_init(dev, fwn, NULL, iled_cdev, NULL, config); +} +EXPORT_SYMBOL_GPL(v4l2_flash_indicator_init); + void v4l2_flash_release(struct v4l2_flash *v4l2_flash) { struct v4l2_subdev *sd; diff --git a/drivers/staging/greybus/light.c b/drivers/staging/greybus/light.c index 81469d087e74..3f4148c92308 100644 --- a/drivers/staging/greybus/light.c +++ b/drivers/staging/greybus/light.c @@ -58,6 +58,7 @@ struct gb_light { bool ready; #if IS_REACHABLE(CONFIG_V4L2_FLASH_LED_CLASS) struct v4l2_flash *v4l2_flash; + struct v4l2_flash *v4l2_flash_ind; #endif }; @@ -534,7 +535,7 @@ static int gb_lights_light_v4l2_register(struct gb_light *light) { struct gb_connection *connection = get_conn_from_light(light); struct device *dev = &connection->bundle->dev; - struct v4l2_flash_config sd_cfg = { {0} }; + struct v4l2_flash_config sd_cfg = { {0} }, sd_cfg_ind = { {0} }; struct led_classdev_flash *fled; struct led_classdev *iled = NULL; struct gb_channel *channel_torch, *channel_ind, *channel_flash; @@ -542,12 +543,12 @@ static int gb_lights_light_v4l2_register(struct gb_light *light) channel_torch = get_channel_from_mode(light, GB_CHANNEL_MODE_TORCH); if (channel_torch) __gb_lights_channel_v4l2_config(&channel_torch->intensity_uA, - &sd_cfg.torch_intensity); + &sd_cfg.intensity); channel_ind = get_channel_from_mode(light, GB_CHANNEL_MODE_INDICATOR); if (channel_ind) { __gb_lights_channel_v4l2_config(&channel_ind->intensity_uA, - &sd_cfg.indicator_intensity); + &sd_cfg_ind.intensity); iled = &channel_ind->fled.led_cdev; } @@ -557,6 +558,8 @@ static int gb_lights_light_v4l2_register(struct gb_light *light) fled = &channel_flash->fled; snprintf(sd_cfg.dev_name, sizeof(sd_cfg.dev_name), "%s", light->name); + snprintf(sd_cfg_ind.dev_name, sizeof(sd_cfg_ind.dev_name), + "%s indicator", light->name); /* Set the possible values to faults, in our case all faults */ sd_cfg.flash_faults = LED_FAULT_OVER_VOLTAGE | LED_FAULT_TIMEOUT | @@ -565,16 +568,26 @@ static int gb_lights_light_v4l2_register(struct gb_light *light) LED_FAULT_UNDER_VOLTAGE | LED_FAULT_INPUT_VOLTAGE | LED_FAULT_LED_OVER_TEMPERATURE; - light->v4l2_flash = v4l2_flash_init(dev, NULL, fled, iled, - &v4l2_flash_ops, &sd_cfg); + light->v4l2_flash = v4l2_flash_init(dev, NULL, fled, &v4l2_flash_ops, + &sd_cfg); if (IS_ERR(light->v4l2_flash)) return PTR_ERR(light->v4l2_flash); + if (channel_ind) { + light->v4l2_flash_ind = + v4l2_flash_indicator_init(dev, NULL, iled, &sd_cfg_ind); + if (IS_ERR(light->v4l2_flash_ind)) { + v4l2_flash_release(light->v4l2_flash); + return PTR_ERR(light->v4l2_flash_ind); + } + } + return 0; } static void gb_lights_light_v4l2_unregister(struct gb_light *light) { + v4l2_flash_release(light->v4l2_flash_ind); v4l2_flash_release(light->v4l2_flash); } #else diff --git a/include/media/v4l2-flash-led-class.h 
b/include/media/v4l2-flash-led-class.h index 54e31a805a88..1b515166ad60 100644 --- a/include/media/v4l2-flash-led-class.h +++ b/include/media/v4l2-flash-led-class.h @@ -56,8 +56,7 @@ struct v4l2_flash_ops { * struct v4l2_flash_config - V4L2 Flash sub-device initialization data * @dev_name: the name of the media entity, * unique in the system - * @torch_intensity: constraints for the LED in torch mode - * @indicator_intensity: constraints for the indicator LED + * @intensity: non-flash strobe constraints for the LED * @flash_faults: bitmask of flash faults that the LED flash class * device can report; corresponding LED_FAULT* bit * definitions are available in the header file @@ -66,8 +65,7 @@ struct v4l2_flash_ops { */ struct v4l2_flash_config { char dev_name[32]; - struct led_flash_setting torch_intensity; - struct led_flash_setting indicator_intensity; + struct led_flash_setting intensity; u32 flash_faults; unsigned int has_external_strobe:1; }; @@ -110,8 +108,6 @@ static inline struct v4l2_flash *v4l2_ctrl_to_v4l2_flash(struct v4l2_ctrl *c) * @dev: flash device, e.g. an I2C device * @fwn: fwnode_handle of the LED, may be NULL if the same as device's * @fled_cdev: LED flash class device to wrap - * @iled_cdev: LED flash class device representing indicator LED associated - * with fled_cdev, may be NULL * @ops: V4L2 Flash device ops * @config: initialization data for V4L2 Flash sub-device * @@ -124,9 +120,24 @@ static inline struct v4l2_flash *v4l2_ctrl_to_v4l2_flash(struct v4l2_ctrl *c) struct v4l2_flash *v4l2_flash_init( struct device *dev, struct fwnode_handle *fwn, struct led_classdev_flash *fled_cdev, - struct led_classdev *iled_cdev, - const struct v4l2_flash_ops *ops, - struct v4l2_flash_config *config); + const struct v4l2_flash_ops *ops, struct v4l2_flash_config *config); + +/** + * v4l2_flash_indicator_init - initialize V4L2 indicator sub-device + * @dev: flash device, e.g. an I2C device + * @fwn: fwnode_handle of the LED, may be NULL if the same as device's + * @iled_cdev: LED flash class device representing the indicator LED + * @config: initialization data for V4L2 Flash sub-device + * + * Create V4L2 Flash sub-device wrapping given LED subsystem device. + * + * Returns: A valid pointer, or, when an error occurs, the return + * value is encoded using ERR_PTR(). Use IS_ERR() to check and + * PTR_ERR() to obtain the numeric return value. 
+ */ +struct v4l2_flash *v4l2_flash_indicator_init( + struct device *dev, struct fwnode_handle *fwn, + struct led_classdev *iled_cdev, struct v4l2_flash_config *config); /** * v4l2_flash_release - release V4L2 Flash sub-device @@ -140,9 +151,14 @@ void v4l2_flash_release(struct v4l2_flash *v4l2_flash); static inline struct v4l2_flash *v4l2_flash_init( struct device *dev, struct fwnode_handle *fwn, struct led_classdev_flash *fled_cdev, - struct led_classdev *iled_cdev, - const struct v4l2_flash_ops *ops, - struct v4l2_flash_config *config) + const struct v4l2_flash_ops *ops, struct v4l2_flash_config *config) +{ + return NULL; +} + +static inline struct v4l2_flash *v4l2_flash_indicator_init( + struct device *dev, struct fwnode_handle *fwn, + struct led_classdev *iled_cdev, struct v4l2_flash_config *config) { return NULL; } -- cgit v1.2.3 From 40cefffd35f62618210d9d598f2c362095693ffd Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 19 Jul 2017 18:38:36 -0400 Subject: media: v4l2-flash-led-class: Document v4l2_flash_init() references The v4l2_flash_init() keeps a reference to the ops struct but not to the config struct (nor anything it contains). Document this. Signed-off-by: Sakari Ailus Acked-by: Pavel Machek Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-flash-led-class.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/media/v4l2-flash-led-class.h b/include/media/v4l2-flash-led-class.h index 1b515166ad60..5c1d50f78e12 100644 --- a/include/media/v4l2-flash-led-class.h +++ b/include/media/v4l2-flash-led-class.h @@ -112,6 +112,9 @@ static inline struct v4l2_flash *v4l2_ctrl_to_v4l2_flash(struct v4l2_ctrl *c) * @config: initialization data for V4L2 Flash sub-device * * Create V4L2 Flash sub-device wrapping given LED subsystem device. + * The ops pointer is stored by the V4L2 flash framework. No + * references are held to config nor its contents once this function + * has returned. * * Returns: A valid pointer, or, when an error occurs, the return * value is encoded using ERR_PTR(). Use IS_ERR() to check and @@ -130,6 +133,9 @@ struct v4l2_flash *v4l2_flash_init( * @config: initialization data for V4L2 Flash sub-device * * Create V4L2 Flash sub-device wrapping given LED subsystem device. + * The ops pointer is stored by the V4L2 flash framework. No + * references are held to config nor its contents once this function + * has returned. * * Returns: A valid pointer, or, when an error occurs, the return * value is encoded using ERR_PTR(). Use IS_ERR() to check and -- cgit v1.2.3 From a1cc5a57e49e871ce5e610597af1bd6bea06cb5c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 13 Aug 2017 16:21:17 +0200 Subject: blackfin: merge the two TWI header files There seems to be no need for separate ones since all users include both files anyhow. Merge them because include/linux/i2c is to be deprecated. 
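For callers the merge is transparent; a minimal sketch of post-merge usage, where a single include now provides both the register layout and the bit masks (the divider and prescale values below are illustrative, not taken from the driver):

#include <asm/bfin_twi.h>	/* now the only TWI header a user needs */

/* Sketch: program the serial clock and enable the controller. */
static void twi_enable(struct bfin_twi_iface *iface)
{
	/* 50 low / 50 high periods per SCL cycle, via the CLKLOW/CLKHI macros */
	writew(CLKLOW(50) | CLKHI(50), &iface->regs_base->clkdiv);
	/* illustrative prescale of 10 SCLKs, plus the enable bit */
	writew(10 | TWI_ENA, &iface->regs_base->control);
}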
Signed-off-by: Wolfram Sang --- arch/blackfin/include/asm/bfin_twi.h | 134 +++++++++++++++++++++++++++++++- arch/blackfin/kernel/debug-mmrs.c | 1 - drivers/i2c/busses/i2c-bfin-twi.c | 1 - include/linux/i2c/bfin_twi.h | 145 ----------------------------------- 4 files changed, 133 insertions(+), 148 deletions(-) delete mode 100644 include/linux/i2c/bfin_twi.h (limited to 'include') diff --git a/arch/blackfin/include/asm/bfin_twi.h b/arch/blackfin/include/asm/bfin_twi.h index aaa0834d34aa..211e9c78f6fb 100644 --- a/arch/blackfin/include/asm/bfin_twi.h +++ b/arch/blackfin/include/asm/bfin_twi.h @@ -1,7 +1,7 @@ /* * bfin_twi.h - interface to Blackfin TWIs * - * Copyright 2005-2010 Analog Devices Inc. + * Copyright 2005-2014 Analog Devices Inc. * * Licensed under the GPL-2 or later. */ @@ -10,6 +10,138 @@ #define __ASM_BFIN_TWI_H__ #include +#include +#include + +/* + * ADI twi registers layout + */ +struct bfin_twi_regs { + u16 clkdiv; + u16 dummy1; + u16 control; + u16 dummy2; + u16 slave_ctl; + u16 dummy3; + u16 slave_stat; + u16 dummy4; + u16 slave_addr; + u16 dummy5; + u16 master_ctl; + u16 dummy6; + u16 master_stat; + u16 dummy7; + u16 master_addr; + u16 dummy8; + u16 int_stat; + u16 dummy9; + u16 int_mask; + u16 dummy10; + u16 fifo_ctl; + u16 dummy11; + u16 fifo_stat; + u16 dummy12; + u32 __pad[20]; + u16 xmt_data8; + u16 dummy13; + u16 xmt_data16; + u16 dummy14; + u16 rcv_data8; + u16 dummy15; + u16 rcv_data16; + u16 dummy16; +}; + +struct bfin_twi_iface { + int irq; + spinlock_t lock; + char read_write; + u8 command; + u8 *transPtr; + int readNum; + int writeNum; + int cur_mode; + int manual_stop; + int result; + struct i2c_adapter adap; + struct completion complete; + struct i2c_msg *pmsg; + int msg_num; + int cur_msg; + u16 saved_clkdiv; + u16 saved_control; + struct bfin_twi_regs __iomem *regs_base; +}; + +/* ******************** TWO-WIRE INTERFACE (TWI) MASKS ********************/ +/* TWI_CLKDIV Macros (Use: *pTWI_CLKDIV = CLKLOW(x)|CLKHI(y); ) */ +#define CLKLOW(x) ((x) & 0xFF) /* Periods Clock Is Held Low */ +#define CLKHI(y) (((y)&0xFF)<<0x8) /* Periods Before New Clock Low */ + +/* TWI_PRESCALE Masks */ +#define PRESCALE 0x007F /* SCLKs Per Internal Time Reference (10MHz) */ +#define TWI_ENA 0x0080 /* TWI Enable */ +#define SCCB 0x0200 /* SCCB Compatibility Enable */ + +/* TWI_SLAVE_CTL Masks */ +#define SEN 0x0001 /* Slave Enable */ +#define SADD_LEN 0x0002 /* Slave Address Length */ +#define STDVAL 0x0004 /* Slave Transmit Data Valid */ +#define NAK 0x0008 /* NAK Generated At Conclusion Of Transfer */ +#define GEN 0x0010 /* General Call Address Matching Enabled */ + +/* TWI_SLAVE_STAT Masks */ +#define SDIR 0x0001 /* Slave Transfer Direction (RX/TX*) */ +#define GCALL 0x0002 /* General Call Indicator */ + +/* TWI_MASTER_CTL Masks */ +#define MEN 0x0001 /* Master Mode Enable */ +#define MADD_LEN 0x0002 /* Master Address Length */ +#define MDIR 0x0004 /* Master Transmit Direction (RX/TX*) */ +#define FAST 0x0008 /* Use Fast Mode Timing Specs */ +#define STOP 0x0010 /* Issue Stop Condition */ +#define RSTART 0x0020 /* Repeat Start or Stop* At End Of Transfer */ +#define DCNT 0x3FC0 /* Data Bytes To Transfer */ +#define SDAOVR 0x4000 /* Serial Data Override */ +#define SCLOVR 0x8000 /* Serial Clock Override */ + +/* TWI_MASTER_STAT Masks */ +#define MPROG 0x0001 /* Master Transfer In Progress */ +#define LOSTARB 0x0002 /* Lost Arbitration Indicator (Xfer Aborted) */ +#define ANAK 0x0004 /* Address Not Acknowledged */ +#define DNAK 0x0008 /* Data Not Acknowledged */ +#define 
BUFRDERR 0x0010 /* Buffer Read Error */ +#define BUFWRERR 0x0020 /* Buffer Write Error */ +#define SDASEN 0x0040 /* Serial Data Sense */ +#define SCLSEN 0x0080 /* Serial Clock Sense */ +#define BUSBUSY 0x0100 /* Bus Busy Indicator */ + +/* TWI_INT_SRC and TWI_INT_ENABLE Masks */ +#define SINIT 0x0001 /* Slave Transfer Initiated */ +#define SCOMP 0x0002 /* Slave Transfer Complete */ +#define SERR 0x0004 /* Slave Transfer Error */ +#define SOVF 0x0008 /* Slave Overflow */ +#define MCOMP 0x0010 /* Master Transfer Complete */ +#define MERR 0x0020 /* Master Transfer Error */ +#define XMTSERV 0x0040 /* Transmit FIFO Service */ +#define RCVSERV 0x0080 /* Receive FIFO Service */ + +/* TWI_FIFO_CTRL Masks */ +#define XMTFLUSH 0x0001 /* Transmit Buffer Flush */ +#define RCVFLUSH 0x0002 /* Receive Buffer Flush */ +#define XMTINTLEN 0x0004 /* Transmit Buffer Interrupt Length */ +#define RCVINTLEN 0x0008 /* Receive Buffer Interrupt Length */ + +/* TWI_FIFO_STAT Masks */ +#define XMTSTAT 0x0003 /* Transmit FIFO Status */ +#define XMT_EMPTY 0x0000 /* Transmit FIFO Empty */ +#define XMT_HALF 0x0001 /* Transmit FIFO Has 1 Byte To Write */ +#define XMT_FULL 0x0003 /* Transmit FIFO Full (2 Bytes To Write) */ + +#define RCVSTAT 0x000C /* Receive FIFO Status */ +#define RCV_EMPTY 0x0000 /* Receive FIFO Empty */ +#define RCV_HALF 0x0004 /* Receive FIFO Has 1 Byte To Read */ +#define RCV_FULL 0x000C /* Receive FIFO Full (2 Bytes To Read) */ #define DEFINE_TWI_REG(reg_name, reg) \ static inline u16 read_##reg_name(struct bfin_twi_iface *iface) \ diff --git a/arch/blackfin/kernel/debug-mmrs.c b/arch/blackfin/kernel/debug-mmrs.c index e272bca93c64..f31ace221392 100644 --- a/arch/blackfin/kernel/debug-mmrs.c +++ b/arch/blackfin/kernel/debug-mmrs.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include diff --git a/drivers/i2c/busses/i2c-bfin-twi.c b/drivers/i2c/busses/i2c-bfin-twi.c index 9fe942b8c610..ff3343186a82 100644 --- a/drivers/i2c/busses/i2c-bfin-twi.c +++ b/drivers/i2c/busses/i2c-bfin-twi.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/include/linux/i2c/bfin_twi.h b/include/linux/i2c/bfin_twi.h deleted file mode 100644 index 135a4e0876ae..000000000000 --- a/include/linux/i2c/bfin_twi.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * i2c-bfin-twi.h - interface to ADI TWI controller - * - * Copyright 2005-2014 Analog Devices Inc. - * - * Licensed under the GPL-2 or later. 
- */ - -#ifndef __I2C_BFIN_TWI_H__ -#define __I2C_BFIN_TWI_H__ - -#include -#include - -/* - * ADI twi registers layout - */ -struct bfin_twi_regs { - u16 clkdiv; - u16 dummy1; - u16 control; - u16 dummy2; - u16 slave_ctl; - u16 dummy3; - u16 slave_stat; - u16 dummy4; - u16 slave_addr; - u16 dummy5; - u16 master_ctl; - u16 dummy6; - u16 master_stat; - u16 dummy7; - u16 master_addr; - u16 dummy8; - u16 int_stat; - u16 dummy9; - u16 int_mask; - u16 dummy10; - u16 fifo_ctl; - u16 dummy11; - u16 fifo_stat; - u16 dummy12; - u32 __pad[20]; - u16 xmt_data8; - u16 dummy13; - u16 xmt_data16; - u16 dummy14; - u16 rcv_data8; - u16 dummy15; - u16 rcv_data16; - u16 dummy16; -}; - -struct bfin_twi_iface { - int irq; - spinlock_t lock; - char read_write; - u8 command; - u8 *transPtr; - int readNum; - int writeNum; - int cur_mode; - int manual_stop; - int result; - struct i2c_adapter adap; - struct completion complete; - struct i2c_msg *pmsg; - int msg_num; - int cur_msg; - u16 saved_clkdiv; - u16 saved_control; - struct bfin_twi_regs __iomem *regs_base; -}; - -/* ******************** TWO-WIRE INTERFACE (TWI) MASKS ********************/ -/* TWI_CLKDIV Macros (Use: *pTWI_CLKDIV = CLKLOW(x)|CLKHI(y); ) */ -#define CLKLOW(x) ((x) & 0xFF) /* Periods Clock Is Held Low */ -#define CLKHI(y) (((y)&0xFF)<<0x8) /* Periods Before New Clock Low */ - -/* TWI_PRESCALE Masks */ -#define PRESCALE 0x007F /* SCLKs Per Internal Time Reference (10MHz) */ -#define TWI_ENA 0x0080 /* TWI Enable */ -#define SCCB 0x0200 /* SCCB Compatibility Enable */ - -/* TWI_SLAVE_CTL Masks */ -#define SEN 0x0001 /* Slave Enable */ -#define SADD_LEN 0x0002 /* Slave Address Length */ -#define STDVAL 0x0004 /* Slave Transmit Data Valid */ -#define NAK 0x0008 /* NAK Generated At Conclusion Of Transfer */ -#define GEN 0x0010 /* General Call Address Matching Enabled */ - -/* TWI_SLAVE_STAT Masks */ -#define SDIR 0x0001 /* Slave Transfer Direction (RX/TX*) */ -#define GCALL 0x0002 /* General Call Indicator */ - -/* TWI_MASTER_CTL Masks */ -#define MEN 0x0001 /* Master Mode Enable */ -#define MADD_LEN 0x0002 /* Master Address Length */ -#define MDIR 0x0004 /* Master Transmit Direction (RX/TX*) */ -#define FAST 0x0008 /* Use Fast Mode Timing Specs */ -#define STOP 0x0010 /* Issue Stop Condition */ -#define RSTART 0x0020 /* Repeat Start or Stop* At End Of Transfer */ -#define DCNT 0x3FC0 /* Data Bytes To Transfer */ -#define SDAOVR 0x4000 /* Serial Data Override */ -#define SCLOVR 0x8000 /* Serial Clock Override */ - -/* TWI_MASTER_STAT Masks */ -#define MPROG 0x0001 /* Master Transfer In Progress */ -#define LOSTARB 0x0002 /* Lost Arbitration Indicator (Xfer Aborted) */ -#define ANAK 0x0004 /* Address Not Acknowledged */ -#define DNAK 0x0008 /* Data Not Acknowledged */ -#define BUFRDERR 0x0010 /* Buffer Read Error */ -#define BUFWRERR 0x0020 /* Buffer Write Error */ -#define SDASEN 0x0040 /* Serial Data Sense */ -#define SCLSEN 0x0080 /* Serial Clock Sense */ -#define BUSBUSY 0x0100 /* Bus Busy Indicator */ - -/* TWI_INT_SRC and TWI_INT_ENABLE Masks */ -#define SINIT 0x0001 /* Slave Transfer Initiated */ -#define SCOMP 0x0002 /* Slave Transfer Complete */ -#define SERR 0x0004 /* Slave Transfer Error */ -#define SOVF 0x0008 /* Slave Overflow */ -#define MCOMP 0x0010 /* Master Transfer Complete */ -#define MERR 0x0020 /* Master Transfer Error */ -#define XMTSERV 0x0040 /* Transmit FIFO Service */ -#define RCVSERV 0x0080 /* Receive FIFO Service */ - -/* TWI_FIFO_CTRL Masks */ -#define XMTFLUSH 0x0001 /* Transmit Buffer Flush */ -#define RCVFLUSH 0x0002 
/* Receive Buffer Flush */ -#define XMTINTLEN 0x0004 /* Transmit Buffer Interrupt Length */ -#define RCVINTLEN 0x0008 /* Receive Buffer Interrupt Length */ - -/* TWI_FIFO_STAT Masks */ -#define XMTSTAT 0x0003 /* Transmit FIFO Status */ -#define XMT_EMPTY 0x0000 /* Transmit FIFO Empty */ -#define XMT_HALF 0x0001 /* Transmit FIFO Has 1 Byte To Write */ -#define XMT_FULL 0x0003 /* Transmit FIFO Full (2 Bytes To Write) */ - -#define RCVSTAT 0x000C /* Receive FIFO Status */ -#define RCV_EMPTY 0x0000 /* Receive FIFO Empty */ -#define RCV_HALF 0x0004 /* Receive FIFO Has 1 Byte To Read */ -#define RCV_FULL 0x000C /* Receive FIFO Full (2 Bytes To Read) */ - -#endif -- cgit v1.2.3 From 83b2a3c2ab24561cb6de45e6b155e3a7c4c1c91b Mon Sep 17 00:00:00 2001 From: Pierre-Hugues Husson Date: Sun, 27 Aug 2017 15:58:31 +0200 Subject: regulator: rn5t618: add RC5T619 PMIC support Extend the driver to support Ricoh RC5T619. Support the additional regulators and slightly different voltage ranges. Signed-off-by: Pierre-Hugues Husson Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 4 ++-- drivers/regulator/rn5t618-regulator.c | 35 +++++++++++++++++++++++++++++++---- include/linux/mfd/rn5t618.h | 6 ++++++ 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 99b9362331b5..a847f8231337 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -700,8 +700,8 @@ config REGULATOR_RN5T618 tristate "Ricoh RN5T567/618 voltage regulators" depends on MFD_RN5T618 help - Say y here to support the regulators found on Ricoh RN5T567 or - RN5T618 PMIC. + Say y here to support the regulators found on Ricoh RN5T567, + RN5T618 or RC5T619 PMIC. config REGULATOR_RT5033 tristate "Richtek RT5033 Regulators" diff --git a/drivers/regulator/rn5t618-regulator.c b/drivers/regulator/rn5t618-regulator.c index 8d2819e36654..ef2be56460fe 100644 --- a/drivers/regulator/rn5t618-regulator.c +++ b/drivers/regulator/rn5t618-regulator.c @@ -79,6 +79,29 @@ static struct regulator_desc rn5t618_regulators[] = { REG(LDORTC2, LDOEN2, BIT(5), LDORTC2DAC, 0x7f, 900000, 3500000, 25000), }; +static struct regulator_desc rc5t619_regulators[] = { + /* DCDC */ + REG(DCDC1, DC1CTL, BIT(0), DC1DAC, 0xff, 600000, 3500000, 12500), + REG(DCDC2, DC2CTL, BIT(0), DC2DAC, 0xff, 600000, 3500000, 12500), + REG(DCDC3, DC3CTL, BIT(0), DC3DAC, 0xff, 600000, 3500000, 12500), + REG(DCDC4, DC4CTL, BIT(0), DC4DAC, 0xff, 600000, 3500000, 12500), + REG(DCDC5, DC5CTL, BIT(0), DC5DAC, 0xff, 600000, 3500000, 12500), + /* LDO */ + REG(LDO1, LDOEN1, BIT(0), LDO1DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO2, LDOEN1, BIT(1), LDO2DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO3, LDOEN1, BIT(2), LDO3DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO4, LDOEN1, BIT(3), LDO4DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO5, LDOEN1, BIT(4), LDO5DAC, 0x7f, 600000, 3500000, 25000), + REG(LDO6, LDOEN1, BIT(5), LDO6DAC, 0x7f, 600000, 3500000, 25000), + REG(LDO7, LDOEN1, BIT(6), LDO7DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO8, LDOEN1, BIT(7), LDO8DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO9, LDOEN2, BIT(0), LDO9DAC, 0x7f, 900000, 3500000, 25000), + REG(LDO10, LDOEN2, BIT(0), LDO10DAC, 0x7f, 900000, 3500000, 25000), + /* LDO RTC */ + REG(LDORTC1, LDOEN2, BIT(4), LDORTCDAC, 0x7f, 1700000, 3500000, 25000), + REG(LDORTC2, LDOEN2, BIT(5), LDORTC2DAC, 0x7f, 900000, 3500000, 25000), +}; + static int rn5t618_regulator_probe(struct platform_device *pdev) { struct rn5t618 *rn5t618 = 
dev_get_drvdata(pdev->dev.parent); @@ -86,13 +109,20 @@ static int rn5t618_regulator_probe(struct platform_device *pdev) struct regulator_dev *rdev; struct regulator_desc *regulators; int i; + int num_regulators = 0; switch (rn5t618->variant) { case RN5T567: regulators = rn5t567_regulators; + num_regulators = ARRAY_SIZE(rn5t567_regulators); break; case RN5T618: regulators = rn5t618_regulators; + num_regulators = ARRAY_SIZE(rn5t618_regulators); + break; + case RC5T619: + regulators = rc5t619_regulators; + num_regulators = ARRAY_SIZE(rc5t619_regulators); break; default: return -EINVAL; @@ -101,10 +131,7 @@ static int rn5t618_regulator_probe(struct platform_device *pdev) config.dev = pdev->dev.parent; config.regmap = rn5t618->regmap; - for (i = 0; i < RN5T618_REG_NUM; i++) { - if (!regulators[i].name) - continue; - + for (i = 0; i < num_regulators; i++) { rdev = devm_regulator_register(&pdev->dev, &regulators[i], &config); diff --git a/include/linux/mfd/rn5t618.h b/include/linux/mfd/rn5t618.h index e5a6cdeb77db..d61bc58aba8a 100644 --- a/include/linux/mfd/rn5t618.h +++ b/include/linux/mfd/rn5t618.h @@ -226,11 +226,17 @@ enum { RN5T618_DCDC2, RN5T618_DCDC3, RN5T618_DCDC4, + RN5T618_DCDC5, RN5T618_LDO1, RN5T618_LDO2, RN5T618_LDO3, RN5T618_LDO4, RN5T618_LDO5, + RN5T618_LDO6, + RN5T618_LDO7, + RN5T618_LDO8, + RN5T618_LDO9, + RN5T618_LDO10, RN5T618_LDORTC1, RN5T618_LDORTC2, RN5T618_REG_NUM, -- cgit v1.2.3 From 0cc3b0ec23ce4c69e1e890ed2b8d2fa932b14aad Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Aug 2017 12:12:25 -0700 Subject: Clarify (and fix) MAX_LFS_FILESIZE macros We have a MAX_LFS_FILESIZE macro that is meant to be filled in by filesystems (and other IO targets) that know they are 64-bit clean and don't have any 32-bit limits in their IO path. It turns out that our 32-bit value for that limit was bogus. On 32-bit, the VM layer is limited by the page cache to only 32-bit index values, but our logic for that was confusing and actually wrong. We used to define that value to (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1) which is actually odd in several ways: it limits the index to 31 bits, and then it limits files so that they can't have data in that last byte of a page that has the highest 31-bit index (ie page index 0x7fffffff). Neither of those limitations makes sense. The index is actually the full 32-bit unsigned value, and we can use that whole full page. So the maximum size of the file would logically be "PAGE_SIZE << BITS_PER_LONG". However, we do want to avoid the maximum index, because we have code that iterates over the page indexes, and we don't want that code to overflow. So the maximum size of a file on a 32-bit host should actually be one page less than the full 32-bit index. So the actual limit is ULONG_MAX << PAGE_SHIFT. That means that we will not actually be using the page of that last index (ULONG_MAX), but we can grow a file up to that limit. The wrong value of MAX_LFS_FILESIZE actually caused problems for Doug Nazar, who was still using a 32-bit host, but with a 9.7TB 2 x RAID5 volume. It turns out that our old MAX_LFS_FILESIZE was 8TiB (well, one byte less), but the actual true VM limit is one page less than 16TiB. This was invisible until commit c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()"), which started applying that MAX_LFS_FILESIZE limit to block devices too. NOTE! On 64-bit, the page index isn't a limiter at all, and the limit is actually just the offset type itself (loff_t), which is signed.
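Concretely, a worked check of both expressions, assuming a 32-bit host with 4 KiB pages (PAGE_SHIFT == 12):

/* old: (((loff_t)PAGE_SIZE << (BITS_PER_LONG - 1)) - 1)
 *    = (4096 << 31) - 1  = 2^43 - 1     ->  8 TiB minus one byte
 * new: ((loff_t)ULONG_MAX << PAGE_SHIFT)
 *    = (2^32 - 1) << 12  = 2^44 - 2^12  -> 16 TiB minus one page
 * matching the 8 TiB vs. "one page less than 16 TiB" figures above. */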
But for clarity, on 64-bit, just use the maximum signed value, and don't make people have to count the number of 'f' characters in the hex constant. So just use LLONG_MAX for the 64-bit case. That was what the value had been before too, just written out as a hex constant. Fixes: c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()") Reported-and-tested-by: Doug Nazar Cc: Andreas Dilger Cc: Mark Fasheh Cc: Joel Becker Cc: Dave Kleikamp Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..cbfe127bccf8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,9 +907,9 @@ static inline struct file *get_file(struct file *f) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 -#define MAX_LFS_FILESIZE (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1) +#define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 -#define MAX_LFS_FILESIZE ((loff_t)0x7fffffffffffffffLL) +#define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif #define FL_POSIX 1 -- cgit v1.2.3 From a9e4998073d49a762a154a6b48a332ec6cb8e6b1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 20 Jul 2017 18:12:07 -0400 Subject: media: dvb_frontend: ensure that initial front end status initialized The fe_status variable s is not initialized, meaning it can have any random garbage status. This could be problematic if fe->ops.tune is false, as s is not updated by the call to fe->ops.tune() and a subsequent check on the status change will use a garbage value. Fix this by adding FE_NONE to the enum fe_status and initializing s to this. Detected by CoverityScan, CID#112887 ("Uninitialized scalar variable") Signed-off-by: Colin Ian King Reviewed-by: Shuah Khan Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-core/dvb_frontend.c | 2 +- include/uapi/linux/dvb/frontend.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c index e3fff8f64d37..18cc3bbc699c 100644 --- a/drivers/media/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb-core/dvb_frontend.c @@ -631,7 +631,7 @@ static int dvb_frontend_thread(void *data) struct dvb_frontend *fe = data; struct dtv_frontend_properties *c = &fe->dtv_property_cache; struct dvb_frontend_private *fepriv = fe->frontend_priv; - enum fe_status s; + enum fe_status s = FE_NONE; enum dvbfe_algo algo; bool re_tune = false; bool semheld = false; diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index 00a20cd21ee2..afc3972b0879 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -127,6 +127,7 @@ enum fe_sec_mini_cmd { * to reset DiSEqC, tone and parameters */ enum fe_status { + FE_NONE = 0x00, FE_HAS_SIGNAL = 0x01, FE_HAS_CARRIER = 0x02, FE_HAS_VITERBI = 0x04, -- cgit v1.2.3 From 8774370d268f2f43d8487d230e0d4fa1647759b3 Mon Sep 17 00:00:00 2001 From: Mariusz Stachura Date: Mon, 17 Jul 2017 22:09:45 -0700 Subject: i40e/i40evf: support for VF VLAN tag stripping control This patch gives the VF the capability to control VLAN tag stripping via ethtool. As rx-vlan-offload was fixed before, the VF is now able to change it using "ethtool --offload rxvlan on/off" settings.
Signed-off-by: Mariusz Stachura Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 60 ++++++++++++++++++++++ drivers/net/ethernet/intel/i40evf/i40evf.h | 4 ++ drivers/net/ethernet/intel/i40evf/i40evf_main.c | 33 ++++++++++++ .../net/ethernet/intel/i40evf/i40evf_virtchnl.c | 40 +++++++++++++++ include/linux/avf/virtchnl.h | 5 ++ 5 files changed, 142 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 27d87bef4ba3..4d1e670f490e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2529,6 +2529,60 @@ err: return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_SET_RSS_HENA, aq_ret); } +/** + * i40e_vc_enable_vlan_stripping + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Enable vlan header stripping for the VF + **/ +static int i40e_vc_enable_vlan_stripping(struct i40e_vf *vf, u8 *msg, + u16 msglen) +{ + struct i40e_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + i40e_vlan_stripping_enable(vsi); + + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, + aq_ret); +} + +/** + * i40e_vc_disable_vlan_stripping + * @vf: pointer to the VF info + * @msg: pointer to the msg buffer + * @msglen: msg length + * + * Disable vlan header stripping for the VF + **/ +static int i40e_vc_disable_vlan_stripping(struct i40e_vf *vf, u8 *msg, + u16 msglen) +{ + struct i40e_vsi *vsi = vf->pf->vsi[vf->lan_vsi_idx]; + i40e_status aq_ret = 0; + + if (!test_bit(I40E_VF_STATE_ACTIVE, &vf->vf_states)) { + aq_ret = I40E_ERR_PARAM; + goto err; + } + + i40e_vlan_stripping_disable(vsi); + + /* send the response to the VF */ +err: + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + aq_ret); +} + /** * i40e_vc_process_vf_msg * @pf: pointer to the PF structure @@ -2648,6 +2702,12 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode, case VIRTCHNL_OP_SET_RSS_HENA: ret = i40e_vc_set_rss_hena(vf, msg, msglen); break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + ret = i40e_vc_enable_vlan_stripping(vf, msg, msglen); + break; + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + ret = i40e_vc_disable_vlan_stripping(vf, msg, msglen); + break; case VIRTCHNL_OP_UNKNOWN: default: diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index e5293d35fb6a..82f69031e5cd 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -261,6 +261,8 @@ struct i40evf_adapter { #define I40EVF_FLAG_AQ_RELEASE_PROMISC BIT(16) #define I40EVF_FLAG_AQ_REQUEST_ALLMULTI BIT(17) #define I40EVF_FLAG_AQ_RELEASE_ALLMULTI BIT(18) +#define I40EVF_FLAG_AQ_ENABLE_VLAN_STRIPPING BIT(19) +#define I40EVF_FLAG_AQ_DISABLE_VLAN_STRIPPING BIT(20) /* OS defined structs */ struct net_device *netdev; @@ -358,6 +360,8 @@ void i40evf_get_hena(struct i40evf_adapter *adapter); void i40evf_set_hena(struct i40evf_adapter *adapter); void i40evf_set_rss_key(struct i40evf_adapter *adapter); void i40evf_set_rss_lut(struct i40evf_adapter *adapter); +void i40evf_enable_vlan_stripping(struct i40evf_adapter *adapter); +void i40evf_disable_vlan_stripping(struct i40evf_adapter *adapter); void i40evf_virtchnl_completion(struct 
i40evf_adapter *adapter, enum virtchnl_ops v_opcode, i40e_status v_retval, u8 *msg, u16 msglen); diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 258e8e27068b..9ee277e87f10 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -1676,6 +1676,16 @@ static void i40evf_watchdog_task(struct work_struct *work) goto watchdog_done; } + if (adapter->aq_required & I40EVF_FLAG_AQ_ENABLE_VLAN_STRIPPING) { + i40evf_enable_vlan_stripping(adapter); + goto watchdog_done; + } + + if (adapter->aq_required & I40EVF_FLAG_AQ_DISABLE_VLAN_STRIPPING) { + i40evf_disable_vlan_stripping(adapter); + goto watchdog_done; + } + if (adapter->aq_required & I40EVF_FLAG_AQ_CONFIGURE_QUEUES) { i40evf_configure_queues(adapter); goto watchdog_done; @@ -2293,6 +2303,28 @@ static int i40evf_change_mtu(struct net_device *netdev, int new_mtu) return 0; } +/** + * i40e_set_features - set the netdev feature flags + * @netdev: ptr to the netdev being adjusted + * @features: the feature set that the stack is suggesting + * Note: expects to be called while under rtnl_lock() + **/ +static int i40evf_set_features(struct net_device *netdev, + netdev_features_t features) +{ + struct i40evf_adapter *adapter = netdev_priv(netdev); + + if (!VLAN_ALLOWED(adapter)) + return -EINVAL; + + if (features & NETIF_F_HW_VLAN_CTAG_RX) + adapter->aq_required |= I40EVF_FLAG_AQ_ENABLE_VLAN_STRIPPING; + else + adapter->aq_required |= I40EVF_FLAG_AQ_DISABLE_VLAN_STRIPPING; + + return 0; +} + /** * i40evf_features_check - Validate encapsulated packet conforms to limits * @skb: skb buff @@ -2386,6 +2418,7 @@ static const struct net_device_ops i40evf_netdev_ops = { .ndo_vlan_rx_kill_vid = i40evf_vlan_rx_kill_vid, .ndo_features_check = i40evf_features_check, .ndo_fix_features = i40evf_fix_features, + .ndo_set_features = i40evf_set_features, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = i40evf_netpoll, #endif diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c index 6c403bf1bbb8..85876f4fb1fb 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_virtchnl.c @@ -820,6 +820,46 @@ void i40evf_set_rss_lut(struct i40evf_adapter *adapter) kfree(vrl); } +/** + * i40evf_enable_vlan_stripping + * @adapter: adapter structure + * + * Request VLAN header stripping to be enabled + **/ +void i40evf_enable_vlan_stripping(struct i40evf_adapter *adapter) +{ + if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { + /* bail because we already have a command pending */ + dev_err(&adapter->pdev->dev, "Cannot enable stripping, command %d pending\n", + adapter->current_op); + return; + } + adapter->current_op = VIRTCHNL_OP_ENABLE_VLAN_STRIPPING; + adapter->aq_required &= ~I40EVF_FLAG_AQ_ENABLE_VLAN_STRIPPING; + i40evf_send_pf_msg(adapter, VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, + NULL, 0); +} + +/** + * i40evf_disable_vlan_stripping + * @adapter: adapter structure + * + * Request VLAN header stripping to be disabled + **/ +void i40evf_disable_vlan_stripping(struct i40evf_adapter *adapter) +{ + if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { + /* bail because we already have a command pending */ + dev_err(&adapter->pdev->dev, "Cannot disable stripping, command %d pending\n", + adapter->current_op); + return; + } + adapter->current_op = VIRTCHNL_OP_DISABLE_VLAN_STRIPPING; + adapter->aq_required &= 
~I40EVF_FLAG_AQ_DISABLE_VLAN_STRIPPING; + i40evf_send_pf_msg(adapter, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, + NULL, 0); +} + /** * i40evf_print_link_message - print link up or down * @adapter: adapter structure diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index becfca2ae94e..2b038442c352 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -133,6 +133,8 @@ enum virtchnl_ops { VIRTCHNL_OP_CONFIG_RSS_LUT = 24, VIRTCHNL_OP_GET_RSS_HENA_CAPS = 25, VIRTCHNL_OP_SET_RSS_HENA = 26, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, + VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, }; /* This macro is used to generate a compilation error if a structure @@ -686,6 +688,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_SET_RSS_HENA: valid_len = sizeof(struct virtchnl_rss_hena); break; + case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING: + case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From f75b0afa190d041c142bdc3f0c6022f61340b8a3 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Thu, 24 Aug 2017 10:42:50 +0900 Subject: PM / devfreq: Move private devfreq_update_stats() into devfreq The devfreq_update_stats() updates the 'struct devfreq_dev_status' in order to get the current status of the devfreq device. It is only used by the governors. This patch moves devfreq_update_stats() into the devfreq directory. Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/governor.h | 4 ++++ include/linux/devfreq.h | 13 ------------- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index a4f2fa1091e4..cfc50a61a90d 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -69,4 +69,8 @@ extern int devfreq_remove_governor(struct devfreq_governor *governor); extern int devfreq_update_status(struct devfreq *devfreq, unsigned long freq); +static inline int devfreq_update_stats(struct devfreq *df) +{ + return df->profile->get_dev_status(df->dev.parent, &df->last_status); +} #endif /* _GOVERNOR_H */ diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index 6c220e4ebb6b..597294e0cc40 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -214,19 +214,6 @@ extern void devm_devfreq_unregister_notifier(struct device *dev, extern struct devfreq *devfreq_get_devfreq_by_phandle(struct device *dev, int index); -/** - * devfreq_update_stats() - update the last_status pointer in struct devfreq - * @df: the devfreq instance whose status needs updating - * - * Governors are recommended to use this function along with last_status, - * which allows other entities to reuse the last_status without affecting - * the values fetched later by governors. - */ -static inline int devfreq_update_stats(struct devfreq *df) -{ - return df->profile->get_dev_status(df->dev.parent, &df->last_status); -} - #if IS_ENABLED(CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND) /** * struct devfreq_simple_ondemand_data - void *data fed to struct devfreq -- cgit v1.2.3 From 3e00ab4ac51c2ed47c28fd5000c47399f1a11cf5 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Tue, 1 Aug 2017 19:41:42 +0530 Subject: dmaengine: add DMA_PREP_CMD for non-Data descriptors. Some DMA controllers are capable of issuing commands to a peripheral via DMA.
These commands can be a list of register reads/writes and are different from normal data reads/writes. This patch adds a new flag, DMA_PREP_CMD, to the DMA API, which tells the driver that the data passed to the DMA API is command data and that the DMA controller driver will form the descriptor in the required format. This flag can be used by any DMA controller driver which requires the descriptor in a different format for non-data descriptors. Signed-off-by: Abhishek Sahu Signed-off-by: Vinod Koul --- Documentation/dmaengine/provider.txt | 7 +++++++ include/linux/dmaengine.h | 4 ++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/Documentation/dmaengine/provider.txt b/Documentation/dmaengine/provider.txt index e33bc1c8ed2c..bfadbfdf13ed 100644 --- a/Documentation/dmaengine/provider.txt +++ b/Documentation/dmaengine/provider.txt @@ -395,6 +395,13 @@ where to put them) when DMA_CTRL_REUSE is already set - Terminating the channel + * DMA_PREP_CMD + - If set, the client driver tells DMA controller that passed data in DMA + API is command data. + - Interpretation of command data is DMA controller specific. It can be + used for issuing commands to other peripherals/register reads/register + writes for which the descriptor should be in different format from + normal data descriptors. General Design Notes -------------------- diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 533680860865..dd4de1d40166 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -186,6 +186,9 @@ struct dma_interleaved_template { * on the result of this operation * @DMA_CTRL_REUSE: client can reuse the descriptor and submit again till * cleared or freed + * @DMA_PREP_CMD: tell the driver that the data passed to DMA API is command + * data and the descriptor should be in different format from normal + * data descriptors. */ enum dma_ctrl_flags { DMA_PREP_INTERRUPT = (1 << 0), @@ -195,6 +198,7 @@ enum dma_ctrl_flags { DMA_PREP_CONTINUE = (1 << 4), DMA_PREP_FENCE = (1 << 5), DMA_CTRL_REUSE = (1 << 6), + DMA_PREP_CMD = (1 << 7), }; /** -- cgit v1.2.3 From dfebb055f73a24e2c8756b837d9ce1a06457f5d7 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Tue, 1 Aug 2017 19:41:43 +0530 Subject: dmaengine: qcom: bam_dma: wrapper functions for command descriptor QCOM BAM also supports a command descriptor, which allows the SW to create descriptors of type command that do not generate any data transmission but configure registers in the peripheral. In a command descriptor, the 32-bit address points to the start of the command block, which holds the command elements, and the 16-bit size defines the size of the command block. Each command element is structured as 4 words. Write command: address + cmd, register data, register mask, reserved. Read command: address + cmd, read data result address, reserved, reserved. This patch creates a new header file for the BAM driver which contains the structures and wrapper functions for the command descriptor (a usage sketch follows below).
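Taken together with the DMA_PREP_CMD patch above, a peripheral driver might use these helpers roughly as follows. This is a hedged sketch: the channel, register address, and value are placeholders, and the command element is assumed to already live in DMA-able memory and be mapped to ce_dma (mapping not shown):

#include <linux/dma/qcom_bam_dma.h>
#include <linux/dmaengine.h>
#include <linux/scatterlist.h>

/* Sketch: write one peripheral register through a BAM command descriptor. */
static int issue_reg_write(struct dma_chan *chan, struct bam_cmd_element *ce,
			   dma_addr_t ce_dma, u32 reg_addr, u32 reg_val)
{
	struct dma_async_tx_descriptor *desc;
	struct scatterlist sg;

	bam_prep_ce(ce, reg_addr, BAM_WRITE_COMMAND, reg_val);

	sg_init_table(&sg, 1);
	sg_dma_address(&sg) = ce_dma;
	sg_dma_len(&sg) = sizeof(*ce);

	/* DMA_PREP_CMD tells the BAM driver this is a command descriptor */
	desc = dmaengine_prep_slave_sg(chan, &sg, 1, DMA_MEM_TO_DEV,
				       DMA_PREP_CMD);
	if (!desc)
		return -EINVAL;

	dmaengine_submit(desc);
	dma_async_issue_pending(chan);
	return 0;
}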
This file will be used by different QCOM peripheral drivers for forming the command descriptor Signed-off-by: Abhishek Sahu Signed-off-by: Vinod Koul --- include/linux/dma/qcom_bam_dma.h | 79 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 include/linux/dma/qcom_bam_dma.h (limited to 'include') diff --git a/include/linux/dma/qcom_bam_dma.h b/include/linux/dma/qcom_bam_dma.h new file mode 100644 index 000000000000..077d43a358e5 --- /dev/null +++ b/include/linux/dma/qcom_bam_dma.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016-2017, The Linux Foundation. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _QCOM_BAM_DMA_H +#define _QCOM_BAM_DMA_H + +#include + +/* + * This data type corresponds to the native Command Element + * supported by BAM DMA Engine. + * + * @cmd_and_addr - upper 8 bits command and lower 24 bits register address. + * @data - for write command: content to be written into peripheral register. + * for read command: dest addr to write peripheral register value. + * @mask - register mask. + * @reserved - for future usage. + * + */ +struct bam_cmd_element { + __le32 cmd_and_addr; + __le32 data; + __le32 mask; + __le32 reserved; +}; + +/* + * This enum indicates the command type in a command element + */ +enum bam_command_type { + BAM_WRITE_COMMAND = 0, + BAM_READ_COMMAND, +}; + +/* + * prep_bam_ce_le32 - Wrapper function to prepare a single BAM command + * element with the data already in le32 format. + * + * @bam_ce: bam command element + * @addr: target address + * @cmd: BAM command + * @data: actual data for write and dest addr for read in le32 + */ +static inline void +bam_prep_ce_le32(struct bam_cmd_element *bam_ce, u32 addr, + enum bam_command_type cmd, __le32 data) +{ + bam_ce->cmd_and_addr = + cpu_to_le32((addr & 0xffffff) | ((cmd & 0xff) << 24)); + bam_ce->data = data; + bam_ce->mask = cpu_to_le32(0xffffffff); +} + +/* + * bam_prep_ce - Wrapper function to prepare a single BAM command element + * with the data. + * + * @bam_ce: BAM command element + * @addr: target address + * @cmd: BAM command + * @data: actual data for write and dest addr for read + */ +static inline void +bam_prep_ce(struct bam_cmd_element *bam_ce, u32 addr, + enum bam_command_type cmd, u32 data) +{ + bam_prep_ce_le32(bam_ce, addr, cmd, cpu_to_le32(data)); +} +#endif -- cgit v1.2.3 From 604407890ecf624c2fb41013c82b22aade59b455 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Aug 2017 11:32:11 -0700 Subject: dm: fix printk() rate limiting code Using the same rate limiting state for different kinds of messages is wrong because this can cause a high frequency message to suppress a report of a low frequency message. Hence use a unique rate limiting state per message type. 
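The effect is easiest to see at the call sites, which do not change syntactically; each *_LIMIT expansion now simply carries its own state. A sketch with illustrative call sites (not taken from the patch):

#define DM_MSG_PREFIX "example"
#include <linux/device-mapper.h>

/* Each macro expansion gets a private DEFINE_RATELIMIT_STATE, so a flood
 * of I/O errors can no longer exhaust the budget of the rare warning
 * below (previously both shared the single global dm_ratelimit_state). */
static void example_handle_error(int error, sector_t sector)
{
	DMERR_LIMIT("I/O error %d on sector %llu",
		    error, (unsigned long long)sector);
}

static void example_handle_odd_state(void)
{
	DMWARN_LIMIT("target in unexpected state");
}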
Fixes: 71a16736a15e ("dm: use local printk ratelimit") Cc: stable@vger.kernel.org Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 10 ---------- include/linux/device-mapper.h | 41 ++++++++++++----------------------------- 2 files changed, 12 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8298670757e9..d669fddd9290 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -27,16 +27,6 @@ #define DM_MSG_PREFIX "core" -#ifdef CONFIG_PRINTK -/* - * ratelimit state to be used in DMXXX_LIMIT(). - */ -DEFINE_RATELIMIT_STATE(dm_ratelimit_state, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); -EXPORT_SYMBOL(dm_ratelimit_state); -#endif - /* * Cookies are numeric values sent with CHANGE and REMOVE * uevents while resuming, removing or renaming the device. diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 1473455d0341..4f2b3b2076c4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -549,46 +549,29 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); *---------------------------------------------------------------*/ #define DM_NAME "device-mapper" -#ifdef CONFIG_PRINTK -extern struct ratelimit_state dm_ratelimit_state; - -#define dm_ratelimit() __ratelimit(&dm_ratelimit_state) -#else -#define dm_ratelimit() 0 -#endif +#define DM_RATELIMIT(pr_func, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&rs)) \ + pr_func(DM_FMT(fmt), ##__VA_ARGS__); \ +} while (0) #define DM_FMT(fmt) DM_NAME ": " DM_MSG_PREFIX ": " fmt "\n" #define DMCRIT(fmt, ...) pr_crit(DM_FMT(fmt), ##__VA_ARGS__) #define DMERR(fmt, ...) pr_err(DM_FMT(fmt), ##__VA_ARGS__) -#define DMERR_LIMIT(fmt, ...) \ -do { \ - if (dm_ratelimit()) \ - DMERR(fmt, ##__VA_ARGS__); \ -} while (0) - +#define DMERR_LIMIT(fmt, ...) DM_RATELIMIT(pr_err, fmt, ##__VA_ARGS__) #define DMWARN(fmt, ...) pr_warn(DM_FMT(fmt), ##__VA_ARGS__) -#define DMWARN_LIMIT(fmt, ...) \ -do { \ - if (dm_ratelimit()) \ - DMWARN(fmt, ##__VA_ARGS__); \ -} while (0) - +#define DMWARN_LIMIT(fmt, ...) DM_RATELIMIT(pr_warn, fmt, ##__VA_ARGS__) #define DMINFO(fmt, ...) pr_info(DM_FMT(fmt), ##__VA_ARGS__) -#define DMINFO_LIMIT(fmt, ...) \ -do { \ - if (dm_ratelimit()) \ - DMINFO(fmt, ##__VA_ARGS__); \ -} while (0) +#define DMINFO_LIMIT(fmt, ...) DM_RATELIMIT(pr_info, fmt, ##__VA_ARGS__) #ifdef CONFIG_DM_DEBUG #define DMDEBUG(fmt, ...) printk(KERN_DEBUG DM_FMT(fmt), ##__VA_ARGS__) -#define DMDEBUG_LIMIT(fmt, ...) \ -do { \ - if (dm_ratelimit()) \ - DMDEBUG(fmt, ##__VA_ARGS__); \ -} while (0) +#define DMDEBUG_LIMIT(fmt, ...) DM_RATELIMIT(pr_debug, fmt, ##__VA_ARGS__) #else #define DMDEBUG(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #define DMDEBUG_LIMIT(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -- cgit v1.2.3 From b97971bee55dc45420e0fe352d0b4df6e74716d4 Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Wed, 2 Aug 2017 10:22:01 -0600 Subject: coresight: pmu: Adds return stack option to perf coresight pmu Return stack is a programmable option on some ETM and PTM hardware. Adds the option flags to enable this from the perf event command line. 
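With the format attribute in place, the option should then be selectable per event from userspace with something like "perf record -e cs_etm/retstack/u -- <command>", assuming the ETM PMU is registered under its usual cs_etm name; the exact event syntax depends on the perf build.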
Signed-off-by: Mike Leach Signed-off-by: Mathieu Poirier Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/coresight/coresight-etm-perf.c | 2 ++ include/linux/coresight-pmu.h | 1 + tools/include/linux/coresight-pmu.h | 1 + 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index ad01dfeb2d68..8a0ad77574e7 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -53,10 +53,12 @@ static DEFINE_PER_CPU(struct coresight_device *, csdev_src); /* ETMv3.5/PTM's ETMCR is 'config' */ PMU_FORMAT_ATTR(cycacc, "config:" __stringify(ETM_OPT_CYCACC)); PMU_FORMAT_ATTR(timestamp, "config:" __stringify(ETM_OPT_TS)); +PMU_FORMAT_ATTR(retstack, "config:" __stringify(ETM_OPT_RETSTK)); static struct attribute *etm_config_formats_attr[] = { &format_attr_cycacc.attr, &format_attr_timestamp.attr, + &format_attr_retstack.attr, NULL, }; diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h index 7d410260661b..45852c2cd096 100644 --- a/include/linux/coresight-pmu.h +++ b/include/linux/coresight-pmu.h @@ -24,6 +24,7 @@ /* ETMv3.5/PTM's ETMCR config bit */ #define ETM_OPT_CYCACC 12 #define ETM_OPT_TS 28 +#define ETM_OPT_RETSTK 29 static inline int coresight_get_trace_id(int cpu) { diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index 7d410260661b..45852c2cd096 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -24,6 +24,7 @@ /* ETMv3.5/PTM's ETMCR config bit */ #define ETM_OPT_CYCACC 12 #define ETM_OPT_TS 28 +#define ETM_OPT_RETSTK 29 static inline int coresight_get_trace_id(int cpu) { -- cgit v1.2.3 From a09ac3974d0c0b25dfa70f8076546bc4876dd7a8 Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Thu, 20 Jul 2017 08:22:36 +0100 Subject: tty: resolve tty contention between kernel and user space The commit 12e84c71b7d4 ("tty: export tty_open_by_driver") exports tty_open_by_device to allow tty to be opened from inside kernel which works fine except that it doesn't handle contention with user space or another kernel-space open of the same tty. For example, opening a tty from user space while it is kernel opened results in failure and a kernel log message about mismatch between tty->count and tty's file open count. This patch makes kernel access to tty exclusive, so that if a user process or kernel opens a kernel opened tty, it gets -EBUSY. It does this by adding TTY_KOPENED flag to tty->flags. When this flag is set, tty_open_by_driver returns -EBUSY. Instead of overloading tty_open_by_driver for both kernel and user space, this patch creates a separate function tty_kopen which closely follows tty_open_by_driver. tty_kclose closes the tty opened by tty_kopen. To address the mismatch between tty->count and #fd's, this patch adds #kopen's to the count before comparing it with tty->count. That way check_tty_count reflects correct usage count. Returning -EBUSY on tty open is a change in the interface. I have tested this with minicom, picocom and commands like "echo foo > /dev/ttyS0". They all correctly report "Device or resource busy" when the tty is already kernel opened. 
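For an in-kernel user (a speakup-style console reader, say), the pairing looks roughly like this. A sketch with an illustrative device number:

#include <linux/tty.h>
#include <linux/major.h>
#include <linux/kdev_t.h>
#include <linux/err.h>

/* Sketch: claim ttyS0 (major 4, minor 64; illustrative) for exclusive
 * in-kernel use. tty_kopen() returns the tty locked, so drop the lock
 * once setup is done. User-space (and other kernel) opens now fail
 * with -EBUSY until tty_kclose() releases the port. */
static struct tty_struct *grab_serial_port(void)
{
	struct tty_struct *tty = tty_kopen(MKDEV(TTY_MAJOR, 64));

	if (IS_ERR(tty))
		return tty;

	/* ... configure termios, hook up a line discipline, etc. ... */
	tty_unlock(tty);
	return tty;
}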
Signed-off-by: Okash Khawaja Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/tty.h | 21 +++++++++++ 2 files changed, 116 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 10c4038c0e8d..b03504d4916e 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -280,7 +280,7 @@ static int check_tty_count(struct tty_struct *tty, const char *routine) { #ifdef CHECK_TTY_COUNT struct list_head *p; - int count = 0; + int count = 0, kopen_count = 0; spin_lock(&tty->files_lock); list_for_each(p, &tty->tty_files) { @@ -291,10 +291,12 @@ static int check_tty_count(struct tty_struct *tty, const char *routine) tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) count++; - if (tty->count != count) { - tty_warn(tty, "%s: tty->count(%d) != #fd's(%d)\n", - routine, tty->count, count); - return count; + if (tty_port_kopened(tty->port)) + kopen_count++; + if (tty->count != (count + kopen_count)) { + tty_warn(tty, "%s: tty->count(%d) != (#fd's(%d) + #kopen's(%d))\n", + routine, tty->count, count, kopen_count); + return (count + kopen_count); } #endif return 0; @@ -1512,6 +1514,38 @@ static int tty_release_checks(struct tty_struct *tty, int idx) return 0; } +/** + * tty_kclose - closes tty opened by tty_kopen + * @tty: tty device + * + * Performs the final steps to release and free a tty device. It is the + * same as tty_release_struct except that it also resets TTY_PORT_KOPENED + * flag on tty->port. + */ +void tty_kclose(struct tty_struct *tty) +{ + /* + * Ask the line discipline code to release its structures + */ + tty_ldisc_release(tty); + + /* Wait for pending work before tty destruction commmences */ + tty_flush_works(tty); + + tty_debug_hangup(tty, "freeing structure\n"); + /* + * The release_tty function takes care of the details of clearing + * the slots and preserving the termios structure. The tty_unlock_pair + * should be safe as we keep a kref while the tty is locked (so the + * unlock never unlocks a freed tty). + */ + mutex_lock(&tty_mutex); + tty_port_set_kopened(tty->port, 0); + release_tty(tty, tty->index); + mutex_unlock(&tty_mutex); +} +EXPORT_SYMBOL_GPL(tty_kclose); + /** * tty_release_struct - release a tty struct * @tty: tty device @@ -1785,6 +1819,56 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp, return driver; } +/** + * tty_kopen - open a tty device for kernel + * @device: dev_t of device to open + * + * Opens tty exclusively for kernel. Performs the driver lookup, + * makes sure it's not already opened and performs the first-time + * tty initialization. 
+ * + * Returns the locked initialized &tty_struct + * + * Claims the global tty_mutex to serialize: + * - concurrent first-time tty initialization + * - concurrent tty driver removal w/ lookup + * - concurrent tty removal from driver table + */ +struct tty_struct *tty_kopen(dev_t device) +{ + struct tty_struct *tty; + struct tty_driver *driver = NULL; + int index = -1; + + mutex_lock(&tty_mutex); + driver = tty_lookup_driver(device, NULL, &index); + if (IS_ERR(driver)) { + mutex_unlock(&tty_mutex); + return ERR_CAST(driver); + } + + /* check whether we're reopening an existing tty */ + tty = tty_driver_lookup_tty(driver, NULL, index); + if (IS_ERR(tty)) + goto out; + + if (tty) { + /* drop kref from tty_driver_lookup_tty() */ + tty_kref_put(tty); + tty = ERR_PTR(-EBUSY); + } else { /* tty_init_dev returns tty with the tty_lock held */ + tty = tty_init_dev(driver, index); + if (IS_ERR(tty)) + goto out; + tty_port_set_kopened(tty->port, 1); + } +out: + mutex_unlock(&tty_mutex); + tty_driver_kref_put(driver); + return tty; +} +EXPORT_SYMBOL_GPL(tty_kopen); + /** * tty_open_by_driver - open a tty device * @device: dev_t of device to open @@ -1824,6 +1908,12 @@ struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, } if (tty) { + if (tty_port_kopened(tty->port)) { + tty_kref_put(tty); + mutex_unlock(&tty_mutex); + tty = ERR_PTR(-EBUSY); + goto out; + } mutex_unlock(&tty_mutex); retval = tty_lock_interruptible(tty); tty_kref_put(tty); /* drop kref from tty_driver_lookup_tty() */ diff --git a/include/linux/tty.h b/include/linux/tty.h index 79c30daf46a9..236d52b8be8c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -261,6 +261,8 @@ struct tty_port { */ #define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ #define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ +#define TTY_PORT_KOPENED 5 /* device exclusively opened by + kernel */ /* * Where all of the state associated with a tty is kept while the tty @@ -401,6 +403,8 @@ extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); extern struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, struct file *filp); +extern struct tty_struct *tty_kopen(dev_t device); +extern void tty_kclose(struct tty_struct *tty); extern int tty_dev_name_to_number(const char *name, dev_t *number); #else static inline void tty_kref_put(struct tty_struct *tty) @@ -425,6 +429,10 @@ static inline const char *tty_name(const struct tty_struct *tty) static inline struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, struct file *filp) { return NULL; } +static inline struct tty_struct *tty_kopen(dev_t device) +{ return ERR_PTR(-ENODEV); } +static inline void tty_kclose(struct tty_struct *tty) +{ } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif @@ -652,6 +660,19 @@ static inline void tty_port_set_initialized(struct tty_port *port, bool val) clear_bit(TTY_PORT_INITIALIZED, &port->iflags); } +static inline bool tty_port_kopened(struct tty_port *port) +{ + return test_bit(TTY_PORT_KOPENED, &port->iflags); +} + +static inline void tty_port_set_kopened(struct tty_port *port, bool val) +{ + if (val) + set_bit(TTY_PORT_KOPENED, &port->iflags); + else + clear_bit(TTY_PORT_KOPENED, &port->iflags); +} + extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); -- cgit v1.2.3 From 
a033c3b10a2bec387ccd9e39fadd94eef6519626 Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Thu, 20 Jul 2017 08:22:38 +0100 Subject: tty: undo export of tty_open_by_driver Since we have tty_kopen, we no longer need to export tty_open_by_driver. This patch makes this function static. Signed-off-by: Okash Khawaja Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 3 +-- include/linux/tty.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index b03504d4916e..195290f3803a 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -1885,7 +1885,7 @@ EXPORT_SYMBOL_GPL(tty_kopen); * - concurrent tty driver removal w/ lookup * - concurrent tty removal from driver table */ -struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, +static struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, struct file *filp) { struct tty_struct *tty; @@ -1936,7 +1936,6 @@ out: tty_driver_kref_put(driver); return tty; } -EXPORT_SYMBOL_GPL(tty_open_by_driver); /** * tty_open - open a tty device diff --git a/include/linux/tty.h b/include/linux/tty.h index 236d52b8be8c..cf53eb539f6e 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -401,8 +401,6 @@ extern struct tty_struct *get_current_tty(void); /* tty_io.c */ extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); -extern struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode, - struct file *filp); extern struct tty_struct *tty_kopen(dev_t device); extern void tty_kclose(struct tty_struct *tty); extern int tty_dev_name_to_number(const char *name, dev_t *number); @@ -426,9 +424,6 @@ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } -static inline struct tty_struct *tty_open_by_driver(dev_t device, - struct inode *inode, struct file *filp) -{ return NULL; } static inline struct tty_struct *tty_kopen(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) -- cgit v1.2.3 From 00ee84159020728cbdbfc569b42c036ef20c6e55 Mon Sep 17 00:00:00 2001 From: sayli karnik Date: Fri, 16 Jun 2017 15:39:38 +0530 Subject: mod_devicetable: Remove excess description from structured comment Remove excess member description to fix following warnings in sphinx build: Excess struct/union/enum/typedef member 'ver_major' description in 'fsl_mc_device_id' Excess struct/union/enum/typedef member 'ver_minor' description in 'fsl_mc_device_id' Signed-off-by: sayli karnik CC: Stuart Yoder Signed-off-by: Jonathan Corbet Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 3f74ef2281e8..694cebb50f72 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -674,8 +674,6 @@ struct ulpi_device_id { * struct fsl_mc_device_id - MC object device identifier * @vendor: vendor ID * @obj_type: MC object type - * @ver_major: MC object version major number - * @ver_minor: MC object version minor number * * Type of entries in the "device Id" table for MC object devices supported by * a MC object device driver. 
The last entry of the table has vendor set to 0x0 -- cgit v1.2.3 From 9f757f415210a7c85e2784e4a1733ea78b2e4e88 Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Tue, 18 Jul 2017 08:32:53 +0200 Subject: drivers/fmc: hide fmc operations behind helpers This gives us more freedom to change/add/remove operations without recompiling all device drivers. Typically, carrier boards implement the fmc operations, so they will not use these helpers. Signed-off-by: Federico Vaga Tested-by: Pat Riehecky Acked-by: Alessandro Rubini Signed-off-by: Greg Kroah-Hartman --- drivers/fmc/fmc-chardev.c | 3 +-- drivers/fmc/fmc-core.c | 55 ++++++++++++++++++++++++++++++++++++++++++ drivers/fmc/fmc-match.c | 2 +- drivers/fmc/fmc-trivial.c | 20 ++++++--------- drivers/fmc/fmc-write-eeprom.c | 8 +++--- include/linux/fmc.h | 18 ++++++++++++++ 6 files changed, 87 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/fmc/fmc-chardev.c b/drivers/fmc/fmc-chardev.c index ace6ef24d15e..5ecf4090a610 100644 --- a/drivers/fmc/fmc-chardev.c +++ b/drivers/fmc/fmc-chardev.c @@ -129,8 +129,7 @@ static int fc_probe(struct fmc_device *fmc) struct fc_instance *fc; - if (fmc->op->validate) - index = fmc->op->validate(fmc, &fc_drv); + index = fmc_validate(fmc, &fc_drv); if (index < 0) return -EINVAL; /* not our device: invalid */ diff --git a/drivers/fmc/fmc-core.c b/drivers/fmc/fmc-core.c index 353fc546fb08..5263d0607b64 100644 --- a/drivers/fmc/fmc-core.c +++ b/drivers/fmc/fmc-core.c @@ -118,6 +118,61 @@ static struct bin_attribute fmc_eeprom_attr = { .write = fmc_write_eeprom, }; +int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h, + char *name, int flags) +{ + if (fmc->op->irq_request) + return fmc->op->irq_request(fmc, h, name, flags); + return -EPERM; +} +EXPORT_SYMBOL(fmc_irq_request); + +void fmc_irq_free(struct fmc_device *fmc) +{ + if (fmc->op->irq_free) + fmc->op->irq_free(fmc); +} +EXPORT_SYMBOL(fmc_irq_free); + +void fmc_irq_ack(struct fmc_device *fmc) +{ + if (likely(fmc->op->irq_ack)) + fmc->op->irq_ack(fmc); +} +EXPORT_SYMBOL(fmc_irq_ack); + +int fmc_validate(struct fmc_device *fmc, struct fmc_driver *drv) +{ + if (fmc->op->validate) + return fmc->op->validate(fmc, drv); + return -EPERM; +} +EXPORT_SYMBOL(fmc_validate); + +int fmc_gpio_config(struct fmc_device *fmc, struct fmc_gpio *gpio, int ngpio) +{ + if (fmc->op->gpio_config) + return fmc->op->gpio_config(fmc, gpio, ngpio); + return -EPERM; +} +EXPORT_SYMBOL(fmc_gpio_config); + +int fmc_read_ee(struct fmc_device *fmc, int pos, void *d, int l) +{ + if (fmc->op->read_ee) + return fmc->op->read_ee(fmc, pos, d, l); + return -EPERM; +} +EXPORT_SYMBOL(fmc_read_ee); + +int fmc_write_ee(struct fmc_device *fmc, int pos, const void *d, int l) +{ + if (fmc->op->write_ee) + return fmc->op->write_ee(fmc, pos, d, l); + return -EPERM; +} +EXPORT_SYMBOL(fmc_write_ee); + /* * Functions for client modules follow */ diff --git a/drivers/fmc/fmc-match.c b/drivers/fmc/fmc-match.c index 104a5efc2207..a0956d1f7550 100644 --- a/drivers/fmc/fmc-match.c +++ b/drivers/fmc/fmc-match.c @@ -63,7 +63,7 @@ int fmc_fill_id_info(struct fmc_device *fmc) if (!fmc->eeprom) return -ENOMEM; allocated = 1; - ret = fmc->op->read_ee(fmc, 0, fmc->eeprom, fmc->eeprom_len); + ret = fmc_read_ee(fmc, 0, fmc->eeprom, fmc->eeprom_len); if (ret < 0) goto out; } diff --git a/drivers/fmc/fmc-trivial.c b/drivers/fmc/fmc-trivial.c index 6c590f54c79d..8defdee3e3a3 100644 --- a/drivers/fmc/fmc-trivial.c +++ b/drivers/fmc/fmc-trivial.c @@ -24,7 +24,7 @@ static irqreturn_t t_handler(int irq, void 
*dev_id) { struct fmc_device *fmc = dev_id; - fmc->op->irq_ack(fmc); + fmc_irq_ack(fmc); dev_info(&fmc->dev, "received irq %i\n", irq); return IRQ_HANDLED; } @@ -46,25 +46,21 @@ static int t_probe(struct fmc_device *fmc) int ret; int index = 0; - if (fmc->op->validate) - index = fmc->op->validate(fmc, &t_drv); + index = fmc_validate(fmc, &t_drv); if (index < 0) return -EINVAL; /* not our device: invalid */ - ret = fmc->op->irq_request(fmc, t_handler, "fmc-trivial", IRQF_SHARED); + ret = fmc_irq_request(fmc, t_handler, "fmc-trivial", IRQF_SHARED); if (ret < 0) return ret; /* ignore error code of call below, we really don't care */ - fmc->op->gpio_config(fmc, t_gpio, ARRAY_SIZE(t_gpio)); + fmc_gpio_config(fmc, t_gpio, ARRAY_SIZE(t_gpio)); - /* Reprogram, if asked to. ESRCH == no filename specified */ - ret = -ESRCH; - if (fmc->op->reprogram) - ret = fmc->op->reprogram(fmc, &t_drv, ""); - if (ret == -ESRCH) + ret = fmc_reprogram(fmc, &t_drv, "", 0); + if (ret == -EPERM) /* programming not supported */ ret = 0; if (ret < 0) - fmc->op->irq_free(fmc); + fmc_irq_free(fmc); /* FIXME: reprogram LM32 too */ return ret; @@ -72,7 +68,7 @@ static int t_probe(struct fmc_device *fmc) static int t_remove(struct fmc_device *fmc) { - fmc->op->irq_free(fmc); + fmc_irq_free(fmc); return 0; } diff --git a/drivers/fmc/fmc-write-eeprom.c b/drivers/fmc/fmc-write-eeprom.c index 9bb2cbd5c9f2..3eb81bb1f1fc 100644 --- a/drivers/fmc/fmc-write-eeprom.c +++ b/drivers/fmc/fmc-write-eeprom.c @@ -50,7 +50,7 @@ static int fwe_run_tlv(struct fmc_device *fmc, const struct firmware *fw, if (write) { dev_info(&fmc->dev, "write %i bytes at 0x%04x\n", thislen, thisaddr); - err = fmc->op->write_ee(fmc, thisaddr, p + 5, thislen); + err = fmc_write_ee(fmc, thisaddr, p + 5, thislen); } if (err < 0) { dev_err(&fmc->dev, "write failure @0x%04x\n", @@ -70,7 +70,7 @@ static int fwe_run_bin(struct fmc_device *fmc, const struct firmware *fw) int ret; dev_info(&fmc->dev, "programming %zi bytes\n", fw->size); - ret = fmc->op->write_ee(fmc, 0, (void *)fw->data, fw->size); + ret = fmc_write_ee(fmc, 0, (void *)fw->data, fw->size); if (ret < 0) { dev_info(&fmc->dev, "write_eeprom: error %i\n", ret); return ret; @@ -115,8 +115,8 @@ static int fwe_probe(struct fmc_device *fmc) KBUILD_MODNAME); return -ENODEV; } - if (fmc->op->validate) - index = fmc->op->validate(fmc, &fwe_drv); + + index = fmc_validate(fmc, &fwe_drv); if (index < 0) { pr_err("%s: refusing device \"%s\"\n", KBUILD_MODNAME, dev_name(dev)); diff --git a/include/linux/fmc.h b/include/linux/fmc.h index a5f0aa5c2a8d..8bb4a154bbdc 100644 --- a/include/linux/fmc.h +++ b/include/linux/fmc.h @@ -234,4 +234,22 @@ extern void fmc_free_id_info(struct fmc_device *fmc); extern void fmc_dump_eeprom(const struct fmc_device *fmc); extern void fmc_dump_sdb(const struct fmc_device *fmc); +/* helpers for FMC operations */ +extern int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h, + char *name, int flags); +extern void fmc_irq_free(struct fmc_device *fmc); +extern void fmc_irq_ack(struct fmc_device *fmc); +extern int fmc_validate(struct fmc_device *fmc, struct fmc_driver *drv); +extern int fmc_gpio_config(struct fmc_device *fmc, struct fmc_gpio *gpio, + int ngpio); +extern int fmc_read_ee(struct fmc_device *fmc, int pos, void *d, int l); +extern int fmc_write_ee(struct fmc_device *fmc, int pos, const void *d, int l); + +/* helpers for FMC operations */ +extern int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h, + char *name, int flags); +extern void fmc_irq_free(struct fmc_device 
*fmc); +extern void fmc_irq_ack(struct fmc_device *fmc); +extern int fmc_validate(struct fmc_device *fmc, struct fmc_driver *drv); + #endif /* __LINUX_FMC_H__ */ -- cgit v1.2.3 From 2071a3e94abd34d65bd40f1ff845f9cea300dfa6 Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Tue, 18 Jul 2017 08:33:03 +0200 Subject: drivers/fmc: The only way to dump the SDB is from debugfs Drivers should not call fmc_sdb_dump() anymore (actually they still can, but the operation is not supported, so it will only print an error message). Signed-off-by: Federico Vaga Tested-by: Pat Riehecky Acked-by: Alessandro Rubini Signed-off-by: Greg Kroah-Hartman --- drivers/fmc/Makefile | 1 + drivers/fmc/fmc-core.c | 7 +- drivers/fmc/fmc-debug.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/fmc/fmc-dump.c | 41 ----------- drivers/fmc/fmc-private.h | 9 +++ drivers/fmc/fmc-sdb.c | 99 +------------------------- include/linux/fmc.h | 4 +- 7 files changed, 195 insertions(+), 139 deletions(-) create mode 100644 drivers/fmc/fmc-debug.c create mode 100644 drivers/fmc/fmc-private.h (limited to 'include') diff --git a/drivers/fmc/Makefile b/drivers/fmc/Makefile index b9452919739f..e809322e1bac 100644 --- a/drivers/fmc/Makefile +++ b/drivers/fmc/Makefile @@ -6,6 +6,7 @@ fmc-y += fmc-match.o fmc-y += fmc-sdb.o fmc-y += fru-parse.o fmc-y += fmc-dump.o +fmc-y += fmc-debug.o obj-$(CONFIG_FMC_FAKEDEV) += fmc-fakedev.o obj-$(CONFIG_FMC_TRIVIAL) += fmc-trivial.o diff --git a/drivers/fmc/fmc-core.c b/drivers/fmc/fmc-core.c index 5263d0607b64..ef6d8acb0a81 100644 --- a/drivers/fmc/fmc-core.c +++ b/drivers/fmc/fmc-core.c @@ -13,6 +13,9 @@ #include #include #include +#include + +#include "fmc-private.h" static int fmc_check_version(unsigned long version, const char *name) { @@ -289,7 +292,7 @@ int fmc_device_register_n(struct fmc_device **devs, int n) } /* This device went well, give information to the user */ fmc_dump_eeprom(fmc); - fmc_dump_sdb(fmc); + fmc_debug_init(fmc); } return 0; @@ -301,6 +304,7 @@ out: kfree(devarray); for (i--; i >= 0; i--) { + fmc_debug_exit(devs[i]); sysfs_remove_bin_file(&devs[i]->dev.kobj, &fmc_eeprom_attr); device_del(&devs[i]->dev); fmc_free_id_info(devs[i]); @@ -328,6 +332,7 @@ void fmc_device_unregister_n(struct fmc_device **devs, int n) kfree(devs[0]->devarray); for (i = 0; i < n; i++) { + fmc_debug_exit(devs[i]); sysfs_remove_bin_file(&devs[i]->dev.kobj, &fmc_eeprom_attr); device_del(&devs[i]->dev); fmc_free_id_info(devs[i]); diff --git a/drivers/fmc/fmc-debug.c b/drivers/fmc/fmc-debug.c new file mode 100644 index 000000000000..32930722770c --- /dev/null +++ b/drivers/fmc/fmc-debug.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2015 CERN (www.cern.ch) + * Author: Federico Vaga + * + * Released according to the GNU GPL, version 2 or any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define FMC_DBG_SDB_DUMP "dump_sdb" + +static char *__strip_trailing_space(char *buf, char *str, int len) +{ + int i = len - 1; + + memcpy(buf, str, len); + buf[len] = '\0'; + while (i >= 0 && buf[i] == ' ') + buf[i--] = '\0'; + return buf; +} + +#define __sdb_string(buf, field) ({ \ + BUILD_BUG_ON(sizeof(buf) < sizeof(field)); \ + __strip_trailing_space(buf, (void *)(field), sizeof(field)); \ + }) + +/** + * We do not check seq_printf() errors because we want to see things in any case + */ +static void fmc_sdb_dump_recursive(struct fmc_device *fmc, struct seq_file *s, + const struct sdb_array *arr) +{ + unsigned long base = arr->baseaddr; + int i, j, n = arr->len, level = arr->level; + char tmp[64]; + + for (i = 0; i < n; i++) { + union sdb_record *r; + struct sdb_product *p; + struct sdb_component *c; + + r = &arr->record[i]; + c = &r->dev.sdb_component; + p = &c->product; + + for (j = 0; j < level; j++) + seq_printf(s, " "); + switch (r->empty.record_type) { + case sdb_type_interconnect: + seq_printf(s, "%08llx:%08x %.19s\n", + __be64_to_cpu(p->vendor_id), + __be32_to_cpu(p->device_id), + p->name); + break; + case sdb_type_device: + seq_printf(s, "%08llx:%08x %.19s (%08llx-%08llx)\n", + __be64_to_cpu(p->vendor_id), + __be32_to_cpu(p->device_id), + p->name, + __be64_to_cpu(c->addr_first) + base, + __be64_to_cpu(c->addr_last) + base); + break; + case sdb_type_bridge: + seq_printf(s, "%08llx:%08x %.19s (bridge: %08llx)\n", + __be64_to_cpu(p->vendor_id), + __be32_to_cpu(p->device_id), + p->name, + __be64_to_cpu(c->addr_first) + base); + if (IS_ERR(arr->subtree[i])) { + seq_printf(s, "SDB: (bridge error %li)\n", + PTR_ERR(arr->subtree[i])); + break; + } + fmc_sdb_dump_recursive(fmc, s, arr->subtree[i]); + break; + case sdb_type_integration: + seq_printf(s, "integration\n"); + break; + case sdb_type_repo_url: + seq_printf(s, "Synthesis repository: %s\n", + __sdb_string(tmp, r->repo_url.repo_url)); + break; + case sdb_type_synthesis: + seq_printf(s, "Bitstream '%s' ", + __sdb_string(tmp, r->synthesis.syn_name)); + seq_printf(s, "synthesized %08x by %s ", + __be32_to_cpu(r->synthesis.date), + __sdb_string(tmp, r->synthesis.user_name)); + seq_printf(s, "(%s version %x), ", + __sdb_string(tmp, r->synthesis.tool_name), + __be32_to_cpu(r->synthesis.tool_version)); + seq_printf(s, "commit %pm\n", + r->synthesis.commit_id); + break; + case sdb_type_empty: + seq_printf(s, "empty\n"); + break; + default: + seq_printf(s, "UNKNOWN TYPE 0x%02x\n", + r->empty.record_type); + break; + } + } +} + +static int fmc_sdb_dump(struct seq_file *s, void *offset) +{ + struct fmc_device *fmc = s->private; + + if (!fmc->sdb) { + seq_printf(s, "no SDB information\n"); + return 0; + } + + seq_printf(s, "FMC: %s (%s), slot %i, device %s\n", dev_name(fmc->hwdev), + fmc->carrier_name, fmc->slot_id, dev_name(&fmc->dev)); + /* Dump SDB information */ + fmc_sdb_dump_recursive(fmc, s, fmc->sdb); + + return 0; +} + + +static int fmc_sdb_dump_open(struct inode *inode, struct file *file) +{ + struct fmc_device *fmc = inode->i_private; + + return single_open(file, fmc_sdb_dump, fmc); +} + + +const struct file_operations fmc_dbgfs_sdb_dump = { + .owner = THIS_MODULE, + .open = fmc_sdb_dump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int fmc_debug_init(struct fmc_device *fmc) +{ + fmc->dbg_dir = debugfs_create_dir(dev_name(&fmc->dev), NULL); + if (IS_ERR_OR_NULL(fmc->dbg_dir)) { + 
pr_err("FMC: Cannot create debugfs\n"); + return PTR_ERR(fmc->dbg_dir); + } + + fmc->dbg_sdb_dump = debugfs_create_file(FMC_DBG_SDB_DUMP, 0444, + fmc->dbg_dir, fmc, + &fmc_dbgfs_sdb_dump); + if (IS_ERR_OR_NULL(fmc->dbg_sdb_dump)) + pr_err("FMC: Cannot create debugfs file %s\n", + FMC_DBG_SDB_DUMP); + + return 0; +} + +void fmc_debug_exit(struct fmc_device *fmc) +{ + if (fmc->dbg_dir) + debugfs_remove_recursive(fmc->dbg_dir); +} diff --git a/drivers/fmc/fmc-dump.c b/drivers/fmc/fmc-dump.c index c91afd6388f6..cd1df475b254 100644 --- a/drivers/fmc/fmc-dump.c +++ b/drivers/fmc/fmc-dump.c @@ -15,8 +15,6 @@ static int fmc_must_dump_eeprom; module_param_named(dump_eeprom, fmc_must_dump_eeprom, int, 0644); -static int fmc_must_dump_sdb; -module_param_named(dump_sdb, fmc_must_dump_sdb, int, 0644); #define LINELEN 16 @@ -59,42 +57,3 @@ void fmc_dump_eeprom(const struct fmc_device *fmc) for (i = 0; i < fmc->eeprom_len; i += LINELEN, line += LINELEN) prev = dump_line(i, line, prev); } - -void fmc_dump_sdb(const struct fmc_device *fmc) -{ - const uint8_t *line, *prev; - int i, len; - - if (!fmc->sdb) - return; - if (!fmc_must_dump_sdb) - return; - - /* If the argument is not-zero, do simple dump (== show) */ - if (fmc_must_dump_sdb > 0) - fmc_show_sdb_tree(fmc); - - if (fmc_must_dump_sdb == 1) - return; - - /* If bigger than 1, dump it seriously, to help debugging */ - - /* - * Here we should really use libsdbfs (which is designed to - * work in kernel space as well) , but it doesn't support - * directories yet, and it requires better intergration (it - * should be used instead of fmc-specific code). - * - * So, lazily, just dump the top-level array - */ - pr_info("FMC: %s (%s), slot %i, device %s\n", dev_name(fmc->hwdev), - fmc->carrier_name, fmc->slot_id, dev_name(&fmc->dev)); - pr_info("FMC: poor dump of sdb first level:\n"); - - len = fmc->sdb->len * sizeof(union sdb_record); - line = (void *)fmc->sdb->record; - prev = NULL; - for (i = 0; i < len; i += LINELEN, line += LINELEN) - prev = dump_line(i, line, prev); - return; -} diff --git a/drivers/fmc/fmc-private.h b/drivers/fmc/fmc-private.h new file mode 100644 index 000000000000..1e5136643bdc --- /dev/null +++ b/drivers/fmc/fmc-private.h @@ -0,0 +1,9 @@ +/* + * Copyright (C) 2015 CERN (www.cern.ch) + * Author: Federico Vaga + * + * Released according to the GNU GPL, version 2 or any later version. 
+ */ + +extern int fmc_debug_init(struct fmc_device *fmc); +extern void fmc_debug_exit(struct fmc_device *fmc); diff --git a/drivers/fmc/fmc-sdb.c b/drivers/fmc/fmc-sdb.c index 4603fdb74465..89e37a6cfc66 100644 --- a/drivers/fmc/fmc-sdb.c +++ b/drivers/fmc/fmc-sdb.c @@ -145,108 +145,15 @@ int fmc_reprogram(struct fmc_device *fmc, struct fmc_driver *d, char *gw, sdb_entry); return -ENODEV; } - fmc_dump_sdb(fmc); + return 0; } EXPORT_SYMBOL(fmc_reprogram); -static char *__strip_trailing_space(char *buf, char *str, int len) -{ - int i = len - 1; - - memcpy(buf, str, len); - while(i >= 0 && buf[i] == ' ') - buf[i--] = '\0'; - return buf; -} - -#define __sdb_string(buf, field) ({ \ - BUILD_BUG_ON(sizeof(buf) < sizeof(field)); \ - __strip_trailing_space(buf, (void *)(field), sizeof(field)); \ - }) - -static void __fmc_show_sdb_tree(const struct fmc_device *fmc, - const struct sdb_array *arr) -{ - unsigned long base = arr->baseaddr; - int i, j, n = arr->len, level = arr->level; - char buf[64]; - - for (i = 0; i < n; i++) { - union sdb_record *r; - struct sdb_product *p; - struct sdb_component *c; - r = &arr->record[i]; - c = &r->dev.sdb_component; - p = &c->product; - - dev_info(&fmc->dev, "SDB: "); - - for (j = 0; j < level; j++) - printk(KERN_CONT " "); - switch (r->empty.record_type) { - case sdb_type_interconnect: - printk(KERN_CONT "%08llx:%08x %.19s\n", - __be64_to_cpu(p->vendor_id), - __be32_to_cpu(p->device_id), - p->name); - break; - case sdb_type_device: - printk(KERN_CONT "%08llx:%08x %.19s (%08llx-%08llx)\n", - __be64_to_cpu(p->vendor_id), - __be32_to_cpu(p->device_id), - p->name, - __be64_to_cpu(c->addr_first) + base, - __be64_to_cpu(c->addr_last) + base); - break; - case sdb_type_bridge: - printk(KERN_CONT "%08llx:%08x %.19s (bridge: %08llx)\n", - __be64_to_cpu(p->vendor_id), - __be32_to_cpu(p->device_id), - p->name, - __be64_to_cpu(c->addr_first) + base); - if (IS_ERR(arr->subtree[i])) { - dev_info(&fmc->dev, "SDB: (bridge error %li)\n", - PTR_ERR(arr->subtree[i])); - break; - } - __fmc_show_sdb_tree(fmc, arr->subtree[i]); - break; - case sdb_type_integration: - printk(KERN_CONT "integration\n"); - break; - case sdb_type_repo_url: - printk(KERN_CONT "Synthesis repository: %s\n", - __sdb_string(buf, r->repo_url.repo_url)); - break; - case sdb_type_synthesis: - printk(KERN_CONT "Bitstream '%s' ", - __sdb_string(buf, r->synthesis.syn_name)); - printk(KERN_CONT "synthesized %08x by %s ", - __be32_to_cpu(r->synthesis.date), - __sdb_string(buf, r->synthesis.user_name)); - printk(KERN_CONT "(%s version %x), ", - __sdb_string(buf, r->synthesis.tool_name), - __be32_to_cpu(r->synthesis.tool_version)); - printk(KERN_CONT "commit %pm\n", - r->synthesis.commit_id); - break; - case sdb_type_empty: - printk(KERN_CONT "empty\n"); - break; - default: - printk(KERN_CONT "UNKNOWN TYPE 0x%02x\n", - r->empty.record_type); - break; - } - } -} - void fmc_show_sdb_tree(const struct fmc_device *fmc) { - if (!fmc->sdb) - return; - __fmc_show_sdb_tree(fmc, fmc->sdb); + pr_err("%s: not supported anymore, use debugfs to dump SDB\n", + __func__); } EXPORT_SYMBOL(fmc_show_sdb_tree); diff --git a/include/linux/fmc.h b/include/linux/fmc.h index 8bb4a154bbdc..5c8df0c49fc3 100644 --- a/include/linux/fmc.h +++ b/include/linux/fmc.h @@ -180,6 +180,9 @@ struct fmc_device { uint32_t device_id; /* Filled by the device */ char *mezzanine_name; /* Defaults to ``fmc'' */ void *mezzanine_data; + + struct dentry *dbg_dir; + struct dentry *dbg_sdb_dump; }; #define to_fmc_device(x) container_of((x), struct fmc_device, dev) @@ 
-232,7 +235,6 @@ extern int fmc_match(struct device *dev, struct device_driver *drv); extern int fmc_fill_id_info(struct fmc_device *fmc); extern void fmc_free_id_info(struct fmc_device *fmc); extern void fmc_dump_eeprom(const struct fmc_device *fmc); -extern void fmc_dump_sdb(const struct fmc_device *fmc); /* helpers for FMC operations */ extern int fmc_irq_request(struct fmc_device *fmc, irq_handler_t h, -- cgit v1.2.3 From 15b1b0f0d87e34a87172d9b83bd260543a991c2e Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Tue, 18 Jul 2017 08:33:13 +0200 Subject: drivers/fmc: change registration prototype Permit use of either fmc_device_register_n or fmc_device_register_n_gw depending on the type of device in use. Signed-off-by: Federico Vaga Tested-by: Pat Riehecky Acked-by: Alessandro Rubini Signed-off-by: Greg Kroah-Hartman --- drivers/fmc/fmc-core.c | 15 ++++++++++++++- include/linux/fmc.h | 13 +++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/fmc/fmc-core.c b/drivers/fmc/fmc-core.c index ef6d8acb0a81..eabeac04ddda 100644 --- a/drivers/fmc/fmc-core.c +++ b/drivers/fmc/fmc-core.c @@ -199,7 +199,8 @@ EXPORT_SYMBOL(fmc_driver_unregister); * When a device set is registered, all eeproms must be read * and all FRUs must be parsed */ -int fmc_device_register_n(struct fmc_device **devs, int n) +int fmc_device_register_n_gw(struct fmc_device **devs, int n, + struct fmc_gateware *gw) { struct fmc_device *fmc, **devarray; uint32_t device_id; @@ -313,8 +314,20 @@ out: return ret; } +EXPORT_SYMBOL(fmc_device_register_n_gw); + +int fmc_device_register_n(struct fmc_device **devs, int n) +{ + return fmc_device_register_n_gw(devs, n, NULL); +} EXPORT_SYMBOL(fmc_device_register_n); +int fmc_device_register_gw(struct fmc_device *fmc, struct fmc_gateware *gw) +{ + return fmc_device_register_n_gw(&fmc, 1, gw); +} +EXPORT_SYMBOL(fmc_device_register_gw); + int fmc_device_register(struct fmc_device *fmc) { return fmc_device_register_n(&fmc, 1); diff --git a/include/linux/fmc.h b/include/linux/fmc.h index 5c8df0c49fc3..b6c73d54ca46 100644 --- a/include/linux/fmc.h +++ b/include/linux/fmc.h @@ -220,14 +220,23 @@ static inline void fmc_set_drvdata(struct fmc_device *fmc, void *data) dev_set_drvdata(&fmc->dev, data); } -/* The 4 access points */ +struct fmc_gateware { + void *bitstream; + unsigned long len; +}; + +/* The 5 access points */ extern int fmc_driver_register(struct fmc_driver *drv); extern void fmc_driver_unregister(struct fmc_driver *drv); extern int fmc_device_register(struct fmc_device *tdev); +extern int fmc_device_register_gw(struct fmc_device *tdev, + struct fmc_gateware *gw); extern void fmc_device_unregister(struct fmc_device *tdev); -/* Two more for device sets, all driven by the same FPGA */ +/* Three more for device sets, all driven by the same FPGA */ extern int fmc_device_register_n(struct fmc_device **devs, int n); +extern int fmc_device_register_n_gw(struct fmc_device **devs, int n, + struct fmc_gateware *gw); extern void fmc_device_unregister_n(struct fmc_device **devs, int n); /* Internal cross-calls between files; not exported to other modules */ -- cgit v1.2.3 From 9c0dda14951e9ed30b6f04e14bf64a3c69686c5b Mon Sep 17 00:00:00 2001 From: Federico Vaga Date: Tue, 18 Jul 2017 08:33:24 +0200 Subject: drivers/fmc: carrier can program FPGA on registration The initial FPGA may require programming before it is useful. 
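For orientation, here is a minimal sketch (an editor's illustration, not part of either patch) of how a carrier driver might hand a known-good bitstream to the core at registration time using fmc_device_register_gw() from the previous patch. The my_carrier_register() wrapper, its firmware argument and the error handling are hypothetical; per the diff below, the core feeds the bitstream to the carrier's reprogram_raw operation before the device is added:

#include <linux/firmware.h>
#include <linux/fmc.h>

/* Hypothetical helper: register one mezzanine with a preloaded bitstream. */
static int my_carrier_register(struct fmc_device *fmc,
			       const struct firmware *fw)
{
	struct fmc_gateware gw = {
		.bitstream = (void *)fw->data,	/* handed to op->reprogram_raw */
		.len = fw->size,
	};

	/* The FPGA is programmed before device_add() makes the device visible */
	return fmc_device_register_gw(fmc, &gw);
}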
Signed-off-by: Federico Vaga Tested-by: Pat Riehecky Acked-by: Alessandro Rubini Signed-off-by: Greg Kroah-Hartman --- drivers/fmc/fmc-core.c | 18 +++++++++++++++--- drivers/fmc/fmc-sdb.c | 24 ++++++++++++++++++++++++ include/linux/fmc.h | 4 ++++ 3 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/fmc/fmc-core.c b/drivers/fmc/fmc-core.c index eabeac04ddda..cec3b8db0d69 100644 --- a/drivers/fmc/fmc-core.c +++ b/drivers/fmc/fmc-core.c @@ -280,6 +280,21 @@ int fmc_device_register_n_gw(struct fmc_device **devs, int n, else dev_set_name(&fmc->dev, "%s-%04x", fmc->mezzanine_name, device_id); + + if (gw) { + /* + * The carrier already knows the bitstream to load + * for this set of FMC mezzanines. + */ + ret = fmc->op->reprogram_raw(fmc, NULL, + gw->bitstream, gw->len); + if (ret) { + dev_warn(fmc->hwdev, + "Invalid gateware for FMC mezzanine\n"); + goto out; + } + } + ret = device_add(&fmc->dev); if (ret < 0) { dev_err(fmc->hwdev, "Slot %i: Failed in registering " @@ -300,9 +315,6 @@ int fmc_device_register_n_gw(struct fmc_device **devs, int n, out1: device_del(&fmc->dev); out: - fmc_free_id_info(fmc); - put_device(&fmc->dev); - kfree(devarray); for (i--; i >= 0; i--) { fmc_debug_exit(devs[i]); diff --git a/drivers/fmc/fmc-sdb.c b/drivers/fmc/fmc-sdb.c index 89e37a6cfc66..ffdc1762b580 100644 --- a/drivers/fmc/fmc-sdb.c +++ b/drivers/fmc/fmc-sdb.c @@ -126,6 +126,30 @@ int fmc_free_sdb_tree(struct fmc_device *fmc) } EXPORT_SYMBOL(fmc_free_sdb_tree); +/* This helper calls reprogram and initializes the sdb as well */ +int fmc_reprogram_raw(struct fmc_device *fmc, struct fmc_driver *d, + void *gw, unsigned long len, int sdb_entry) +{ + int ret; + + ret = fmc->op->reprogram_raw(fmc, d, gw, len); + if (ret < 0) + return ret; + if (sdb_entry < 0) + return ret; + + /* We are required to find SDB at a given offset */ + ret = fmc_scan_sdb_tree(fmc, sdb_entry); + if (ret < 0) { + dev_err(&fmc->dev, "Can't find SDB at address 0x%x\n", + sdb_entry); + return -ENODEV; + } + + return 0; +} +EXPORT_SYMBOL(fmc_reprogram_raw); + /* This helper calls reprogram and initializes the sdb as well */ int fmc_reprogram(struct fmc_device *fmc, struct fmc_driver *d, char *gw, int sdb_entry) diff --git a/include/linux/fmc.h b/include/linux/fmc.h index b6c73d54ca46..3dc8a1b2db7b 100644 --- a/include/linux/fmc.h +++ b/include/linux/fmc.h @@ -132,6 +132,8 @@ struct fmc_operations { uint32_t (*read32)(struct fmc_device *fmc, int offset); void (*write32)(struct fmc_device *fmc, uint32_t value, int offset); int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv); + int (*reprogram_raw)(struct fmc_device *f, struct fmc_driver *d, + void *gw, unsigned long len); int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw); int (*irq_request)(struct fmc_device *fmc, irq_handler_t h, char *name, int flags); @@ -144,6 +146,8 @@ struct fmc_operations { }; /* Prefer this helper rather than calling of fmc->reprogram directly */ +int fmc_reprogram_raw(struct fmc_device *fmc, struct fmc_driver *d, + void *gw, unsigned long len, int sdb_entry); extern int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw, int sdb_entry); -- cgit v1.2.3 From 6a7a81761bee2a6694c0a5fb1a16b263c96241c1 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 24 Aug 2017 16:09:10 -0600 Subject: driver core: Document struct device:dma_ops Commit 5657933dbb6e (treewide: Move dma_ops from struct dev_archdata into struct device) added the dma_ops field to struct device, but did not update the 
kerneldoc comment, yielding this warning: ./include/linux/device.h:969: warning: No description found for parameter 'dma_ops' Add a description and bring a little peace to the world. Signed-off-by: Jonathan Corbet Reviewed-by: Bart Van Assche Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 5db26b9c0fcc..d5bcc5f109eb 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -847,6 +847,7 @@ struct dev_links_info { * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. + * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations -- cgit v1.2.3 From ddc088238cd6988bb4ac3776f403d7ff9d3c7a63 Mon Sep 17 00:00:00 2001 From: Pawel Baldysiak Date: Wed, 16 Aug 2017 17:13:45 +0200 Subject: md: Runtime support for multiple ppls Increase the PPL area to 1MB and use it as a circular buffer to store PPL entries. The entry with the highest generation number is the latest one. If the PPL to be written is larger than the space left in the buffer, rewind the buffer to the start (don't wrap it). Signed-off-by: Pawel Baldysiak Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 +++++++++++++--- drivers/md/md.h | 1 + drivers/md/raid0.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/md/raid5-ppl.c | 43 +++++++++++++++++++++++++++++++++++++++--- drivers/md/raid5.c | 1 + include/uapi/linux/raid/md_p.h | 4 +++- 7 files changed, 62 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/md/md.c b/drivers/md/md.c index a74dc9963822..a7876237de10 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1536,7 +1536,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if ((le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); rdev->ppl.size = le16_to_cpu(sb->ppl.size); rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; @@ -1655,10 +1656,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); - if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) { + if (le32_to_cpu(sb->feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { if (le32_to_cpu(sb->feature_map) & (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) return -EINVAL; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && + (le32_to_cpu(sb->feature_map) & + MD_FEATURE_MULTIPLE_PPLS)) + return -EINVAL; set_bit(MD_HAS_PPL, &mddev->flags); } } else if (mddev->pers == NULL) { @@ -1875,7 +1881,11 @@ retry: sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); if (test_bit(MD_HAS_PPL, &mddev->flags)) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); + if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); + else + sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); sb->ppl.size = cpu_to_le16(rdev->ppl.size); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 09db03455801..d4bdfa5c223b 
100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -236,6 +236,7 @@ enum mddev_flags { * never cause the array to become failed. */ MD_HAS_PPL, /* The raid array has PPL feature set */ + MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ }; enum mddev_sb_flags { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6fb81704aff4..fd5e8e5efbef 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -30,7 +30,8 @@ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ (1L << MD_FAILFAST_SUPPORTED) |\ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) static int raid0_congested(struct mddev *mddev, int bits) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 79474f47eeef..1f5bd9475dc1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -48,7 +48,8 @@ #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ (1L << MD_JOURNAL_CLEAN) | \ - (1L << MD_HAS_PPL)) + (1L << MD_HAS_PPL) | \ + (1L << MD_HAS_MULTIPLE_PPLS)) /* * Number of guaranteed r1bios in case of extreme VM load: diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 44ad5baf3206..b313f17a6260 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -87,6 +87,8 @@ * The current io_unit accepting new stripes is always at the end of the list. */ +#define PPL_SPACE_SIZE (128 * 1024) + struct ppl_conf { struct mddev *mddev; @@ -122,6 +124,10 @@ struct ppl_log { * always at the end of io_list */ spinlock_t io_list_lock; struct list_head io_list; /* all io_units of this log */ + + sector_t next_io_sector; + unsigned int entry_space; + bool use_multippl; }; #define PPL_IO_INLINE_BVECS 32 @@ -264,13 +270,12 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) int i; sector_t data_sector = 0; int data_disks = 0; - unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE; struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); /* check if current io_unit is full */ - if (io && (io->pp_size == entry_space || + if (io && (io->pp_size == log->entry_space || io->entries_count == PPL_HDR_MAX_ENTRIES)) { pr_debug("%s: add io_unit blocked by seq: %llu\n", __func__, io->seq); @@ -451,12 +456,25 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) pplhdr->entries_count = cpu_to_le32(io->entries_count); pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + /* Rewind the buffer if current PPL is larger than the remaining space */ + if (log->use_multippl && + log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < + (PPL_HEADER_SIZE + io->pp_size) >> 9) + log->next_io_sector = log->rdev->ppl.sector; + + bio->bi_end_io = ppl_log_endio; bio->bi_opf = REQ_OP_WRITE | REQ_FUA; bio->bi_bdev = log->rdev->bdev; - bio->bi_iter.bi_sector = log->rdev->ppl.sector; + bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + pr_debug("%s: log->current_io_sector: %llu\n", __func__, + (unsigned long long)log->next_io_sector); + + if (log->use_multippl) + log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; + list_for_each_entry(sh, &io->stripe_list, log_list) { /* entries for full stripe writes have no partial parity */ if (test_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -1031,6 +1049,7 @@ static int ppl_load(struct ppl_conf *ppl_conf) static void __ppl_exit_log(struct ppl_conf *ppl_conf) { clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); 
kfree(ppl_conf->child_logs); @@ -1099,6 +1118,22 @@ static int ppl_validate_rdev(struct md_rdev *rdev) return 0; } +static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) +{ + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + + PPL_HEADER_SIZE) * 2) { + log->use_multippl = true; + set_bit(MD_HAS_MULTIPLE_PPLS, + &log->ppl_conf->mddev->flags); + log->entry_space = PPL_SPACE_SIZE; + } else { + log->use_multippl = false; + log->entry_space = (log->rdev->ppl.size << 9) - + PPL_HEADER_SIZE; + } + log->next_io_sector = rdev->ppl.sector; +} + int ppl_init_log(struct r5conf *conf) { struct ppl_conf *ppl_conf; @@ -1196,6 +1231,7 @@ int ppl_init_log(struct r5conf *conf) q = bdev_get_queue(rdev->bdev); if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) need_cache_flush = true; + ppl_init_child_log(log, rdev); } } @@ -1261,6 +1297,7 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) if (!ret) { log->rdev = rdev; ret = ppl_write_empty_header(log); + ppl_init_child_log(log, rdev); } } else { log->rdev = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6af57c6c0533..049a958d3c1e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7236,6 +7236,7 @@ static int raid5_run(struct mddev *mddev) pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", mdname(mddev)); clear_bit(MD_HAS_PPL, &mddev->flags); + clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags); } if (mddev->private == NULL) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index d500bd224979..b9197976b660 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -324,9 +324,10 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ -#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_PPL 1024 /* support PPL */ +#define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -338,6 +339,7 @@ struct mdp_superblock_1 { |MD_FEATURE_CLUSTERED \ |MD_FEATURE_JOURNAL \ |MD_FEATURE_PPL \ + |MD_FEATURE_MULTIPLE_PPLS \ ) struct r5l_payload_header { -- cgit v1.2.3 From e0552573575ef43daaee5b1a139952d6127f088c Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 19 Jul 2017 23:35:29 -0300 Subject: misc: eeprom_93xx46: Include eeprom_93xx46_platform_data struct has a 'struct gpio_desc' type member, so it is better to include , which provides 'struct gpio_desc' type. Signed-off-by: Fabio Estevam Signed-off-by: Greg Kroah-Hartman --- include/linux/eeprom_93xx46.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/eeprom_93xx46.h b/include/linux/eeprom_93xx46.h index 885f587a3555..915898759280 100644 --- a/include/linux/eeprom_93xx46.h +++ b/include/linux/eeprom_93xx46.h @@ -2,8 +2,7 @@ * Module: eeprom_93xx46 * platform description for 93xx46 EEPROMs. */ - -struct gpio_desc; +#include struct eeprom_93xx46_platform_data { unsigned char flags; -- cgit v1.2.3 From d3cc8ca3ab6c1d78c26102870313fa36c48ea247 Mon Sep 17 00:00:00 2001 From: Ulrich Hecht Date: Tue, 22 Aug 2017 08:27:07 +0200 Subject: mux: include compiler.h from mux/consumer.h Required for __must_check. 
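Stepping back to the multiple-PPL change above: the rewind decision in ppl_submit_iounit() is easy to miss, so here is a small stand-alone sketch (an editor's illustration in plain C, with hypothetical numbers) of the same arithmetic. Sizes are in 512-byte sectors, and PPL_HEADER_SIZE mirrors the kernel's 4 KiB header:

#include <stdio.h>

#define PPL_HEADER_SIZE 4096UL

/* Return the sector the next PPL write should start at: either the
 * current cursor, or the base of the area if the entry would not fit
 * in the space remaining before the end (rewind, never wrap). */
static unsigned long next_ppl_sector(unsigned long ppl_base,
				     unsigned long ppl_sectors,
				     unsigned long next_io,
				     unsigned long pp_size_bytes)
{
	unsigned long need = (PPL_HEADER_SIZE + pp_size_bytes) >> 9;

	if (ppl_base + ppl_sectors - next_io < need)
		return ppl_base;	/* rewind to the start */
	return next_io;
}

int main(void)
{
	/* 1 MB area (2048 sectors) at sector 2048, cursor near the end:
	 * 96 sectors remain but a 64 KiB entry needs 136, so we rewind. */
	unsigned long s = next_ppl_sector(2048, 2048, 4000, 64 * 1024);

	printf("write at sector %lu\n", s);	/* prints 2048 */
	return 0;
}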
Signed-off-by: Ulrich Hecht Signed-off-by: Peter Rosin Signed-off-by: Greg Kroah-Hartman --- include/linux/mux/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mux/consumer.h b/include/linux/mux/consumer.h index 5577e1b773c4..ea96d4c82be7 100644 --- a/include/linux/mux/consumer.h +++ b/include/linux/mux/consumer.h @@ -13,6 +13,8 @@ #ifndef _LINUX_MUX_CONSUMER_H #define _LINUX_MUX_CONSUMER_H +#include + struct device; struct mux_control; -- cgit v1.2.3 From 242b476f821b055ed0fb70a8eb6defa85baada9c Mon Sep 17 00:00:00 2001 From: Johannes Poehlmann Date: Tue, 25 Jul 2017 13:27:11 +0200 Subject: w1: ds1wm: fix register offset (bus shift) calculation Replace the incorrect register offset calculation with direct configuration of bus_shift in the MFD cell. The indirect definition of the address shift via the resource size was non-obvious and wrong (it should have used a binary log). Signed-off-by: Johannes Poehlmann Acked-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- drivers/w1/masters/ds1wm.c | 23 +++++++++++++++++++---- include/linux/mfd/ds1wm.h | 24 +++++++++++++++++------- 2 files changed, 36 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c index fd2e9da27c4b..401e53e0482a 100644 --- a/drivers/w1/masters/ds1wm.c +++ b/drivers/w1/masters/ds1wm.c @@ -95,7 +95,7 @@ static struct { struct ds1wm_data { void __iomem *map; - int bus_shift; /* # of shifts to calc register offsets */ + unsigned int bus_shift; /* # of shifts to calc register offsets */ struct platform_device *pdev; const struct mfd_cell *cell; int irq; @@ -473,9 +473,6 @@ static int ds1wm_probe(struct platform_device *pdev) if (!ds1wm_data->map) return -ENOMEM; - /* calculate bus shift from mem resource */ - ds1wm_data->bus_shift = resource_size(res) >> 3; - ds1wm_data->pdev = pdev; ds1wm_data->cell = mfd_get_cell(pdev); if (!ds1wm_data->cell) @@ -484,6 +481,24 @@ static int ds1wm_probe(struct platform_device *pdev) if (!plat) return -ENODEV; + /* how many bits to shift register number to get register offset */ + if (plat->bus_shift > 2) { + dev_err(&ds1wm_data->pdev->dev, + "illegal bus shift %d, not written", + plat->bus_shift); + return -EINVAL; + } + + ds1wm_data->bus_shift = plat->bus_shift; + /* make sure resource has space for 8 registers */ + if ((8 << ds1wm_data->bus_shift) > resource_size(res)) { + dev_err(&ds1wm_data->pdev->dev, + "memory resource size %d too small, should be %d\n", + (int)resource_size(res), + 8 << ds1wm_data->bus_shift); + return -EINVAL; + } + res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!res) return -ENXIO; diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h index 38a372a0e285..79a01e8dc83f 100644 --- a/include/linux/mfd/ds1wm.h +++ b/include/linux/mfd/ds1wm.h @@ -1,13 +1,23 @@ -/* MFD cell driver data for the DS1WM driver */ +/* MFD cell driver data for the DS1WM driver + * + * to be defined in the MFD device that is + * using this driver for one of its sub devices + */ struct ds1wm_driver_data { int active_high; int clock_rate; - /* in milliseconds, the amount of time to */ - /* sleep following a reset pulse. Zero */ - /* should work if your bus devices recover*/ - /* time respects the 1-wire spec since the*/ - /* ds1wm implements the precise timings of*/ - /* a reset pulse/presence detect sequence.*/ + /* in milliseconds, the amount of time to + * sleep following a reset pulse. 
Zero + * should work if your bus devices recover + * time respects the 1-wire spec since the + * ds1wm implements the precise timings of + * a reset pulse/presence detect sequence. + */ unsigned int reset_recover_delay; + + /* left shift of register number to get register address offsett. + * Only 0,1,2 allowed for 8,16 or 32 bit bus width respectively + */ + unsigned int bus_shift; }; -- cgit v1.2.3 From baa8055de0293a91f87bef5f32296d9ddcba9c56 Mon Sep 17 00:00:00 2001 From: Johannes Poehlmann Date: Tue, 25 Jul 2017 13:27:12 +0200 Subject: w1: ds1wm: make endian clean and use standard io memory accessors o Make endian clean, make HW-endianness configurable. o Use ioread*, iowrite* instead of __raw_readb,__raw_writeb to also use memory-barriers when accessing HW-registers. We do not want reordering to happen here. Both changes are tightly coupled, so I do them in one patch Signed-off-by: Johannes Poehlmann Acked-by: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- drivers/w1/masters/ds1wm.c | 60 ++++++++++++++++++++++++++++++++++++++++++++-- include/linux/mfd/ds1wm.h | 5 ++++ 2 files changed, 63 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/w1/masters/ds1wm.c b/drivers/w1/masters/ds1wm.c index 401e53e0482a..d15575dfbc48 100644 --- a/drivers/w1/masters/ds1wm.c +++ b/drivers/w1/masters/ds1wm.c @@ -96,6 +96,7 @@ static struct { struct ds1wm_data { void __iomem *map; unsigned int bus_shift; /* # of shifts to calc register offsets */ + bool is_hw_big_endian; struct platform_device *pdev; const struct mfd_cell *cell; int irq; @@ -115,12 +116,65 @@ struct ds1wm_data { static inline void ds1wm_write_register(struct ds1wm_data *ds1wm_data, u32 reg, u8 val) { - __raw_writeb(val, ds1wm_data->map + (reg << ds1wm_data->bus_shift)); + if (ds1wm_data->is_hw_big_endian) { + switch (ds1wm_data->bus_shift) { + case 0: + iowrite8(val, ds1wm_data->map + (reg << 0)); + break; + case 1: + iowrite16be((u16)val, ds1wm_data->map + (reg << 1)); + break; + case 2: + iowrite32be((u32)val, ds1wm_data->map + (reg << 2)); + break; + } + } else { + switch (ds1wm_data->bus_shift) { + case 0: + iowrite8(val, ds1wm_data->map + (reg << 0)); + break; + case 1: + iowrite16((u16)val, ds1wm_data->map + (reg << 1)); + break; + case 2: + iowrite32((u32)val, ds1wm_data->map + (reg << 2)); + break; + } + } } static inline u8 ds1wm_read_register(struct ds1wm_data *ds1wm_data, u32 reg) { - return __raw_readb(ds1wm_data->map + (reg << ds1wm_data->bus_shift)); + u32 val = 0; + + if (ds1wm_data->is_hw_big_endian) { + switch (ds1wm_data->bus_shift) { + case 0: + val = ioread8(ds1wm_data->map + (reg << 0)); + break; + case 1: + val = ioread16be(ds1wm_data->map + (reg << 1)); + break; + case 2: + val = ioread32be(ds1wm_data->map + (reg << 2)); + break; + } + } else { + switch (ds1wm_data->bus_shift) { + case 0: + val = ioread8(ds1wm_data->map + (reg << 0)); + break; + case 1: + val = ioread16(ds1wm_data->map + (reg << 1)); + break; + case 2: + val = ioread32(ds1wm_data->map + (reg << 2)); + break; + } + } + dev_dbg(&ds1wm_data->pdev->dev, + "ds1wm_read_register reg: %d, 32 bit val:%x\n", reg, val); + return (u8)val; } @@ -499,6 +553,8 @@ static int ds1wm_probe(struct platform_device *pdev) return -EINVAL; } + ds1wm_data->is_hw_big_endian = plat->is_hw_big_endian; + res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!res) return -ENXIO; diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h index 79a01e8dc83f..2227c6a75d84 100644 --- a/include/linux/mfd/ds1wm.h +++ 
b/include/linux/mfd/ds1wm.h @@ -16,6 +16,11 @@ struct ds1wm_driver_data { */ unsigned int reset_recover_delay; + /* Say 1 here for big endian hardware + * (only relevant with bus-shift > 0) + */ + bool is_hw_big_endian; + /* left shift of register number to get register address offset. * Only 0,1,2 allowed for 8,16 or 32 bit bus width respectively */ unsigned int bus_shift; }; -- cgit v1.2.3 From 7a14724f54bf9889fcb1a9f1d4aa4e1d2e969d93 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 28 Aug 2017 08:33:20 -0700 Subject: libnvdimm: clean up command definitions Remove the command payloads that do not have an associated libnvdimm ioctl. I.e. remove the payloads that would only ever be carried in the ND_CMD_CALL envelope. This prevents userspace from growing unnecessary dependencies on this kernel header when userspace already has everything it needs to craft and send these commands. Cc: Jerry Hoemann Reported-by: Yasunori Goto Signed-off-by: Dan Williams --- include/uapi/linux/ndctl.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index 6d3c54264d8e..3f03567631cb 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -145,43 +145,6 @@ struct nd_cmd_clear_error { __u64 cleared; } __packed; -struct nd_cmd_trans_spa { - __u64 spa; - __u32 status; - __u8 flags; - __u8 _reserved[3]; - __u64 trans_length; - __u32 num_nvdimms; - struct nd_nvdimm_device { - __u32 nfit_device_handle; - __u32 _reserved; - __u64 dpa; - } __packed devices[0]; - -} __packed; - -struct nd_cmd_ars_err_inj { - __u64 err_inj_spa_range_base; - __u64 err_inj_spa_range_length; - __u8 err_inj_options; - __u32 status; -} __packed; - -struct nd_cmd_ars_err_inj_clr { - __u64 err_inj_clr_spa_range_base; - __u64 err_inj_clr_spa_range_length; - __u32 status; -} __packed; - -struct nd_cmd_ars_err_inj_stat { - __u32 status; - __u32 inj_err_rec_count; - struct nd_error_stat_query_record { - __u64 err_inj_stat_spa_range_base; - __u64 err_inj_stat_spa_range_length; - } __packed record[0]; -} __packed; - enum { ND_CMD_IMPLEMENTED = 0, -- cgit v1.2.3 From d026d70a2e9460dc701d20b48672b80a2656006d Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Wed, 26 Jul 2017 11:34:46 +0200 Subject: nvmem: core: Add nvmem_cell_read_u32 This function does a quick and easy read of a u32 value without any kind of resource management code on the consumer side. Signed-off-by: Leonard Crestez Reviewed-by: Shawn Guo Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/nvmem-consumer.h | 7 +++++++ 2 files changed, 44 insertions(+) (limited to 'include') diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index b0c60338296a..72a60dc8a429 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -1110,6 +1110,43 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len) } EXPORT_SYMBOL_GPL(nvmem_cell_write); +/** + * nvmem_cell_read_u32() - Read a cell value as a u32 + * + * @dev: Device that requests the nvmem cell. + * @cell_id: Name of nvmem cell to read. + * @val: pointer to output value. + * + * Return: 0 on success or negative errno. 
+ */ +int nvmem_cell_read_u32(struct device *dev, const char *cell_id, u32 *val) +{ + struct nvmem_cell *cell; + void *buf; + size_t len; + + cell = nvmem_cell_get(dev, cell_id); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + buf = nvmem_cell_read(cell, &len); + if (IS_ERR(buf)) { + nvmem_cell_put(cell); + return PTR_ERR(buf); + } + if (len != sizeof(*val)) { + kfree(buf); + nvmem_cell_put(cell); + return -EINVAL; + } + memcpy(val, buf, sizeof(*val)); + + kfree(buf); + nvmem_cell_put(cell); + return 0; +} +EXPORT_SYMBOL_GPL(nvmem_cell_read_u32); + /** * nvmem_device_cell_read() - Read a given nvmem device and cell * diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index c2256d746543..efafccffaee8 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -35,6 +35,7 @@ void nvmem_cell_put(struct nvmem_cell *cell); void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell); void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len); int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len); +int nvmem_cell_read_u32(struct device *dev, const char *cell_id, u32 *val); /* direct nvmem device read/write interface */ struct nvmem_device *nvmem_device_get(struct device *dev, const char *name); @@ -85,6 +86,12 @@ static inline int nvmem_cell_write(struct nvmem_cell *cell, return -ENOSYS; } +static inline int nvmem_cell_read_u32(struct device *dev, + const char *cell_id, u32 *val) +{ + return -ENOSYS; +} + static inline struct nvmem_device *nvmem_device_get(struct device *dev, const char *name) { -- cgit v1.2.3 From 4da69f49e73e3c79f079de62ea940cabbbf15ae7 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Wed, 26 Jul 2017 11:34:48 +0200 Subject: nvmem: include linux/err.h from header A missing err.h header file can cause errors like the one below in some configurations: "error: implicit declaration of function 'ERR_PTR' [-Werror=implicit-function-declaration]" This adds the missing include to ensure we can always include the header. Signed-off-by: Srinivas Kandagatla Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index efafccffaee8..4e85447f7860 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -12,6 +12,9 @@ #ifndef _LINUX_NVMEM_CONSUMER_H #define _LINUX_NVMEM_CONSUMER_H +#include +#include + struct device; struct device_node; /* consumer cookie */ -- cgit v1.2.3 From df770ff0586a494fabe68ffbe2898d7df5666663 Mon Sep 17 00:00:00 2001 From: Mike Leach Date: Wed, 2 Aug 2017 10:22:19 -0600 Subject: perf: cs-etm: Fix ETMv4 CONFIGR entry in perf.data file The value passed into the perf.data file for the CONFIGR register in ETMv4 was incorrectly being set to the command line options/ETMv3 value. Add bit definitions and a function to remap this value to the correct ETMv4 CONFIGR bit values for all selected options.
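Before moving on, a usage note for the nvmem_cell_read_u32() helper added earlier in this series. A minimal consumer-side sketch follows (an editor's illustration; the probe helper and the "calibration" cell name are hypothetical), showing that the single call replaces the get/read/size-check/put sequence the helper performs internally:

#include <linux/device.h>
#include <linux/nvmem-consumer.h>

static int my_probe_read_cal(struct device *dev)
{
	u32 cal;
	int ret;

	/* One call handles cell lookup, read, size check and cleanup */
	ret = nvmem_cell_read_u32(dev, "calibration", &cal);
	if (ret)
		return ret;

	dev_info(dev, "calibration word: 0x%08x\n", cal);
	return 0;
}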
Signed-off-by: Mike Leach Signed-off-by: Greg Kroah-Hartman --- include/linux/coresight-pmu.h | 5 +++++ tools/include/linux/coresight-pmu.h | 5 +++++ tools/perf/arch/arm/util/cs-etm.c | 28 +++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h index 45852c2cd096..edfeaba95429 100644 --- a/include/linux/coresight-pmu.h +++ b/include/linux/coresight-pmu.h @@ -26,6 +26,11 @@ #define ETM_OPT_TS 28 #define ETM_OPT_RETSTK 29 +/* ETMv4 CONFIGR programming bits for the ETM OPTs */ +#define ETM4_CFG_BIT_CYCACC 4 +#define ETM4_CFG_BIT_TS 11 +#define ETM4_CFG_BIT_RETSTK 12 + static inline int coresight_get_trace_id(int cpu) { /* diff --git a/tools/include/linux/coresight-pmu.h b/tools/include/linux/coresight-pmu.h index 45852c2cd096..edfeaba95429 100644 --- a/tools/include/linux/coresight-pmu.h +++ b/tools/include/linux/coresight-pmu.h @@ -26,6 +26,11 @@ #define ETM_OPT_TS 28 #define ETM_OPT_RETSTK 29 +/* ETMv4 CONFIGR programming bits for the ETM OPTs */ +#define ETM4_CFG_BIT_CYCACC 4 +#define ETM4_CFG_BIT_TS 11 +#define ETM4_CFG_BIT_RETSTK 12 + static inline int coresight_get_trace_id(int cpu) { /* diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 7ce3d1a25133..fbfc055d3f4d 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -266,6 +266,32 @@ static u64 cs_etm_get_config(struct auxtrace_record *itr) return config; } +#ifndef BIT +#define BIT(N) (1UL << (N)) +#endif + +static u64 cs_etmv4_get_config(struct auxtrace_record *itr) +{ + u64 config = 0; + u64 config_opts = 0; + + /* + * The perf event variable config bits represent both + * the command line options and register programming + * bits in ETMv3/PTM. For ETMv4 we must remap options + * to real bits + */ + config_opts = cs_etm_get_config(itr); + if (config_opts & BIT(ETM_OPT_CYCACC)) + config |= BIT(ETM4_CFG_BIT_CYCACC); + if (config_opts & BIT(ETM_OPT_TS)) + config |= BIT(ETM4_CFG_BIT_TS); + if (config_opts & BIT(ETM_OPT_RETSTK)) + config |= BIT(ETM4_CFG_BIT_RETSTK); + + return config; +} + static size_t cs_etm_info_priv_size(struct auxtrace_record *itr __maybe_unused, struct perf_evlist *evlist __maybe_unused) @@ -363,7 +389,7 @@ static void cs_etm_get_metadata(int cpu, u32 *offset, magic = __perf_cs_etmv4_magic; /* Get trace configuration register */ info->priv[*offset + CS_ETMV4_TRCCONFIGR] = - cs_etm_get_config(itr); + cs_etmv4_get_config(itr); /* Get traceID from the framework */ info->priv[*offset + CS_ETMV4_TRCTRACEIDR] = coresight_get_trace_id(cpu); -- cgit v1.2.3 From 960632ece6949be1ab6f7a911faa4fa6e8305f4a Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Thu, 24 Aug 2017 00:08:32 +0200 Subject: netfilter: convert hook list to an array This converts the storage and layout of netfilter hook entries from a linked list to an array. After this commit, hook entries will be stored adjacent in memory. The next pointer is no longer required. The ops pointers are stored at the end of the array as they are only used in the register/unregister path and in the legacy br_netfilter code. nf_unregister_net_hooks() is slower than needed as it just calls nf_unregister_net_hook in a loop (i.e. at least n synchronize_net() calls), this will be addressed in followup patch. 
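To make the new layout concrete, here is a sketch (an editor's illustration; the allocator name and details may differ from the actual patch) of how a single allocation can hold both the hook entry array and the trailing ops pointers described above:

#include <linux/mm.h>
#include <linux/netfilter.h>

/* Illustrative allocator: entries first, ops pointer trailer after them,
 * which is exactly what nf_hook_entries_get_hook_ops() walks past. */
static struct nf_hook_entries *alloc_hook_entries(u16 num)
{
	size_t alloc = sizeof(struct nf_hook_entries) +
		       num * sizeof(struct nf_hook_entry) +
		       num * sizeof(struct nf_hook_ops *);
	struct nf_hook_entries *e = kvzalloc(alloc, GFP_KERNEL);

	if (e)
		e->num_hook_entries = num;
	return e;
}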
Test setup: - ixgbe 10gbit - netperf UDP_STREAM, 64 byte packets - 5 hooks: (raw + mangle prerouting, mangle+filter input, inet filter): empty mangle and raw prerouting, mangle and filter input hooks: 353.9 this patch: 364.2 Signed-off-by: Aaron Conole Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 2 +- include/linux/netfilter.h | 45 +++--- include/linux/netfilter_ingress.h | 4 +- include/net/netfilter/nf_queue.h | 2 +- include/net/netns/netfilter.h | 2 +- net/bridge/br_netfilter_hooks.c | 19 ++- net/netfilter/core.c | 297 ++++++++++++++++++++++++++++---------- net/netfilter/nf_internals.h | 3 +- net/netfilter/nf_queue.c | 67 +++++---- 9 files changed, 307 insertions(+), 134 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 614642eb7eb7..ca0a30127300 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1811,7 +1811,7 @@ struct net_device { #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS - struct nf_hook_entry __rcu *nf_hooks_ingress; + struct nf_hook_entries __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 22f081065d49..f84bca1703cd 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -72,25 +72,32 @@ struct nf_hook_ops { }; struct nf_hook_entry { - struct nf_hook_entry __rcu *next; nf_hookfn *hook; void *priv; - const struct nf_hook_ops *orig_ops; }; -static inline void -nf_hook_entry_init(struct nf_hook_entry *entry, const struct nf_hook_ops *ops) -{ - entry->next = NULL; - entry->hook = ops->hook; - entry->priv = ops->priv; - entry->orig_ops = ops; -} +struct nf_hook_entries { + u16 num_hook_entries; + /* padding */ + struct nf_hook_entry hooks[]; + + /* trailer: pointers to original orig_ops of each hook. + * + * This is not part of struct nf_hook_entry since its only + * needed in slow path (hook register/unregister). + * + * const struct nf_hook_ops *orig_ops[] + */ +}; -static inline int -nf_hook_entry_priority(const struct nf_hook_entry *entry) +static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { - return entry->orig_ops->priority; + unsigned int n = e->num_hook_entries; + const void *hook_end; + + hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! 
*/ + + return (struct nf_hook_ops **)hook_end; } static inline int @@ -100,12 +107,6 @@ nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, return entry->hook(entry->priv, skb, state); } -static inline const struct nf_hook_ops * -nf_hook_entry_ops(const struct nf_hook_entry *entry) -{ - return entry->orig_ops; -} - static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, @@ -168,7 +169,7 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry *entry); + const struct nf_hook_entries *e, unsigned int i); /** * nf_hook - call a netfilter hook @@ -182,7 +183,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct nf_hook_entry *hook_head; + struct nf_hook_entries *hook_head; int ret = 1; #ifdef HAVE_JUMP_LABEL @@ -200,7 +201,7 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); - ret = nf_hook_slow(skb, &state, hook_head); + ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 59476061de86..8d5dae1e2ff8 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -17,7 +17,7 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { - struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress); + struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; @@ -30,7 +30,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb) nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); - ret = nf_hook_slow(skb, &state, e); + ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 4454719ff849..39468720fc19 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -10,9 +10,9 @@ struct nf_queue_entry { struct list_head list; struct sk_buff *skb; unsigned int id; + unsigned int hook_index; /* index in hook_entries->hook[] */ struct nf_hook_state state; - struct nf_hook_entry *hook; u16 size; /* sizeof(entry) + saved route keys */ /* extra space to store route keys */ diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index cea396b53a60..72d66c8763d0 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -16,7 +16,7 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif - struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) bool defrag_ipv4; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 626f4b2cef16..c2eea1b8737a 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -985,22 +985,25 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { - struct nf_hook_entry *elem; 
+ const struct nf_hook_entries *e; struct nf_hook_state state; + struct nf_hook_ops **ops; + unsigned int i; int ret; - for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); - elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF; - elem = rcu_dereference(elem->next)) - ; - - if (!elem) + e = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); + if (!e) return okfn(net, sk, skb); + ops = nf_hook_entries_get_hook_ops(e); + for (i = 0; i < e->num_hook_entries && + ops[i]->priority <= NF_BR_PRI_BRNF; i++) + ; + nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); - ret = nf_hook_slow(skb, &state, elem); + ret = nf_hook_slow(skb, &state, e, i); if (ret == 1) ret = okfn(net, sk, skb); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 974cf2a3795a..1a9e23c9ab98 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -62,10 +62,160 @@ EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); + +/* max hooks per family/hooknum */ +#define MAX_HOOK_COUNT 1024 + #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg) +static struct nf_hook_entries *allocate_hook_entries_size(u16 num) +{ + struct nf_hook_entries *e; + size_t alloc = sizeof(*e) + + sizeof(struct nf_hook_entry) * num + + sizeof(struct nf_hook_ops *) * num; + + if (num == 0) + return NULL; + + e = kvzalloc(alloc, GFP_KERNEL); + if (e) + e->num_hook_entries = num; + return e; +} + +static unsigned int accept_all(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */ +} + +static const struct nf_hook_ops dummy_ops = { + .hook = accept_all, + .priority = INT_MIN, +}; + +static struct nf_hook_entries * +nf_hook_entries_grow(const struct nf_hook_entries *old, + const struct nf_hook_ops *reg) +{ + unsigned int i, alloc_entries, nhooks, old_entries; + struct nf_hook_ops **orig_ops = NULL; + struct nf_hook_ops **new_ops; + struct nf_hook_entries *new; + bool inserted = false; + + alloc_entries = 1; + old_entries = old ? old->num_hook_entries : 0; + + if (old) { + orig_ops = nf_hook_entries_get_hook_ops(old); + + for (i = 0; i < old_entries; i++) { + if (orig_ops[i] != &dummy_ops) + alloc_entries++; + } + } + + if (alloc_entries > MAX_HOOK_COUNT) + return ERR_PTR(-E2BIG); + + new = allocate_hook_entries_size(alloc_entries); + if (!new) + return ERR_PTR(-ENOMEM); + + new_ops = nf_hook_entries_get_hook_ops(new); + + i = 0; + nhooks = 0; + while (i < old_entries) { + if (orig_ops[i] == &dummy_ops) { + ++i; + continue; + } + if (inserted || reg->priority > orig_ops[i]->priority) { + new_ops[nhooks] = (void *)orig_ops[i]; + new->hooks[nhooks] = old->hooks[i]; + i++; + } else { + new_ops[nhooks] = (void *)reg; + new->hooks[nhooks].hook = reg->hook; + new->hooks[nhooks].priv = reg->priv; + inserted = true; + } + nhooks++; + } + + if (!inserted) { + new_ops[nhooks] = (void *)reg; + new->hooks[nhooks].hook = reg->hook; + new->hooks[nhooks].priv = reg->priv; + } + + return new; +} + +/* + * __nf_hook_entries_try_shrink - try to shrink hook array + * + * @pp -- location of hook blob + * + * Hook unregistration must always succeed, so to-be-removed hooks + * are replaced by a dummy one that will just move to next hook. 
+ * + * This counts the current dummy hooks, attempts to allocate new blob, + * copies the live hooks, then replaces and discards old one. + * + * return values: + * + * Returns address to free, or NULL. + */ +static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) +{ + struct nf_hook_entries *old, *new = NULL; + unsigned int i, j, skip = 0, hook_entries; + struct nf_hook_ops **orig_ops; + struct nf_hook_ops **new_ops; + + old = nf_entry_dereference(*pp); + if (WARN_ON_ONCE(!old)) + return NULL; + + orig_ops = nf_hook_entries_get_hook_ops(old); + for (i = 0; i < old->num_hook_entries; i++) { + if (orig_ops[i] == &dummy_ops) + skip++; + } + + /* if skip == hook_entries all hooks have been removed */ + hook_entries = old->num_hook_entries; + if (skip == hook_entries) + goto out_assign; + + if (WARN_ON(skip == 0)) + return NULL; + + hook_entries -= skip; + new = allocate_hook_entries_size(hook_entries); + if (!new) + return NULL; + + new_ops = nf_hook_entries_get_hook_ops(new); + for (i = 0, j = 0; i < old->num_hook_entries; i++) { + if (orig_ops[i] == &dummy_ops) + continue; + new->hooks[j] = old->hooks[i]; + new_ops[j] = (void *)orig_ops[i]; + j++; + } +out_assign: + rcu_assign_pointer(*pp, new); + return old; +} + +static struct nf_hook_entries __rcu **nf_hook_entry_head(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf != NFPROTO_NETDEV) return net->nf.hooks[reg->pf]+reg->hooknum; @@ -76,13 +226,14 @@ static struct nf_hook_entry __rcu **nf_hook_entry_head(struct net *net, const st return ®->dev->nf_hooks_ingress; } #endif + WARN_ON_ONCE(1); return NULL; } int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry __rcu **pp; - struct nf_hook_entry *entry, *p; + struct nf_hook_entries *p, *new_hooks; + struct nf_hook_entries __rcu **pp; if (reg->pf == NFPROTO_NETDEV) { #ifndef CONFIG_NETFILTER_INGRESS @@ -98,23 +249,18 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) if (!pp) return -EINVAL; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - nf_hook_entry_init(entry, reg); - mutex_lock(&nf_hook_mutex); - /* Find the spot in the list */ - for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { - if (reg->priority < nf_hook_entry_priority(p)) - break; - } - rcu_assign_pointer(entry->next, p); - rcu_assign_pointer(*pp, entry); + p = nf_entry_dereference(*pp); + new_hooks = nf_hook_entries_grow(p, reg); + + if (!IS_ERR(new_hooks)) + rcu_assign_pointer(*pp, new_hooks); mutex_unlock(&nf_hook_mutex); + if (IS_ERR(new_hooks)) + return PTR_ERR(new_hooks); + #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_inc_ingress_queue(); @@ -122,48 +268,74 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) #ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif + synchronize_net(); + BUG_ON(p == new_hooks); + kvfree(p); return 0; } EXPORT_SYMBOL(nf_register_net_hook); -static struct nf_hook_entry * -__nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) +/* + * __nf_unregister_net_hook - remove a hook from blob + * + * @oldp: current address of hook blob + * @unreg: hook to unregister + * + * This cannot fail, hook unregistration must always succeed. + * Therefore replace the to-be-removed hook with a dummy hook. 
+ */ +static void __nf_unregister_net_hook(struct nf_hook_entries *old, + const struct nf_hook_ops *unreg) { - struct nf_hook_entry __rcu **pp; - struct nf_hook_entry *p; - - pp = nf_hook_entry_head(net, reg); - if (WARN_ON_ONCE(!pp)) - return NULL; + struct nf_hook_ops **orig_ops; + bool found = false; + unsigned int i; - mutex_lock(&nf_hook_mutex); - for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { - if (nf_hook_entry_ops(p) == reg) { - rcu_assign_pointer(*pp, p->next); - break; - } - } - mutex_unlock(&nf_hook_mutex); - if (!p) { - WARN(1, "nf_unregister_net_hook: hook not found!\n"); - return NULL; + orig_ops = nf_hook_entries_get_hook_ops(old); + for (i = 0; i < old->num_hook_entries; i++) { + if (orig_ops[i] != unreg) + continue; + WRITE_ONCE(old->hooks[i].hook, accept_all); + WRITE_ONCE(orig_ops[i], &dummy_ops); + found = true; + break; } + + if (found) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) - net_dec_ingress_queue(); + if (unreg->pf == NFPROTO_NETDEV && unreg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); #endif #ifdef HAVE_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[unreg->pf][unreg->hooknum]); #endif - - return p; + } else { + WARN_ONCE(1, "hook not found, pf %d num %d", unreg->pf, unreg->hooknum); + } } void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct nf_hook_entry *p = __nf_unregister_net_hook(net, reg); + struct nf_hook_entries __rcu **pp; + struct nf_hook_entries *p; unsigned int nfq; + pp = nf_hook_entry_head(net, reg); + if (!pp) + return; + + mutex_lock(&nf_hook_mutex); + + p = nf_entry_dereference(*pp); + if (WARN_ON_ONCE(!p)) { + mutex_unlock(&nf_hook_mutex); + return; + } + + __nf_unregister_net_hook(p, reg); + + p = __nf_hook_entries_try_shrink(pp); + mutex_unlock(&nf_hook_mutex); if (!p) return; @@ -173,7 +345,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) nfq = nf_queue_nf_hook_drop(net); if (nfq) synchronize_net(); - kfree(p); + kvfree(p); } EXPORT_SYMBOL(nf_unregister_net_hook); @@ -200,46 +372,25 @@ EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int hookcount) { - struct nf_hook_entry *to_free[16]; - unsigned int i, n, nfq; - - do { - n = min_t(unsigned int, hookcount, ARRAY_SIZE(to_free)); - - for (i = 0; i < n; i++) - to_free[i] = __nf_unregister_net_hook(net, ®[i]); - - synchronize_net(); - - /* need 2nd synchronize_net() if nfqueue is used, skb - * can get reinjected right before nf_queue_hook_drop() - */ - nfq = nf_queue_nf_hook_drop(net); - if (nfq) - synchronize_net(); - - for (i = 0; i < n; i++) - kfree(to_free[i]); + unsigned int i; - reg += n; - hookcount -= n; - } while (hookcount > 0); + for (i = 0; i < hookcount; i++) + nf_unregister_net_hook(net, ®[i]); } EXPORT_SYMBOL(nf_unregister_net_hooks); /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. 
*/ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry *entry) + const struct nf_hook_entries *e, unsigned int s) { unsigned int verdict; int ret; - do { - verdict = nf_hook_entry_hookfn(entry, skb, state); + for (; s < e->num_hook_entries; s++) { + verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: - entry = rcu_dereference(entry->next); break; case NF_DROP: kfree_skb(skb); @@ -248,8 +399,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, ret = -EPERM; return ret; case NF_QUEUE: - ret = nf_queue(skb, state, &entry, verdict); - if (ret == 1 && entry) + ret = nf_queue(skb, state, e, s, verdict); + if (ret == 1) continue; return ret; default: @@ -258,7 +409,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, */ return 0; } - } while (entry); + } return 1; } diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 19f00a47a710..bacd6363946e 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,7 +13,8 @@ /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry **entryp, unsigned int verdict); + const struct nf_hook_entries *entries, unsigned int index, + unsigned int verdict); unsigned int nf_queue_nf_hook_drop(struct net *net); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 4f4d80a58fb5..f7e21953b1de 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -112,7 +112,8 @@ unsigned int nf_queue_nf_hook_drop(struct net *net) EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop); static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, - struct nf_hook_entry *hook_entry, unsigned int queuenum) + const struct nf_hook_entries *entries, + unsigned int index, unsigned int queuenum) { int status = -ENOENT; struct nf_queue_entry *entry = NULL; @@ -140,7 +141,7 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, - .hook = hook_entry, + .hook_index = index, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -163,18 +164,16 @@ err: /* Packets leaving via this function must come back through nf_reinject(). 
*/ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry **entryp, unsigned int verdict) + const struct nf_hook_entries *entries, unsigned int index, + unsigned int verdict) { - struct nf_hook_entry *entry = *entryp; int ret; - ret = __nf_queue(skb, state, entry, verdict >> NF_VERDICT_QBITS); + ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && - (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) { - *entryp = rcu_dereference(entry->next); + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) return 1; - } kfree_skb(skb); } @@ -183,33 +182,56 @@ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_entry **entryp) + const struct nf_hook_entries *hooks, + unsigned int *index) { - unsigned int verdict; + const struct nf_hook_entry *hook; + unsigned int verdict, i = *index; - do { + while (i < hooks->num_hook_entries) { + hook = &hooks->hooks[i]; repeat: - verdict = nf_hook_entry_hookfn((*entryp), skb, state); + verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { if (verdict != NF_REPEAT) return verdict; goto repeat; } - *entryp = rcu_dereference((*entryp)->next); - } while (*entryp); + i++; + } + *index = i; return NF_ACCEPT; } +/* Caller must hold rcu read-side lock */ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { - struct nf_hook_entry *hook_entry = entry->hook; + const struct nf_hook_entry *hook_entry; + const struct nf_hook_entries *hooks; struct sk_buff *skb = entry->skb; const struct nf_afinfo *afinfo; + const struct net *net; + unsigned int i; int err; + u8 pf; + + net = entry->state.net; + pf = entry->state.pf; + + hooks = rcu_dereference(net->nf.hooks[pf][entry->state.hook]); nf_queue_entry_release_refs(entry); + i = entry->hook_index; + if (WARN_ON_ONCE(i >= hooks->num_hook_entries)) { + kfree_skb(skb); + kfree(entry); + return; + } + + hook_entry = &hooks->hooks[i]; + /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); @@ -221,27 +243,22 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) } if (verdict == NF_ACCEPT) { - hook_entry = rcu_dereference(hook_entry->next); - if (hook_entry) next_hook: - verdict = nf_iterate(skb, &entry->state, &hook_entry); + ++i; + verdict = nf_iterate(skb, &entry->state, hooks, &i); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: -okfn: local_bh_disable(); entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, &entry->state, &hook_entry, verdict); - if (err == 1) { - if (hook_entry) - goto next_hook; - goto okfn; - } + err = nf_queue(skb, &entry->state, hooks, i, verdict); + if (err == 1) + goto next_hook; break; case NF_STOLEN: break; -- cgit v1.2.3 From 5916a22b83041b07d63191fe06206ae0fff6ec7a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 22 Jun 2017 11:32:45 -0700 Subject: dm: constify argument arrays The arrays of 'struct dm_arg' are never modified by the device-mapper core, so constify them so that they are placed in .rodata. (Exception: the args array in dm-raid cannot be constified because it is allocated on the stack and modified.) 
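The resulting pattern, sketched for a hypothetical target (names made up, signatures as changed below):

	static int parse_features(struct dm_arg_set *as, char **error)
	{
		/* const: the array can now be placed in .rodata */
		static const struct dm_arg _args[] = {
			{0, 2, "Invalid number of feature arguments"},
		};
		unsigned int argc;

		return dm_read_arg_group(_args, as, &argc, error);
	}

dm_read_arg() and dm_read_arg_group() take a const pointer after this change, so callers lose nothing.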
Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 4 ++-- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-flakey.c | 4 ++-- drivers/md/dm-integrity.c | 2 +- drivers/md/dm-mpath.c | 10 +++++----- drivers/md/dm-switch.c | 2 +- drivers/md/dm-table.c | 7 ++++--- drivers/md/dm-thin.c | 2 +- drivers/md/dm-verity-target.c | 2 +- include/linux/device-mapper.h | 4 ++-- 10 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c5ea03fc7ee1..b0a5503a2fd3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2306,7 +2306,7 @@ static void init_features(struct cache_features *cf) static int parse_features(struct cache_args *ca, struct dm_arg_set *as, char **error) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 2, "Invalid number of cache feature arguments"}, }; @@ -2348,7 +2348,7 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, char **error) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "Invalid number of policy arguments"}, }; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cdf6b1e12460..abf16559ed49 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2533,7 +2533,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar { struct crypt_config *cc = ti->private; struct dm_arg_set as; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 6, "Invalid number of feature args"}, }; unsigned int opt_params, val; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index e2c7234931bc..d8bb371e63d7 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -51,7 +51,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, unsigned argc; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 6, "Invalid number of feature args"}, {1, UINT_MAX, "Invalid corrupt bio byte"}, {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, @@ -178,7 +178,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, */ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, UINT_MAX, "Invalid up interval"}, {0, UINT_MAX, "Invalid down interval"}, }; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 47fd409b2e2a..293a19652d55 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2780,7 +2780,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) int r; unsigned extra_args; struct dm_arg_set as; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7406ededf875..bf280a99fa81 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -702,7 +702,7 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, struct path_selector_type *pst; unsigned ps_argc; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "invalid number of path selector args"}, }; @@ -826,7 
+826,7 @@ retain: static struct priority_group *parse_priority_group(struct dm_arg_set *as, struct multipath *m) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {1, 1024, "invalid number of paths"}, {0, 1024, "invalid number of selector args"} }; @@ -902,7 +902,7 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) int ret; struct dm_target *ti = m->ti; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "invalid number of hardware handler args"}, }; @@ -954,7 +954,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) struct dm_target *ti = m->ti; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 8, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, @@ -1023,7 +1023,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) { /* target arguments */ - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 1024, "invalid number of priority groups"}, {0, 1024, "invalid initial priority group number"}, }; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 871c18fe000d..83a371d54412 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -251,7 +251,7 @@ static void switch_dtr(struct dm_target *ti) */ static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) { - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, {1, UINT_MAX, "Invalid region size"}, {0, 0, "Invalid number of optional args"}, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 28a4071cdf85..ef7b8f201f73 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -806,7 +806,8 @@ int dm_table_add_target(struct dm_table *t, const char *type, /* * Target argument parsing helpers. 
*/ -static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, +static int validate_next_arg(const struct dm_arg *arg, + struct dm_arg_set *arg_set, unsigned *value, char **error, unsigned grouped) { const char *arg_str = dm_shift_arg(arg_set); @@ -824,14 +825,14 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, return 0; } -int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, +int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *value, char **error) { return validate_next_arg(arg, arg_set, value, error, 0); } EXPORT_SYMBOL(dm_read_arg); -int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, +int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *value, char **error) { return validate_next_arg(arg, arg_set, value, error, 1); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 9dec2f8cc739..9736621c2963 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3041,7 +3041,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, unsigned argc; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, 4, "Invalid number of pool feature arguments"}, }; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index b46705ebf01f..79f18d4d7f02 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -839,7 +839,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v) struct dm_target *ti = v->ti; const char *arg_name; - static struct dm_arg _args[] = { + static const struct dm_arg _args[] = { {0, DM_VERITY_OPTS_MAX, "Invalid number of feature args"}, }; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 4f2b3b2076c4..3b5fdf308148 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -387,7 +387,7 @@ struct dm_arg { * Validate the next argument, either returning it as *value or, if invalid, * returning -EINVAL and setting *error. */ -int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, +int dm_read_arg(const struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *value, char **error); /* @@ -395,7 +395,7 @@ int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, * arg->min and arg->max further arguments. Either return the size as * *num_args or, if invalid, return -EINVAL and set *error. */ -int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, +int dm_read_arg_group(const struct dm_arg *arg, struct dm_arg_set *arg_set, unsigned *num_args, char **error); /* -- cgit v1.2.3 From 7521621e600aeefe5ffcc1f90ae26a42fc20c452 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 11 Aug 2017 15:44:43 +0200 Subject: Do not disable driver and bus shutdown hook when class shutdown hook is set. As seen from the implementation of the single class shutdown hook this is not very sound design. Rename the class shutdown hook to shutdown_pre to make it clear it runs before the driver shutdown hook. 
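In sketch form, device_shutdown() now dispatches in this order (cf. the hunk below; the driver fallback is unchanged context following it):

	if (dev->class && dev->class->shutdown_pre)
		dev->class->shutdown_pre(dev);	/* class hook runs first */
	if (dev->bus && dev->bus->shutdown)
		dev->bus->shutdown(dev);	/* then the bus hook... */
	else if (dev->driver && dev->driver->shutdown)
		dev->driver->shutdown(dev);	/* ...or the driver hook */

so the TPM class code no longer has to re-implement the bus/driver dispatch itself.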
Signed-off-by: Michal Suchanek Reviewed-by: Jarkko Sakkinen Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 9 +++++---- drivers/char/tpm/tpm-chip.c | 11 ++--------- include/linux/device.h | 4 ++-- 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/base/core.c b/drivers/base/core.c index cb4b5b1f1b09..12ebd055724c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2796,11 +2796,12 @@ void device_shutdown(void) pm_runtime_get_noresume(dev); pm_runtime_barrier(dev); - if (dev->class && dev->class->shutdown) { + if (dev->class && dev->class->shutdown_pre) { if (initcall_debug) - dev_info(dev, "shutdown\n"); - dev->class->shutdown(dev); - } else if (dev->bus && dev->bus->shutdown) { + dev_info(dev, "shutdown_pre\n"); + dev->class->shutdown_pre(dev); + } + if (dev->bus && dev->bus->shutdown) { if (initcall_debug) dev_info(dev, "shutdown\n"); dev->bus->shutdown(dev); diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index 67ec9d3d04f5..0eca20c5a80c 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -164,14 +164,7 @@ static int tpm_class_shutdown(struct device *dev) chip->ops = NULL; up_write(&chip->ops_sem); } - /* Allow bus- and device-specific code to run. Note: since chip->ops - * is NULL, more-specific shutdown code will not be able to issue TPM - * commands. - */ - if (dev->bus && dev->bus->shutdown) - dev->bus->shutdown(dev); - else if (dev->driver && dev->driver->shutdown) - dev->driver->shutdown(dev); + return 0; } @@ -214,7 +207,7 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev, device_initialize(&chip->devs); chip->dev.class = tpm_class; - chip->dev.class->shutdown = tpm_class_shutdown; + chip->dev.class->shutdown_pre = tpm_class_shutdown; chip->dev.release = tpm_dev_release; chip->dev.parent = pdev; chip->dev.groups = chip->groups; diff --git a/include/linux/device.h b/include/linux/device.h index d5bcc5f109eb..c6f27207dbe8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -375,7 +375,7 @@ int subsys_virtual_register(struct bus_type *subsys, * @suspend: Used to put the device to sleep mode, usually to a low power * state. * @resume: Used to bring the device from the sleep mode. - * @shutdown: Called at shut-down time to quiesce the device. + * @shutdown_pre: Called at shut-down time before driver shutdown. * @ns_type: Callbacks so sysfs can detemine namespaces. * @namespace: Namespace of the device belongs to this class. * @pm: The default device power management operations of this class. @@ -404,7 +404,7 @@ struct class { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); - int (*shutdown)(struct device *dev); + int (*shutdown_pre)(struct device *dev); const struct kobj_ns_type_operations *ns_type; const void *(*namespace)(struct device *dev); -- cgit v1.2.3 From 35f0b6a779b8b7a98faefd7c1c660b4dac9a5c26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Aug 2017 08:28:08 +0200 Subject: libata: quirk read log on no-name M.2 SSD Ido reported that reading the log page on his systems fails, so quirk it as it won't support ZBC or security protocols. 
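The fix follows the usual horkage pattern: a blacklist entry sets the flag and ata_read_log_page() then bails out early with AC_ERR_DEV. Quirking another misbehaving model would be one more entry (device string below is hypothetical; the usual glob matching applies):

	{ "Some Broken SSD*",	NULL,	ATA_HORKAGE_NO_LOG_PAGE },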
Signed-off-by: Christoph Hellwig Reported-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 4 ++++ include/linux/libata.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index fa7dd4394c02..697f5f896b19 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2079,6 +2079,8 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, */ if (ap_flags & ATA_FLAG_NO_LOG_PAGE) return AC_ERR_DEV; + if (dev->horkage & ATA_HORKAGE_NO_LOG_PAGE) + return AC_ERR_DEV; retry: ata_tf_init(dev, &tf); @@ -4578,6 +4580,8 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, { "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + + { "M.2 (S42) 3ME3", NULL, ATA_HORKAGE_NO_LOG_PAGE }, /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 931c32f1f18d..9e927ae7fced 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -438,6 +438,7 @@ enum { ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ + ATA_HORKAGE_NO_LOG_PAGE = (1 << 26), /* Doesn't like Get Log Page */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 464bc0fd6273d518aee79fbd37211dd9bc35d863 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:04 -0700 Subject: bpf: convert sockmap field attach_bpf_fd2 to type In the initial sockmap API we provided strparser and verdict programs using a single attach command by extending the attach API with the attach_bpf_fd2 field. However, if we add other programs in the future we will be adding a field for every new possible type, attach_bpf_fd(3,4,..). This seems a bit clumsy for an API. So let's push the programs using two new type fields: BPF_SK_SKB_STREAM_PARSER BPF_SK_SKB_STREAM_VERDICT These have the advantage of readable names and can easily be extended in the future. Updates to samples and sockmap included here also generalize tests slightly to support the upcoming patch for multiple map support. Signed-off-by: John Fastabend Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/bpf.h | 10 +- include/uapi/linux/bpf.h | 9 +- kernel/bpf/sockmap.c | 25 ++-- kernel/bpf/syscall.c | 38 ++---- samples/sockmap/sockmap_kern.c | 6 +- samples/sockmap/sockmap_user.c | 12 +- tools/include/uapi/linux/bpf.h | 9 +- tools/lib/bpf/bpf.c | 14 +-- tools/lib/bpf/bpf.h | 4 - tools/testing/selftests/bpf/bpf_helpers.h | 3 +- tools/testing/selftests/bpf/sockmap_parse_prog.c | 2 +- tools/testing/selftests/bpf/sockmap_verdict_prog.c | 2 +- tools/testing/selftests/bpf/test_maps.c | 133 +++++++++------------ 13 files changed, 116 insertions(+), 151 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 830f472d8df5..c2cb1b5c094e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,8 +39,6 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); - int (*map_attach)(struct bpf_map *map, - struct bpf_prog *p1, struct bpf_prog *p2); }; struct bpf_map { @@ -387,11 +385,19 @@ static inline void __dev_map_flush(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { return NULL; } + +static inline int sock_map_attach_prog(struct bpf_map *map, + struct bpf_prog *prog, + u32 type) +{ + return -EOPNOTSUPP; +} #endif /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 843818dff96d..97227be3690c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -224,7 +225,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -580,14 +580,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 617c239590c2..cf570d108fd5 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -723,20 +723,24 @@ out: return err; } -static int sock_map_attach_prog(struct bpf_map *map, - struct bpf_prog *parse, - struct bpf_prog *verdict) +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_prog *_parse, *_verdict; + struct bpf_prog *orig; - _parse = xchg(&stab->bpf_parse, parse); - _verdict = xchg(&stab->bpf_verdict, verdict); + switch (type) { + case BPF_SK_SKB_STREAM_PARSER: + orig = xchg(&stab->bpf_parse, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + orig = xchg(&stab->bpf_verdict, prog); + 
break; + default: + return -EOPNOTSUPP; + } - if (_parse) - bpf_prog_put(_parse); - if (_verdict) - bpf_prog_put(_verdict); + if (orig) + bpf_prog_put(orig); return 0; } @@ -777,7 +781,6 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_attach = sock_map_attach_prog, }; BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9378f3ba2cbf..021a05d9d800 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1093,12 +1093,12 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +static int sockmap_get_from_fd(const union bpf_attr *attr) { - struct bpf_prog *prog1, *prog2; int ufd = attr->target_fd; + struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1108,29 +1108,16 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) if (IS_ERR(map)) return PTR_ERR(map); - if (!map->ops->map_attach) { - fdput(f); - return -EOPNOTSUPP; - } - - prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog1)) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { fdput(f); - return PTR_ERR(prog1); - } - - prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); - if (IS_ERR(prog2)) { - fdput(f); - bpf_prog_put(prog1); - return PTR_ERR(prog2); + return PTR_ERR(prog); } - err = map->ops->map_attach(map, prog1, prog2); + err = sock_map_attach_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog1); - bpf_prog_put(prog2); + bpf_prog_put(prog); return err; } @@ -1165,16 +1152,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; - case BPF_CGROUP_SMAP_INGRESS: - ptype = BPF_PROG_TYPE_SK_SKB; - break; + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + return sockmap_get_from_fd(attr); default: return -EINVAL; } - if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) - return sockmap_get_from_fd(attr, ptype); - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 6ff986f7059b..f9b38ef82dc2 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -82,8 +82,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) if (lport == 10000) { ret = 1; err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST, - BPF_SOCKMAP_STRPARSER); + BPF_NOEXIST); bpf_printk("passive(%i -> %i) map ctx update err: %d\n", lport, bpf_ntohl(rport), err); } @@ -95,8 +94,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) if (bpf_ntohl(rport) == 10001) { ret = 10; err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST, - BPF_SOCKMAP_STRPARSER); + BPF_NOEXIST); bpf_printk("active(%i -> %i) map ctx update err: %d\n", lport, bpf_ntohl(rport), err); } diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index fb78f5abefb4..7cc9d228216f 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -256,8 +256,16 @@ int main(int argc, char **argv) } /* Attach programs to sockmap */ - err = __bpf_prog_attach(prog_fd[0], prog_fd[1], map_fd[0], - 
BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[1], map_fd[0], + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", err, strerror(errno)); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f8f6377fd541..09ac590eefb1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -227,7 +228,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -572,14 +572,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a0717610b116..1d6907d379c9 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -235,28 +235,20 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int __bpf_prog_attach(int prog_fd1, int prog_fd2, int target_fd, - enum bpf_attach_type type, - unsigned int flags) +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; bzero(&attr, sizeof(attr)); attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd1; - attr.attach_bpf_fd2 = prog_fd2; + attr.attach_bpf_fd = prog_fd; attr.attach_type = type; attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, - unsigned int flags) -{ - return __bpf_prog_attach(prog_fd, 0, target_fd, type, flags); -} - int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 90e9d4e85d08..b8ea5843c39e 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -56,10 +56,6 @@ int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); -int __bpf_prog_attach(int prog1, int prog2, - int attachable_fd, - enum bpf_attach_type type, - unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 98f3be26d390..36fb9161b34a 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -68,8 +68,7 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, 
static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, - unsigned long long flags, - unsigned long long map_lags) = + unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; diff --git a/tools/testing/selftests/bpf/sockmap_parse_prog.c b/tools/testing/selftests/bpf/sockmap_parse_prog.c index 8b5453158399..710f43f42dc4 100644 --- a/tools/testing/selftests/bpf/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/sockmap_parse_prog.c @@ -30,7 +30,7 @@ int bpf_prog1(struct __sk_buff *skb) */ d[0] = 1; - bpf_printk("data[0] = (%u): local_port %i remote %i\n", + bpf_printk("parse: data[0] = (%u): local_port %i remote %i\n", d[0], lport, bpf_ntohl(rport)); return skb->len; } diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/sockmap_verdict_prog.c index d5f9447b3808..0573c1db2519 100644 --- a/tools/testing/selftests/bpf/sockmap_verdict_prog.c +++ b/tools/testing/selftests/bpf/sockmap_verdict_prog.c @@ -40,7 +40,7 @@ int bpf_prog2(struct __sk_buff *skb) d[6] = 0xe; d[7] = 0xf; - bpf_printk("data[0] = (%u): local_port %i remote %i\n", + bpf_printk("verdict: data[0] = (%u): local_port %i remote %i redirect 5\n", d[0], lport, bpf_ntohl(rport)); return bpf_sk_redirect_map(&sock_map, 5, 0); } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 40b2d1faf02b..6df6e6257424 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -547,20 +547,26 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - /* Nothing attached so these should fail */ + /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (!err) { - printf("Failed invalid update sockmap '%i:%i'\n", + if (err) { + printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; } } /* Test attaching bad fds */ - err = __bpf_prog_attach(-1, -2, fd, BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0); if (!err) { - printf("Failed invalid prog attach\n"); + printf("Failed invalid parser prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!err) { + printf("Failed invalid verdict prog attach\n"); goto out_sockmap; } @@ -591,14 +597,21 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - err = __bpf_prog_attach(parse_prog, verdict_prog, map_fd, - BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(parse_prog, map_fd, + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + printf("Failed bpf prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_attach(verdict_prog, map_fd, + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { printf("Failed bpf prog attach\n"); goto out_sockmap; } - /* Test map update elem */ + /* Test map update elem afterwards fd lives in fd and map_fd */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY); if (err) { @@ -649,96 +662,68 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - /* Delete the reset of the elems include some NULL elems */ - for (i = 0; i < 6; i++) { - err = bpf_map_delete_elem(map_fd, &i); - if (err && (i == 0 || i == 1 || i >= 4)) { - printf("Failed delete sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } else if (!err && (i == 2 || i == 3)) { - printf("Failed null delete sockmap %i 
'%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } - } - - /* Test having multiple SMAPs open and active on same fds */ - err = __bpf_prog_attach(parse_prog, verdict_prog, fd, - BPF_CGROUP_SMAP_INGRESS, 0); - if (err) { - printf("Failed fd bpf prog attach\n"); - goto out_sockmap; - } - - for (i = 0; i < 6; i++) { - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (err) { - printf("Failed fd update sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } - } - - /* Test duplicate socket add of NOEXIST, ANY and EXIST */ - i = 0; + /* Push fd into same slot */ + i = 2; err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); if (!err) { - printf("Failed BPF_NOEXIST create\n"); + printf("Failed allowed sockmap dup slot BPF_NOEXIST\n"); goto out_sockmap; } err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); if (err) { - printf("Failed sockmap update BPF_ANY\n"); + printf("Failed sockmap update new slot BPF_ANY\n"); goto out_sockmap; } err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); if (err) { - printf("Failed sockmap update BPF_EXIST\n"); + printf("Failed sockmap update new slot BPF_EXIST\n"); goto out_sockmap; } - /* The above were pushing fd into same slot try different slot now */ - i = 2; - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); - if (!err) { - printf("Failed BPF_NOEXIST create\n"); - goto out_sockmap; + /* Delete the elems without programs */ + for (i = 0; i < 6; i++) { + err = bpf_map_delete_elem(fd, &i); + if (err) { + printf("Failed delete sockmap %i '%i:%i'\n", + err, i, sfd[i]); + } } - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); + /* Test having multiple maps open and set with programs on same fds */ + err = bpf_prog_attach(parse_prog, fd, + BPF_SK_SKB_STREAM_PARSER, 0); if (err) { - printf("Failed sockmap update BPF_ANY\n"); + printf("Failed fd bpf parse prog attach\n"); goto out_sockmap; } - - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); + err = bpf_prog_attach(verdict_prog, fd, + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { - printf("Failed sockmap update BPF_EXIST\n"); + printf("Failed fd bpf verdict prog attach\n"); goto out_sockmap; } - /* Try pushing fd into different map, this is not allowed at the - * moment. Which programs would we use? 
- */ - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_NOEXIST); - if (!err) { - printf("Failed BPF_NOEXIST create\n"); - goto out_sockmap; - } - - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY); - if (!err) { - printf("Failed sockmap update BPF_ANY\n"); - goto out_sockmap; - } - - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_EXIST); - if (!err) { - printf("Failed sockmap update BPF_EXIST\n"); - goto out_sockmap; + for (i = 4; i < 6; i++) { + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); + if (!err) { + printf("Failed allowed duplicate programs in update ANY sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); + if (!err) { + printf("Failed allowed duplicate program in update NOEXIST sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); + if (!err) { + printf("Failed allowed duplicate program in update EXIST sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } } /* Test map close sockets */ -- cgit v1.2.3 From 2f857d04601a1bb56958b95a9f180bce0e91e5e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:25 -0700 Subject: bpf: sockmap, remove STRPARSER map_flags and add multi-map support The addition of map_flags BPF_SOCKMAP_STRPARSER flags was to handle a specific use case where we want to have BPF parse program disabled on an entry in a sockmap. However, Alexei found the API a bit cumbersome and I agreed. Lets remove the STRPARSER flag and support the use case by allowing socks to be in multiple maps. This allows users to create two maps one with programs attached and one without. When socks are added to maps they now inherit any programs attached to the map. This is a nice generalization and IMO improves the API. The API rules are less ambiguous and do not need a flag: - When a sock is added to a sockmap we have two cases, i. The sock map does not have any attached programs so we can add sock to map without inheriting bpf programs. The sock may exist in 0 or more other maps. ii. The sock map has an attached BPF program. To avoid duplicate bpf programs we only add the sock entry if it does not have an existing strparser/verdict attached, returning -EBUSY if a program is already attached. Otherwise attach the program and inherit strparser/verdict programs from the sock map. This allows for socks to be in a multiple maps for redirects and inherit a BPF program from a single map. Also this patch simplifies the logic around BPF_{EXIST|NOEXIST|ANY} flags. In the original patch I tried to be extra clever and only update map entries when necessary. Now I've decided the complexity is not worth it. If users constantly update an entry with the same sock for no reason (i.e. update an entry without actually changing any parameters on map or sock) we still do an alloc/release. Using this and allowing multiple entries of a sock to exist in a map the logic becomes much simpler. Note: Now that multiple maps are supported the "maps" pointer called when a socket is closed becomes a list of maps to remove the sock from. To keep the map up to date when a sock is added to the sockmap we must add the map/elem in the list. Likewise when it is removed we must remove it from the list. This results in searching the per psock list on delete operation. On TCP_CLOSE events we walk the list and remove the psock from all map/entry locations. 
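In sketch form (types and code as introduced below), each psock now carries a list of the map slots referencing it, and the TCP_CLOSE callback walks that list:

	struct smap_psock_map_entry {
		struct list_head list;
		struct sock **entry;	/* points at stab->sock_map[i] */
	};

	/* on sock state change to TCP_CLOSE: */
	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		osk = cmpxchg(e->entry, sk, NULL);	/* clear the map slot */
		if (osk == sk) {
			list_del(&e->list);
			smap_release_sock(psock, sk);	/* drop one map reference */
		}
	}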
I don't see any perf implications in this because at most I have a psock in two maps. If a psock were to be in many maps its possibly this might be noticeable on delete but I can't think of a reason to dup a psock in many maps. The sk_callback_lock is used to protect read/writes to the list. This was convenient because in all locations we were taking the lock anyways just after working on the list. Also the lock is per sock so in normal cases we shouldn't see any contention. Suggested-by: Alexei Starovoitov Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 - kernel/bpf/sockmap.c | 269 +++++++++++++++++++++++++++++------------------ 2 files changed, 165 insertions(+), 107 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97227be3690c..08c206a863e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,9 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */ -#define BPF_SOCKMAP_STRPARSER (1U << 0) - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf570d108fd5..a6882e54930b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -13,15 +13,16 @@ /* A BPF sock_map is used to store sock objects. This is primarly used * for doing socket redirect with BPF helper routines. * - * A sock map may have two BPF programs attached to it, a program used - * to parse packets and a program to provide a verdict and redirect - * decision on the packet. If no BPF parse program is provided it is - * assumed that every skb is a "message" (skb->len). Otherwise the - * parse program is attached to strparser and used to build messages - * that may span multiple skbs. The verdict program will either select - * a socket to send/receive the skb on or provide the drop code indicating - * the skb should be dropped. More actions may be added later as needed. - * The default program will drop packets. + * A sock map may have BPF programs attached to it, currently a program + * used to parse packets and a program to provide a verdict and redirect + * decision on the packet are supported. Any programs attached to a sock + * map are inherited by sock objects when they are added to the map. If + * no BPF programs are attached the sock object may only be used for sock + * redirect. + * + * A sock object may be in multiple maps, but can only inherit a single + * parse or verdict program. If adding a sock object to a map would result + * in having multiple parsing programs the update will return an EBUSY error. * * For reference this program is similar to devmap used in XDP context * reviewing these together may be useful. 
For an example please review @@ -44,15 +45,21 @@ struct bpf_stab { struct sock **sock_map; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; - refcount_t refcnt; }; enum smap_psock_state { SMAP_TX_RUNNING, }; +struct smap_psock_map_entry { + struct list_head list; + struct sock **entry; +}; + struct smap_psock { struct rcu_head rcu; + /* refcnt is used inside sk_callback_lock */ + u32 refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -66,10 +73,9 @@ struct smap_psock { struct strparser strp; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; - struct bpf_stab *stab; + struct list_head maps; /* Back reference used when sock callback trigger sockmap operations */ - int key; struct sock *sock; unsigned long state; @@ -83,7 +89,7 @@ struct smap_psock { static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { - return (struct smap_psock *)rcu_dereference_sk_user_data(sk); + return rcu_dereference_sk_user_data(sk); } static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) @@ -149,11 +155,12 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct sock *sock); +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); /* Called with lock_sock(sk) held */ static void smap_state_change(struct sock *sk) { + struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; struct sock *osk; @@ -184,9 +191,15 @@ static void smap_state_change(struct sock *sk) psock = smap_psock_sk(sk); if (unlikely(!psock)) break; - osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL); - if (osk == sk) - smap_release_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); break; default: psock = smap_psock_sk(sk); @@ -289,9 +302,8 @@ static void smap_write_space(struct sock *sk) static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) { - write_lock_bh(&sk->sk_callback_lock); if (!psock->strp_enabled) - goto out; + return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; sk->sk_state_change = psock->save_state_change; @@ -300,8 +312,6 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; -out: - write_unlock_bh(&sk->sk_callback_lock); } static void smap_destroy_psock(struct rcu_head *rcu) @@ -318,9 +328,11 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } -static void smap_release_sock(struct sock *sock) +static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - struct smap_psock *psock = smap_psock_sk(sock); + psock->refcnt--; + if (psock->refcnt) + return; smap_stop_sock(psock, sock); clear_bit(SMAP_TX_RUNNING, &psock->state); @@ -414,6 +426,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { + struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -431,8 +444,10 @@ static void smap_gc_work(struct work_struct *w) if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); - if (refcount_dec_and_test(&psock->stab->refcnt)) - sock_map_remove_complete(psock->stab); + list_for_each_entry_safe(e, tmp, 
&psock->maps, list) { + list_del(&e->list); + kfree(e); + } sock_put(psock->sock); kfree(psock); @@ -453,6 +468,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); + INIT_LIST_HEAD(&psock->maps); + psock->refcnt = 1; rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -503,13 +520,24 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab->sock_map) goto free_stab; - refcount_set(&stab->refcnt, 1); return &stab->map; free_stab: kfree(stab); return ERR_PTR(err); } +static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +{ + struct smap_psock_map_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + if (e->entry == entry) { + list_del(&e->list); + break; + } + } +} + static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); @@ -526,13 +554,18 @@ static void sock_map_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < stab->map.max_entries; i++) { + struct smap_psock *psock; struct sock *sock; sock = xchg(&stab->sock_map[i], NULL); if (!sock) continue; - smap_release_sock(sock); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -541,8 +574,7 @@ static void sock_map_free(struct bpf_map *map) if (stab->bpf_parse) bpf_prog_put(stab->bpf_parse); - if (refcount_dec_and_test(&stab->refcnt)) - sock_map_remove_complete(stab); + sock_map_remove_complete(stab); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -576,6 +608,7 @@ struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static int sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct smap_psock *psock; int k = *(u32 *)key; struct sock *sock; @@ -586,7 +619,17 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - smap_release_sock(sock); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + if (!psock) + goto out; + + if (psock->bpf_parse) + smap_stop_sock(psock, sock); + smap_list_remove(psock, &stab->sock_map[k]); + smap_release_sock(psock, sock); +out: + write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -601,29 +644,34 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) * and syncd so we are certain all references from the update/lookup/delete * operations as well as references in the data path are no longer in use. * - * A psock object holds a refcnt on the sockmap it is attached to and this is - * not decremented until after a RCU grace period and garbage collection occurs. - * This ensures the map is not free'd until psocks linked to it are removed. The - * map link is used when the independent sock events trigger map deletion. + * Psocks may exist in multiple maps, but only a single set of parse/verdict + * programs may be inherited from the maps it belongs to. A reference count + * is kept with the total number of references to the psock from all maps. The + * psock will not be released until this reaches zero. The psock and sock + * user data data use the sk_callback_lock to protect critical data structures + * from concurrent access. 
This allows us to avoid two updates from modifying + * the user data in sock and the lock is required anyways for modifying + * callbacks, we simply increase its scope slightly. * - * Psocks may only participate in one sockmap at a time. Users that try to - * join a single sock to multiple maps will get an error. - * - * Last, but not least, it is possible the socket is closed while running - * an update on an existing psock. This will release the psock, but again - * not until the update has completed due to rcu grace period rules. + * Rules to follow, + * - psock must always be read inside RCU critical section + * - sk_user_data must only be modified inside sk_callback_lock and read + * inside RCU critical section. + * - psock->maps list must only be read & modified inside sk_callback_lock + * - sock_map must use READ_ONCE and (cmp)xchg operations + * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct bpf_map *map, - void *key, u64 flags, u64 map_flags) + void *key, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse; - struct smap_psock *psock = NULL; - struct sock *old_sock, *sock; + struct sock *osock, *sock; + struct smap_psock *psock; u32 i = *(u32 *)key; - bool update = false; - int err = 0; + int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; @@ -631,35 +679,22 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER)) - return -EINVAL; - - verdict = parse = NULL; sock = READ_ONCE(stab->sock_map[i]); - - if (flags == BPF_EXIST || flags == BPF_ANY) { - if (!sock && flags == BPF_EXIST) { - return -ENOENT; - } else if (sock && sock != skops->sk) { - return -EINVAL; - } else if (sock) { - psock = smap_psock_sk(sock); - if (unlikely(!psock)) - return -EBUSY; - update = true; - } - } else if (sock && BPF_NOEXIST) { + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) return -EEXIST; - } - /* reserve BPF programs early so can abort easily on failures */ - if (map_flags & BPF_SOCKMAP_STRPARSER) { - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); + sock = skops->sk; - if (!verdict || !parse) - return -ENOENT; + /* 1. If sock map has BPF programs those will be inherited by the + * sock being added. If the sock is already attached to BPF programs + * this results in an error. + */ + verdict = READ_ONCE(stab->bpf_verdict); + parse = READ_ONCE(stab->bpf_parse); + if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation * removes the program after the above READ_ONCE() but before * we increment the refcnt. If this is the case abort with an @@ -676,50 +711,78 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } - if (!psock) { - sock = skops->sk; - if (rcu_dereference_sk_user_data(sock)) - return -EEXIST; + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + + /* 2. Do not allow inheriting programs if psock exists and has + * already inherited programs. This would create confusion on + * which parser/verdict program is running. If no psock exists + * create one. Inside sk_callback_lock to ensure concurrent create + * doesn't update user data. 
+ */ + if (psock) { + if (READ_ONCE(psock->bpf_parse) && parse) { + err = -EBUSY; + goto out_progs; + } + psock->refcnt++; + } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) - bpf_prog_put(parse); - return PTR_ERR(psock); + err = PTR_ERR(psock); + goto out_progs; } - psock->key = i; - psock->stab = stab; - refcount_inc(&stab->refcnt); + set_bit(SMAP_TX_RUNNING, &psock->state); } - if (map_flags & BPF_SOCKMAP_STRPARSER) { - write_lock_bh(&sock->sk_callback_lock); - if (psock->strp_enabled) - goto start_done; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } + e->entry = &stab->sock_map[i]; + + /* 3. At this point we have a reference to a valid psock that is + * running. Attach any BPF programs needed. + */ + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) - goto out; + goto out_free; smap_init_progs(psock, stab, verdict, parse); smap_start_sock(psock, sock); -start_done: - write_unlock_bh(&sock->sk_callback_lock); - } else if (update) { - smap_stop_sock(psock, sock); } - if (!update) { - old_sock = xchg(&stab->sock_map[i], skops->sk); - if (old_sock) - smap_release_sock(old_sock); - } + /* 4. Place psock in sockmap for use and stop any programs on + * the old sock assuming its not the same sock we are replacing + * it with. Because we can only have a single set of programs if + * old_sock has a strp we can stop it. + */ + list_add_tail(&e->list, &psock->maps); + write_unlock_bh(&sock->sk_callback_lock); + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + if (osock != sock && parse) + smap_stop_sock(opsock, osock); + smap_list_remove(opsock, &stab->sock_map[i]); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } return 0; -out: +out_free: + smap_release_sock(psock, sock); +out_progs: + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); write_unlock_bh(&sock->sk_callback_lock); - if (!update) - smap_release_sock(sock); + kfree(e); return err; } @@ -768,8 +831,7 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } - err = sock_map_ctx_update_elem(&skops, map, key, - flags, BPF_SOCKMAP_STRPARSER); + err = sock_map_ctx_update_elem(&skops, map, key, flags); fput(socket->file); return err; } @@ -783,11 +845,11 @@ const struct bpf_map_ops sock_map_ops = { .map_delete_elem = sock_map_delete_elem, }; -BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); - return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } const struct bpf_func_proto bpf_sock_map_update_proto = { @@ -799,5 +861,4 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, }; -- cgit v1.2.3 From aef3ad103a686f21b746977d4ed21cc1af36f589 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sun, 13 Aug 2017 17:47:42 +0300 Subject: serial: core: remove unneeded irq_wake flag There is no need to duplicate a flag which IRQ core takes care of. 
Replace the custom flag with the IRQ core API that retrieves its state. Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 9 +++------ include/linux/serial_core.h | 1 - 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 73ce4e2f7a93..220e1d088c9b 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -36,7 +36,7 @@ #include #include -#include +#include #include /* @@ -2083,8 +2083,7 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) tty_dev = device_find_child(uport->dev, &match, serial_match_port); if (tty_dev && device_may_wakeup(tty_dev)) { - if (!enable_irq_wake(uport->irq)) - uport->irq_wake = 1; + enable_irq_wake(uport->irq); put_device(tty_dev); mutex_unlock(&port->mutex); return 0; @@ -2147,10 +2146,8 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) tty_dev = device_find_child(uport->dev, &match, serial_match_port); if (!uport->suspended && device_may_wakeup(tty_dev)) { - if (uport->irq_wake) { + if (irqd_is_wakeup_set(irq_get_irq_data((uport->irq)))) disable_irq_wake(uport->irq); - uport->irq_wake = 0; - } put_device(tty_dev); mutex_unlock(&port->mutex); return 0; diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8dc24c855a70..5553e04e59c9 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -247,7 +247,6 @@ struct uart_port { struct device *dev; /* parent device */ unsigned char hub6; /* this should be in the 8250 driver */ unsigned char suspended; - unsigned char irq_wake; unsigned char unused[2]; const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ -- cgit v1.2.3 From 63e8d4394a2d226803f47abd7287dbb6d21bf8e4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Aug 2017 16:58:20 +0300 Subject: serial: pch_uart: Make port type explicit There used to be a gap in port definitions after PORT_MAX_8250. As new drivers kept coming, the gap became shorter and shorter until commit a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x") completely removed it. So, while the type here is just a formality, make things a little bit more explicit for this driver and move the port types to the UAPI header. Note, it uses two types for now. Fixes: fddceb8b5399 ("tty: 8250: Add 64byte UART support for FSL platforms") Cc: Priyanka Jain Cc: Poonam Aggrwal Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/pch_uart.c | 35 +++++++++++++++-------------------- include/uapi/linux/serial_core.h | 5 ++++- 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c index 3788198f105b..ae8cfc81ffc5 100644 --- a/drivers/tty/serial/pch_uart.c +++ b/drivers/tty/serial/pch_uart.c @@ -46,11 +46,6 @@ enum { PCH_UART_HANDLED_LS_INT_SHIFT, }; -enum { - PCH_UART_8LINE, - PCH_UART_2LINE, -}; - #define PCH_UART_DRIVER_DEVICE "ttyPCH" /* Set the max number of UART port @@ -267,7 +262,7 @@ struct eg20t_port { /** * struct pch_uart_driver_data - private data structure for UART-DMA - * @port_type: The number of DMA channel + * @port_type: The type of UART port * @line_no: UART port line number (0, 1, 2...)
*/ struct pch_uart_driver_data { @@ -290,17 +285,17 @@ enum pch_uart_num_t { }; static struct pch_uart_driver_data drv_dat[] = { - [pch_et20t_uart0] = {PCH_UART_8LINE, 0}, - [pch_et20t_uart1] = {PCH_UART_2LINE, 1}, - [pch_et20t_uart2] = {PCH_UART_2LINE, 2}, - [pch_et20t_uart3] = {PCH_UART_2LINE, 3}, - [pch_ml7213_uart0] = {PCH_UART_8LINE, 0}, - [pch_ml7213_uart1] = {PCH_UART_2LINE, 1}, - [pch_ml7213_uart2] = {PCH_UART_2LINE, 2}, - [pch_ml7223_uart0] = {PCH_UART_8LINE, 0}, - [pch_ml7223_uart1] = {PCH_UART_2LINE, 1}, - [pch_ml7831_uart0] = {PCH_UART_8LINE, 0}, - [pch_ml7831_uart1] = {PCH_UART_2LINE, 1}, + [pch_et20t_uart0] = {PORT_PCH_8LINE, 0}, + [pch_et20t_uart1] = {PORT_PCH_2LINE, 1}, + [pch_et20t_uart2] = {PORT_PCH_2LINE, 2}, + [pch_et20t_uart3] = {PORT_PCH_2LINE, 3}, + [pch_ml7213_uart0] = {PORT_PCH_8LINE, 0}, + [pch_ml7213_uart1] = {PORT_PCH_2LINE, 1}, + [pch_ml7213_uart2] = {PORT_PCH_2LINE, 2}, + [pch_ml7223_uart0] = {PORT_PCH_8LINE, 0}, + [pch_ml7223_uart1] = {PORT_PCH_2LINE, 1}, + [pch_ml7831_uart0] = {PORT_PCH_8LINE, 0}, + [pch_ml7831_uart1] = {PORT_PCH_2LINE, 1}, }; #ifdef CONFIG_SERIAL_PCH_UART_CONSOLE @@ -1777,10 +1772,10 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, goto init_port_free_txbuf; switch (port_type) { - case PORT_UNKNOWN: + case PORT_PCH_8LINE: fifosize = 256; /* EG20T/ML7213: UART0 */ break; - case PORT_8250: + case PORT_PCH_2LINE: fifosize = 64; /* EG20T:UART1~3 ML7213: UART1~2*/ break; default: @@ -1804,7 +1799,7 @@ static struct eg20t_port *pch_uart_init_port(struct pci_dev *pdev, priv->fifo_size = fifosize; priv->uartclk = pch_uart_get_uartclk(); - priv->port_type = PORT_MAX_8250 + port_type + 1; + priv->port_type = port_type; priv->port.dev = &pdev->dev; priv->port.iobase = iobase; priv->port.membase = NULL; diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 38bea3217ead..502aa23c7e15 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -57,7 +57,6 @@ #define PORT_RT2880 29 /* Ralink RT2880 internal UART */ #define PORT_16550A_FSL64 30 /* Freescale 16550 UART with 64 FIFOs */ #define PORT_DA830 31 /* TI DA8xx/66AK2x */ -#define PORT_MAX_8250 31 /* max port ID */ /* * ARM specific type numbers. These are not currently guaranteed @@ -77,6 +76,10 @@ #define PORT_SUNZILOG 38 #define PORT_SUNSAB 39 +/* Intel EG20 */ +#define PORT_PCH_8LINE 44 +#define PORT_PCH_2LINE 45 + /* DEC */ #define PORT_DZ 46 #define PORT_ZS 47 -- cgit v1.2.3 From 3f3dac7e4d815cb7f929c0ed98c3a45a86852e53 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Aug 2017 16:58:21 +0300 Subject: serial: Remove unused port type PORT_MFD is not in use since commit 1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support") Remove leftover. 
Fixes: 1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support") Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 502aa23c7e15..00d335634271 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -209,9 +209,6 @@ /* MAX310X */ #define PORT_MAX310X 94 -/* High Speed UART for Medfield */ -#define PORT_MFD 95 - /* TI OMAP-UART */ #define PORT_OMAP 96 -- cgit v1.2.3 From ee1c90cc2cea80638f559c552371ee6893ca9d9e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 22 Aug 2017 16:58:22 +0300 Subject: serial: Fix port type numbering for TI DA8xx The UAPI has a global list of unique numbers for different port types. The commit a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x") introduced a new port type and brought a collision with two other port types. Reuse 95 for it instead. Fixes: a2d6a987bfe4 ("serial: 8250: Add new port type for TI DA8xx/66AK2x") Cc: David Lechner Cc: Sekhar Nori Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/serial_core.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 00d335634271..dc2d7cb766ab 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -56,7 +56,6 @@ #define PORT_ALTR_16550_F128 28 /* Altera 16550 UART with 128 FIFOs */ #define PORT_RT2880 29 /* Ralink RT2880 internal UART */ #define PORT_16550A_FSL64 30 /* Freescale 16550 UART with 64 FIFOs */ -#define PORT_DA830 31 /* TI DA8xx/66AK2x */ /* * ARM specific type numbers. These are not currently guaranteed @@ -209,6 +208,9 @@ /* MAX310X */ #define PORT_MAX310X 94 +/* TI DA8xx/66AK2x */ +#define PORT_DA830 95 + /* TI OMAP-UART */ #define PORT_OMAP 96 -- cgit v1.2.3 From 6e0a5de2136b26bd0b3501f2f362abfffeb47d1e Mon Sep 17 00:00:00 2001 From: Rafael Gago Date: Mon, 31 Jul 2017 10:46:43 +0200 Subject: serial: 8250: Use hrtimers for rs485 delays Previously the timers were based on the classic timers, giving too coarse a resolution on systems configured with HZ less than 1000. This patch changes the rs485 timers to hrtimers.
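For context, a standalone sketch (plain userspace C, not kernel code) of the problem being fixed: the old msecs * HZ / 1000 conversion truncates to whole jiffies, so sub-tick rs485 delays vanish entirely on low-HZ configs.

    #include <stdio.h>

    int main(void)
    {
            const unsigned long hz[] = { 100, 250, 1000 };
            const unsigned long delay_ms = 4; /* e.g. delay_rts_before_send */

            for (int i = 0; i < 3; i++) {
                    unsigned long ticks = delay_ms * hz[i] / 1000;

                    printf("HZ=%-4lu: %lu ms -> %lu tick(s) = %lu ms\n",
                           hz[i], delay_ms, ticks, ticks * 1000 / hz[i]);
            }
            return 0; /* at HZ=100 the 4 ms delay becomes 0 ticks */
    }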
Signed-off-by: Rafael Gago Castano Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_port.c | 63 +++++++++++++++++++++++-------------- include/linux/serial_8250.h | 7 +++-- 2 files changed, 44 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 25aae660cb9e..6b745e440b81 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include @@ -553,8 +553,8 @@ static inline void serial8250_em485_rts_after_send(struct uart_8250_port *p) serial8250_out_MCR(p, mcr); } -static void serial8250_em485_handle_start_tx(unsigned long arg); -static void serial8250_em485_handle_stop_tx(unsigned long arg); +static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t); +static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t); void serial8250_clear_and_reinit_fifos(struct uart_8250_port *p) { @@ -609,12 +609,14 @@ int serial8250_em485_init(struct uart_8250_port *p) if (!p->em485) return -ENOMEM; - setup_timer(&p->em485->stop_tx_timer, - serial8250_em485_handle_stop_tx, (unsigned long)p); - setup_timer(&p->em485->start_tx_timer, - serial8250_em485_handle_start_tx, (unsigned long)p); + hrtimer_init(&p->em485->stop_tx_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + hrtimer_init(&p->em485->start_tx_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + p->em485->stop_tx_timer.function = &serial8250_em485_handle_stop_tx; + p->em485->start_tx_timer.function = &serial8250_em485_handle_start_tx; + p->em485->port = p; p->em485->active_timer = NULL; - serial8250_em485_rts_after_send(p); return 0; @@ -639,8 +641,8 @@ void serial8250_em485_destroy(struct uart_8250_port *p) if (!p->em485) return; - del_timer(&p->em485->start_tx_timer); - del_timer(&p->em485->stop_tx_timer); + hrtimer_cancel(&p->em485->start_tx_timer); + hrtimer_cancel(&p->em485->stop_tx_timer); kfree(p->em485); p->em485 = NULL; @@ -1435,12 +1437,13 @@ static void __do_stop_tx_rs485(struct uart_8250_port *p) serial_port_out(&p->port, UART_IER, p->ier); } } - -static void serial8250_em485_handle_stop_tx(unsigned long arg) +static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) { - struct uart_8250_port *p = (struct uart_8250_port *)arg; - struct uart_8250_em485 *em485 = p->em485; + struct uart_8250_em485 *em485; + struct uart_8250_port *p; unsigned long flags; + em485 = container_of(t, struct uart_8250_em485, stop_tx_timer); + p = em485->port; serial8250_rpm_get(p); spin_lock_irqsave(&p->port.lock, flags); @@ -1451,6 +1454,16 @@ static void serial8250_em485_handle_stop_tx(unsigned long arg) } spin_unlock_irqrestore(&p->port.lock, flags); serial8250_rpm_put(p); + return HRTIMER_NORESTART; +} + +static void start_hrtimer_ms(struct hrtimer *hrt, unsigned long msec) +{ + long sec = msec / 1000; + long nsec = (msec % 1000) * 1000000; + ktime_t t = ktime_set(sec, nsec); + + hrtimer_start(hrt, t, HRTIMER_MODE_REL); } static void __stop_tx_rs485(struct uart_8250_port *p) @@ -1463,8 +1476,8 @@ static void __stop_tx_rs485(struct uart_8250_port *p) */ if (p->port.rs485.delay_rts_after_send > 0) { em485->active_timer = &em485->stop_tx_timer; - mod_timer(&em485->stop_tx_timer, jiffies + - p->port.rs485.delay_rts_after_send * HZ / 1000); + start_hrtimer_ms(&em485->stop_tx_timer, + p->port.rs485.delay_rts_after_send); } else { __do_stop_tx_rs485(p); } @@ -1494,8 +1507,8 @@ static inline void 
__stop_tx(struct uart_8250_port *p) if ((lsr & BOTH_EMPTY) != BOTH_EMPTY) return; - del_timer(&em485->start_tx_timer); em485->active_timer = NULL; + hrtimer_cancel(&em485->start_tx_timer); __stop_tx_rs485(p); } @@ -1558,8 +1571,9 @@ static inline void start_tx_rs485(struct uart_port *port) if (!(up->port.rs485.flags & SER_RS485_RX_DURING_TX)) serial8250_stop_rx(&up->port); - del_timer(&em485->stop_tx_timer); em485->active_timer = NULL; + if (hrtimer_is_queued(&em485->stop_tx_timer)) + hrtimer_cancel(&em485->stop_tx_timer); mcr = serial8250_in_MCR(up); if (!!(up->port.rs485.flags & SER_RS485_RTS_ON_SEND) != @@ -1572,8 +1586,8 @@ if (up->port.rs485.delay_rts_before_send > 0) { em485->active_timer = &em485->start_tx_timer; - mod_timer(&em485->start_tx_timer, jiffies + - up->port.rs485.delay_rts_before_send * HZ / 1000); + start_hrtimer_ms(&em485->start_tx_timer, + up->port.rs485.delay_rts_before_send); return; } } @@ -1581,11 +1595,13 @@ static inline void start_tx_rs485(struct uart_port *port) __start_tx(port); } -static void serial8250_em485_handle_start_tx(unsigned long arg) +static enum hrtimer_restart serial8250_em485_handle_start_tx(struct hrtimer *t) { - struct uart_8250_port *p = (struct uart_8250_port *)arg; - struct uart_8250_em485 *em485 = p->em485; + struct uart_8250_em485 *em485; + struct uart_8250_port *p; unsigned long flags; + em485 = container_of(t, struct uart_8250_em485, start_tx_timer); + p = em485->port; spin_lock_irqsave(&p->port.lock, flags); if (em485 && @@ -1594,6 +1610,7 @@ static void serial8250_em485_handle_start_tx(unsigned long arg) em485->active_timer = NULL; } spin_unlock_irqrestore(&p->port.lock, flags); + return HRTIMER_NORESTART; } static void serial8250_start_tx(struct uart_port *port) diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 61fbb440449c..a27ef5f56431 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -80,9 +80,10 @@ struct uart_8250_ops { }; struct uart_8250_em485 { - struct timer_list start_tx_timer; /* "rs485 start tx" timer */ - struct timer_list stop_tx_timer; /* "rs485 stop tx" timer */ - struct timer_list *active_timer; /* pointer to active timer */ + struct hrtimer start_tx_timer; /* "rs485 start tx" timer */ + struct hrtimer stop_tx_timer; /* "rs485 stop tx" timer */ + struct hrtimer *active_timer; /* pointer to active timer */ + struct uart_8250_port *port; /* for hrtimer callbacks */ }; /* -- cgit v1.2.3 From 1c16ae65e2502da05310b2ec56b3a1fd3efe6f4d Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Mon, 21 Aug 2017 01:17:56 +0800 Subject: serial: 8250: of: Add new port type for MediaTek BTIF controller on MT7622/23 SoC The MediaTek BTIF controller is a serial interface similar to a UART, but it works only as a digital device. It is mainly used to communicate with the connectivity module, called CONNSYS, inside the SoC, and is mostly found on MediaTek SoCs with a Bluetooth feature such as the MT7622 and MT7623. The controller is compatible with the 8250 register layout, with extra registers such as DMA enablement, so it is integrated by reusing the 8250 OF driver. However, DMA mode is not yet supported by the current driver.
Signed-off-by: Sean Wang Suggested-by: Andy Shevchenko Acked-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_of.c | 2 ++ drivers/tty/serial/8250/8250_port.c | 8 ++++++++ include/uapi/linux/serial_core.h | 3 +++ 3 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index 30eacbffaa51..1222c005fb98 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -313,6 +313,8 @@ static const struct of_device_id of_platform_serial_table[] = { .data = (void *)PORT_ALTR_16550_F64, }, { .compatible = "altr,16550-FIFO128", .data = (void *)PORT_ALTR_16550_F128, }, + { .compatible = "mediatek,mtk-btif", + .data = (void *)PORT_MTK_BTIF, }, { .compatible = "mrvl,mmp-uart", .data = (void *)PORT_XSCALE, }, { .compatible = "ti,da830-uart", .data = (void *)PORT_DA830, }, diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 6b745e440b81..4726aa276968 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -289,6 +289,14 @@ static const struct serial8250_config uart_config[] = { .rxtrig_bytes = {1, 4, 8, 14}, .flags = UART_CAP_FIFO | UART_CAP_AFE, }, + [PORT_MTK_BTIF] = { + .name = "MediaTek BTIF", + .fifo_size = 16, + .tx_loadsz = 16, + .fcr = UART_FCR_ENABLE_FIFO | + UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT, + .flags = UART_CAP_FIFO, + }, }; /* Uart divisor latch read */ diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index dc2d7cb766ab..50d71c436323 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -274,4 +274,7 @@ /* MPS2 UART */ #define PORT_MPS2UART 116 +/* MediaTek BTIF */ +#define PORT_MTK_BTIF 117 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From 2ce8008711e4837c11e99a94df55406085d0d098 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 2 Aug 2017 09:58:52 +0200 Subject: mcb: introduce mcb_get_resource() Introduce mcb_get_resource() as a common accessor to a mcb device's memory or IRQ resources. 
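For illustration, a sketch of how a hypothetical MCB client driver might use the new accessor (foo_probe is made up for this example; not part of the patch):

    static int foo_probe(struct mcb_device *mdev, const struct mcb_device_id *id)
    {
            struct resource *mem = mcb_get_resource(mdev, IORESOURCE_MEM);
            struct resource *irq = mcb_get_resource(mdev, IORESOURCE_IRQ);

            if (!mem || !irq)
                    return -ENODEV;

            /* mem/irq now point at the device's resources, e.g.
             * resource_size(mem) and irq->start are usable as usual
             */
            return 0;
    }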
Signed-off-by: Johannes Thumshirn Signed-off-by: Greg Kroah-Hartman --- drivers/mcb/mcb-core.c | 20 +++++++++++++++++++- include/linux/mcb.h | 2 ++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/mcb/mcb-core.c b/drivers/mcb/mcb-core.c index 921a5d2a802b..bb5c5692dedc 100644 --- a/drivers/mcb/mcb-core.c +++ b/drivers/mcb/mcb-core.c @@ -417,6 +417,22 @@ void mcb_bus_add_devices(const struct mcb_bus *bus) } EXPORT_SYMBOL_GPL(mcb_bus_add_devices); +/** + * mcb_get_resource() - get a resource for a mcb device + * @dev: the mcb device + * @type: the type of resource + */ +struct resource *mcb_get_resource(struct mcb_device *dev, unsigned int type) +{ + if (type == IORESOURCE_MEM) + return &dev->mem; + else if (type == IORESOURCE_IRQ) + return &dev->irq; + else + return NULL; +} +EXPORT_SYMBOL_GPL(mcb_get_resource); + /** * mcb_request_mem() - Request memory * @dev: The @mcb_device the memory is for @@ -460,7 +476,9 @@ EXPORT_SYMBOL_GPL(mcb_release_mem); static int __mcb_get_irq(struct mcb_device *dev) { - struct resource *irq = &dev->irq; + struct resource *irq; + + irq = mcb_get_resource(dev, IORESOURCE_IRQ); return irq->start; } diff --git a/include/linux/mcb.h b/include/linux/mcb.h index 4097ac9ea13a..b1a0ad9d23b3 100644 --- a/include/linux/mcb.h +++ b/include/linux/mcb.h @@ -136,5 +136,7 @@ extern struct resource *mcb_request_mem(struct mcb_device *dev, const char *name); extern void mcb_release_mem(struct resource *mem); extern int mcb_get_irq(struct mcb_device *dev); +extern struct resource *mcb_get_resource(struct mcb_device *dev, + unsigned int type); #endif /* _LINUX_MCB_H */ -- cgit v1.2.3 From d01c3289e7d68162e32bc08c2b65dd1a216a7ef8 Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Tue, 18 Jul 2017 06:27:59 +0900 Subject: pty: show associative slave of ptmx in fdinfo This patch adds a "tty-index" field to /proc/PID/fdinfo/N if N refers to /dev/ptmx. The field shows the index of the associated pts slave. Though a minor number is assigned to each pts instance, ptmx has none. This means there is no way in user space to know the association between file descriptors for pts/n and ptmx (n = 0, 1, ...). This is different from pipes, where such an association can be resolved via the pipefs inode. Providing a way to learn the association between pts/n and ptmx helps users understand the state of a running system; lsof can utilize this field.
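For illustration, a small userspace program (hypothetical; assumes a kernel with this patch, error handling mostly elided) that resolves the pts index behind a ptmx file descriptor via the new field:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            char path[64], line[64];
            int ptm = posix_openpt(O_RDWR | O_NOCTTY);
            FILE *f;
            int idx;

            snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ptm);
            f = fopen(path, "r");
            while (f && fgets(line, sizeof(line), f))
                    if (sscanf(line, "tty-index: %d", &idx) == 1)
                            printf("ptmx fd %d -> /dev/pts/%d\n", ptm, idx);
            if (f)
                    fclose(f);
            return 0;
    }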
Signed-off-by: Masatake YAMATO Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 8 +++++++- drivers/tty/tty_io.c | 9 +++++++++ include/linux/tty_driver.h | 2 ++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..bb672ace562f 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -741,6 +741,11 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) } } +static void pty_show_fdinfo(struct tty_struct *tty, struct seq_file *m) +{ + seq_printf(m, "tty-index:\t%d\n", tty->index); +} + static const struct tty_operations ptm_unix98_ops = { .lookup = ptm_unix98_lookup, .install = pty_unix98_install, @@ -755,7 +760,8 @@ static const struct tty_operations ptm_unix98_ops = { .ioctl = pty_unix98_ioctl, .compat_ioctl = pty_unix98_compat_ioctl, .resize = pty_resize, - .cleanup = pty_cleanup + .cleanup = pty_cleanup, + .show_fdinfo = pty_show_fdinfo, }; static const struct tty_operations pty_unix98_ops = { diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 974b13d24401..c3c8fd0ed535 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -462,6 +462,14 @@ static int hung_up_tty_fasync(int fd, struct file *file, int on) return -ENOTTY; } +static void tty_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct tty_struct *tty = file_tty(file); + + if (tty && tty->ops && tty->ops->show_fdinfo) + tty->ops->show_fdinfo(tty, m); +} + static const struct file_operations tty_fops = { .llseek = no_llseek, .read = tty_read, @@ -472,6 +480,7 @@ .open = tty_open, .release = tty_release, .fasync = tty_fasync, + .show_fdinfo = tty_show_fdinfo, }; static const struct file_operations console_fops = { diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 00b2213f6a35..fcdc0f5d9098 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -243,6 +243,7 @@ #include #include #include +#include struct tty_struct; struct tty_driver; @@ -285,6 +286,7 @@ struct tty_operations { int (*set_termiox)(struct tty_struct *tty, struct termiox *tnew); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); + void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct tty_driver *driver, int line, char *options); int (*poll_get_char)(struct tty_driver *driver, int line); -- cgit v1.2.3 From b6dccf7fae4331b0ea41cf087e3f02d5db9161dc Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Wed, 12 Jul 2017 16:10:40 +0530 Subject: nvme: add support for FW activation without reset This patch adds support for handling FW activation without reset. On completion of the FW-activation-starting AER, all queues are paused till CSTS.PP is cleared or a timeout occurs (exceeding the max time for FW activation, MTFA). If the device fails to clear CSTS.PP within MTFA, the driver issues a controller reset.
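A sketch of the timeout rule described above (illustrative plain C, not the driver code itself): MTFA is reported by the controller in units of 100 ms, and a value of zero makes the driver fall back to the admin command timeout.

    unsigned long fw_act_timeout_ms(unsigned int mtfa, unsigned int admin_timeout_s)
    {
            /* mtfa == 0: controller reports no limit, use admin timeout */
            return mtfa ? mtfa * 100UL : admin_timeout_s * 1000UL;
    }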
Signed-off-by: Arnav Dawn Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ include/linux/nvme.h | 9 ++++++ 3 files changed, 86 insertions(+) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c596dd3c58b1..1a1dedbac39b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -76,6 +76,11 @@ static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; +static __le32 nvme_get_log_dw10(u8 lid, size_t size) +{ + return cpu_to_le32((((size / 4) - 1) << 16) | lid); +} + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -2534,6 +2539,71 @@ static void nvme_async_event_work(struct work_struct *work) spin_unlock_irq(&ctrl->lock); } +static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) +{ + + u32 csts; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) + return false; + + if (csts == ~0) + return false; + + return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP)); +} + +static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_fw_slot_info_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return; + + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xffffffff); + c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); + + if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) + dev_warn(ctrl->device, + "Get FW SLOT INFO log error\n"); + kfree(log); +} + +static void nvme_fw_act_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(work, + struct nvme_ctrl, fw_act_work); + unsigned long fw_act_timeout; + + if (ctrl->mtfa) + fw_act_timeout = jiffies + + msecs_to_jiffies(ctrl->mtfa * 100); + else + fw_act_timeout = jiffies + + msecs_to_jiffies(admin_timeout * 1000); + + nvme_stop_queues(ctrl); + while (nvme_ctrl_pp_status(ctrl)) { + if (time_after(jiffies, fw_act_timeout)) { + dev_warn(ctrl->device, + "Fw activation timeout, reset controller\n"); + nvme_reset_ctrl(ctrl); + break; + } + msleep(100); + } + + if (ctrl->state != NVME_CTRL_LIVE) + return; + + nvme_start_queues(ctrl); + /* read FW slot informationi to clear the AER*/ + nvme_get_fw_slot_info(ctrl); +} + void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res) { @@ -2560,6 +2630,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, dev_info(ctrl->device, "rescanning\n"); nvme_queue_scan(ctrl); break; + case NVME_AER_NOTICE_FW_ACT_STARTING: + schedule_work(&ctrl->fw_act_work); + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -2607,6 +2680,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); + cancel_work_sync(&ctrl->fw_act_work); } EXPORT_SYMBOL_GPL(nvme_stop_ctrl); @@ -2670,6 +2744,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->quirks = quirks; INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); + INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); ret = nvme_set_instance(ctrl); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8f2a168ddc01..b40b9af4564f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -142,6 
+142,7 @@ struct nvme_ctrl { u16 cntlid; u32 ctrl_config; + u16 mtfa; u32 queue_count; u64 cap; @@ -167,6 +168,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct work_struct fw_act_work; /* Power saving configuration */ u64 ps_max_latency_us; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 25d8225dbd04..5d0f2f4fc3b8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -146,6 +146,7 @@ enum { NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_PP = 1 << 5, NVME_CSTS_SHST_NORMAL = 0 << 2, NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, @@ -376,6 +377,13 @@ struct nvme_smart_log { __u8 rsvd216[296]; }; +struct nvme_fw_slot_info_log { + __u8 afi; + __u8 rsvd1[7]; + __le64 frs[7]; + __u8 rsvd64[448]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -386,6 +394,7 @@ enum { enum { NVME_AER_NOTICE_NS_CHANGED = 0x0002, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, }; struct nvme_lba_range_type { -- cgit v1.2.3 From 62346eaeb2f1a0524b35eaa2f479596f40491165 Mon Sep 17 00:00:00 2001 From: Arnav Dawn Date: Wed, 12 Jul 2017 16:11:53 +0530 Subject: nvme: define NVME_NSID_ALL Define the constant "0xffffffff" (used as nsid for all namespaces) as NVME_NSID_ALL. Signed-off-by: Arnav Dawn Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++--- include/linux/nvme.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a1dedbac39b..a6afeedc009a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -312,7 +312,7 @@ static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) memset(&c, 0, sizeof(c)); c.directive.opcode = nvme_admin_directive_send; - c.directive.nsid = cpu_to_le32(0xffffffff); + c.directive.nsid = cpu_to_le32(NVME_NSID_ALL); c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; c.directive.dtype = NVME_DIR_IDENTIFY; c.directive.tdtype = NVME_DIR_STREAMS; @@ -362,7 +362,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ret) return ret; - ret = nvme_get_stream_params(ctrl, &s, 0xffffffff); + ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL); if (ret) return ret; @@ -2563,7 +2563,7 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) return; c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xffffffff); + c.common.nsid = cpu_to_le32(NVME_NSID_ALL); c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5d0f2f4fc3b8..1d79046bf9d4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -32,6 +32,8 @@ #define NVME_RDMA_IP_PORT 4420 +#define NVME_NSID_ALL 0xffffffff + enum nvme_subsys_type { NVME_NQN_DISC = 1, /* Discovery type target subsystem */ NVME_NQN_NVME = 2, /* NVME type target subsystem */ -- cgit v1.2.3 From dbf86b39005d26b21c52a23720e15fb850d71cdc Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 16 Aug 2017 09:51:29 +0200 Subject: nvme: add support for NVMe 1.3 Timestamp Feature NVME's Timestamp feature allows controllers to be aware of the epoch time in milliseconds. This patch adds the set features hook for various transports through the identify path, so that resets and resumes can update the controller as necessary. 
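As a userspace illustration of the same feature (hypothetical sketch, error handling elided): the Timestamp feature (FID 0x0e) takes an 8-byte little-endian payload holding the current time in milliseconds, and can be driven through the admin passthrough ioctl much like the driver's set-features call.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/time.h>
    #include <linux/nvme_ioctl.h>

    int set_timestamp(int ctrl_fd)
    {
            struct nvme_admin_cmd cmd;
            struct timeval tv;
            unsigned char ts[8];
            unsigned long long ms;

            gettimeofday(&tv, NULL);
            ms = tv.tv_sec * 1000ULL + tv.tv_usec / 1000;
            for (int i = 0; i < 8; i++)     /* little-endian encoding */
                    ts[i] = ms >> (8 * i);

            memset(&cmd, 0, sizeof(cmd));
            cmd.opcode = 0x09;              /* admin Set Features */
            cmd.cdw10 = 0x0e;               /* FID: Timestamp */
            cmd.addr = (unsigned long long)(unsigned long)ts;
            cmd.data_len = sizeof(ts);
            return ioctl(ctrl_fd, NVME_IOCTL_ADMIN_CMD, &cmd);
    }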
Signed-off-by: Jon Derrick [hch: rebased on top of nvme-4.13 error handling changes, changed nvme_configure_timestamp to return the status] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 21 +++++++++++++++++++++ include/linux/nvme.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a6afeedc009a..0b979e16655e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1507,6 +1507,23 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, blk_queue_write_cache(q, vwc, vwc); } +static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) +{ + __le64 ts; + int ret; + + if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP)) + return 0; + + ts = cpu_to_le64(ktime_to_ms(ktime_get_real())); + ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts), + NULL); + if (ret) + dev_warn_once(ctrl->device, + "could not set timestamp (%d)\n", ret); + return ret; +} + static int nvme_configure_apst(struct nvme_ctrl *ctrl) { /* @@ -1861,6 +1878,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; + + ret = nvme_configure_timestamp(ctrl); + if (ret < 0) + return ret; ret = nvme_configure_directives(ctrl); if (ret < 0) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d79046bf9d4..a12b47073273 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -254,6 +254,7 @@ enum { NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, @@ -688,6 +689,7 @@ enum { NVME_FEAT_ASYNC_EVENT = 0x0b, NVME_FEAT_AUTO_PST = 0x0c, NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, NVME_FEAT_KATO = 0x0f, NVME_FEAT_SW_PROGRESS = 0x80, NVME_FEAT_HOST_ID = 0x81, -- cgit v1.2.3 From ad4e05b24c428d6125f6f10bd300600cae5079d4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Aug 2017 19:21:06 +0300 Subject: nvme: add symbolic constants for CC identifiers Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 14 +++++++------- include/linux/nvme.h | 24 +++++++++++++++--------- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index ea7eb00dcb87..7c23eaf8e563 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -538,37 +538,37 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit); static inline bool nvmet_cc_en(u32 cc) { - return cc & 0x1; + return (cc >> NVME_CC_EN_SHIFT) & 0x1; } static inline u8 nvmet_cc_css(u32 cc) { - return (cc >> 4) & 0x7; + return (cc >> NVME_CC_CSS_SHIFT) & 0x7; } static inline u8 nvmet_cc_mps(u32 cc) { - return (cc >> 7) & 0xf; + return (cc >> NVME_CC_MPS_SHIFT) & 0xf; } static inline u8 nvmet_cc_ams(u32 cc) { - return (cc >> 11) & 0x7; + return (cc >> NVME_CC_AMS_SHIFT) & 0x7; } static inline u8 nvmet_cc_shn(u32 cc) { - return (cc >> 14) & 0x3; + return (cc >> NVME_CC_SHN_SHIFT) & 0x3; } static inline u8 nvmet_cc_iosqes(u32 cc) { - return (cc >> 16) & 0xf; + return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; } static inline u8 nvmet_cc_iocqes(u32 cc) { - return (cc >> 20) & 0xf; + return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; } static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 
a12b47073273..7b4322dc2975 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -135,16 +135,22 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, - NVME_CC_ARB_RR = 0 << 11, - NVME_CC_ARB_WRRU = 1 << 11, - NVME_CC_ARB_VS = 7 << 11, - NVME_CC_SHN_NONE = 0 << 14, - NVME_CC_SHN_NORMAL = 1 << 14, - NVME_CC_SHN_ABRUPT = 2 << 14, - NVME_CC_SHN_MASK = 3 << 14, - NVME_CC_IOSQES = NVME_NVM_IOSQES << 16, - NVME_CC_IOCQES = NVME_NVM_IOCQES << 20, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_ARB_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_ARB_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_ARB_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, -- cgit v1.2.3 From 60b43f627a71aaf233ef5af90f72e207c29781b4 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 13 Aug 2017 19:21:07 +0300 Subject: nvme: rename AMS symbolic constants to fit specification Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- include/linux/nvme.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b2daeafbeadd..1db8de0bee87 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1445,7 +1445,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; ctrl->ctrl_config |= NVME_CC_ENABLE; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b4322dc2975..05560c2ecc83 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -142,9 +142,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, - NVME_CC_ARB_RR = 0 << NVME_CC_AMS_SHIFT, - NVME_CC_ARB_WRRU = 1 << NVME_CC_AMS_SHIFT, - NVME_CC_ARB_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, -- cgit v1.2.3 From 48fa362b6c3f4d69bdb6310b46626049092475e0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 31 Jul 2017 13:21:14 -0700 Subject: nvmet-fc: simplify sg list handling The existing nvmet_fc sg list handling has 2 faults: a) the request between LLDD and transport has too large of an sg list (256 elements), which is normally 256k (64 elements). b) sglist handling doesn't optimize on the fact that each element is a page. This patch removes the static sg list in the request and uses the dynamic list already present in the nvmet_fc transport. It also simplies the handling of the sg list on multiple sequences to take advantage of the per-page divisions. 
Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 48 +++++++++--------------------------------- include/linux/nvme-fc-driver.h | 2 +- 2 files changed, 11 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 309c84aa7595..421e43bf1dd7 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -58,7 +58,8 @@ struct nvmet_fc_ls_iod { struct work_struct work; } __aligned(sizeof(unsigned long long)); -#define NVMET_FC_MAX_KB_PER_XFR 256 +#define NVMET_FC_MAX_SEQ_LENGTH (256 * 1024) +#define NVMET_FC_MAX_XFR_SGENTS (NVMET_FC_MAX_SEQ_LENGTH / PAGE_SIZE) enum nvmet_fcp_datadir { NVMET_FCP_NODATA, @@ -74,9 +75,7 @@ struct nvmet_fc_fcp_iod { struct nvme_fc_ersp_iu rspiubuf; dma_addr_t rspdma; struct scatterlist *data_sg; - struct scatterlist *next_sg; int data_sg_cnt; - u32 next_sg_offset; u32 total_length; u32 offset; enum nvmet_fcp_datadir io_dir; @@ -112,6 +111,7 @@ struct nvmet_fc_tgtport { struct ida assoc_cnt; struct nvmet_port *port; struct kref ref; + u32 max_sg_cnt; }; struct nvmet_fc_defer_fcp_req { @@ -994,6 +994,8 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->assoc_list); kref_init(&newrec->ref); ida_init(&newrec->assoc_cnt); + newrec->max_sg_cnt = min_t(u32, NVMET_FC_MAX_XFR_SGENTS, + template->max_sgl_segments); ret = nvmet_fc_alloc_ls_iodlist(newrec); if (ret) { @@ -1866,51 +1868,23 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod, u8 op) { struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; - struct scatterlist *sg, *datasg; unsigned long flags; - u32 tlen, sg_off; + u32 tlen; int ret; fcpreq->op = op; fcpreq->offset = fod->offset; fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; - tlen = min_t(u32, (NVMET_FC_MAX_KB_PER_XFR * 1024), + + tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, (fod->total_length - fod->offset)); - tlen = min_t(u32, tlen, NVME_FC_MAX_SEGMENTS * PAGE_SIZE); - tlen = min_t(u32, tlen, fod->tgtport->ops->max_sgl_segments - * PAGE_SIZE); fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; fcpreq->rsplen = 0; - fcpreq->sg_cnt = 0; - - datasg = fod->next_sg; - sg_off = fod->next_sg_offset; - - for (sg = fcpreq->sg ; tlen; sg++) { - *sg = *datasg; - if (sg_off) { - sg->offset += sg_off; - sg->length -= sg_off; - sg->dma_address += sg_off; - sg_off = 0; - } - if (tlen < sg->length) { - sg->length = tlen; - fod->next_sg = datasg; - fod->next_sg_offset += tlen; - } else if (tlen == sg->length) { - fod->next_sg_offset = 0; - fod->next_sg = sg_next(datasg); - } else { - fod->next_sg_offset = 0; - datasg = sg_next(datasg); - } - tlen -= sg->length; - fcpreq->sg_cnt++; - } + fcpreq->sg = &fod->data_sg[fod->offset / PAGE_SIZE]; + fcpreq->sg_cnt = DIV_ROUND_UP(tlen, PAGE_SIZE); /* * If the last READDATA request: check if LLDD supports @@ -2225,8 +2199,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->req.sg = fod->data_sg; fod->req.sg_cnt = fod->data_sg_cnt; fod->offset = 0; - fod->next_sg = fod->data_sg; - fod->next_sg_offset = 0; if (fod->io_dir == NVMET_FCP_WRITE) { /* pull the data over before invoking nvmet layer */ diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2591878c1d48..9c5cb4480806 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -624,7 +624,7 @@ struct nvmefc_tgt_fcp_req { u32 timeout; u32 transfer_length; struct fc_ba_rjt ba_rjt; - 
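A usage sketch for the new helper (illustrative only; the field and contents are made up): building a fixed-width, space-padded identity field, the kind of thing the nvme subsystem wants it for.

    char serial[20];

    /* copy 8 bytes, pad the remaining 12 with spaces, no NUL added */
    memcpy_and_pad(serial, sizeof(serial), "S3X9NX0J", 8, ' ');

    /* if count >= dest_len the destination is simply filled, i.e. the
     * source is truncated to the first dest_len bytes
     */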
struct scatterlist sg[NVME_FC_MAX_SEGMENTS]; + struct scatterlist *sg; int sg_cnt; void *rspaddr; dma_addr_t rspdma; -- cgit v1.2.3 From 01f33c336e2d298ea5d4ce5d6e5bcd12865cc30f Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Mon, 14 Aug 2017 22:12:38 +0200 Subject: string.h: add memcpy_and_pad() This helper function is useful for the nvme subsystem, and maybe others. Note: the warnings reported by the kbuild test robot for this patch are actually generated by the use of CONFIG_PROFILE_ALL_BRANCHES together with __FORTIFY_INLINE. Signed-off-by: Martin Wilck Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/string.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index a467e617eeb0..0bec4151b0eb 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -200,6 +200,7 @@ static inline const char *kbasename(const char *path) void fortify_panic(const char *name) __noreturn __cold; void __read_overflow(void) __compiletime_error("detected read beyond size of object passed as 1st parameter"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object passed as 2nd parameter"); +void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) @@ -395,4 +396,33 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) #endif +/** + * memcpy_and_pad - Copy one buffer to another with padding + * @dest: Where to copy to + * @dest_len: The destination buffer size + * @src: Where to copy from + * @count: The number of bytes to copy + * @pad: Character to use for padding if space is left in destination. + */ +__FORTIFY_INLINE void memcpy_and_pad(void *dest, size_t dest_len, + const void *src, size_t count, int pad) +{ + size_t dest_size = __builtin_object_size(dest, 0); + size_t src_size = __builtin_object_size(src, 0); + + if (__builtin_constant_p(dest_len) && __builtin_constant_p(count)) { + if (dest_size < dest_len && dest_size < count) + __write_overflow(); + else if (src_size < dest_len && src_size < count) + __read_overflow3(); + } + if (dest_size < dest_len) + fortify_panic(__func__); + if (dest_len > count) { + memcpy(dest, src, count); + memset(dest + count, pad, dest_len - count); + } else + memcpy(dest, src, dest_len); +} + #endif /* _LINUX_STRING_H_ */ -- cgit v1.2.3 From 1a66a836da630cd70f3639208da549b549ce576b Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 25 Aug 2017 09:21:28 -0700 Subject: gre: add collect_md mode to ERSPAN tunnel Similar to gre, vxlan, geneve, ipip tunnels, allow ERSPAN tunnels to operate in 'collect metadata' mode. bpf_skb_[gs]et_tunnel_key() helpers can make use of it right away. OVS can use it as well in the future. Signed-off-by: William Tu Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 4 +- net/ipv4/ip_gre.c | 102 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 625c29329372..992652856fe8 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -154,8 +154,10 @@ struct ip_tunnel { #define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) +#define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) -#define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) +#define TUNNEL_OPTIONS_PRESENT \ + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) struct tnl_ptk_info { __be16 flags; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 453b7925b940..0162fb955b33 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -113,6 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); +static void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, bool truncate); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; @@ -287,7 +289,33 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, false, false) < 0) goto drop; - tunnel->index = ntohl(index); + if (tunnel->collect_md) { + struct ip_tunnel_info *info; + struct erspan_metadata *md; + __be64 tun_id; + __be16 flags; + + tpi->flags |= TUNNEL_KEY; + flags = tpi->flags; + tun_id = key32_to_tunnel_id(tpi->key); + + tun_dst = ip_tun_rx_dst(skb, flags, + tun_id, sizeof(*md)); + if (!tun_dst) + return PACKET_REJECT; + + md = ip_tunnel_info_opts(&tun_dst->u.tun_info); + if (!md) + return PACKET_REJECT; + + md->index = index; + info = &tun_dst->u.tun_info; + info->key.tun_flags |= TUNNEL_ERSPAN_OPT; + info->options_len = sizeof(*md); + } else { + tunnel->index = ntohl(index); + } + skb_reset_mac_header(skb); ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; @@ -523,6 +551,64 @@ err_free_skb: dev->stats.tx_dropped++; } +static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct erspan_metadata *md; + struct rtable *rt = NULL; + bool truncate = false; + struct flowi4 fl; + int tunnel_hlen; + __be16 df; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + + /* ERSPAN has fixed 8 byte GRE header */ + tunnel_hlen = 8 + sizeof(struct erspanhdr); + + rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); + if (!rt) + return; + + if (gre_handle_offloads(skb, false)) + goto err_free_rt; + + if (skb->len > dev->mtu) { + pskb_trim(skb, dev->mtu); + truncate = true; + } + + md = ip_tunnel_info_opts(tun_info); + if (!md) + goto err_free_rt; + + erspan_build_header(skb, tunnel_id_to_key32(key->tun_id), + ntohl(md->index), truncate); + + gre_build_header(skb, 8, TUNNEL_SEQ, + htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++)); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + + iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); @@ -636,6 +722,11 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); bool truncate = false; + if (tunnel->collect_md) { + erspan_fb_xmit(skb, dev, skb->protocol); + return NETDEV_TX_OK; + } + if (gre_handle_offloads(skb, false)) goto free_skb; @@ -998,9 +1089,12 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], return ret; /* ERSPAN should only have GRE sequence and key flag */ - flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); - flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); - if (flags != (GRE_SEQ | GRE_KEY)) + if (data[IFLA_GRE_OFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + if (data[IFLA_GRE_IFLAGS]) + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (!data[IFLA_GRE_COLLECT_METADATA] && + flags != (GRE_SEQ | GRE_KEY)) return -EINVAL; /* ERSPAN Session ID only has 10-bit. Since we reuse -- cgit v1.2.3 From 4e587ea71bf924f7dac621f1351653bd41e446cb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 25 Aug 2017 15:03:10 -0700 Subject: ipv6: fix sparse warning on rt6i_node Commit c5cff8561d2d adds an rcu grace period before freeing fib6_node. This generates a new sparse warning on rt->rt6i_node related code: net/ipv6/route.c:1394:30: error: incompatible types in comparison expression (different address spaces) ./include/net/ip6_fib.h:187:14: error: incompatible types in comparison expression (different address spaces) This commit adds an "__rcu" tag to rt6i_node and makes sure the corresponding rcu API is used for it. After this fix, sparse no longer generates the above warning. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 11 +++++++---- net/ipv6/route.c | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e9c59db92942..af509f801084 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -105,7 +105,7 @@ struct rt6_info { * the same cache line. 
*/ struct fib6_table *rt6i_table; - struct fib6_node *rt6i_node; + struct fib6_node __rcu *rt6i_node; struct in6_addr rt6i_gateway; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3c46e9513a31..936e9ab4dda5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5556,7 +5556,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->rt6i_node)) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a5ebf86f6be8..10b4b1f8b838 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -889,7 +889,7 @@ add: rt->dst.rt6_next = iter; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -915,7 +915,7 @@ add: return err; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) @@ -1480,8 +1480,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 @@ -1670,7 +1671,9 @@ static int fib6_clean_node(struct fib6_walker *w) if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->rt6i_node), + res); #endif continue; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 48c8c92dcbd3..86fb2411e2bd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); + (rt->rt6i_flags & RTF_PCPU || + rcu_access_pointer(rt->rt6i_node)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, -- cgit v1.2.3 From 4734b4f417126e8773b3983122ca935d02af80de Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 28 Aug 2017 11:23:45 -0700 Subject: IB/rdmavt: Add QP iterator API for QPs There are currently 3 spots in the qib and hfi1 driver that have knowledge of the internal QP hash list that should only be in scope to rdmavt QP code. Add an iterator API for processing all QPs to hide the nature of the RCU hashlist. The API consists of: - rvt_qp_iter_init() * For iterating QPs one at a time for seq_file semantics - rvt_qp_iter_next() * For iterating QPs one at a time for seq_file semantics - rvt_qp_iter() * For iterating all QPs The first two are used for things like seq_file prints. The last is for code that just needs to iterate all QPs in the system. 
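A usage sketch may help here (an editor's illustration, not part of the patch): the helper names dump_qp_cb(), dump_all_qps(), and dump_all_qps_seq() are hypothetical assumptions, while the rvt_qp_iter*() signatures, the public iter->qp field, and the locking rules are taken from the diff below.

#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <rdma/rdma_vt.h>

/*
 * Callback invoked once per QP.  Per the loop in the patch, rvt_qp_iter()
 * holds a QP reference and has dropped the RCU read lock around this call.
 */
static void dump_qp_cb(struct rvt_qp *qp, u64 v)
{
	pr_info("qp %u in state %d\n", qp->ibqp.qp_num, qp->state);
}

/* One-shot walk of every QP on the device. */
static void dump_all_qps(struct rvt_dev_info *rdi)
{
	rvt_qp_iter(rdi, 0, dump_qp_cb);
}

/*
 * seq_file-style walk: the caller supplies RCU protection, since
 * rvt_qp_iter_next() is annotated __must_hold(RCU).  The callback slot is
 * never invoked on this path, so NULL is safe here.
 */
static void dump_all_qps_seq(struct rvt_dev_info *rdi)
{
	struct rvt_qp_iter *iter = rvt_qp_iter_init(rdi, 0, NULL);

	if (!iter)
		return;
	rcu_read_lock();
	while (!rvt_qp_iter_next(iter))
		pr_info("qp %u\n", iter->qp->ibqp.qp_num);
	rcu_read_unlock();
	kfree(iter);
}

A design point visible in the rvt_qp_iter() body below: it takes a QP reference and drops the RCU read lock around each callback invocation, so a callback may block; the fine-grained iterator instead leaves RCU entirely to the caller, to match seq_file start/next semantics.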
Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 144 ++++++++++++++++++++++++++++++++++++++ include/rdma/rdmavt_qp.h | 29 ++++++++ 2 files changed, 173 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3a238b00885e..9f70fd8665ab 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2069,3 +2069,147 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t) return HRTIMER_NORESTART; } EXPORT_SYMBOL(rvt_rc_rnr_retry); + +/** + * rvt_qp_iter_init - initial for QP iteration + * @rdi - rvt devinfo + * @v - u64 value + * + * This returns an iterator suitable for iterating QPs + * in the system. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * @cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * Use cases that require memory allocation to succeed + * must preallocate appropriately. + * + * Return: a pointer to an rvt_qp_iter or NULL + */ +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + struct rvt_qp_iter *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->rdi = rdi; + /* number of special QPs (SMI/GSI) for device */ + i->specials = rdi->ibdev.phys_port_cnt * 2; + i->v = v; + i->cb = cb; + + return i; +} +EXPORT_SYMBOL(rvt_qp_iter_init); + +/** + * rvt_qp_iter_next - return the next QP in iter + * @iter - the iterator + * + * Fine grained QP iterator suitable for use + * with debugfs seq_file mechanisms. + * + * Updates iter->qp with the current QP when the return + * value is 0. + * + * Return: 0 - iter->qp is valid 1 - no more QPs + */ +int rvt_qp_iter_next(struct rvt_qp_iter *iter) + __must_hold(RCU) +{ + int n = iter->n; + int ret = 1; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; + struct rvt_dev_info *rdi = iter->rdi; + + /* + * The approach is to consider the special qps + * as additional table entries before the + * real hash table. Since the qp code sets + * the qp->next hash link to NULL, this works just fine. + * + * iter->specials is 2 * # ports + * + * n = 0..iter->specials is the special qp indices + * + * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are + * the potential hash bucket entries + * + */ + for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) { + if (pqp) { + qp = rcu_dereference(pqp->next); + } else { + if (n < iter->specials) { + struct rvt_ibport *rvp; + int pidx; + + pidx = n % rdi->ibdev.phys_port_cnt; + rvp = rdi->ports[pidx]; + qp = rcu_dereference(rvp->qp[n & 1]); + } else { + qp = rcu_dereference( + rdi->qp_dev->qp_table[ + (n - iter->specials)]); + } + } + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} +EXPORT_SYMBOL(rvt_qp_iter_next); + +/** + * rvt_qp_iter - iterate all QPs + * @rdi - rvt devinfo + * @v - a 64 bit value + * @cb - a callback + * + * This provides a way for iterating all QPs. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * The code has an internal iterator to simplify + * non seq_file use cases. 
+ */ +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + int ret; + struct rvt_qp_iter i = { + .rdi = rdi, + .specials = rdi->ibdev.phys_port_cnt * 2, + .v = v, + .cb = cb + }; + + rcu_read_lock(); + do { + ret = rvt_qp_iter_next(&i); + if (!ret) { + rvt_get_qp(i.qp); + rcu_read_unlock(); + i.cb(i.qp, i.v); + rcu_read_lock(); + rvt_put_qp(i.qp); + } + } while (!ret); + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_qp_iter); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 8fbafb0ce674..dfeb311c30a1 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -673,4 +673,33 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +/** + * struct rvt_qp_iter - the iterator for QPs + * @qp - the current QP + * + * This structure defines the current iterator + * state for sequenced access to all QPs relative + * to an rvt_dev_info. + */ +struct rvt_qp_iter { + struct rvt_qp *qp; + /* private: backpointer */ + struct rvt_dev_info *rdi; + /* private: callback routine */ + void (*cb)(struct rvt_qp *qp, u64 v); + /* private: for arg to callback routine */ + u64 v; + /* private: number of SMI,GSI QPs for device */ + int specials; + /* private: current iterator index */ + int n; +}; + +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); +int rvt_qp_iter_next(struct rvt_qp_iter *iter); +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); #endif /* DEF_RDMAVT_INCQP_H */ -- cgit v1.2.3 From 0208da90def5776cef940f9de4ffe6ecef346207 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 28 Aug 2017 11:24:10 -0700 Subject: IB/rdmavt: Handle dereg of inuse MRs properly A destroy of an MR prior to destroying the QP can cause the following diagnostic if the QP is referencing the MR being de-registered: hfi1 0000:05:00.0: hfi1_0: rvt_dereg_mr timeout mr ffff880856210800 pd ffff880859b20b00 The solution is that, when a non-zero refcount is encountered as the MR is destroyed, the QPs need to be iterated, looking for QPs in the same PD as the MR. If rvt_qp_mr_clean() detects that any such QP references the rkey/lkey, the QP needs to be put into an error state via a call to rvt_error_qp(), which will trigger the cleanup of any stuck references. This solution is as specified in IBTA 1.3 Volume 1 11.2.10.5. [This is reproduced with the 0.4.9 version of qperf and the rc_bw test] Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/mr.c | 121 ++++++++++++++++++++++++++++++------ drivers/infiniband/sw/rdmavt/qp.c | 112 +++++++++++++++++++++++++++++++++-- include/rdma/rdmavt_mr.h | 3 + include/rdma/rdmavt_qp.h | 1 + 4 files changed, 216 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 1b3801f78e78..42713511b53b 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -440,6 +440,105 @@ bail_umem: return ret; } +/** + * rvt_dereg_clean_qp_cb - callback from iterator + * @qp - the qp + * @v - the mregion (as u64) + * + * This routine fields the callback for all QPs and + * for QPs in the same PD as the MR will call the + * rvt_qp_mr_clean() to potentially cleanup references. 
+ */ +static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v) +{ + struct rvt_mregion *mr = (struct rvt_mregion *)v; + + /* skip PDs that are not ours */ + if (mr->pd != qp->ibqp.pd) + return; + rvt_qp_mr_clean(qp, mr->lkey); +} + +/** + * rvt_dereg_clean_qps - find QPs for reference cleanup + * @mr - the MR that is being deregistered + * + * This routine iterates RC QPs looking for references + * to the lkey noted in mr. + */ +static void rvt_dereg_clean_qps(struct rvt_mregion *mr) +{ + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb); +} + +/** + * rvt_check_refs - check references + * @mr - the megion + * @t - the caller identification + * + * This routine checks MRs holding a reference during + * when being de-registered. + * + * If the count is non-zero, the code calls a clean routine then + * waits for the timeout for the count to zero. + */ +static int rvt_check_refs(struct rvt_mregion *mr, const char *t) +{ + unsigned long timeout; + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + if (percpu_ref_is_zero(&mr->refcount)) + return 0; + /* avoid dma mr */ + if (mr->lkey) + rvt_dereg_clean_qps(mr); + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); + if (!timeout) { + rvt_pr_err(rdi, + "%s timeout mr %p pd %p lkey %x refcount %ld\n", + t, mr, mr->pd, mr->lkey, + atomic_long_read(&mr->refcount.count)); + rvt_get_mr(mr); + return -EBUSY; + } + return 0; +} + +/** + * rvt_mr_has_lkey - is MR + * @mr - the mregion + * @lkey - the lkey + */ +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey) +{ + return mr && lkey == mr->lkey; +} + +/** + * rvt_ss_has_lkey - is mr in sge tests + * @ss - the sge state + * @lkey + * + * This code tests for an MR in the indicated + * sge state. 
+ */ +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey) +{ + int i; + bool rval = false; + + if (!ss->num_sge) + return rval; + /* first one */ + rval = rvt_mr_has_lkey(ss->sge.mr, lkey); + /* any others */ + for (i = 0; !rval && i < ss->num_sge - 1; i++) + rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey); + return rval; +} + /** * rvt_dereg_mr - unregister and free a memory region * @ibmr: the memory region to free @@ -453,22 +552,14 @@ bail_umem: int rvt_dereg_mr(struct ib_mr *ibmr) { struct rvt_mr *mr = to_imr(ibmr); - struct rvt_dev_info *rdi = ib_to_rvt(ibmr->pd->device); - int ret = 0; - unsigned long timeout; + int ret; rvt_free_lkey(&mr->mr); rvt_put_mr(&mr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&mr->mr.comp, 5 * HZ); - if (!timeout) { - rvt_pr_err(rdi, - "rvt_dereg_mr timeout mr %p pd %p\n", - mr, mr->mr.pd); - rvt_get_mr(&mr->mr); - ret = -EBUSY; + ret = rvt_check_refs(&mr->mr, __func__); + if (ret) goto out; - } rvt_deinit_mregion(&mr->mr); if (mr->umem) ib_umem_release(mr->umem); @@ -761,16 +852,12 @@ int rvt_dealloc_fmr(struct ib_fmr *ibfmr) { struct rvt_fmr *fmr = to_ifmr(ibfmr); int ret = 0; - unsigned long timeout; rvt_free_lkey(&fmr->mr); rvt_put_mr(&fmr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&fmr->mr.comp, 5 * HZ); - if (!timeout) { - rvt_get_mr(&fmr->mr); - ret = -EBUSY; + ret = rvt_check_refs(&fmr->mr, __func__); + if (ret) goto out; - } rvt_deinit_mregion(&fmr->mr); kfree(fmr); out: diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 9f70fd8665ab..22df09ae809e 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -458,10 +458,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) } } - if (qp->ibqp.qp_type != IB_QPT_RC) - return; - - for (n = 0; n < rvt_max_atomic(rdi); n++) { + for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; if (e->rdma_sge.mr) { @@ -471,6 +468,113 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) } } +/** + * rvt_swqe_has_lkey - return true if lkey is used by swqe + * @wqe - the send wqe + * @lkey - the lkey + * + * Test the swqe for using lkey + */ +static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey) +{ + int i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + if (rvt_mr_has_lkey(sge->mr, lkey)) + return true; + } + return false; +} + +/** + * rvt_qp_sends_has_lkey - return true is qp sends use lkey + * @qp - the rvt_qp + * @lkey - the lkey + */ +static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + u32 s_last = qp->s_last; + + while (s_last != qp->s_head) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last); + + if (rvt_swqe_has_lkey(wqe, lkey)) + return true; + + if (++s_last >= qp->s_size) + s_last = 0; + } + if (qp->s_rdma_mr) + if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey)) + return true; + return false; +} + +/** + * rvt_qp_acks_has_lkey - return true if acks have lkey + * @qp - the qp + * @lkey - the lkey + */ +static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + int i; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey)) + return true; + } + return false; +} + +/* + * rvt_qp_mr_clean - clean up remote ops for lkey + * @qp - the qp + * @lkey - the lkey 
that is being de-registered + * + * This routine checks if the lkey is being used by + * the qp. + * + * If so, the qp is put into an error state to elminate + * any references from the qp. + */ +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey) +{ + bool lastwqe = false; + + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + /* avoid special QPs */ + return; + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto check_lwqe; + + if (rvt_ss_has_lkey(&qp->r_sge, lkey) || + rvt_qp_sends_has_lkey(qp, lkey) || + rvt_qp_acks_has_lkey(qp, lkey)) + lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR); +check_lwqe: + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + /** * rvt_remove_qp - remove qp form table * @rdi: rvt dev struct diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index f418bd5571a5..72a3856d4057 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -191,4 +191,7 @@ static inline void rvt_skip_sge(struct rvt_sge_state *ss, u32 length, } } +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey); +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey); + #endif /* DEF_RDMAVT_INCMRH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index dfeb311c30a1..0eed3d8752fa 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -702,4 +702,5 @@ int rvt_qp_iter_next(struct rvt_qp_iter *iter); void rvt_qp_iter(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey); #endif /* DEF_RDMAVT_INCQP_H */ -- cgit v1.2.3 From b339752d054fb32863418452dff350a1086885b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 28 Aug 2017 14:51:27 -0700 Subject: cpumask: fix spurious cpumask_of_node() on non-NUMA multi-node configs When !NUMA, cpumask_of_node(@node) equals cpu_online_mask regardless of @node. The assumption seems that if !NUMA, there shouldn't be more than one node and thus reporting cpu_online_mask regardless of @node is correct. However, that assumption was broken years ago to support DISCONTIGMEM and whether a system has multiple nodes or not is separately controlled by NEED_MULTIPLE_NODES. This means that, on a system with !NUMA && NEED_MULTIPLE_NODES, cpumask_of_node() will report cpu_online_mask for all possible nodes, indicating that the CPUs are associated with multiple nodes which is an impossible configuration. This bug has been around forever but doesn't look like it has caused any noticeable symptoms. However, it triggers a WARN recently added to workqueue to verify NUMA affinity configuration. Fix it by reporting empty cpumask on non-zero nodes if !NUMA. 
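The semantics of the fix can be illustrated with a small sketch (an editor's illustration, not part of the patch): the function name cpu_maps_to_one_node() is a hypothetical assumption, while cpumask_of_node(), cpu_online_mask, cpu_none_mask, and CONFIG_NEED_MULTIPLE_NODES are the symbols touched by the diff below.

#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/topology.h>

/*
 * With the fix, a !NUMA kernel built with CONFIG_NEED_MULTIPLE_NODES
 * reports cpu_online_mask only for node 0 and cpu_none_mask for every
 * other node, so an online CPU appears in exactly one node's mask.
 * Before the fix, every possible node reported cpu_online_mask, and this
 * check would count one hit per node.
 */
static bool cpu_maps_to_one_node(int cpu)
{
	int node, hits = 0;

	for_each_node(node)
		if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
			hits++;

	return hits == 1;
}

This is, in spirit, the NUMA-affinity invariant that the recently added workqueue WARN mentioned above verifies: a CPU must not appear to be associated with more than one node.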
Signed-off-by: Tejun Heo Reported-and-tested-by: Geert Uytterhoeven Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- include/asm-generic/topology.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index fc824e2828f3..5d2add1a6c96 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -48,7 +48,11 @@ #define parent_node(node) ((void)(node),0) #endif #ifndef cpumask_of_node -#define cpumask_of_node(node) ((void)node, cpu_online_mask) + #ifdef CONFIG_NEED_MULTIPLE_NODES + #define cpumask_of_node(node) ((node) == 0 ? cpu_online_mask : cpu_none_mask) + #else + #define cpumask_of_node(node) ((void)node, cpu_online_mask) + #endif #endif #ifndef pcibus_to_node #define pcibus_to_node(bus) ((void)(bus), -1) -- cgit v1.2.3 From 5aa5911a0ed9355ebb02f83de4be9ba435a45a2c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 23 Aug 2017 16:54:43 -0600 Subject: ACPI / blacklist: add acpi_match_platform_list() ACPI OEM ID / OEM Table ID / Revision can be used to identify a platform based on ACPI firmware info. acpi_blacklisted(), intel_pstate_platform_pwr_mgmt_exists(), and some other functions have been using a similar check to detect a list of platforms that require special handling. Move the platform check in acpi_blacklisted() to a new common utility function, acpi_match_platform_list(), so that other drivers do not have to implement their own version. There is no change in functionality. Signed-off-by: Toshi Kani Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/blacklist.c | 83 ++++++---------------------------------- drivers/acpi/utils.c | 36 +++++++++++++++++++++ include/linux/acpi.h | 19 +++++++++++ 3 files changed, 69 insertions(+), 69 deletions(-) (limited to 'include') diff --git a/drivers/acpi/blacklist.c b/drivers/acpi/blacklist.c index bb542acc0574..037fd537bbf6 100644 --- a/drivers/acpi/blacklist.c +++ b/drivers/acpi/blacklist.c @@ -30,30 +30,13 @@ #include "internal.h" -enum acpi_blacklist_predicates { - all_versions, - less_than_or_equal, - equal, - greater_than_or_equal, -}; - -struct acpi_blacklist_item { - char oem_id[7]; - char oem_table_id[9]; - u32 oem_revision; - char *table; - enum acpi_blacklist_predicates oem_revision_predicate; - char *reason; - u32 is_critical_error; -}; - static struct dmi_system_id acpi_rev_dmi_table[] __initdata; /* * POLICY: If *anything* doesn't work, put it on the blacklist. * If they are critical errors, mark it critical, and abort driver load. 
*/ -static struct acpi_blacklist_item acpi_blacklist[] __initdata = { +static struct acpi_platform_list acpi_blacklist[] __initdata = { /* Compaq Presario 1700 */ {"PTLTD ", " DSDT ", 0x06040000, ACPI_SIG_DSDT, less_than_or_equal, "Multiple problems", 1}, @@ -67,65 +50,27 @@ static struct acpi_blacklist_item acpi_blacklist[] __initdata = { {"IBM ", "TP600E ", 0x00000105, ACPI_SIG_DSDT, less_than_or_equal, "Incorrect _ADR", 1}, - {""} + { } }; int __init acpi_blacklisted(void) { - int i = 0; + int i; int blacklisted = 0; - struct acpi_table_header table_header; - - while (acpi_blacklist[i].oem_id[0] != '\0') { - if (acpi_get_table_header(acpi_blacklist[i].table, 0, &table_header)) { - i++; - continue; - } - - if (strncmp(acpi_blacklist[i].oem_id, table_header.oem_id, 6)) { - i++; - continue; - } - - if (strncmp - (acpi_blacklist[i].oem_table_id, table_header.oem_table_id, - 8)) { - i++; - continue; - } - - if ((acpi_blacklist[i].oem_revision_predicate == all_versions) - || (acpi_blacklist[i].oem_revision_predicate == - less_than_or_equal - && table_header.oem_revision <= - acpi_blacklist[i].oem_revision) - || (acpi_blacklist[i].oem_revision_predicate == - greater_than_or_equal - && table_header.oem_revision >= - acpi_blacklist[i].oem_revision) - || (acpi_blacklist[i].oem_revision_predicate == equal - && table_header.oem_revision == - acpi_blacklist[i].oem_revision)) { - printk(KERN_ERR PREFIX - "Vendor \"%6.6s\" System \"%8.8s\" " - "Revision 0x%x has a known ACPI BIOS problem.\n", - acpi_blacklist[i].oem_id, - acpi_blacklist[i].oem_table_id, - acpi_blacklist[i].oem_revision); + i = acpi_match_platform_list(acpi_blacklist); + if (i >= 0) { + pr_err(PREFIX "Vendor \"%6.6s\" System \"%8.8s\" Revision 0x%x has a known ACPI BIOS problem.\n", + acpi_blacklist[i].oem_id, + acpi_blacklist[i].oem_table_id, + acpi_blacklist[i].oem_revision); - printk(KERN_ERR PREFIX - "Reason: %s. This is a %s error\n", - acpi_blacklist[i].reason, - (acpi_blacklist[i]. - is_critical_error ? "non-recoverable" : - "recoverable")); + pr_err(PREFIX "Reason: %s. This is a %s error\n", + acpi_blacklist[i].reason, + (acpi_blacklist[i].data ? + "non-recoverable" : "recoverable")); - blacklisted = acpi_blacklist[i].is_critical_error; - break; - } else { - i++; - } + blacklisted = acpi_blacklist[i].data; } (void)early_acpi_osi_init(); diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index b9d956c916f5..0a9e5979aaa9 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -816,3 +816,39 @@ static int __init acpi_backlight(char *str) return 1; } __setup("acpi_backlight=", acpi_backlight); + +/** + * acpi_match_platform_list - Check if the system matches with a given list + * @plat: pointer to acpi_platform_list table terminated by a NULL entry + * + * Return the matched index if the system is found in the platform list. + * Otherwise, return a negative error code. 
+ */ +int acpi_match_platform_list(const struct acpi_platform_list *plat) +{ + struct acpi_table_header hdr; + int idx = 0; + + if (acpi_disabled) + return -ENODEV; + + for (; plat->oem_id[0]; plat++, idx++) { + if (ACPI_FAILURE(acpi_get_table_header(plat->table, 0, &hdr))) + continue; + + if (strncmp(plat->oem_id, hdr.oem_id, ACPI_OEM_ID_SIZE)) + continue; + + if (strncmp(plat->oem_table_id, hdr.oem_table_id, ACPI_OEM_TABLE_ID_SIZE)) + continue; + + if ((plat->pred == all_versions) || + (plat->pred == less_than_or_equal && hdr.oem_revision <= plat->oem_revision) || + (plat->pred == greater_than_or_equal && hdr.oem_revision >= plat->oem_revision) || + (plat->pred == equal && hdr.oem_revision == plat->oem_revision)) + return idx; + } + + return -ENODEV; +} +EXPORT_SYMBOL(acpi_match_platform_list); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 27b4b6615263..1c8a97d8b09b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -556,6 +556,25 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle, #define ACPI_OST_SC_DRIVER_LOAD_FAILURE 0x81 #define ACPI_OST_SC_INSERT_NOT_SUPPORTED 0x82 +enum acpi_predicate { + all_versions, + less_than_or_equal, + equal, + greater_than_or_equal, +}; + +/* Table must be terminted by a NULL entry */ +struct acpi_platform_list { + char oem_id[ACPI_OEM_ID_SIZE+1]; + char oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1]; + u32 oem_revision; + char *table; + enum acpi_predicate pred; + char *reason; + u32 data; +}; +int acpi_match_platform_list(const struct acpi_platform_list *plat); + extern void acpi_early_init(void); extern void acpi_subsystem_init(void); -- cgit v1.2.3 From 5bf916ee0ab638c86edeaf4caeeade9ddf44d95d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 27 Aug 2017 17:03:33 +0200 Subject: irda: move include/net/irda into staging subdirectory And finally, move the irda include files into drivers/staging/irda/include/net/irda. Yes, it's a long path, but it makes it easy for us to just add a Makefile directory path addition and all of the net and drivers code "just works". Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. 
Miller --- drivers/staging/irda/drivers/Makefile | 2 + drivers/staging/irda/include/net/irda/af_irda.h | 87 ++++++ drivers/staging/irda/include/net/irda/crc.h | 29 ++ drivers/staging/irda/include/net/irda/discovery.h | 95 +++++++ .../staging/irda/include/net/irda/ircomm_core.h | 106 +++++++ .../staging/irda/include/net/irda/ircomm_event.h | 83 ++++++ drivers/staging/irda/include/net/irda/ircomm_lmp.h | 36 +++ .../staging/irda/include/net/irda/ircomm_param.h | 147 ++++++++++ drivers/staging/irda/include/net/irda/ircomm_ttp.h | 37 +++ drivers/staging/irda/include/net/irda/ircomm_tty.h | 121 ++++++++ .../irda/include/net/irda/ircomm_tty_attach.h | 92 ++++++ drivers/staging/irda/include/net/irda/irda.h | 115 ++++++++ .../staging/irda/include/net/irda/irda_device.h | 285 +++++++++++++++++++ drivers/staging/irda/include/net/irda/iriap.h | 108 +++++++ .../staging/irda/include/net/irda/iriap_event.h | 85 ++++++ .../staging/irda/include/net/irda/irias_object.h | 108 +++++++ .../staging/irda/include/net/irda/irlan_client.h | 42 +++ .../staging/irda/include/net/irda/irlan_common.h | 230 +++++++++++++++ drivers/staging/irda/include/net/irda/irlan_eth.h | 32 +++ .../staging/irda/include/net/irda/irlan_event.h | 81 ++++++ .../staging/irda/include/net/irda/irlan_filter.h | 35 +++ .../staging/irda/include/net/irda/irlan_provider.h | 52 ++++ drivers/staging/irda/include/net/irda/irlap.h | 311 +++++++++++++++++++++ .../staging/irda/include/net/irda/irlap_event.h | 129 +++++++++ .../staging/irda/include/net/irda/irlap_frame.h | 167 +++++++++++ drivers/staging/irda/include/net/irda/irlmp.h | 295 +++++++++++++++++++ .../staging/irda/include/net/irda/irlmp_event.h | 98 +++++++ .../staging/irda/include/net/irda/irlmp_frame.h | 62 ++++ drivers/staging/irda/include/net/irda/irmod.h | 109 ++++++++ drivers/staging/irda/include/net/irda/irqueue.h | 96 +++++++ drivers/staging/irda/include/net/irda/irttp.h | 210 ++++++++++++++ drivers/staging/irda/include/net/irda/parameters.h | 100 +++++++ drivers/staging/irda/include/net/irda/qos.h | 101 +++++++ drivers/staging/irda/include/net/irda/timer.h | 105 +++++++ drivers/staging/irda/include/net/irda/wrapper.h | 58 ++++ drivers/staging/irda/net/Makefile | 2 + include/net/irda/af_irda.h | 87 ------ include/net/irda/crc.h | 29 -- include/net/irda/discovery.h | 95 ------- include/net/irda/ircomm_core.h | 106 ------- include/net/irda/ircomm_event.h | 83 ------ include/net/irda/ircomm_lmp.h | 36 --- include/net/irda/ircomm_param.h | 147 ---------- include/net/irda/ircomm_ttp.h | 37 --- include/net/irda/ircomm_tty.h | 121 -------- include/net/irda/ircomm_tty_attach.h | 92 ------ include/net/irda/irda.h | 115 -------- include/net/irda/irda_device.h | 285 ------------------- include/net/irda/iriap.h | 108 ------- include/net/irda/iriap_event.h | 85 ------ include/net/irda/irias_object.h | 108 ------- include/net/irda/irlan_client.h | 42 --- include/net/irda/irlan_common.h | 230 --------------- include/net/irda/irlan_eth.h | 32 --- include/net/irda/irlan_event.h | 81 ------ include/net/irda/irlan_filter.h | 35 --- include/net/irda/irlan_provider.h | 52 ---- include/net/irda/irlap.h | 311 --------------------- include/net/irda/irlap_event.h | 129 --------- include/net/irda/irlap_frame.h | 167 ----------- include/net/irda/irlmp.h | 295 ------------------- include/net/irda/irlmp_event.h | 98 ------- include/net/irda/irlmp_frame.h | 62 ---- include/net/irda/irmod.h | 109 -------- include/net/irda/irqueue.h | 96 ------- include/net/irda/irttp.h | 210 -------------- 
include/net/irda/parameters.h | 100 ------- include/net/irda/qos.h | 101 ------- include/net/irda/timer.h | 105 ------- include/net/irda/wrapper.h | 58 ---- 70 files changed, 3851 insertions(+), 3847 deletions(-) create mode 100644 drivers/staging/irda/include/net/irda/af_irda.h create mode 100644 drivers/staging/irda/include/net/irda/crc.h create mode 100644 drivers/staging/irda/include/net/irda/discovery.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_core.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_event.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_lmp.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_param.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_ttp.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_tty.h create mode 100644 drivers/staging/irda/include/net/irda/ircomm_tty_attach.h create mode 100644 drivers/staging/irda/include/net/irda/irda.h create mode 100644 drivers/staging/irda/include/net/irda/irda_device.h create mode 100644 drivers/staging/irda/include/net/irda/iriap.h create mode 100644 drivers/staging/irda/include/net/irda/iriap_event.h create mode 100644 drivers/staging/irda/include/net/irda/irias_object.h create mode 100644 drivers/staging/irda/include/net/irda/irlan_client.h create mode 100644 drivers/staging/irda/include/net/irda/irlan_common.h create mode 100644 drivers/staging/irda/include/net/irda/irlan_eth.h create mode 100644 drivers/staging/irda/include/net/irda/irlan_event.h create mode 100644 drivers/staging/irda/include/net/irda/irlan_filter.h create mode 100644 drivers/staging/irda/include/net/irda/irlan_provider.h create mode 100644 drivers/staging/irda/include/net/irda/irlap.h create mode 100644 drivers/staging/irda/include/net/irda/irlap_event.h create mode 100644 drivers/staging/irda/include/net/irda/irlap_frame.h create mode 100644 drivers/staging/irda/include/net/irda/irlmp.h create mode 100644 drivers/staging/irda/include/net/irda/irlmp_event.h create mode 100644 drivers/staging/irda/include/net/irda/irlmp_frame.h create mode 100644 drivers/staging/irda/include/net/irda/irmod.h create mode 100644 drivers/staging/irda/include/net/irda/irqueue.h create mode 100644 drivers/staging/irda/include/net/irda/irttp.h create mode 100644 drivers/staging/irda/include/net/irda/parameters.h create mode 100644 drivers/staging/irda/include/net/irda/qos.h create mode 100644 drivers/staging/irda/include/net/irda/timer.h create mode 100644 drivers/staging/irda/include/net/irda/wrapper.h delete mode 100644 include/net/irda/af_irda.h delete mode 100644 include/net/irda/crc.h delete mode 100644 include/net/irda/discovery.h delete mode 100644 include/net/irda/ircomm_core.h delete mode 100644 include/net/irda/ircomm_event.h delete mode 100644 include/net/irda/ircomm_lmp.h delete mode 100644 include/net/irda/ircomm_param.h delete mode 100644 include/net/irda/ircomm_ttp.h delete mode 100644 include/net/irda/ircomm_tty.h delete mode 100644 include/net/irda/ircomm_tty_attach.h delete mode 100644 include/net/irda/irda.h delete mode 100644 include/net/irda/irda_device.h delete mode 100644 include/net/irda/iriap.h delete mode 100644 include/net/irda/iriap_event.h delete mode 100644 include/net/irda/irias_object.h delete mode 100644 include/net/irda/irlan_client.h delete mode 100644 include/net/irda/irlan_common.h delete mode 100644 include/net/irda/irlan_eth.h delete mode 100644 include/net/irda/irlan_event.h delete mode 100644 include/net/irda/irlan_filter.h delete mode 
100644 include/net/irda/irlan_provider.h delete mode 100644 include/net/irda/irlap.h delete mode 100644 include/net/irda/irlap_event.h delete mode 100644 include/net/irda/irlap_frame.h delete mode 100644 include/net/irda/irlmp.h delete mode 100644 include/net/irda/irlmp_event.h delete mode 100644 include/net/irda/irlmp_frame.h delete mode 100644 include/net/irda/irmod.h delete mode 100644 include/net/irda/irqueue.h delete mode 100644 include/net/irda/irttp.h delete mode 100644 include/net/irda/parameters.h delete mode 100644 include/net/irda/qos.h delete mode 100644 include/net/irda/timer.h delete mode 100644 include/net/irda/wrapper.h (limited to 'include') diff --git a/drivers/staging/irda/drivers/Makefile b/drivers/staging/irda/drivers/Makefile index 4c344433dae5..e2901b135528 100644 --- a/drivers/staging/irda/drivers/Makefile +++ b/drivers/staging/irda/drivers/Makefile @@ -5,6 +5,8 @@ # Rewritten to use lists instead of if-statements. # +subdir-ccflags-y += -I$(srctree)/drivers/staging/irda/include + # FIR drivers obj-$(CONFIG_USB_IRDA) += irda-usb.o obj-$(CONFIG_SIGMATEL_FIR) += stir4200.o diff --git a/drivers/staging/irda/include/net/irda/af_irda.h b/drivers/staging/irda/include/net/irda/af_irda.h new file mode 100644 index 000000000000..0df574931522 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/af_irda.h @@ -0,0 +1,87 @@ +/********************************************************************* + * + * Filename: af_irda.h + * Version: 1.0 + * Description: IrDA sockets declarations + * Status: Stable + * Author: Dag Brattli + * Created at: Tue Dec 9 21:13:12 1997 + * Modified at: Fri Jan 28 13:16:32 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#ifndef AF_IRDA_H +#define AF_IRDA_H + +#include +#include +#include /* struct iriap_cb */ +#include /* struct ias_value */ +#include /* struct lsap_cb */ +#include /* struct tsap_cb */ +#include /* struct discovery_t */ +#include + +/* IrDA Socket */ +struct irda_sock { + /* struct sock has to be the first member of irda_sock */ + struct sock sk; + __u32 saddr; /* my local address */ + __u32 daddr; /* peer address */ + + struct lsap_cb *lsap; /* LSAP used by Ultra */ + __u8 pid; /* Protocol IP (PID) used by Ultra */ + + struct tsap_cb *tsap; /* TSAP used by this connection */ + __u8 dtsap_sel; /* remote TSAP address */ + __u8 stsap_sel; /* local TSAP address */ + + __u32 max_sdu_size_rx; + __u32 max_sdu_size_tx; + __u32 max_data_size; + __u8 max_header_size; + struct qos_info qos_tx; + + __u16_host_order mask; /* Hint bits mask */ + __u16_host_order hints; /* Hint bits */ + + void *ckey; /* IrLMP client handle */ + void *skey; /* IrLMP service handle */ + + struct ias_object *ias_obj; /* Our service name + lsap in IAS */ + struct iriap_cb *iriap; /* Used to query remote IAS */ + struct ias_value *ias_result; /* Result of remote IAS query */ + + hashbin_t *cachelog; /* Result of discovery query */ + __u32 cachedaddr; /* Result of selective discovery query */ + + int nslots; /* Number of slots to use for discovery */ + + int errno; /* status of the IAS query */ + + wait_queue_head_t query_wait; /* Wait for the answer to a query */ + struct timer_list watchdog; /* Timeout for discovery */ + + LOCAL_FLOW tx_flow; + LOCAL_FLOW rx_flow; +}; + +static inline struct irda_sock *irda_sk(struct sock *sk) +{ + return (struct irda_sock *)sk; +} + +#endif /* AF_IRDA_H */ diff --git a/drivers/staging/irda/include/net/irda/crc.h b/drivers/staging/irda/include/net/irda/crc.h new file mode 100644 index 000000000000..f202296df9bb --- /dev/null +++ b/drivers/staging/irda/include/net/irda/crc.h @@ -0,0 +1,29 @@ +/********************************************************************* + * + * Filename: crc.h + * Version: + * Description: CRC routines + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Sun May 2 20:25:23 1999 + * Modified by: Dag Brattli + * + ********************************************************************/ + +#ifndef IRDA_CRC_H +#define IRDA_CRC_H + +#include +#include + +#define INIT_FCS 0xffff /* Initial FCS value */ +#define GOOD_FCS 0xf0b8 /* Good final FCS value */ + +/* Recompute the FCS with one more character appended. */ +#define irda_fcs(fcs, c) crc_ccitt_byte(fcs, c) + +/* Recompute the FCS with len bytes appended. */ +#define irda_calc_crc16(fcs, buf, len) crc_ccitt(fcs, buf, len) + +#endif diff --git a/drivers/staging/irda/include/net/irda/discovery.h b/drivers/staging/irda/include/net/irda/discovery.h new file mode 100644 index 000000000000..63ae32530567 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/discovery.h @@ -0,0 +1,95 @@ +/********************************************************************* + * + * Filename: discovery.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Apr 6 16:53:53 1999 + * Modified at: Tue Oct 5 10:05:10 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. 
+ * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef DISCOVERY_H +#define DISCOVERY_H + +#include + +#include +#include /* irda_queue_t */ +#include /* LAP_REASON */ + +#define DISCOVERY_EXPIRE_TIMEOUT (2*sysctl_discovery_timeout*HZ) +#define DISCOVERY_DEFAULT_SLOTS 0 + +/* + * This type is used by the protocols that transmit 16 bits words in + * little endian format. A little endian machine stores MSB of word in + * byte[1] and LSB in byte[0]. A big endian machine stores MSB in byte[0] + * and LSB in byte[1]. + * + * This structure is used in the code for things that are endian neutral + * but that fit in a word so that we can manipulate them efficiently. + * By endian neutral, I mean things that are really an array of bytes, + * and always used as such, for example the hint bits. Jean II + */ +typedef union { + __u16 word; + __u8 byte[2]; +} __u16_host_order; + +/* Types of discovery */ +typedef enum { + DISCOVERY_LOG, /* What's in our discovery log */ + DISCOVERY_ACTIVE, /* Doing our own discovery on the medium */ + DISCOVERY_PASSIVE, /* Peer doing discovery on the medium */ + EXPIRY_TIMEOUT, /* Entry expired due to timeout */ +} DISCOVERY_MODE; + +#define NICKNAME_MAX_LEN 21 + +/* Basic discovery information about a peer */ +typedef struct irda_device_info discinfo_t; /* linux/irda.h */ + +/* + * The DISCOVERY structure is used for both discovery requests and responses + */ +typedef struct discovery_t { + irda_queue_t q; /* Must be first! */ + + discinfo_t data; /* Basic discovery information */ + int name_len; /* Length of nickname */ + + LAP_REASON condition; /* More info about the discovery */ + int gen_addr_bit; /* Need to generate a new device + * address? */ + int nslots; /* Number of slots to use when + * discovering */ + unsigned long timestamp; /* Last time discovered */ + unsigned long firststamp; /* First time discovered */ +} discovery_t; + +void irlmp_add_discovery(hashbin_t *cachelog, discovery_t *discovery); +void irlmp_add_discovery_log(hashbin_t *cachelog, hashbin_t *log); +void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force); +struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn, + __u16 mask, int old_entries); + +#endif diff --git a/drivers/staging/irda/include/net/irda/ircomm_core.h b/drivers/staging/irda/include/net/irda/ircomm_core.h new file mode 100644 index 000000000000..2a580ce9edad --- /dev/null +++ b/drivers/staging/irda/include/net/irda/ircomm_core.h @@ -0,0 +1,106 @@ +/********************************************************************* + * + * Filename: ircomm_core.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Wed Jun 9 08:58:43 1999 + * Modified at: Mon Dec 13 11:52:29 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef IRCOMM_CORE_H +#define IRCOMM_CORE_H + +#include +#include +#include + +#define IRCOMM_MAGIC 0x98347298 +#define IRCOMM_HEADER_SIZE 1 + +struct ircomm_cb; /* Forward decl. */ + +/* + * A small call-table, so we don't have to check the service-type whenever + * we want to do something + */ +typedef struct { + int (*data_request)(struct ircomm_cb *, struct sk_buff *, int clen); + int (*connect_request)(struct ircomm_cb *, struct sk_buff *, + struct ircomm_info *); + int (*connect_response)(struct ircomm_cb *, struct sk_buff *); + int (*disconnect_request)(struct ircomm_cb *, struct sk_buff *, + struct ircomm_info *); +} call_t; + +struct ircomm_cb { + irda_queue_t queue; + magic_t magic; + + notify_t notify; + call_t issue; + + int state; + int line; /* Which TTY line we are using */ + + struct tsap_cb *tsap; + struct lsap_cb *lsap; + + __u8 dlsap_sel; /* Destination LSAP/TSAP selector */ + __u8 slsap_sel; /* Source LSAP/TSAP selector */ + + __u32 saddr; /* Source device address (link we are using) */ + __u32 daddr; /* Destination device address */ + + int max_header_size; /* Header space we must reserve for each frame */ + int max_data_size; /* The amount of data we can fill in each frame */ + + LOCAL_FLOW flow_status; /* Used by ircomm_lmp */ + int pkt_count; /* Number of frames we have sent to IrLAP */ + + __u8 service_type; +}; + +extern hashbin_t *ircomm; + +struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line); +int ircomm_close(struct ircomm_cb *self); + +int ircomm_data_request(struct ircomm_cb *self, struct sk_buff *skb); +void ircomm_data_indication(struct ircomm_cb *self, struct sk_buff *skb); +void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb); +int ircomm_control_request(struct ircomm_cb *self, struct sk_buff *skb); +int ircomm_connect_request(struct ircomm_cb *self, __u8 dlsap_sel, + __u32 saddr, __u32 daddr, struct sk_buff *skb, + __u8 service_type); +void ircomm_connect_indication(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info); +void ircomm_connect_confirm(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info); +int ircomm_connect_response(struct ircomm_cb *self, struct sk_buff *userdata); +int ircomm_disconnect_request(struct ircomm_cb *self, struct sk_buff *userdata); +void ircomm_disconnect_indication(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info); +void ircomm_flow_request(struct ircomm_cb *self, LOCAL_FLOW flow); + +#define ircomm_is_connected(self) (self->state == IRCOMM_CONN) + +#endif diff --git a/drivers/staging/irda/include/net/irda/ircomm_event.h b/drivers/staging/irda/include/net/irda/ircomm_event.h new file mode 100644 index 000000000000..5bbc32998d57 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/ircomm_event.h @@ -0,0 +1,83 @@ 
+/********************************************************************* + * + * Filename: ircomm_event.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Jun 6 23:51:13 1999 + * Modified at: Thu Jun 10 08:36:25 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef IRCOMM_EVENT_H +#define IRCOMM_EVENT_H + +#include + +typedef enum { + IRCOMM_IDLE, + IRCOMM_WAITI, + IRCOMM_WAITR, + IRCOMM_CONN, +} IRCOMM_STATE; + +/* IrCOMM Events */ +typedef enum { + IRCOMM_CONNECT_REQUEST, + IRCOMM_CONNECT_RESPONSE, + IRCOMM_TTP_CONNECT_INDICATION, + IRCOMM_LMP_CONNECT_INDICATION, + IRCOMM_TTP_CONNECT_CONFIRM, + IRCOMM_LMP_CONNECT_CONFIRM, + + IRCOMM_LMP_DISCONNECT_INDICATION, + IRCOMM_TTP_DISCONNECT_INDICATION, + IRCOMM_DISCONNECT_REQUEST, + + IRCOMM_TTP_DATA_INDICATION, + IRCOMM_LMP_DATA_INDICATION, + IRCOMM_DATA_REQUEST, + IRCOMM_CONTROL_REQUEST, + IRCOMM_CONTROL_INDICATION, +} IRCOMM_EVENT; + +/* + * Used for passing information through the state-machine + */ +struct ircomm_info { + __u32 saddr; /* Source device address */ + __u32 daddr; /* Destination device address */ + __u8 dlsap_sel; + LM_REASON reason; /* Reason for disconnect */ + __u32 max_data_size; + __u32 max_header_size; + + struct qos_info *qos; +}; + +extern const char *const ircomm_state[]; + +struct ircomm_cb; /* Forward decl. */ + +int ircomm_do_event(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +void ircomm_next_state(struct ircomm_cb *self, IRCOMM_STATE state); + +#endif diff --git a/drivers/staging/irda/include/net/irda/ircomm_lmp.h b/drivers/staging/irda/include/net/irda/ircomm_lmp.h new file mode 100644 index 000000000000..5042a5021a04 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/ircomm_lmp.h @@ -0,0 +1,36 @@ +/********************************************************************* + * + * Filename: ircomm_lmp.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Wed Jun 9 10:06:07 1999 + * Modified at: Fri Aug 13 07:32:32 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ * + ********************************************************************/ + +#ifndef IRCOMM_LMP_H +#define IRCOMM_LMP_H + +#include + +int ircomm_open_lsap(struct ircomm_cb *self); + +#endif diff --git a/drivers/staging/irda/include/net/irda/ircomm_param.h b/drivers/staging/irda/include/net/irda/ircomm_param.h new file mode 100644 index 000000000000..1f67432321c4 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/ircomm_param.h @@ -0,0 +1,147 @@ +/********************************************************************* + * + * Filename: ircomm_param.h + * Version: 1.0 + * Description: Parameter handling for the IrCOMM protocol + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Jun 7 08:47:28 1999 + * Modified at: Wed Aug 25 13:46:33 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef IRCOMM_PARAMS_H +#define IRCOMM_PARAMS_H + +#include + +/* Parameters common to all service types */ +#define IRCOMM_SERVICE_TYPE 0x00 +#define IRCOMM_PORT_TYPE 0x01 /* Only used in LM-IAS */ +#define IRCOMM_PORT_NAME 0x02 /* Only used in LM-IAS */ + +/* Parameters for both 3 wire and 9 wire */ +#define IRCOMM_DATA_RATE 0x10 +#define IRCOMM_DATA_FORMAT 0x11 +#define IRCOMM_FLOW_CONTROL 0x12 +#define IRCOMM_XON_XOFF 0x13 +#define IRCOMM_ENQ_ACK 0x14 +#define IRCOMM_LINE_STATUS 0x15 +#define IRCOMM_BREAK 0x16 + +/* Parameters for 9 wire */ +#define IRCOMM_DTE 0x20 +#define IRCOMM_DCE 0x21 +#define IRCOMM_POLL 0x22 + +/* Service type (details) */ +#define IRCOMM_3_WIRE_RAW 0x01 +#define IRCOMM_3_WIRE 0x02 +#define IRCOMM_9_WIRE 0x04 +#define IRCOMM_CENTRONICS 0x08 + +/* Port type (details) */ +#define IRCOMM_SERIAL 0x00 +#define IRCOMM_PARALLEL 0x01 + +/* Data format (details) */ +#define IRCOMM_WSIZE_5 0x00 +#define IRCOMM_WSIZE_6 0x01 +#define IRCOMM_WSIZE_7 0x02 +#define IRCOMM_WSIZE_8 0x03 + +#define IRCOMM_1_STOP_BIT 0x00 +#define IRCOMM_2_STOP_BIT 0x04 /* 1.5 if char len 5 */ + +#define IRCOMM_PARITY_DISABLE 0x00 +#define IRCOMM_PARITY_ENABLE 0x08 + +#define IRCOMM_PARITY_ODD 0x00 +#define IRCOMM_PARITY_EVEN 0x10 +#define IRCOMM_PARITY_MARK 0x20 +#define IRCOMM_PARITY_SPACE 0x30 + +/* Flow control */ +#define IRCOMM_XON_XOFF_IN 0x01 +#define IRCOMM_XON_XOFF_OUT 0x02 +#define IRCOMM_RTS_CTS_IN 0x04 +#define IRCOMM_RTS_CTS_OUT 0x08 +#define IRCOMM_DSR_DTR_IN 0x10 +#define IRCOMM_DSR_DTR_OUT 0x20 +#define IRCOMM_ENQ_ACK_IN 0x40 +#define IRCOMM_ENQ_ACK_OUT 0x80 + +/* Line status */ +#define IRCOMM_OVERRUN_ERROR 0x02 +#define IRCOMM_PARITY_ERROR 0x04 +#define IRCOMM_FRAMING_ERROR 0x08 + +/* DTE (Data terminal equipment) line settings */ +#define IRCOMM_DELTA_DTR 0x01 +#define IRCOMM_DELTA_RTS 0x02 +#define IRCOMM_DTR 0x04 +#define IRCOMM_RTS 0x08 + +/* DCE (Data communications equipment) line settings */ +#define IRCOMM_DELTA_CTS 0x01 /* 
Clear to send has changed */ +#define IRCOMM_DELTA_DSR 0x02 /* Data set ready has changed */ +#define IRCOMM_DELTA_RI 0x04 /* Ring indicator has changed */ +#define IRCOMM_DELTA_CD 0x08 /* Carrier detect has changed */ +#define IRCOMM_CTS 0x10 /* Clear to send is high */ +#define IRCOMM_DSR 0x20 /* Data set ready is high */ +#define IRCOMM_RI 0x40 /* Ring indicator is high */ +#define IRCOMM_CD 0x80 /* Carrier detect is high */ +#define IRCOMM_DCE_DELTA_ANY 0x0f + +/* + * Parameter state + */ +struct ircomm_params { + /* General control params */ + __u8 service_type; + __u8 port_type; + char port_name[32]; + + /* Control params for 3- and 9-wire service type */ + __u32 data_rate; /* Data rate in bps */ + __u8 data_format; + __u8 flow_control; + char xonxoff[2]; + char enqack[2]; + __u8 line_status; + __u8 _break; + + __u8 null_modem; + + /* Control params for 9-wire service type */ + __u8 dte; + __u8 dce; + __u8 poll; + + /* Control params for Centronics service type */ +}; + +struct ircomm_tty_cb; /* Forward decl. */ + +int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush); + +extern pi_param_info_t ircomm_param_info; + +#endif /* IRCOMM_PARAMS_H */ + diff --git a/drivers/staging/irda/include/net/irda/ircomm_ttp.h b/drivers/staging/irda/include/net/irda/ircomm_ttp.h new file mode 100644 index 000000000000..c5627288bca3 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/ircomm_ttp.h @@ -0,0 +1,37 @@ +/********************************************************************* + * + * Filename: ircomm_ttp.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Wed Jun 9 10:06:07 1999 + * Modified at: Fri Aug 13 07:32:22 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef IRCOMM_TTP_H +#define IRCOMM_TTP_H + +#include + +int ircomm_open_tsap(struct ircomm_cb *self); + +#endif + diff --git a/drivers/staging/irda/include/net/irda/ircomm_tty.h b/drivers/staging/irda/include/net/irda/ircomm_tty.h new file mode 100644 index 000000000000..8d4f588974bc --- /dev/null +++ b/drivers/staging/irda/include/net/irda/ircomm_tty.h @@ -0,0 +1,121 @@ +/********************************************************************* + * + * Filename: ircomm_tty.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Jun 6 23:24:22 1999 + * Modified at: Fri Jan 28 13:16:57 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. 
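The data-format, parity and flow-control values defined in ircomm_param.h above are plain bitmasks, so a port configuration is just an OR of them. A minimal sketch (the helper name is illustrative, not part of this patch):

    /* 8 data bits, 1 stop bit, no parity, RTS/CTS both directions */
    static void example_set_8n1(struct ircomm_params *p)
    {
            p->data_format = IRCOMM_WSIZE_8 | IRCOMM_1_STOP_BIT |
                             IRCOMM_PARITY_DISABLE;
            p->flow_control = IRCOMM_RTS_CTS_IN | IRCOMM_RTS_CTS_OUT;
    }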
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see .
+ *
+ ********************************************************************/
+
+#ifndef IRCOMM_TTY_H
+#define IRCOMM_TTY_H
+
+#include 
+#include 
+#include 
+#include  /* struct tty_struct */
+
+#include 
+#include 
+#include 
+
+#define IRCOMM_TTY_PORTS 32
+#define IRCOMM_TTY_MAGIC 0x3432
+#define IRCOMM_TTY_MAJOR 161
+#define IRCOMM_TTY_MINOR 0
+
+/* This is used as an initial value to max_header_size before the proper
+ * value is filled in (5 for ttp, 4 for lmp). This allows us to detect
+ * the state of the underlying connection. - Jean II */
+#define IRCOMM_TTY_HDR_UNINITIALISED 16
+/* Same for payload size. See qos.c for the smallest max data size */
+#define IRCOMM_TTY_DATA_UNINITIALISED (64 - IRCOMM_TTY_HDR_UNINITIALISED)
+
+/*
+ * IrCOMM TTY driver state
+ */
+struct ircomm_tty_cb {
+    irda_queue_t queue;       /* Must be first */
+    struct tty_port port;
+    magic_t magic;
+
+    int state;                /* Connect state */
+
+    struct ircomm_cb *ircomm; /* IrCOMM layer instance */
+
+    struct sk_buff *tx_skb;   /* Transmit buffer */
+    struct sk_buff *ctrl_skb; /* Control data buffer */
+
+    /* Parameters */
+    struct ircomm_params settings;
+
+    __u8 service_type;        /* The service that we support */
+    int client;               /* True if we are a client */
+    LOCAL_FLOW flow;          /* IrTTP flow status */
+
+    int line;
+
+    __u8 dlsap_sel;
+    __u8 slsap_sel;
+
+    __u32 saddr;
+    __u32 daddr;
+
+    __u32 max_data_size;      /* Max data we can transmit in one packet */
+    __u32 max_header_size;    /* The amount of header space we must reserve */
+    __u32 tx_data_size;       /* Max data size of current tx_skb */
+
+    struct iriap_cb *iriap;   /* Instance used for querying remote IAS */
+    struct ias_object* obj;
+    void *skey;
+    void *ckey;
+
+    struct timer_list watchdog_timer;
+    struct work_struct tqueue;
+
+    /* Protect concurrent access to:
+     *  o self->ctrl_skb
+     *  o self->tx_skb
+     * Maybe other things would benefit from being protected as well...
+     * Jean II */
+    spinlock_t spinlock;
+};
+
+void ircomm_tty_start(struct tty_struct *tty);
+void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self);
+
+int ircomm_tty_tiocmget(struct tty_struct *tty);
+int ircomm_tty_tiocmset(struct tty_struct *tty, unsigned int set,
+                        unsigned int clear);
+int ircomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd,
+                     unsigned long arg);
+void ircomm_tty_set_termios(struct tty_struct *tty,
+                            struct ktermios *old_termios);
+
+#endif
+
+
+
+
+
+
diff --git a/drivers/staging/irda/include/net/irda/ircomm_tty_attach.h b/drivers/staging/irda/include/net/irda/ircomm_tty_attach.h
new file mode 100644
index 000000000000..20dcbdf258cf
--- /dev/null
+++ b/drivers/staging/irda/include/net/irda/ircomm_tty_attach.h
@@ -0,0 +1,92 @@
+/*********************************************************************
+ *
+ * Filename: ircomm_tty_attach.h
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli
+ * Created at: Wed Jun 9 15:55:18 1999
+ * Modified at: Fri Dec 10 21:04:55 1999
+ * Modified by: Dag Brattli
+ *
+ * Copyright (c) 1999 Dag Brattli, All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef IRCOMM_TTY_ATTACH_H +#define IRCOMM_TTY_ATTACH_H + +#include + +typedef enum { + IRCOMM_TTY_IDLE, + IRCOMM_TTY_SEARCH, + IRCOMM_TTY_QUERY_PARAMETERS, + IRCOMM_TTY_QUERY_LSAP_SEL, + IRCOMM_TTY_SETUP, + IRCOMM_TTY_READY, +} IRCOMM_TTY_STATE; + +/* IrCOMM TTY Events */ +typedef enum { + IRCOMM_TTY_ATTACH_CABLE, + IRCOMM_TTY_DETACH_CABLE, + IRCOMM_TTY_DATA_REQUEST, + IRCOMM_TTY_DATA_INDICATION, + IRCOMM_TTY_DISCOVERY_REQUEST, + IRCOMM_TTY_DISCOVERY_INDICATION, + IRCOMM_TTY_CONNECT_CONFIRM, + IRCOMM_TTY_CONNECT_INDICATION, + IRCOMM_TTY_DISCONNECT_REQUEST, + IRCOMM_TTY_DISCONNECT_INDICATION, + IRCOMM_TTY_WD_TIMER_EXPIRED, + IRCOMM_TTY_GOT_PARAMETERS, + IRCOMM_TTY_GOT_LSAPSEL, +} IRCOMM_TTY_EVENT; + +/* Used for passing information through the state-machine */ +struct ircomm_tty_info { + __u32 saddr; /* Source device address */ + __u32 daddr; /* Destination device address */ + __u8 dlsap_sel; +}; + +extern const char *const ircomm_state[]; +extern const char *const ircomm_tty_state[]; + +int ircomm_tty_do_event(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event, + struct sk_buff *skb, struct ircomm_tty_info *info); + + +int ircomm_tty_attach_cable(struct ircomm_tty_cb *self); +void ircomm_tty_detach_cable(struct ircomm_tty_cb *self); +void ircomm_tty_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +void ircomm_tty_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb); +void ircomm_tty_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +int ircomm_tty_send_initial_parameters(struct ircomm_tty_cb *self); +void ircomm_tty_link_established(struct ircomm_tty_cb *self); + +#endif /* IRCOMM_TTY_ATTACH_H */ diff --git a/drivers/staging/irda/include/net/irda/irda.h b/drivers/staging/irda/include/net/irda/irda.h new file mode 100644 index 000000000000..92c8fb575213 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irda.h @@ -0,0 +1,115 @@ +/********************************************************************* + * + * Filename: irda.h + * Version: 1.0 + * Description: IrDA common include file for kernel internal use + * Status: Stable + * Author: Dag Brattli + * Created at: Tue Dec 9 21:13:12 1997 + * Modified at: Fri Jan 28 13:16:32 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. 
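ircomm_tty.h above relies on the IRCOMM_TTY_HDR_UNINITIALISED sentinel to tell whether the underlying connection has delivered its real header size yet (5 for TTP, 4 for LMP). Made explicit, the test looks like this; a sketch only, the helper does not exist in this patch:

    static inline int example_link_ready(const struct ircomm_tty_cb *self)
    {
            /* 16 until connect confirm/indication fills in 4 or 5 */
            return self->max_header_size != IRCOMM_TTY_HDR_UNINITIALISED;
    }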
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#ifndef NET_IRDA_H
+#define NET_IRDA_H
+
+#include  /* struct sk_buff */
+#include 
+#include  /* sa_family_t in */
+#include 
+
+typedef __u32 magic_t;
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/* Hack to do small backoff when setting media busy in IrLAP */
+#ifndef SMALL
+#define SMALL 5
+#endif
+
+#ifndef IRDA_MIN /* Let's not mix this MIN with other header files */
+#define IRDA_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#ifndef IRDA_ALIGN
+# define IRDA_ALIGN __attribute__((aligned))
+#endif
+
+#ifdef CONFIG_IRDA_DEBUG
+#define IRDA_ASSERT(expr, func) \
+do { if (!(expr)) { \
+    printk("Assertion failed! %s:%s:%d %s\n", \
+           __FILE__, __func__, __LINE__, (#expr)); \
+    func } } while (0)
+#define IRDA_ASSERT_LABEL(label) label
+#else
+#define IRDA_ASSERT(expr, func) do { (void)(expr); } while (0)
+#define IRDA_ASSERT_LABEL(label)
+#endif /* CONFIG_IRDA_DEBUG */
+
+/*
+ * Magic numbers used by Linux-IrDA. Random numbers which must be unique to
+ * give the best protection
+ */
+
+#define IRTTY_MAGIC 0x2357
+#define LAP_MAGIC 0x1357
+#define LMP_MAGIC 0x4321
+#define LMP_LSAP_MAGIC 0x69333
+#define LMP_LAP_MAGIC 0x3432
+#define IRDA_DEVICE_MAGIC 0x63454
+#define IAS_MAGIC 0x007
+#define TTP_MAGIC 0x241169
+#define TTP_TSAP_MAGIC 0x4345
+#define IROBEX_MAGIC 0x341324
+#define HB_MAGIC 0x64534
+#define IRLAN_MAGIC 0x754
+#define IAS_OBJECT_MAGIC 0x34234
+#define IAS_ATTRIB_MAGIC 0x45232
+#define IRDA_TASK_MAGIC 0x38423
+
+#define IAS_DEVICE_ID 0x0000 /* Defined by IrDA, IrLMP section 4.1 (page 68) */
+#define IAS_PNP_ID 0xd342
+#define IAS_OBEX_ID 0x34323
+#define IAS_IRLAN_ID 0x34234
+#define IAS_IRCOMM_ID 0x2343
+#define IAS_IRLPT_ID 0x9876
+
+struct net_device;
+struct packet_type;
+
+void irda_proc_register(void);
+void irda_proc_unregister(void);
+
+int irda_sysctl_register(void);
+void irda_sysctl_unregister(void);
+
+int irsock_init(void);
+void irsock_cleanup(void);
+
+int irda_nl_register(void);
+void irda_nl_unregister(void);
+
+int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev,
+                     struct packet_type *ptype, struct net_device *orig_dev);
+
+#endif /* NET_IRDA_H */
diff --git a/drivers/staging/irda/include/net/irda/irda_device.h b/drivers/staging/irda/include/net/irda/irda_device.h
new file mode 100644
index 000000000000..664bf8178412
--- /dev/null
+++ b/drivers/staging/irda/include/net/irda/irda_device.h
@@ -0,0 +1,285 @@
+/*********************************************************************
+ *
+ * Filename: irda_device.h
+ * Version: 0.9
+ * Description: Contains various declarations used by the drivers
+ * Status: Experimental.
+ * Author: Dag Brattli
+ * Created at: Tue Apr 14 12:41:42 1998
+ * Modified at: Mon Mar 20 09:08:57 2000
+ * Modified by: Dag Brattli
+ *
+ * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved.
+ * Copyright (c) 1998 Thomas Davis, ,
+ * Copyright (c) 2000-2002 Jean Tourrilhes
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
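Note that IRDA_ASSERT's second argument is a statement, not just an expression, which is what lets a call site bail out of the enclosing function on failure. A typical usage sketch (function name and error codes are illustrative only):

    static int example_xmit(struct irlap_cb *self)
    {
            IRDA_ASSERT(self != NULL, return -ENODEV;);
            IRDA_ASSERT(self->magic == LAP_MAGIC, return -EINVAL;);
            /* ... real work ... */
            return 0;
    }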
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +/* + * This header contains all the IrDA definitions a driver really + * needs, and therefore the driver should not need to include + * any other IrDA headers - Jean II + */ + +#ifndef IRDA_DEVICE_H +#define IRDA_DEVICE_H + +#include +#include +#include +#include /* struct sk_buff */ +#include +#include + +#include +#include +#include /* struct qos_info */ +#include /* irda_queue_t */ + +/* A few forward declarations (to make compiler happy) */ +struct irlap_cb; + +/* Some non-standard interface flags (should not conflict with any in if.h) */ +#define IFF_SIR 0x0001 /* Supports SIR speeds */ +#define IFF_MIR 0x0002 /* Supports MIR speeds */ +#define IFF_FIR 0x0004 /* Supports FIR speeds */ +#define IFF_VFIR 0x0008 /* Supports VFIR speeds */ +#define IFF_PIO 0x0010 /* Supports PIO transfer of data */ +#define IFF_DMA 0x0020 /* Supports DMA transfer of data */ +#define IFF_SHM 0x0040 /* Supports shared memory data transfers */ +#define IFF_DONGLE 0x0080 /* Interface has a dongle attached */ +#define IFF_AIR 0x0100 /* Supports Advanced IR (AIR) standards */ + +#define IO_XMIT 0x01 +#define IO_RECV 0x02 + +typedef enum { + IRDA_IRLAP, /* IrDA mode, and deliver to IrLAP */ + IRDA_RAW, /* IrDA mode */ + SHARP_ASK, + TV_REMOTE, /* Also known as Consumer Electronics IR */ +} INFRARED_MODE; + +typedef enum { + IRDA_TASK_INIT, /* All tasks are initialized with this state */ + IRDA_TASK_DONE, /* Signals that the task is finished */ + IRDA_TASK_WAIT, + IRDA_TASK_WAIT1, + IRDA_TASK_WAIT2, + IRDA_TASK_WAIT3, + IRDA_TASK_CHILD_INIT, /* Initializing child task */ + IRDA_TASK_CHILD_WAIT, /* Waiting for child task to finish */ + IRDA_TASK_CHILD_DONE /* Child task is finished */ +} IRDA_TASK_STATE; + +struct irda_task; +typedef int (*IRDA_TASK_CALLBACK) (struct irda_task *task); + +struct irda_task { + irda_queue_t q; + magic_t magic; + + IRDA_TASK_STATE state; + IRDA_TASK_CALLBACK function; + IRDA_TASK_CALLBACK finished; + + struct irda_task *parent; + struct timer_list timer; + + void *instance; /* Instance being called */ + void *param; /* Parameter to be used by instance */ +}; + +/* Dongle info */ +struct dongle_reg; +typedef struct { + struct dongle_reg *issue; /* Registration info */ + struct net_device *dev; /* Device we are attached to */ + struct irda_task *speed_task; /* Task handling speed change */ + struct irda_task *reset_task; /* Task handling reset */ + __u32 speed; /* Current speed */ + + /* Callbacks to the IrDA device driver */ + int (*set_mode)(struct net_device *, int mode); + int (*read)(struct net_device *dev, __u8 *buf, int len); + int (*write)(struct net_device *dev, __u8 *buf, int len); + int (*set_dtr_rts)(struct net_device *dev, int dtr, int rts); +} dongle_t; + +/* Dongle registration info */ +struct dongle_reg { + irda_queue_t q; /* Must be first */ + IRDA_DONGLE type; + + void (*open)(dongle_t *dongle, struct qos_info *qos); + void (*close)(dongle_t *dongle); + int (*reset)(struct irda_task *task); + int (*change_speed)(struct irda_task *task); + struct module *owner; +}; + +/* + * Per-packet information we need to hide inside sk_buff + 
* (must not exceed 48 bytes, check with struct sk_buff) + * The default_qdisc_pad field is a temporary hack. + */ +struct irda_skb_cb { + unsigned int default_qdisc_pad; + magic_t magic; /* Be sure that we can trust the information */ + __u32 next_speed; /* The Speed to be set *after* this frame */ + __u16 mtt; /* Minimum turn around time */ + __u16 xbofs; /* Number of xbofs required, used by SIR mode */ + __u16 next_xbofs; /* Number of xbofs required *after* this frame */ + void *context; /* May be used by drivers */ + void (*destructor)(struct sk_buff *skb); /* Used for flow control */ + __u16 xbofs_delay; /* Number of xbofs used for generating the mtt */ + __u8 line; /* Used by IrCOMM in IrLPT mode */ +}; + +/* Chip specific info */ +typedef struct { + int cfg_base; /* Config register IO base */ + int sir_base; /* SIR IO base */ + int fir_base; /* FIR IO base */ + int mem_base; /* Shared memory base */ + int sir_ext; /* Length of SIR iobase */ + int fir_ext; /* Length of FIR iobase */ + int irq, irq2; /* Interrupts used */ + int dma, dma2; /* DMA channel(s) used */ + int fifo_size; /* FIFO size */ + int irqflags; /* interrupt flags (ie, IRQF_SHARED) */ + int direction; /* Link direction, used by some FIR drivers */ + int enabled; /* Powered on? */ + int suspended; /* Suspended by APM */ + __u32 speed; /* Currently used speed */ + __u32 new_speed; /* Speed we must change to when Tx is finished */ + int dongle_id; /* Dongle or transceiver currently used */ +} chipio_t; + +/* IO buffer specific info (inspired by struct sk_buff) */ +typedef struct { + int state; /* Receiving state (transmit state not used) */ + int in_frame; /* True if receiving frame */ + + __u8 *head; /* start of buffer */ + __u8 *data; /* start of data in buffer */ + + int len; /* current length of data */ + int truesize; /* total allocated size of buffer */ + __u16 fcs; + + struct sk_buff *skb; /* ZeroCopy Rx in async_unwrap_char() */ +} iobuff_t; + +/* Maximum SIR frame (skb) that we expect to receive *unwrapped*. + * Max LAP MTU (I field) is 2048 bytes max (IrLAP 1.1, chapt 6.6.5, p40). + * Max LAP header is 2 bytes (for now). + * Max CRC is 2 bytes at SIR, 4 bytes at FIR. + * Need 1 byte for skb_reserve() to align IP header for IrLAN. + * Add a few extra bytes just to be safe (buffer is power of two anyway) + * Jean II */ +#define IRDA_SKB_MAX_MTU 2064 +/* Maximum SIR frame that we expect to send, wrapped (i.e. with XBOFS + * and escaped characters on top of above). */ +#define IRDA_SIR_MAX_FRAME 4269 + +/* The SIR unwrapper async_unwrap_char() will use a Rx-copy-break mechanism + * when using the optional ZeroCopy Rx, where only small frames are memcpy + * to a smaller skb to save memory. This is the threshold under which copy + * will happen (and over which it won't happen). + * Some FIR drivers may use this #define as well... + * This is the same value as various Ethernet drivers. - Jean II */ +#define IRDA_RX_COPY_THRESHOLD 256 + +/* Function prototypes */ +int irda_device_init(void); +void irda_device_cleanup(void); + +/* IrLAP entry points used by the drivers. 
+ * We declare them here to avoid the driver pulling a whole bunch of stack
+ * headers they don't really need - Jean II */
+struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos,
+                            const char *hw_name);
+void irlap_close(struct irlap_cb *self);
+
+/* Interface to be used by IrLAP */
+void irda_device_set_media_busy(struct net_device *dev, int status);
+int irda_device_is_media_busy(struct net_device *dev);
+int irda_device_is_receiving(struct net_device *dev);
+
+/* Interface for internal use */
+static inline int irda_device_txqueue_empty(const struct net_device *dev)
+{
+    return qdisc_all_tx_empty(dev);
+}
+int irda_device_set_raw_mode(struct net_device* self, int status);
+struct net_device *alloc_irdadev(int sizeof_priv);
+
+void irda_setup_dma(int channel, dma_addr_t buffer, int count, int mode);
+
+/*
+ * Function irda_get_mtt (skb)
+ *
+ *    Utility function for getting the minimum turnaround time out of
+ *    the skb, where it has been hidden in the cb field.
+ */
+static inline __u16 irda_get_mtt(const struct sk_buff *skb)
+{
+    const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
+    return (cb->magic == LAP_MAGIC) ? cb->mtt : 10000;
+}
+
+/*
+ * Function irda_get_next_speed (skb)
+ *
+ *    Extract the speed that should be set *after* this frame from the skb
+ *
+ * Note : return -1 for user space frames
+ */
+static inline __u32 irda_get_next_speed(const struct sk_buff *skb)
+{
+    const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
+    return (cb->magic == LAP_MAGIC) ? cb->next_speed : -1;
+}
+
+/*
+ * Function irda_get_xbofs (skb)
+ *
+ *    Extract the xbofs that should be set for this frame from the skb
+ *
+ * Note : default to 10 for user space frames
+ */
+static inline __u16 irda_get_xbofs(const struct sk_buff *skb)
+{
+    const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
+    return (cb->magic == LAP_MAGIC) ? cb->xbofs : 10;
+}
+
+/*
+ * Function irda_get_next_xbofs (skb)
+ *
+ *    Extract the xbofs that should be set *after* this frame from the skb
+ *
+ * Note : return -1 for user space frames
+ */
+static inline __u16 irda_get_next_xbofs(const struct sk_buff *skb)
+{
+    const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
+    return (cb->magic == LAP_MAGIC) ? cb->next_xbofs : -1;
+}
+#endif /* IRDA_DEVICE_H */
+
+
diff --git a/drivers/staging/irda/include/net/irda/iriap.h b/drivers/staging/irda/include/net/irda/iriap.h
new file mode 100644
index 000000000000..fcc896491a95
--- /dev/null
+++ b/drivers/staging/irda/include/net/irda/iriap.h
@@ -0,0 +1,108 @@
+/*********************************************************************
+ *
+ * Filename: iriap.h
+ * Version: 0.5
+ * Description: Information Access Protocol (IAP)
+ * Status: Experimental.
+ * Author: Dag Brattli
+ * Created at: Thu Aug 21 00:02:07 1997
+ * Modified at: Sat Dec 25 16:42:09 1999
+ * Modified by: Dag Brattli
+ *
+ * Copyright (c) 1997-1999 Dag Brattli ,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
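These accessors are meant for the driver's transmit path: the state machine hides per-frame metadata in skb->cb, and the driver reads it back just before touching the hardware. A condensed sketch (names are illustrative; the udelay-based wait assumes <linux/delay.h>, and real drivers often use a hardware timer for the turnaround instead):

    static netdev_tx_t example_hard_xmit(struct sk_buff *skb,
                                         struct net_device *dev)
    {
            __u16 mtt = irda_get_mtt(skb);

            if (mtt)
                    udelay(mtt); /* honour the peer's minimum turnaround time */

            /* irda_get_next_speed(skb) would be latched here and applied
             * from the tx-complete interrupt; omitted in this sketch. */
            dev_kfree_skb(skb);
            return NETDEV_TX_OK;
    }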
+ * + ********************************************************************/ + +#ifndef IRIAP_H +#define IRIAP_H + +#include +#include + +#include +#include +#include /* irda_queue_t */ +#include /* struct timer_list */ + +#define IAP_LST 0x80 +#define IAP_ACK 0x40 + +#define IAS_SERVER 0 +#define IAS_CLIENT 1 + +/* IrIAP Op-codes */ +#define GET_INFO_BASE 0x01 +#define GET_OBJECTS 0x02 +#define GET_VALUE 0x03 +#define GET_VALUE_BY_CLASS 0x04 +#define GET_OBJECT_INFO 0x05 +#define GET_ATTRIB_NAMES 0x06 + +#define IAS_SUCCESS 0 +#define IAS_CLASS_UNKNOWN 1 +#define IAS_ATTRIB_UNKNOWN 2 +#define IAS_DISCONNECT 10 + +typedef void (*CONFIRM_CALLBACK)(int result, __u16 obj_id, + struct ias_value *value, void *priv); + +struct iriap_cb { + irda_queue_t q; /* Must be first */ + magic_t magic; /* Magic cookie */ + + int mode; /* Client or server */ + + __u32 saddr; + __u32 daddr; + __u8 operation; + + struct sk_buff *request_skb; + struct lsap_cb *lsap; + __u8 slsap_sel; + + /* Client states */ + IRIAP_STATE client_state; + IRIAP_STATE call_state; + + /* Server states */ + IRIAP_STATE server_state; + IRIAP_STATE r_connect_state; + + CONFIRM_CALLBACK confirm; + void *priv; /* Used to identify client */ + + __u8 max_header_size; + __u32 max_data_size; + + struct timer_list watchdog_timer; +}; + +int iriap_init(void); +void iriap_cleanup(void); + +struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv, + CONFIRM_CALLBACK callback); +void iriap_close(struct iriap_cb *self); + +int iriap_getvaluebyclass_request(struct iriap_cb *self, + __u32 saddr, __u32 daddr, + char *name, char *attr); +void iriap_connect_request(struct iriap_cb *self); +void iriap_send_ack( struct iriap_cb *self); +void iriap_call_indication(struct iriap_cb *self, struct sk_buff *skb); + +void iriap_register_server(void); + +#endif + + diff --git a/drivers/staging/irda/include/net/irda/iriap_event.h b/drivers/staging/irda/include/net/irda/iriap_event.h new file mode 100644 index 000000000000..89747f06d9eb --- /dev/null +++ b/drivers/staging/irda/include/net/irda/iriap_event.h @@ -0,0 +1,85 @@ +/********************************************************************* + * + * Filename: iriap_event.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Sun Oct 31 22:02:54 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
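Taken together, the iriap.h declarations above describe the usual client flow: open an IAP instance with a confirm callback, then issue a GetValueByClass query. A sketch, assuming LSAP_ANY from irlmp.h and using the well-known IrCOMM IAS entry as the example target:

    static void example_confirm(int result, __u16 obj_id,
                                struct ias_value *value, void *priv)
    {
            if (result == IAS_SUCCESS && value->type == IAS_INTEGER)
                    pr_info("peer LSAP = %d\n", value->t.integer);
    }

    static void example_query(__u32 saddr, __u32 daddr)
    {
            struct iriap_cb *iap;

            iap = iriap_open(LSAP_ANY, IAS_CLIENT, NULL, example_confirm);
            if (!iap)
                    return;
            iriap_getvaluebyclass_request(iap, saddr, daddr,
                                          "IrDA:IrCOMM", "IrDA:TTP:LsapSel");
    }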
+ *
+ ********************************************************************/
+
+#ifndef IRIAP_FSM_H
+#define IRIAP_FSM_H
+
+/* Forward because of circular include dependencies */
+struct iriap_cb;
+
+/* IrIAP states */
+typedef enum {
+    /* Client */
+    S_DISCONNECT,
+    S_CONNECTING,
+    S_CALL,
+
+    /* S-Call */
+    S_MAKE_CALL,
+    S_CALLING,
+    S_OUTSTANDING,
+    S_REPLYING,
+    S_WAIT_FOR_CALL,
+    S_WAIT_ACTIVE,
+
+    /* Server */
+    R_DISCONNECT,
+    R_CALL,
+
+    /* R-Connect */
+    R_WAITING,
+    R_WAIT_ACTIVE,
+    R_RECEIVING,
+    R_EXECUTE,
+    R_RETURNING,
+} IRIAP_STATE;
+
+typedef enum {
+    IAP_CALL_REQUEST,
+    IAP_CALL_REQUEST_GVBC,
+    IAP_CALL_RESPONSE,
+    IAP_RECV_F_LST,
+    IAP_LM_DISCONNECT_INDICATION,
+    IAP_LM_CONNECT_INDICATION,
+    IAP_LM_CONNECT_CONFIRM,
+} IRIAP_EVENT;
+
+void iriap_next_client_state (struct iriap_cb *self, IRIAP_STATE state);
+void iriap_next_call_state   (struct iriap_cb *self, IRIAP_STATE state);
+void iriap_next_server_state (struct iriap_cb *self, IRIAP_STATE state);
+void iriap_next_r_connect_state(struct iriap_cb *self, IRIAP_STATE state);
+
+
+void iriap_do_client_event(struct iriap_cb *self, IRIAP_EVENT event,
+                           struct sk_buff *skb);
+void iriap_do_call_event  (struct iriap_cb *self, IRIAP_EVENT event,
+                           struct sk_buff *skb);
+
+void iriap_do_server_event   (struct iriap_cb *self, IRIAP_EVENT event,
+                              struct sk_buff *skb);
+void iriap_do_r_connect_event(struct iriap_cb *self, IRIAP_EVENT event,
+                              struct sk_buff *skb);
+
+#endif /* IRIAP_FSM_H */
+
diff --git a/drivers/staging/irda/include/net/irda/irias_object.h b/drivers/staging/irda/include/net/irda/irias_object.h
new file mode 100644
index 000000000000..83f78081799c
--- /dev/null
+++ b/drivers/staging/irda/include/net/irda/irias_object.h
@@ -0,0 +1,108 @@
+/*********************************************************************
+ *
+ * Filename: irias_object.h
+ * Version:
+ * Description:
+ * Status: Experimental.
+ * Author: Dag Brattli
+ * Created at: Thu Oct 1 22:49:50 1998
+ * Modified at: Wed Dec 15 11:20:57 1999
+ * Modified by: Dag Brattli
+ *
+ * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#ifndef LM_IAS_OBJECT_H
+#define LM_IAS_OBJECT_H
+
+#include 
+#include 
+
+/* LM-IAS Attribute types */
+#define IAS_MISSING 0
+#define IAS_INTEGER 1
+#define IAS_OCT_SEQ 2
+#define IAS_STRING 3
+
+/* Object ownership of attributes (user or kernel) */
+#define IAS_KERNEL_ATTR 0
+#define IAS_USER_ATTR 1
+
+/*
+ * LM-IAS Object
+ */
+struct ias_object {
+    irda_queue_t q; /* Must be first! */
+    magic_t magic;
+
+    char *name;
+    int id;
+    hashbin_t *attribs;
+};
+
+/*
+ * Values used by LM-IAS attributes
+ */
+struct ias_value {
+    __u8 type; /* Value description */
+    __u8 owner; /* Managed from user/kernel space */
+    int charset; /* Only used by string type */
+    int len;
+
+    /* Value */
+    union {
+        int integer;
+        char *string;
+        __u8 *oct_seq;
+    } t;
+};
+
+/*
+ * Attributes used by LM-IAS objects
+ */
+struct ias_attrib {
+    irda_queue_t q; /* Must be first!
*/ + int magic; + + char *name; /* Attribute name */ + struct ias_value *value; /* Attribute value */ +}; + +struct ias_object *irias_new_object(char *name, int id); +void irias_insert_object(struct ias_object *obj); +int irias_delete_object(struct ias_object *obj); +int irias_delete_attrib(struct ias_object *obj, struct ias_attrib *attrib, + int cleanobject); +void __irias_delete_object(struct ias_object *obj); + +void irias_add_integer_attrib(struct ias_object *obj, char *name, int value, + int user); +void irias_add_string_attrib(struct ias_object *obj, char *name, char *value, + int user); +void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets, + int len, int user); +int irias_object_change_attribute(char *obj_name, char *attrib_name, + struct ias_value *new_value); +struct ias_object *irias_find_object(char *name); +struct ias_attrib *irias_find_attrib(struct ias_object *obj, char *name); + +struct ias_value *irias_new_string_value(char *string); +struct ias_value *irias_new_integer_value(int integer); +struct ias_value *irias_new_octseq_value(__u8 *octseq , int len); +struct ias_value *irias_new_missing_value(void); +void irias_delete_value(struct ias_value *value); + +extern struct ias_value irias_missing; +extern hashbin_t *irias_objects; + +#endif diff --git a/drivers/staging/irda/include/net/irda/irlan_client.h b/drivers/staging/irda/include/net/irda/irlan_client.h new file mode 100644 index 000000000000..fa8455eda280 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlan_client.h @@ -0,0 +1,42 @@ +/********************************************************************* + * + * Filename: irlan_client.h + * Version: 0.3 + * Description: IrDA LAN access layer + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Thu Apr 22 14:13:34 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998 Dag Brattli , All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef IRLAN_CLIENT_H +#define IRLAN_CLIENT_H + +#include +#include +#include +#include + +#include +#include + +void irlan_client_discovery_indication(discinfo_t *, DISCOVERY_MODE, void *); +void irlan_client_wakeup(struct irlan_cb *self, __u32 saddr, __u32 daddr); + +void irlan_client_parse_response(struct irlan_cb *self, struct sk_buff *skb); +void irlan_client_get_value_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv); +#endif diff --git a/drivers/staging/irda/include/net/irda/irlan_common.h b/drivers/staging/irda/include/net/irda/irlan_common.h new file mode 100644 index 000000000000..550c2d6ec7ff --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlan_common.h @@ -0,0 +1,230 @@ +/********************************************************************* + * + * Filename: irlan_common.h + * Version: 0.8 + * Description: IrDA LAN access layer + * Status: Experimental. 
+ * Author: Dag Brattli
+ * Created at: Sun Aug 31 20:14:37 1997
+ * Modified at: Sun Oct 31 19:41:24 1999
+ * Modified by: Dag Brattli
+ *
+ * Copyright (c) 1998-1999 Dag Brattli ,
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * Neither Dag Brattli nor University of Tromsø admit liability nor
+ * provide warranty for any of this software. This material is
+ * provided "AS-IS" and at no charge.
+ *
+ ********************************************************************/
+
+#ifndef IRLAN_H
+#define IRLAN_H
+
+#include  /* for HZ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define IRLAN_MTU 1518
+#define IRLAN_TIMEOUT 10*HZ /* 10 seconds */
+
+/* Command packet types */
+#define CMD_GET_PROVIDER_INFO 0
+#define CMD_GET_MEDIA_CHAR 1
+#define CMD_OPEN_DATA_CHANNEL 2
+#define CMD_CLOSE_DATA_CHAN 3
+#define CMD_RECONNECT_DATA_CHAN 4
+#define CMD_FILTER_OPERATION 5
+
+/* Some responses */
+#define RSP_SUCCESS 0
+#define RSP_INSUFFICIENT_RESOURCES 1
+#define RSP_INVALID_COMMAND_FORMAT 2
+#define RSP_COMMAND_NOT_SUPPORTED 3
+#define RSP_PARAM_NOT_SUPPORTED 4
+#define RSP_VALUE_NOT_SUPPORTED 5
+#define RSP_NOT_OPEN 6
+#define RSP_AUTHENTICATION_REQUIRED 7
+#define RSP_INVALID_PASSWORD 8
+#define RSP_PROTOCOL_ERROR 9
+#define RSP_ASYNCHRONOUS_ERROR 255
+
+/* Media types */
+#define MEDIA_802_3 1
+#define MEDIA_802_5 2
+
+/* Filter parameters */
+#define DATA_CHAN 1
+#define FILTER_TYPE 2
+#define FILTER_MODE 3
+
+/* Filter types */
+#define IRLAN_DIRECTED 0x01
+#define IRLAN_FUNCTIONAL 0x02
+#define IRLAN_GROUP 0x04
+#define IRLAN_MAC_FRAME 0x08
+#define IRLAN_MULTICAST 0x10
+#define IRLAN_BROADCAST 0x20
+#define IRLAN_IPX_SOCKET 0x40
+
+/* Filter modes */
+#define ALL 1
+#define FILTER 2
+#define NONE 3
+
+/* Filter operations */
+#define GET 1
+#define CLEAR 2
+#define ADD 3
+#define REMOVE 4
+#define DYNAMIC 5
+
+/* Access types */
+#define ACCESS_DIRECT 1
+#define ACCESS_PEER 2
+#define ACCESS_HOSTED 3
+
+#define IRLAN_BYTE 0
+#define IRLAN_SHORT 1
+#define IRLAN_ARRAY 2
+
+/* IrLAN sits on top of IrTTP */
+#define IRLAN_MAX_HEADER (TTP_HEADER+LMP_HEADER)
+/* 1 byte for the command code and 1 byte for the parameter count */
+#define IRLAN_CMD_HEADER 2
+
+#define IRLAN_STRING_PARAMETER_LEN(name, value) (1 + strlen((name)) + 2 \
+                                                 + strlen ((value)))
+#define IRLAN_BYTE_PARAMETER_LEN(name) (1 + strlen((name)) + 2 + 1)
+#define IRLAN_SHORT_PARAMETER_LEN(name) (1 + strlen((name)) + 2 + 2)
+
+/*
+ * IrLAN client
+ */
+struct irlan_client_cb {
+    int state;
+
+    int open_retries;
+
+    struct tsap_cb *tsap_ctrl;
+    __u32 max_sdu_size;
+    __u8 max_header_size;
+
+    int access_type; /* Access type of provider */
+    __u8 reconnect_key[255];
+    __u8 key_len;
+
+    __u16 recv_arb_val;
+    __u16 max_frame;
+    int filter_type;
+
+    int unicast_open;
+    int broadcast_open;
+
+    int tx_busy;
+    struct sk_buff_head txq; /* Transmit control queue */
+
+    struct iriap_cb *iriap;
+
+    struct timer_list kick_timer;
+};
+
+/*
+ * IrLAN provider
+ */
+struct irlan_provider_cb {
+    int state;
+
+    struct tsap_cb *tsap_ctrl;
+    __u32 max_sdu_size;
+    __u8 max_header_size;
+
+    /*
+     * Store some values here which are used by the provider to parse
+     * the filter operations
+     */
+    int data_chan;
+    int filter_type;
+    int filter_mode;
+    int filter_operation;
+    int filter_entry;
+    int
access_type; /* Access type */ + __u16 send_arb_val; + + __u8 mac_address[ETH_ALEN]; /* Generated MAC address for peer device */ +}; + +/* + * IrLAN control block + */ +struct irlan_cb { + int magic; + struct list_head dev_list; + struct net_device *dev; /* Ethernet device structure*/ + + __u32 saddr; /* Source device address */ + __u32 daddr; /* Destination device address */ + int disconnect_reason; /* Why we got disconnected */ + + int media; /* Media type */ + __u8 version[2]; /* IrLAN version */ + + struct tsap_cb *tsap_data; /* Data TSAP */ + + int use_udata; /* Use Unit Data transfers */ + + __u8 stsap_sel_data; /* Source data TSAP selector */ + __u8 dtsap_sel_data; /* Destination data TSAP selector */ + __u8 dtsap_sel_ctrl; /* Destination ctrl TSAP selector */ + + struct irlan_client_cb client; /* Client specific fields */ + struct irlan_provider_cb provider; /* Provider specific fields */ + + __u32 max_sdu_size; + __u8 max_header_size; + + wait_queue_head_t open_wait; + struct timer_list watchdog_timer; +}; + +void irlan_close(struct irlan_cb *self); +void irlan_close_tsaps(struct irlan_cb *self); + +int irlan_register_netdev(struct irlan_cb *self); +void irlan_ias_register(struct irlan_cb *self, __u8 tsap_sel); +void irlan_start_watchdog_timer(struct irlan_cb *self, int timeout); + +void irlan_open_data_tsap(struct irlan_cb *self); + +int irlan_run_ctrl_tx_queue(struct irlan_cb *self); + +struct irlan_cb *irlan_get_any(void); +void irlan_get_provider_info(struct irlan_cb *self); +void irlan_get_media_char(struct irlan_cb *self); +void irlan_open_data_channel(struct irlan_cb *self); +void irlan_close_data_channel(struct irlan_cb *self); +void irlan_set_multicast_filter(struct irlan_cb *self, int status); +void irlan_set_broadcast_filter(struct irlan_cb *self, int status); + +int irlan_insert_byte_param(struct sk_buff *skb, char *param, __u8 value); +int irlan_insert_short_param(struct sk_buff *skb, char *param, __u16 value); +int irlan_insert_string_param(struct sk_buff *skb, char *param, char *value); +int irlan_insert_array_param(struct sk_buff *skb, char *name, __u8 *value, + __u16 value_len); + +int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len); + +#endif + + diff --git a/drivers/staging/irda/include/net/irda/irlan_eth.h b/drivers/staging/irda/include/net/irda/irlan_eth.h new file mode 100644 index 000000000000..de5c81691f33 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlan_eth.h @@ -0,0 +1,32 @@ +/********************************************************************* + * + * Filename: irlan_eth.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Oct 15 08:36:58 1998 + * Modified at: Fri May 14 23:29:00 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
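On the server side, the irias_object.h constructors above are what publish an object into the local IAS database; a minimal registration sketch (the object and attribute names follow the IrCOMM convention, the helper itself is hypothetical):

    static void example_register_ias(__u8 lsap_sel)
    {
            struct ias_object *obj;

            obj = irias_new_object("IrDA:IrCOMM", IAS_IRCOMM_ID);
            if (!obj)
                    return;
            irias_add_integer_attrib(obj, "IrDA:TTP:LsapSel", lsap_sel,
                                     IAS_KERNEL_ATTR);
            irias_insert_object(obj);
    }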
+ * + ********************************************************************/ + +#ifndef IRLAN_ETH_H +#define IRLAN_ETH_H + +struct net_device *alloc_irlandev(const char *name); +int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb); + +void irlan_eth_flow_indication( void *instance, void *sap, LOCAL_FLOW flow); +#endif diff --git a/drivers/staging/irda/include/net/irda/irlan_event.h b/drivers/staging/irda/include/net/irda/irlan_event.h new file mode 100644 index 000000000000..018b5a77e610 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlan_event.h @@ -0,0 +1,81 @@ +/********************************************************************* + * + * Filename: irlan_event.h + * Version: + * Description: LAN access + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Tue Feb 2 09:45:17 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997 Dag Brattli , All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef IRLAN_EVENT_H +#define IRLAN_EVENT_H + +#include +#include + +#include + +typedef enum { + IRLAN_IDLE, + IRLAN_QUERY, + IRLAN_CONN, + IRLAN_INFO, + IRLAN_MEDIA, + IRLAN_OPEN, + IRLAN_WAIT, + IRLAN_ARB, + IRLAN_DATA, + IRLAN_CLOSE, + IRLAN_SYNC +} IRLAN_STATE; + +typedef enum { + IRLAN_DISCOVERY_INDICATION, + IRLAN_IAS_PROVIDER_AVAIL, + IRLAN_IAS_PROVIDER_NOT_AVAIL, + IRLAN_LAP_DISCONNECT, + IRLAN_LMP_DISCONNECT, + IRLAN_CONNECT_COMPLETE, + IRLAN_DATA_INDICATION, + IRLAN_DATA_CONNECT_INDICATION, + IRLAN_RETRY_CONNECT, + + IRLAN_CONNECT_INDICATION, + IRLAN_GET_INFO_CMD, + IRLAN_GET_MEDIA_CMD, + IRLAN_OPEN_DATA_CMD, + IRLAN_FILTER_CONFIG_CMD, + + IRLAN_CHECK_CON_ARB, + IRLAN_PROVIDER_SIGNAL, + + IRLAN_WATCHDOG_TIMEOUT, +} IRLAN_EVENT; + +extern const char * const irlan_state[]; + +void irlan_do_client_event(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); + +void irlan_do_provider_event(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); + +void irlan_next_client_state(struct irlan_cb *self, IRLAN_STATE state); +void irlan_next_provider_state(struct irlan_cb *self, IRLAN_STATE state); + +#endif diff --git a/drivers/staging/irda/include/net/irda/irlan_filter.h b/drivers/staging/irda/include/net/irda/irlan_filter.h new file mode 100644 index 000000000000..a5a2539485bd --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlan_filter.h @@ -0,0 +1,35 @@ +/********************************************************************* + * + * Filename: irlan_filter.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Fri Jan 29 15:24:08 1999 + * Modified at: Sun Feb 7 23:35:31 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. 
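The IRLAN_CMD_HEADER layout and the irlan_insert_*_param() helpers declared in irlan_common.h above combine as follows when building a control frame; a sketch only, with the buffer size, parameter names and values chosen for illustration:

    static struct sk_buff *example_filter_cmd(__u8 chan)
    {
            struct sk_buff *skb = alloc_skb(IRLAN_MAX_HEADER + 64, GFP_ATOMIC);
            __u8 *frame;

            if (!skb)
                    return NULL;
            skb_reserve(skb, IRLAN_MAX_HEADER);
            frame = skb_put(skb, IRLAN_CMD_HEADER);
            frame[0] = CMD_FILTER_OPERATION;
            frame[1] = 2; /* two parameters follow */
            irlan_insert_byte_param(skb, "DATA_CHAN", chan);
            irlan_insert_string_param(skb, "FILTER_MODE", "FILTER");
            return skb;
    }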
+ * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef IRLAN_FILTER_H +#define IRLAN_FILTER_H + +void irlan_check_command_param(struct irlan_cb *self, char *param, + char *value); +void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb); +#ifdef CONFIG_PROC_FS +void irlan_print_filter(struct seq_file *seq, int filter_type); +#endif + +#endif /* IRLAN_FILTER_H */ diff --git a/drivers/staging/irda/include/net/irda/irlan_provider.h b/drivers/staging/irda/include/net/irda/irlan_provider.h new file mode 100644 index 000000000000..92f3b0e1029b --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlan_provider.h @@ -0,0 +1,52 @@ +/********************************************************************* + * + * Filename: irlan_provider.h + * Version: 0.1 + * Description: IrDA LAN access layer + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sun May 9 12:26:11 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef IRLAN_SERVER_H +#define IRLAN_SERVER_H + +#include +#include +#include +#include + +#include + +void irlan_provider_ctrl_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb); + + +void irlan_provider_connect_response(struct irlan_cb *, struct tsap_cb *); + +int irlan_parse_open_data_cmd(struct irlan_cb *self, struct sk_buff *skb); +int irlan_provider_parse_command(struct irlan_cb *self, int cmd, + struct sk_buff *skb); + +void irlan_provider_send_reply(struct irlan_cb *self, int command, + int ret_code); +int irlan_provider_open_ctrl_tsap(struct irlan_cb *self); + +#endif + + diff --git a/drivers/staging/irda/include/net/irda/irlap.h b/drivers/staging/irda/include/net/irda/irlap.h new file mode 100644 index 000000000000..6f23e820618c --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlap.h @@ -0,0 +1,311 @@ +/********************************************************************* + * + * Filename: irlap.h + * Version: 0.8 + * Description: An IrDA LAP driver for Linux + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Fri Dec 10 13:21:17 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
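Judging only from the prototypes in irlan_provider.h above, the intended provider flow is parse-then-reply: the parse step yields one of the RSP_* codes from irlan_common.h, which is echoed back to the peer. A plausible wiring, not taken from the .c files:

    static void example_handle_command(struct irlan_cb *self, int cmd,
                                       struct sk_buff *skb)
    {
            int ret = irlan_provider_parse_command(self, cmd, skb);

            irlan_provider_send_reply(self, cmd, ret);
    }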
+ *
+ ********************************************************************/
+
+#ifndef IRLAP_H
+#define IRLAP_H
+
+#include 
+#include 
+#include 
+#include 
+
+#include  /* irda_queue_t */
+#include  /* struct qos_info */
+#include  /* discovery_t */
+#include  /* IRLAP_STATE, ... */
+#include  /* struct notify_t */
+
+#define CONFIG_IRDA_DYNAMIC_WINDOW 1
+
+#define LAP_RELIABLE 1
+#define LAP_UNRELIABLE 0
+
+#define LAP_ADDR_HEADER 1 /* IrLAP Address Header */
+#define LAP_CTRL_HEADER 1 /* IrLAP Control Header */
+
+/* May be different when we get VFIR */
+#define LAP_MAX_HEADER (LAP_ADDR_HEADER + LAP_CTRL_HEADER)
+
+/* Each IrDA device gets a random 32 bits IRLAP device address */
+#define LAP_ALEN 4
+
+#define BROADCAST 0xffffffff /* Broadcast device address */
+#define CBROADCAST 0xfe /* Connection broadcast address */
+#define XID_FORMAT 0x01 /* Discovery XID format */
+
+/* Nobody seems to use this constant. */
+#define LAP_WINDOW_SIZE 8
+/* We keep the LAP queue very small to minimise the amount of buffering.
+ * This improves latency and reduces resource consumption.
+ * This works only because we have synchronous refilling of IrLAP through
+ * the flow control mechanism (via scheduler and IrTTP).
+ * 2 buffers is the minimum we can work with, one that we send while polling
+ * IrTTP, and another to know that we should not send the pf bit.
+ * Jean II */
+#define LAP_HIGH_THRESHOLD 2
+/* Some rare non-TTP clients don't implement flow control, and
+ * so don't comply with the above limit (and neither with this one).
+ * For IAP and management, it doesn't matter, because they never transmit much.
+ * For IrLPT, this should be fixed.
+ * - Jean II */
+#define LAP_MAX_QUEUE 10
+/* Please note that all IrDA management frames (LMP/TTP conn req/disc and
+ * IAS queries) fall in the second category and are sent to LAP even if TTP
+ * is stopped. This means that those frames will wait only a maximum of
+ * two (2) data frames before being sent on the "wire", which speeds up
+ * new socket setup when the link is saturated.
+ * Same story for two sockets competing for the medium: if one saturates
+ * the LAP, when the other wants to transmit it only has to wait for a
+ * maximum of three (3) packets (2 + one scheduling), which improves the
+ * performance of delay-sensitive applications.
+ * Jean II */
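The two thresholds above imply a very small refill rule, sketched here with a hypothetical helper: IrTTP is asked for more data only while the queue is under LAP_HIGH_THRESHOLD, while LAP_MAX_QUEUE merely caps the rare non-flow-controlled clients:

    static inline int example_lap_wants_more(struct irlap_cb *self)
    {
            /* below 2 queued frames: poll IrTTP for the next one */
            return skb_queue_len(&self->txq) < LAP_HIGH_THRESHOLD;
    }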
+
+#define NR_EXPECTED 1
+#define NR_UNEXPECTED 0
+#define NR_INVALID -1
+
+#define NS_EXPECTED 1
+#define NS_UNEXPECTED 0
+#define NS_INVALID -1
+
+/*
+ * Meta information passed within the IrLAP state machine
+ */
+struct irlap_info {
+    __u8 caddr; /* Connection address */
+    __u8 control; /* Frame type */
+    __u8 cmd;
+
+    __u32 saddr;
+    __u32 daddr;
+
+    int pf; /* Poll/final bit set */
+
+    __u8 nr; /* Sequence number of next frame expected */
+    __u8 ns; /* Sequence number of frame sent */
+
+    int S; /* Number of slots */
+    int slot; /* Random chosen slot */
+    int s; /* Current slot */
+
+    discovery_t *discovery; /* Discovery information */
+};
+
+/* Main structure of IrLAP */
+struct irlap_cb {
+    irda_queue_t q; /* Must be first */
+    magic_t magic;
+
+    /* Device we are attached to */
+    struct net_device *netdev;
+    char hw_name[2*IFNAMSIZ + 1];
+
+    /* Connection state */
+    volatile IRLAP_STATE state; /* Current state */
+
+    /* Timers used by IrLAP */
+    struct timer_list query_timer;
+    struct timer_list slot_timer;
+    struct timer_list discovery_timer;
+    struct timer_list final_timer;
+    struct timer_list poll_timer;
+    struct timer_list wd_timer;
+    struct timer_list backoff_timer;
+
+    /* Media busy stuff */
+    struct timer_list media_busy_timer;
+    int media_busy;
+
+    /* Timeouts which will be different with different turn time */
+    int slot_timeout;
+    int poll_timeout;
+    int final_timeout;
+    int wd_timeout;
+
+    struct sk_buff_head txq; /* Frames to be transmitted */
+    struct sk_buff_head txq_ultra;
+
+    __u8 caddr; /* Connection address */
+    __u32 saddr; /* Source device address */
+    __u32 daddr; /* Destination device address */
+
+    int retry_count; /* Times tried to establish connection */
+    int add_wait; /* True if we are waiting for a frame */
+
+    __u8 connect_pending;
+    __u8 disconnect_pending;
+
+    /* To send a faster RR if tx queue empty */
+#ifdef CONFIG_IRDA_FAST_RR
+    int fast_RR_timeout;
+    int fast_RR;
+#endif /* CONFIG_IRDA_FAST_RR */
+
+    int N1; /* N1 * F-timer = Negotiated link disconnect warning threshold */
+    int N2; /* N2 * F-timer = Negotiated link disconnect time */
+    int N3; /* Connection retry count */
+
+    int local_busy;
+    int remote_busy;
+    int xmitflag;
+
+    __u8 vs; /* Next frame to be sent */
+    __u8 vr; /* Next frame to be received */
+    __u8 va; /* Last frame acked */
+    int window; /* Nr of I-frames allowed to send */
+    int window_size; /* Current negotiated window size */
+
+#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
+    __u32 line_capacity; /* Number of bytes allowed to send */
+    __u32 bytes_left; /* Number of bytes still allowed to transmit */
+#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
+
+    struct sk_buff_head wx_list;
+
+    __u8 ack_required;
+
+    /* XID parameters */
+    __u8 S; /* Number of slots */
+    __u8 slot; /* Random chosen slot */
+    __u8 s; /* Current slot */
+    int frame_sent; /* Have we sent reply?
*/ + + hashbin_t *discovery_log; + discovery_t *discovery_cmd; + + __u32 speed; /* Link speed */ + + struct qos_info qos_tx; /* QoS requested by peer */ + struct qos_info qos_rx; /* QoS requested by self */ + struct qos_info *qos_dev; /* QoS supported by device */ + + notify_t notify; /* Callbacks to IrLMP */ + + int mtt_required; /* Minimum turnaround time required */ + int xbofs_delay; /* Nr of XBOF's used to MTT */ + int bofs_count; /* Negotiated extra BOFs */ + int next_bofs; /* Negotiated extra BOFs after next frame */ + + int mode; /* IrLAP mode (primary, secondary or monitor) */ +}; + +/* + * Function prototypes + */ +int irlap_init(void); +void irlap_cleanup(void); + +struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos, + const char *hw_name); +void irlap_close(struct irlap_cb *self); + +void irlap_connect_request(struct irlap_cb *self, __u32 daddr, + struct qos_info *qos, int sniff); +void irlap_connect_response(struct irlap_cb *self, struct sk_buff *skb); +void irlap_connect_indication(struct irlap_cb *self, struct sk_buff *skb); +void irlap_connect_confirm(struct irlap_cb *, struct sk_buff *skb); + +void irlap_data_indication(struct irlap_cb *, struct sk_buff *, int unreliable); +void irlap_data_request(struct irlap_cb *, struct sk_buff *, int unreliable); + +#ifdef CONFIG_IRDA_ULTRA +void irlap_unitdata_request(struct irlap_cb *, struct sk_buff *); +void irlap_unitdata_indication(struct irlap_cb *, struct sk_buff *); +#endif /* CONFIG_IRDA_ULTRA */ + +void irlap_disconnect_request(struct irlap_cb *); +void irlap_disconnect_indication(struct irlap_cb *, LAP_REASON reason); + +void irlap_status_indication(struct irlap_cb *, int quality_of_link); + +void irlap_test_request(__u8 *info, int len); + +void irlap_discovery_request(struct irlap_cb *, discovery_t *discovery); +void irlap_discovery_confirm(struct irlap_cb *, hashbin_t *discovery_log); +void irlap_discovery_indication(struct irlap_cb *, discovery_t *discovery); + +void irlap_reset_indication(struct irlap_cb *self); +void irlap_reset_confirm(void); + +void irlap_update_nr_received(struct irlap_cb *, int nr); +int irlap_validate_nr_received(struct irlap_cb *, int nr); +int irlap_validate_ns_received(struct irlap_cb *, int ns); + +int irlap_generate_rand_time_slot(int S, int s); +void irlap_initiate_connection_state(struct irlap_cb *); +void irlap_flush_all_queues(struct irlap_cb *); +void irlap_wait_min_turn_around(struct irlap_cb *, struct qos_info *); + +void irlap_apply_default_connection_parameters(struct irlap_cb *self); +void irlap_apply_connection_parameters(struct irlap_cb *self, int now); + +#define IRLAP_GET_HEADER_SIZE(self) (LAP_MAX_HEADER) +#define IRLAP_GET_TX_QUEUE_LEN(self) skb_queue_len(&self->txq) + +/* Return TRUE if the node is in primary mode (i.e. master) + * - Jean II */ +static inline int irlap_is_primary(struct irlap_cb *self) +{ + int ret; + switch(self->state) { + case LAP_XMIT_P: + case LAP_NRM_P: + ret = 1; + break; + case LAP_XMIT_S: + case LAP_NRM_S: + ret = 0; + break; + default: + ret = -1; + } + return ret; +} + +/* Clear a pending IrLAP disconnect. 
- Jean II */ +static inline void irlap_clear_disconnect(struct irlap_cb *self) +{ + self->disconnect_pending = FALSE; +} + +/* + * Function irlap_next_state (self, state) + * + * Switches state and provides debug information + * + */ +static inline void irlap_next_state(struct irlap_cb *self, IRLAP_STATE state) +{ + /* + if (!self || self->magic != LAP_MAGIC) + return; + + pr_debug("next LAP state = %s\n", irlap_state[state]); + */ + self->state = state; +} + +#endif diff --git a/drivers/staging/irda/include/net/irda/irlap_event.h b/drivers/staging/irda/include/net/irda/irlap_event.h new file mode 100644 index 000000000000..e4325fee1267 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlap_event.h @@ -0,0 +1,129 @@ +/********************************************************************* + * + * + * Filename: irlap_event.h + * Version: 0.1 + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Aug 16 00:59:29 1997 + * Modified at: Tue Dec 21 11:20:30 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
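irlap_is_primary() above is deliberately tri-state; callers that only care whether the node may currently act as master collapse it as in this hypothetical guard:

    static int example_may_poll(struct irlap_cb *self)
    {
            /* 1 = primary, 0 = secondary, -1 = not connected */
            return irlap_is_primary(self) == 1;
    }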
+ * + ********************************************************************/ + +#ifndef IRLAP_EVENT_H +#define IRLAP_EVENT_H + +#include + +/* A few forward declarations (to make compiler happy) */ +struct irlap_cb; +struct irlap_info; + +/* IrLAP States */ +typedef enum { + LAP_NDM, /* Normal disconnected mode */ + LAP_QUERY, + LAP_REPLY, + LAP_CONN, /* Connect indication */ + LAP_SETUP, /* Setting up connection */ + LAP_OFFLINE, /* A really boring state */ + LAP_XMIT_P, + LAP_PCLOSE, + LAP_NRM_P, /* Normal response mode as primary */ + LAP_RESET_WAIT, + LAP_RESET, + LAP_NRM_S, /* Normal response mode as secondary */ + LAP_XMIT_S, + LAP_SCLOSE, + LAP_RESET_CHECK, +} IRLAP_STATE; + +/* IrLAP Events */ +typedef enum { + /* Services events */ + DISCOVERY_REQUEST, + CONNECT_REQUEST, + CONNECT_RESPONSE, + DISCONNECT_REQUEST, + DATA_REQUEST, + RESET_REQUEST, + RESET_RESPONSE, + + /* Send events */ + SEND_I_CMD, + SEND_UI_FRAME, + + /* Receive events */ + RECV_DISCOVERY_XID_CMD, + RECV_DISCOVERY_XID_RSP, + RECV_SNRM_CMD, + RECV_TEST_CMD, + RECV_TEST_RSP, + RECV_UA_RSP, + RECV_DM_RSP, + RECV_RD_RSP, + RECV_I_CMD, + RECV_I_RSP, + RECV_UI_FRAME, + RECV_FRMR_RSP, + RECV_RR_CMD, + RECV_RR_RSP, + RECV_RNR_CMD, + RECV_RNR_RSP, + RECV_REJ_CMD, + RECV_REJ_RSP, + RECV_SREJ_CMD, + RECV_SREJ_RSP, + RECV_DISC_CMD, + + /* Timer events */ + SLOT_TIMER_EXPIRED, + QUERY_TIMER_EXPIRED, + FINAL_TIMER_EXPIRED, + POLL_TIMER_EXPIRED, + DISCOVERY_TIMER_EXPIRED, + WD_TIMER_EXPIRED, + BACKOFF_TIMER_EXPIRED, + MEDIA_BUSY_TIMER_EXPIRED, +} IRLAP_EVENT; + +/* + * Disconnect reason code + */ +typedef enum { /* FIXME check the two first reason codes */ + LAP_DISC_INDICATION=1, /* Received a disconnect request from peer */ + LAP_NO_RESPONSE, /* Too many retransmits without response */ + LAP_RESET_INDICATION, /* Too many retransmits, or invalid nr/ns */ + LAP_FOUND_NONE, /* No devices were discovered */ + LAP_MEDIA_BUSY, + LAP_PRIMARY_CONFLICT, +} LAP_REASON; + +extern const char *const irlap_state[]; + +void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +void irlap_print_event(IRLAP_EVENT event); + +int irlap_qos_negotiate(struct irlap_cb *self, struct sk_buff *skb); + +#endif diff --git a/drivers/staging/irda/include/net/irda/irlap_frame.h b/drivers/staging/irda/include/net/irda/irlap_frame.h new file mode 100644 index 000000000000..cbc12a926e5f --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlap_frame.h @@ -0,0 +1,167 @@ +/********************************************************************* + * + * Filename: irlap_frame.h + * Version: 0.9 + * Description: IrLAP frame declarations + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Aug 19 10:27:26 1997 + * Modified at: Sat Dec 25 21:07:26 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
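As a usage illustration of the event interface above: a minimal, hypothetical sketch of a timer handler feeding an expiry into the IrLAP state machine through irlap_do_event(). The handler name is invented; timer events carry neither a frame nor extra info, hence the NULL arguments.

	/* Hypothetical handler: drive the state machine on F-timer expiry. */
	static void example_final_timer_expired(void *data)
	{
		struct irlap_cb *self = data;

		irlap_do_event(self, FINAL_TIMER_EXPIRED, NULL, NULL);
	}
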
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + ********************************************************************/ + +#ifndef IRLAP_FRAME_H +#define IRLAP_FRAME_H + +#include + +#include + +/* A few forward declarations (to make compiler happy) */ +struct irlap_cb; +struct discovery_t; + +/* Frame types and templates */ +#define INVALID 0xff + +/* Unnumbered (U) commands */ +#define SNRM_CMD 0x83 /* Set Normal Response Mode */ +#define DISC_CMD 0x43 /* Disconnect */ +#define XID_CMD 0x2f /* Exchange Station Identification */ +#define TEST_CMD 0xe3 /* Test */ + +/* Unnumbered responses */ +#define RNRM_RSP 0x83 /* Request Normal Response Mode */ +#define UA_RSP 0x63 /* Unnumbered Acknowledgement */ +#define FRMR_RSP 0x87 /* Frame Reject */ +#define DM_RSP 0x0f /* Disconnect Mode */ +#define RD_RSP 0x43 /* Request Disconnection */ +#define XID_RSP 0xaf /* Exchange Station Identification */ +#define TEST_RSP 0xe3 /* Test frame */ + +/* Supervisory (S) */ +#define RR 0x01 /* Receive Ready */ +#define REJ 0x09 /* Reject */ +#define RNR 0x05 /* Receive Not Ready */ +#define SREJ 0x0d /* Selective Reject */ + +/* Information (I) */ +#define I_FRAME 0x00 /* Information Format */ +#define UI_FRAME 0x03 /* Unnumbered Information */ + +#define CMD_FRAME 0x01 +#define RSP_FRAME 0x00 + +#define PF_BIT 0x10 /* Poll/final bit */ + +/* Some IrLAP field lengths */ +/* + * Only baud rate triplet is 4 bytes (PV can be 2 bytes). + * All other params (7) are 3 bytes, so that's 7*3 + 1*4 bytes. + */ +#define IRLAP_NEGOCIATION_PARAMS_LEN 25 +#define IRLAP_DISCOVERY_INFO_LEN 32 + +struct disc_frame { + __u8 caddr; /* Connection address */ + __u8 control; +} __packed; + +struct xid_frame { + __u8 caddr; /* Connection address */ + __u8 control; + __u8 ident; /* Should always be XID_FORMAT */ + __le32 saddr; /* Source device address */ + __le32 daddr; /* Destination device address */ + __u8 flags; /* Discovery flags */ + __u8 slotnr; + __u8 version; +} __packed; + +struct test_frame { + __u8 caddr; /* Connection address */ + __u8 control; + __le32 saddr; /* Source device address */ + __le32 daddr; /* Destination device address */ +} __packed; + +struct ua_frame { + __u8 caddr; + __u8 control; + __le32 saddr; /* Source device address */ + __le32 daddr; /* Dest device address */ +} __packed; + +struct dm_frame { + __u8 caddr; /* Connection address */ + __u8 control; +} __packed; + +struct rd_frame { + __u8 caddr; /* Connection address */ + __u8 control; +} __packed; + +struct rr_frame { + __u8 caddr; /* Connection address */ + __u8 control; +} __packed; + +struct i_frame { + __u8 caddr; + __u8 control; +} __packed; + +struct snrm_frame { + __u8 caddr; + __u8 control; + __le32 saddr; + __le32 daddr; + __u8 ncaddr; +} __packed; + +void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb); +void irlap_send_discovery_xid_frame(struct irlap_cb *, int S, __u8 s, + __u8 command, + struct discovery_t *discovery); +void irlap_send_snrm_frame(struct irlap_cb *, struct qos_info *); +void irlap_send_test_frame(struct irlap_cb *self, __u8 caddr, __u32 daddr, + struct sk_buff *cmd); +void irlap_send_ua_response_frame(struct irlap_cb *, struct qos_info *); +void irlap_send_dm_frame(struct irlap_cb *self); +void irlap_send_rd_frame(struct irlap_cb *self); +void irlap_send_disc_frame(struct irlap_cb *self); +void irlap_send_rr_frame(struct irlap_cb *self, int command); + +void irlap_send_data_primary(struct irlap_cb *, struct sk_buff *); +void
irlap_send_data_primary_poll(struct irlap_cb *, struct sk_buff *); +void irlap_send_data_secondary(struct irlap_cb *, struct sk_buff *); +void irlap_send_data_secondary_final(struct irlap_cb *, struct sk_buff *); +void irlap_resend_rejected_frames(struct irlap_cb *, int command); +void irlap_resend_rejected_frame(struct irlap_cb *self, int command); + +void irlap_send_ui_frame(struct irlap_cb *self, struct sk_buff *skb, + __u8 caddr, int command); + +int irlap_insert_qos_negotiation_params(struct irlap_cb *self, + struct sk_buff *skb); + +#endif diff --git a/drivers/staging/irda/include/net/irda/irlmp.h b/drivers/staging/irda/include/net/irda/irlmp.h new file mode 100644 index 000000000000..f132924cc9da --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlmp.h @@ -0,0 +1,295 @@ +/********************************************************************* + * + * Filename: irlmp.h + * Version: 0.9 + * Description: IrDA Link Management Protocol (LMP) layer + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 17 20:54:32 1997 + * Modified at: Fri Dec 10 13:23:01 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef IRLMP_H +#define IRLMP_H + +#include /* for HZ */ + +#include + +#include +#include +#include /* LAP_MAX_HEADER, ... 
*/ +#include +#include +#include + +/* LSAP-SEL's */ +#define LSAP_MASK 0x7f +#define LSAP_IAS 0x00 +#define LSAP_ANY 0xff +#define LSAP_MAX 0x6f /* 0x70-0x7f are reserved */ +#define LSAP_CONNLESS 0x70 /* Connectionless LSAP, mostly used for Ultra */ + +#define DEV_ADDR_ANY 0xffffffff + +#define LMP_HEADER 2 /* Dest LSAP + Source LSAP */ +#define LMP_CONTROL_HEADER 4 /* LMP_HEADER + opcode + parameter */ +#define LMP_PID_HEADER 1 /* Used by Ultra */ +#define LMP_MAX_HEADER (LMP_CONTROL_HEADER+LAP_MAX_HEADER) + +#define LM_MAX_CONNECTIONS 10 + +#define LM_IDLE_TIMEOUT 2*HZ /* 2 seconds for now */ + +typedef enum { + S_PNP = 0, + S_PDA, + S_COMPUTER, + S_PRINTER, + S_MODEM, + S_FAX, + S_LAN, + S_TELEPHONY, + S_COMM, + S_OBEX, + S_ANY, + S_END, +} SERVICE; + +/* For selective discovery */ +typedef void (*DISCOVERY_CALLBACK1) (discinfo_t *, DISCOVERY_MODE, void *); +/* For expiry (the same) */ +typedef void (*DISCOVERY_CALLBACK2) (discinfo_t *, DISCOVERY_MODE, void *); + +typedef struct { + irda_queue_t queue; /* Must be first */ + + __u16_host_order hints; /* Hint bits */ +} irlmp_service_t; + +typedef struct { + irda_queue_t queue; /* Must be first */ + + __u16_host_order hint_mask; + + DISCOVERY_CALLBACK1 disco_callback; /* Selective discovery */ + DISCOVERY_CALLBACK2 expir_callback; /* Selective expiration */ + void *priv; /* Used to identify client */ +} irlmp_client_t; + +/* + * Information about each logical LSAP connection + */ +struct lsap_cb { + irda_queue_t queue; /* Must be first */ + magic_t magic; + + unsigned long connected; /* set_bit used on this */ + int persistent; + + __u8 slsap_sel; /* Source (this) LSAP address */ + __u8 dlsap_sel; /* Destination LSAP address (if connected) */ +#ifdef CONFIG_IRDA_ULTRA + __u8 pid; /* Used by connectionless LSAP */ +#endif /* CONFIG_IRDA_ULTRA */ + struct sk_buff *conn_skb; /* Store skb here while connecting */ + + struct timer_list watchdog_timer; + + LSAP_STATE lsap_state; /* Connection state */ + notify_t notify; /* Indication/Confirm entry points */ + struct qos_info qos; /* QoS for this connection */ + + struct lap_cb *lap; /* Pointer to LAP connection structure */ +}; + +/* + * Used for caching the last slsap->dlsap->handle mapping + * + * We don't need to keep/match the remote address in the cache because + * we are associated with a specific LAP (which implies it). + * Jean II + */ +typedef struct { + int valid; + + __u8 slsap_sel; + __u8 dlsap_sel; + struct lsap_cb *lsap; +} CACHE_ENTRY; + +/* + * Information about each registered IrLAP layer + */ +struct lap_cb { + irda_queue_t queue; /* Must be first */ + magic_t magic; + + int reason; /* LAP disconnect reason */ + + IRLMP_STATE lap_state; + + struct irlap_cb *irlap; /* Instance of IrLAP layer */ + hashbin_t *lsaps; /* LSAP associated with this link */ + struct lsap_cb *flow_next; /* Next lsap to be polled for Tx */ + + __u8 caddr; /* Connection address */ + __u32 saddr; /* Source device address */ + __u32 daddr; /* Destination device address */ + + struct qos_info *qos; /* LAP QoS for this session */ + struct timer_list idle_timer; + +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + /* The lsap cache was moved from struct irlmp_cb to here because + * it must be associated with the specific LAP. Also, this + * improves performance. 
- Jean II */ + CACHE_ENTRY cache; /* Caching last slsap->dlsap->handle mapping */ +#endif +}; + +/* + * Main structure for IrLMP + */ +struct irlmp_cb { + magic_t magic; + + __u8 conflict_flag; + + discovery_t discovery_cmd; /* Discovery command to use by IrLAP */ + discovery_t discovery_rsp; /* Discovery response to use by IrLAP */ + + /* Last lsap picked automatically by irlmp_find_free_slsap() */ + int last_lsap_sel; + + struct timer_list discovery_timer; + + hashbin_t *links; /* IrLAP connection table */ + hashbin_t *unconnected_lsaps; + hashbin_t *clients; + hashbin_t *services; + + hashbin_t *cachelog; /* Current discovery log */ + + int running; + + __u16_host_order hints; /* Hint bits */ +}; + +/* Prototype declarations */ +int irlmp_init(void); +void irlmp_cleanup(void); +struct lsap_cb *irlmp_open_lsap(__u8 slsap, notify_t *notify, __u8 pid); +void irlmp_close_lsap( struct lsap_cb *self); + +__u16 irlmp_service_to_hint(int service); +void *irlmp_register_service(__u16 hints); +int irlmp_unregister_service(void *handle); +void *irlmp_register_client(__u16 hint_mask, DISCOVERY_CALLBACK1 disco_clb, + DISCOVERY_CALLBACK2 expir_clb, void *priv); +int irlmp_unregister_client(void *handle); +int irlmp_update_client(void *handle, __u16 hint_mask, + DISCOVERY_CALLBACK1 disco_clb, + DISCOVERY_CALLBACK2 expir_clb, void *priv); + +void irlmp_register_link(struct irlap_cb *, __u32 saddr, notify_t *); +void irlmp_unregister_link(__u32 saddr); + +int irlmp_connect_request(struct lsap_cb *, __u8 dlsap_sel, + __u32 saddr, __u32 daddr, + struct qos_info *, struct sk_buff *); +void irlmp_connect_indication(struct lsap_cb *self, struct sk_buff *skb); +int irlmp_connect_response(struct lsap_cb *, struct sk_buff *); +void irlmp_connect_confirm(struct lsap_cb *, struct sk_buff *); +struct lsap_cb *irlmp_dup(struct lsap_cb *self, void *instance); + +void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason, + struct sk_buff *userdata); +int irlmp_disconnect_request(struct lsap_cb *, struct sk_buff *userdata); + +void irlmp_discovery_confirm(hashbin_t *discovery_log, DISCOVERY_MODE mode); +void irlmp_discovery_request(int nslots); +discinfo_t *irlmp_get_discoveries(int *pn, __u16 mask, int nslots); +void irlmp_do_expiry(void); +void irlmp_do_discovery(int nslots); +discovery_t *irlmp_get_discovery_response(void); +void irlmp_discovery_expiry(discinfo_t *expiry, int number); + +int irlmp_data_request(struct lsap_cb *, struct sk_buff *); +void irlmp_data_indication(struct lsap_cb *, struct sk_buff *); + +int irlmp_udata_request(struct lsap_cb *, struct sk_buff *); +void irlmp_udata_indication(struct lsap_cb *, struct sk_buff *); + +#ifdef CONFIG_IRDA_ULTRA +int irlmp_connless_data_request(struct lsap_cb *, struct sk_buff *, __u8); +void irlmp_connless_data_indication(struct lsap_cb *, struct sk_buff *); +#endif /* CONFIG_IRDA_ULTRA */ + +void irlmp_status_indication(struct lap_cb *, LINK_STATUS link, LOCK_STATUS lock); +void irlmp_flow_indication(struct lap_cb *self, LOCAL_FLOW flow); + +LM_REASON irlmp_convert_lap_reason(LAP_REASON); + +static inline __u32 irlmp_get_saddr(const struct lsap_cb *self) +{ + return (self && self->lap) ? self->lap->saddr : 0; +} + +static inline __u32 irlmp_get_daddr(const struct lsap_cb *self) +{ + return (self && self->lap) ? 
self->lap->daddr : 0; +} + +const char *irlmp_reason_str(LM_REASON reason); + +extern int sysctl_discovery_timeout; +extern int sysctl_discovery_slots; +extern int sysctl_discovery; +extern int sysctl_lap_keepalive_time; /* in ms, default is LM_IDLE_TIMEOUT */ +extern struct irlmp_cb *irlmp; + +/* Check if LAP queue is full. + * Used by IrTTP for flow control, see comments in irlap.h - Jean II */ +static inline int irlmp_lap_tx_queue_full(struct lsap_cb *self) +{ + if (self == NULL) + return 0; + if (self->lap == NULL) + return 0; + if (self->lap->irlap == NULL) + return 0; + + return IRLAP_GET_TX_QUEUE_LEN(self->lap->irlap) >= LAP_HIGH_THRESHOLD; +} + +/* After doing an irlmp_dup(), this gets one of the two sockets back into + * a state where it's waiting for incoming connections. + * Note : this can be used *only* if the socket is not yet connected + * (i.e. NO irlmp_connect_response() done on this socket). + * - Jean II */ +static inline void irlmp_listen(struct lsap_cb *self) +{ + self->dlsap_sel = LSAP_ANY; + self->lap = NULL; + self->lsap_state = LSAP_DISCONNECTED; + /* Started when we received the LM_CONNECT_INDICATION */ + del_timer(&self->watchdog_timer); +} + +#endif diff --git a/drivers/staging/irda/include/net/irda/irlmp_event.h b/drivers/staging/irda/include/net/irda/irlmp_event.h new file mode 100644 index 000000000000..9e4ec17a7449 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlmp_event.h @@ -0,0 +1,98 @@ +/********************************************************************* + * + * Filename: irlmp_event.h + * Version: 0.1 + * Description: IrDA-LMP event handling + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Thu Jul 8 12:18:54 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge.
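A minimal sketch of the flow-control check above, as a hypothetical client might apply it before queuing data; the helper name is invented, and -EAGAIN here simply means "retry after the next flow_indication()".

	/* Hypothetical helper: only queue data while IrLAP has room. */
	static int example_try_send(struct lsap_cb *self, struct sk_buff *skb)
	{
		if (irlmp_lap_tx_queue_full(self))
			return -EAGAIN;	/* wait for flow_indication(FLOW_START) */

		return irlmp_data_request(self, skb);
	}
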
+ * + ********************************************************************/ + +#ifndef IRLMP_EVENT_H +#define IRLMP_EVENT_H + +/* A few forward declarations (to make compiler happy) */ +struct irlmp_cb; +struct lsap_cb; +struct lap_cb; +struct discovery_t; + +/* LAP states */ +typedef enum { + /* IrLAP connection control states */ + LAP_STANDBY, /* No LAP connection */ + LAP_U_CONNECT, /* Starting LAP connection */ + LAP_ACTIVE, /* LAP connection is active */ +} IRLMP_STATE; + +/* LSAP connection control states */ +typedef enum { + LSAP_DISCONNECTED, /* No LSAP connection */ + LSAP_CONNECT, /* Connect indication from peer */ + LSAP_CONNECT_PEND, /* Connect request from service user */ + LSAP_DATA_TRANSFER_READY, /* LSAP connection established */ + LSAP_SETUP, /* Trying to set up LSAP connection */ + LSAP_SETUP_PEND, /* Request to start LAP connection */ +} LSAP_STATE; + +typedef enum { + /* LSAP events */ + LM_CONNECT_REQUEST, + LM_CONNECT_CONFIRM, + LM_CONNECT_RESPONSE, + LM_CONNECT_INDICATION, + + LM_DISCONNECT_INDICATION, + LM_DISCONNECT_REQUEST, + + LM_DATA_REQUEST, + LM_UDATA_REQUEST, + LM_DATA_INDICATION, + LM_UDATA_INDICATION, + + LM_WATCHDOG_TIMEOUT, + + /* IrLAP events */ + LM_LAP_CONNECT_REQUEST, + LM_LAP_CONNECT_INDICATION, + LM_LAP_CONNECT_CONFIRM, + LM_LAP_DISCONNECT_INDICATION, + LM_LAP_DISCONNECT_REQUEST, + LM_LAP_DISCOVERY_REQUEST, + LM_LAP_DISCOVERY_CONFIRM, + LM_LAP_IDLE_TIMEOUT, +} IRLMP_EVENT; + +extern const char *const irlmp_state[]; +extern const char *const irlsap_state[]; + +void irlmp_watchdog_timer_expired(void *data); +void irlmp_discovery_timer_expired(void *data); +void irlmp_idle_timer_expired(void *data); + +void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb); +int irlmp_do_lsap_event(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb); + +#endif /* IRLMP_EVENT_H */ + + + + diff --git a/drivers/staging/irda/include/net/irda/irlmp_frame.h b/drivers/staging/irda/include/net/irda/irlmp_frame.h new file mode 100644 index 000000000000..1906eb71422e --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irlmp_frame.h @@ -0,0 +1,62 @@ +/********************************************************************* + * + * Filename: irlmp_frame.h + * Version: 0.9 + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Aug 19 02:09:59 1997 + * Modified at: Fri Dec 10 13:21:53 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
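For illustration, a hypothetical wrapper showing how one event would be fed into the LAP-side state machine declared above; no frame accompanies this service request, hence the NULL skb.

	/* Hypothetical wrapper: kick off a LAP connection attempt. */
	static void example_lap_connect(struct lap_cb *self)
	{
		irlmp_do_lap_event(self, LM_LAP_CONNECT_REQUEST, NULL);
	}
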
+ * + ********************************************************************/ + +#ifndef IRLMP_FRAME_H +#define IRLMP_FRAME_H + +#include + +#include + +/* IrLMP frame opcodes */ +#define CONNECT_CMD 0x01 +#define CONNECT_CNF 0x81 +#define DISCONNECT 0x02 +#define ACCESSMODE_CMD 0x03 +#define ACCESSMODE_CNF 0x83 + +#define CONTROL_BIT 0x80 + +void irlmp_send_data_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, + int expedited, struct sk_buff *skb); +void irlmp_send_lcf_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, + __u8 opcode, struct sk_buff *skb); +void irlmp_link_data_indication(struct lap_cb *, struct sk_buff *, + int unreliable); +#ifdef CONFIG_IRDA_ULTRA +void irlmp_link_unitdata_indication(struct lap_cb *, struct sk_buff *); +#endif /* CONFIG_IRDA_ULTRA */ + +void irlmp_link_connect_indication(struct lap_cb *, __u32 saddr, __u32 daddr, + struct qos_info *qos, struct sk_buff *skb); +void irlmp_link_connect_request(__u32 daddr); +void irlmp_link_connect_confirm(struct lap_cb *self, struct qos_info *qos, + struct sk_buff *skb); +void irlmp_link_disconnect_indication(struct lap_cb *, struct irlap_cb *, + LAP_REASON reason, struct sk_buff *); +void irlmp_link_discovery_confirm(struct lap_cb *self, hashbin_t *log); +void irlmp_link_discovery_indication(struct lap_cb *, discovery_t *discovery); + +#endif diff --git a/drivers/staging/irda/include/net/irda/irmod.h b/drivers/staging/irda/include/net/irda/irmod.h new file mode 100644 index 000000000000..86f0dbb8ee5d --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irmod.h @@ -0,0 +1,109 @@ +/********************************************************************* + * + * Filename: irmod.h + * Version: 0.3 + * Description: IrDA module and utilities functions + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Dec 15 13:58:52 1997 + * Modified at: Fri Jan 28 13:15:24 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef IRMOD_H +#define IRMOD_H + +/* Misc status information */ +typedef enum { + STATUS_OK, + STATUS_ABORTED, + STATUS_NO_ACTIVITY, + STATUS_NOISY, + STATUS_REMOTE, +} LINK_STATUS; + +typedef enum { + LOCK_NO_CHANGE, + LOCK_LOCKED, + LOCK_UNLOCKED, +} LOCK_STATUS; + +typedef enum { FLOW_STOP, FLOW_START } LOCAL_FLOW; + +/* + * IrLMP disconnect reasons.
The order is very important, since they + * correspond to disconnect reasons sent in IrLMP disconnect frames, so + * please do not touch :-) + */ +typedef enum { + LM_USER_REQUEST = 1, /* User request */ + LM_LAP_DISCONNECT, /* Unexpected IrLAP disconnect */ + LM_CONNECT_FAILURE, /* Failed to establish IrLAP connection */ + LM_LAP_RESET, /* IrLAP reset */ + LM_INIT_DISCONNECT, /* Link Management initiated disconnect */ + LM_LSAP_NOTCONN, /* Data delivered on unconnected LSAP */ + LM_NON_RESP_CLIENT, /* Non responsive LM-MUX client */ + LM_NO_AVAIL_CLIENT, /* No available LM-MUX client */ + LM_CONN_HALF_OPEN, /* Connection is half open */ + LM_BAD_SOURCE_ADDR, /* Illegal source address (i.e. 0x00) */ +} LM_REASON; +#define LM_UNKNOWN 0xff /* Unspecified disconnect reason */ + +/* A few forward declarations (to make compiler happy) */ +struct qos_info; /* in */ + +/* + * Notify structure used between transport and link management layers + */ +typedef struct { + int (*data_indication)(void *priv, void *sap, struct sk_buff *skb); + int (*udata_indication)(void *priv, void *sap, struct sk_buff *skb); + void (*connect_confirm)(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 max_header_size, struct sk_buff *skb); + void (*connect_indication)(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 max_header_size, struct sk_buff *skb); + void (*disconnect_indication)(void *instance, void *sap, + LM_REASON reason, struct sk_buff *); + void (*flow_indication)(void *instance, void *sap, LOCAL_FLOW flow); + void (*status_indication)(void *instance, + LINK_STATUS link, LOCK_STATUS lock); + void *instance; /* Layer instance pointer */ + char name[16]; /* Name of layer */ +} notify_t; + +#define NOTIFY_MAX_NAME 16 + +/* Zero the notify structure */ +void irda_notify_init(notify_t *notify); + +/* Locking wrapper - Note the inverted logic on irda_lock(). + * Those functions basically return false if the lock is already in the + * position you want to set it. - Jean II */ +#define irda_lock(lock) (! test_and_set_bit(0, (void *) (lock))) +#define irda_unlock(lock) (test_and_clear_bit(0, (void *) (lock))) + +#endif /* IRMOD_H */ + + + + + + + + + diff --git a/drivers/staging/irda/include/net/irda/irqueue.h b/drivers/staging/irda/include/net/irda/irqueue.h new file mode 100644 index 000000000000..37f512bd6733 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irqueue.h @@ -0,0 +1,96 @@ +/********************************************************************* + * + * Filename: irqueue.h + * Version: 0.3 + * Description: General queue implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Jun 9 13:26:50 1998 + * Modified at: Thu Oct 7 13:25:16 1999 + * Modified by: Dag Brattli + * + * Copyright (C) 1998-1999, Aage Kvalnes + * Copyright (c) 1998, Dag Brattli + * All Rights Reserved. + * + * This code is taken from the Vortex Operating System written by Aage + * Kvalnes and has been ported to Linux and Linux/IR by Dag Brattli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge.
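A small sketch of the inverted irda_lock() idiom noted above; the lock word and function are hypothetical.

	static long example_lock;	/* bit 0 is the lock */

	static void example_exclusive_work(void)
	{
		/* irda_lock() returns false if the bit was already set. */
		if (!irda_lock(&example_lock))
			return;		/* already held by someone else */

		/* ... exclusive section ... */

		irda_unlock(&example_lock);
	}
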
+ * + ********************************************************************/ + +#include +#include + +#ifndef IRDA_QUEUE_H +#define IRDA_QUEUE_H + +#define NAME_SIZE 32 + +/* + * Hash types (some flags can be xored) + * See comments in irqueue.c for which one to use... + */ +#define HB_NOLOCK 0 /* No concurrent access prevention */ +#define HB_LOCK 1 /* Prevent concurrent write with global lock */ + +/* + * Hash defines + */ +#define HASHBIN_SIZE 8 +#define HASHBIN_MASK 0x7 + +#ifndef IRDA_ALIGN +#define IRDA_ALIGN __attribute__((aligned)) +#endif + +#define Q_NULL { NULL, NULL, "", 0 } + +typedef void (*FREE_FUNC)(void *arg); + +struct irda_queue { + struct irda_queue *q_next; + struct irda_queue *q_prev; + + char q_name[NAME_SIZE]; + long q_hash; /* Must be able to cast a (void *) */ +}; +typedef struct irda_queue irda_queue_t; + +typedef struct hashbin_t { + __u32 magic; + int hb_type; + int hb_size; + spinlock_t hb_spinlock; /* HB_LOCK - Can be used by the user */ + + irda_queue_t* hb_queue[HASHBIN_SIZE] IRDA_ALIGN; + + irda_queue_t* hb_current; +} hashbin_t; + +hashbin_t *hashbin_new(int type); +int hashbin_delete(hashbin_t* hashbin, FREE_FUNC func); +int hashbin_clear(hashbin_t* hashbin, FREE_FUNC free_func); +void hashbin_insert(hashbin_t* hashbin, irda_queue_t* entry, long hashv, + const char* name); +void* hashbin_remove(hashbin_t* hashbin, long hashv, const char* name); +void* hashbin_remove_first(hashbin_t *hashbin); +void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry); +void* hashbin_find(hashbin_t* hashbin, long hashv, const char* name); +void* hashbin_lock_find(hashbin_t* hashbin, long hashv, const char* name); +void* hashbin_find_next(hashbin_t* hashbin, long hashv, const char* name, + void ** pnext); +irda_queue_t *hashbin_get_first(hashbin_t *hashbin); +irda_queue_t *hashbin_get_next(hashbin_t *hashbin); + +#define HASHBIN_GET_SIZE(hashbin) hashbin->hb_size + +#endif diff --git a/drivers/staging/irda/include/net/irda/irttp.h b/drivers/staging/irda/include/net/irda/irttp.h new file mode 100644 index 000000000000..98682d4bae8f --- /dev/null +++ b/drivers/staging/irda/include/net/irda/irttp.h @@ -0,0 +1,210 @@ +/********************************************************************* + * + * Filename: irttp.h + * Version: 1.0 + * Description: Tiny Transport Protocol (TTP) definitions + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:31 1997 + * Modified at: Sun Dec 12 13:09:07 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge.
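A minimal sketch of the hashbin API above; the entry type and function are hypothetical, with the irda_queue_t placed first as the queue code expects.

	struct example_entry {
		irda_queue_t q;		/* Must be first */
		int value;
	};

	static void example_hashbin_use(struct example_entry *e)
	{
		hashbin_t *bin = hashbin_new(HB_LOCK);

		if (!bin)
			return;

		/* Keyed on a long value; the name is unused in this sketch. */
		hashbin_insert(bin, &e->q, 42L, NULL);
		e = hashbin_find(bin, 42L, NULL);
		hashbin_delete(bin, NULL);	/* NULL: entries are not freed */
	}
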
+ * + ********************************************************************/ + +#ifndef IRTTP_H +#define IRTTP_H + +#include +#include +#include + +#include +#include /* struct lsap_cb */ +#include /* struct qos_info */ +#include + +#define TTP_MAX_CONNECTIONS LM_MAX_CONNECTIONS +#define TTP_HEADER 1 +#define TTP_MAX_HEADER (TTP_HEADER + LMP_MAX_HEADER) +#define TTP_SAR_HEADER 5 +#define TTP_PARAMETERS 0x80 +#define TTP_MORE 0x80 + +/* Transmission queue sizes */ +/* Worst case scenario, two windows of data - Jean II */ +#define TTP_TX_MAX_QUEUE 14 +/* We need to keep at least 5 frames to make sure that we can refill + * the LAP layer appropriately. LAP keeps only two buffers, and we need + * to have 7 to make a full window - Jean II */ +#define TTP_TX_LOW_THRESHOLD 5 +/* Most clients are synchronous with respect to flow control, so we can + * keep a low number of Tx buffers in TTP - Jean II */ +#define TTP_TX_HIGH_THRESHOLD 7 + +/* Receive queue sizes */ +/* Minimum credit that the peer should hold. + * If the peer has fewer credits than 9 frames, we will explicitly send + * him some credits (through irttp_give_credit() and a specific frame). + * Note that when we give credits it's likely that it won't be sent in + * this LAP window, but in the next one. So, we make sure that the peer + * has something to send while waiting for credits (one LAP window == 7 + * + 1 frames while he processes the credits). - Jean II */ +#define TTP_RX_MIN_CREDIT 8 +/* This is the default maximum number of credits held by the peer, so the + * default maximum number of frames he can send us before needing a flow + * control answer from us (this may be negotiated differently at TSAP setup). + * We want to minimise the number of times we have to explicitly send some + * credit to the peer, hoping we can piggyback it on the return data. In + * particular, it doesn't make sense for us to send credit more than once + * per LAP window. + * Moreover, giving credits has some latency, so we need strictly more than + * a LAP window, otherwise we may already have credits in our Tx queue. + * But on the other hand, we don't want to keep too many Rx buffers here + * before starting to flow control the other end, so make it exactly one + * LAP window + 1 + MIN_CREDITS. - Jean II */ +#define TTP_RX_DEFAULT_CREDIT 16 +/* Maximum number of credits we can allow the peer to have, and therefore + * maximum Rx queue size. + * Note that we try to deliver packets to the higher layer every time we + * receive something, so in normal mode the Rx queue will never contain + * more than one or two packets. - Jean II */ +#define TTP_RX_MAX_CREDIT 21 + +/* What clients should use when calling irttp_open_tsap() */ +#define DEFAULT_INITIAL_CREDIT TTP_RX_DEFAULT_CREDIT + +/* Some priorities for disconnect requests */ +#define P_NORMAL 0 +#define P_HIGH 1 + +#define TTP_SAR_DISABLE 0 +#define TTP_SAR_UNBOUND 0xffffffff + +/* Parameters */ +#define TTP_MAX_SDU_SIZE 0x01 + +/* + * This structure contains all data associated with one instance of a TTP + * connection.
+ */ +struct tsap_cb { + irda_queue_t q; /* Must be first */ + magic_t magic; /* Just in case */ + + __u8 stsap_sel; /* Source TSAP */ + __u8 dtsap_sel; /* Destination TSAP */ + + struct lsap_cb *lsap; /* Corresponding LSAP to this TSAP */ + + __u8 connected; /* TSAP connected */ + + __u8 initial_credit; /* Initial credit to give peer */ + + int avail_credit; /* Available credit to return to peer */ + int remote_credit; /* Credit held by peer TTP entity */ + int send_credit; /* Credit held by local TTP entity */ + + struct sk_buff_head tx_queue; /* Frames to be transmitted */ + struct sk_buff_head rx_queue; /* Received frames */ + struct sk_buff_head rx_fragments; + int tx_queue_lock; + int rx_queue_lock; + spinlock_t lock; + + notify_t notify; /* Callbacks to client layer */ + + struct net_device_stats stats; + struct timer_list todo_timer; + + __u32 max_seg_size; /* Max data that fits into an IrLAP frame */ + __u8 max_header_size; + + int rx_sdu_busy; /* RxSdu.busy */ + __u32 rx_sdu_size; /* Current size of a partially received frame */ + __u32 rx_max_sdu_size; /* Max receive user data size */ + + int tx_sdu_busy; /* TxSdu.busy */ + __u32 tx_max_sdu_size; /* Max transmit user data size */ + + int close_pend; /* Close, but disconnect_pend */ + unsigned long disconnect_pend; /* Disconnect, but still data to send */ + struct sk_buff *disconnect_skb; +}; + +struct irttp_cb { + magic_t magic; + hashbin_t *tsaps; +}; + +int irttp_init(void); +void irttp_cleanup(void); + +struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify); +int irttp_close_tsap(struct tsap_cb *self); + +int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb); +int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb); + +int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel, + __u32 saddr, __u32 daddr, + struct qos_info *qos, __u32 max_sdu_size, + struct sk_buff *userdata); +int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size, + struct sk_buff *userdata); +int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *skb, + int priority); +void irttp_flow_request(struct tsap_cb *self, LOCAL_FLOW flow); +struct tsap_cb *irttp_dup(struct tsap_cb *self, void *instance); + +static inline __u32 irttp_get_saddr(struct tsap_cb *self) +{ + return irlmp_get_saddr(self->lsap); +} + +static inline __u32 irttp_get_daddr(struct tsap_cb *self) +{ + return irlmp_get_daddr(self->lsap); +} + +static inline __u32 irttp_get_max_seg_size(struct tsap_cb *self) +{ + return self->max_seg_size; +} + +/* After doing an irttp_dup(), this gets one of the two sockets back into + * a state where it's waiting for incoming connections. + * Note : this can be used *only* if the socket is not yet connected + * (i.e. NO irttp_connect_response() done on this socket). + * - Jean II */ +static inline void irttp_listen(struct tsap_cb *self) +{ + irlmp_listen(self->lsap); + self->dtsap_sel = LSAP_ANY; +} + +/* Return TRUE if the node is in primary mode (i.e.
master) + * - Jean II */ +static inline int irttp_is_primary(struct tsap_cb *self) +{ + if ((self == NULL) || + (self->lsap == NULL) || + (self->lsap->lap == NULL) || + (self->lsap->lap->irlap == NULL)) + return -2; + return irlap_is_primary(self->lsap->lap->irlap); +} + +#endif /* IRTTP_H */ diff --git a/drivers/staging/irda/include/net/irda/parameters.h b/drivers/staging/irda/include/net/irda/parameters.h new file mode 100644 index 000000000000..2d9cd0007cba --- /dev/null +++ b/drivers/staging/irda/include/net/irda/parameters.h @@ -0,0 +1,100 @@ +/********************************************************************* + * + * Filename: parameters.h + * Version: 1.0 + * Description: A more general way to handle (pi,pl,pv) parameters + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Jun 7 08:47:28 1999 + * Modified at: Sun Jan 30 14:05:14 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * Michel Dänzer , 10/2001 + * - simplify irda_pv_t to avoid endianness issues + * + ********************************************************************/ + +#ifndef IRDA_PARAMS_H +#define IRDA_PARAMS_H + +/* + * The currently supported types. 
Beware not to change the sequence since + * there is a good reason why the sized integers have a value equal to + * their size + */ +typedef enum { + PV_INTEGER, /* Integer of any (pl) length */ + PV_INT_8_BITS, /* Integer of 8 bits in length */ + PV_INT_16_BITS, /* Integer of 16 bits in length */ + PV_STRING, /* \0 terminated string */ + PV_INT_32_BITS, /* Integer of 32 bits in length */ + PV_OCT_SEQ, /* Octet sequence */ + PV_NO_VALUE /* Does not contain any value (pl=0) */ +} PV_TYPE; + +/* Bit 7 of type field */ +#define PV_BIG_ENDIAN 0x80 +#define PV_LITTLE_ENDIAN 0x00 +#define PV_MASK 0x7f /* To mask away endian bit */ + +#define PV_PUT 0 +#define PV_GET 1 + +typedef union { + char *c; + __u32 i; + __u32 *ip; +} irda_pv_t; + +typedef struct { + __u8 pi; + __u8 pl; + irda_pv_t pv; +} irda_param_t; + +typedef int (*PI_HANDLER)(void *self, irda_param_t *param, int get); +typedef int (*PV_HANDLER)(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); + +typedef struct { + const PI_HANDLER func; /* Handler for this parameter identifier */ + PV_TYPE type; /* Data type for this parameter */ +} pi_minor_info_t; + +typedef struct { + const pi_minor_info_t *pi_minor_call_table; + int len; +} pi_major_info_t; + +typedef struct { + const pi_major_info_t *tables; + int len; + __u8 pi_mask; + int pi_major_offset; +} pi_param_info_t; + +int irda_param_pack(__u8 *buf, char *fmt, ...); + +int irda_param_insert(void *self, __u8 pi, __u8 *buf, int len, + pi_param_info_t *info); +int irda_param_extract_all(void *self, __u8 *buf, int len, + pi_param_info_t *info); + +#define irda_param_insert_byte(buf,pi,pv) irda_param_pack(buf,"bbb",pi,1,pv) + +#endif /* IRDA_PARAMS_H */ + diff --git a/drivers/staging/irda/include/net/irda/qos.h b/drivers/staging/irda/include/net/irda/qos.h new file mode 100644 index 000000000000..05a5a249956f --- /dev/null +++ b/drivers/staging/irda/include/net/irda/qos.h @@ -0,0 +1,101 @@ +/********************************************************************* + * + * Filename: qos.h + * Version: 1.0 + * Description: Quality of Service definitions + * Status: Experimental. + * Author: Dag Brattli + * Created at: Fri Sep 19 23:21:09 1997 + * Modified at: Thu Dec 2 13:51:54 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see .
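To illustrate the helper macro above: a sketch packing one single-byte (pi,pl,pv) triplet. The function and buffer are hypothetical, the buffer must hold at least three bytes, and PI_MIN_TURN_TIME is taken from qos.h below.

	static void example_pack_param(__u8 *buf)
	{
		/* Expands to irda_param_pack(buf, "bbb", pi, 1, pv):
		 * parameter id, one-byte length, then the value itself. */
		irda_param_insert_byte(buf, PI_MIN_TURN_TIME, 0x02);
	}
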
+ * + ********************************************************************/ + +#ifndef IRDA_QOS_H +#define IRDA_QOS_H + +#include + +#include + +#define PI_BAUD_RATE 0x01 +#define PI_MAX_TURN_TIME 0x82 +#define PI_DATA_SIZE 0x83 +#define PI_WINDOW_SIZE 0x84 +#define PI_ADD_BOFS 0x85 +#define PI_MIN_TURN_TIME 0x86 +#define PI_LINK_DISC 0x08 + +#define IR_115200_MAX 0x3f + +/* Baud rates (first byte) */ +#define IR_2400 0x01 +#define IR_9600 0x02 +#define IR_19200 0x04 +#define IR_38400 0x08 +#define IR_57600 0x10 +#define IR_115200 0x20 +#define IR_576000 0x40 +#define IR_1152000 0x80 + +/* Baud rates (second byte) */ +#define IR_4000000 0x01 +#define IR_16000000 0x02 + +/* Quality of Service information */ +typedef struct { + __u32 value; + __u16 bits; /* LSB is first byte, MSB is second byte */ +} qos_value_t; + +struct qos_info { + magic_t magic; + + qos_value_t baud_rate; /* IR_115200 | ... */ + qos_value_t max_turn_time; + qos_value_t data_size; + qos_value_t window_size; + qos_value_t additional_bofs; + qos_value_t min_turn_time; + qos_value_t link_disc_time; + + qos_value_t power; +}; + +extern int sysctl_max_baud_rate; +extern int sysctl_max_inactive_time; + +void irda_init_max_qos_capabilies(struct qos_info *qos); +void irda_qos_compute_intersection(struct qos_info *, struct qos_info *); + +__u32 irlap_max_line_capacity(__u32 speed, __u32 max_turn_time); + +void irda_qos_bits_to_value(struct qos_info *qos); + +/* So simple, how could we not inline those two ? + * Note : one byte is 10 bits if you include start and stop bits + * Jean II */ +#define irlap_min_turn_time_in_bytes(speed, min_turn_time) ( \ + speed * min_turn_time / 10000000 \ +) +#define irlap_xbofs_in_usec(speed, xbofs) ( \ + xbofs * 10000000 / speed \ +) + +#endif + diff --git a/drivers/staging/irda/include/net/irda/timer.h b/drivers/staging/irda/include/net/irda/timer.h new file mode 100644 index 000000000000..d784f242cf7b --- /dev/null +++ b/drivers/staging/irda/include/net/irda/timer.h @@ -0,0 +1,105 @@ +/********************************************************************* + * + * Filename: timer.h + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Aug 16 00:59:29 1997 + * Modified at: Thu Oct 7 12:25:24 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#ifndef TIMER_H +#define TIMER_H + +#include +#include + +#include /* for HZ */ + +#include + +/* A few forward declarations (to make compiler happy) */ +struct irlmp_cb; +struct irlap_cb; +struct lsap_cb; +struct lap_cb; + +/* + * Timeout definitions, some defined in IrLAP 6.13.5 - p. 92 + */ +#define POLL_TIMEOUT (450*HZ/1000) /* Must never exceed 500 ms */ +#define FINAL_TIMEOUT (500*HZ/1000) /* Must never exceed 500 ms */ + +/* + * Normally twice the P-timer. Note 3, IrLAP 6.3.11.2 - p. 60 suggests + * at least twice the duration of the P-timer.
+ */ +#define WD_TIMEOUT (POLL_TIMEOUT*2) + +#define MEDIABUSY_TIMEOUT (500*HZ/1000) /* 500 msec */ +#define SMALLBUSY_TIMEOUT (100*HZ/1000) /* 100 msec - IrLAP 6.13.4 */ + +/* + * Slot timer must never exceed 85 ms, and must always be at least 25 ms, + * suggested as 75-85 msec by IrDA Lite. This doesn't work with a lot of + * devices, and other stacks use a lot more, so it's best we do it as well + * (Note : this is the default value and sysctl overrides it - Jean II) + */ +#define SLOT_TIMEOUT (90*HZ/1000) + +/* + * The latest discovery frame (XID) is longer due to the extra discovery + * information (hints, device name...). This is its extra length. + * We use that when setting the query timeout. Jean II + */ +#define XIDEXTRA_TIMEOUT (34*HZ/1000) /* 34 msec */ + +#define WATCHDOG_TIMEOUT (20*HZ) /* 20 sec */ + +typedef void (*TIMER_CALLBACK)(void *); + +static inline void irda_start_timer(struct timer_list *ptimer, int timeout, + void* data, TIMER_CALLBACK callback) +{ + ptimer->function = (void (*)(unsigned long)) callback; + ptimer->data = (unsigned long) data; + + /* Set new value for timer (update or add timer). + * We use mod_timer() because it's more efficient and also + * safer with respect to race conditions - Jean II */ + mod_timer(ptimer, jiffies + timeout); +} + + +void irlap_start_slot_timer(struct irlap_cb *self, int timeout); +void irlap_start_query_timer(struct irlap_cb *self, int S, int s); +void irlap_start_final_timer(struct irlap_cb *self, int timeout); +void irlap_start_wd_timer(struct irlap_cb *self, int timeout); +void irlap_start_backoff_timer(struct irlap_cb *self, int timeout); + +void irlap_start_mbusy_timer(struct irlap_cb *self, int timeout); +void irlap_stop_mbusy_timer(struct irlap_cb *); + +void irlmp_start_watchdog_timer(struct lsap_cb *, int timeout); +void irlmp_start_discovery_timer(struct irlmp_cb *, int timeout); +void irlmp_start_idle_timer(struct lap_cb *, int timeout); +void irlmp_stop_idle_timer(struct lap_cb *self); + +#endif + diff --git a/drivers/staging/irda/include/net/irda/wrapper.h b/drivers/staging/irda/include/net/irda/wrapper.h new file mode 100644 index 000000000000..eef53ebe3d76 --- /dev/null +++ b/drivers/staging/irda/include/net/irda/wrapper.h @@ -0,0 +1,58 @@ +/********************************************************************* + * + * Filename: wrapper.h + * Version: 1.2 + * Description: IrDA SIR async wrapper layer + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Tue Jan 11 12:37:29 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge.
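A brief usage sketch of irda_start_timer() above; the owner struct and callback are hypothetical.

	struct example_ctx {
		struct timer_list timer;
	};

	static void example_timeout(void *data)
	{
		struct example_ctx *ctx = data;

		/* ... handle the expiry for ctx ... */
	}

	static void example_arm(struct example_ctx *ctx)
	{
		/* Re-arms via mod_timer(), so it is safe to call repeatedly. */
		irda_start_timer(&ctx->timer, 2 * HZ, ctx, example_timeout);
	}
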
+ * + ********************************************************************/ + +#ifndef WRAPPER_H +#define WRAPPER_H + +#include +#include +#include + +#include /* iobuff_t */ + +#define BOF 0xc0 /* Beginning of frame */ +#define XBOF 0xff +#define EOF 0xc1 /* End of frame */ +#define CE 0x7d /* Control escape */ + +#define STA BOF /* Start flag */ +#define STO EOF /* End flag */ + +#define IRDA_TRANS 0x20 /* Asynchronous transparency modifier */ + +/* States for receiving a frame in async mode */ +enum { + OUTSIDE_FRAME, + BEGIN_FRAME, + LINK_ESCAPE, + INSIDE_FRAME +}; + +/* Proto definitions */ +int async_wrap_skb(struct sk_buff *skb, __u8 *tx_buff, int buffsize); +void async_unwrap_char(struct net_device *dev, struct net_device_stats *stats, + iobuff_t *buf, __u8 byte); + +#endif diff --git a/drivers/staging/irda/net/Makefile b/drivers/staging/irda/net/Makefile index 187f6c563a4b..bd1a635b88cf 100644 --- a/drivers/staging/irda/net/Makefile +++ b/drivers/staging/irda/net/Makefile @@ -2,6 +2,8 @@ # Makefile for the Linux IrDA protocol layer. # +subdir-ccflags-y += -I$(srctree)/drivers/staging/irda/include + obj-$(CONFIG_IRDA) += irda.o obj-$(CONFIG_IRLAN) += irlan/ obj-$(CONFIG_IRNET) += irnet/ diff --git a/include/net/irda/af_irda.h b/include/net/irda/af_irda.h deleted file mode 100644 index 0df574931522..000000000000 --- a/include/net/irda/af_irda.h +++ /dev/null @@ -1,87 +0,0 @@ -/********************************************************************* - * - * Filename: af_irda.h - * Version: 1.0 - * Description: IrDA sockets declarations - * Status: Stable - * Author: Dag Brattli - * Created at: Tue Dec 9 21:13:12 1997 - * Modified at: Fri Jan 28 13:16:32 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. 
- * - ********************************************************************/ - -#ifndef AF_IRDA_H -#define AF_IRDA_H - -#include -#include -#include /* struct iriap_cb */ -#include /* struct ias_value */ -#include /* struct lsap_cb */ -#include /* struct tsap_cb */ -#include /* struct discovery_t */ -#include - -/* IrDA Socket */ -struct irda_sock { - /* struct sock has to be the first member of irda_sock */ - struct sock sk; - __u32 saddr; /* my local address */ - __u32 daddr; /* peer address */ - - struct lsap_cb *lsap; /* LSAP used by Ultra */ - __u8 pid; /* Protocol IP (PID) used by Ultra */ - - struct tsap_cb *tsap; /* TSAP used by this connection */ - __u8 dtsap_sel; /* remote TSAP address */ - __u8 stsap_sel; /* local TSAP address */ - - __u32 max_sdu_size_rx; - __u32 max_sdu_size_tx; - __u32 max_data_size; - __u8 max_header_size; - struct qos_info qos_tx; - - __u16_host_order mask; /* Hint bits mask */ - __u16_host_order hints; /* Hint bits */ - - void *ckey; /* IrLMP client handle */ - void *skey; /* IrLMP service handle */ - - struct ias_object *ias_obj; /* Our service name + lsap in IAS */ - struct iriap_cb *iriap; /* Used to query remote IAS */ - struct ias_value *ias_result; /* Result of remote IAS query */ - - hashbin_t *cachelog; /* Result of discovery query */ - __u32 cachedaddr; /* Result of selective discovery query */ - - int nslots; /* Number of slots to use for discovery */ - - int errno; /* status of the IAS query */ - - wait_queue_head_t query_wait; /* Wait for the answer to a query */ - struct timer_list watchdog; /* Timeout for discovery */ - - LOCAL_FLOW tx_flow; - LOCAL_FLOW rx_flow; -}; - -static inline struct irda_sock *irda_sk(struct sock *sk) -{ - return (struct irda_sock *)sk; -} - -#endif /* AF_IRDA_H */ diff --git a/include/net/irda/crc.h b/include/net/irda/crc.h deleted file mode 100644 index f202296df9bb..000000000000 --- a/include/net/irda/crc.h +++ /dev/null @@ -1,29 +0,0 @@ -/********************************************************************* - * - * Filename: crc.h - * Version: - * Description: CRC routines - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Aug 4 20:40:53 1997 - * Modified at: Sun May 2 20:25:23 1999 - * Modified by: Dag Brattli - * - ********************************************************************/ - -#ifndef IRDA_CRC_H -#define IRDA_CRC_H - -#include -#include - -#define INIT_FCS 0xffff /* Initial FCS value */ -#define GOOD_FCS 0xf0b8 /* Good final FCS value */ - -/* Recompute the FCS with one more character appended. */ -#define irda_fcs(fcs, c) crc_ccitt_byte(fcs, c) - -/* Recompute the FCS with len bytes appended. */ -#define irda_calc_crc16(fcs, buf, len) crc_ccitt(fcs, buf, len) - -#endif diff --git a/include/net/irda/discovery.h b/include/net/irda/discovery.h deleted file mode 100644 index 63ae32530567..000000000000 --- a/include/net/irda/discovery.h +++ /dev/null @@ -1,95 +0,0 @@ -/********************************************************************* - * - * Filename: discovery.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Tue Apr 6 16:53:53 1999 - * Modified at: Tue Oct 5 10:05:10 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. 
- * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef DISCOVERY_H -#define DISCOVERY_H - -#include - -#include -#include /* irda_queue_t */ -#include /* LAP_REASON */ - -#define DISCOVERY_EXPIRE_TIMEOUT (2*sysctl_discovery_timeout*HZ) -#define DISCOVERY_DEFAULT_SLOTS 0 - -/* - * This type is used by the protocols that transmit 16 bits words in - * little endian format. A little endian machine stores MSB of word in - * byte[1] and LSB in byte[0]. A big endian machine stores MSB in byte[0] - * and LSB in byte[1]. - * - * This structure is used in the code for things that are endian neutral - * but that fit in a word so that we can manipulate them efficiently. - * By endian neutral, I mean things that are really an array of bytes, - * and always used as such, for example the hint bits. Jean II - */ -typedef union { - __u16 word; - __u8 byte[2]; -} __u16_host_order; - -/* Types of discovery */ -typedef enum { - DISCOVERY_LOG, /* What's in our discovery log */ - DISCOVERY_ACTIVE, /* Doing our own discovery on the medium */ - DISCOVERY_PASSIVE, /* Peer doing discovery on the medium */ - EXPIRY_TIMEOUT, /* Entry expired due to timeout */ -} DISCOVERY_MODE; - -#define NICKNAME_MAX_LEN 21 - -/* Basic discovery information about a peer */ -typedef struct irda_device_info discinfo_t; /* linux/irda.h */ - -/* - * The DISCOVERY structure is used for both discovery requests and responses - */ -typedef struct discovery_t { - irda_queue_t q; /* Must be first! */ - - discinfo_t data; /* Basic discovery information */ - int name_len; /* Length of nickname */ - - LAP_REASON condition; /* More info about the discovery */ - int gen_addr_bit; /* Need to generate a new device - * address? */ - int nslots; /* Number of slots to use when - * discovering */ - unsigned long timestamp; /* Last time discovered */ - unsigned long firststamp; /* First time discovered */ -} discovery_t; - -void irlmp_add_discovery(hashbin_t *cachelog, discovery_t *discovery); -void irlmp_add_discovery_log(hashbin_t *cachelog, hashbin_t *log); -void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force); -struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn, - __u16 mask, int old_entries); - -#endif diff --git a/include/net/irda/ircomm_core.h b/include/net/irda/ircomm_core.h deleted file mode 100644 index 2a580ce9edad..000000000000 --- a/include/net/irda/ircomm_core.h +++ /dev/null @@ -1,106 +0,0 @@ -/********************************************************************* - * - * Filename: ircomm_core.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Wed Jun 9 08:58:43 1999 - * Modified at: Mon Dec 13 11:52:29 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRCOMM_CORE_H -#define IRCOMM_CORE_H - -#include -#include -#include - -#define IRCOMM_MAGIC 0x98347298 -#define IRCOMM_HEADER_SIZE 1 - -struct ircomm_cb; /* Forward decl. */ - -/* - * A small call-table, so we don't have to check the service-type whenever - * we want to do something - */ -typedef struct { - int (*data_request)(struct ircomm_cb *, struct sk_buff *, int clen); - int (*connect_request)(struct ircomm_cb *, struct sk_buff *, - struct ircomm_info *); - int (*connect_response)(struct ircomm_cb *, struct sk_buff *); - int (*disconnect_request)(struct ircomm_cb *, struct sk_buff *, - struct ircomm_info *); -} call_t; - -struct ircomm_cb { - irda_queue_t queue; - magic_t magic; - - notify_t notify; - call_t issue; - - int state; - int line; /* Which TTY line we are using */ - - struct tsap_cb *tsap; - struct lsap_cb *lsap; - - __u8 dlsap_sel; /* Destination LSAP/TSAP selector */ - __u8 slsap_sel; /* Source LSAP/TSAP selector */ - - __u32 saddr; /* Source device address (link we are using) */ - __u32 daddr; /* Destination device address */ - - int max_header_size; /* Header space we must reserve for each frame */ - int max_data_size; /* The amount of data we can fill in each frame */ - - LOCAL_FLOW flow_status; /* Used by ircomm_lmp */ - int pkt_count; /* Number of frames we have sent to IrLAP */ - - __u8 service_type; -}; - -extern hashbin_t *ircomm; - -struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line); -int ircomm_close(struct ircomm_cb *self); - -int ircomm_data_request(struct ircomm_cb *self, struct sk_buff *skb); -void ircomm_data_indication(struct ircomm_cb *self, struct sk_buff *skb); -void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb); -int ircomm_control_request(struct ircomm_cb *self, struct sk_buff *skb); -int ircomm_connect_request(struct ircomm_cb *self, __u8 dlsap_sel, - __u32 saddr, __u32 daddr, struct sk_buff *skb, - __u8 service_type); -void ircomm_connect_indication(struct ircomm_cb *self, struct sk_buff *skb, - struct ircomm_info *info); -void ircomm_connect_confirm(struct ircomm_cb *self, struct sk_buff *skb, - struct ircomm_info *info); -int ircomm_connect_response(struct ircomm_cb *self, struct sk_buff *userdata); -int ircomm_disconnect_request(struct ircomm_cb *self, struct sk_buff *userdata); -void ircomm_disconnect_indication(struct ircomm_cb *self, struct sk_buff *skb, - struct ircomm_info *info); -void ircomm_flow_request(struct ircomm_cb *self, LOCAL_FLOW flow); - -#define ircomm_is_connected(self) (self->state == IRCOMM_CONN) - -#endif diff --git a/include/net/irda/ircomm_event.h b/include/net/irda/ircomm_event.h deleted file mode 100644 index 5bbc32998d57..000000000000 --- a/include/net/irda/ircomm_event.h +++ /dev/null @@ -1,83 +0,0 @@ 
-/********************************************************************* - * - * Filename: ircomm_event.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Jun 6 23:51:13 1999 - * Modified at: Thu Jun 10 08:36:25 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRCOMM_EVENT_H -#define IRCOMM_EVENT_H - -#include - -typedef enum { - IRCOMM_IDLE, - IRCOMM_WAITI, - IRCOMM_WAITR, - IRCOMM_CONN, -} IRCOMM_STATE; - -/* IrCOMM Events */ -typedef enum { - IRCOMM_CONNECT_REQUEST, - IRCOMM_CONNECT_RESPONSE, - IRCOMM_TTP_CONNECT_INDICATION, - IRCOMM_LMP_CONNECT_INDICATION, - IRCOMM_TTP_CONNECT_CONFIRM, - IRCOMM_LMP_CONNECT_CONFIRM, - - IRCOMM_LMP_DISCONNECT_INDICATION, - IRCOMM_TTP_DISCONNECT_INDICATION, - IRCOMM_DISCONNECT_REQUEST, - - IRCOMM_TTP_DATA_INDICATION, - IRCOMM_LMP_DATA_INDICATION, - IRCOMM_DATA_REQUEST, - IRCOMM_CONTROL_REQUEST, - IRCOMM_CONTROL_INDICATION, -} IRCOMM_EVENT; - -/* - * Used for passing information through the state-machine - */ -struct ircomm_info { - __u32 saddr; /* Source device address */ - __u32 daddr; /* Destination device address */ - __u8 dlsap_sel; - LM_REASON reason; /* Reason for disconnect */ - __u32 max_data_size; - __u32 max_header_size; - - struct qos_info *qos; -}; - -extern const char *const ircomm_state[]; - -struct ircomm_cb; /* Forward decl. */ - -int ircomm_do_event(struct ircomm_cb *self, IRCOMM_EVENT event, - struct sk_buff *skb, struct ircomm_info *info); -void ircomm_next_state(struct ircomm_cb *self, IRCOMM_STATE state); - -#endif diff --git a/include/net/irda/ircomm_lmp.h b/include/net/irda/ircomm_lmp.h deleted file mode 100644 index 5042a5021a04..000000000000 --- a/include/net/irda/ircomm_lmp.h +++ /dev/null @@ -1,36 +0,0 @@ -/********************************************************************* - * - * Filename: ircomm_lmp.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Wed Jun 9 10:06:07 1999 - * Modified at: Fri Aug 13 07:32:32 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . 
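ircomm_do_event() and ircomm_next_state() above imply the usual table-driven state machine: one handler per IRCOMM_STATE, each consuming an IRCOMM_EVENT and switching state as a side effect. A reduced standalone sketch of that dispatch pattern (states, events and transitions are illustrative, not the real IrCOMM tables):

#include <stdio.h>

/* Illustrative subset of the state/event enums above. */
typedef enum { ST_IDLE, ST_WAITI, ST_CONN, ST_MAX } state_t;
typedef enum { EV_CONNECT_REQUEST, EV_CONNECT_CONFIRM, EV_DISCONNECT } event_t;

struct conn {
    state_t state;
};

typedef int (*state_fn)(struct conn *self, event_t event);

static void next_state(struct conn *self, state_t state)
{
    self->state = state;
}

static int state_idle(struct conn *self, event_t event)
{
    if (event == EV_CONNECT_REQUEST) {
        next_state(self, ST_WAITI);  /* wait for the peer's answer */
        return 0;
    }
    return -1;                       /* event not valid in this state */
}

static int state_waiti(struct conn *self, event_t event)
{
    if (event == EV_CONNECT_CONFIRM) {
        next_state(self, ST_CONN);
        return 0;
    }
    return -1;
}

static int state_conn(struct conn *self, event_t event)
{
    if (event == EV_DISCONNECT) {
        next_state(self, ST_IDLE);
        return 0;
    }
    return -1;
}

/* One handler per state, dispatched like ircomm_do_event(). */
static const state_fn handlers[ST_MAX] = {
    [ST_IDLE]  = state_idle,
    [ST_WAITI] = state_waiti,
    [ST_CONN]  = state_conn,
};

static int do_event(struct conn *self, event_t event)
{
    return handlers[self->state](self, event);
}

int main(void)
{
    struct conn c = { .state = ST_IDLE };

    do_event(&c, EV_CONNECT_REQUEST);
    do_event(&c, EV_CONNECT_CONFIRM);
    printf("connected: %s\n", c.state == ST_CONN ? "yes" : "no");
    return 0;
}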
- * - ********************************************************************/ - -#ifndef IRCOMM_LMP_H -#define IRCOMM_LMP_H - -#include - -int ircomm_open_lsap(struct ircomm_cb *self); - -#endif diff --git a/include/net/irda/ircomm_param.h b/include/net/irda/ircomm_param.h deleted file mode 100644 index 1f67432321c4..000000000000 --- a/include/net/irda/ircomm_param.h +++ /dev/null @@ -1,147 +0,0 @@ -/********************************************************************* - * - * Filename: ircomm_param.h - * Version: 1.0 - * Description: Parameter handling for the IrCOMM protocol - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Jun 7 08:47:28 1999 - * Modified at: Wed Aug 25 13:46:33 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRCOMM_PARAMS_H -#define IRCOMM_PARAMS_H - -#include - -/* Parameters common to all service types */ -#define IRCOMM_SERVICE_TYPE 0x00 -#define IRCOMM_PORT_TYPE 0x01 /* Only used in LM-IAS */ -#define IRCOMM_PORT_NAME 0x02 /* Only used in LM-IAS */ - -/* Parameters for both 3 wire and 9 wire */ -#define IRCOMM_DATA_RATE 0x10 -#define IRCOMM_DATA_FORMAT 0x11 -#define IRCOMM_FLOW_CONTROL 0x12 -#define IRCOMM_XON_XOFF 0x13 -#define IRCOMM_ENQ_ACK 0x14 -#define IRCOMM_LINE_STATUS 0x15 -#define IRCOMM_BREAK 0x16 - -/* Parameters for 9 wire */ -#define IRCOMM_DTE 0x20 -#define IRCOMM_DCE 0x21 -#define IRCOMM_POLL 0x22 - -/* Service type (details) */ -#define IRCOMM_3_WIRE_RAW 0x01 -#define IRCOMM_3_WIRE 0x02 -#define IRCOMM_9_WIRE 0x04 -#define IRCOMM_CENTRONICS 0x08 - -/* Port type (details) */ -#define IRCOMM_SERIAL 0x00 -#define IRCOMM_PARALLEL 0x01 - -/* Data format (details) */ -#define IRCOMM_WSIZE_5 0x00 -#define IRCOMM_WSIZE_6 0x01 -#define IRCOMM_WSIZE_7 0x02 -#define IRCOMM_WSIZE_8 0x03 - -#define IRCOMM_1_STOP_BIT 0x00 -#define IRCOMM_2_STOP_BIT 0x04 /* 1.5 if char len 5 */ - -#define IRCOMM_PARITY_DISABLE 0x00 -#define IRCOMM_PARITY_ENABLE 0x08 - -#define IRCOMM_PARITY_ODD 0x00 -#define IRCOMM_PARITY_EVEN 0x10 -#define IRCOMM_PARITY_MARK 0x20 -#define IRCOMM_PARITY_SPACE 0x30 - -/* Flow control */ -#define IRCOMM_XON_XOFF_IN 0x01 -#define IRCOMM_XON_XOFF_OUT 0x02 -#define IRCOMM_RTS_CTS_IN 0x04 -#define IRCOMM_RTS_CTS_OUT 0x08 -#define IRCOMM_DSR_DTR_IN 0x10 -#define IRCOMM_DSR_DTR_OUT 0x20 -#define IRCOMM_ENQ_ACK_IN 0x40 -#define IRCOMM_ENQ_ACK_OUT 0x80 - -/* Line status */ -#define IRCOMM_OVERRUN_ERROR 0x02 -#define IRCOMM_PARITY_ERROR 0x04 -#define IRCOMM_FRAMING_ERROR 0x08 - -/* DTE (Data terminal equipment) line settings */ -#define IRCOMM_DELTA_DTR 0x01 -#define IRCOMM_DELTA_RTS 0x02 -#define IRCOMM_DTR 0x04 -#define IRCOMM_RTS 0x08 - -/* DCE (Data communications equipment) line settings */ -#define IRCOMM_DELTA_CTS 0x01 /* Clear to send has changed */ -#define IRCOMM_DELTA_DSR 0x02 /* 
Data set ready has changed */ -#define IRCOMM_DELTA_RI 0x04 /* Ring indicator has changed */ -#define IRCOMM_DELTA_CD 0x08 /* Carrier detect has changed */ -#define IRCOMM_CTS 0x10 /* Clear to send is high */ -#define IRCOMM_DSR 0x20 /* Data set ready is high */ -#define IRCOMM_RI 0x40 /* Ring indicator is high */ -#define IRCOMM_CD 0x80 /* Carrier detect is high */ -#define IRCOMM_DCE_DELTA_ANY 0x0f - -/* - * Parameter state - */ -struct ircomm_params { - /* General control params */ - __u8 service_type; - __u8 port_type; - char port_name[32]; - - /* Control params for 3- and 9-wire service type */ - __u32 data_rate; /* Data rate in bps */ - __u8 data_format; - __u8 flow_control; - char xonxoff[2]; - char enqack[2]; - __u8 line_status; - __u8 _break; - - __u8 null_modem; - - /* Control params for 9-wire service type */ - __u8 dte; - __u8 dce; - __u8 poll; - - /* Control params for Centronics service type */ -}; - -struct ircomm_tty_cb; /* Forward decl. */ - -int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush); - -extern pi_param_info_t ircomm_param_info; - -#endif /* IRCOMM_PARAMS_H */ - diff --git a/include/net/irda/ircomm_ttp.h b/include/net/irda/ircomm_ttp.h deleted file mode 100644 index c5627288bca3..000000000000 --- a/include/net/irda/ircomm_ttp.h +++ /dev/null @@ -1,37 +0,0 @@ -/********************************************************************* - * - * Filename: ircomm_ttp.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Wed Jun 9 10:06:07 1999 - * Modified at: Fri Aug 13 07:32:22 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRCOMM_TTP_H -#define IRCOMM_TTP_H - -#include - -int ircomm_open_tsap(struct ircomm_cb *self); - -#endif - diff --git a/include/net/irda/ircomm_tty.h b/include/net/irda/ircomm_tty.h deleted file mode 100644 index 8d4f588974bc..000000000000 --- a/include/net/irda/ircomm_tty.h +++ /dev/null @@ -1,121 +0,0 @@ -/********************************************************************* - * - * Filename: ircomm_tty.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Jun 6 23:24:22 1999 - * Modified at: Fri Jan 28 13:16:57 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
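The IRCOMM_* data-format and DCE bits above compose with OR and decode with AND: a classic 8N1 line is word size 8 plus one stop bit plus parity disabled, and a DCE status byte pairs each "delta" (changed) bit with the current level. A standalone sketch reusing the same values (the local macro names are stand-ins for the IRCOMM_* ones):

#include <stdio.h>
#include <stdint.h>

/* Same values as the IRCOMM_* definitions above. */
#define WSIZE_8        0x03
#define ONE_STOP_BIT   0x00
#define PARITY_DISABLE 0x00

#define DELTA_CTS 0x01  /* Clear to send has changed */
#define CTS       0x10  /* Clear to send is high */

int main(void)
{
    /* Compose the data-format byte for a classic 8N1 line. */
    uint8_t data_format = WSIZE_8 | ONE_STOP_BIT | PARITY_DISABLE;

    /* Decode a DCE status byte: CTS just went high. */
    uint8_t dce = DELTA_CTS | CTS;

    printf("data_format=0x%02x\n", data_format);
    if ((dce & DELTA_CTS) && (dce & CTS))
        printf("CTS changed and is now high\n");
    return 0;
}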
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRCOMM_TTY_H -#define IRCOMM_TTY_H - -#include -#include -#include -#include /* struct tty_struct */ - -#include -#include -#include - -#define IRCOMM_TTY_PORTS 32 -#define IRCOMM_TTY_MAGIC 0x3432 -#define IRCOMM_TTY_MAJOR 161 -#define IRCOMM_TTY_MINOR 0 - -/* This is used as an initial value to max_header_size before the proper - * value is filled in (5 for ttp, 4 for lmp). This allow us to detect - * the state of the underlying connection. - Jean II */ -#define IRCOMM_TTY_HDR_UNINITIALISED 16 -/* Same for payload size. See qos.c for the smallest max data size */ -#define IRCOMM_TTY_DATA_UNINITIALISED (64 - IRCOMM_TTY_HDR_UNINITIALISED) - -/* - * IrCOMM TTY driver state - */ -struct ircomm_tty_cb { - irda_queue_t queue; /* Must be first */ - struct tty_port port; - magic_t magic; - - int state; /* Connect state */ - - struct ircomm_cb *ircomm; /* IrCOMM layer instance */ - - struct sk_buff *tx_skb; /* Transmit buffer */ - struct sk_buff *ctrl_skb; /* Control data buffer */ - - /* Parameters */ - struct ircomm_params settings; - - __u8 service_type; /* The service that we support */ - int client; /* True if we are a client */ - LOCAL_FLOW flow; /* IrTTP flow status */ - - int line; - - __u8 dlsap_sel; - __u8 slsap_sel; - - __u32 saddr; - __u32 daddr; - - __u32 max_data_size; /* Max data we can transmit in one packet */ - __u32 max_header_size; /* The amount of header space we must reserve */ - __u32 tx_data_size; /* Max data size of current tx_skb */ - - struct iriap_cb *iriap; /* Instance used for querying remote IAS */ - struct ias_object* obj; - void *skey; - void *ckey; - - struct timer_list watchdog_timer; - struct work_struct tqueue; - - /* Protect concurent access to : - * o self->ctrl_skb - * o self->tx_skb - * Maybe other things may gain to be protected as well... - * Jean II */ - spinlock_t spinlock; -}; - -void ircomm_tty_start(struct tty_struct *tty); -void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self); - -int ircomm_tty_tiocmget(struct tty_struct *tty); -int ircomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, - unsigned int clear); -int ircomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, - unsigned long arg); -void ircomm_tty_set_termios(struct tty_struct *tty, - struct ktermios *old_termios); - -#endif - - - - - - - diff --git a/include/net/irda/ircomm_tty_attach.h b/include/net/irda/ircomm_tty_attach.h deleted file mode 100644 index 20dcbdf258cf..000000000000 --- a/include/net/irda/ircomm_tty_attach.h +++ /dev/null @@ -1,92 +0,0 @@ -/********************************************************************* - * - * Filename: ircomm_tty_attach.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Wed Jun 9 15:55:18 1999 - * Modified at: Fri Dec 10 21:04:55 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. 
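The IRCOMM_TTY_HDR_UNINITIALISED sentinel above (16, deliberately larger than the real 5 for TTP or 4 for LMP) doubles as a connection-state probe: as long as the sentinel is still in place, the connect has not completed and the real sizes are unknown. A minimal sketch of that check (the struct and helper names here are hypothetical):

#include <stdio.h>
#include <stdint.h>

#define HDR_UNINITIALISED 16  /* real values are 5 (TTP) or 4 (LMP) */

struct tty_conn {
    uint32_t max_header_size;
};

/* Hypothetical helper: the connection is up once the real header
 * size has replaced the sentinel. */
static int conn_params_known(const struct tty_conn *c)
{
    return c->max_header_size != HDR_UNINITIALISED;
}

int main(void)
{
    struct tty_conn c = { .max_header_size = HDR_UNINITIALISED };

    printf("before connect: %d\n", conn_params_known(&c));
    c.max_header_size = 5;  /* filled in at connect time (TTP case) */
    printf("after connect:  %d\n", conn_params_known(&c));
    return 0;
}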
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRCOMM_TTY_ATTACH_H -#define IRCOMM_TTY_ATTACH_H - -#include - -typedef enum { - IRCOMM_TTY_IDLE, - IRCOMM_TTY_SEARCH, - IRCOMM_TTY_QUERY_PARAMETERS, - IRCOMM_TTY_QUERY_LSAP_SEL, - IRCOMM_TTY_SETUP, - IRCOMM_TTY_READY, -} IRCOMM_TTY_STATE; - -/* IrCOMM TTY Events */ -typedef enum { - IRCOMM_TTY_ATTACH_CABLE, - IRCOMM_TTY_DETACH_CABLE, - IRCOMM_TTY_DATA_REQUEST, - IRCOMM_TTY_DATA_INDICATION, - IRCOMM_TTY_DISCOVERY_REQUEST, - IRCOMM_TTY_DISCOVERY_INDICATION, - IRCOMM_TTY_CONNECT_CONFIRM, - IRCOMM_TTY_CONNECT_INDICATION, - IRCOMM_TTY_DISCONNECT_REQUEST, - IRCOMM_TTY_DISCONNECT_INDICATION, - IRCOMM_TTY_WD_TIMER_EXPIRED, - IRCOMM_TTY_GOT_PARAMETERS, - IRCOMM_TTY_GOT_LSAPSEL, -} IRCOMM_TTY_EVENT; - -/* Used for passing information through the state-machine */ -struct ircomm_tty_info { - __u32 saddr; /* Source device address */ - __u32 daddr; /* Destination device address */ - __u8 dlsap_sel; -}; - -extern const char *const ircomm_state[]; -extern const char *const ircomm_tty_state[]; - -int ircomm_tty_do_event(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event, - struct sk_buff *skb, struct ircomm_tty_info *info); - - -int ircomm_tty_attach_cable(struct ircomm_tty_cb *self); -void ircomm_tty_detach_cable(struct ircomm_tty_cb *self); -void ircomm_tty_connect_confirm(void *instance, void *sap, - struct qos_info *qos, - __u32 max_sdu_size, - __u8 max_header_size, - struct sk_buff *skb); -void ircomm_tty_disconnect_indication(void *instance, void *sap, - LM_REASON reason, - struct sk_buff *skb); -void ircomm_tty_connect_indication(void *instance, void *sap, - struct qos_info *qos, - __u32 max_sdu_size, - __u8 max_header_size, - struct sk_buff *skb); -int ircomm_tty_send_initial_parameters(struct ircomm_tty_cb *self); -void ircomm_tty_link_established(struct ircomm_tty_cb *self); - -#endif /* IRCOMM_TTY_ATTACH_H */ diff --git a/include/net/irda/irda.h b/include/net/irda/irda.h deleted file mode 100644 index 92c8fb575213..000000000000 --- a/include/net/irda/irda.h +++ /dev/null @@ -1,115 +0,0 @@ -/********************************************************************* - * - * Filename: irda.h - * Version: 1.0 - * Description: IrDA common include file for kernel internal use - * Status: Stable - * Author: Dag Brattli - * Created at: Tue Dec 9 21:13:12 1997 - * Modified at: Fri Jan 28 13:16:32 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. 
- * - ********************************************************************/ - -#ifndef NET_IRDA_H -#define NET_IRDA_H - -#include /* struct sk_buff */ -#include -#include /* sa_family_t in */ -#include - -typedef __u32 magic_t; - -#ifndef TRUE -#define TRUE 1 -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -/* Hack to do small backoff when setting media busy in IrLAP */ -#ifndef SMALL -#define SMALL 5 -#endif - -#ifndef IRDA_MIN /* Lets not mix this MIN with other header files */ -#define IRDA_MIN(a, b) (((a) < (b)) ? (a) : (b)) -#endif - -#ifndef IRDA_ALIGN -# define IRDA_ALIGN __attribute__((aligned)) -#endif - -#ifdef CONFIG_IRDA_DEBUG -#define IRDA_ASSERT(expr, func) \ -do { if(!(expr)) { \ - printk( "Assertion failed! %s:%s:%d %s\n", \ - __FILE__,__func__,__LINE__,(#expr) ); \ - func } } while (0) -#define IRDA_ASSERT_LABEL(label) label -#else -#define IRDA_ASSERT(expr, func) do { (void)(expr); } while (0) -#define IRDA_ASSERT_LABEL(label) -#endif /* CONFIG_IRDA_DEBUG */ - -/* - * Magic numbers used by Linux-IrDA. Random numbers which must be unique to - * give the best protection - */ - -#define IRTTY_MAGIC 0x2357 -#define LAP_MAGIC 0x1357 -#define LMP_MAGIC 0x4321 -#define LMP_LSAP_MAGIC 0x69333 -#define LMP_LAP_MAGIC 0x3432 -#define IRDA_DEVICE_MAGIC 0x63454 -#define IAS_MAGIC 0x007 -#define TTP_MAGIC 0x241169 -#define TTP_TSAP_MAGIC 0x4345 -#define IROBEX_MAGIC 0x341324 -#define HB_MAGIC 0x64534 -#define IRLAN_MAGIC 0x754 -#define IAS_OBJECT_MAGIC 0x34234 -#define IAS_ATTRIB_MAGIC 0x45232 -#define IRDA_TASK_MAGIC 0x38423 - -#define IAS_DEVICE_ID 0x0000 /* Defined by IrDA, IrLMP section 4.1 (page 68) */ -#define IAS_PNP_ID 0xd342 -#define IAS_OBEX_ID 0x34323 -#define IAS_IRLAN_ID 0x34234 -#define IAS_IRCOMM_ID 0x2343 -#define IAS_IRLPT_ID 0x9876 - -struct net_device; -struct packet_type; - -void irda_proc_register(void); -void irda_proc_unregister(void); - -int irda_sysctl_register(void); -void irda_sysctl_unregister(void); - -int irsock_init(void); -void irsock_cleanup(void); - -int irda_nl_register(void); -void irda_nl_unregister(void); - -int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *ptype, struct net_device *orig_dev); - -#endif /* NET_IRDA_H */ diff --git a/include/net/irda/irda_device.h b/include/net/irda/irda_device.h deleted file mode 100644 index 664bf8178412..000000000000 --- a/include/net/irda/irda_device.h +++ /dev/null @@ -1,285 +0,0 @@ -/********************************************************************* - * - * Filename: irda_device.h - * Version: 0.9 - * Description: Contains various declarations used by the drivers - * Status: Experimental. - * Author: Dag Brattli - * Created at: Tue Apr 14 12:41:42 1998 - * Modified at: Mon Mar 20 09:08:57 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. - * Copyright (c) 1998 Thomas Davis, , - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
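IRDA_ASSERT above differs from assert(3) in that its second argument is a recovery statement executed on failure, typically an early return, so a production build can degrade gracefully instead of crashing. A userspace rendition of the same macro shape (printf standing in for printk):

#include <stdio.h>

/* Userspace rendition of IRDA_ASSERT: on failure, print the location
 * and execute the recovery statement passed as 'func'. */
#define MY_ASSERT(expr, func) \
do { if (!(expr)) { \
        printf("Assertion failed! %s:%s:%d %s\n", \
               __FILE__, __func__, __LINE__, #expr); \
        func } } while (0)

static int half(int x)
{
    /* Bail out with an error code instead of crashing. */
    MY_ASSERT(x % 2 == 0, return -1;);
    return x / 2;
}

int main(void)
{
    printf("half(4) = %d\n", half(4));
    printf("half(5) = %d\n", half(5));  /* triggers the assert */
    return 0;
}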
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -/* - * This header contains all the IrDA definitions a driver really - * needs, and therefore the driver should not need to include - * any other IrDA headers - Jean II - */ - -#ifndef IRDA_DEVICE_H -#define IRDA_DEVICE_H - -#include -#include -#include -#include /* struct sk_buff */ -#include -#include - -#include -#include -#include /* struct qos_info */ -#include /* irda_queue_t */ - -/* A few forward declarations (to make compiler happy) */ -struct irlap_cb; - -/* Some non-standard interface flags (should not conflict with any in if.h) */ -#define IFF_SIR 0x0001 /* Supports SIR speeds */ -#define IFF_MIR 0x0002 /* Supports MIR speeds */ -#define IFF_FIR 0x0004 /* Supports FIR speeds */ -#define IFF_VFIR 0x0008 /* Supports VFIR speeds */ -#define IFF_PIO 0x0010 /* Supports PIO transfer of data */ -#define IFF_DMA 0x0020 /* Supports DMA transfer of data */ -#define IFF_SHM 0x0040 /* Supports shared memory data transfers */ -#define IFF_DONGLE 0x0080 /* Interface has a dongle attached */ -#define IFF_AIR 0x0100 /* Supports Advanced IR (AIR) standards */ - -#define IO_XMIT 0x01 -#define IO_RECV 0x02 - -typedef enum { - IRDA_IRLAP, /* IrDA mode, and deliver to IrLAP */ - IRDA_RAW, /* IrDA mode */ - SHARP_ASK, - TV_REMOTE, /* Also known as Consumer Electronics IR */ -} INFRARED_MODE; - -typedef enum { - IRDA_TASK_INIT, /* All tasks are initialized with this state */ - IRDA_TASK_DONE, /* Signals that the task is finished */ - IRDA_TASK_WAIT, - IRDA_TASK_WAIT1, - IRDA_TASK_WAIT2, - IRDA_TASK_WAIT3, - IRDA_TASK_CHILD_INIT, /* Initializing child task */ - IRDA_TASK_CHILD_WAIT, /* Waiting for child task to finish */ - IRDA_TASK_CHILD_DONE /* Child task is finished */ -} IRDA_TASK_STATE; - -struct irda_task; -typedef int (*IRDA_TASK_CALLBACK) (struct irda_task *task); - -struct irda_task { - irda_queue_t q; - magic_t magic; - - IRDA_TASK_STATE state; - IRDA_TASK_CALLBACK function; - IRDA_TASK_CALLBACK finished; - - struct irda_task *parent; - struct timer_list timer; - - void *instance; /* Instance being called */ - void *param; /* Parameter to be used by instance */ -}; - -/* Dongle info */ -struct dongle_reg; -typedef struct { - struct dongle_reg *issue; /* Registration info */ - struct net_device *dev; /* Device we are attached to */ - struct irda_task *speed_task; /* Task handling speed change */ - struct irda_task *reset_task; /* Task handling reset */ - __u32 speed; /* Current speed */ - - /* Callbacks to the IrDA device driver */ - int (*set_mode)(struct net_device *, int mode); - int (*read)(struct net_device *dev, __u8 *buf, int len); - int (*write)(struct net_device *dev, __u8 *buf, int len); - int (*set_dtr_rts)(struct net_device *dev, int dtr, int rts); -} dongle_t; - -/* Dongle registration info */ -struct dongle_reg { - irda_queue_t q; /* Must be first */ - IRDA_DONGLE type; - - void (*open)(dongle_t *dongle, struct qos_info *qos); - void (*close)(dongle_t *dongle); - int (*reset)(struct irda_task *task); - int (*change_speed)(struct irda_task *task); - struct module *owner; -}; - -/* - * Per-packet information we need to hide inside sk_buff - * (must not exceed 48 bytes, check with struct sk_buff) - * The default_qdisc_pad field is a temporary hack. 
- */ -struct irda_skb_cb { - unsigned int default_qdisc_pad; - magic_t magic; /* Be sure that we can trust the information */ - __u32 next_speed; /* The Speed to be set *after* this frame */ - __u16 mtt; /* Minimum turn around time */ - __u16 xbofs; /* Number of xbofs required, used by SIR mode */ - __u16 next_xbofs; /* Number of xbofs required *after* this frame */ - void *context; /* May be used by drivers */ - void (*destructor)(struct sk_buff *skb); /* Used for flow control */ - __u16 xbofs_delay; /* Number of xbofs used for generating the mtt */ - __u8 line; /* Used by IrCOMM in IrLPT mode */ -}; - -/* Chip specific info */ -typedef struct { - int cfg_base; /* Config register IO base */ - int sir_base; /* SIR IO base */ - int fir_base; /* FIR IO base */ - int mem_base; /* Shared memory base */ - int sir_ext; /* Length of SIR iobase */ - int fir_ext; /* Length of FIR iobase */ - int irq, irq2; /* Interrupts used */ - int dma, dma2; /* DMA channel(s) used */ - int fifo_size; /* FIFO size */ - int irqflags; /* interrupt flags (ie, IRQF_SHARED) */ - int direction; /* Link direction, used by some FIR drivers */ - int enabled; /* Powered on? */ - int suspended; /* Suspended by APM */ - __u32 speed; /* Currently used speed */ - __u32 new_speed; /* Speed we must change to when Tx is finished */ - int dongle_id; /* Dongle or transceiver currently used */ -} chipio_t; - -/* IO buffer specific info (inspired by struct sk_buff) */ -typedef struct { - int state; /* Receiving state (transmit state not used) */ - int in_frame; /* True if receiving frame */ - - __u8 *head; /* start of buffer */ - __u8 *data; /* start of data in buffer */ - - int len; /* current length of data */ - int truesize; /* total allocated size of buffer */ - __u16 fcs; - - struct sk_buff *skb; /* ZeroCopy Rx in async_unwrap_char() */ -} iobuff_t; - -/* Maximum SIR frame (skb) that we expect to receive *unwrapped*. - * Max LAP MTU (I field) is 2048 bytes max (IrLAP 1.1, chapt 6.6.5, p40). - * Max LAP header is 2 bytes (for now). - * Max CRC is 2 bytes at SIR, 4 bytes at FIR. - * Need 1 byte for skb_reserve() to align IP header for IrLAN. - * Add a few extra bytes just to be safe (buffer is power of two anyway) - * Jean II */ -#define IRDA_SKB_MAX_MTU 2064 -/* Maximum SIR frame that we expect to send, wrapped (i.e. with XBOFS - * and escaped characters on top of above). */ -#define IRDA_SIR_MAX_FRAME 4269 - -/* The SIR unwrapper async_unwrap_char() will use a Rx-copy-break mechanism - * when using the optional ZeroCopy Rx, where only small frames are memcpy - * to a smaller skb to save memory. This is the threshold under which copy - * will happen (and over which it won't happen). - * Some FIR drivers may use this #define as well... - * This is the same value as various Ethernet drivers. - Jean II */ -#define IRDA_RX_COPY_THRESHOLD 256 - -/* Function prototypes */ -int irda_device_init(void); -void irda_device_cleanup(void); - -/* IrLAP entry points used by the drivers. 
- * We declare them here to avoid the driver pulling a whole bunch of stack
- * headers they don't really need - Jean II */
-struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos,
- const char *hw_name);
-void irlap_close(struct irlap_cb *self);
-
-/* Interface to be used by IrLAP */
-void irda_device_set_media_busy(struct net_device *dev, int status);
-int irda_device_is_media_busy(struct net_device *dev);
-int irda_device_is_receiving(struct net_device *dev);
-
-/* Interface for internal use */
-static inline int irda_device_txqueue_empty(const struct net_device *dev)
-{
- return qdisc_all_tx_empty(dev);
-}
-int irda_device_set_raw_mode(struct net_device* self, int status);
-struct net_device *alloc_irdadev(int sizeof_priv);
-
-void irda_setup_dma(int channel, dma_addr_t buffer, int count, int mode);
-
-/*
- * Function irda_get_mtt (skb)
- *
- * Utility function for getting the minimum turnaround time out of
- * the skb, where it has been hidden in the cb field.
- */
-static inline __u16 irda_get_mtt(const struct sk_buff *skb)
-{
- const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
- return (cb->magic == LAP_MAGIC) ? cb->mtt : 10000;
-}
-
-/*
- * Function irda_get_next_speed (skb)
- *
- * Extract the speed that should be set *after* this frame from the skb
- *
- * Note: return -1 for user space frames
- */
-static inline __u32 irda_get_next_speed(const struct sk_buff *skb)
-{
- const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
- return (cb->magic == LAP_MAGIC) ? cb->next_speed : -1;
-}
-
-/*
- * Function irda_get_xbofs (skb)
- *
- * Extract the xbofs that should be set for this frame from the skb
- *
- * Note: default to 10 for user space frames
- */
-static inline __u16 irda_get_xbofs(const struct sk_buff *skb)
-{
- const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
- return (cb->magic == LAP_MAGIC) ? cb->xbofs : 10;
-}
-
-/*
- * Function irda_get_next_xbofs (skb)
- *
- * Extract the xbofs that should be set *after* this frame from the skb
- *
- * Note: return -1 for user space frames
- */
-static inline __u16 irda_get_next_xbofs(const struct sk_buff *skb)
-{
- const struct irda_skb_cb *cb = (const struct irda_skb_cb *) skb->cb;
- return (cb->magic == LAP_MAGIC) ? cb->next_xbofs : -1;
-}
-#endif /* IRDA_DEVICE_H */
-
-
diff --git a/include/net/irda/iriap.h b/include/net/irda/iriap.h
deleted file mode 100644
index fcc896491a95..000000000000
--- a/include/net/irda/iriap.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*********************************************************************
- *
- * Filename: iriap.h
- * Version: 0.5
- * Description: Information Access Protocol (IAP)
- * Status: Experimental.
- * Author: Dag Brattli
- * Created at: Thu Aug 21 00:02:07 1997
- * Modified at: Sat Dec 25 16:42:09 1999
- * Modified by: Dag Brattli
- *
- * Copyright (c) 1997-1999 Dag Brattli ,
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
- * Neither Dag Brattli nor University of Tromsø admit liability nor
- * provide warranty for any of this software. This material is
- * provided "AS-IS" and at no charge.
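irda_get_mtt() and its siblings above all follow one pattern: per-packet metadata is hidden in the skb's fixed 48-byte cb[] scratch area, validated by a magic cookie, and replaced by a safe default when the cookie does not match (e.g. for frames that came from user space). A standalone sketch of that pattern (the skb stand-in is hypothetical; the magic value matches LAP_MAGIC from irda.h above):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CB_SIZE   48       /* sk_buff's cb[] is 48 bytes */
#define LAP_MAGIC 0x1357

struct fake_skb {
    char cb[CB_SIZE];      /* scratch area, contents layer-private */
};

struct pkt_info {
    uint32_t magic;        /* proves the cb area really holds pkt_info */
    uint16_t mtt;          /* minimum turnaround time */
};

static void set_mtt(struct fake_skb *skb, uint16_t mtt)
{
    struct pkt_info *cb = (struct pkt_info *)skb->cb;

    cb->magic = LAP_MAGIC;
    cb->mtt   = mtt;
}

/* Same shape as irda_get_mtt(): trust the info only if the magic
 * matches, otherwise fall back to a conservative default. */
static uint16_t get_mtt(const struct fake_skb *skb)
{
    const struct pkt_info *cb = (const struct pkt_info *)skb->cb;

    return (cb->magic == LAP_MAGIC) ? cb->mtt : 10000;
}

int main(void)
{
    struct fake_skb skb;

    memset(&skb, 0, sizeof(skb));
    printf("untagged skb: mtt=%u\n", get_mtt(&skb));  /* default */
    set_mtt(&skb, 1000);
    printf("tagged skb:   mtt=%u\n", get_mtt(&skb));
    return 0;
}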
- * - ********************************************************************/ - -#ifndef IRIAP_H -#define IRIAP_H - -#include -#include - -#include -#include -#include /* irda_queue_t */ -#include /* struct timer_list */ - -#define IAP_LST 0x80 -#define IAP_ACK 0x40 - -#define IAS_SERVER 0 -#define IAS_CLIENT 1 - -/* IrIAP Op-codes */ -#define GET_INFO_BASE 0x01 -#define GET_OBJECTS 0x02 -#define GET_VALUE 0x03 -#define GET_VALUE_BY_CLASS 0x04 -#define GET_OBJECT_INFO 0x05 -#define GET_ATTRIB_NAMES 0x06 - -#define IAS_SUCCESS 0 -#define IAS_CLASS_UNKNOWN 1 -#define IAS_ATTRIB_UNKNOWN 2 -#define IAS_DISCONNECT 10 - -typedef void (*CONFIRM_CALLBACK)(int result, __u16 obj_id, - struct ias_value *value, void *priv); - -struct iriap_cb { - irda_queue_t q; /* Must be first */ - magic_t magic; /* Magic cookie */ - - int mode; /* Client or server */ - - __u32 saddr; - __u32 daddr; - __u8 operation; - - struct sk_buff *request_skb; - struct lsap_cb *lsap; - __u8 slsap_sel; - - /* Client states */ - IRIAP_STATE client_state; - IRIAP_STATE call_state; - - /* Server states */ - IRIAP_STATE server_state; - IRIAP_STATE r_connect_state; - - CONFIRM_CALLBACK confirm; - void *priv; /* Used to identify client */ - - __u8 max_header_size; - __u32 max_data_size; - - struct timer_list watchdog_timer; -}; - -int iriap_init(void); -void iriap_cleanup(void); - -struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv, - CONFIRM_CALLBACK callback); -void iriap_close(struct iriap_cb *self); - -int iriap_getvaluebyclass_request(struct iriap_cb *self, - __u32 saddr, __u32 daddr, - char *name, char *attr); -void iriap_connect_request(struct iriap_cb *self); -void iriap_send_ack( struct iriap_cb *self); -void iriap_call_indication(struct iriap_cb *self, struct sk_buff *skb); - -void iriap_register_server(void); - -#endif - - diff --git a/include/net/irda/iriap_event.h b/include/net/irda/iriap_event.h deleted file mode 100644 index 89747f06d9eb..000000000000 --- a/include/net/irda/iriap_event.h +++ /dev/null @@ -1,85 +0,0 @@ -/********************************************************************* - * - * Filename: iriap_event.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Aug 4 20:40:53 1997 - * Modified at: Sun Oct 31 22:02:54 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. 
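The CONFIRM_CALLBACK plumbing above is a plain asynchronous request/confirm pattern: the client hands over a callback plus an opaque priv pointer, and the IAS answer is delivered later through that pair. A reduced standalone sketch (function names and the synchronous completion are illustrative; the real query travels over the air):

#include <stdio.h>
#include <stdint.h>

#define IAS_SUCCESS 0

/* Mirrors CONFIRM_CALLBACK: result code, object id, value, and the
 * opaque pointer the client registered. */
typedef void (*confirm_cb)(int result, uint16_t obj_id,
                           const char *value, void *priv);

struct request {
    confirm_cb confirm;
    void *priv;
};

/* Hypothetical stand-in for iriap_getvaluebyclass_request(): here the
 * "remote query" completes immediately for demonstration purposes. */
static void getvalue_request(struct request *req,
                             const char *name, const char *attr)
{
    printf("querying %s:%s\n", name, attr);
    req->confirm(IAS_SUCCESS, 42, "IrDA:TinyTP:LsapSel=3", req->priv);
}

static void my_confirm(int result, uint16_t obj_id,
                       const char *value, void *priv)
{
    int *done = priv;

    if (result == IAS_SUCCESS)
        printf("object %u -> %s\n", (unsigned)obj_id, value);
    *done = 1;
}

int main(void)
{
    int done = 0;
    struct request req = { .confirm = my_confirm, .priv = &done };

    getvalue_request(&req, "IrLAN", "IrDA:TinyTP:LsapSel");
    return done ? 0 : 1;
}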
- * - ********************************************************************/ - -#ifndef IRIAP_FSM_H -#define IRIAP_FSM_H - -/* Forward because of circular include dependecies */ -struct iriap_cb; - -/* IrIAP states */ -typedef enum { - /* Client */ - S_DISCONNECT, - S_CONNECTING, - S_CALL, - - /* S-Call */ - S_MAKE_CALL, - S_CALLING, - S_OUTSTANDING, - S_REPLYING, - S_WAIT_FOR_CALL, - S_WAIT_ACTIVE, - - /* Server */ - R_DISCONNECT, - R_CALL, - - /* R-Connect */ - R_WAITING, - R_WAIT_ACTIVE, - R_RECEIVING, - R_EXECUTE, - R_RETURNING, -} IRIAP_STATE; - -typedef enum { - IAP_CALL_REQUEST, - IAP_CALL_REQUEST_GVBC, - IAP_CALL_RESPONSE, - IAP_RECV_F_LST, - IAP_LM_DISCONNECT_INDICATION, - IAP_LM_CONNECT_INDICATION, - IAP_LM_CONNECT_CONFIRM, -} IRIAP_EVENT; - -void iriap_next_client_state (struct iriap_cb *self, IRIAP_STATE state); -void iriap_next_call_state (struct iriap_cb *self, IRIAP_STATE state); -void iriap_next_server_state (struct iriap_cb *self, IRIAP_STATE state); -void iriap_next_r_connect_state(struct iriap_cb *self, IRIAP_STATE state); - - -void iriap_do_client_event(struct iriap_cb *self, IRIAP_EVENT event, - struct sk_buff *skb); -void iriap_do_call_event (struct iriap_cb *self, IRIAP_EVENT event, - struct sk_buff *skb); - -void iriap_do_server_event (struct iriap_cb *self, IRIAP_EVENT event, - struct sk_buff *skb); -void iriap_do_r_connect_event(struct iriap_cb *self, IRIAP_EVENT event, - struct sk_buff *skb); - -#endif /* IRIAP_FSM_H */ - diff --git a/include/net/irda/irias_object.h b/include/net/irda/irias_object.h deleted file mode 100644 index 83f78081799c..000000000000 --- a/include/net/irda/irias_object.h +++ /dev/null @@ -1,108 +0,0 @@ -/********************************************************************* - * - * Filename: irias_object.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Thu Oct 1 22:49:50 1998 - * Modified at: Wed Dec 15 11:20:57 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef LM_IAS_OBJECT_H -#define LM_IAS_OBJECT_H - -#include -#include - -/* LM-IAS Attribute types */ -#define IAS_MISSING 0 -#define IAS_INTEGER 1 -#define IAS_OCT_SEQ 2 -#define IAS_STRING 3 - -/* Object ownership of attributes (user or kernel) */ -#define IAS_KERNEL_ATTR 0 -#define IAS_USER_ATTR 1 - -/* - * LM-IAS Object - */ -struct ias_object { - irda_queue_t q; /* Must be first! */ - magic_t magic; - - char *name; - int id; - hashbin_t *attribs; -}; - -/* - * Values used by LM-IAS attributes - */ -struct ias_value { - __u8 type; /* Value description */ - __u8 owner; /* Managed from user/kernel space */ - int charset; /* Only used by string type */ - int len; - - /* Value */ - union { - int integer; - char *string; - __u8 *oct_seq; - } t; -}; - -/* - * Attributes used by LM-IAS objects - */ -struct ias_attrib { - irda_queue_t q; /* Must be first! 
*/ - int magic; - - char *name; /* Attribute name */ - struct ias_value *value; /* Attribute value */ -}; - -struct ias_object *irias_new_object(char *name, int id); -void irias_insert_object(struct ias_object *obj); -int irias_delete_object(struct ias_object *obj); -int irias_delete_attrib(struct ias_object *obj, struct ias_attrib *attrib, - int cleanobject); -void __irias_delete_object(struct ias_object *obj); - -void irias_add_integer_attrib(struct ias_object *obj, char *name, int value, - int user); -void irias_add_string_attrib(struct ias_object *obj, char *name, char *value, - int user); -void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets, - int len, int user); -int irias_object_change_attribute(char *obj_name, char *attrib_name, - struct ias_value *new_value); -struct ias_object *irias_find_object(char *name); -struct ias_attrib *irias_find_attrib(struct ias_object *obj, char *name); - -struct ias_value *irias_new_string_value(char *string); -struct ias_value *irias_new_integer_value(int integer); -struct ias_value *irias_new_octseq_value(__u8 *octseq , int len); -struct ias_value *irias_new_missing_value(void); -void irias_delete_value(struct ias_value *value); - -extern struct ias_value irias_missing; -extern hashbin_t *irias_objects; - -#endif diff --git a/include/net/irda/irlan_client.h b/include/net/irda/irlan_client.h deleted file mode 100644 index fa8455eda280..000000000000 --- a/include/net/irda/irlan_client.h +++ /dev/null @@ -1,42 +0,0 @@ -/********************************************************************* - * - * Filename: irlan_client.h - * Version: 0.3 - * Description: IrDA LAN access layer - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Aug 31 20:14:37 1997 - * Modified at: Thu Apr 22 14:13:34 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998 Dag Brattli , All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLAN_CLIENT_H -#define IRLAN_CLIENT_H - -#include -#include -#include -#include - -#include -#include - -void irlan_client_discovery_indication(discinfo_t *, DISCOVERY_MODE, void *); -void irlan_client_wakeup(struct irlan_cb *self, __u32 saddr, __u32 daddr); - -void irlan_client_parse_response(struct irlan_cb *self, struct sk_buff *skb); -void irlan_client_get_value_confirm(int result, __u16 obj_id, - struct ias_value *value, void *priv); -#endif diff --git a/include/net/irda/irlan_common.h b/include/net/irda/irlan_common.h deleted file mode 100644 index 550c2d6ec7ff..000000000000 --- a/include/net/irda/irlan_common.h +++ /dev/null @@ -1,230 +0,0 @@ -/********************************************************************* - * - * Filename: irlan_common.h - * Version: 0.8 - * Description: IrDA LAN access layer - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Aug 31 20:14:37 1997 - * Modified at: Sun Oct 31 19:41:24 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , - * All Rights Reserved. 
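struct ias_value above is a tagged union: constructors such as irias_new_string_value() set the type tag, length and payload together, and every reader dispatches on the tag. A minimal userspace analogue of the string constructor and a type-dispatching reader (allocation handling simplified):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* Tags matching the LM-IAS attribute types above. */
#define IAS_MISSING 0
#define IAS_INTEGER 1
#define IAS_STRING  3

struct value {
    uint8_t type;
    int len;
    union {
        int integer;
        char *string;
    } t;
};

/* Analogue of irias_new_string_value(): allocate, tag, copy. */
static struct value *new_string_value(const char *s)
{
    struct value *v = calloc(1, sizeof(*v));

    if (!v)
        return NULL;
    v->type = IAS_STRING;
    v->len = (int)strlen(s);
    v->t.string = strdup(s);
    if (!v->t.string) {
        free(v);
        return NULL;
    }
    return v;
}

static void print_value(const struct value *v)
{
    switch (v->type) {           /* dispatch on the tag */
    case IAS_INTEGER:
        printf("int: %d\n", v->t.integer);
        break;
    case IAS_STRING:
        printf("str: %s\n", v->t.string);
        break;
    default:
        printf("missing\n");
    }
}

int main(void)
{
    struct value *v = new_string_value("Linux-IrDA");

    if (!v)
        return 1;
    print_value(v);
    free(v->t.string);
    free(v);
    return 0;
}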
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLAN_H -#define IRLAN_H - -#include /* for HZ */ - -#include -#include -#include -#include -#include - -#include - -#define IRLAN_MTU 1518 -#define IRLAN_TIMEOUT 10*HZ /* 10 seconds */ - -/* Command packet types */ -#define CMD_GET_PROVIDER_INFO 0 -#define CMD_GET_MEDIA_CHAR 1 -#define CMD_OPEN_DATA_CHANNEL 2 -#define CMD_CLOSE_DATA_CHAN 3 -#define CMD_RECONNECT_DATA_CHAN 4 -#define CMD_FILTER_OPERATION 5 - -/* Some responses */ -#define RSP_SUCCESS 0 -#define RSP_INSUFFICIENT_RESOURCES 1 -#define RSP_INVALID_COMMAND_FORMAT 2 -#define RSP_COMMAND_NOT_SUPPORTED 3 -#define RSP_PARAM_NOT_SUPPORTED 4 -#define RSP_VALUE_NOT_SUPPORTED 5 -#define RSP_NOT_OPEN 6 -#define RSP_AUTHENTICATION_REQUIRED 7 -#define RSP_INVALID_PASSWORD 8 -#define RSP_PROTOCOL_ERROR 9 -#define RSP_ASYNCHRONOUS_ERROR 255 - -/* Media types */ -#define MEDIA_802_3 1 -#define MEDIA_802_5 2 - -/* Filter parameters */ -#define DATA_CHAN 1 -#define FILTER_TYPE 2 -#define FILTER_MODE 3 - -/* Filter types */ -#define IRLAN_DIRECTED 0x01 -#define IRLAN_FUNCTIONAL 0x02 -#define IRLAN_GROUP 0x04 -#define IRLAN_MAC_FRAME 0x08 -#define IRLAN_MULTICAST 0x10 -#define IRLAN_BROADCAST 0x20 -#define IRLAN_IPX_SOCKET 0x40 - -/* Filter modes */ -#define ALL 1 -#define FILTER 2 -#define NONE 3 - -/* Filter operations */ -#define GET 1 -#define CLEAR 2 -#define ADD 3 -#define REMOVE 4 -#define DYNAMIC 5 - -/* Access types */ -#define ACCESS_DIRECT 1 -#define ACCESS_PEER 2 -#define ACCESS_HOSTED 3 - -#define IRLAN_BYTE 0 -#define IRLAN_SHORT 1 -#define IRLAN_ARRAY 2 - -/* IrLAN sits on top if IrTTP */ -#define IRLAN_MAX_HEADER (TTP_HEADER+LMP_HEADER) -/* 1 byte for the command code and 1 byte for the parameter count */ -#define IRLAN_CMD_HEADER 2 - -#define IRLAN_STRING_PARAMETER_LEN(name, value) (1 + strlen((name)) + 2 \ - + strlen ((value))) -#define IRLAN_BYTE_PARAMETER_LEN(name) (1 + strlen((name)) + 2 + 1) -#define IRLAN_SHORT_PARAMETER_LEN(name) (1 + strlen((name)) + 2 + 2) - -/* - * IrLAN client - */ -struct irlan_client_cb { - int state; - - int open_retries; - - struct tsap_cb *tsap_ctrl; - __u32 max_sdu_size; - __u8 max_header_size; - - int access_type; /* Access type of provider */ - __u8 reconnect_key[255]; - __u8 key_len; - - __u16 recv_arb_val; - __u16 max_frame; - int filter_type; - - int unicast_open; - int broadcast_open; - - int tx_busy; - struct sk_buff_head txq; /* Transmit control queue */ - - struct iriap_cb *iriap; - - struct timer_list kick_timer; -}; - -/* - * IrLAN provider - */ -struct irlan_provider_cb { - int state; - - struct tsap_cb *tsap_ctrl; - __u32 max_sdu_size; - __u8 max_header_size; - - /* - * Store some values here which are used by the provider to parse - * the filter operations - */ - int data_chan; - int filter_type; - int filter_mode; - int filter_operation; - int filter_entry; - int access_type; /* Access type */ - __u16 send_arb_val; - - __u8 mac_address[ETH_ALEN]; /* Generated MAC address for peer device */ -}; - -/* - * IrLAN control block - */ -struct irlan_cb { - int magic; - struct 
list_head dev_list; - struct net_device *dev; /* Ethernet device structure*/ - - __u32 saddr; /* Source device address */ - __u32 daddr; /* Destination device address */ - int disconnect_reason; /* Why we got disconnected */ - - int media; /* Media type */ - __u8 version[2]; /* IrLAN version */ - - struct tsap_cb *tsap_data; /* Data TSAP */ - - int use_udata; /* Use Unit Data transfers */ - - __u8 stsap_sel_data; /* Source data TSAP selector */ - __u8 dtsap_sel_data; /* Destination data TSAP selector */ - __u8 dtsap_sel_ctrl; /* Destination ctrl TSAP selector */ - - struct irlan_client_cb client; /* Client specific fields */ - struct irlan_provider_cb provider; /* Provider specific fields */ - - __u32 max_sdu_size; - __u8 max_header_size; - - wait_queue_head_t open_wait; - struct timer_list watchdog_timer; -}; - -void irlan_close(struct irlan_cb *self); -void irlan_close_tsaps(struct irlan_cb *self); - -int irlan_register_netdev(struct irlan_cb *self); -void irlan_ias_register(struct irlan_cb *self, __u8 tsap_sel); -void irlan_start_watchdog_timer(struct irlan_cb *self, int timeout); - -void irlan_open_data_tsap(struct irlan_cb *self); - -int irlan_run_ctrl_tx_queue(struct irlan_cb *self); - -struct irlan_cb *irlan_get_any(void); -void irlan_get_provider_info(struct irlan_cb *self); -void irlan_get_media_char(struct irlan_cb *self); -void irlan_open_data_channel(struct irlan_cb *self); -void irlan_close_data_channel(struct irlan_cb *self); -void irlan_set_multicast_filter(struct irlan_cb *self, int status); -void irlan_set_broadcast_filter(struct irlan_cb *self, int status); - -int irlan_insert_byte_param(struct sk_buff *skb, char *param, __u8 value); -int irlan_insert_short_param(struct sk_buff *skb, char *param, __u16 value); -int irlan_insert_string_param(struct sk_buff *skb, char *param, char *value); -int irlan_insert_array_param(struct sk_buff *skb, char *name, __u8 *value, - __u16 value_len); - -int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len); - -#endif - - diff --git a/include/net/irda/irlan_eth.h b/include/net/irda/irlan_eth.h deleted file mode 100644 index de5c81691f33..000000000000 --- a/include/net/irda/irlan_eth.h +++ /dev/null @@ -1,32 +0,0 @@ -/********************************************************************* - * - * Filename: irlan_eth.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Thu Oct 15 08:36:58 1998 - * Modified at: Fri May 14 23:29:00 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. 
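The IRLAN_*_PARAMETER_LEN macros above encode the control-frame layout that irlan_insert_byte_param() and friends presumably produce: a one-byte name length, the name, a two-byte value length, then the value, with the whole frame preceded by IRLAN_CMD_HEADER (one byte of command code plus one byte of parameter count). The sketch below serializes one byte parameter under those assumptions; the parameter name, the little-endian value length and the buffer handling are illustrative, not taken from the implementation:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CMD_FILTER_OPERATION 5
#define CMD_HEADER_LEN       2   /* command code + parameter count */

/* Serialize one "byte" parameter in the layout the length macros
 * above describe: name length, name, 16-bit value length, value.
 * The 16-bit length is written little-endian here (cf. the note on
 * little-endian words in discovery.h earlier in this patch). */
static size_t insert_byte_param(uint8_t *buf, const char *name, uint8_t value)
{
    size_t n = strlen(name);
    size_t off = 0;

    buf[off++] = (uint8_t)n;            /* name length       */
    memcpy(buf + off, name, n);         /* name              */
    off += n;
    buf[off++] = 0x01;                  /* value length, LSB */
    buf[off++] = 0x00;                  /* value length, MSB */
    buf[off++] = value;                 /* value             */
    return off;                         /* == 1 + n + 2 + 1  */
}

int main(void)
{
    uint8_t frame[64];
    size_t off = 0;

    frame[off++] = CMD_FILTER_OPERATION; /* command code  */
    frame[off++] = 1;                    /* one parameter */
    off += insert_byte_param(frame + off, "FILTER_MODE", 2 /* FILTER */);

    printf("frame is %zu bytes\n", off);
    return 0;
}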
- * - ********************************************************************/ - -#ifndef IRLAN_ETH_H -#define IRLAN_ETH_H - -struct net_device *alloc_irlandev(const char *name); -int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb); - -void irlan_eth_flow_indication( void *instance, void *sap, LOCAL_FLOW flow); -#endif diff --git a/include/net/irda/irlan_event.h b/include/net/irda/irlan_event.h deleted file mode 100644 index 018b5a77e610..000000000000 --- a/include/net/irda/irlan_event.h +++ /dev/null @@ -1,81 +0,0 @@ -/********************************************************************* - * - * Filename: irlan_event.h - * Version: - * Description: LAN access - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Aug 31 20:14:37 1997 - * Modified at: Tue Feb 2 09:45:17 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1997 Dag Brattli , All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLAN_EVENT_H -#define IRLAN_EVENT_H - -#include -#include - -#include - -typedef enum { - IRLAN_IDLE, - IRLAN_QUERY, - IRLAN_CONN, - IRLAN_INFO, - IRLAN_MEDIA, - IRLAN_OPEN, - IRLAN_WAIT, - IRLAN_ARB, - IRLAN_DATA, - IRLAN_CLOSE, - IRLAN_SYNC -} IRLAN_STATE; - -typedef enum { - IRLAN_DISCOVERY_INDICATION, - IRLAN_IAS_PROVIDER_AVAIL, - IRLAN_IAS_PROVIDER_NOT_AVAIL, - IRLAN_LAP_DISCONNECT, - IRLAN_LMP_DISCONNECT, - IRLAN_CONNECT_COMPLETE, - IRLAN_DATA_INDICATION, - IRLAN_DATA_CONNECT_INDICATION, - IRLAN_RETRY_CONNECT, - - IRLAN_CONNECT_INDICATION, - IRLAN_GET_INFO_CMD, - IRLAN_GET_MEDIA_CMD, - IRLAN_OPEN_DATA_CMD, - IRLAN_FILTER_CONFIG_CMD, - - IRLAN_CHECK_CON_ARB, - IRLAN_PROVIDER_SIGNAL, - - IRLAN_WATCHDOG_TIMEOUT, -} IRLAN_EVENT; - -extern const char * const irlan_state[]; - -void irlan_do_client_event(struct irlan_cb *self, IRLAN_EVENT event, - struct sk_buff *skb); - -void irlan_do_provider_event(struct irlan_cb *self, IRLAN_EVENT event, - struct sk_buff *skb); - -void irlan_next_client_state(struct irlan_cb *self, IRLAN_STATE state); -void irlan_next_provider_state(struct irlan_cb *self, IRLAN_STATE state); - -#endif diff --git a/include/net/irda/irlan_filter.h b/include/net/irda/irlan_filter.h deleted file mode 100644 index a5a2539485bd..000000000000 --- a/include/net/irda/irlan_filter.h +++ /dev/null @@ -1,35 +0,0 @@ -/********************************************************************* - * - * Filename: irlan_filter.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Fri Jan 29 15:24:08 1999 - * Modified at: Sun Feb 7 23:35:31 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. 
This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLAN_FILTER_H -#define IRLAN_FILTER_H - -void irlan_check_command_param(struct irlan_cb *self, char *param, - char *value); -void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb); -#ifdef CONFIG_PROC_FS -void irlan_print_filter(struct seq_file *seq, int filter_type); -#endif - -#endif /* IRLAN_FILTER_H */ diff --git a/include/net/irda/irlan_provider.h b/include/net/irda/irlan_provider.h deleted file mode 100644 index 92f3b0e1029b..000000000000 --- a/include/net/irda/irlan_provider.h +++ /dev/null @@ -1,52 +0,0 @@ -/********************************************************************* - * - * Filename: irlan_provider.h - * Version: 0.1 - * Description: IrDA LAN access layer - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Aug 31 20:14:37 1997 - * Modified at: Sun May 9 12:26:11 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLAN_SERVER_H -#define IRLAN_SERVER_H - -#include -#include -#include -#include - -#include - -void irlan_provider_ctrl_disconnect_indication(void *instance, void *sap, - LM_REASON reason, - struct sk_buff *skb); - - -void irlan_provider_connect_response(struct irlan_cb *, struct tsap_cb *); - -int irlan_parse_open_data_cmd(struct irlan_cb *self, struct sk_buff *skb); -int irlan_provider_parse_command(struct irlan_cb *self, int cmd, - struct sk_buff *skb); - -void irlan_provider_send_reply(struct irlan_cb *self, int command, - int ret_code); -int irlan_provider_open_ctrl_tsap(struct irlan_cb *self); - -#endif - - diff --git a/include/net/irda/irlap.h b/include/net/irda/irlap.h deleted file mode 100644 index 6f23e820618c..000000000000 --- a/include/net/irda/irlap.h +++ /dev/null @@ -1,311 +0,0 @@ -/********************************************************************* - * - * Filename: irlap.h - * Version: 0.8 - * Description: An IrDA LAP driver for Linux - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Aug 4 20:40:53 1997 - * Modified at: Fri Dec 10 13:21:17 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLAP_H -#define IRLAP_H - -#include -#include -#include -#include - -#include /* irda_queue_t */ -#include /* struct qos_info */ -#include /* discovery_t */ -#include /* IRLAP_STATE, ... 
*/
-#include /* struct notify_t */
-
-#define CONFIG_IRDA_DYNAMIC_WINDOW 1
-
-#define LAP_RELIABLE 1
-#define LAP_UNRELIABLE 0
-
-#define LAP_ADDR_HEADER 1 /* IrLAP Address Header */
-#define LAP_CTRL_HEADER 1 /* IrLAP Control Header */
-
-/* May be different when we get VFIR */
-#define LAP_MAX_HEADER (LAP_ADDR_HEADER + LAP_CTRL_HEADER)
-
-/* Each IrDA device gets a random 32 bits IRLAP device address */
-#define LAP_ALEN 4
-
-#define BROADCAST 0xffffffff /* Broadcast device address */
-#define CBROADCAST 0xfe /* Connection broadcast address */
-#define XID_FORMAT 0x01 /* Discovery XID format */
-
-/* Nobody seems to use this constant. */
-#define LAP_WINDOW_SIZE 8
-/* We keep the LAP queue very small to minimise the amount of buffering.
- * This improves latency and reduces resource consumption.
- * This works only because we have synchronous refilling of IrLAP through
- * the flow control mechanism (via scheduler and IrTTP).
- * 2 buffers is the minimum we can work with, one that we send while polling
- * IrTTP, and another to know that we should not send the pf bit.
- * Jean II */
-#define LAP_HIGH_THRESHOLD 2
-/* Some rare non TTP clients don't implement flow control, and
- * so don't comply with the above limit (and neither with this one).
- * For IAP and management, it doesn't matter, because they never transmit much.
- * For IrLPT, this should be fixed.
- * - Jean II */
-#define LAP_MAX_QUEUE 10
-/* Please note that all IrDA management frames (LMP/TTP conn req/disc and
- * IAS queries) fall in the second category and are sent to LAP even if TTP
- * is stopped. This means that those frames will wait only a maximum of
- * two (2) data frames before being sent on the "wire", which speeds up
- * new socket setup when the link is saturated.
- * Same story for two sockets competing for the medium: if one saturates
- * the LAP, when the other wants to transmit it only has to wait for
- * maximum three (3) packets (2 + one scheduling), which improves performance
- * of delay-sensitive applications.
- * Jean II */
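The two thresholds above describe a two-tier queueing policy: senders that honour IrTTP flow control are throttled as soon as LAP_HIGH_THRESHOLD frames are buffered, while everyone else only runs into the hard LAP_MAX_QUEUE cap. A toy model of that policy (in the real stack the soft limit works through flow-control callbacks rather than refusal):

#include <stdio.h>

#define LAP_HIGH_THRESHOLD 2   /* ask flow-controlled senders to stop */
#define LAP_MAX_QUEUE      10  /* absolute cap for everybody else     */

struct lap_queue {
    int len;
};

/* Returns 1 if the frame was queued, 0 if it had to be refused.
 * 'flow_controlled' marks senders that honour the stop request. */
static int lap_enqueue(struct lap_queue *q, int flow_controlled)
{
    if (q->len >= LAP_MAX_QUEUE)
        return 0;                        /* hard limit: refuse       */
    if (flow_controlled && q->len >= LAP_HIGH_THRESHOLD)
        return 0;                        /* polite limit: throttle   */
    q->len++;
    return 1;
}

int main(void)
{
    struct lap_queue q = { 0 };
    int queued = 0;

    /* A flow-controlled (TTP) sender stops after 2 buffered frames. */
    while (lap_enqueue(&q, 1))
        queued++;
    printf("TTP sender queued %d frames\n", queued);

    /* Management frames ignore the soft limit but not the hard one. */
    while (lap_enqueue(&q, 0))
        queued++;
    printf("total queued: %d frames\n", queued);
    return 0;
}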
-
-#define NR_EXPECTED 1
-#define NR_UNEXPECTED 0
-#define NR_INVALID -1
-
-#define NS_EXPECTED 1
-#define NS_UNEXPECTED 0
-#define NS_INVALID -1
-
-/*
- * Meta information passed within the IrLAP state machine
- */
-struct irlap_info {
- __u8 caddr; /* Connection address */
- __u8 control; /* Frame type */
- __u8 cmd;
-
- __u32 saddr;
- __u32 daddr;
-
- int pf; /* Poll/final bit set */
-
- __u8 nr; /* Sequence number of next frame expected */
- __u8 ns; /* Sequence number of frame sent */
-
- int S; /* Number of slots */
- int slot; /* Randomly chosen slot */
- int s; /* Current slot */
-
- discovery_t *discovery; /* Discovery information */
-};
-
-/* Main structure of IrLAP */
-struct irlap_cb {
- irda_queue_t q; /* Must be first */
- magic_t magic;
-
- /* Device we are attached to */
- struct net_device *netdev;
- char hw_name[2*IFNAMSIZ + 1];
-
- /* Connection state */
- volatile IRLAP_STATE state; /* Current state */
-
- /* Timers used by IrLAP */
- struct timer_list query_timer;
- struct timer_list slot_timer;
- struct timer_list discovery_timer;
- struct timer_list final_timer;
- struct timer_list poll_timer;
- struct timer_list wd_timer;
- struct timer_list backoff_timer;
-
- /* Media busy stuff */
- struct timer_list media_busy_timer;
- int media_busy;
-
- /* Timeouts which will be different with different turn time */
- int slot_timeout;
- int poll_timeout;
- int final_timeout;
- int wd_timeout;
-
- struct sk_buff_head txq; /* Frames to be transmitted */
- struct sk_buff_head txq_ultra;
-
- __u8 caddr; /* Connection address */
- __u32 saddr; /* Source device address */
- __u32 daddr; /* Destination device address */
-
- int retry_count; /* Times tried to establish connection */
- int add_wait; /* True if we are waiting for frame */
-
- __u8 connect_pending;
- __u8 disconnect_pending;
-
- /* To send a faster RR if tx queue empty */
-#ifdef CONFIG_IRDA_FAST_RR
- int fast_RR_timeout;
- int fast_RR;
-#endif /* CONFIG_IRDA_FAST_RR */
-
- int N1; /* N1 * F-timer = Negotiated link disconnect warning threshold */
- int N2; /* N2 * F-timer = Negotiated link disconnect time */
- int N3; /* Connection retry count */
-
- int local_busy;
- int remote_busy;
- int xmitflag;
-
- __u8 vs; /* Next frame to be sent */
- __u8 vr; /* Next frame to be received */
- __u8 va; /* Last frame acked */
- int window; /* Nr of I-frames allowed to send */
- int window_size; /* Current negotiated window size */
-
-#ifdef CONFIG_IRDA_DYNAMIC_WINDOW
- __u32 line_capacity; /* Number of bytes allowed to send */
- __u32 bytes_left; /* Number of bytes still allowed to transmit */
-#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
-
- struct sk_buff_head wx_list;
-
- __u8 ack_required;
-
- /* XID parameters */
- __u8 S; /* Number of slots */
- __u8 slot; /* Randomly chosen slot */
- __u8 s; /* Current slot */
- int frame_sent; /* Have we sent reply?
*/ - - hashbin_t *discovery_log; - discovery_t *discovery_cmd; - - __u32 speed; /* Link speed */ - - struct qos_info qos_tx; /* QoS requested by peer */ - struct qos_info qos_rx; /* QoS requested by self */ - struct qos_info *qos_dev; /* QoS supported by device */ - - notify_t notify; /* Callbacks to IrLMP */ - - int mtt_required; /* Minimum turnaround time required */ - int xbofs_delay; /* Nr of XBOF's used to MTT */ - int bofs_count; /* Negotiated extra BOFs */ - int next_bofs; /* Negotiated extra BOFs after next frame */ - - int mode; /* IrLAP mode (primary, secondary or monitor) */ -}; - -/* - * Function prototypes - */ -int irlap_init(void); -void irlap_cleanup(void); - -struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos, - const char *hw_name); -void irlap_close(struct irlap_cb *self); - -void irlap_connect_request(struct irlap_cb *self, __u32 daddr, - struct qos_info *qos, int sniff); -void irlap_connect_response(struct irlap_cb *self, struct sk_buff *skb); -void irlap_connect_indication(struct irlap_cb *self, struct sk_buff *skb); -void irlap_connect_confirm(struct irlap_cb *, struct sk_buff *skb); - -void irlap_data_indication(struct irlap_cb *, struct sk_buff *, int unreliable); -void irlap_data_request(struct irlap_cb *, struct sk_buff *, int unreliable); - -#ifdef CONFIG_IRDA_ULTRA -void irlap_unitdata_request(struct irlap_cb *, struct sk_buff *); -void irlap_unitdata_indication(struct irlap_cb *, struct sk_buff *); -#endif /* CONFIG_IRDA_ULTRA */ - -void irlap_disconnect_request(struct irlap_cb *); -void irlap_disconnect_indication(struct irlap_cb *, LAP_REASON reason); - -void irlap_status_indication(struct irlap_cb *, int quality_of_link); - -void irlap_test_request(__u8 *info, int len); - -void irlap_discovery_request(struct irlap_cb *, discovery_t *discovery); -void irlap_discovery_confirm(struct irlap_cb *, hashbin_t *discovery_log); -void irlap_discovery_indication(struct irlap_cb *, discovery_t *discovery); - -void irlap_reset_indication(struct irlap_cb *self); -void irlap_reset_confirm(void); - -void irlap_update_nr_received(struct irlap_cb *, int nr); -int irlap_validate_nr_received(struct irlap_cb *, int nr); -int irlap_validate_ns_received(struct irlap_cb *, int ns); - -int irlap_generate_rand_time_slot(int S, int s); -void irlap_initiate_connection_state(struct irlap_cb *); -void irlap_flush_all_queues(struct irlap_cb *); -void irlap_wait_min_turn_around(struct irlap_cb *, struct qos_info *); - -void irlap_apply_default_connection_parameters(struct irlap_cb *self); -void irlap_apply_connection_parameters(struct irlap_cb *self, int now); - -#define IRLAP_GET_HEADER_SIZE(self) (LAP_MAX_HEADER) -#define IRLAP_GET_TX_QUEUE_LEN(self) skb_queue_len(&self->txq) - -/* Return TRUE if the node is in primary mode (i.e. master) - * - Jean II */ -static inline int irlap_is_primary(struct irlap_cb *self) -{ - int ret; - switch(self->state) { - case LAP_XMIT_P: - case LAP_NRM_P: - ret = 1; - break; - case LAP_XMIT_S: - case LAP_NRM_S: - ret = 0; - break; - default: - ret = -1; - } - return ret; -} - -/* Clear a pending IrLAP disconnect. 
- Jean II */ -static inline void irlap_clear_disconnect(struct irlap_cb *self) -{ - self->disconnect_pending = FALSE; -} - -/* - * Function irlap_next_state (self, state) - * - * Switches state and provides debug information - * - */ -static inline void irlap_next_state(struct irlap_cb *self, IRLAP_STATE state) -{ - /* - if (!self || self->magic != LAP_MAGIC) - return; - - pr_debug("next LAP state = %s\n", irlap_state[state]); - */ - self->state = state; -} - -#endif diff --git a/include/net/irda/irlap_event.h b/include/net/irda/irlap_event.h deleted file mode 100644 index e4325fee1267..000000000000 --- a/include/net/irda/irlap_event.h +++ /dev/null @@ -1,129 +0,0 @@ -/********************************************************************* - * - * - * Filename: irlap_event.h - * Version: 0.1 - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sat Aug 16 00:59:29 1997 - * Modified at: Tue Dec 21 11:20:30 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRLAP_EVENT_H -#define IRLAP_EVENT_H - -#include - -/* A few forward declarations (to make compiler happy) */ -struct irlap_cb; -struct irlap_info; - -/* IrLAP States */ -typedef enum { - LAP_NDM, /* Normal disconnected mode */ - LAP_QUERY, - LAP_REPLY, - LAP_CONN, /* Connect indication */ - LAP_SETUP, /* Setting up connection */ - LAP_OFFLINE, /* A really boring state */ - LAP_XMIT_P, - LAP_PCLOSE, - LAP_NRM_P, /* Normal response mode as primary */ - LAP_RESET_WAIT, - LAP_RESET, - LAP_NRM_S, /* Normal response mode as secondary */ - LAP_XMIT_S, - LAP_SCLOSE, - LAP_RESET_CHECK, -} IRLAP_STATE; - -/* IrLAP Events */ -typedef enum { - /* Services events */ - DISCOVERY_REQUEST, - CONNECT_REQUEST, - CONNECT_RESPONSE, - DISCONNECT_REQUEST, - DATA_REQUEST, - RESET_REQUEST, - RESET_RESPONSE, - - /* Send events */ - SEND_I_CMD, - SEND_UI_FRAME, - - /* Receive events */ - RECV_DISCOVERY_XID_CMD, - RECV_DISCOVERY_XID_RSP, - RECV_SNRM_CMD, - RECV_TEST_CMD, - RECV_TEST_RSP, - RECV_UA_RSP, - RECV_DM_RSP, - RECV_RD_RSP, - RECV_I_CMD, - RECV_I_RSP, - RECV_UI_FRAME, - RECV_FRMR_RSP, - RECV_RR_CMD, - RECV_RR_RSP, - RECV_RNR_CMD, - RECV_RNR_RSP, - RECV_REJ_CMD, - RECV_REJ_RSP, - RECV_SREJ_CMD, - RECV_SREJ_RSP, - RECV_DISC_CMD, - - /* Timer events */ - SLOT_TIMER_EXPIRED, - QUERY_TIMER_EXPIRED, - FINAL_TIMER_EXPIRED, - POLL_TIMER_EXPIRED, - DISCOVERY_TIMER_EXPIRED, - WD_TIMER_EXPIRED, - BACKOFF_TIMER_EXPIRED, - MEDIA_BUSY_TIMER_EXPIRED, -} IRLAP_EVENT; - -/* - * Disconnect reason code - */ -typedef enum { /* FIXME check the two first reason codes */ - LAP_DISC_INDICATION=1, /* Received a disconnect request from peer */ - LAP_NO_RESPONSE, /* Too many retransmits without response */ - LAP_RESET_INDICATION, /* Too many retransmits, or invalid nr/ns */ - LAP_FOUND_NONE, /* No devices were discovered */ - LAP_MEDIA_BUSY, - LAP_PRIMARY_CONFLICT, -} LAP_REASON;
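These LAP_REASON codes are what IrLMP's irlmp_convert_lap_reason() (declared in irlmp.h further down) collapses into the user-visible LM_REASON values defined in irmod.h. The exact table is not part of this patch, so the sketch below is only a plausible mapping for illustration:

LM_REASON irlmp_convert_lap_reason(LAP_REASON lap_reason)
{
	switch (lap_reason) {
	case LAP_DISC_INDICATION:	/* Peer asked to disconnect */
		return LM_USER_REQUEST;
	case LAP_NO_RESPONSE:		/* Too many retransmits */
		return LM_LAP_DISCONNECT;
	case LAP_RESET_INDICATION:
		return LM_LAP_RESET;
	case LAP_FOUND_NONE:
	case LAP_MEDIA_BUSY:
	case LAP_PRIMARY_CONFLICT:
		return LM_CONNECT_FAILURE;
	default:
		return LM_LAP_DISCONNECT;
	}
}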
- -extern const char *const irlap_state[]; - -void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event, - struct sk_buff *skb, struct irlap_info *info); -void irlap_print_event(IRLAP_EVENT event); - -int irlap_qos_negotiate(struct irlap_cb *self, struct sk_buff *skb); - -#endif diff --git a/include/net/irda/irlap_frame.h b/include/net/irda/irlap_frame.h deleted file mode 100644 index cbc12a926e5f..000000000000 --- a/include/net/irda/irlap_frame.h +++ /dev/null @@ -1,167 +0,0 @@ -/********************************************************************* - * - * Filename: irlap_frame.h - * Version: 0.9 - * Description: IrLAP frame declarations - * Status: Experimental. - * Author: Dag Brattli - * Created at: Tue Aug 19 10:27:26 1997 - * Modified at: Sat Dec 25 21:07:26 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1997-1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRLAP_FRAME_H -#define IRLAP_FRAME_H - -#include - -#include - -/* A few forward declarations (to make compiler happy) */ -struct irlap_cb; -struct discovery_t; - -/* Frame types and templates */ -#define INVALID 0xff - -/* Unnumbered (U) commands */ -#define SNRM_CMD 0x83 /* Set Normal Response Mode */ -#define DISC_CMD 0x43 /* Disconnect */ -#define XID_CMD 0x2f /* Exchange Station Identification */ -#define TEST_CMD 0xe3 /* Test */ - -/* Unnumbered responses */ -#define RNRM_RSP 0x83 /* Request Normal Response Mode */ -#define UA_RSP 0x63 /* Unnumbered Acknowledgement */ -#define FRMR_RSP 0x87 /* Frame Reject */ -#define DM_RSP 0x0f /* Disconnect Mode */ -#define RD_RSP 0x43 /* Request Disconnection */ -#define XID_RSP 0xaf /* Exchange Station Identification */ -#define TEST_RSP 0xe3 /* Test frame */ - -/* Supervisory (S) */ -#define RR 0x01 /* Receive Ready */ -#define REJ 0x09 /* Reject */ -#define RNR 0x05 /* Receive Not Ready */ -#define SREJ 0x0d /* Selective Reject */ - -/* Information (I) */ -#define I_FRAME 0x00 /* Information Format */ -#define UI_FRAME 0x03 /* Unnumbered Information */ - -#define CMD_FRAME 0x01 -#define RSP_FRAME 0x00 - -#define PF_BIT 0x10 /* Poll/final bit */ - -/* Some IrLAP field lengths */ -/* - * Only the baud rate triplet is 4 bytes (PV can be 2 bytes). - * All other params (7) are 3 bytes, so that's 7*3 + 1*4 bytes.
- */ -#define IRLAP_NEGOCIATION_PARAMS_LEN 25 -#define IRLAP_DISCOVERY_INFO_LEN 32 - -struct disc_frame { - __u8 caddr; /* Connection address */ - __u8 control; -} __packed; - -struct xid_frame { - __u8 caddr; /* Connection address */ - __u8 control; - __u8 ident; /* Should always be XID_FORMAT */ - __le32 saddr; /* Source device address */ - __le32 daddr; /* Destination device address */ - __u8 flags; /* Discovery flags */ - __u8 slotnr; - __u8 version; -} __packed; - -struct test_frame { - __u8 caddr; /* Connection address */ - __u8 control; - __le32 saddr; /* Source device address */ - __le32 daddr; /* Destination device address */ -} __packed; - -struct ua_frame { - __u8 caddr; - __u8 control; - __le32 saddr; /* Source device address */ - __le32 daddr; /* Dest device address */ -} __packed; - -struct dm_frame { - __u8 caddr; /* Connection address */ - __u8 control; -} __packed; - -struct rd_frame { - __u8 caddr; /* Connection address */ - __u8 control; -} __packed; - -struct rr_frame { - __u8 caddr; /* Connection address */ - __u8 control; -} __packed; - -struct i_frame { - __u8 caddr; - __u8 control; -} __packed; - -struct snrm_frame { - __u8 caddr; - __u8 control; - __le32 saddr; - __le32 daddr; - __u8 ncaddr; -} __packed; - -void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb); -void irlap_send_discovery_xid_frame(struct irlap_cb *, int S, __u8 s, - __u8 command, - struct discovery_t *discovery); -void irlap_send_snrm_frame(struct irlap_cb *, struct qos_info *); -void irlap_send_test_frame(struct irlap_cb *self, __u8 caddr, __u32 daddr, - struct sk_buff *cmd); -void irlap_send_ua_response_frame(struct irlap_cb *, struct qos_info *); -void irlap_send_dm_frame(struct irlap_cb *self); -void irlap_send_rd_frame(struct irlap_cb *self); -void irlap_send_disc_frame(struct irlap_cb *self); -void irlap_send_rr_frame(struct irlap_cb *self, int command); - -void irlap_send_data_primary(struct irlap_cb *, struct sk_buff *); -void irlap_send_data_primary_poll(struct irlap_cb *, struct sk_buff *); -void irlap_send_data_secondary(struct irlap_cb *, struct sk_buff *); -void irlap_send_data_secondary_final(struct irlap_cb *, struct sk_buff *); -void irlap_resend_rejected_frames(struct irlap_cb *, int command); -void irlap_resend_rejected_frame(struct irlap_cb *self, int command); - -void irlap_send_ui_frame(struct irlap_cb *self, struct sk_buff *skb, - __u8 caddr, int command); - -int irlap_insert_qos_negotiation_params(struct irlap_cb *self, - struct sk_buff *skb); - -#endif diff --git a/include/net/irda/irlmp.h b/include/net/irda/irlmp.h deleted file mode 100644 index f132924cc9da..000000000000 --- a/include/net/irda/irlmp.h +++ /dev/null @@ -1,295 +0,0 @@ -/********************************************************************* - * - * Filename: irlmp.h - * Version: 0.9 - * Description: IrDA Link Management Protocol (LMP) layer - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Aug 17 20:54:32 1997 - * Modified at: Fri Dec 10 13:23:01 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. 
This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRLMP_H -#define IRLMP_H - -#include /* for HZ */ - -#include - -#include -#include -#include /* LAP_MAX_HEADER, ... */ -#include -#include -#include - -/* LSAP-SEL's */ -#define LSAP_MASK 0x7f -#define LSAP_IAS 0x00 -#define LSAP_ANY 0xff -#define LSAP_MAX 0x6f /* 0x70-0x7f are reserved */ -#define LSAP_CONNLESS 0x70 /* Connectionless LSAP, mostly used for Ultra */ - -#define DEV_ADDR_ANY 0xffffffff - -#define LMP_HEADER 2 /* Dest LSAP + Source LSAP */ -#define LMP_CONTROL_HEADER 4 /* LMP_HEADER + opcode + parameter */ -#define LMP_PID_HEADER 1 /* Used by Ultra */ -#define LMP_MAX_HEADER (LMP_CONTROL_HEADER+LAP_MAX_HEADER) - -#define LM_MAX_CONNECTIONS 10 - -#define LM_IDLE_TIMEOUT 2*HZ /* 2 seconds for now */ - -typedef enum { - S_PNP = 0, - S_PDA, - S_COMPUTER, - S_PRINTER, - S_MODEM, - S_FAX, - S_LAN, - S_TELEPHONY, - S_COMM, - S_OBEX, - S_ANY, - S_END, -} SERVICE; - -/* For selective discovery */ -typedef void (*DISCOVERY_CALLBACK1) (discinfo_t *, DISCOVERY_MODE, void *); -/* For expiry (the same) */ -typedef void (*DISCOVERY_CALLBACK2) (discinfo_t *, DISCOVERY_MODE, void *); - -typedef struct { - irda_queue_t queue; /* Must be first */ - - __u16_host_order hints; /* Hint bits */ -} irlmp_service_t; - -typedef struct { - irda_queue_t queue; /* Must be first */ - - __u16_host_order hint_mask; - - DISCOVERY_CALLBACK1 disco_callback; /* Selective discovery */ - DISCOVERY_CALLBACK2 expir_callback; /* Selective expiration */ - void *priv; /* Used to identify client */ -} irlmp_client_t; - -/* - * Information about each logical LSAP connection - */ -struct lsap_cb { - irda_queue_t queue; /* Must be first */ - magic_t magic; - - unsigned long connected; /* set_bit used on this */ - int persistent; - - __u8 slsap_sel; /* Source (this) LSAP address */ - __u8 dlsap_sel; /* Destination LSAP address (if connected) */ -#ifdef CONFIG_IRDA_ULTRA - __u8 pid; /* Used by connectionless LSAP */ -#endif /* CONFIG_IRDA_ULTRA */ - struct sk_buff *conn_skb; /* Store skb here while connecting */ - - struct timer_list watchdog_timer; - - LSAP_STATE lsap_state; /* Connection state */ - notify_t notify; /* Indication/Confirm entry points */ - struct qos_info qos; /* QoS for this connection */ - - struct lap_cb *lap; /* Pointer to LAP connection structure */ -}; - -/* - * Used for caching the last slsap->dlsap->handle mapping - * - * We don't need to keep/match the remote address in the cache because - * we are associated with a specific LAP (which implies it). - * Jean II - */ -typedef struct { - int valid; - - __u8 slsap_sel; - __u8 dlsap_sel; - struct lsap_cb *lsap; -} CACHE_ENTRY; - -/* - * Information about each registered IrLAP layer - */ -struct lap_cb { - irda_queue_t queue; /* Must be first */ - magic_t magic; - - int reason; /* LAP disconnect reason */ - - IRLMP_STATE lap_state; - - struct irlap_cb *irlap; /* Instance of IrLAP layer */ - hashbin_t *lsaps; /* LSAP associated with this link */ - struct lsap_cb *flow_next; /* Next lsap to be polled for Tx */ - - __u8 caddr; /* Connection address */ - __u32 saddr; /* Source device address */ - __u32 daddr; /* Destination device address */ - - struct qos_info *qos; /* LAP QoS for this session */ - struct timer_list idle_timer; - -#ifdef CONFIG_IRDA_CACHE_LAST_LSAP - /* The lsap cache was moved from struct irlmp_cb to here because - * it must be associated with the specific LAP. Also, this - * improves performance. 
- Jean II */ - CACHE_ENTRY cache; /* Caching last slsap->dlsap->handle mapping */ -#endif -}; - -/* - * Main structure for IrLMP - */ -struct irlmp_cb { - magic_t magic; - - __u8 conflict_flag; - - discovery_t discovery_cmd; /* Discovery command to use by IrLAP */ - discovery_t discovery_rsp; /* Discovery response to use by IrLAP */ - - /* Last lsap picked automatically by irlmp_find_free_slsap() */ - int last_lsap_sel; - - struct timer_list discovery_timer; - - hashbin_t *links; /* IrLAP connection table */ - hashbin_t *unconnected_lsaps; - hashbin_t *clients; - hashbin_t *services; - - hashbin_t *cachelog; /* Current discovery log */ - - int running; - - __u16_host_order hints; /* Hint bits */ -}; - -/* Prototype declarations */ -int irlmp_init(void); -void irlmp_cleanup(void); -struct lsap_cb *irlmp_open_lsap(__u8 slsap, notify_t *notify, __u8 pid); -void irlmp_close_lsap( struct lsap_cb *self); - -__u16 irlmp_service_to_hint(int service); -void *irlmp_register_service(__u16 hints); -int irlmp_unregister_service(void *handle); -void *irlmp_register_client(__u16 hint_mask, DISCOVERY_CALLBACK1 disco_clb, - DISCOVERY_CALLBACK2 expir_clb, void *priv); -int irlmp_unregister_client(void *handle); -int irlmp_update_client(void *handle, __u16 hint_mask, - DISCOVERY_CALLBACK1 disco_clb, - DISCOVERY_CALLBACK2 expir_clb, void *priv); - -void irlmp_register_link(struct irlap_cb *, __u32 saddr, notify_t *); -void irlmp_unregister_link(__u32 saddr); - -int irlmp_connect_request(struct lsap_cb *, __u8 dlsap_sel, - __u32 saddr, __u32 daddr, - struct qos_info *, struct sk_buff *); -void irlmp_connect_indication(struct lsap_cb *self, struct sk_buff *skb); -int irlmp_connect_response(struct lsap_cb *, struct sk_buff *); -void irlmp_connect_confirm(struct lsap_cb *, struct sk_buff *); -struct lsap_cb *irlmp_dup(struct lsap_cb *self, void *instance); - -void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason, - struct sk_buff *userdata); -int irlmp_disconnect_request(struct lsap_cb *, struct sk_buff *userdata); - -void irlmp_discovery_confirm(hashbin_t *discovery_log, DISCOVERY_MODE mode); -void irlmp_discovery_request(int nslots); -discinfo_t *irlmp_get_discoveries(int *pn, __u16 mask, int nslots); -void irlmp_do_expiry(void); -void irlmp_do_discovery(int nslots); -discovery_t *irlmp_get_discovery_response(void); -void irlmp_discovery_expiry(discinfo_t *expiry, int number); - -int irlmp_data_request(struct lsap_cb *, struct sk_buff *); -void irlmp_data_indication(struct lsap_cb *, struct sk_buff *); - -int irlmp_udata_request(struct lsap_cb *, struct sk_buff *); -void irlmp_udata_indication(struct lsap_cb *, struct sk_buff *); - -#ifdef CONFIG_IRDA_ULTRA -int irlmp_connless_data_request(struct lsap_cb *, struct sk_buff *, __u8); -void irlmp_connless_data_indication(struct lsap_cb *, struct sk_buff *); -#endif /* CONFIG_IRDA_ULTRA */ - -void irlmp_status_indication(struct lap_cb *, LINK_STATUS link, LOCK_STATUS lock); -void irlmp_flow_indication(struct lap_cb *self, LOCAL_FLOW flow); - -LM_REASON irlmp_convert_lap_reason(LAP_REASON); - -static inline __u32 irlmp_get_saddr(const struct lsap_cb *self) -{ - return (self && self->lap) ? self->lap->saddr : 0; -} - -static inline __u32 irlmp_get_daddr(const struct lsap_cb *self) -{ - return (self && self->lap) ? 
self->lap->daddr : 0; -} - -const char *irlmp_reason_str(LM_REASON reason); - -extern int sysctl_discovery_timeout; -extern int sysctl_discovery_slots; -extern int sysctl_discovery; -extern int sysctl_lap_keepalive_time; /* in ms, default is LM_IDLE_TIMEOUT */ -extern struct irlmp_cb *irlmp; - -/* Check if LAP queue is full. - * Used by IrTTP for flow control, see comments in irlap.h - Jean II */ -static inline int irlmp_lap_tx_queue_full(struct lsap_cb *self) -{ - if (self == NULL) - return 0; - if (self->lap == NULL) - return 0; - if (self->lap->irlap == NULL) - return 0; - - return IRLAP_GET_TX_QUEUE_LEN(self->lap->irlap) >= LAP_HIGH_THRESHOLD; -} - -/* After doing an irlmp_dup(), this gets one of the two sockets back into - * a state where it's waiting for incoming connections. - * Note: this can be used *only* if the socket is not yet connected - * (i.e. NO irlmp_connect_response() done on this socket). - * - Jean II */ -static inline void irlmp_listen(struct lsap_cb *self) -{ - self->dlsap_sel = LSAP_ANY; - self->lap = NULL; - self->lsap_state = LSAP_DISCONNECTED; - /* Started when we received the LM_CONNECT_INDICATION */ - del_timer(&self->watchdog_timer); -} - -#endif diff --git a/include/net/irda/irlmp_event.h b/include/net/irda/irlmp_event.h deleted file mode 100644 index 9e4ec17a7449..000000000000 --- a/include/net/irda/irlmp_event.h +++ /dev/null @@ -1,98 +0,0 @@ -/********************************************************************* - * - * Filename: irlmp_event.h - * Version: 0.1 - * Description: IrDA-LMP event handling - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Aug 4 20:40:53 1997 - * Modified at: Thu Jul 8 12:18:54 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1997, 1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge.
- * - ********************************************************************/ - -#ifndef IRLMP_EVENT_H -#define IRLMP_EVENT_H - -/* A few forward declarations (to make compiler happy) */ -struct irlmp_cb; -struct lsap_cb; -struct lap_cb; -struct discovery_t; - -/* LAP states */ -typedef enum { - /* IrLAP connection control states */ - LAP_STANDBY, /* No LAP connection */ - LAP_U_CONNECT, /* Starting LAP connection */ - LAP_ACTIVE, /* LAP connection is active */ -} IRLMP_STATE; - -/* LSAP connection control states */ -typedef enum { - LSAP_DISCONNECTED, /* No LSAP connection */ - LSAP_CONNECT, /* Connect indication from peer */ - LSAP_CONNECT_PEND, /* Connect request from service user */ - LSAP_DATA_TRANSFER_READY, /* LSAP connection established */ - LSAP_SETUP, /* Trying to set up LSAP connection */ - LSAP_SETUP_PEND, /* Request to start LAP connection */ -} LSAP_STATE; - -typedef enum { - /* LSAP events */ - LM_CONNECT_REQUEST, - LM_CONNECT_CONFIRM, - LM_CONNECT_RESPONSE, - LM_CONNECT_INDICATION, - - LM_DISCONNECT_INDICATION, - LM_DISCONNECT_REQUEST, - - LM_DATA_REQUEST, - LM_UDATA_REQUEST, - LM_DATA_INDICATION, - LM_UDATA_INDICATION, - - LM_WATCHDOG_TIMEOUT, - - /* IrLAP events */ - LM_LAP_CONNECT_REQUEST, - LM_LAP_CONNECT_INDICATION, - LM_LAP_CONNECT_CONFIRM, - LM_LAP_DISCONNECT_INDICATION, - LM_LAP_DISCONNECT_REQUEST, - LM_LAP_DISCOVERY_REQUEST, - LM_LAP_DISCOVERY_CONFIRM, - LM_LAP_IDLE_TIMEOUT, -} IRLMP_EVENT; - -extern const char *const irlmp_state[]; -extern const char *const irlsap_state[]; - -void irlmp_watchdog_timer_expired(void *data); -void irlmp_discovery_timer_expired(void *data); -void irlmp_idle_timer_expired(void *data); - -void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event, - struct sk_buff *skb); -int irlmp_do_lsap_event(struct lsap_cb *self, IRLMP_EVENT event, - struct sk_buff *skb); - -#endif /* IRLMP_EVENT_H */ - - - - diff --git a/include/net/irda/irlmp_frame.h b/include/net/irda/irlmp_frame.h deleted file mode 100644 index 1906eb71422e..000000000000 --- a/include/net/irda/irlmp_frame.h +++ /dev/null @@ -1,62 +0,0 @@ -/********************************************************************* - * - * Filename: irlmp_frame.h - * Version: 0.9 - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Tue Aug 19 02:09:59 1997 - * Modified at: Fri Dec 10 13:21:53 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1997, 1999 Dag Brattli , - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. 
- * - ********************************************************************/ - -#ifndef IRMLP_FRAME_H -#define IRMLP_FRAME_H - -#include - -#include - -/* IrLMP frame opcodes */ -#define CONNECT_CMD 0x01 -#define CONNECT_CNF 0x81 -#define DISCONNECT 0x02 -#define ACCESSMODE_CMD 0x03 -#define ACCESSMODE_CNF 0x83 - -#define CONTROL_BIT 0x80 - -void irlmp_send_data_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, - int expedited, struct sk_buff *skb); -void irlmp_send_lcf_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, - __u8 opcode, struct sk_buff *skb); -void irlmp_link_data_indication(struct lap_cb *, struct sk_buff *, - int unreliable); -#ifdef CONFIG_IRDA_ULTRA -void irlmp_link_unitdata_indication(struct lap_cb *, struct sk_buff *); -#endif /* CONFIG_IRDA_ULTRA */ - -void irlmp_link_connect_indication(struct lap_cb *, __u32 saddr, __u32 daddr, - struct qos_info *qos, struct sk_buff *skb); -void irlmp_link_connect_request(__u32 daddr); -void irlmp_link_connect_confirm(struct lap_cb *self, struct qos_info *qos, - struct sk_buff *skb); -void irlmp_link_disconnect_indication(struct lap_cb *, struct irlap_cb *, - LAP_REASON reason, struct sk_buff *); -void irlmp_link_discovery_confirm(struct lap_cb *self, hashbin_t *log); -void irlmp_link_discovery_indication(struct lap_cb *, discovery_t *discovery); - -#endif diff --git a/include/net/irda/irmod.h b/include/net/irda/irmod.h deleted file mode 100644 index 86f0dbb8ee5d..000000000000 --- a/include/net/irda/irmod.h +++ /dev/null @@ -1,109 +0,0 @@ -/********************************************************************* - * - * Filename: irmod.h - * Version: 0.3 - * Description: IrDA module and utilities functions - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Dec 15 13:58:52 1997 - * Modified at: Fri Jan 28 13:15:24 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charg. - * - ********************************************************************/ - -#ifndef IRMOD_H -#define IRMOD_H - -/* Misc status information */ -typedef enum { - STATUS_OK, - STATUS_ABORTED, - STATUS_NO_ACTIVITY, - STATUS_NOISY, - STATUS_REMOTE, -} LINK_STATUS; - -typedef enum { - LOCK_NO_CHANGE, - LOCK_LOCKED, - LOCK_UNLOCKED, -} LOCK_STATUS; - -typedef enum { FLOW_STOP, FLOW_START } LOCAL_FLOW; - -/* - * IrLMP disconnect reasons. 
The order is very important, since they - * correspond to disconnect reasons sent in IrLMP disconnect frames, so - * please do not touch :-) - */ -typedef enum { - LM_USER_REQUEST = 1, /* User request */ - LM_LAP_DISCONNECT, /* Unexpected IrLAP disconnect */ - LM_CONNECT_FAILURE, /* Failed to establish IrLAP connection */ - LM_LAP_RESET, /* IrLAP reset */ - LM_INIT_DISCONNECT, /* Link Management initiated disconnect */ - LM_LSAP_NOTCONN, /* Data delivered on unconnected LSAP */ - LM_NON_RESP_CLIENT, /* Non responsive LM-MUX client */ - LM_NO_AVAIL_CLIENT, /* No available LM-MUX client */ - LM_CONN_HALF_OPEN, /* Connection is half open */ - LM_BAD_SOURCE_ADDR, /* Illegal source address (i.e 0x00) */ -} LM_REASON; -#define LM_UNKNOWN 0xff /* Unspecified disconnect reason */ - -/* A few forward declarations (to make compiler happy) */ -struct qos_info; /* in */ - -/* - * Notify structure used between transport and link management layers - */ -typedef struct { - int (*data_indication)(void *priv, void *sap, struct sk_buff *skb); - int (*udata_indication)(void *priv, void *sap, struct sk_buff *skb); - void (*connect_confirm)(void *instance, void *sap, - struct qos_info *qos, __u32 max_sdu_size, - __u8 max_header_size, struct sk_buff *skb); - void (*connect_indication)(void *instance, void *sap, - struct qos_info *qos, __u32 max_sdu_size, - __u8 max_header_size, struct sk_buff *skb); - void (*disconnect_indication)(void *instance, void *sap, - LM_REASON reason, struct sk_buff *); - void (*flow_indication)(void *instance, void *sap, LOCAL_FLOW flow); - void (*status_indication)(void *instance, - LINK_STATUS link, LOCK_STATUS lock); - void *instance; /* Layer instance pointer */ - char name[16]; /* Name of layer */ -} notify_t; - -#define NOTIFY_MAX_NAME 16 - -/* Zero the notify structure */ -void irda_notify_init(notify_t *notify); - -/* Locking wrapper - Note the inverted logic on irda_lock(). - * Those function basically return false if the lock is already in the - * position you want to set it. - Jean II */ -#define irda_lock(lock) (! test_and_set_bit(0, (void *) (lock))) -#define irda_unlock(lock) (test_and_clear_bit(0, (void *) (lock))) - -#endif /* IRMOD_H */ - - - - - - - - - diff --git a/include/net/irda/irqueue.h b/include/net/irda/irqueue.h deleted file mode 100644 index 37f512bd6733..000000000000 --- a/include/net/irda/irqueue.h +++ /dev/null @@ -1,96 +0,0 @@ -/********************************************************************* - * - * Filename: irqueue.h - * Version: 0.3 - * Description: General queue implementation - * Status: Experimental. - * Author: Dag Brattli - * Created at: Tue Jun 9 13:26:50 1998 - * Modified at: Thu Oct 7 13:25:16 1999 - * Modified by: Dag Brattli - * - * Copyright (C) 1998-1999, Aage Kvalnes - * Copyright (c) 1998, Dag Brattli - * All Rights Reserved. - * - * This code is taken from the Vortex Operating System written by Aage - * Kvalnes and has been ported to Linux and Linux/IR by Dag Brattli - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. 
- * - ********************************************************************/ - -#include -#include - -#ifndef IRDA_QUEUE_H -#define IRDA_QUEUE_H - -#define NAME_SIZE 32 - -/* - * Hash types (some flags can be XORed) - * See comments in irqueue.c for which one to use... - */ -#define HB_NOLOCK 0 /* No concurrent access prevention */ -#define HB_LOCK 1 /* Prevent concurrent write with global lock */ - -/* - * Hash defines - */ -#define HASHBIN_SIZE 8 -#define HASHBIN_MASK 0x7 - -#ifndef IRDA_ALIGN -#define IRDA_ALIGN __attribute__((aligned)) -#endif - -#define Q_NULL { NULL, NULL, "", 0 } - -typedef void (*FREE_FUNC)(void *arg); - -struct irda_queue { - struct irda_queue *q_next; - struct irda_queue *q_prev; - - char q_name[NAME_SIZE]; - long q_hash; /* Must be able to cast a (void *) */ -}; -typedef struct irda_queue irda_queue_t; - -typedef struct hashbin_t { - __u32 magic; - int hb_type; - int hb_size; - spinlock_t hb_spinlock; /* HB_LOCK - Can be used by the user */ - - irda_queue_t* hb_queue[HASHBIN_SIZE] IRDA_ALIGN; - - irda_queue_t* hb_current; -} hashbin_t; - -hashbin_t *hashbin_new(int type); -int hashbin_delete(hashbin_t* hashbin, FREE_FUNC func); -int hashbin_clear(hashbin_t* hashbin, FREE_FUNC free_func); -void hashbin_insert(hashbin_t* hashbin, irda_queue_t* entry, long hashv, - const char* name); -void* hashbin_remove(hashbin_t* hashbin, long hashv, const char* name); -void* hashbin_remove_first(hashbin_t *hashbin); -void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry); -void* hashbin_find(hashbin_t* hashbin, long hashv, const char* name); -void* hashbin_lock_find(hashbin_t* hashbin, long hashv, const char* name); -void* hashbin_find_next(hashbin_t* hashbin, long hashv, const char* name, - void ** pnext); -irda_queue_t *hashbin_get_first(hashbin_t *hashbin); -irda_queue_t *hashbin_get_next(hashbin_t *hashbin); - -#define HASHBIN_GET_SIZE(hashbin) hashbin->hb_size - -#endif
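A brief usage sketch of the hashbin API above; the entry type, hash value and name are invented for illustration, and allocation-failure handling is omitted:

struct example_entry {
	irda_queue_t q;		/* Must be first: hashbin links through it */
	int payload;
};

static void example_free(void *arg)
{
	kfree(arg);
}

	hashbin_t *bin = hashbin_new(HB_LOCK);
	struct example_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

	e->payload = 42;
	hashbin_insert(bin, &e->q, 0x1234, "example");
	e = hashbin_find(bin, 0x1234, "example");	/* returns e */
	hashbin_delete(bin, example_free);		/* frees what's left */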
diff --git a/include/net/irda/irttp.h b/include/net/irda/irttp.h deleted file mode 100644 index 98682d4bae8f..000000000000 --- a/include/net/irda/irttp.h +++ /dev/null @@ -1,210 +0,0 @@ -/********************************************************************* - * - * Filename: irttp.h - * Version: 1.0 - * Description: Tiny Transport Protocol (TTP) definitions - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sun Aug 31 20:14:31 1997 - * Modified at: Sun Dec 12 13:09:07 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef IRTTP_H -#define IRTTP_H - -#include -#include -#include - -#include -#include /* struct lsap_cb */ -#include /* struct qos_info */ -#include - -#define TTP_MAX_CONNECTIONS LM_MAX_CONNECTIONS -#define TTP_HEADER 1 -#define TTP_MAX_HEADER (TTP_HEADER + LMP_MAX_HEADER) -#define TTP_SAR_HEADER 5 -#define TTP_PARAMETERS 0x80 -#define TTP_MORE 0x80 - -/* Transmission queue sizes */ -/* Worst case scenario, two windows of data - Jean II */ -#define TTP_TX_MAX_QUEUE 14 -/* We need to keep at least 5 frames to make sure that we can refill - * the LAP layer appropriately. LAP keeps only two buffers, and we need - * to have 7 to make a full window - Jean II */ -#define TTP_TX_LOW_THRESHOLD 5 -/* Most clients are synchronous with respect to flow control, so we can - * keep a low number of Tx buffers in TTP - Jean II */ -#define TTP_TX_HIGH_THRESHOLD 7 - -/* Receive queue sizes */ -/* Minimum credit that the peer should hold. - * If the peer has fewer credits than 9 frames, we will explicitly send - * it some credits (through irttp_give_credit() and a specific frame). - * Note that when we give credits it's likely that it won't be sent in - * this LAP window, but in the next one. So, we make sure that the peer - * has something to send while waiting for credits (one LAP window == 7 - * + 1 frames while it processes the credits). - Jean II */ -#define TTP_RX_MIN_CREDIT 8 -/* This is the default maximum number of credits held by the peer, so the - * default maximum number of frames it can send us before needing a flow - * control answer from us (this may be negotiated differently at TSAP setup). - * We want to minimise the number of times we have to explicitly send some - * credit to the peer, hoping we can piggyback it on the return data. In - * particular, it doesn't make sense for us to send credit more than once - * per LAP window. - * Moreover, giving credits has some latency, so we need strictly more than - * a LAP window, otherwise we may already have credits in our Tx queue. - * But on the other hand, we don't want to keep too many Rx buffers here - * before starting to flow control the other end, so make it exactly one - * LAP window + 1 + MIN_CREDITS. - Jean II */ -#define TTP_RX_DEFAULT_CREDIT 16 -/* Maximum number of credits we can allow the peer to have, and therefore - * maximum Rx queue size. - * Note that we try to deliver packets to the higher layer every time we - * receive something, so in normal mode the Rx queue will never contain - * more than one or two packets. - Jean II */ -#define TTP_RX_MAX_CREDIT 21 - -/* What clients should use when calling ttp_open_tsap() */ -#define DEFAULT_INITIAL_CREDIT TTP_RX_DEFAULT_CREDIT - -/* Some priorities for disconnect requests */ -#define P_NORMAL 0 -#define P_HIGH 1 - -#define TTP_SAR_DISABLE 0 -#define TTP_SAR_UNBOUND 0xffffffff - -/* Parameters */ -#define TTP_MAX_SDU_SIZE 0x01
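As a concrete illustration of the credit bookkeeping these thresholds drive: the fields come from struct tsap_cb defined just below, but the helper itself is invented and the real decision logic in irttp.c may differ.

/* Hedged sketch: should the local TTP entity send an explicit
 * flow-control frame now, rather than wait to piggyback credit? */
static inline int example_must_give_credit(struct tsap_cb *self)
{
	/* "Fewer credits than 9 frames" from the comment above is
	 * remote_credit <= TTP_RX_MIN_CREDIT, and we can only give
	 * credit we actually have available. */
	return self->remote_credit <= TTP_RX_MIN_CREDIT &&
	       self->avail_credit > 0;
}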
- -/* - * This structure contains all data associated with one instance of a TTP - * connection. - */ -struct tsap_cb { - irda_queue_t q; /* Must be first */ - magic_t magic; /* Just in case */ - - __u8 stsap_sel; /* Source TSAP */ - __u8 dtsap_sel; /* Destination TSAP */ - - struct lsap_cb *lsap; /* Corresponding LSAP to this TSAP */ - - __u8 connected; /* TSAP connected */ - - __u8 initial_credit; /* Initial credit to give peer */ - - int avail_credit; /* Available credit to return to peer */ - int remote_credit; /* Credit held by peer TTP entity */ - int send_credit; /* Credit held by local TTP entity */ - - struct sk_buff_head tx_queue; /* Frames to be transmitted */ - struct sk_buff_head rx_queue; /* Received frames */ - struct sk_buff_head rx_fragments; - int tx_queue_lock; - int rx_queue_lock; - spinlock_t lock; - - notify_t notify; /* Callbacks to client layer */ - - struct net_device_stats stats; - struct timer_list todo_timer; - - __u32 max_seg_size; /* Max data that fits into an IrLAP frame */ - __u8 max_header_size; - - int rx_sdu_busy; /* RxSdu.busy */ - __u32 rx_sdu_size; /* Current size of a partially received frame */ - __u32 rx_max_sdu_size; /* Max receive user data size */ - - int tx_sdu_busy; /* TxSdu.busy */ - __u32 tx_max_sdu_size; /* Max transmit user data size */ - - int close_pend; /* Close, but disconnect_pend */ - unsigned long disconnect_pend; /* Disconnect, but still data to send */ - struct sk_buff *disconnect_skb; -}; - -struct irttp_cb { - magic_t magic; - hashbin_t *tsaps; -}; - -int irttp_init(void); -void irttp_cleanup(void); - -struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify); -int irttp_close_tsap(struct tsap_cb *self); - -int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb); -int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb); - -int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel, - __u32 saddr, __u32 daddr, - struct qos_info *qos, __u32 max_sdu_size, - struct sk_buff *userdata); -int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size, - struct sk_buff *userdata); -int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *skb, - int priority); -void irttp_flow_request(struct tsap_cb *self, LOCAL_FLOW flow); -struct tsap_cb *irttp_dup(struct tsap_cb *self, void *instance); - -static inline __u32 irttp_get_saddr(struct tsap_cb *self) -{ - return irlmp_get_saddr(self->lsap); -} - -static inline __u32 irttp_get_daddr(struct tsap_cb *self) -{ - return irlmp_get_daddr(self->lsap); -} - -static inline __u32 irttp_get_max_seg_size(struct tsap_cb *self) -{ - return self->max_seg_size; -} - -/* After doing an irttp_dup(), this gets one of the two sockets back into - * a state where it's waiting for incoming connections. - * Note: this can be used *only* if the socket is not yet connected - * (i.e. NO irttp_connect_response() done on this socket). - * - Jean II */ -static inline void irttp_listen(struct tsap_cb *self) -{ - irlmp_listen(self->lsap); - self->dtsap_sel = LSAP_ANY; -} - -/* Return TRUE if the node is in primary mode (i.e.
master) - * - Jean II */ -static inline int irttp_is_primary(struct tsap_cb *self) -{ - if ((self == NULL) || - (self->lsap == NULL) || - (self->lsap->lap == NULL) || - (self->lsap->lap->irlap == NULL)) - return -2; - return irlap_is_primary(self->lsap->lap->irlap); -} - -#endif /* IRTTP_H */ diff --git a/include/net/irda/parameters.h b/include/net/irda/parameters.h deleted file mode 100644 index 2d9cd0007cba..000000000000 --- a/include/net/irda/parameters.h +++ /dev/null @@ -1,100 +0,0 @@ -/********************************************************************* - * - * Filename: parameters.h - * Version: 1.0 - * Description: A more general way to handle (pi,pl,pv) parameters - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Jun 7 08:47:28 1999 - * Modified at: Sun Jan 30 14:05:14 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - * Michel Dänzer , 10/2001 - * - simplify irda_pv_t to avoid endianness issues - * - ********************************************************************/ - -#ifndef IRDA_PARAMS_H -#define IRDA_PARAMS_H - -/* - * The currently supported types. 
Beware not to change the sequence, since - * there is a good reason why the sized integers have a value equal to - * their size (in bytes) - */ -typedef enum { - PV_INTEGER, /* Integer of any (pl) length */ - PV_INT_8_BITS, /* Integer of 8 bits in length */ - PV_INT_16_BITS, /* Integer of 16 bits in length */ - PV_STRING, /* \0 terminated string */ - PV_INT_32_BITS, /* Integer of 32 bits in length */ - PV_OCT_SEQ, /* Octet sequence */ - PV_NO_VALUE /* Does not contain any value (pl=0) */ -} PV_TYPE; - -/* Bit 7 of type field */ -#define PV_BIG_ENDIAN 0x80 -#define PV_LITTLE_ENDIAN 0x00 -#define PV_MASK 0x7f /* To mask away endian bit */ - -#define PV_PUT 0 -#define PV_GET 1 - -typedef union { - char *c; - __u32 i; - __u32 *ip; -} irda_pv_t; - -typedef struct { - __u8 pi; - __u8 pl; - irda_pv_t pv; -} irda_param_t; - -typedef int (*PI_HANDLER)(void *self, irda_param_t *param, int get); -typedef int (*PV_HANDLER)(void *self, __u8 *buf, int len, __u8 pi, - PV_TYPE type, PI_HANDLER func); - -typedef struct { - const PI_HANDLER func; /* Handler for this parameter identifier */ - PV_TYPE type; /* Data type for this parameter */ -} pi_minor_info_t; - -typedef struct { - const pi_minor_info_t *pi_minor_call_table; - int len; -} pi_major_info_t; - -typedef struct { - const pi_major_info_t *tables; - int len; - __u8 pi_mask; - int pi_major_offset; -} pi_param_info_t; - -int irda_param_pack(__u8 *buf, char *fmt, ...); - -int irda_param_insert(void *self, __u8 pi, __u8 *buf, int len, - pi_param_info_t *info); -int irda_param_extract_all(void *self, __u8 *buf, int len, - pi_param_info_t *info); - -#define irda_param_insert_byte(buf,pi,pv) irda_param_pack(buf,"bbb",pi,1,pv) - -#endif /* IRDA_PARAMS_H */ -
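A quick example of building a (pi,pl,pv) triplet with irda_param_pack(), inferred from the irda_param_insert_byte() macro above (each "b" packs one byte); the parameter identifier PI_MIN_TURN_TIME comes from qos.h below, and the buffer is illustrative:

	__u8 buf[3];

	/* Pack pi = PI_MIN_TURN_TIME (0x86), pl = 1, pv = 0x01 */
	irda_param_pack(buf, "bbb", PI_MIN_TURN_TIME, 1, 0x01);

	/* Equivalently, via the convenience macro: */
	irda_param_insert_byte(buf, PI_MIN_TURN_TIME, 0x01);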
diff --git a/include/net/irda/qos.h b/include/net/irda/qos.h deleted file mode 100644 index 05a5a249956f..000000000000 --- a/include/net/irda/qos.h +++ /dev/null @@ -1,101 +0,0 @@ -/********************************************************************* - * - * Filename: qos.h - * Version: 1.0 - * Description: Quality of Service definitions - * Status: Experimental. - * Author: Dag Brattli - * Created at: Fri Sep 19 23:21:09 1997 - * Modified at: Thu Dec 2 13:51:54 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1999 Dag Brattli, All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - ********************************************************************/ - -#ifndef IRDA_QOS_H -#define IRDA_QOS_H - -#include - -#include - -#define PI_BAUD_RATE 0x01 -#define PI_MAX_TURN_TIME 0x82 -#define PI_DATA_SIZE 0x83 -#define PI_WINDOW_SIZE 0x84 -#define PI_ADD_BOFS 0x85 -#define PI_MIN_TURN_TIME 0x86 -#define PI_LINK_DISC 0x08 - -#define IR_115200_MAX 0x3f - -/* Baud rates (first byte) */ -#define IR_2400 0x01 -#define IR_9600 0x02 -#define IR_19200 0x04 -#define IR_38400 0x08 -#define IR_57600 0x10 -#define IR_115200 0x20 -#define IR_576000 0x40 -#define IR_1152000 0x80 - -/* Baud rates (second byte) */ -#define IR_4000000 0x01 -#define IR_16000000 0x02 - -/* Quality of Service information */ -typedef struct { - __u32 value; - __u16 bits; /* LSB is first byte, MSB is second byte */ -} qos_value_t; - -struct qos_info { - magic_t magic; - - qos_value_t baud_rate; /* IR_115200 | ... */ - qos_value_t max_turn_time; - qos_value_t data_size; - qos_value_t window_size; - qos_value_t additional_bofs; - qos_value_t min_turn_time; - qos_value_t link_disc_time; - - qos_value_t power; -}; - -extern int sysctl_max_baud_rate; -extern int sysctl_max_inactive_time; - -void irda_init_max_qos_capabilies(struct qos_info *qos); -void irda_qos_compute_intersection(struct qos_info *, struct qos_info *); - -__u32 irlap_max_line_capacity(__u32 speed, __u32 max_turn_time); - -void irda_qos_bits_to_value(struct qos_info *qos); - -/* So simple, how could we not inline those two? - * Note: one byte is 10 bits if you include start and stop bits - * Jean II */ -#define irlap_min_turn_time_in_bytes(speed, min_turn_time) ( \ speed * min_turn_time / 10000000 \ ) -#define irlap_xbofs_in_usec(speed, xbofs) ( \ xbofs * 10000000 / speed \ ) - -#endif -
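To make the arithmetic of those two macros concrete (units assumed: speed in bits per second, times in microseconds, 10 bits per byte as the comment says):

	/* At 115200 bit/s, a 1000 us minimum turnaround works out to
	 * 115200 * 1000 / 10000000 = 11 byte-times (integer division): */
	int pad_bytes = irlap_min_turn_time_in_bytes(115200, 1000); /* 11 */

	/* Going the other way, 11 XBOFs at the same speed take
	 * 11 * 10000000 / 115200 = 954 us: */
	int delay_us = irlap_xbofs_in_usec(115200, 11); /* 954 */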
diff --git a/include/net/irda/timer.h b/include/net/irda/timer.h deleted file mode 100644 index d784f242cf7b..000000000000 --- a/include/net/irda/timer.h +++ /dev/null @@ -1,105 +0,0 @@ -/********************************************************************* - * - * Filename: timer.h - * Version: - * Description: - * Status: Experimental. - * Author: Dag Brattli - * Created at: Sat Aug 16 00:59:29 1997 - * Modified at: Thu Oct 7 12:25:24 1999 - * Modified by: Dag Brattli - * - * Copyright (c) 1997, 1998-1999 Dag Brattli , - * All Rights Reserved. - * Copyright (c) 2000-2002 Jean Tourrilhes - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge. - * - ********************************************************************/ - -#ifndef TIMER_H -#define TIMER_H - -#include -#include - -#include /* for HZ */ - -#include - -/* A few forward declarations (to make compiler happy) */ -struct irlmp_cb; -struct irlap_cb; -struct lsap_cb; -struct lap_cb; - -/* - * Timeout definitions, some defined in IrLAP 6.13.5 - p. 92 - */ -#define POLL_TIMEOUT (450*HZ/1000) /* Must never exceed 500 ms */ -#define FINAL_TIMEOUT (500*HZ/1000) /* Must never exceed 500 ms */ - -/* - * Normally twice the P-timer. Note 3, IrLAP 6.3.11.2 - p. 60 suggests - * at least twice the duration of the P-timer. - */ -#define WD_TIMEOUT (POLL_TIMEOUT*2) - -#define MEDIABUSY_TIMEOUT (500*HZ/1000) /* 500 msec */ -#define SMALLBUSY_TIMEOUT (100*HZ/1000) /* 100 msec - IrLAP 6.13.4 */ - -/* - * Slot timer must never exceed 85 ms, and must always be at least 25 ms, - * suggested as 75-85 msec by IrDA Lite. This doesn't work with a lot of - * devices, and other stacks use a lot more, so it's best we do it as well - * (Note: this is the default value and sysctl overrides it - Jean II) - */ -#define SLOT_TIMEOUT (90*HZ/1000) - -/* - * The latest discovery frame (XID) is longer due to the extra discovery - * information (hints, device name...). This is its extra length. - * We use that when setting the query timeout. Jean II - */ -#define XIDEXTRA_TIMEOUT (34*HZ/1000) /* 34 msec */ - -#define WATCHDOG_TIMEOUT (20*HZ) /* 20 sec */ - -typedef void (*TIMER_CALLBACK)(void *); - -static inline void irda_start_timer(struct timer_list *ptimer, int timeout, - void* data, TIMER_CALLBACK callback) -{ - ptimer->function = (void (*)(unsigned long)) callback; - ptimer->data = (unsigned long) data; - - /* Set new value for timer (update or add timer). - * We use mod_timer() because it's more efficient and also - * safer with respect to race conditions - Jean II */ - mod_timer(ptimer, jiffies + timeout); -} - - -void irlap_start_slot_timer(struct irlap_cb *self, int timeout); -void irlap_start_query_timer(struct irlap_cb *self, int S, int s); -void irlap_start_final_timer(struct irlap_cb *self, int timeout); -void irlap_start_wd_timer(struct irlap_cb *self, int timeout); -void irlap_start_backoff_timer(struct irlap_cb *self, int timeout); - -void irlap_start_mbusy_timer(struct irlap_cb *self, int timeout); -void irlap_stop_mbusy_timer(struct irlap_cb *); - -void irlmp_start_watchdog_timer(struct lsap_cb *, int timeout); -void irlmp_start_discovery_timer(struct irlmp_cb *, int timeout); -void irlmp_start_idle_timer(struct lap_cb *, int timeout); -void irlmp_stop_idle_timer(struct lap_cb *self); - -#endif - diff --git a/include/net/irda/wrapper.h b/include/net/irda/wrapper.h deleted file mode 100644 index eef53ebe3d76..000000000000 --- a/include/net/irda/wrapper.h +++ /dev/null @@ -1,58 +0,0 @@ -/********************************************************************* - * - * Filename: wrapper.h - * Version: 1.2 - * Description: IrDA SIR async wrapper layer - * Status: Experimental. - * Author: Dag Brattli - * Created at: Mon Aug 4 20:40:53 1997 - * Modified at: Tue Jan 11 12:37:29 2000 - * Modified by: Dag Brattli - * - * Copyright (c) 1998-2000 Dag Brattli , - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of - * the License, or (at your option) any later version. - * - * Neither Dag Brattli nor University of Tromsø admit liability nor - * provide warranty for any of this software. This material is - * provided "AS-IS" and at no charge.
- * - ********************************************************************/ - -#ifndef WRAPPER_H -#define WRAPPER_H - -#include -#include -#include - -#include /* iobuff_t */ - -#define BOF 0xc0 /* Beginning of frame */ -#define XBOF 0xff -#define EOF 0xc1 /* End of frame */ -#define CE 0x7d /* Control escape */ - -#define STA BOF /* Start flag */ -#define STO EOF /* End flag */ - -#define IRDA_TRANS 0x20 /* Asynchronous transparency modifier */ - -/* States for receiving a frame in async mode */ -enum { - OUTSIDE_FRAME, - BEGIN_FRAME, - LINK_ESCAPE, - INSIDE_FRAME -}; - -/* Proto definitions */ -int async_wrap_skb(struct sk_buff *skb, __u8 *tx_buff, int buffsize); -void async_unwrap_char(struct net_device *dev, struct net_device_stats *stats, - iobuff_t *buf, __u8 byte); - -#endif -- cgit v1.2.3 From 21acf63013ed3d6fce3176cc34b74064052a31b4 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Mon, 28 Aug 2017 16:18:42 +1000 Subject: net/ncsi: Configure VLAN tag filter Make use of the ndo_vlan_rx_{add,kill}_vid callbacks to have the NCSI stack process new VLAN tags and configure the channel VLAN filter appropriately. Several VLAN tags can be set and a "Set VLAN Filter" packet must be sent for each one, meaning the ncsi_dev_state_config_svf state must be repeated. An internal list of VLAN tags is maintained, and compared against the current channel's ncsi_channel_filter in order to keep track within the state. VLAN filters are removed in a similar manner, with the introduction of the ncsi_dev_state_config_clear_vids state. The maximum number of VLAN tag filters is determined by the "Get Capabilities" response from the channel. Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/net/ncsi.h | 2 + net/ncsi/internal.h | 11 ++ net/ncsi/ncsi-manage.c | 308 ++++++++++++++++++++++++++++++++++++++++++++++++- net/ncsi/ncsi-rsp.c | 9 +- 4 files changed, 326 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ncsi.h b/include/net/ncsi.h index 68680baac0fd..1f96af46df49 100644 --- a/include/net/ncsi.h +++ b/include/net/ncsi.h @@ -28,6 +28,8 @@ struct ncsi_dev { }; #ifdef CONFIG_NET_NCSI +int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid); +int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid); struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*notifier)(struct ncsi_dev *nd)); int ncsi_start_dev(struct ncsi_dev *nd); diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 1308a56f2591..af3d636534ef 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -180,6 +180,7 @@ struct ncsi_channel { #define NCSI_CHANNEL_INACTIVE 1 #define NCSI_CHANNEL_ACTIVE 2 #define NCSI_CHANNEL_INVISIBLE 3 + bool reconfigure_needed; spinlock_t lock; /* Protect filters etc */ struct ncsi_package *package; struct ncsi_channel_version version; @@ -235,6 +236,9 @@ enum { ncsi_dev_state_probe_dp, ncsi_dev_state_config_sp = 0x0301, ncsi_dev_state_config_cis, + ncsi_dev_state_config_clear_vids, + ncsi_dev_state_config_svf, + ncsi_dev_state_config_ev, ncsi_dev_state_config_sma, ncsi_dev_state_config_ebf, #if IS_ENABLED(CONFIG_IPV6) @@ -253,6 +257,12 @@ enum { ncsi_dev_state_suspend_done }; +struct vlan_vid { + struct list_head list; + __be16 proto; + u16 vid; +}; + struct ncsi_dev_priv { struct ncsi_dev ndev; /* Associated NCSI device */ unsigned int flags; /* NCSI device flags */ @@ -276,6 +286,7 @@ struct ncsi_dev_priv { struct work_struct work; /* For channel management */ struct packet_type ptype; /* NCSI 
packet Rx handler */ struct list_head node; /* Form NCSI device list */ + struct list_head vlan_vids; /* List of active VLAN IDs */ }; struct ncsi_cmd_arg { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index a3bd5fa8ad09..11904b3b702d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -38,6 +38,25 @@ static inline int ncsi_filter_size(int table) return sizes[table]; } +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) +{ + struct ncsi_channel_filter *ncf; + int size; + + ncf = nc->filters[table]; + if (!ncf) + return NULL; + + size = ncsi_filter_size(table); + if (size < 0) + return NULL; + + return ncf->data + size * index; +} + +/* Find the first active filter in a filter table that matches the given + * data parameter. If data is NULL, this returns the first active filter. + */ int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data) { struct ncsi_channel_filter *ncf; @@ -58,7 +77,7 @@ int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data) index = -1; while ((index = find_next_bit(bitmap, ncf->total, index + 1)) < ncf->total) { - if (!memcmp(ncf->data + size * index, data, size)) { + if (!data || !memcmp(ncf->data + size * index, data, size)) { spin_unlock_irqrestore(&nc->lock, flags); return index; } @@ -639,6 +658,95 @@ error: nd->state = ncsi_dev_state_functional; } +/* Check the VLAN filter bitmap for a set filter, and construct a + * "Set VLAN Filter - Disable" packet if found. + */ +static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, + struct ncsi_cmd_arg *nca) +{ + int index; + u32 *data; + u16 vid; + + index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, NULL); + if (index < 0) { + /* Filter table empty */ + return -1; + } + + data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index); + if (!data) { + netdev_err(ndp->ndev.dev, + "ncsi: failed to retrieve filter %d\n", index); + /* Set the VLAN id to 0 - this will still disable the entry in + * the filter table, but we won't know what it was. + */ + vid = 0; + } else { + vid = *(u16 *)data; + } + + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "ncsi: removed vlan tag %u at index %d\n", + vid, index + 1); + ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index); + + nca->type = NCSI_PKT_CMD_SVF; + nca->words[1] = vid; + /* HW filter index starts at 1 */ + nca->bytes[6] = index + 1; + nca->bytes[7] = 0x00; + return 0; +} + +/* Find an outstanding VLAN tag and construct a "Set VLAN Filter - Enable" + * packet.
+ */ +static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, + struct ncsi_cmd_arg *nca) +{ + struct vlan_vid *vlan = NULL; + int index = 0; + + list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { + index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan->vid); + if (index < 0) { + /* New tag to add */ + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "ncsi: new vlan id to set: %u\n", + vlan->vid); + break; + } + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "vid %u already at filter pos %d\n", + vlan->vid, index); + } + + if (!vlan || index >= 0) { + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "no vlan ids left to set\n"); + return -1; + } + + index = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan->vid); + if (index < 0) { + netdev_err(ndp->ndev.dev, + "Failed to add new VLAN tag, error %d\n", index); + return -1; + } + + netdev_printk(KERN_DEBUG, ndp->ndev.dev, + "ncsi: set vid %u in packet, index %u\n", + vlan->vid, index + 1); + nca->type = NCSI_PKT_CMD_SVF; + nca->words[1] = vlan->vid; + /* HW filter index starts at 1 */ + nca->bytes[6] = index + 1; + nca->bytes[7] = 0x01; + + return 0; +} + static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; @@ -683,8 +791,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; - nd->state = ncsi_dev_state_config_sma; + nd->state = ncsi_dev_state_config_clear_vids; break; + case ncsi_dev_state_config_clear_vids: + case ncsi_dev_state_config_svf: + case ncsi_dev_state_config_ev: case ncsi_dev_state_config_sma: case ncsi_dev_state_config_ebf: #if IS_ENABLED(CONFIG_IPV6) @@ -699,11 +810,40 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; nca.channel = nc->id; + /* Clear any active filters on the channel before setting */ + if (nd->state == ncsi_dev_state_config_clear_vids) { + ret = clear_one_vid(ndp, nc, &nca); + if (ret) { + nd->state = ncsi_dev_state_config_svf; + schedule_work(&ndp->work); + break; + } + /* Repeat */ + nd->state = ncsi_dev_state_config_clear_vids; + /* Add known VLAN tags to the filter */ + } else if (nd->state == ncsi_dev_state_config_svf) { + ret = set_one_vid(ndp, nc, &nca); + if (ret) { + nd->state = ncsi_dev_state_config_ev; + schedule_work(&ndp->work); + break; + } + /* Repeat */ + nd->state = ncsi_dev_state_config_svf; + /* Enable/Disable the VLAN filter */ + } else if (nd->state == ncsi_dev_state_config_ev) { + if (list_empty(&ndp->vlan_vids)) { + nca.type = NCSI_PKT_CMD_DV; + } else { + nca.type = NCSI_PKT_CMD_EV; + nca.bytes[3] = NCSI_CAP_VLAN_NO; + } + nd->state = ncsi_dev_state_config_sma; + } else if (nd->state == ncsi_dev_state_config_sma) { /* Use first entry in unicast filter table. Note that * the MAC filter table starts from entry 1 instead of * 0. 
*/ - if (nd->state == ncsi_dev_state_config_sma) { nca.type = NCSI_PKT_CMD_SMA; for (index = 0; index < 6; index++) nca.bytes[index] = dev->dev_addr[index]; @@ -751,6 +891,25 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) break; case ncsi_dev_state_config_done: spin_lock_irqsave(&nc->lock, flags); + if (nc->reconfigure_needed) { + /* This channel's configuration has been updated + * part-way during the config state - start the + * channel configuration over + */ + nc->reconfigure_needed = false; + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_printk(KERN_DEBUG, dev, + "Dirty NCSI channel state reset\n"); + ncsi_process_next_channel(ndp); + break; + } + if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { hot_nc = nc; nc->state = NCSI_CHANNEL_ACTIVE; @@ -1191,6 +1350,148 @@ static struct notifier_block ncsi_inet6addr_notifier = { }; #endif /* CONFIG_IPV6 */ +static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) +{ + struct ncsi_dev *nd = &ndp->ndev; + struct ncsi_channel *nc; + struct ncsi_package *np; + unsigned long flags; + unsigned int n = 0; + + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + + /* Channels may be busy, mark dirty instead of + * kicking if; + * a) not ACTIVE (configured) + * b) in the channel_queue (to be configured) + * c) it's ndev is in the config state + */ + if (nc->state != NCSI_CHANNEL_ACTIVE) { + if ((ndp->ndev.state & 0xff00) == + ncsi_dev_state_config || + !list_empty(&nc->link)) { + netdev_printk(KERN_DEBUG, nd->dev, + "ncsi: channel %p marked dirty\n", + nc); + nc->reconfigure_needed = true; + } + spin_unlock_irqrestore(&nc->lock, flags); + continue; + } + + spin_unlock_irqrestore(&nc->lock, flags); + + ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + spin_lock_irqsave(&ndp->lock, flags); + list_add_tail_rcu(&nc->link, &ndp->channel_queue); + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_printk(KERN_DEBUG, nd->dev, + "ncsi: kicked channel %p\n", nc); + n++; + } + } + + return n; +} + +int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct ncsi_channel_filter *ncf; + struct ncsi_dev_priv *ndp; + unsigned int n_vids = 0; + struct vlan_vid *vlan; + struct ncsi_dev *nd; + bool found = false; + + if (vid == 0) + return 0; + + nd = ncsi_find_dev(dev); + if (!nd) { + netdev_warn(dev, "ncsi: No net_device?\n"); + return 0; + } + + ndp = TO_NCSI_DEV_PRIV(nd); + ncf = ndp->hot_channel->filters[NCSI_FILTER_VLAN]; + + /* Add the VLAN id to our internal list */ + list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { + n_vids++; + if (vlan->vid == vid) { + netdev_printk(KERN_DEBUG, dev, + "vid %u already registered\n", vid); + return 0; + } + } + + if (n_vids >= ncf->total) { + netdev_info(dev, + "NCSI Channel supports up to %u VLAN tags but %u are already set\n", + ncf->total, n_vids); + return -EINVAL; + } + + vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); + if (!vlan) + return -ENOMEM; + + vlan->proto = proto; + vlan->vid = vid; + list_add_rcu(&vlan->list, &ndp->vlan_vids); + + netdev_printk(KERN_DEBUG, dev, "Added new vid %u\n", vid); + + found = ncsi_kick_channels(ndp) != 0; + + return found ? 
ncsi_process_next_channel(ndp) : 0; +} + +int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct vlan_vid *vlan, *tmp; + struct ncsi_dev_priv *ndp; + struct ncsi_dev *nd; + bool found = false; + + if (vid == 0) + return 0; + + nd = ncsi_find_dev(dev); + if (!nd) { + netdev_warn(dev, "ncsi: no net_device?\n"); + return 0; + } + + ndp = TO_NCSI_DEV_PRIV(nd); + + /* Remove the VLAN id from our internal list */ + list_for_each_entry_safe(vlan, tmp, &ndp->vlan_vids, list) + if (vlan->vid == vid) { + netdev_printk(KERN_DEBUG, dev, + "vid %u found, removing\n", vid); + list_del_rcu(&vlan->list); + found = true; + kfree(vlan); + } + + if (!found) { + netdev_err(dev, "ncsi: vid %u wasn't registered!\n", vid); + return -EINVAL; + } + + found = ncsi_kick_channels(ndp) != 0; + + return found ? ncsi_process_next_channel(ndp) : 0; +} + struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*handler)(struct ncsi_dev *ndev)) { @@ -1215,6 +1516,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, nd->handler = handler; ndp->pending_req_num = 0; INIT_LIST_HEAD(&ndp->channel_queue); + INIT_LIST_HEAD(&ndp->vlan_vids); INIT_WORK(&ndp->work, ncsi_dev_work); /* Initialize private NCSI device */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index c1a191d790e2..265b9a892d41 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -694,7 +694,14 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) ncf->index = i; ncf->total = cnt; - ncf->bitmap = 0x0ul; + if (i == NCSI_FILTER_VLAN) { + /* Set VLAN filters active so they are cleared in + * first configuration state + */ + ncf->bitmap = U64_MAX; + } else { + ncf->bitmap = 0x0ul; + } nc->filters[i] = ncf; } -- cgit v1.2.3 From 2fd523c57e520899e05ec663d04743005bcef5b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Aug 2017 17:06:13 +0200 Subject: dma-mapping: remove dma_alloc_noncoherent and dma_free_noncoherent No users left, everyone switched to the _attrs versions. Signed-off-by: Christoph Hellwig --- Documentation/DMA-API.txt | 30 ++++++++++++++++-------------- arch/metag/include/asm/dma-mapping.h | 2 +- arch/nios2/include/asm/dma-mapping.h | 2 +- arch/tile/include/asm/dma-mapping.h | 4 ++-- include/linux/dma-mapping.h | 14 -------------- 5 files changed, 20 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index 45b29326d719..ef3a04fcad65 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -515,14 +515,15 @@ API at all. :: void * - dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag) + dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t flag, unsigned long attrs) -Identical to dma_alloc_coherent() except that the platform will -choose to return either consistent or non-consistent memory as it sees -fit. By using this API, you are guaranteeing to the platform that you -have all the correct and necessary sync points for this memory in the -driver should it choose to return non-consistent memory. +Identical to dma_alloc_coherent() except that when the +DMA_ATTR_NON_CONSISTENT flags is passed in the attrs argument, the +platform will choose to return either consistent or non-consistent memory +as it sees fit. By using this API, you are guaranteeing to the platform +that you have all the correct and necessary sync points for this memory +in the driver should it choose to return non-consistent memory. 
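For example, a driver might pair the allocation and free like this (a minimal sketch only; dev, size and the transfer direction are assumed to come from the driver)::

	unsigned long attrs = DMA_ATTR_NON_CONSISTENT;
	dma_addr_t dma_handle;
	void *vaddr;

	vaddr = dma_alloc_attrs(dev, size, &dma_handle, GFP_KERNEL, attrs);
	if (!vaddr)
		return -ENOMEM;

	/* CPU fills the buffer, then makes it visible to the device */
	memset(vaddr, 0, size);
	dma_cache_sync(dev, vaddr, size, DMA_TO_DEVICE);

	/* ... device performs DMA using dma_handle ... */

	dma_free_attrs(dev, size, vaddr, dma_handle, attrs);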
Note: where the platform can return consistent memory, it will guarantee that the sync points become nops. @@ -535,12 +536,13 @@ that simply cannot make consistent memory. :: void - dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle) + dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, unsigned long attrs) -Free memory allocated by the nonconsistent API. All parameters must -be identical to those passed in (and returned by -dma_alloc_noncoherent()). +Free memory allocated by dma_alloc_attrs(). All common parameters must +be identical to those originally passed to dma_alloc_attrs(), and the +attrs argument must be identical to the attrs passed to +dma_alloc_attrs(). :: @@ -564,8 +566,8 @@ memory or doing partial flushes. dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) -Do a partial sync of memory that was allocated by -dma_alloc_noncoherent(), starting at virtual address vaddr and +Do a partial sync of memory that was allocated by dma_alloc_attrs() with +the DMA_ATTR_NON_CONSISTENT flag, starting at virtual address vaddr and continuing on for size. Again, you *must* observe the cache line boundaries when doing this. diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h index fad3dc3cb210..ea573be2b6d0 100644 --- a/arch/metag/include/asm/dma-mapping.h +++ b/arch/metag/include/asm/dma-mapping.h @@ -9,7 +9,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) } /* - * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to + * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to * do any flushing here. */ static inline void diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h index 7b3c6f280293..f8dc62222741 100644 --- a/arch/nios2/include/asm/dma-mapping.h +++ b/arch/nios2/include/asm/dma-mapping.h @@ -18,7 +18,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) } /* - * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to + * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to * do any flushing here. */ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h index bbc71a29b2c6..7061dc8af43a 100644 --- a/arch/tile/include/asm/dma-mapping.h +++ b/arch/tile/include/asm/dma-mapping.h @@ -68,8 +68,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) int dma_set_mask(struct device *dev, u64 mask); /* - * dma_alloc_noncoherent() is #defined to return coherent memory, - * so there's no need to do any flushing here. + * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to + * do any flushing here.
*/ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 66d8ea68f40b..4c98cc96971f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -549,20 +549,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size, return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - return dma_alloc_attrs(dev, size, dma_handle, gfp, - DMA_ATTR_NON_CONSISTENT); -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - dma_free_attrs(dev, size, cpu_addr, dma_handle, - DMA_ATTR_NON_CONSISTENT); -} - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From 10674a03c633379fadb5b314abde975fba270058 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 29 Aug 2017 10:15:40 +0100 Subject: net: rxrpc: Replace time_t type with time64_t type The 'expiry' field of 'struct key_preparsed_payload' has been changed to the 'time64_t' type, which is year-2038 safe on 32-bit systems. The net/rxrpc subsystem therefore needs to convert from 'u32' to 'time64_t' when copying a ticket's expiry time to 'prep->expiry', so this patch introduces two helper functions for the conversion. This patch also uses ktime_get_real_seconds() to get the current time instead of get_seconds(), which is not year-2038 safe on 32-bit systems. Signed-off-by: Baolin Wang Signed-off-by: David Howells --- include/keys/rxrpc-type.h | 23 +++++++++++++++++++++ net/rxrpc/ar-internal.h | 2 +- net/rxrpc/key.c | 22 ++++++++++++++-------- net/rxrpc/rxkad.c | 14 +++++++------- 4 files changed, 45 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/keys/rxrpc-type.h b/include/keys/rxrpc-type.h index 5de0673f333b..8cf829dbf20e 100644 --- a/include/keys/rxrpc-type.h +++ b/include/keys/rxrpc-type.h @@ -127,4 +127,27 @@ struct rxrpc_key_data_v1 { #define AFSTOKEN_K5_ADDRESSES_MAX 16 /* max K5 addresses */ #define AFSTOKEN_K5_AUTHDATA_MAX 16 /* max K5 pieces of auth data */ +/* + * Truncate a time64_t to the range from 1970 to 2106 as in the network + * protocol. + */ +static inline u32 rxrpc_time64_to_u32(time64_t time) +{ + if (time < 0) + return 0; + + if (time > UINT_MAX) + return UINT_MAX; + + return (u32)time; +} + +/* + * Extend u32 back to time64_t using the same 1970-2106 range.
+ */ +static inline time64_t rxrpc_u32_to_time64(u32 time) +{ + return (time64_t)time; +} + #endif /* _KEYS_RXRPC_TYPE_H */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 8c0db9b3e4ab..8cac66774de1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -894,7 +894,7 @@ extern struct key_type key_type_rxrpc_s; int rxrpc_request_key(struct rxrpc_sock *, char __user *, int); int rxrpc_server_keyring(struct rxrpc_sock *, char __user *, int); -int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t, +int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, u32); /* diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 54369225766e..e2d36619b53a 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -92,6 +92,7 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, const __be32 *xdr, unsigned int toklen) { struct rxrpc_key_token *token, **pptoken; + time64_t expiry; size_t plen; u32 tktlen; @@ -158,8 +159,9 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, pptoken = &(*pptoken)->next) continue; *pptoken = token; - if (token->kad->expiry < prep->expiry) - prep->expiry = token->kad->expiry; + expiry = rxrpc_u32_to_time64(token->kad->expiry); + if (expiry < prep->expiry) + prep->expiry = expiry; _leave(" = 0"); return 0; @@ -433,6 +435,7 @@ static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, struct rxrpc_key_token *token, **pptoken; struct rxk5_key *rxk5; const __be32 *end_xdr = xdr + (toklen >> 2); + time64_t expiry; int ret; _enter(",{%x,%x,%x,%x},%u", @@ -533,8 +536,9 @@ static int rxrpc_preparse_xdr_rxk5(struct key_preparsed_payload *prep, pptoken = &(*pptoken)->next) continue; *pptoken = token; - if (token->kad->expiry < prep->expiry) - prep->expiry = token->kad->expiry; + expiry = rxrpc_u32_to_time64(token->kad->expiry); + if (expiry < prep->expiry) + prep->expiry = expiry; _leave(" = 0"); return 0; @@ -691,6 +695,7 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) { const struct rxrpc_key_data_v1 *v1; struct rxrpc_key_token *token, **pp; + time64_t expiry; size_t plen; u32 kver; int ret; @@ -777,8 +782,9 @@ static int rxrpc_preparse(struct key_preparsed_payload *prep) while (*pp) pp = &(*pp)->next; *pp = token; - if (token->kad->expiry < prep->expiry) - prep->expiry = token->kad->expiry; + expiry = rxrpc_u32_to_time64(token->kad->expiry); + if (expiry < prep->expiry) + prep->expiry = expiry; token = NULL; ret = 0; @@ -955,7 +961,7 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval, */ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, const void *session_key, - time_t expiry, + time64_t expiry, u32 kvno) { const struct cred *cred = current_cred(); @@ -982,7 +988,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, data.kver = 1; data.v1.security_index = RXRPC_SECURITY_RXKAD; data.v1.ticket_length = 0; - data.v1.expiry = expiry; + data.v1.expiry = rxrpc_time64_to_u32(expiry); data.v1.kvno = 0; memcpy(&data.v1.session_key, session_key, sizeof(data.v1.session_key)); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 46d1a1f0b55b..34c86d2bcae5 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -854,7 +854,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, void *ticket, size_t ticket_len, struct rxrpc_crypt *_session_key, - time_t *_expiry, + time64_t *_expiry, u32 *_abort_code) { struct skcipher_request *req; @@ -864,7 +864,7 @@ static int 
rxkad_decrypt_ticket(struct rxrpc_connection *conn, struct in_addr addr; unsigned int life; const char *eproto; - time_t issue, now; + time64_t issue, now; bool little_endian; int ret; u32 abort_code; @@ -960,15 +960,15 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, if (little_endian) { __le32 stamp; memcpy(&stamp, p, 4); - issue = le32_to_cpu(stamp); + issue = rxrpc_u32_to_time64(le32_to_cpu(stamp)); } else { __be32 stamp; memcpy(&stamp, p, 4); - issue = be32_to_cpu(stamp); + issue = rxrpc_u32_to_time64(be32_to_cpu(stamp)); } p += 4; - now = get_seconds(); - _debug("KIV ISSUE: %lx [%lx]", issue, now); + now = ktime_get_real_seconds(); + _debug("KIV ISSUE: %llx [%llx]", issue, now); /* check the ticket is in date */ if (issue > now) { @@ -1053,7 +1053,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; const char *eproto; - time_t expiry; + time64_t expiry; void *ticket; u32 abort_code, version, kvno, ticket_len, level; __be32 csum; -- cgit v1.2.3 From e833251ad813168253fef9915aaf6a8c883337b0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2017 10:18:56 +0100 Subject: rxrpc: Add notification of end-of-Tx phase Add a callback to rxrpc_kernel_send_data() so that a kernel service can get a notification that the AF_RXRPC call has transitioned out the Tx phase and is now waiting for a reply or a final ACK. This is called from AF_RXRPC with the call state lock held so the notification is guaranteed to come before any reply is passed back. Further, modify the AFS filesystem to make use of this so that we don't have to change the afs_call state before sending the last bit of data. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 12 +++++++++- fs/afs/rxrpc.c | 46 +++++++++++++++++++++++++++++--------- include/net/af_rxrpc.h | 5 ++++- net/rxrpc/sendmsg.c | 34 ++++++++++++++++++++++------ 4 files changed, 77 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 8c70ba5dee4d..92a3c3bd5ac3 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -818,10 +818,15 @@ The kernel interface functions are as follows: (*) Send data through a call. + typedef void (*rxrpc_notify_end_tx_t)(struct sock *sk, + unsigned long user_call_ID, + struct sk_buff *skb); + int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, - size_t len); + size_t len, + rxrpc_notify_end_tx_t notify_end_rx); This is used to supply either the request part of a client call or the reply part of a server call. msg.msg_iovlen and msg.msg_iov specify the @@ -832,6 +837,11 @@ The kernel interface functions are as follows: The msg must not specify a destination address, control data or any flags other than MSG_MORE. len is the total amount of data to transmit. + notify_end_rx can be NULL or it can be used to specify a function to be + called when the call changes state to end the Tx phase. This function is + called with the call-state spinlock held to prevent any reply or final ACK + from being delivered first. + (*) Receive data from a call. 
int rxrpc_kernel_recv_data(struct socket *sock, diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 10743043d431..0bf191f0dbaf 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -291,6 +291,19 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); } +/* + * Advance the AFS call state when the RxRPC call ends the transmit phase. + */ +static void afs_notify_end_request_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REQUESTING) + call->state = AFS_CALL_AWAIT_REPLY; +} + /* * attach the data from a bunch of pages on an inode to a call */ @@ -310,14 +323,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - /* Have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request. - */ - if (first + nr - 1 >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, - msg, bytes); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, + bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); if (ret < 0) @@ -409,7 +416,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, rxcall, - &msg, call->request_size); + &msg, call->request_size, + afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; @@ -740,6 +748,20 @@ static int afs_deliver_cm_op_id(struct afs_call *call) return call->type->deliver(call); } +/* + * Advance the AFS call state when an RxRPC service call ends the transmit + * phase. 
+ */ +static void afs_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REPLYING) + call->state = AFS_CALL_AWAIT_ACK; +} + /* * send an empty reply */ @@ -759,7 +781,8 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0, + afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; @@ -797,7 +820,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len, + afs_notify_end_reply_tx); if (n >= 0) { /* Success */ _leave(" [replied]"); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index c172709787af..07a47ee6f783 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -21,6 +21,8 @@ struct rxrpc_call; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); +typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, + unsigned long); typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); @@ -37,7 +39,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, gfp_t, rxrpc_notify_rx_t); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, - struct msghdr *, size_t); + struct msghdr *, size_t, + rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void *, size_t, size_t *, bool, u32 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index bc7f0241d92b..344fdce89823 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -100,12 +100,24 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) spin_unlock_bh(&call->lock); } +/* + * Notify the owner of the call that the transmit phase is ended and the last + * packet has been queued. 
+ */ +static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, + rxrpc_notify_end_tx_t notify_end_tx) +{ + if (notify_end_tx) + notify_end_tx(&rx->sk, call, call->user_call_ID); +} + /* * Queue a DATA packet for transmission, set the resend timeout and send the * packet immediately */ -static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, - bool last) +static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, + struct sk_buff *skb, bool last, + rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_seq_t seq = sp->hdr.seq; @@ -141,6 +153,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, switch (call->state) { case RXRPC_CALL_CLIENT_SEND_REQUEST: call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + rxrpc_notify_end_tx(rx, call, notify_end_tx); break; case RXRPC_CALL_SERVER_ACK_REQUEST: call->state = RXRPC_CALL_SERVER_SEND_REPLY; @@ -153,6 +166,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, break; case RXRPC_CALL_SERVER_SEND_REPLY: call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + rxrpc_notify_end_tx(rx, call, notify_end_tx); break; default: break; @@ -189,7 +203,8 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, */ static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len, + rxrpc_notify_end_tx_t notify_end_tx) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; @@ -350,7 +365,9 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (ret < 0) goto out; - rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); + rxrpc_queue_packet(rx, call, skb, + !msg_data_left(msg) && !more, + notify_end_tx); skb = NULL; } } while (msg_data_left(msg) > 0); @@ -611,7 +628,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) /* Reply phase not begun or not complete for service call. */ ret = -EPROTO; } else { - ret = rxrpc_send_data(rx, call, msg, len); + ret = rxrpc_send_data(rx, call, msg, len, NULL); } mutex_unlock(&call->user_mutex); @@ -631,6 +648,7 @@ error_release_sock: * @call: The call to send data through * @msg: The data to send * @len: The amount of data to send + * @notify_end_tx: Notification that the last packet is queued. * * Allow a kernel service to send data on a call. The call must be in an state * appropriate to sending data. No control data should be supplied in @msg, @@ -638,7 +656,8 @@ error_release_sock: * more data to come, otherwise this data will end the transmission phase. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, - struct msghdr *msg, size_t len) + struct msghdr *msg, size_t len, + rxrpc_notify_end_tx_t notify_end_tx) { int ret; @@ -656,7 +675,8 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_CLIENT_SEND_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: case RXRPC_CALL_SERVER_SEND_REPLY: - ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); + ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, + notify_end_tx); break; case RXRPC_CALL_COMPLETE: read_lock_bh(&call->state_lock); -- cgit v1.2.3 From c038a58ccfd6704d4d7d60ed3d6a0fca13cf13a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Aug 2017 10:19:01 +0100 Subject: rxrpc: Allow failed client calls to be retried Allow a client call that failed on network error to be retried, provided that the Tx queue still holds DATA packet 1. 
This allows an operation to be submitted to another server or another address for the same server without having to repackage and re-encrypt the data so far processed. Two new functions are provided: (1) rxrpc_kernel_check_call() - This is used to find out the completion state of a call to assess whether it can be retried and whether it should be retried. (2) rxrpc_kernel_retry_call() - Disconnect the call from its current connection, reset the state and submit it as a new client call to a new address. The new address need not match the previous address. A call may be retried even if all the data hasn't been loaded into it yet; a partially constructed call will be retained at the same point it was at when an error condition was detected. msg_data_left() can be used to find out how much data was packaged before the error occurred. Signed-off-by: David Howells --- Documentation/networking/rxrpc.txt | 45 ++++++++++++++++ include/net/af_rxrpc.h | 16 ++++++ net/rxrpc/af_rxrpc.c | 69 +++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 19 +++---- net/rxrpc/call_object.c | 102 +++++++++++++++++++++++++++++++++++-- net/rxrpc/conn_client.c | 17 +++++-- net/rxrpc/sendmsg.c | 24 +++++---- 7 files changed, 261 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index 92a3c3bd5ac3..810620153a44 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -975,6 +975,51 @@ The kernel interface functions are as follows: size should be set when the call is begun. tx_total_len may not be less than zero. + (*) Check the completion state of a call so that the caller can assess + whether it needs to be retried. + + enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, + RXRPC_CALL_REMOTELY_ABORTED, + RXRPC_CALL_LOCALLY_ABORTED, + RXRPC_CALL_LOCAL_ERROR, + RXRPC_CALL_NETWORK_ERROR, + }; + + int rxrpc_kernel_check_call(struct socket *sock, struct rxrpc_call *call, + enum rxrpc_call_completion *_compl, + u32 *_abort_code); + + On return, -EINPROGRESS will be returned if the call is still ongoing; if + it is finished, *_compl will be set to indicate the manner of completion, + and *_abort_code will be set to any abort code that occurred. 0 will be + returned on a successful completion, -ECONNABORTED will be returned if the + call failed due to a remote abort, and anything else will be an + appropriate error code. + + The caller should look at this information to decide if it's worth + retrying the call. + + (*) Retry a client call. + + int rxrpc_kernel_retry_call(struct socket *sock, + struct rxrpc_call *call, + struct sockaddr_rxrpc *srx, + struct key *key); + + This attempts to partially reinitialise a call and submit it again whilst + reusing the original call's Tx queue to avoid the need to repackage and + re-encrypt the data to be sent. call indicates the call to retry, srx the + new address to send it to and key the encryption key to use for signing or + encrypting the packets. + + For this to work, the first Tx data packet must still be in the transmit + queue, and currently this is only permitted for local and network errors + and the call must not have been aborted. Any partially constructed Tx + packet is left as is and can continue being filled afterwards. + + It returns 0 if the call was requeued and an error otherwise.
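Taken together, a kernel service might drive a retry like this (a sketch only, not taken from an in-tree caller; sock, call and key are assumed to be supplied by the service, and next_srx is a hypothetical alternative address for the server):

	enum rxrpc_call_completion compl;
	u32 abort_code;
	int ret;

	ret = rxrpc_kernel_check_call(sock, call, &compl, &abort_code);
	if (ret == -EINPROGRESS)
		return;		/* call still running, nothing to do yet */

	if (ret < 0 &&
	    (compl == RXRPC_CALL_NETWORK_ERROR ||
	     compl == RXRPC_CALL_LOCAL_ERROR)) {
		/* next_srx is hypothetical: the next address to try */
		if (rxrpc_kernel_retry_call(sock, call, &next_srx, key) == 0)
			return;	/* requeued to the new address */
	}

	/* otherwise the call succeeded, was aborted, or cannot be retried */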
+ ======================= CONFIGURABLE PARAMETERS diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 07a47ee6f783..3ac79150291f 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -19,6 +19,18 @@ struct sock; struct socket; struct rxrpc_call; +/* + * Call completion condition (state == RXRPC_CALL_COMPLETE). + */ +enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, /* - Normal termination */ + RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ + RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ + RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ + NR__RXRPC_CALL_COMPLETIONS +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, @@ -51,5 +63,9 @@ void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); +int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, + struct sockaddr_rxrpc *, struct key *); +int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, + enum rxrpc_call_completion *, u32 *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 31e97f714ca9..fb17552fd292 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -336,6 +336,75 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) } EXPORT_SYMBOL(rxrpc_kernel_end_call); +/** + * rxrpc_kernel_check_call - Check a call's state + * @sock: The socket the call is on + * @call: The call to check + * @_compl: Where to store the completion state + * @_abort_code: Where to store any abort code + * + * Allow a kernel service to query the state of a call and find out the manner + * of its termination if it has completed. Returns -EINPROGRESS if the call is + * still going, 0 if the call finished successfully, -ECONNABORTED if the call + * was aborted and an appropriate error if the call failed in some other way. + */ +int rxrpc_kernel_check_call(struct socket *sock, struct rxrpc_call *call, + enum rxrpc_call_completion *_compl, u32 *_abort_code) +{ + if (call->state != RXRPC_CALL_COMPLETE) + return -EINPROGRESS; + smp_rmb(); + *_compl = call->completion; + *_abort_code = call->abort_code; + return call->error; +} +EXPORT_SYMBOL(rxrpc_kernel_check_call); + +/** + * rxrpc_kernel_retry_call - Allow a kernel service to retry a call + * @sock: The socket the call is on + * @call: The call to retry + * @srx: The address of the peer to contact + * @key: The security context to use (defaults to socket setting) + * + * Allow a kernel service to try resending a client call that failed due to a + * network error to a new address. The Tx queue is maintained intact, thereby + * relieving the need to re-encrypt any request data that has already been + * buffered. 
+ */ +int rxrpc_kernel_retry_call(struct socket *sock, struct rxrpc_call *call, + struct sockaddr_rxrpc *srx, struct key *key) +{ + struct rxrpc_conn_parameters cp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + int ret; + + _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); + + if (!key) + key = rx->key; + if (key && !key->payload.data[0]) + key = NULL; /* a no-security key */ + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = key; + cp.security_level = 0; + cp.exclusive = false; + cp.service_id = srx->srx_service; + + mutex_lock(&call->user_mutex); + + ret = rxrpc_prepare_call_for_retry(rx, call); + if (ret == 0) + ret = rxrpc_retry_client_call(rx, call, &cp, srx, GFP_KERNEL); + + mutex_unlock(&call->user_mutex); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(rxrpc_kernel_retry_call); + /** * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 227e24794055..ea5600b747cc 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -445,6 +445,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_TX_LASTQ, /* Last packet has been queued */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_PINGING, /* Ping in process */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ @@ -481,18 +482,6 @@ enum rxrpc_call_state { NR__RXRPC_CALL_STATES }; -/* - * Call completion condition (state == RXRPC_CALL_COMPLETE). - */ -enum rxrpc_call_completion { - RXRPC_CALL_SUCCEEDED, /* - Normal termination */ - RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ - RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ - RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ - RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - NR__RXRPC_CALL_COMPLETIONS -}; - /* * Call Tx congestion management modes. */ @@ -687,9 +676,15 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, unsigned long, s64, gfp_t); +int rxrpc_retry_client_call(struct rxrpc_sock *, + struct rxrpc_call *, + struct rxrpc_conn_parameters *, + struct sockaddr_rxrpc *, + gfp_t); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); +int rxrpc_prepare_call_for_retry(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); bool rxrpc_queue_call(struct rxrpc_call *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d7809a0620b4..fcdd6555a820 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -269,11 +269,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), here, NULL); - spin_lock_bh(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, - &call->conn->params.peer->error_targets); - spin_unlock_bh(&call->conn->params.peer->lock); - rxrpc_start_call_timer(call); _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); @@ -303,6 +298,48 @@ error: return ERR_PTR(ret); } +/* + * Retry a call to a new address. 
It is expected that the Tx queue of the call + * will contain data previously packaged for an old call. + */ +int rxrpc_retry_client_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + const void *here = __builtin_return_address(0); + int ret; + + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. + */ + ret = rxrpc_connect_call(call, cp, srx, gfp); + if (ret < 0) + goto error; + + trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), + here, NULL); + + rxrpc_start_call_timer(call); + + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); + + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + + _leave(" = 0"); + return 0; + +error: + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_CALL_DEAD, ret); + trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), + here, ERR_PTR(ret)); + _leave(" = %d", ret); + return ret; +} + /* * Set up an incoming call. call->conn points to the connection. * This is called in BH context and isn't allowed to fail. @@ -470,6 +507,61 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) _leave(""); } +/* + * Prepare a kernel service call for retry. + */ +int rxrpc_prepare_call_for_retry(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); + int i; + u8 last = 0; + + _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); + + trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + here, (const void *)call->flags); + + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); + ASSERTCMP(call->completion, !=, RXRPC_CALL_REMOTELY_ABORTED); + ASSERTCMP(call->completion, !=, RXRPC_CALL_LOCALLY_ABORTED); + ASSERT(list_empty(&call->recvmsg_link)); + + del_timer_sync(&call->timer); + + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, call->conn); + + if (call->conn) + rxrpc_disconnect_call(call); + + if (rxrpc_is_service_call(call) || + !call->tx_phase || + call->tx_hard_ack != 0 || + call->rx_hard_ack != 0 || + call->rx_top != 0) + return -EINVAL; + + call->state = RXRPC_CALL_UNINITIALISED; + call->completion = RXRPC_CALL_SUCCEEDED; + call->call_id = 0; + call->cid = 0; + call->cong_cwnd = 0; + call->cong_extra = 0; + call->cong_ssthresh = 0; + call->cong_mode = 0; + call->cong_dup_acks = 0; + call->cong_cumul_acks = 0; + call->acks_lowest_nak = 0; + + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + last |= call->rxtx_annotations[i]; + call->rxtx_annotations[i] &= RXRPC_TX_ANNO_LAST; + call->rxtx_annotations[i] |= RXRPC_TX_ANNO_RETRANS; + } + + _leave(" = 0"); + return 0; +} + /* * release all the calls associated with a socket */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index eb2157680399..5f9624bd311c 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -555,7 +555,10 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); write_lock_bh(&call->state_lock); - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + if (!test_bit(RXRPC_CALL_TX_LASTQ, &call->flags)) + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + else + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; write_unlock_bh(&call->state_lock); rxrpc_see_call(call); @@ -688,15 +691,23 @@ int rxrpc_connect_call(struct rxrpc_call *call, ret = rxrpc_get_client_conn(call, cp, 
srx, gfp); if (ret < 0) - return ret; + goto out; rxrpc_animate_client_conn(rxnet, call->conn); rxrpc_activate_channels(call->conn); ret = rxrpc_wait_for_channel(call, gfp); - if (ret < 0) + if (ret < 0) { rxrpc_disconnect_client_call(call); + goto out; + } + + spin_lock_bh(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, + &call->conn->params.peer->error_targets); + spin_unlock_bh(&call->conn->params.peer->lock); +out: _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 344fdce89823..9ea6f972767e 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -128,8 +128,10 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, ASSERTCMP(seq, ==, call->tx_top + 1); - if (last) + if (last) { annotation |= RXRPC_TX_ANNO_LAST; + set_bit(RXRPC_CALL_TX_LASTQ, &call->flags); + } /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. @@ -326,11 +328,6 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, call->tx_total_len -= copy; } - /* check for the far side aborting the call or a network error - * occurring */ - if (call->state == RXRPC_CALL_COMPLETE) - goto call_terminated; - /* add the packet to the send queue if it's now full */ if (sp->remain <= 0 || (msg_data_left(msg) == 0 && !more)) { @@ -370,6 +367,16 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, notify_end_tx); skb = NULL; } + + /* Check for the far side aborting the call or a network error + * occurring. If this happens, save any packet that was under + * construction so that in the case of a network error, the + * call can be retried or redirected. + */ + if (call->state == RXRPC_CALL_COMPLETE) { + ret = call->error; + goto out; + } } while (msg_data_left(msg) > 0); success: @@ -379,11 +386,6 @@ out: _leave(" = %d", ret); return ret; -call_terminated: - rxrpc_free_skb(skb, rxrpc_skb_tx_freed); - _leave(" = %d", call->error); - return call->error; - maybe_error: if (copied) goto success; -- cgit v1.2.3 From c3142dd8bedb8b78ee13c885dd92093fc8a50277 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 15 Aug 2017 22:04:56 +0200 Subject: power: supply: Add power_supply_set_input_current_limit_from_supplier helper On some devices the USB Type-C port power (USB PD 2.0) negotiation is done by a separate port-controller IC, while the current limit is controlled through another (charger) IC. It has been decided to model this by modelling the external Type-C power brick (adapter/charger) as a power-supply class device which supplies the charger-IC, with its voltage-now and current-max representing the negotiated voltage and max current draw. This commit adds a power_supply_set_input_current_limit_from_supplier helper function which charger power-supply drivers can call to get the max-current from their supplier and have this applied through their set_property call-back to their input-current-limit. 
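For example, a charger driver could apply the negotiated limit whenever its supplier changes (a sketch rather than an in-tree driver; the my_charger_* names are hypothetical):

/* Hypothetical charger driver, for illustration only */
static void my_charger_external_power_changed(struct power_supply *psy)
{
	/* Our supplier (the Type-C port power-supply device) changed:
	 * re-read its CURRENT_MAX and apply it through our own
	 * set_property() callback as POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT.
	 */
	power_supply_set_input_current_limit_from_supplier(psy);
}

static const struct power_supply_desc my_charger_desc = {
	/* .name, .properties, .get_property, .set_property elided */
	.external_power_changed	= my_charger_external_power_changed,
};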
Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- drivers/power/supply/power_supply_core.c | 41 ++++++++++++++++++++++++++++++++ include/linux/power_supply.h | 2 ++ 2 files changed, 43 insertions(+) (limited to 'include') diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c index df5374afd214..02c6340ae36f 100644 --- a/drivers/power/supply/power_supply_core.c +++ b/drivers/power/supply/power_supply_core.c @@ -371,6 +371,47 @@ int power_supply_is_system_supplied(void) } EXPORT_SYMBOL_GPL(power_supply_is_system_supplied); +static int __power_supply_get_supplier_max_current(struct device *dev, + void *data) +{ + union power_supply_propval ret = {0,}; + struct power_supply *epsy = dev_get_drvdata(dev); + struct power_supply *psy = data; + + if (__power_supply_is_supplied_by(epsy, psy)) + if (!epsy->desc->get_property(epsy, + POWER_SUPPLY_PROP_CURRENT_MAX, + &ret)) + return ret.intval; + + return 0; +} + +int power_supply_set_input_current_limit_from_supplier(struct power_supply *psy) +{ + union power_supply_propval val = {0,}; + int curr; + + if (!psy->desc->set_property) + return -EINVAL; + + /* + * This function is not intended for use with a supply with multiple + * suppliers, we simply pick the first supply to report a non 0 + * max-current. + */ + curr = class_for_each_device(power_supply_class, NULL, psy, + __power_supply_get_supplier_max_current); + if (curr <= 0) + return (curr == 0) ? -ENODEV : curr; + + val.intval = curr; + + return psy->desc->set_property(psy, + POWER_SUPPLY_PROP_INPUT_CURRENT_LIMIT, &val); +} +EXPORT_SYMBOL_GPL(power_supply_set_input_current_limit_from_supplier); + int power_supply_set_battery_charged(struct power_supply *psy) { if (atomic_read(&psy->use_cnt) >= 0 && diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index de89066b72b1..79e90b3d3288 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -332,6 +332,8 @@ extern int power_supply_get_battery_info(struct power_supply *psy, struct power_supply_battery_info *info); extern void power_supply_changed(struct power_supply *psy); extern int power_supply_am_i_supplied(struct power_supply *psy); +extern int power_supply_set_input_current_limit_from_supplier( + struct power_supply *psy); extern int power_supply_set_battery_charged(struct power_supply *psy); #ifdef CONFIG_POWER_SUPPLY -- cgit v1.2.3 From 3a731c6414c94012328f485b4b1bb88ed841f9eb Mon Sep 17 00:00:00 2001 From: Liam Breck Date: Wed, 23 Aug 2017 20:36:14 -0700 Subject: power: supply: bq27xxx: Add chip IDs for previously shadowed chips For the existing features, these chips act like others already ID'd, so they had false but functional IDs. We will be adding features which require correct IDs, so the following IDs are added: BQ2752X, 531, 542, 546, 742, 425, 441, 621 Chip-specific features are now tracked by BQ27XXX_O_* flags in di->opts. No functional changes to the driver. 
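The shape of the change shows up in the feature tests; an illustrative before/after pairing drawn from the diff below:

	/* before: enumerate every chip ID that shares the behaviour */
	if (di->chip == BQ27000 || di->chip == BQ27010)
		soc = bq27xxx_read(di, BQ27XXX_REG_SOC, true);

	/* after: test the capability bit set in bq27xxx_chip_data[],
	 * so newly added chip IDs inherit behaviour via BQ27XXX_O_* flags
	 */
	if (di->opts & BQ27XXX_O_ZERO)
		soc = bq27xxx_read(di, BQ27XXX_REG_SOC, true);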
Signed-off-by: Liam Breck Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 120 +++++++++++++++++------------ drivers/power/supply/bq27xxx_battery_i2c.c | 16 ++-- include/linux/power/bq27xxx_battery.h | 9 +++ 3 files changed, 86 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 00322ccf09b5..cf0f19e2a639 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -221,6 +221,7 @@ static u8 [BQ27XXX_REG_AP] = INVALID_REG_ADDR, BQ27XXX_DM_REG_ROWS, }, +#define bq2752x_regs bq2751x_regs bq27500_regs[BQ27XXX_REG_MAX] = { [BQ27XXX_REG_CTRL] = 0x00, [BQ27XXX_REG_TEMP] = 0x06, @@ -401,6 +402,7 @@ static u8 [BQ27XXX_REG_AP] = 0x24, BQ27XXX_DM_REG_ROWS, }, +#define bq27531_regs bq27530_regs bq27541_regs[BQ27XXX_REG_MAX] = { [BQ27XXX_REG_CTRL] = 0x00, [BQ27XXX_REG_TEMP] = 0x06, @@ -421,6 +423,9 @@ static u8 [BQ27XXX_REG_AP] = 0x24, BQ27XXX_DM_REG_ROWS, }, +#define bq27542_regs bq27541_regs +#define bq27546_regs bq27541_regs +#define bq27742_regs bq27541_regs bq27545_regs[BQ27XXX_REG_MAX] = { [BQ27XXX_REG_CTRL] = 0x00, [BQ27XXX_REG_TEMP] = 0x06, @@ -461,6 +466,9 @@ static u8 [BQ27XXX_REG_AP] = 0x18, BQ27XXX_DM_REG_ROWS, }; +#define bq27425_regs bq27421_regs +#define bq27441_regs bq27421_regs +#define bq27621_regs bq27421_regs static enum power_supply_property bq27000_props[] = { POWER_SUPPLY_PROP_STATUS, @@ -539,6 +547,7 @@ static enum power_supply_property bq2751x_props[] = { POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_MANUFACTURER, }; +#define bq2752x_props bq2751x_props static enum power_supply_property bq27500_props[] = { POWER_SUPPLY_PROP_STATUS, @@ -716,6 +725,7 @@ static enum power_supply_property bq27530_props[] = { POWER_SUPPLY_PROP_CYCLE_COUNT, POWER_SUPPLY_PROP_MANUFACTURER, }; +#define bq27531_props bq27530_props static enum power_supply_property bq27541_props[] = { POWER_SUPPLY_PROP_STATUS, @@ -735,6 +745,9 @@ static enum power_supply_property bq27541_props[] = { POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_MANUFACTURER, }; +#define bq27542_props bq27541_props +#define bq27546_props bq27541_props +#define bq27742_props bq27541_props static enum power_supply_property bq27545_props[] = { POWER_SUPPLY_PROP_STATUS, @@ -768,33 +781,50 @@ static enum power_supply_property bq27421_props[] = { POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN, POWER_SUPPLY_PROP_MANUFACTURER, }; +#define bq27425_props bq27421_props +#define bq27441_props bq27421_props +#define bq27621_props bq27421_props -#define BQ27XXX_DATA(ref) { \ +#define BQ27XXX_O_ZERO 0x00000001 +#define BQ27XXX_O_OTDC 0x00000002 +#define BQ27XXX_O_UTOT 0x00000004 + +#define BQ27XXX_DATA(ref, opt) { \ + .opts = (opt), \ .regs = ref##_regs, \ .props = ref##_props, \ .props_size = ARRAY_SIZE(ref##_props) } static struct { + u32 opts; u8 *regs; enum power_supply_property *props; size_t props_size; } bq27xxx_chip_data[] = { - [BQ27000] = BQ27XXX_DATA(bq27000), - [BQ27010] = BQ27XXX_DATA(bq27010), - [BQ2750X] = BQ27XXX_DATA(bq2750x), - [BQ2751X] = BQ27XXX_DATA(bq2751x), - [BQ27500] = BQ27XXX_DATA(bq27500), - [BQ27510G1] = BQ27XXX_DATA(bq27510g1), - [BQ27510G2] = BQ27XXX_DATA(bq27510g2), - [BQ27510G3] = BQ27XXX_DATA(bq27510g3), - [BQ27520G1] = BQ27XXX_DATA(bq27520g1), - [BQ27520G2] = BQ27XXX_DATA(bq27520g2), - [BQ27520G3] = BQ27XXX_DATA(bq27520g3), - [BQ27520G4] = BQ27XXX_DATA(bq27520g4), - [BQ27530] = BQ27XXX_DATA(bq27530), - [BQ27541] = BQ27XXX_DATA(bq27541), - [BQ27545] = 
BQ27XXX_DATA(bq27545), - [BQ27421] = BQ27XXX_DATA(bq27421), + [BQ27000] = BQ27XXX_DATA(bq27000, BQ27XXX_O_ZERO), + [BQ27010] = BQ27XXX_DATA(bq27010, BQ27XXX_O_ZERO), + [BQ2750X] = BQ27XXX_DATA(bq2750x, BQ27XXX_O_OTDC), + [BQ2751X] = BQ27XXX_DATA(bq2751x, BQ27XXX_O_OTDC), + [BQ2752X] = BQ27XXX_DATA(bq2752x, BQ27XXX_O_OTDC), + [BQ27500] = BQ27XXX_DATA(bq27500, BQ27XXX_O_OTDC), + [BQ27510G1] = BQ27XXX_DATA(bq27510g1, BQ27XXX_O_OTDC), + [BQ27510G2] = BQ27XXX_DATA(bq27510g2, BQ27XXX_O_OTDC), + [BQ27510G3] = BQ27XXX_DATA(bq27510g3, BQ27XXX_O_OTDC), + [BQ27520G1] = BQ27XXX_DATA(bq27520g1, BQ27XXX_O_OTDC), + [BQ27520G2] = BQ27XXX_DATA(bq27520g2, BQ27XXX_O_OTDC), + [BQ27520G3] = BQ27XXX_DATA(bq27520g3, BQ27XXX_O_OTDC), + [BQ27520G4] = BQ27XXX_DATA(bq27520g4, BQ27XXX_O_OTDC), + [BQ27530] = BQ27XXX_DATA(bq27530, BQ27XXX_O_UTOT), + [BQ27531] = BQ27XXX_DATA(bq27531, BQ27XXX_O_UTOT), + [BQ27541] = BQ27XXX_DATA(bq27541, BQ27XXX_O_OTDC), + [BQ27542] = BQ27XXX_DATA(bq27542, BQ27XXX_O_OTDC), + [BQ27546] = BQ27XXX_DATA(bq27546, BQ27XXX_O_OTDC), + [BQ27742] = BQ27XXX_DATA(bq27742, BQ27XXX_O_OTDC), + [BQ27545] = BQ27XXX_DATA(bq27545, BQ27XXX_O_OTDC), + [BQ27421] = BQ27XXX_DATA(bq27421, BQ27XXX_O_UTOT), + [BQ27425] = BQ27XXX_DATA(bq27425, BQ27XXX_O_UTOT), + [BQ27441] = BQ27XXX_DATA(bq27441, BQ27XXX_O_UTOT), + [BQ27621] = BQ27XXX_DATA(bq27621, BQ27XXX_O_UTOT), }; static DEFINE_MUTEX(bq27xxx_list_lock); @@ -1327,7 +1357,7 @@ static int bq27xxx_battery_read_soc(struct bq27xxx_device_info *di) { int soc; - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) soc = bq27xxx_read(di, BQ27XXX_REG_SOC, true); else soc = bq27xxx_read(di, BQ27XXX_REG_SOC, false); @@ -1353,7 +1383,7 @@ static int bq27xxx_battery_read_charge(struct bq27xxx_device_info *di, u8 reg) return charge; } - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) charge *= BQ27XXX_CURRENT_CONSTANT / BQ27XXX_RS; else charge *= 1000; @@ -1369,7 +1399,7 @@ static inline int bq27xxx_battery_read_nac(struct bq27xxx_device_info *di) { int flags; - if (di->chip == BQ27000 || di->chip == BQ27010) { + if (di->opts & BQ27XXX_O_ZERO) { flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, true); if (flags >= 0 && (flags & BQ27000_FLAG_CI)) return -ENODATA; @@ -1395,7 +1425,7 @@ static int bq27xxx_battery_read_dcap(struct bq27xxx_device_info *di) { int dcap; - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) dcap = bq27xxx_read(di, BQ27XXX_REG_DCAP, true); else dcap = bq27xxx_read(di, BQ27XXX_REG_DCAP, false); @@ -1405,7 +1435,7 @@ static int bq27xxx_battery_read_dcap(struct bq27xxx_device_info *di) return dcap; } - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) dcap = (dcap << 8) * BQ27XXX_CURRENT_CONSTANT / BQ27XXX_RS; else dcap *= 1000; @@ -1427,7 +1457,7 @@ static int bq27xxx_battery_read_energy(struct bq27xxx_device_info *di) return ae; } - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) ae *= BQ27XXX_POWER_CONSTANT / BQ27XXX_RS; else ae *= 1000; @@ -1449,7 +1479,7 @@ static int bq27xxx_battery_read_temperature(struct bq27xxx_device_info *di) return temp; } - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) temp = 5 * temp / 2; return temp; @@ -1506,7 +1536,7 @@ static int bq27xxx_battery_read_pwr_avg(struct bq27xxx_device_info *di) return tval; } - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) return (tval * BQ27XXX_POWER_CONSTANT) / BQ27XXX_RS; else 
return tval; @@ -1517,26 +1547,12 @@ static int bq27xxx_battery_read_pwr_avg(struct bq27xxx_device_info *di) */ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags) { - switch (di->chip) { - case BQ2750X: - case BQ2751X: - case BQ27500: - case BQ27510G1: - case BQ27510G2: - case BQ27510G3: - case BQ27520G1: - case BQ27520G2: - case BQ27520G3: - case BQ27520G4: - case BQ27541: - case BQ27545: + if (di->opts & BQ27XXX_O_OTDC) return flags & (BQ27XXX_FLAG_OTC | BQ27XXX_FLAG_OTD); - case BQ27530: - case BQ27421: + if (di->opts & BQ27XXX_O_UTOT) return flags & BQ27XXX_FLAG_OT; - default: - return false; - } + + return false; } /* @@ -1544,7 +1560,7 @@ static bool bq27xxx_battery_overtemp(struct bq27xxx_device_info *di, u16 flags) */ static bool bq27xxx_battery_undertemp(struct bq27xxx_device_info *di, u16 flags) { - if (di->chip == BQ27530 || di->chip == BQ27421) + if (di->opts & BQ27XXX_O_UTOT) return flags & BQ27XXX_FLAG_UT; return false; @@ -1555,7 +1571,7 @@ static bool bq27xxx_battery_undertemp(struct bq27xxx_device_info *di, u16 flags) */ static bool bq27xxx_battery_dead(struct bq27xxx_device_info *di, u16 flags) { - if (di->chip == BQ27000 || di->chip == BQ27010) + if (di->opts & BQ27XXX_O_ZERO) return flags & (BQ27000_FLAG_EDV1 | BQ27000_FLAG_EDVF); else return flags & (BQ27XXX_FLAG_SOC1 | BQ27XXX_FLAG_SOCF); @@ -1568,7 +1584,7 @@ static bool bq27xxx_battery_dead(struct bq27xxx_device_info *di, u16 flags) static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) { int flags; - bool has_singe_flag = di->chip == BQ27000 || di->chip == BQ27010; + bool has_singe_flag = di->opts & BQ27XXX_O_ZERO; flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); if (flags < 0) { @@ -1590,8 +1606,8 @@ static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di) void bq27xxx_battery_update(struct bq27xxx_device_info *di) { struct bq27xxx_reg_cache cache = {0, }; - bool has_ci_flag = di->chip == BQ27000 || di->chip == BQ27010; - bool has_singe_flag = di->chip == BQ27000 || di->chip == BQ27010; + bool has_ci_flag = di->opts & BQ27XXX_O_ZERO; + bool has_singe_flag = di->opts & BQ27XXX_O_ZERO; cache.flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, has_singe_flag); if ((cache.flags & 0xff) == 0xff) @@ -1669,7 +1685,7 @@ static int bq27xxx_battery_current(struct bq27xxx_device_info *di, return curr; } - if (di->chip == BQ27000 || di->chip == BQ27010) { + if (di->opts & BQ27XXX_O_ZERO) { flags = bq27xxx_read(di, BQ27XXX_REG_FLAGS, true); if (flags & BQ27000_FLAG_CHGS) { dev_dbg(di->dev, "negative current!\n"); @@ -1690,7 +1706,7 @@ static int bq27xxx_battery_status(struct bq27xxx_device_info *di, { int status; - if (di->chip == BQ27000 || di->chip == BQ27010) { + if (di->opts & BQ27XXX_O_ZERO) { if (di->cache.flags & BQ27000_FLAG_FC) status = POWER_SUPPLY_STATUS_FULL; else if (di->cache.flags & BQ27000_FLAG_CHGS) @@ -1718,7 +1734,7 @@ static int bq27xxx_battery_capacity_level(struct bq27xxx_device_info *di, { int level; - if (di->chip == BQ27000 || di->chip == BQ27010) { + if (di->opts & BQ27XXX_O_ZERO) { if (di->cache.flags & BQ27000_FLAG_FC) level = POWER_SUPPLY_CAPACITY_LEVEL_FULL; else if (di->cache.flags & BQ27000_FLAG_EDV1) @@ -1883,7 +1899,9 @@ int bq27xxx_battery_setup(struct bq27xxx_device_info *di) INIT_DELAYED_WORK(&di->work, bq27xxx_battery_poll); mutex_init(&di->lock); + di->regs = bq27xxx_chip_data[di->chip].regs; + di->opts = bq27xxx_chip_data[di->chip].opts; psy_desc = devm_kzalloc(di->dev, sizeof(*psy_desc), GFP_KERNEL); if (!psy_desc) diff 
--git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c index a5972214f074..0b11ed472f33 100644 --- a/drivers/power/supply/bq27xxx_battery_i2c.c +++ b/drivers/power/supply/bq27xxx_battery_i2c.c @@ -230,7 +230,7 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = { { "bq27210", BQ27010 }, { "bq27500", BQ2750X }, { "bq27510", BQ2751X }, - { "bq27520", BQ2751X }, + { "bq27520", BQ2752X }, { "bq27500-1", BQ27500 }, { "bq27510g1", BQ27510G1 }, { "bq27510g2", BQ27510G2 }, @@ -240,16 +240,16 @@ static const struct i2c_device_id bq27xxx_i2c_id_table[] = { { "bq27520g3", BQ27520G3 }, { "bq27520g4", BQ27520G4 }, { "bq27530", BQ27530 }, - { "bq27531", BQ27530 }, + { "bq27531", BQ27531 }, { "bq27541", BQ27541 }, - { "bq27542", BQ27541 }, - { "bq27546", BQ27541 }, - { "bq27742", BQ27541 }, + { "bq27542", BQ27542 }, + { "bq27546", BQ27546 }, + { "bq27742", BQ27742 }, { "bq27545", BQ27545 }, { "bq27421", BQ27421 }, - { "bq27425", BQ27421 }, - { "bq27441", BQ27421 }, - { "bq27621", BQ27421 }, + { "bq27425", BQ27425 }, + { "bq27441", BQ27441 }, + { "bq27621", BQ27621 }, {}, }; MODULE_DEVICE_TABLE(i2c, bq27xxx_i2c_id_table); diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 89890437f9ab..e48e7a4e9cd6 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -6,6 +6,7 @@ enum bq27xxx_chip { BQ27010, /* bq27010, bq27210 */ BQ2750X, /* bq27500 deprecated alias */ BQ2751X, /* bq27510, bq27520 deprecated alias */ + BQ2752X, BQ27500, /* bq27500/1 */ BQ27510G1, /* bq27510G1 */ BQ27510G2, /* bq27510G2 */ @@ -15,9 +16,16 @@ enum bq27xxx_chip { BQ27520G3, /* bq27520G3 */ BQ27520G4, /* bq27520G4 */ BQ27530, /* bq27530, bq27531 */ + BQ27531, BQ27541, /* bq27541, bq27542, bq27546, bq27742 */ + BQ27542, + BQ27546, + BQ27742, BQ27545, /* bq27545 */ BQ27421, /* bq27421, bq27425, bq27441, bq27621 */ + BQ27425, + BQ27441, + BQ27621, }; struct bq27xxx_device_info; @@ -47,6 +55,7 @@ struct bq27xxx_device_info { int id; enum bq27xxx_chip chip; bool ram_chip; + u32 opts; const char *name; struct bq27xxx_dm_reg *dm_regs; u32 unseal_key; -- cgit v1.2.3 From 05045379b2740686020618c8cfd4b517cff9f918 Mon Sep 17 00:00:00 2001 From: Liam Breck Date: Wed, 23 Aug 2017 20:36:15 -0700 Subject: power: supply: bq27xxx: Enable data memory update for certain chips Support data memory update on BQ27425. Parameters from TI datasheets are also provided for BQ27500, 545, 421, 441, 621; however these are commented out, as they are not tested. Add BQ27XXX_O_CFGUP & _O_RAM for use in bq27xxx_chip_data[n].opts and by data memory update functions. 
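For illustration, a minimal sketch (not part of the patch) of how one of these data memory tables is meant to be consumed; bq27xxx_dm_reg_valid() is a hypothetical helper. Each bq27xxx_dm_reg entry describes one data-memory register, so a proposed value can be range-checked before it is written to the chip:

static bool bq27xxx_dm_reg_valid(struct bq27xxx_dm_reg *dm_regs,
				 enum bq27xxx_dm_reg_id id,
				 unsigned int val)
{
	struct bq27xxx_dm_reg *reg;

	if (!dm_regs)		/* chip has no tested table yet */
		return false;

	reg = &dm_regs[id];
	if (!reg->bytes)	/* register missing on this chip */
		return false;

	return val >= reg->min && val <= reg->max;
}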
Signed-off-by: Liam Breck Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 175 ++++++++++++++++++++++++--------- include/linux/power/bq27xxx_battery.h | 1 - 2 files changed, 126 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index cf0f19e2a639..cf3e8a173bc6 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -58,8 +58,6 @@ #include -#define DRIVER_VERSION "1.2.0" - #define BQ27XXX_MANUFACTURER "Texas Instruments" /* BQ27XXX Flags */ @@ -785,46 +783,138 @@ static enum power_supply_property bq27421_props[] = { #define bq27441_props bq27421_props #define bq27621_props bq27421_props +struct bq27xxx_dm_reg { + u8 subclass_id; + u8 offset; + u8 bytes; + u16 min, max; +}; + +enum bq27xxx_dm_reg_id { + BQ27XXX_DM_DESIGN_CAPACITY = 0, + BQ27XXX_DM_DESIGN_ENERGY, + BQ27XXX_DM_TERMINATE_VOLTAGE, +}; + +#define bq27000_dm_regs 0 +#define bq27010_dm_regs 0 +#define bq2750x_dm_regs 0 +#define bq2751x_dm_regs 0 +#define bq2752x_dm_regs 0 + +#if 0 /* not yet tested */ +static struct bq27xxx_dm_reg bq27500_dm_regs[] = { + [BQ27XXX_DM_DESIGN_CAPACITY] = { 48, 10, 2, 0, 65535 }, + [BQ27XXX_DM_DESIGN_ENERGY] = { }, /* missing on chip */ + [BQ27XXX_DM_TERMINATE_VOLTAGE] = { 80, 48, 2, 1000, 32767 }, +}; +#else +#define bq27500_dm_regs 0 +#endif + +/* todo create data memory definitions from datasheets and test on chips */ +#define bq27510g1_dm_regs 0 +#define bq27510g2_dm_regs 0 +#define bq27510g3_dm_regs 0 +#define bq27520g1_dm_regs 0 +#define bq27520g2_dm_regs 0 +#define bq27520g3_dm_regs 0 +#define bq27520g4_dm_regs 0 +#define bq27530_dm_regs 0 +#define bq27531_dm_regs 0 +#define bq27541_dm_regs 0 +#define bq27542_dm_regs 0 +#define bq27546_dm_regs 0 +#define bq27742_dm_regs 0 + +#if 0 /* not yet tested */ +static struct bq27xxx_dm_reg bq27545_dm_regs[] = { + [BQ27XXX_DM_DESIGN_CAPACITY] = { 48, 23, 2, 0, 32767 }, + [BQ27XXX_DM_DESIGN_ENERGY] = { 48, 25, 2, 0, 32767 }, + [BQ27XXX_DM_TERMINATE_VOLTAGE] = { 80, 67, 2, 2800, 3700 }, +}; +#else +#define bq27545_dm_regs 0 +#endif + +#if 0 /* not yet tested */ +static struct bq27xxx_dm_reg bq27421_dm_regs[] = { + [BQ27XXX_DM_DESIGN_CAPACITY] = { 82, 10, 2, 0, 8000 }, + [BQ27XXX_DM_DESIGN_ENERGY] = { 82, 12, 2, 0, 32767 }, + [BQ27XXX_DM_TERMINATE_VOLTAGE] = { 82, 16, 2, 2500, 3700 }, +}; +#else +#define bq27421_dm_regs 0 +#endif + +static struct bq27xxx_dm_reg bq27425_dm_regs[] = { + [BQ27XXX_DM_DESIGN_CAPACITY] = { 82, 12, 2, 0, 32767 }, + [BQ27XXX_DM_DESIGN_ENERGY] = { 82, 14, 2, 0, 32767 }, + [BQ27XXX_DM_TERMINATE_VOLTAGE] = { 82, 18, 2, 2800, 3700 }, +}; + +#if 0 /* not yet tested */ +#define bq27441_dm_regs bq27421_dm_regs +#else +#define bq27441_dm_regs 0 +#endif + +#if 0 /* not yet tested */ +static struct bq27xxx_dm_reg bq27621_dm_regs[] = { + [BQ27XXX_DM_DESIGN_CAPACITY] = { 82, 3, 2, 0, 8000 }, + [BQ27XXX_DM_DESIGN_ENERGY] = { 82, 5, 2, 0, 32767 }, + [BQ27XXX_DM_TERMINATE_VOLTAGE] = { 82, 9, 2, 2500, 3700 }, +}; +#else +#define bq27621_dm_regs 0 +#endif + #define BQ27XXX_O_ZERO 0x00000001 #define BQ27XXX_O_OTDC 0x00000002 #define BQ27XXX_O_UTOT 0x00000004 +#define BQ27XXX_O_CFGUP 0x00000008 +#define BQ27XXX_O_RAM 0x00000010 -#define BQ27XXX_DATA(ref, opt) { \ +#define BQ27XXX_DATA(ref, key, opt) { \ .opts = (opt), \ + .unseal_key = key, \ .regs = ref##_regs, \ + .dm_regs = ref##_dm_regs, \ .props = ref##_props, \ .props_size = ARRAY_SIZE(ref##_props) } static struct { u32 opts; + 
u32 unseal_key; u8 *regs; + struct bq27xxx_dm_reg *dm_regs; enum power_supply_property *props; size_t props_size; } bq27xxx_chip_data[] = { - [BQ27000] = BQ27XXX_DATA(bq27000, BQ27XXX_O_ZERO), - [BQ27010] = BQ27XXX_DATA(bq27010, BQ27XXX_O_ZERO), - [BQ2750X] = BQ27XXX_DATA(bq2750x, BQ27XXX_O_OTDC), - [BQ2751X] = BQ27XXX_DATA(bq2751x, BQ27XXX_O_OTDC), - [BQ2752X] = BQ27XXX_DATA(bq2752x, BQ27XXX_O_OTDC), - [BQ27500] = BQ27XXX_DATA(bq27500, BQ27XXX_O_OTDC), - [BQ27510G1] = BQ27XXX_DATA(bq27510g1, BQ27XXX_O_OTDC), - [BQ27510G2] = BQ27XXX_DATA(bq27510g2, BQ27XXX_O_OTDC), - [BQ27510G3] = BQ27XXX_DATA(bq27510g3, BQ27XXX_O_OTDC), - [BQ27520G1] = BQ27XXX_DATA(bq27520g1, BQ27XXX_O_OTDC), - [BQ27520G2] = BQ27XXX_DATA(bq27520g2, BQ27XXX_O_OTDC), - [BQ27520G3] = BQ27XXX_DATA(bq27520g3, BQ27XXX_O_OTDC), - [BQ27520G4] = BQ27XXX_DATA(bq27520g4, BQ27XXX_O_OTDC), - [BQ27530] = BQ27XXX_DATA(bq27530, BQ27XXX_O_UTOT), - [BQ27531] = BQ27XXX_DATA(bq27531, BQ27XXX_O_UTOT), - [BQ27541] = BQ27XXX_DATA(bq27541, BQ27XXX_O_OTDC), - [BQ27542] = BQ27XXX_DATA(bq27542, BQ27XXX_O_OTDC), - [BQ27546] = BQ27XXX_DATA(bq27546, BQ27XXX_O_OTDC), - [BQ27742] = BQ27XXX_DATA(bq27742, BQ27XXX_O_OTDC), - [BQ27545] = BQ27XXX_DATA(bq27545, BQ27XXX_O_OTDC), - [BQ27421] = BQ27XXX_DATA(bq27421, BQ27XXX_O_UTOT), - [BQ27425] = BQ27XXX_DATA(bq27425, BQ27XXX_O_UTOT), - [BQ27441] = BQ27XXX_DATA(bq27441, BQ27XXX_O_UTOT), - [BQ27621] = BQ27XXX_DATA(bq27621, BQ27XXX_O_UTOT), + [BQ27000] = BQ27XXX_DATA(bq27000, 0 , BQ27XXX_O_ZERO), + [BQ27010] = BQ27XXX_DATA(bq27010, 0 , BQ27XXX_O_ZERO), + [BQ2750X] = BQ27XXX_DATA(bq2750x, 0 , BQ27XXX_O_OTDC), + [BQ2751X] = BQ27XXX_DATA(bq2751x, 0 , BQ27XXX_O_OTDC), + [BQ2752X] = BQ27XXX_DATA(bq2752x, 0 , BQ27XXX_O_OTDC), + [BQ27500] = BQ27XXX_DATA(bq27500, 0x04143672, BQ27XXX_O_OTDC), + [BQ27510G1] = BQ27XXX_DATA(bq27510g1, 0 , BQ27XXX_O_OTDC), + [BQ27510G2] = BQ27XXX_DATA(bq27510g2, 0 , BQ27XXX_O_OTDC), + [BQ27510G3] = BQ27XXX_DATA(bq27510g3, 0 , BQ27XXX_O_OTDC), + [BQ27520G1] = BQ27XXX_DATA(bq27520g1, 0 , BQ27XXX_O_OTDC), + [BQ27520G2] = BQ27XXX_DATA(bq27520g2, 0 , BQ27XXX_O_OTDC), + [BQ27520G3] = BQ27XXX_DATA(bq27520g3, 0 , BQ27XXX_O_OTDC), + [BQ27520G4] = BQ27XXX_DATA(bq27520g4, 0 , BQ27XXX_O_OTDC), + [BQ27530] = BQ27XXX_DATA(bq27530, 0 , BQ27XXX_O_UTOT), + [BQ27531] = BQ27XXX_DATA(bq27531, 0 , BQ27XXX_O_UTOT), + [BQ27541] = BQ27XXX_DATA(bq27541, 0 , BQ27XXX_O_OTDC), + [BQ27542] = BQ27XXX_DATA(bq27542, 0 , BQ27XXX_O_OTDC), + [BQ27546] = BQ27XXX_DATA(bq27546, 0 , BQ27XXX_O_OTDC), + [BQ27742] = BQ27XXX_DATA(bq27742, 0 , BQ27XXX_O_OTDC), + [BQ27545] = BQ27XXX_DATA(bq27545, 0x04143672, BQ27XXX_O_OTDC), + [BQ27421] = BQ27XXX_DATA(bq27421, 0x80008000, BQ27XXX_O_UTOT | BQ27XXX_O_CFGUP | BQ27XXX_O_RAM), + [BQ27425] = BQ27XXX_DATA(bq27425, 0x04143672, BQ27XXX_O_UTOT | BQ27XXX_O_CFGUP), + [BQ27441] = BQ27XXX_DATA(bq27441, 0x80008000, BQ27XXX_O_UTOT | BQ27XXX_O_CFGUP | BQ27XXX_O_RAM), + [BQ27621] = BQ27XXX_DATA(bq27621, 0x80008000, BQ27XXX_O_UTOT | BQ27XXX_O_CFGUP | BQ27XXX_O_RAM), }; static DEFINE_MUTEX(bq27xxx_list_lock); @@ -834,13 +924,6 @@ static LIST_HEAD(bq27xxx_battery_devices); #define BQ27XXX_DM_SZ 32 -struct bq27xxx_dm_reg { - u8 subclass_id; - u8 offset; - u8 bytes; - u16 min, max; -}; - /** * struct bq27xxx_dm_buf - chip data memory buffer * @class: data memory subclass_id @@ -873,12 +956,6 @@ static inline u16 *bq27xxx_dm_reg_ptr(struct bq27xxx_dm_buf *buf, return NULL; } -enum bq27xxx_dm_reg_id { - BQ27XXX_DM_DESIGN_CAPACITY = 0, - BQ27XXX_DM_DESIGN_ENERGY, - BQ27XXX_DM_TERMINATE_VOLTAGE, -}; - static const 
char * const bq27xxx_dm_reg_name[] = { [BQ27XXX_DM_DESIGN_CAPACITY] = "design-capacity", [BQ27XXX_DM_DESIGN_ENERGY] = "design-energy", @@ -1121,9 +1198,9 @@ static void bq27xxx_battery_update_dm_block(struct bq27xxx_device_info *di, } #ifdef CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM - if (!di->ram_chip && !bq27xxx_dt_to_nvm) { + if (!(di->opts & BQ27XXX_O_RAM) && !bq27xxx_dt_to_nvm) { #else - if (!di->ram_chip) { + if (!(di->opts & BQ27XXX_O_RAM)) { #endif /* devicetree and NVM differ; defer to NVM */ dev_warn(di->dev, "%s has %u; update to %u disallowed " @@ -1159,7 +1236,7 @@ static int bq27xxx_battery_cfgupdate_priv(struct bq27xxx_device_info *di, bool a return ret; } while (!!(ret & BQ27XXX_FLAG_CFGUP) != active && --try); - if (!try) { + if (!try && di->chip != BQ27425) { // 425 has a bug dev_err(di->dev, "timed out waiting for cfgupdate flag %d\n", active); return -EINVAL; } @@ -1191,7 +1268,7 @@ static inline int bq27xxx_battery_soft_reset(struct bq27xxx_device_info *di) static int bq27xxx_battery_write_dm_block(struct bq27xxx_device_info *di, struct bq27xxx_dm_buf *buf) { - bool cfgup = di->chip == BQ27421; /* assume related chips need cfgupdate */ + bool cfgup = di->opts & BQ27XXX_O_CFGUP; int ret; if (!buf->dirty) @@ -1290,7 +1367,7 @@ static void bq27xxx_battery_set_config(struct bq27xxx_device_info *di, bq27xxx_battery_seal(di); - if (updated && di->chip != BQ27421) { /* not a cfgupdate chip, so reset */ + if (updated && !(di->opts & BQ27XXX_O_CFGUP)) { bq27xxx_write(di, BQ27XXX_REG_CTRL, BQ27XXX_RESET, false); BQ27XXX_MSLEEP(300); /* reset time is not documented */ } @@ -1900,8 +1977,10 @@ int bq27xxx_battery_setup(struct bq27xxx_device_info *di) INIT_DELAYED_WORK(&di->work, bq27xxx_battery_poll); mutex_init(&di->lock); - di->regs = bq27xxx_chip_data[di->chip].regs; - di->opts = bq27xxx_chip_data[di->chip].opts; + di->regs = bq27xxx_chip_data[di->chip].regs; + di->unseal_key = bq27xxx_chip_data[di->chip].unseal_key; + di->dm_regs = bq27xxx_chip_data[di->chip].dm_regs; + di->opts = bq27xxx_chip_data[di->chip].opts; psy_desc = devm_kzalloc(di->dev, sizeof(*psy_desc), GFP_KERNEL); if (!psy_desc) @@ -1920,8 +1999,6 @@ int bq27xxx_battery_setup(struct bq27xxx_device_info *di) return PTR_ERR(di->bat); } - dev_info(di->dev, "support ver. 
%s enabled\n", DRIVER_VERSION); - bq27xxx_battery_settings(di); bq27xxx_battery_update(di); diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index e48e7a4e9cd6..43194e02c1ee 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -54,7 +54,6 @@ struct bq27xxx_device_info { struct device *dev; int id; enum bq27xxx_chip chip; - bool ram_chip; u32 opts; const char *name; struct bq27xxx_dm_reg *dm_regs; -- cgit v1.2.3 From 75e8387685f6c65feb195a4556110b58f852b848 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 25 Aug 2017 21:49:37 +0800 Subject: perf/ftrace: Fix double traces of perf on ftrace:function When running perf on the ftrace:function tracepoint, there is a bug which can be reproduced by: perf record -e ftrace:function -a sleep 20 & perf record -e ftrace:function ls perf script ls 10304 [005] 171.853235: ftrace:function: perf_output_begin ls 10304 [005] 171.853237: ftrace:function: perf_output_begin ls 10304 [005] 171.853239: ftrace:function: task_tgid_nr_ns ls 10304 [005] 171.853240: ftrace:function: task_tgid_nr_ns ls 10304 [005] 171.853242: ftrace:function: __task_pid_nr_ns ls 10304 [005] 171.853244: ftrace:function: __task_pid_nr_ns We can see that all the function traces are doubled. The problem is caused by an inconsistency between the register function perf_ftrace_event_register() and the probe function perf_ftrace_function_call(): the former registers one probe for every perf_event, while the latter handles all perf_events on the current CPU. So when two perf_events are active on the current CPU, their traces are doubled. This patch therefore adds an extra "event" parameter to perf_tp_event(); when it is not NULL, sample data is sent only to that event. Signed-off-by: Zhou Chengming Reviewed-by: Jiri Olsa Acked-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: huawei.libin@huawei.com Link: http://lkml.kernel.org/r/1503668977-12526-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- include/linux/trace_events.h | 4 ++-- kernel/events/core.c | 13 +++++++++---- kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 7 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b14095bcf4bb..c00cd4b02f32 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1201,7 +1201,7 @@ extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task); + struct task_struct *task, struct perf_event *event); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 536c80ff7ad9..5012b524283d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -508,9 +508,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { - perf_tp_event(type, count, raw_data, size, regs, head, rctx,
task); + perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event); } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index ce131d25622a..03ac9c8b02fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7906,16 +7906,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task); + rctx, task, NULL); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { struct perf_sample_data data; - struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7929,9 +7928,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - hlist_for_each_entry_rcu(event, head, hlist_entry) { + /* Use the given event instead of the hlist */ + if (event) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); + } else { + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..13ba2d3f6a91 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,6 +306,7 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { + struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; + event = container_of(ops, struct perf_event, ftrace_ops); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL); + 1, ®s, head, NULL, event); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9b5aa10fbf9..8a907e12b6b9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..74d9a86eccc0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -667,7 +667,7 @@ static void perf_syscall_exit(void 
*ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, - 1, regs, head, NULL); + 1, regs, head, NULL, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a7581fec9681..4525e0271a53 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, } perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); out: preempt_enable(); } -- cgit v1.2.3 From 6e44636aeab19259f804c8abca57a95ddc01df66 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Tue, 15 Aug 2017 11:59:02 +0300 Subject: net/mlx5: Update HW layout definitions * add offload_type field to mlx5_ifc_qpc_bits * update mlx5_ifc_xrqc_bits layout Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4eff0b8a1482..e27283ab3667 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2022,6 +2022,10 @@ enum { MLX5_QPC_PM_STATE_MIGRATED = 0x3, }; +enum { + MLX5_QPC_OFFLOAD_TYPE_RNDV = 0x1, +}; + enum { MLX5_QPC_END_PADDING_MODE_SCATTER_AS_IS = 0x0, MLX5_QPC_END_PADDING_MODE_PAD_TO_CACHE_LINE_ALIGNMENT = 0x1, @@ -2065,7 +2069,8 @@ struct mlx5_ifc_qpc_bits { u8 st[0x8]; u8 reserved_at_10[0x3]; u8 pm_state[0x2]; - u8 reserved_at_15[0x7]; + u8 reserved_at_15[0x3]; + u8 offload_type[0x4]; u8 end_padding_mode[0x2]; u8 reserved_at_1e[0x2]; @@ -3010,7 +3015,7 @@ struct mlx5_ifc_xrqc_bits { struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; - u8 reserved_at_180[0x880]; + u8 reserved_at_180[0x280]; struct mlx5_ifc_wq_bits wq; }; -- cgit v1.2.3 From 6938fc1ee07e54c057430005f8dcaccabce027c3 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:03 +0300 Subject: IB/core: Add XRQ capabilities This patch adds the following TM XRQ capabilities: * max_rndv_hdr_size - Max size of rendezvous request message * max_num_tags - Max number of entries in tag matching list * max_ops - Max number of outstanding list operations * max_sge - Max number of SGE in tag matching entry * flags - the following flags are currently defined: - IB_TM_CAP_RC - Support tag matching on RC transport Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 355c7a328e0b..cab0bdcfad51 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -280,6 +280,24 @@ struct ib_rss_caps { u32 max_rwq_indirection_table_size; }; +enum ib_tm_cap_flags { + /* Support tag matching on RC transport */ + IB_TM_CAP_RC = 1 << 0, +}; + +struct ib_xrq_caps { + /* Max size of RNDV header */ + u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + u32 max_num_tags; + /* From enum ib_tm_cap_flags */ + u32 flags; + /* Max number of outstanding list operations */ + u32 max_ops; + /* Max number of SGE in tag matching entry */ + u32 max_sge; +}; + enum
ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, @@ -340,6 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ + struct ib_xrq_caps xrq_caps; }; enum ib_mtu { -- cgit v1.2.3 From 1a56ff6daab1e062aadec582eb10e7090f0b370a Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:04 +0300 Subject: IB/core: Separate CQ handle in SRQ context Before this change CQ attached to SRQ was part of XRC specific extension. Moving CQ handle out makes it available to other types extending SRQ functionality. Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 27 +++++++++++++++++---------- drivers/infiniband/core/verbs.c | 16 +++++++++------- drivers/infiniband/hw/mlx4/srq.c | 4 ++-- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- drivers/infiniband/hw/mlx5/srq.c | 11 +++++++---- include/rdma/ib_verbs.h | 31 ++++++++++++++++++++----------- 6 files changed, 60 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 8e9fea03dec4..9f690af46a7e 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3497,10 +3497,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); + } - attr.ext.xrc.cq = uobj_get_obj_read(cq, cmd->cq_handle, - file->ucontext); - if (!attr.ext.xrc.cq) { + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, cmd->cq_handle, + file->ucontext); + if (!attr.ext.cq) { ret = -EINVAL; goto err_put_xrcd; } @@ -3535,10 +3537,13 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + if (ib_srq_has_cq(cmd->srq_type)) { + srq->ext.cq = attr.ext.cq; + atomic_inc(&attr.ext.cq->usecnt); + } + if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } @@ -3561,10 +3566,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, goto err_copy; } - if (cmd->srq_type == IB_SRQT_XRC) { + if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); - uobj_put_obj_read(attr.ext.xrc.cq); - } + + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + uobj_put_obj_read(pd); uobj_alloc_commit(&obj->uevent.uobject); @@ -3577,8 +3584,8 @@ err_put: uobj_put_obj_read(pd); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - uobj_put_obj_read(attr.ext.xrc.cq); + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b29d0ff94463..ecb6c395f19b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -622,11 +622,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = 
srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -667,18 +669,18 @@ int ib_destroy_srq(struct ib_srq *srq) pd = srq->pd; srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { + if (ib_srq_has_cq(srq_type)) + cq = srq->ext.cq; + if (srq_type == IB_SRQT_XRC) xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } ret = srq->device->destroy_srq(srq); if (!ret) { atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { + if (srq_type == IB_SRQT_XRC) atomic_dec(&xrcd->usecnt); + if (ib_srq_has_cq(srq_type)) atomic_dec(&cq->usecnt); - } } return ret; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index dd7a2fce9df4..ebee56cbc0e2 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -178,8 +178,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } } - cqn = (init_attr->srq_type == IB_SRQT_XRC) ? - to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + cqn = ib_srq_has_cq(init_attr->srq_type) ? + to_mcq(init_attr->ext.cq)->mcq.cqn : 0; xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : (u16) dev->dev->caps.reserved_xrcds; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 62e6298810e7..7ad585257fd3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3217,7 +3217,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; - attr.ext.xrc.cq = devr->c0; + attr.ext.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); @@ -3232,9 +3232,9 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; - devr->s0->ext.xrc.cq = devr->c0; + devr->s0->ext.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); - atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->s0->ext.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); @@ -3253,9 +3253,9 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; - devr->s1->ext.xrc.cq = devr->c0; + devr->s1->ext.cq = devr->c0; atomic_inc(&devr->p0->usecnt); - atomic_set(&devr->s0->usecnt, 0); + atomic_set(&devr->s1->usecnt, 0); for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { INIT_WORK(&devr->ports[port].pkey_change_work, diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 30b3ddd8e1ab..e6be4f2927a7 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -292,13 +292,16 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, in.wqe_shift = srq->msrq.wqe_shift - 4; if (srq->wq_sig) in.flags |= MLX5_SRQ_FLAG_WQ_SIG; - if (init_attr->srq_type == IB_SRQT_XRC) { + + if (init_attr->srq_type == IB_SRQT_XRC) in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; - in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn; - } else if (init_attr->srq_type == IB_SRQT_BASIC) { + else in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn; + + if (ib_srq_has_cq(init_attr->srq_type)) + in.cqn = to_mcq(init_attr->ext.cq)->mcq.cqn; + else in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; - } in.pd = to_mpd(pd)->pdn; in.db_record = 
srq->db.dma; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cab0bdcfad51..f0e46757185b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -994,6 +994,11 @@ enum ib_srq_type { IB_SRQT_XRC }; +static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) +{ + return srq_type == IB_SRQT_XRC; +} + enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, @@ -1011,11 +1016,13 @@ struct ib_srq_init_attr { struct ib_srq_attr attr; enum ib_srq_type srq_type; - union { - struct { - struct ib_xrcd *xrcd; - struct ib_cq *cq; - } xrc; + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + } xrc; + }; } ext; }; @@ -1554,12 +1561,14 @@ struct ib_srq { enum ib_srq_type srq_type; atomic_t usecnt; - union { - struct { - struct ib_xrcd *xrcd; - struct ib_cq *cq; - u32 srq_num; - } xrc; + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + u32 srq_num; + } xrc; + }; } ext; }; -- cgit v1.2.3 From 9c2c849625cf779e0fac41c8be3c163df4b80c14 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:05 +0300 Subject: IB/core: Add new SRQ type IB_SRQT_TM This patch adds a new SRQ type - IB_SRQT_TM. The new SRQ type supports tag matching and rendezvous offloads for MPI applications. When the SRQ receives a message, it searches the matching list for the corresponding posted receive buffer. The process of searching the matching list is called tag matching. In case the tag matching results in a match, the received message is placed in the address specified by the receive buffer. In case no match is found, the message is placed in a generic buffer until the corresponding receive buffer is posted. These messages are called unexpected, and their set is called the unexpected list. Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f0e46757185b..1b4bb8743969 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -991,12 +991,14 @@ enum ib_cq_notify_flags { enum ib_srq_type { IB_SRQT_BASIC, - IB_SRQT_XRC + IB_SRQT_XRC, + IB_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) { - return srq_type == IB_SRQT_XRC; + return srq_type == IB_SRQT_XRC || + srq_type == IB_SRQT_TM; } enum ib_srq_attr_mask { @@ -1022,6 +1024,10 @@ struct ib_srq_init_attr { struct { struct ib_xrcd *xrcd; } xrc; + + struct { + u32 max_num_tags; + } tag_matching; }; } ext; }; -- cgit v1.2.3 From 9382d4e1d3c09fe20fa53eb12b51ef01ad40774f Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:06 +0300 Subject: IB/uverbs: Add XRQ creation parameter to UAPI Add a max_num_tags parameter to struct ib_uverbs_create_xsrq, replacing a reserved field. If the SRQ type is tag-matching, this field defines the maximum size of the tag matching list. Otherwise, it is expected to be zero.
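For context, a hedged sketch (not part of this series) of how a kernel consumer could ask for a tag-matching SRQ once these pieces are in place; the max_num_tags field above corresponds to ext.tag_matching.max_num_tags in struct ib_srq_init_attr, and the pd and cq variables are assumed to exist:

struct ib_srq_init_attr init_attr = {
	.srq_type	= IB_SRQT_TM,
	.attr		= { .max_wr = 256, .max_sge = 1 },
	.ext		= {
		.cq = cq,	/* TM SRQs carry a CQ, like XRC SRQs */
		.tag_matching = { .max_num_tags = 64 },
	},
};
struct ib_srq *srq = ib_create_srq(pd, &init_attr);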
Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/ib_user_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 63656d2e8705..d5434bbf40c8 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -1024,7 +1024,7 @@ struct ib_uverbs_create_xsrq { __u32 max_wr; __u32 max_sge; __u32 srq_limit; - __u32 reserved; + __u32 max_num_tags; __u32 xrcd_handle; __u32 cq_handle; __u64 driver_data[0]; -- cgit v1.2.3 From 8d50505ada728258fcdce99120b937ce68298c4e Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:08 +0300 Subject: IB/uverbs: Expose XRQ capabilities Make XRQ capabilities available via the ibv_query_device() verb. Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 10 ++++++++++ include/uapi/rdma/ib_user_verbs.h | 15 +++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e69038a07fa0..e0cb99860934 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3868,6 +3868,16 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + goto end; + + resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; + resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; + resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; + resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; + resp.xrq_caps.flags = attr.xrq_caps.flags; + resp.response_length += sizeof(resp.xrq_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index d5434bbf40c8..9a0b6479fe0c 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -236,6 +236,20 @@ struct ib_uverbs_rss_caps { __u32 reserved; }; +struct ib_uverbs_tm_caps { + /* Max size of rendezvous request message */ + __u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + __u32 max_num_tags; + /* TM flags */ + __u32 flags; + /* Max number of outstanding list operations */ + __u32 max_ops; + /* Max number of SGE in tag matching entry */ + __u32 max_sge; + __u32 reserved; +}; + struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_query_device_resp base; __u32 comp_mask; @@ -247,6 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; + struct ib_uverbs_tm_caps xrq_caps; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From 5b3ec3fcb6bbe081279c73fb574af8c72f14cea0 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:10 +0300 Subject: net/mlx5: Add XRQ support Add support for the new XRQ (eXtended shared Receive Queue) hardware object. It supports SRQ semantics with the addition of extended receive buffer topologies and offloads. Currently it supports the tag matching topology and the rendezvous offload.
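As an illustrative sketch only (the values are made up), this is roughly what the mlx5 IB driver is expected to pass down when creating a tag-matching XRQ with the rendezvous offload, using the mlx5_srq_attr fields added by this patch:

struct mlx5_srq_attr in = {
	.type		  = IB_SRQT_TM,
	.flags		  = MLX5_SRQ_FLAG_RNDV,
	.tm_log_list_size = 6,	/* log2 of the tag matching list size */
	/* cqn, the wq layout and pas[] are filled in as for any other SRQ */
};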
Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/srq.c | 150 ++++++++++++++++++++++++-- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/srq.h | 5 + 3 files changed, 146 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index f774de6f5fcb..7673da04efa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -435,16 +435,128 @@ out: return err; } +static int create_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0}; + void *create_in; + void *xrqc; + void *wq; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + + set_wq(wq, in); + memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size); + + if (in->type == IB_SRQT_TM) { + MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING); + if (in->flags & MLX5_SRQ_FLAG_RNDV) + MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV); + MLX5_SET(xrqc, xrqc, + tag_matching_topology_context.log_matching_list_sz, + in->tm_log_list_size); + } + MLX5_SET(xrqc, xrqc, user_index, in->user_index); + MLX5_SET(xrqc, xrqc, cqn, in->cqn); + MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) + srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn); + + return err; +} + +static int destroy_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) +{ + u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0}; + + MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ); + MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int arm_xrq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + u16 lwm) +{ + u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int query_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0}; + u32 *xrq_out; + int outlen = MLX5_ST_SZ_BYTES(query_xrq_out); + void *xrqc; + int err; + + xrq_out = kvzalloc(outlen, GFP_KERNEL); + if (!xrq_out) + return -ENOMEM; + + MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ); + MLX5_SET(query_xrq_in, in, xrqn, srq->srqn); + + err = mlx5_cmd_exec(dev, in, sizeof(in), xrq_out, outlen); + if (err) + goto out; + + xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context); + get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out); + if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; + out->tm_next_tag = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.append_next_index); + out->tm_hw_phase_cnt = + 
MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.hw_phase_cnt); + out->tm_sw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.sw_phase_cnt); + +out: + kvfree(xrq_out); + return err; +} + static int create_srq_split(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_srq_attr *in) { if (!dev->issi) return create_srq_cmd(dev, srq, in); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return create_xrc_srq_cmd(dev, srq, in); - else + case MLX5_RES_XRQ: + return create_xrq_cmd(dev, srq, in); + default: return create_rmp_cmd(dev, srq, in); + } } static int destroy_srq_split(struct mlx5_core_dev *dev, @@ -452,10 +564,14 @@ static int destroy_srq_split(struct mlx5_core_dev *dev, { if (!dev->issi) return destroy_srq_cmd(dev, srq); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return destroy_xrc_srq_cmd(dev, srq); - else + case MLX5_RES_XRQ: + return destroy_xrq_cmd(dev, srq); + default: return destroy_rmp_cmd(dev, srq); + } } int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, @@ -464,10 +580,16 @@ int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, int err; struct mlx5_srq_table *table = &dev->priv.srq_table; - if (in->type == IB_SRQT_XRC) + switch (in->type) { + case IB_SRQT_XRC: srq->common.res = MLX5_RES_XSRQ; - else + break; + case IB_SRQT_TM: + srq->common.res = MLX5_RES_XRQ; + break; + default: srq->common.res = MLX5_RES_SRQ; + } err = create_srq_split(dev, srq, in); if (err) @@ -528,10 +650,14 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, { if (!dev->issi) return query_srq_cmd(dev, srq, out); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return query_xrc_srq_cmd(dev, srq, out); - else + case MLX5_RES_XRQ: + return query_xrq_cmd(dev, srq, out); + default: return query_rmp_cmd(dev, srq, out); + } } EXPORT_SYMBOL(mlx5_core_query_srq); @@ -540,10 +666,14 @@ int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, { if (!dev->issi) return arm_srq_cmd(dev, srq, lwm, is_srq); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return arm_xrc_srq_cmd(dev, srq, lwm); - else + case MLX5_RES_XRQ: + return arm_xrq_cmd(dev, srq, lwm); + default: return arm_rmp_cmd(dev, srq, lwm); + } } EXPORT_SYMBOL(mlx5_core_arm_srq); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 99d88624ad07..c33e6f7a1afb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -418,6 +418,7 @@ enum mlx5_res_type { MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, MLX5_RES_SRQ = 3, MLX5_RES_XSRQ = 4, + MLX5_RES_XRQ = 5, }; struct mlx5_core_rsc_common { diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h index 1cde0fd53f90..24ff23e27c8a 100644 --- a/include/linux/mlx5/srq.h +++ b/include/linux/mlx5/srq.h @@ -38,6 +38,7 @@ enum { MLX5_SRQ_FLAG_ERR = (1 << 0), MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), + MLX5_SRQ_FLAG_RNDV = (1 << 2), }; struct mlx5_srq_attr { @@ -56,6 +57,10 @@ struct mlx5_srq_attr { u32 user_index; u64 db_record; __be64 *pas; + u32 tm_log_list_size; + u32 tm_next_tag; + u32 tm_hw_phase_cnt; + u32 tm_sw_phase_cnt; }; struct mlx5_core_dev; -- cgit v1.2.3 From 8d4e6c4caa12dafbcba138e5450b7af17b0b2194 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Mar 2017 18:39:56 +0300 Subject: perf/core, pt, bts: Get rid of itrace_started I just 
noticed that hw.itrace_started and hw.config are aliased to the same location. Now, the PT driver happens to use both, which works out fine by sheer luck: - STORE(hw.itrace_started) is ordered before STORE(hw.config), in the program order, although there are no compiler barriers to ensure that, - to perf_log_itrace_start(), hw.itrace_started appears to be set at the intended time, because both stores happen on the same path, - hw.config is never reset to zero in the PT driver. Now, the use of hw.config by the PT driver makes more sense (it being a HW PMU) than messing around with itrace_started, which is an awkward API to begin with. This patch replaces hw.itrace_started with an attach_state bit and an API call for the PMU drivers to use to communicate the condition. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/20170330153956.25994-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/bts.c | 2 +- arch/x86/events/intel/pt.c | 5 +++-- include/linux/perf_event.h | 5 ++--- kernel/events/core.c | 7 ++++++- 4 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index ddd8d3516bfc..16076eb34699 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags) bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - event->hw.itrace_started = 1; + perf_event_itrace_started(event); event->hw.state = 0; __bts_event_start(event); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index ae8324d65e61..81fd41d5a0d9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; + /* First round: clear STATUS, in particular the PSB byte counter.
*/ + if (!event->hw.config) { + perf_event_itrace_started(event); wrmsrl(MSR_IA32_RTIT_STATUS, 0); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c00cd4b02f32..adda0aaae6c8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -147,9 +147,6 @@ struct hw_perf_event { struct list_head cqm_groups_entry; struct list_head cqm_group_entry; }; - struct { /* itrace */ - int itrace_started; - }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; @@ -541,6 +538,7 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 +#define PERF_ATTACH_ITRACE 0x10 struct perf_cgroup; struct ring_buffer; @@ -864,6 +862,7 @@ extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); +extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); diff --git a/kernel/events/core.c b/kernel/events/core.c index e5467e107624..77fd6b11ef22 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7301,6 +7301,11 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +void perf_event_itrace_started(struct perf_event *event) +{ + event->attach_state |= PERF_ATTACH_ITRACE; +} + static void perf_log_itrace_start(struct perf_event *event) { struct perf_output_handle handle; @@ -7316,7 +7321,7 @@ static void perf_log_itrace_start(struct perf_event *event) event = event->parent; if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || - event->hw.itrace_started) + event->attach_state & PERF_ATTACH_ITRACE) return; rec.header.type = PERF_RECORD_ITRACE_START; -- cgit v1.2.3 From fc7ce9c74c3ad232b084d80148654f926d01ece7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 28 Aug 2017 20:52:49 -0400 Subject: perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR For understanding how the workload maps to memory channels and hardware behavior, it's very important to collect address maps with physical addresses. For example, 3D XPoint access can only be found by filtering the physical address. Add a new sample type for the physical address. perf already has a facility to collect the data virtual address. This patch introduces a function to convert the virtual address to a physical address. The function is quite generic and can be extended to any architecture as long as a virtual address is provided. - For kernel direct mapping addresses, virt_to_phys is used to convert the virtual addresses to physical addresses. - For user virtual addresses, __get_user_pages_fast is used to walk the page tables to find the user physical address. - This does not work for vmalloc addresses right now. These are not resolved, but code to do that could be added. The new sample type requires collecting the virtual address. The virtual address will not be output unless SAMPLE_ADDR is applied. For security, the physical address can only be exposed to root or a privileged user.
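As a usage illustration (not part of the patch), a user-space sketch of requesting the new sample type via perf_event_open(); the event choice is arbitrary, and PERF_SAMPLE_ADDR is included because the physical address is derived from the sampled virtual address:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_phys_addr_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;	/* any sampling event */
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_PHYS_ADDR;

	/* Needs root (CAP_SYS_ADMIN) when perf_event_paranoid is set. */
	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}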
Tested-by: Madhavan Srinivasan Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: mpe@ellerman.id.au Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/powerpc/perf/core-book3s.c | 3 ++- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 4 +++- kernel/events/core.c | 46 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 55 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6c2d4168daec..2e3eb7431571 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val, perf_sample_data_init(&data, ~0ULL, event->hw.last_period); - if (event->attr.sample_type & PERF_SAMPLE_ADDR) + if (event->attr.sample_type & + (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) perf_get_data_addr(regs, &data.addr); if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 98e36e0c791c..e1965e5ff570 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1185,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event, else regs->flags &= ~PERF_EFLAGS_EXACT; - if ((sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9337589014cc..4196f81ec0e1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index adda0aaae6c8..718ba163c1b9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -943,6 +943,8 @@ struct perf_sample_data { struct perf_regs regs_intr; u64 stack_user_size; + + u64 phys_addr; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 2a37ae925d85..140ae638cfd6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -814,6 +815,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 77fd6b11ef22..ce64f3fed5c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1575,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + size += sizeof(data->phys_addr); + event->header_size = size; } @@ -6017,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle, } } + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + perf_output_put(handle, data->phys_addr); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -6032,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle, } } +static u64 perf_virt_to_phys(u64 virt) +{ + u64 phys_addr = 0; + struct page *p = NULL; + + if (!virt) + return 0; + + if (virt >= TASK_SIZE) { + /* If it's vmalloc()d memory, leave phys_addr as 0 */ + if (virt_addr_valid((void *)(uintptr_t)virt) && + !(virt >= VMALLOC_START && virt < VMALLOC_END)) + phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt); + } else { + /* + * Walking the pages tables for user address. + * Interrupts are disabled, so it prevents any tear down + * of the page tables. + * Try IRQ-safe __get_user_pages_fast first. + * If failed, leave phys_addr as 0. 
+ */ + if ((current->mm != NULL) && + (__get_user_pages_fast(virt, 1, 0, &p) == 1)) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + + if (p) + put_page(p); + } + + return phys_addr; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6150,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + data->phys_addr = perf_virt_to_phys(data->addr); } static void __always_inline @@ -9909,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* Only privileged users can get physical addresses */ + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!attr.sample_max_stack) attr.sample_max_stack = sysctl_perf_event_max_stack; -- cgit v1.2.3 From f52be5708076b75a045ac52c6fef3fffb8300525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Aug 2017 10:59:39 +0200 Subject: locking/lockdep: Untangle xhlock history save/restore from task independence Where XHLOCK_{SOFT,HARD} are save/restore points in the xhlocks[] to ensure the temporal IRQ events don't interact with task state, XHLOCK_PROC is a fundamentally different beast that just happens to share the interface. The purpose of XHLOCK_PROC is to annotate independent execution inside one task. For example, with workqueues, each work item should appear to run in its own 'pristine' 'task'. Remove XHLOCK_PROC in favour of its own interface to avoid confusion. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boqun.feng@gmail.com Cc: david@fromorbit.com Cc: johannes@sipsolutions.net Cc: kernel-team@lge.com Cc: oleg@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/20170829085939.ggmb6xiohw67micb@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 4 +-- include/linux/lockdep.h | 7 +++-- kernel/locking/lockdep.c | 79 +++++++++++++++++++++++------------------------- kernel/workqueue.c | 9 +++--- 4 files changed, 48 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 9bc050bc81b2..5fdd93bb9300 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -26,7 +26,7 @@ # define trace_hardirq_enter() \ do { \ current->hardirq_context++; \ - crossrelease_hist_start(XHLOCK_HARD, 0);\ + crossrelease_hist_start(XHLOCK_HARD); \ } while (0) # define trace_hardirq_exit() \ do { \ @@ -36,7 +36,7 @@ do { \ # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ - crossrelease_hist_start(XHLOCK_SOFT, 0);\ + crossrelease_hist_start(XHLOCK_SOFT); \ } while (0) # define lockdep_softirq_exit() \ do { \ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 78bb7133abed..bfa8e0b0d6f1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -551,7 +551,6 @@ struct pin_cookie { }; enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, - XHLOCK_PROC, XHLOCK_CTX_NR, }; @@ -580,8 +579,9 @@ extern void lock_commit_crosslock(struct lockdep_map *lock); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), .cross = 0, } -extern void crossrelease_hist_start(enum xhlock_context_t c, bool force); +extern void crossrelease_hist_start(enum xhlock_context_t c); extern void crossrelease_hist_end(enum xhlock_context_t c); +extern void lockdep_invariant_state(bool force); extern
void lockdep_init_task(struct task_struct *task); extern void lockdep_free_task(struct task_struct *task); #else /* !CROSSRELEASE */ @@ -593,8 +593,9 @@ extern void lockdep_free_task(struct task_struct *task); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } -static inline void crossrelease_hist_start(enum xhlock_context_t c, bool force) {} +static inline void crossrelease_hist_start(enum xhlock_context_t c) {} static inline void crossrelease_hist_end(enum xhlock_context_t c) {} +static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} #endif /* CROSSRELEASE */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f73ca595b81e..44c8d0d17170 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4623,13 +4623,8 @@ asmlinkage __visible void lockdep_sys_exit(void) /* * The lock history for each syscall should be independent. So wipe the * slate clean on return to userspace. - * - * crossrelease_hist_end() works well here even when getting here - * without starting (i.e. just after forking), because it rolls back - * the index to point to the last entry, which is already invalid. */ - crossrelease_hist_end(XHLOCK_PROC); - crossrelease_hist_start(XHLOCK_PROC, false); + lockdep_invariant_state(false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4723,19 +4718,47 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) } /* - * Lock history stacks; we have 3 nested lock history stacks: + * Lock history stacks; we have 2 nested lock history stacks: * * HARD(IRQ) * SOFT(IRQ) - * PROC(ess) * * The thing is that once we complete a HARD/SOFT IRQ the future task locks * should not depend on any of the locks observed while running the IRQ. So * what we do is rewind the history buffer and erase all our knowledge of that * temporal event. - * - * The PROCess one is special though; it is used to annotate independence - * inside a task. + */ + +void crossrelease_hist_start(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (!cur->xhlocks) + return; + + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } +} + +/* + * lockdep_invariant_state() is used to annotate independence inside a task, to + * make one task look like multiple independent 'tasks'. * * Take for instance workqueues; each work is independent of the last. The * completion of a future work does not depend on the completion of a past work @@ -4758,40 +4781,14 @@ static inline void invalidate_xhlock(struct hist_lock *xhlock) * entry. Similarly, independence per-definition means it does not depend on * prior state. */ -void crossrelease_hist_start(enum xhlock_context_t c, bool force) +void lockdep_invariant_state(bool force) { - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - /* * We call this at an invariant point, no current state, no history. + * Verify the former, enforce the latter. 
*/ - if (c == XHLOCK_PROC) { - /* verified the former, ensure the latter */ - WARN_ON_ONCE(!force && cur->lockdep_depth); - invalidate_xhlock(&xhlock(cur->xhlock_idx)); - } - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } + WARN_ON_ONCE(!force && current->lockdep_depth); + invalidate_xhlock(&xhlock(current->xhlock_idx)); } static int cross_lock(struct lockdep_map *lock) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0331891dec1..ab3c0dc8c7ed 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2094,8 +2094,8 @@ __acquires(&pool->lock) lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* - * Strictly speaking we should do start(PROC) without holding any - * locks, that is, before these two lock_map_acquire()'s. + * Strictly speaking we should mark the invariant state without holding + * any locks, that is, before these two lock_map_acquire()'s. * * However, that would result in: * @@ -2107,14 +2107,14 @@ __acquires(&pool->lock) * Which would create W1->C->W1 dependencies, even though there is no * actual deadlock possible. There are two solutions, using a * read-recursive acquire on the work(queue) 'locks', but this will then - * hit the lockdep limitation on recursive locks, or simly discard + * hit the lockdep limitation on recursive locks, or simply discard * these locks. * * AFAICT there is no possible deadlock scenario between the * flush_work() and complete() primitives (except for single-threaded * workqueues), so hiding them isn't a problem. */ - crossrelease_hist_start(XHLOCK_PROC, true); + lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2122,7 +2122,6 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work); - crossrelease_hist_end(XHLOCK_PROC); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); -- cgit v1.2.3 From 966a967116e699762dbf4af7f9e0d1955c25aa37 Mon Sep 17 00:00:00 2001 From: Ying Huang Date: Tue, 8 Aug 2017 12:30:00 +0800 Subject: smp: Avoid using two cache lines for struct call_single_data struct call_single_data is used in IPIs to transfer information between CPUs. Its size is bigger than sizeof(unsigned long) and less than cache line size. Currently it is not allocated with any explicit alignment requirements. This makes it possible for allocated call_single_data to cross two cache lines, which results in double the number of cache lines that need to be transferred among CPUs. This can be fixed by requiring call_single_data to be aligned with the size of call_single_data. Currently the size of call_single_data is a power of 2. If we add new fields to call_single_data, we may need to add padding to make sure the size of the new definition is a power of 2 as well. Fortunately, this is enforced by GCC, which will report bad sizes. To set the alignment requirements of call_single_data to the size of call_single_data, a struct definition and a typedef are used. To test the effect of the patch, I used the vm-scalability multiple thread swap test case (swap-w-seq-mt).
The test will create multiple threads and each thread will eat memory until all RAM and part of swap is used, so that a huge number of IPIs is triggered when unmapping memory. In the test, the throughput of memory writing improves ~5% compared with misaligned call_single_data, because of faster IPIs. Suggested-by: Peter Zijlstra Signed-off-by: Huang, Ying [ Add call_single_data_t and align with size of call_single_data. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Borislav Petkov Cc: Eric Dumazet Cc: Juergen Gross Cc: Linus Torvalds Cc: Michael Ellerman Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/87bmnqd6lz.fsf@yhuang-mobile.sh.intel.com Signed-off-by: Ingo Molnar --- arch/mips/kernel/smp.c | 6 ++-- block/blk-softirq.c | 2 +- drivers/block/null_blk.c | 2 +- drivers/cpuidle/coupled.c | 10 +++---- drivers/net/ethernet/cavium/liquidio/lio_main.c | 2 +- drivers/net/ethernet/cavium/liquidio/octeon_droq.h | 2 +- include/linux/blkdev.h | 2 +- include/linux/netdevice.h | 2 +- include/linux/smp.h | 8 ++++-- kernel/sched/sched.h | 2 +- kernel/smp.c | 32 ++++++++++++---------- kernel/up.c | 2 +- 12 files changed, 39 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 6bace7695788..c7cbddfcdc3b 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 87b7df4851bf..07125e7941f4 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -60,7 +60,7 @@ static void trigger_softirq(void *data) static int raise_blk_irq(int cpu, struct request *rq) { if (cpu_online(cpu)) { - struct call_single_data *data = &rq->csd; + call_single_data_t *data = &rq->csd; data->func = trigger_softirq; data->info = rq; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 85c24cace973..81142ce781da 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -13,7 +13,7 @@ struct nullb_cmd { struct list_head list; struct llist_node ll_list; - struct call_single_data csd; + call_single_data_t csd; struct request *rq; struct bio *bio; unsigned int tag; diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 71e586d7df71..147f38ea0fcd 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -119,13 +119,13 @@ struct cpuidle_coupled { #define CPUIDLE_COUPLED_NOT_IDLE (-1) -static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb); +static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb); /* * The cpuidle_coupled_poke_pending mask is used to avoid calling - * __smp_call_function_single with the per cpu call_single_data struct already + * __smp_call_function_single with the per cpu call_single_data_t struct already * in use.
This prevents a deadlock where two cpus are waiting for each others - * call_single_data struct to be available + * call_single_data_t struct to be available */ static cpumask_t cpuidle_coupled_poke_pending; @@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info) */ static void cpuidle_coupled_poke(int cpu) { - struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); + call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending)) smp_call_function_single_async(cpu, csd); @@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev) { int cpu; struct cpuidle_device *other_dev; - struct call_single_data *csd; + call_single_data_t *csd; struct cpuidle_coupled *coupled; if (cpumask_empty(&dev->coupled_cpus)) diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 51583ae4b1eb..120b6e537b28 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg) if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) { napi_schedule_irqoff(&droq->napi); } else { - struct call_single_data *csd = &droq->csd; + call_single_data_t *csd = &droq->csd; csd->func = napi_schedule_wrapper; csd->info = &droq->napi; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h index 6efd139b894d..f91bc84d1719 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h @@ -328,7 +328,7 @@ struct octeon_droq { u32 cpu_id; - struct call_single_data csd; + call_single_data_t csd; }; #define OCT_DROQ_SIZE (sizeof(struct octeon_droq)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..006fa09a641e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t; struct request { struct list_head queuelist; union { - struct call_single_data csd; + call_single_data_t csd; u64 fifo_time; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 779b23595596..6557f320b66e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2774,7 +2774,7 @@ struct softnet_data { unsigned int input_queue_head ____cacheline_aligned_in_smp; /* Elements below can be accessed between CPUs for RPS/RFS */ - struct call_single_data csd ____cacheline_aligned_in_smp; + call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; unsigned int input_queue_tail; diff --git a/include/linux/smp.h b/include/linux/smp.h index 68123c1fe549..98b1fe027fc9 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -14,13 +14,17 @@ #include typedef void (*smp_call_func_t)(void *info); -struct call_single_data { +struct __call_single_data { struct llist_node llist; smp_call_func_t func; void *info; unsigned int flags; }; +/* Use __aligned() to avoid to use 2 cache lines for 1 csd */ +typedef struct __call_single_data call_single_data_t + __aligned(sizeof(struct __call_single_data)); + /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; @@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); -int smp_call_function_single_async(int cpu, struct 
call_single_data *csd); +int smp_call_function_single_async(int cpu, call_single_data_t *csd); #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eeef1a3086d1..f29a7d2b57e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -769,7 +769,7 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; - struct call_single_data hrtick_csd; + call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; #endif diff --git a/kernel/smp.c b/kernel/smp.c index 3061483cb3ad..81cfca9b4cc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -28,7 +28,7 @@ enum { }; struct call_function_data { - struct call_single_data __percpu *csd; + call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); return -ENOMEM; } - cfd->csd = alloc_percpu(struct call_single_data); + cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); @@ -103,12 +103,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static __always_inline void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } -static __always_inline void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd) /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other - * fields of the specified call_single_data structure: + * fields of the specified call_single_data_t structure: */ smp_wmb(); } -static __always_inline void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd) smp_store_release(&csd->flags, 0); } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); /* - * Insert a previously allocated call_single_data element + * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. 
*/ -static int generic_exec_single(int cpu, struct call_single_data *csd, +static int generic_exec_single(int cpu, call_single_data_t *csd, smp_call_func_t func, void *info) { if (cpu == smp_processor_id()) { @@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) { struct llist_head *head; struct llist_node *entry; - struct call_single_data *csd, *csd_next; + call_single_data_t *csd, *csd_next; static bool warned; WARN_ON(!irqs_disabled()); @@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data *csd; - struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; + call_single_data_t *csd; + call_single_data_t csd_stack = { + .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + }; int this_cpu; int err; @@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; @@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); csd_lock(csd); if (wait) @@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask, if (wait) { for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd; + call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); diff --git a/kernel/up.c b/kernel/up.c index ee81ac9af4ca..42c46bf3e0a5 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { unsigned long flags; -- cgit v1.2.3 From ec81048cc340bb03334e6ca62661ecc0a684897a Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Wed, 23 Aug 2017 23:25:38 +0800 Subject: sched/completion: Avoid unnecessary stack allocation for COMPLETION_INITIALIZER_ONSTACK() In theory, COMPLETION_INITIALIZER_ONSTACK() should never affect the stack allocation of the caller. However, on some compilers, a temporary structure was allocated for the return value of COMPLETION_INITIALIZER_ONSTACK(). 
For example in write_journal() with LOCKDEP_COMPLETIONS=y (GCC is 7.1.1): io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); 2462: e8 00 00 00 00 callq 2467 2467: 48 8d 85 80 fd ff ff lea -0x280(%rbp),%rax 246e: 48 c7 c6 00 00 00 00 mov $0x0,%rsi 2475: 48 c7 c2 00 00 00 00 mov $0x0,%rdx x->done = 0; 247c: c7 85 90 fd ff ff 00 movl $0x0,-0x270(%rbp) 2483: 00 00 00 init_waitqueue_head(&x->wait); 2486: 48 8d 78 18 lea 0x18(%rax),%rdi 248a: e8 00 00 00 00 callq 248f if (commit_start + commit_sections <= ic->journal_sections) { 248f: 41 8b 87 a8 00 00 00 mov 0xa8(%r15),%eax io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); 2496: 48 8d bd e8 f9 ff ff lea -0x618(%rbp),%rdi 249d: 48 8d b5 90 fd ff ff lea -0x270(%rbp),%rsi 24a4: b9 17 00 00 00 mov $0x17,%ecx 24a9: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) if (commit_start + commit_sections <= ic->journal_sections) { 24ac: 41 39 c6 cmp %eax,%r14d io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); 24af: 48 8d bd 90 fd ff ff lea -0x270(%rbp),%rdi 24b6: 48 8d b5 e8 f9 ff ff lea -0x618(%rbp),%rsi 24bd: b9 17 00 00 00 mov $0x17,%ecx 24c2: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) We can obviously see the temporary structure allocated, and the compiler also does two meaningless memcpy with "rep movsq". And according to: https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html#Statement-Exprs The return value of a statement expression is returned by value, so the temporary variable is created in COMPLETION_INITIALIZER_ONSTACK(), and that's why the temporary structures are allocted. To fix this, make the brace block in COMPLETION_INITIALIZER_ONSTACK() return a pointer and dereference it outside the block rather than return the whole structure, in this way, we are able to teach the compiler not to do the unnecessary stack allocation. This could also reduce the stack size even if !LOCKDEP, for example in write_journal(), compiled with gcc 7.1.1, the result of command: objdump -d drivers/md/dm-integrity.o | ./scripts/checkstack.pl x86 before: 0x0000246a write_journal [dm-integrity.o]: 696 after: 0x00002b7a write_journal [dm-integrity.o]: 296 Reported-by: Arnd Bergmann Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Acked-by: Arnd Bergmann Cc: Andrew Morton Cc: Byungchul Park Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: walken@google.com Cc: willy@infradead.org Link: http://lkml.kernel.org/r/20170823152542.5150-3-boqun.feng@gmail.com Signed-off-by: Ingo Molnar --- include/linux/completion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 791f053f28b7..cae5400022a3 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -74,7 +74,7 @@ static inline void complete_release_commit(struct completion *x) {} #endif #define COMPLETION_INITIALIZER_ONSTACK(work) \ - ({ init_completion(&work); work; }) + (*({ init_completion(&work); &work; })) /** * DECLARE_COMPLETION - declare and initialize a completion structure -- cgit v1.2.3 From e8f11db956aa09c1618051a7aaf367d6810d8d8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 14:42:06 +0200 Subject: libata: check for trusted computing in IDENTIFY DEVICE data ATA-8 and later mirrors the TRUSTED COMPUTING SUPPORTED bit in word 48 of the IDENTIFY DEVICE data. Check this before issuing a READ LOG PAGE command to avoid issues with buggy devices. 
The only downside is that we can't support Security Send / Receive for a device with an older revision due to the conflicting use of this field in earlier specifications. tj: The reason we need this is that some devices which don't support READ LOG PAGE lock up after being issued that command. Signed-off-by: Christoph Hellwig Tested-by: David Ahern Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 3 +++ include/linux/ata.h | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 697f5f896b19..ca57b03ab950 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2413,6 +2413,9 @@ static void ata_dev_config_trusted(struct ata_device *dev) u64 trusted_cap; unsigned int err; + if (!ata_id_has_trusted(dev->id)) + return; + if (!ata_identify_page_supported(dev, ATA_LOG_SECURITY)) { ata_dev_warn(dev, "Security Log not supported\n"); diff --git a/include/linux/ata.h b/include/linux/ata.h index e65ae4b2ed48..c7a353825450 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -60,7 +60,8 @@ enum { ATA_ID_FW_REV = 23, ATA_ID_PROD = 27, ATA_ID_MAX_MULTSECT = 47, - ATA_ID_DWORD_IO = 48, + ATA_ID_DWORD_IO = 48, /* before ATA-8 */ + ATA_ID_TRUSTED = 48, /* ATA-8 and later */ ATA_ID_CAPABILITY = 49, ATA_ID_OLD_PIO_MODES = 51, ATA_ID_OLD_DMA_MODES = 52, @@ -889,6 +890,13 @@ static inline bool ata_id_has_dword_io(const u16 *id) return id[ATA_ID_DWORD_IO] & (1 << 0); } +static inline bool ata_id_has_trusted(const u16 *id) +{ + if (ata_id_major_version(id) <= 7) + return false; + return id[ATA_ID_TRUSTED] & (1 << 0); +} + static inline bool ata_id_has_unload(const u16 *id) { if (ata_id_major_version(id) >= 7 && -- cgit v1.2.3 From 2aca392398384f8a2f339f4c661a1699f4f2e2eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Aug 2017 08:36:58 -0700 Subject: Revert "libata: quirk read log on no-name M.2 SSD" This reverts commit 35f0b6a779b8b7a98faefd7c1c660b4dac9a5c26. We now conditionalize issuing of READ LOG PAGE on the TRUSTED COMPUTING SUPPORTED bit in the identity data and this shouldn't be necessary.
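Taken together, the two libata changes leave a probe path along these lines (an editorial sketch condensed from the two patches' diffs; it is not code from either commit):

/*
 * Sketch of the resulting probe flow: the TRUSTED COMPUTING bit in
 * word 48 of the IDENTIFY DEVICE data now gates READ LOG PAGE, so
 * devices that lock up on that command are never sent it and the
 * ATA_HORKAGE_NO_LOG_PAGE quirk can be dropped.
 */
static void sketch_config_trusted(struct ata_device *dev)
{
	/* Word 48 carries the TRUSTED bit only from ATA-8 onwards. */
	if (!ata_id_has_trusted(dev->id))
		return;

	/* Only now may a READ LOG PAGE be issued to probe further. */
	if (!ata_identify_page_supported(dev, ATA_LOG_SECURITY)) {
		ata_dev_warn(dev, "Security Log not supported\n");
		return;
	}
	/* ... read trusted_cap from ATA_LOG_SECURITY ... */
}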
Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 4 ---- include/linux/libata.h | 1 - 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ca57b03ab950..1945a8ea2099 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2079,8 +2079,6 @@ unsigned int ata_read_log_page(struct ata_device *dev, u8 log, */ if (ap_flags & ATA_FLAG_NO_LOG_PAGE) return AC_ERR_DEV; - if (dev->horkage & ATA_HORKAGE_NO_LOG_PAGE) - return AC_ERR_DEV; retry: ata_tf_init(dev, &tf); @@ -4583,8 +4581,6 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, { "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, - - { "M.2 (S42) 3ME3", NULL, ATA_HORKAGE_NO_LOG_PAGE }, /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 9e927ae7fced..931c32f1f18d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -438,7 +438,6 @@ enum { ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ - ATA_HORKAGE_NO_LOG_PAGE = (1 << 26), /* Doesn't like Get Log Page */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 2d2a2b8c080ad8feab7ca87769dedb3c7a83a375 Mon Sep 17 00:00:00 2001 From: Lothar Waßmann Date: Tue, 29 Aug 2017 12:17:13 +0200 Subject: mtd: nand: complain loudly when chip->bits_per_cell is not correctly initialized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit chip->bits_per_cell, which is used to determine the NAND cell type (SLC/MLC), should always have a value != 0. Complain loudly if the value is 0 in nand_is_slc() to catch use before correct initialization. Signed-off-by: Lothar Waßmann Signed-off-by: Boris Brezillon --- include/linux/mtd/rawnand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 8fb488d586d6..c433f093766a 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1234,6 +1234,8 @@ int onfi_init_data_interface(struct nand_chip *chip, */ static inline bool nand_is_slc(struct nand_chip *chip) { + WARN(chip->bits_per_cell == 0, + "chip->bits_per_cell is used uninitialized\n"); return chip->bits_per_cell == 1; } -- cgit v1.2.3 From c31e5a4876cdfbfcc28c4fb201ad872c65671067 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 29 Aug 2017 16:37:40 +0200 Subject: xdp: remove redundant argument to trace_xdp_redirect Supplying the action argument XDP_REDIRECT to the tracepoint xdp_redirect is redundant as it is only called in case this action was specified. Remove the argument, but keep the "act" member of the tracepoint struct and populate it with XDP_REDIRECT. This makes it easier to write a common bpf_prog processing events. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/trace/events/xdp.h | 6 +++--- net/core/filter.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 27cf2ef35f5f..f684f3b36bca 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -52,10 +52,10 @@ TRACE_EVENT(xdp_exception, TRACE_EVENT(xdp_redirect, TP_PROTO(const struct net_device *dev, - const struct bpf_prog *xdp, u32 act, + const struct bpf_prog *xdp, int to_index, int err), - TP_ARGS(dev, xdp, act, to_index, err), + TP_ARGS(dev, xdp, to_index, err), TP_STRUCT__entry( __array(u8, prog_tag, 8) @@ -68,7 +68,7 @@ TRACE_EVENT(xdp_redirect, TP_fast_assign( BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); - __entry->act = act; + __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->to_index = to_index; __entry->err = err; diff --git a/net/core/filter.c b/net/core/filter.c index 4bcd6baa80c9..de31fb684ad4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2524,7 +2524,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (likely(!err)) ri->map_to_flush = map; out: - trace_xdp_redirect(dev, xdp_prog, XDP_REDIRECT, index, err); + trace_xdp_redirect(dev, xdp_prog, index, err); return err; } @@ -2548,7 +2548,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = __bpf_tx_xdp(fwd, NULL, xdp, 0); out: - trace_xdp_redirect(dev, xdp_prog, XDP_REDIRECT, index, err); + trace_xdp_redirect(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -2582,7 +2582,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; out: - trace_xdp_redirect(dev, xdp_prog, XDP_REDIRECT, index, err); + trace_xdp_redirect(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From 8d3b778ff544b369f0847e6c15f3e73057298aa4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 29 Aug 2017 16:37:45 +0200 Subject: xdp: tracepoint xdp_redirect also need a map argument To make sense of the map index, the tracepoint user also needs to know which map we are talking about. Supply the map pointer but only expose the map->id. The 'to_index' is renamed 'to_ifindex'. In the xdp_redirect_map case, this is the result of the devmap lookup. The map lookup key is exposed as map_index, which is needed to troubleshoot in case the lookup failed. The 'to_ifindex' is placed after 'err' to keep TP_STRUCT as common as possible. This also keeps the TP_STRUCT similar enough that userspace can write a monitor program that doesn't need to care about whether bpf_redirect or bpf_redirect_map were used. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/trace/events/xdp.h | 38 ++++++++++++++++++++++++++++++-------- net/core/filter.c | 6 +++--- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index f684f3b36bca..573dcfa1aeaa 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -49,20 +49,23 @@ TRACE_EVENT(xdp_exception, __entry->ifindex) ); -TRACE_EVENT(xdp_redirect, +DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, - int to_index, int err), + int to_ifindex, int err, + const struct bpf_map *map, u32 map_index), - TP_ARGS(dev, xdp, to_index, err), + TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), TP_STRUCT__entry( __array(u8, prog_tag, 8) __field(u32, act) __field(int, ifindex) - __field(int, to_index) __field(int, err) + __field(int, to_ifindex) + __field(u32, map_id) + __field(int, map_index) ), TP_fast_assign( @@ -70,16 +73,35 @@ TRACE_EVENT(xdp_redirect, memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; - __entry->to_index = to_index; __entry->err = err; + __entry->to_ifindex = to_ifindex; + __entry->map_id = map ? map->id : 0; + __entry->map_index = map_index; ), - TP_printk("prog=%s action=%s ifindex=%d to_index=%d err=%d", + TP_printk("prog=%s action=%s ifindex=%d to_ifindex=%d err=%d" + " map_id=%d map_index=%d", __print_hex_str(__entry->prog_tag, 8), __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), - __entry->ifindex, __entry->to_index, - __entry->err) + __entry->ifindex, __entry->to_ifindex, + __entry->err, + __entry->map_id, __entry->map_index) ); + +DEFINE_EVENT(xdp_redirect_template, xdp_redirect, + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, + int to_ifindex, int err, + const struct bpf_map *map, u32 map_index), + TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) +); + +#define _trace_xdp_redirect(dev, xdp, to, err) \ + trace_xdp_redirect(dev, xdp, to, err, NULL, 0); + +#define trace_xdp_redirect_map(dev, xdp, fwd, err, map, idx) \ + trace_xdp_redirect(dev, xdp, fwd ? fwd->ifindex : 0, err, map, idx); + #endif /* _TRACE_XDP_H */ #include diff --git a/net/core/filter.c b/net/core/filter.c index de31fb684ad4..31eab77cc842 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2524,7 +2524,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (likely(!err)) ri->map_to_flush = map; out: - trace_xdp_redirect(dev, xdp_prog, index, err); + trace_xdp_redirect_map(dev, xdp_prog, fwd, err, map, index); return err; } @@ -2548,7 +2548,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = __bpf_tx_xdp(fwd, NULL, xdp, 0); out: - trace_xdp_redirect(dev, xdp_prog, index, err); + _trace_xdp_redirect(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -2582,7 +2582,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; out: - trace_xdp_redirect(dev, xdp_prog, index, err); + _trace_xdp_redirect(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From b06337dfdb16bc3f668326b6a618c472c671182a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 29 Aug 2017 16:37:51 +0200 Subject: xdp: make xdp tracepoints report bpf prog id instead of prog_tag Given that the previous patch exposed the map_id, it seems natural to also report the bpf prog id.
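As a hedged aside on consuming this field: a userspace monitor that reads prog_id from these tracepoints can resolve the id back to a program handle with the bpf(2) BPF_PROG_GET_FD_BY_ID command; the helper below is an illustrative sketch, not part of the patch.

/*
 * Illustrative only: turn a prog_id reported by the xdp tracepoints
 * into a bpf program fd. Requires CAP_SYS_ADMIN; returns -1 and sets
 * errno on failure.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int prog_fd_from_id(__u32 prog_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_id = prog_id;

	return syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
}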
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/xdp.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 573dcfa1aeaa..89eba564e199 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -31,20 +31,19 @@ TRACE_EVENT(xdp_exception, TP_ARGS(dev, xdp, act), TP_STRUCT__entry( - __array(u8, prog_tag, 8) + __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); - memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); + __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), - TP_printk("prog=%s action=%s ifindex=%d", - __print_hex_str(__entry->prog_tag, 8), + TP_printk("prog_id=%d action=%s ifindex=%d", + __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); @@ -59,7 +58,7 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), TP_STRUCT__entry( - __array(u8, prog_tag, 8) + __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) @@ -69,8 +68,7 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, ), TP_fast_assign( - BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(xdp->tag)); - memcpy(__entry->prog_tag, xdp->tag, sizeof(xdp->tag)); + __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; @@ -79,9 +77,9 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, __entry->map_index = map_index; ), - TP_printk("prog=%s action=%s ifindex=%d to_ifindex=%d err=%d" + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", - __print_hex_str(__entry->prog_tag, 8), + __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) -- cgit v1.2.3 From f5836ca5e9867fa6ab88cadb9873af56d9ceb589 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 29 Aug 2017 16:37:56 +0200 Subject: xdp: separate xdp_redirect tracepoint in error case There is a need to separate the xdp_redirect tracepoint into two tracepoints, for separating the error case from the normal forward case. Due to the extreme speeds XDP is operating at, loading a tracepoint has a measurable impact. Single core XDP REDIRECT (ethtool tuned rx-usecs 25) can do 13.7 Mpps forwarding, but loading a simple bpf_prog at the tracepoint (with a return 0) reduces perf to 10.2 Mpps (CPU E5-1650 v4 @ 3.60GHz, driver: ixgbe). The overhead of loading a bpf-based tracepoint can be calculated to cost 25 nanosec ((1/13782002-1/10267937)*10^9 = -24.83 ns). Using perf record on the tracepoint event, with a non-matching --filter expression, the overhead is much larger. Performance drops to 8.3 Mpps, cost 48 nanosec ((1/13782002-1/8312497)*10^9 = -47.74). Having a separate tracepoint for err cases, which should be less frequent, allows running a continuous monitor for errors while not affecting the redirect forward performance (this has also been verified by measurements). Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/trace/events/xdp.h | 22 ++++++++++++++++++---- net/core/filter.c | 37 ++++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 89eba564e199..1eebad55ebd4 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -94,11 +94,25 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) ); -#define _trace_xdp_redirect(dev, xdp, to, err) \ - trace_xdp_redirect(dev, xdp, to, err, NULL, 0); +DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, + int to_ifindex, int err, + const struct bpf_map *map, u32 map_index), + TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) +); + +#define _trace_xdp_redirect(dev, xdp, to) \ + trace_xdp_redirect(dev, xdp, to, 0, NULL, 0); + +#define _trace_xdp_redirect_err(dev, xdp, to, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); + +#define trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ + trace_xdp_redirect(dev, xdp, fwd ? fwd->ifindex : 0, 0, map, idx); -#define trace_xdp_redirect_map(dev, xdp, fwd, err, map, idx) \ - trace_xdp_redirect(dev, xdp, fwd ? fwd->ifindex : 0, err, map, idx); +#define trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ + trace_xdp_redirect_err(dev, xdp, fwd ? fwd->ifindex : 0, err, map, idx); #endif /* _TRACE_XDP_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 31eab77cc842..096e78de0b97 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2515,16 +2515,20 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, fwd = __dev_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; - goto out; + goto err; } if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); err = __bpf_tx_xdp(fwd, map, xdp, index); - if (likely(!err)) - ri->map_to_flush = map; -out: - trace_xdp_redirect_map(dev, xdp_prog, fwd, err, map, index); + if (unlikely(err)) + goto err; + + ri->map_to_flush = map; + trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + return 0; +err: + trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } @@ -2543,12 +2547,17 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; if (unlikely(!fwd)) { err = -EINVAL; - goto out; + goto err; } err = __bpf_tx_xdp(fwd, NULL, xdp, 0); -out: - _trace_xdp_redirect(dev, xdp_prog, index, err); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect(dev, xdp_prog, index); + return 0; +err: + _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -2566,23 +2575,25 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, ri->ifindex = 0; if (unlikely(!fwd)) { err = -EINVAL; - goto out; + goto err; } if (unlikely(!(fwd->flags & IFF_UP))) { err = -ENETDOWN; - goto out; + goto err; } len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (skb->len > len) { err = -EMSGSIZE; - goto out; + goto err; } skb->dev = fwd; -out: - _trace_xdp_redirect(dev, xdp_prog, index, err); + _trace_xdp_redirect(dev, xdp_prog, index); + return 0; +err: + _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From 59a308967589f5b3f1f42793ab49bc2e18069769 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 29 Aug 2017 16:38:01 +0200 Subject: xdp: separate xdp_redirect 
tracepoint in map case Creating a specific xdp_redirect_map variant of the xdp tracepoints allows users to write simpler/faster BPF progs that get attached to these tracepoints. The goal is still to keep the tracepoints in xdp_redirect and xdp_redirect_map similar enough that a tool can read the top part of the TP_STRUCT and produce similar monitor statistics. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/xdp.h | 46 ++++++++++++++++++++++++++++++++++++--------- net/core/filter.c | 4 ++-- 2 files changed, 40 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 1eebad55ebd4..862575ac8da9 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -77,13 +77,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, __entry->map_index = map_index; ), - TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" - " map_id=%d map_index=%d", + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, - __entry->err, - __entry->map_id, __entry->map_index) + __entry->err) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, @@ -108,11 +106,43 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); -#define trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect(dev, xdp, fwd ? fwd->ifindex : 0, 0, map, idx); +DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map, + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, + int to_ifindex, int err, + const struct bpf_map *map, u32 map_index), + TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" + " map_id=%d map_index=%d", + __entry->prog_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->ifindex, __entry->to_ifindex, + __entry->err, + __entry->map_id, __entry->map_index) +); + +DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, + TP_PROTO(const struct net_device *dev, + const struct bpf_prog *xdp, + int to_ifindex, int err, + const struct bpf_map *map, u32 map_index), + TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), + TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" + " map_id=%d map_index=%d", + __entry->prog_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->ifindex, __entry->to_ifindex, + __entry->err, + __entry->map_id, __entry->map_index) +); + +#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ + trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \ + 0, map, idx); -#define trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_err(dev, xdp, fwd ? fwd->ifindex : 0, err, map, idx); +#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ + trace_xdp_redirect_map_err(dev, xdp, fwd ?
fwd->ifindex : 0, \ + err, map, idx); #endif /* _TRACE_XDP_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 096e78de0b97..c6a37fe0285b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2525,10 +2525,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; ri->map_to_flush = map; - trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: - trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } -- cgit v1.2.3 From f5808ac158f2b16b686a3d3c0879c5d6048aba14 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Mon, 28 Aug 2017 09:47:11 +0930 Subject: leds: gpio: Allow LED to retain state at shutdown In some systems, such as Baseboard Management Controllers (BMCs), we want to retain the state of LEDs across a reboot of the BMC (whilst the host remains up). Implement support for the retain-state-shutdown devicetree property in leds-gpio. Signed-off-by: Andrew Jeffery Acked-by: Pavel Machek Tested-by: Brandon Wyman Signed-off-by: Jacek Anaszewski --- drivers/leds/leds-gpio.c | 7 ++++++- include/linux/leds.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index e753ba93ba1e..764c31301f90 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -134,6 +134,8 @@ static int create_gpio_led(const struct gpio_led *template, led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; if (template->panic_indicator) led_dat->cdev.flags |= LED_PANIC_INDICATOR; + if (template->retain_state_shutdown) + led_dat->cdev.flags |= LED_RETAIN_AT_SHUTDOWN; ret = gpiod_direction_output(led_dat->gpiod, state); if (ret < 0) @@ -205,6 +207,8 @@ static struct gpio_leds_priv *gpio_leds_create(struct platform_device *pdev) if (fwnode_property_present(child, "retain-state-suspended")) led.retain_state_suspended = 1; + if (fwnode_property_present(child, "retain-state-shutdown")) + led.retain_state_shutdown = 1; if (fwnode_property_present(child, "panic-indicator")) led.panic_indicator = 1; @@ -267,7 +271,8 @@ static void gpio_led_shutdown(struct platform_device *pdev) for (i = 0; i < priv->num_leds; i++) { struct gpio_led_data *led = &priv->leds[i]; - gpio_led_set(&led->cdev, LED_OFF); + if (!(led->cdev.flags & LED_RETAIN_AT_SHUTDOWN)) + gpio_led_set(&led->cdev, LED_OFF); } } diff --git a/include/linux/leds.h b/include/linux/leds.h index 64c56d454f7d..bf6db4fe895b 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -49,6 +49,7 @@ struct led_classdev { #define LED_HW_PLUGGABLE (1 << 19) #define LED_PANIC_INDICATOR (1 << 20) #define LED_BRIGHT_HW_CHANGED (1 << 21) +#define LED_RETAIN_AT_SHUTDOWN (1 << 22) /* set_brightness_work / blink_timer flags, atomic, private. 
*/ unsigned long work_flags; @@ -392,6 +393,7 @@ struct gpio_led { unsigned retain_state_suspended : 1; unsigned panic_indicator : 1; unsigned default_state : 2; + unsigned retain_state_shutdown : 1; /* default_state should be one of LEDS_GPIO_DEFSTATE_(ON|OFF|KEEP) */ struct gpio_desc *gpiod; }; -- cgit v1.2.3 From a92e145059cb883155a24a2d3ac33296d33d9df7 Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Wed, 23 Aug 2017 15:17:47 -0400 Subject: drm/ttm: Add DMA map/unmap tracepoint (v3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also exports two functions that vendor drivers can call to trace DMA mappings. This is meant to help translate IOMMU mappings of bus addresses back to physical pages. Used by the umr amdgpu debugger for instance. Signed-off-by: Tom St Denis Reviewed-by: Christian König Signed-off-by: Alex Deucher (v2): Use dev_name() to get PCI path instead. (v3): Use correct types for dma/phys addresses --- drivers/gpu/drm/ttm/Makefile | 3 +- drivers/gpu/drm/ttm/ttm_debug.c | 74 +++++++++++++++++++++++++++++ drivers/gpu/drm/ttm/ttm_trace.h | 87 +++++++++++++++++++++++++++++++++++ drivers/gpu/drm/ttm/ttm_tracepoints.c | 45 ++++++++++++++++++ include/drm/ttm/ttm_debug.h | 31 +++++++++++++ 5 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/ttm/ttm_debug.c create mode 100644 drivers/gpu/drm/ttm/ttm_trace.h create mode 100644 drivers/gpu/drm/ttm/ttm_tracepoints.c create mode 100644 include/drm/ttm/ttm_debug.h (limited to 'include') diff --git a/drivers/gpu/drm/ttm/Makefile b/drivers/gpu/drm/ttm/Makefile index 4d0c938ff4b2..a44fdfbe6351 100644 --- a/drivers/gpu/drm/ttm/Makefile +++ b/drivers/gpu/drm/ttm/Makefile @@ -1,10 +1,11 @@ # # Makefile for the drm device driver. This driver provides support for the +ccflags-y := -I$(src)/. ttm-y := ttm_memory.o ttm_tt.o ttm_bo.o \ ttm_bo_util.o ttm_bo_vm.o ttm_module.o \ ttm_object.o ttm_lock.o ttm_execbuf_util.o ttm_page_alloc.o \ - ttm_bo_manager.o ttm_page_alloc_dma.o + ttm_bo_manager.o ttm_page_alloc_dma.o ttm_debug.o ttm_tracepoints.o ttm-$(CONFIG_AGP) += ttm_agp_backend.o obj-$(CONFIG_DRM_TTM) += ttm.o diff --git a/drivers/gpu/drm/ttm/ttm_debug.c b/drivers/gpu/drm/ttm/ttm_debug.c new file mode 100644 index 000000000000..ef5f0d090154 --- /dev/null +++ b/drivers/gpu/drm/ttm/ttm_debug.c @@ -0,0 +1,74 @@ +/************************************************************************** + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +/* + * Authors: Tom St Denis + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ttm_trace.h" + +void ttm_trace_dma_map(struct device *dev, struct ttm_dma_tt *tt) +{ + unsigned i; + + if (unlikely(trace_ttm_dma_map_enabled())) { + for (i = 0; i < tt->ttm.num_pages; i++) { + trace_ttm_dma_map( + dev, + tt->ttm.pages[i], + tt->dma_address[i]); + } + } +} +EXPORT_SYMBOL(ttm_trace_dma_map); + +void ttm_trace_dma_unmap(struct device *dev, struct ttm_dma_tt *tt) +{ + unsigned i; + + if (unlikely(trace_ttm_dma_unmap_enabled())) { + for (i = 0; i < tt->ttm.num_pages; i++) { + trace_ttm_dma_unmap( + dev, + tt->ttm.pages[i], + tt->dma_address[i]); + } + } +} +EXPORT_SYMBOL(ttm_trace_dma_unmap); + diff --git a/drivers/gpu/drm/ttm/ttm_trace.h b/drivers/gpu/drm/ttm/ttm_trace.h new file mode 100644 index 000000000000..23279b9b8e64 --- /dev/null +++ b/drivers/gpu/drm/ttm/ttm_trace.h @@ -0,0 +1,87 @@ +/************************************************************************** + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ +/* + * Authors: Tom St Denis + */ +#if !defined(_TTM_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TTM_TRACE_H_ + +#include +#include +#include + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ttm +#define TRACE_INCLUDE_FILE ttm_trace + +TRACE_EVENT(ttm_dma_map, + TP_PROTO(struct device *dev, struct page *page, dma_addr_t dma_address), + TP_ARGS(dev, page, dma_address), + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(dma_addr_t, dma) + __field(phys_addr_t, phys) + ), + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->dma = dma_address; + __entry->phys = page_to_phys(page); + ), + TP_printk("%s: %pad => %pa", + __get_str(device), + &__entry->dma, + &__entry->phys) +); + +TRACE_EVENT(ttm_dma_unmap, + TP_PROTO(struct device *dev, struct page *page, dma_addr_t dma_address), + TP_ARGS(dev, page, dma_address), + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __field(dma_addr_t, dma) + __field(phys_addr_t, phys) + ), + TP_fast_assign( + __assign_str(device, dev_name(dev)); + __entry->dma = dma_address; + __entry->phys = page_to_phys(page); + ), + TP_printk("%s: %pad => %pa", + __get_str(device), + &__entry->dma, + &__entry->phys) +); + +#endif + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include + diff --git a/drivers/gpu/drm/ttm/ttm_tracepoints.c b/drivers/gpu/drm/ttm/ttm_tracepoints.c new file mode 100644 index 000000000000..861a6266822b --- /dev/null +++ b/drivers/gpu/drm/ttm/ttm_tracepoints.c @@ -0,0 +1,45 @@ +/************************************************************************** + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ +/* + * Authors: Tom St Denis + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "ttm_trace.h" diff --git a/include/drm/ttm/ttm_debug.h b/include/drm/ttm/ttm_debug.h new file mode 100644 index 000000000000..b5e460fa5086 --- /dev/null +++ b/include/drm/ttm/ttm_debug.h @@ -0,0 +1,31 @@ +/************************************************************************** + * + * Copyright (c) 2017 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +/* + * Authors: Tom St Denis + */ +extern void ttm_trace_dma_map(struct device *dev, struct ttm_dma_tt *tt); +extern void ttm_trace_dma_unmap(struct device *dev, struct ttm_dma_tt *tt); -- cgit v1.2.3 From a4dec819c8bba6365eb893a4ca88db4dd1210110 Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Fri, 18 Aug 2017 10:04:57 -0400 Subject: drm/ttm: Add helper functions to populate/map in one call (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions replace a section of common code found in radeon/amdgpu drivers (and possibly others) as part of the ttm_tt_*populate() callbacks. 
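To illustrate the intended usage, a hypothetical driver's callbacks might reduce to the sketch below; the mydrv wrapper type is an assumption for illustration, not taken from any driver, and the helper signatures are the ones added by the diff that follows.

/*
 * Hypothetical driver glue (sketch). The wrapper embeds the DMA-aware
 * TTM object plus the struct device used for mapping, which is roughly
 * the shape radeon/amdgpu use.
 */
struct mydrv_ttm_tt {
	struct ttm_dma_tt dma_tt;	/* embeds struct ttm_tt as dma_tt.ttm */
	struct device *dev;		/* device performing the DMA mapping */
};

static int mydrv_ttm_tt_populate(struct ttm_tt *ttm)
{
	struct mydrv_ttm_tt *gtt =
		container_of(ttm, struct mydrv_ttm_tt, dma_tt.ttm);

	/* One call now populates the pages and DMA maps them. */
	return ttm_populate_and_map_pages(gtt->dev, &gtt->dma_tt);
}

static void mydrv_ttm_tt_unpopulate(struct ttm_tt *ttm)
{
	struct mydrv_ttm_tt *gtt =
		container_of(ttm, struct mydrv_ttm_tt, dma_tt.ttm);

	/* Symmetric teardown: DMA unmap, then free the pages. */
	ttm_unmap_and_unpopulate_pages(gtt->dev, &gtt->dma_tt);
}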
v2: squash in fix for sw iommu from Tom Signed-off-by: Tom St Denis Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/ttm/ttm_page_alloc.c | 41 ++++++++++++++++++++++++++++++++++++ include/drm/ttm/ttm_page_alloc.h | 21 ++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c index 871599826773..6a660d196d87 100644 --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c @@ -920,6 +920,47 @@ void ttm_pool_unpopulate(struct ttm_tt *ttm) } EXPORT_SYMBOL(ttm_pool_unpopulate); +int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt) +{ + unsigned i; + int r; + + r = ttm_pool_populate(&tt->ttm); + if (r) + return r; + + for (i = 0; i < tt->ttm.num_pages; i++) { + tt->dma_address[i] = dma_map_page(dev, tt->ttm.pages[i], + 0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, tt->dma_address[i])) { + while (i--) { + dma_unmap_page(dev, tt->dma_address[i], + PAGE_SIZE, DMA_BIDIRECTIONAL); + tt->dma_address[i] = 0; + } + ttm_pool_unpopulate(&tt->ttm); + return -EFAULT; + } + } + return 0; +} +EXPORT_SYMBOL(ttm_populate_and_map_pages); + +void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt) +{ + unsigned i; + + for (i = 0; i < tt->ttm.num_pages; i++) { + if (tt->dma_address[i]) { + dma_unmap_page(dev, tt->dma_address[i], + PAGE_SIZE, DMA_BIDIRECTIONAL); + } + } + ttm_pool_unpopulate(&tt->ttm); +} +EXPORT_SYMBOL(ttm_unmap_and_unpopulate_pages); + int ttm_page_alloc_debugfs(struct seq_file *m, void *data) { struct ttm_page_pool *p; diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h index 49a828425fa2..bf21166f2b97 100644 --- a/include/drm/ttm/ttm_page_alloc.h +++ b/include/drm/ttm/ttm_page_alloc.h @@ -83,6 +83,17 @@ extern int ttm_dma_page_alloc_debugfs(struct seq_file *m, void *data); extern int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev); extern void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev); + +/** + * Populates and DMA maps pages to fulfill a ttm_dma_populate() request + */ +int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt); + +/** + * Unpopulates and DMA unmaps pages as part of a + * ttm_dma_unpopulate() request */ +void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt); + #else static inline int ttm_dma_page_alloc_init(struct ttm_mem_global *glob, unsigned max_pages) @@ -105,6 +116,16 @@ static inline void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev) { } + +static inline int ttm_populate_and_map_pages(struct device *dev, struct ttm_dma_tt *tt) +{ + return -ENOMEM; +} + +static inline void ttm_unmap_and_unpopulate_pages(struct device *dev, struct ttm_dma_tt *tt) +{ +} + #endif #endif -- cgit v1.2.3 From 96bec198352799794b0f8937620e811ef8b9fa22 Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Thu, 24 Aug 2017 06:46:39 -0400 Subject: drm/ttm: Remove needless 'extern' on functions in header. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor tidy up.
Signed-off-by: Tom St Denis Reviewed-by: Christian König Signed-off-by: Alex Deucher --- include/drm/ttm/ttm_page_alloc.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/drm/ttm/ttm_page_alloc.h b/include/drm/ttm/ttm_page_alloc.h index bf21166f2b97..38a2b4770c35 100644 --- a/include/drm/ttm/ttm_page_alloc.h +++ b/include/drm/ttm/ttm_page_alloc.h @@ -47,7 +47,7 @@ void ttm_page_alloc_fini(void); * * Add backing pages to all of @ttm */ -extern int ttm_pool_populate(struct ttm_tt *ttm); +int ttm_pool_populate(struct ttm_tt *ttm); /** * ttm_pool_unpopulate: @@ -56,12 +56,12 @@ extern int ttm_pool_populate(struct ttm_tt *ttm); * * Free all pages of @ttm */ -extern void ttm_pool_unpopulate(struct ttm_tt *ttm); +void ttm_pool_unpopulate(struct ttm_tt *ttm); /** * Output the state of pools to debugfs file */ -extern int ttm_page_alloc_debugfs(struct seq_file *m, void *data); +int ttm_page_alloc_debugfs(struct seq_file *m, void *data); #if defined(CONFIG_SWIOTLB) || defined(CONFIG_INTEL_IOMMU) @@ -78,10 +78,10 @@ void ttm_dma_page_alloc_fini(void); /** * Output the state of pools to debugfs file */ -extern int ttm_dma_page_alloc_debugfs(struct seq_file *m, void *data); +int ttm_dma_page_alloc_debugfs(struct seq_file *m, void *data); -extern int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev); -extern void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev); +int ttm_dma_populate(struct ttm_dma_tt *ttm_dma, struct device *dev); +void ttm_dma_unpopulate(struct ttm_dma_tt *ttm_dma, struct device *dev); /** -- cgit v1.2.3 From 52c9285d47459cf241e144c7d8ef15941ba1b181 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Fri, 18 Aug 2017 20:27:56 +0530 Subject: PCI: endpoint: Add support for configurable page size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci-epc-mem uses a page size equal to *PAGE_SIZE* (usually 4KB) to manage the address space. However, certain platforms like TI's K2G restrict this address space to windows of 1MB, 2MB, 4MB, or 8MB (Ref: 11.14.4.9.1 Outbound Address Translation in K2G TRM SPRUHY8F January 2016 – Revised May 2017). Add support to handle different page sizes here. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/pci-epc-mem.c | 59 ++++++++++++++++++++++++++++++++------ include/linux/pci-epc.h | 8 +++++- 2 files changed, 57 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/pci/endpoint/pci-epc-mem.c b/drivers/pci/endpoint/pci-epc-mem.c index 3a94cc1caf22..83b7d5d3fc3e 100644 --- a/drivers/pci/endpoint/pci-epc-mem.c +++ b/drivers/pci/endpoint/pci-epc-mem.c @@ -24,21 +24,54 @@ #include /** - * pci_epc_mem_init() - initialize the pci_epc_mem structure + * pci_epc_mem_get_order() - determine the allocation order of a memory size + * @mem: address space of the endpoint controller + * @size: the size for which to get the order + * + * Reimplement get_order() for mem->page_size since the generic get_order + * always gets order with a constant PAGE_SIZE.
+ */ +static int pci_epc_mem_get_order(struct pci_epc_mem *mem, size_t size) +{ + int order; + unsigned int page_shift = ilog2(mem->page_size); + + size--; + size >>= page_shift; +#if BITS_PER_LONG == 32 + order = fls(size); +#else + order = fls64(size); +#endif + return order; +} + +/** + * __pci_epc_mem_init() - initialize the pci_epc_mem structure * @epc: the EPC device that invoked pci_epc_mem_init * @phys_base: the physical address of the base * @size: the size of the address space + * @page_size: size of each page * * Invoke to initialize the pci_epc_mem structure used by the * endpoint functions to allocate mapped PCI address. */ -int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_base, size_t size) +int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_base, size_t size, + size_t page_size) { int ret; struct pci_epc_mem *mem; unsigned long *bitmap; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + unsigned int page_shift; + int pages; + int bitmap_size; + + if (page_size < PAGE_SIZE) + page_size = PAGE_SIZE; + + page_shift = ilog2(page_size); + pages = size >> page_shift; + bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) { @@ -54,6 +87,7 @@ int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_base, size_t size) mem->bitmap = bitmap; mem->phys_base = phys_base; + mem->page_size = page_size; mem->pages = pages; mem->size = size; @@ -67,7 +101,7 @@ err_mem: err: return ret; } -EXPORT_SYMBOL_GPL(pci_epc_mem_init); +EXPORT_SYMBOL_GPL(__pci_epc_mem_init); /** * pci_epc_mem_exit() - cleanup the pci_epc_mem structure @@ -101,13 +135,17 @@ void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, int pageno; void __iomem *virt_addr; struct pci_epc_mem *mem = epc->mem; - int order = get_order(size); + unsigned int page_shift = ilog2(mem->page_size); + int order; + + size = ALIGN(size, mem->page_size); + order = pci_epc_mem_get_order(mem, size); pageno = bitmap_find_free_region(mem->bitmap, mem->pages, order); if (pageno < 0) return NULL; - *phys_addr = mem->phys_base + (pageno << PAGE_SHIFT); + *phys_addr = mem->phys_base + (pageno << page_shift); virt_addr = ioremap(*phys_addr, size); if (!virt_addr) bitmap_release_region(mem->bitmap, pageno, order); @@ -129,11 +167,14 @@ void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, void __iomem *virt_addr, size_t size) { int pageno; - int order = get_order(size); struct pci_epc_mem *mem = epc->mem; + unsigned int page_shift = ilog2(mem->page_size); + int order; iounmap(virt_addr); - pageno = (phys_addr - mem->phys_base) >> PAGE_SHIFT; + pageno = (phys_addr - mem->phys_base) >> page_shift; + size = ALIGN(size, mem->page_size); + order = pci_epc_mem_get_order(mem, size); bitmap_release_region(mem->bitmap, pageno, order); } EXPORT_SYMBOL_GPL(pci_epc_mem_free_addr); diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index af5edbf3eea3..f7a04e1af112 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -62,11 +62,13 @@ struct pci_epc_ops { * @size: the size of the PCI address space * @bitmap: bitmap to manage the PCI address space * @pages: number of bits representing the address region + * @page_size: size of each page */ struct pci_epc_mem { phys_addr_t phys_base; size_t size; unsigned long *bitmap; + size_t page_size; int pages; }; @@ -98,6 +100,9 @@ struct pci_epc { #define devm_pci_epc_create(dev, ops) \ __devm_pci_epc_create((dev), (ops), THIS_MODULE) +#define pci_epc_mem_init(epc, 
phys_addr, size) \ + __pci_epc_mem_init((epc), (phys_addr), (size), PAGE_SIZE) + static inline void epc_set_drvdata(struct pci_epc *epc, void *data) { dev_set_drvdata(&epc->dev, data); @@ -135,7 +140,8 @@ void pci_epc_stop(struct pci_epc *epc); struct pci_epc *pci_epc_get(const char *epc_name); void pci_epc_put(struct pci_epc *epc); -int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size); +int __pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size, + size_t page_size); void pci_epc_mem_exit(struct pci_epc *epc); void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, phys_addr_t *phys_addr, size_t size); -- cgit v1.2.3 From 9de7e14a1a9c6bc4f9be6ccd9b951341a80dbd52 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:38 +0200 Subject: drbd: new disk-option disable-write-same Some backend devices claim to support write-same, but would fail actual write-same requests. Allow to set (or toggle) whether or not DRBD tries to support write-same. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 15 ++++++++++++--- include/linux/drbd_genl.h | 3 ++- include/linux/drbd_limits.h | 8 +++++++- 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index ad0fcb43e45c..c383b6cf272a 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1236,12 +1236,18 @@ static void fixup_discard_if_not_supported(struct request_queue *q) static void decide_on_write_same_support(struct drbd_device *device, struct request_queue *q, - struct request_queue *b, struct o_qlim *o) + struct request_queue *b, struct o_qlim *o, + bool disable_write_same) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device->connection; bool can_do = b ? b->limits.max_write_same_sectors : true; + if (can_do && disable_write_same) { + can_do = false; + drbd_info(peer_device, "WRITE_SAME disabled by config\n"); + } + if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) { can_do = false; drbd_info(peer_device, "peer does not support WRITE_SAME\n"); @@ -1302,6 +1308,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi struct request_queue *b = NULL; struct disk_conf *dc; bool discard_zeroes_if_aligned = true; + bool disable_write_same = false; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; @@ -1311,6 +1318,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi dc = rcu_dereference(device->ldev->disk_conf); max_segments = dc->max_bio_bvecs; discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; + disable_write_same = dc->disable_write_same; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); @@ -1321,7 +1329,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi blk_queue_max_segments(q, max_segments ? 
max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_SIZE-1); decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); - decide_on_write_same_support(device, q, b, o); + decide_on_write_same_support(device, q, b, o, disable_write_same); if (b) { blk_queue_stack_limits(q, b); @@ -1612,7 +1620,8 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (write_ordering_changed(old_disk_conf, new_disk_conf)) drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); - if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned) + if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned + || old_disk_conf->disable_write_same != new_disk_conf->disable_write_same) drbd_reconsider_queue_parameters(device, device->ldev, NULL); drbd_md_sync(device); diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 2896f93808ae..4e6d4d4c7056 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -132,7 +132,8 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) - __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED) + __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF) + __flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index ddac68422a96..24ae1b9b76c7 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -209,12 +209,18 @@ #define DRBD_MD_FLUSHES_DEF 1 #define DRBD_TCP_CORK_DEF 1 #define DRBD_AL_UPDATES_DEF 1 + /* We used to ignore the discard_zeroes_data setting. * To not change established (and expected) behaviour, * by default assume that, for discard_zeroes_data=0, * we can make that an effective discard_zeroes_data=1, * if we only explicitly zero-out unaligned partial chunks. */ -#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1 +#define DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF 1 + +/* Some backends pretend to support WRITE SAME, + * but fail such requests when they are actually submitted. + * This is to tell DRBD to not even try. */ +#define DRBD_DISABLE_WRITE_SAME_DEF 0 #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 #define DRBD_ALWAYS_ASBP_DEF 0 -- cgit v1.2.3 From 365cf663b64791e341f425385c7ae152327c7009 Mon Sep 17 00:00:00 2001 From: Roland Kammerer Date: Tue, 29 Aug 2017 10:20:48 +0200 Subject: drbd: switch from kmalloc() to kmalloc_array() We had one call to kmalloc that actually allocates an array. Switch that one to the kmalloc_array() function. 
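The point of the conversion is overflow safety rather than behavior: kmalloc_array(n, size, flags) returns NULL if n * size would overflow, whereas an open-coded multiplication inside kmalloc() silently wraps and under-allocates. A minimal sketch of the pattern (illustrative only, not part of the patch below; alloc_u64_array() is a made-up name):

#include <linux/slab.h>

static u64 *alloc_u64_array(size_t n)
{
	/* open-coded size math: n * sizeof(u64) could overflow unnoticed
	 *	return kmalloc(sizeof(u64) * n, GFP_NOIO);
	 */

	/* preferred: the multiplication is overflow-checked for us */
	return kmalloc_array(n, sizeof(u64), GFP_NOIO);
}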
Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 2 +- include/linux/drbd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 4e8a543ded70..796eaf347dc0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -4126,7 +4126,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info return config_unknown_volume(connection, pi); device = peer_device->device; - p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO); if (!p_uuid) { drbd_err(device, "kmalloc of p_uuid failed\n"); return false; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 002611c85318..2d0259327721 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.7" +#define REL_VERSION "8.4.10" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.2.3 From b74fd306ef2d48781e807dd506cca818bb83d749 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 29 Aug 2017 13:16:57 -0700 Subject: bridge: fdb add and delete tracepoints A few useful tracepoints to trace bridge forwarding database updates. Signed-off-by: Roopa Prabhu Reviewed-by: Florian Fainelli Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/trace/events/bridge.h | 98 +++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_fdb.c | 7 ++++ net/core/net-traces.c | 6 +++ 3 files changed, 111 insertions(+) create mode 100644 include/trace/events/bridge.h (limited to 'include') diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h new file mode 100644 index 000000000000..0f1cde00af11 --- /dev/null +++ b/include/trace/events/bridge.h @@ -0,0 +1,98 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bridge + +#if !defined(_TRACE_BRIDGE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BRIDGE_H + +#include +#include + +#include "../../../net/bridge/br_private.h" + +TRACE_EVENT(br_fdb_add, + + TP_PROTO(struct ndmsg *ndm, struct net_device *dev, + const unsigned char *addr, u16 vid, u16 nlh_flags), + + TP_ARGS(ndm, dev, addr, vid, nlh_flags), + + TP_STRUCT__entry( + __field(u8, ndm_flags) + __string(dev, dev->name) + __array(unsigned char, addr, ETH_ALEN) + __field(u16, vid) + __field(u16, nlh_flags) + ), + + TP_fast_assign( + __assign_str(dev, dev->name); + memcpy(__entry->addr, addr, ETH_ALEN); + __entry->vid = vid; + __entry->nlh_flags = nlh_flags; + __entry->ndm_flags = ndm->ndm_flags; + ), + + TP_printk("dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u nlh_flags %04x ndm_flags %02x", + __get_str(dev), __entry->addr[0], __entry->addr[1], + __entry->addr[2], __entry->addr[3], __entry->addr[4], + __entry->addr[5], __entry->vid, + __entry->nlh_flags, __entry->ndm_flags) +); + +TRACE_EVENT(br_fdb_external_learn_add, + + TP_PROTO(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid), + + TP_ARGS(br, p, addr, vid), + + TP_STRUCT__entry( + __string(br_dev, br->dev->name) + __string(dev, p ? p->dev->name : "null") + __array(unsigned char, addr, ETH_ALEN) + __field(u16, vid) + ), + + TP_fast_assign( + __assign_str(br_dev, br->dev->name); + __assign_str(dev, p ? 
p->dev->name : "null"); + memcpy(__entry->addr, addr, ETH_ALEN); + __entry->vid = vid; + ), + + TP_printk("br_dev %s port %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", + __get_str(br_dev), __get_str(dev), __entry->addr[0], + __entry->addr[1], __entry->addr[2], __entry->addr[3], + __entry->addr[4], __entry->addr[5], __entry->vid) +); + +TRACE_EVENT(fdb_delete, + + TP_PROTO(struct net_bridge *br, struct net_bridge_fdb_entry *f), + + TP_ARGS(br, f), + + TP_STRUCT__entry( + __string(br_dev, br->dev->name) + __string(dev, f->dst ? f->dst->dev->name : "null") + __array(unsigned char, addr, ETH_ALEN) + __field(u16, vid) + ), + + TP_fast_assign( + __assign_str(br_dev, br->dev->name); + __assign_str(dev, f->dst ? f->dst->dev->name : "null"); + memcpy(__entry->addr, f->addr.addr, ETH_ALEN); + __entry->vid = f->vlan_id; + ), + + TP_printk("br_dev %s dev %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u", + __get_str(br_dev), __get_str(dev), __entry->addr[0], + __entry->addr[1], __entry->addr[2], __entry->addr[3], + __entry->addr[4], __entry->addr[5], __entry->vid) +); + +#endif /* _TRACE_BRIDGE_H */ + +/* This part must be outside protection */ +#include diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index a79b648aac88..be5e1da6d824 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "br_private.h" static struct kmem_cache *br_fdb_cache __read_mostly; @@ -171,6 +172,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) { + trace_fdb_delete(br, f); + if (f->is_static) fdb_del_hw_addr(br, f->addr.addr); @@ -870,6 +873,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge *br = NULL; int err = 0; + trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); + if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE))) { pr_info("bridge: RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; @@ -1066,6 +1071,8 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, bool modified = false; int err = 0; + trace_br_fdb_external_learn_add(br, p, addr, vid); + spin_lock_bh(&br->hash_lock); head = &br->hash[br_mac_hash(addr, vid)]; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 4f1468ccd056..4a0292c97070 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -37,6 +37,12 @@ #include EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); #endif +#if IS_ENABLED(CONFIG_BRIDGE) +#include +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); +EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); +#endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); -- cgit v1.2.3 From c73c8b1e47ca275388f774fd36560ab7e994e99e Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 28 Aug 2017 16:38:20 +0300 Subject: net/mlx4_core: Dynamically allocate structs at mlx4_slave_cap In order to avoid temporary large structs on the stack, allocate them dynamically. Signed-off-by: Eran Ben Elisha Signed-off-by: Tal Alon Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- drivers/infiniband/hw/mlx4/qp.c | 26 ++-- drivers/net/ethernet/mellanox/mlx4/fw.c | 12 +- drivers/net/ethernet/mellanox/mlx4/fw.h | 6 +- drivers/net/ethernet/mellanox/mlx4/main.c | 235 ++++++++++++++++-------------- drivers/net/ethernet/mellanox/mlx4/qp.c | 24 +-- include/linux/mlx4/device.h | 14 +- 6 files changed, 159 insertions(+), 158 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 75c0e6c5dd56..83e5c72f19b8 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -145,8 +145,8 @@ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) /* VF or PF -- proxy SQP */ if (mlx4_is_mfunc(dev->dev)) { for (i = 0; i < dev->dev->caps.num_ports; i++) { - if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] || - qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) { + if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy || + qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp1_proxy) { proxy_sqp = 1; break; } @@ -173,7 +173,7 @@ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) /* VF or PF -- proxy QP0 */ if (mlx4_is_mfunc(dev->dev)) { for (i = 0; i < dev->dev->caps.num_ports; i++) { - if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) { + if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy) { proxy_qp0 = 1; break; } @@ -614,8 +614,8 @@ static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn) { int i; for (i = 0; i < dev->caps.num_ports; i++) { - if (qpn == dev->caps.qp0_proxy[i]) - return !!dev->caps.qp0_qkey[i]; + if (qpn == dev->caps.spec_qps[i].qp0_proxy) + return !!dev->caps.spec_qps[i].qp0_qkey; } return 0; } @@ -1114,9 +1114,9 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) } /* PF or VF -- creating proxies */ if (attr->qp_type == IB_QPT_SMI) - return dev->dev->caps.qp0_proxy[attr->port_num - 1]; + return dev->dev->caps.spec_qps[attr->port_num - 1].qp0_proxy; else - return dev->dev->caps.qp1_proxy[attr->port_num - 1]; + return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy; } static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, @@ -2277,9 +2277,9 @@ static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) { int i; for (i = 0; i < dev->caps.num_ports; i++) { - if (qpn == dev->caps.qp0_proxy[i] || - qpn == dev->caps.qp0_tunnel[i]) { - *qkey = dev->caps.qp0_qkey[i]; + if (qpn == dev->caps.spec_qps[i].qp0_proxy || + qpn == dev->caps.spec_qps[i].qp0_tunnel) { + *qkey = dev->caps.spec_qps[i].qp0_qkey; return 0; } } @@ -2340,7 +2340,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn); else sqp->ud_header.bth.destination_qpn = - cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]); + cpu_to_be32(mdev->dev->caps.spec_qps[sqp->qp.port - 1].qp0_tunnel); sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); if (mlx4_is_master(mdev->dev)) { @@ -2800,9 +2800,9 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); if (qpt == MLX4_IB_QPT_PROXY_GSI) - dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]); + dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp1_tunnel); else - dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]); + dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp0_tunnel); /* Use QKEY from the QP context, which is set by master */ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } diff --git 
a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 042707623922..f76cac01d564 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -679,22 +679,22 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 gen_or_port, if (func_cap->flags1 & QUERY_FUNC_CAP_VF_ENABLE_QP0) { MLX4_GET(qkey, outbox, QUERY_FUNC_CAP_PRIV_VF_QKEY_OFFSET); - func_cap->qp0_qkey = qkey; + func_cap->spec_qps.qp0_qkey = qkey; } else { - func_cap->qp0_qkey = 0; + func_cap->spec_qps.qp0_qkey = 0; } MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_TUNNEL); - func_cap->qp0_tunnel_qpn = size & 0xFFFFFF; + func_cap->spec_qps.qp0_tunnel = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP0_PROXY); - func_cap->qp0_proxy_qpn = size & 0xFFFFFF; + func_cap->spec_qps.qp0_proxy = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_TUNNEL); - func_cap->qp1_tunnel_qpn = size & 0xFFFFFF; + func_cap->spec_qps.qp1_tunnel = size & 0xFFFFFF; MLX4_GET(size, outbox, QUERY_FUNC_CAP_QP1_PROXY); - func_cap->qp1_proxy_qpn = size & 0xFFFFFF; + func_cap->spec_qps.qp1_proxy = size & 0xFFFFFF; if (func_cap->flags1 & QUERY_FUNC_CAP_FLAGS1_NIC_INFO) MLX4_GET(func_cap->phys_port_id, outbox, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index b52ba01aa486..cd6399c76bfd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -144,11 +144,7 @@ struct mlx4_func_cap { int max_eq; int reserved_eq; int mcg_quota; - u32 qp0_qkey; - u32 qp0_tunnel_qpn; - u32 qp0_proxy_qpn; - u32 qp1_tunnel_qpn; - u32 qp1_proxy_qpn; + struct mlx4_spec_qps spec_qps; u32 reserved_lkey; u8 physical_port; u8 flags0; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 8404e165eca4..e6413a8a5f07 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -819,38 +819,93 @@ static void slave_adjust_steering_mode(struct mlx4_dev *dev, mlx4_steering_mode_str(dev->caps.steering_mode)); } +static void mlx4_slave_destroy_special_qp_cap(struct mlx4_dev *dev) +{ + kfree(dev->caps.spec_qps); + dev->caps.spec_qps = NULL; +} + +static int mlx4_slave_special_qp_cap(struct mlx4_dev *dev) +{ + struct mlx4_func_cap *func_cap = NULL; + struct mlx4_caps *caps = &dev->caps; + int i, err = 0; + + func_cap = kzalloc(sizeof(*func_cap), GFP_KERNEL); + caps->spec_qps = kcalloc(caps->num_ports, sizeof(*caps->spec_qps), GFP_KERNEL); + + if (!func_cap || !caps->spec_qps) { + mlx4_err(dev, "Failed to allocate memory for special qps cap\n"); + err = -ENOMEM; + goto err_mem; + } + + for (i = 1; i <= caps->num_ports; ++i) { + err = mlx4_QUERY_FUNC_CAP(dev, i, func_cap); + if (err) { + mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n", + i, err); + goto err_mem; + } + caps->spec_qps[i - 1] = func_cap->spec_qps; + caps->port_mask[i] = caps->port_type[i]; + caps->phys_port_id[i] = func_cap->phys_port_id; + err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, + &caps->gid_table_len[i], + &caps->pkey_table_len[i]); + if (err) { + mlx4_err(dev, "QUERY_PORT command failed for port %d, aborting (%d)\n", + i, err); + goto err_mem; + } + } + +err_mem: + if (err) + mlx4_slave_destroy_special_qp_cap(dev); + kfree(func_cap); + return err; +} + static int mlx4_slave_cap(struct mlx4_dev *dev) { int err; u32 page_size; - struct mlx4_dev_cap dev_cap; - struct mlx4_func_cap func_cap; - struct mlx4_init_hca_param hca_param; - u8 i; 
+ struct mlx4_dev_cap *dev_cap = NULL; + struct mlx4_func_cap *func_cap = NULL; + struct mlx4_init_hca_param *hca_param = NULL; + + hca_param = kzalloc(sizeof(*hca_param), GFP_KERNEL); + func_cap = kzalloc(sizeof(*func_cap), GFP_KERNEL); + dev_cap = kzalloc(sizeof(*dev_cap), GFP_KERNEL); + if (!hca_param || !func_cap || !dev_cap) { + mlx4_err(dev, "Failed to allocate memory for slave_cap\n"); + err = -ENOMEM; + goto free_mem; + } - memset(&hca_param, 0, sizeof(hca_param)); - err = mlx4_QUERY_HCA(dev, &hca_param); + err = mlx4_QUERY_HCA(dev, hca_param); if (err) { mlx4_err(dev, "QUERY_HCA command failed, aborting\n"); - return err; + goto free_mem; } /* fail if the hca has an unknown global capability * at this time global_caps should be always zeroed */ - if (hca_param.global_caps) { + if (hca_param->global_caps) { mlx4_err(dev, "Unknown hca global capabilities\n"); - return -EINVAL; + err = -EINVAL; + goto free_mem; } - dev->caps.hca_core_clock = hca_param.hca_core_clock; + dev->caps.hca_core_clock = hca_param->hca_core_clock; - memset(&dev_cap, 0, sizeof(dev_cap)); - dev->caps.max_qp_dest_rdma = 1 << hca_param.log_rd_per_qp; - err = mlx4_dev_cap(dev, &dev_cap); + dev->caps.max_qp_dest_rdma = 1 << hca_param->log_rd_per_qp; + err = mlx4_dev_cap(dev, dev_cap); if (err) { mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting\n"); - return err; + goto free_mem; } err = mlx4_QUERY_FW(dev); @@ -862,21 +917,23 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (page_size > PAGE_SIZE) { mlx4_err(dev, "HCA minimum page size of %d bigger than kernel PAGE_SIZE of %ld, aborting\n", page_size, PAGE_SIZE); - return -ENODEV; + err = -ENODEV; + goto free_mem; } /* Set uar_page_shift for VF */ - dev->uar_page_shift = hca_param.uar_page_sz + 12; + dev->uar_page_shift = hca_param->uar_page_sz + 12; /* Make sure the master uar page size is valid */ if (dev->uar_page_shift > PAGE_SHIFT) { mlx4_err(dev, "Invalid configuration: uar page size is larger than system page size\n"); - return -ENODEV; + err = -ENODEV; + goto free_mem; } /* Set reserved_uars based on the uar_page_shift */ - mlx4_set_num_reserved_uars(dev, &dev_cap); + mlx4_set_num_reserved_uars(dev, dev_cap); /* Although uar page size in FW differs from system page size, * upper software layers (mlx4_ib, mlx4_en and part of mlx4_core) @@ -884,34 +941,35 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) */ dev->caps.uar_page_size = PAGE_SIZE; - memset(&func_cap, 0, sizeof(func_cap)); - err = mlx4_QUERY_FUNC_CAP(dev, 0, &func_cap); + err = mlx4_QUERY_FUNC_CAP(dev, 0, func_cap); if (err) { mlx4_err(dev, "QUERY_FUNC_CAP general command failed, aborting (%d)\n", err); - return err; + goto free_mem; } - if ((func_cap.pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != + if ((func_cap->pf_context_behaviour | PF_CONTEXT_BEHAVIOUR_MASK) != PF_CONTEXT_BEHAVIOUR_MASK) { mlx4_err(dev, "Unknown pf context behaviour %x known flags %x\n", - func_cap.pf_context_behaviour, PF_CONTEXT_BEHAVIOUR_MASK); - return -EINVAL; - } - - dev->caps.num_ports = func_cap.num_ports; - dev->quotas.qp = func_cap.qp_quota; - dev->quotas.srq = func_cap.srq_quota; - dev->quotas.cq = func_cap.cq_quota; - dev->quotas.mpt = func_cap.mpt_quota; - dev->quotas.mtt = func_cap.mtt_quota; - dev->caps.num_qps = 1 << hca_param.log_num_qps; - dev->caps.num_srqs = 1 << hca_param.log_num_srqs; - dev->caps.num_cqs = 1 << hca_param.log_num_cqs; - dev->caps.num_mpts = 1 << hca_param.log_mpt_sz; - dev->caps.num_eqs = func_cap.max_eq; - dev->caps.reserved_eqs = func_cap.reserved_eq; - 
dev->caps.reserved_lkey = func_cap.reserved_lkey; + func_cap->pf_context_behaviour, + PF_CONTEXT_BEHAVIOUR_MASK); + err = -EINVAL; + goto free_mem; + } + + dev->caps.num_ports = func_cap->num_ports; + dev->quotas.qp = func_cap->qp_quota; + dev->quotas.srq = func_cap->srq_quota; + dev->quotas.cq = func_cap->cq_quota; + dev->quotas.mpt = func_cap->mpt_quota; + dev->quotas.mtt = func_cap->mtt_quota; + dev->caps.num_qps = 1 << hca_param->log_num_qps; + dev->caps.num_srqs = 1 << hca_param->log_num_srqs; + dev->caps.num_cqs = 1 << hca_param->log_num_cqs; + dev->caps.num_mpts = 1 << hca_param->log_mpt_sz; + dev->caps.num_eqs = func_cap->max_eq; + dev->caps.reserved_eqs = func_cap->reserved_eq; + dev->caps.reserved_lkey = func_cap->reserved_lkey; dev->caps.num_pds = MLX4_NUM_PDS; dev->caps.num_mgms = 0; dev->caps.num_amgms = 0; @@ -924,38 +982,10 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) mlx4_replace_zero_macs(dev); - dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - - if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || - !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy || - !dev->caps.qp0_qkey) { - err = -ENOMEM; - goto err_mem; - } - - for (i = 1; i <= dev->caps.num_ports; ++i) { - err = mlx4_QUERY_FUNC_CAP(dev, i, &func_cap); - if (err) { - mlx4_err(dev, "QUERY_FUNC_CAP port command failed for port %d, aborting (%d)\n", - i, err); - goto err_mem; - } - dev->caps.qp0_qkey[i - 1] = func_cap.qp0_qkey; - dev->caps.qp0_tunnel[i - 1] = func_cap.qp0_tunnel_qpn; - dev->caps.qp0_proxy[i - 1] = func_cap.qp0_proxy_qpn; - dev->caps.qp1_tunnel[i - 1] = func_cap.qp1_tunnel_qpn; - dev->caps.qp1_proxy[i - 1] = func_cap.qp1_proxy_qpn; - dev->caps.port_mask[i] = dev->caps.port_type[i]; - dev->caps.phys_port_id[i] = func_cap.phys_port_id; - err = mlx4_get_slave_pkey_gid_tbl_len(dev, i, - &dev->caps.gid_table_len[i], - &dev->caps.pkey_table_len[i]); - if (err) - goto err_mem; + err = mlx4_slave_special_qp_cap(dev); + if (err) { + mlx4_err(dev, "Set special QP caps failed. 
aborting\n"); + goto free_mem; } if (dev->caps.uar_page_size * (dev->caps.num_uars - @@ -970,7 +1000,7 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) goto err_mem; } - if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_64B_EQE_ENABLED) { dev->caps.eqe_size = 64; dev->caps.eqe_factor = 1; } else { @@ -978,20 +1008,20 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) dev->caps.eqe_factor = 0; } - if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_64B_CQE_ENABLED) { dev->caps.cqe_size = 64; dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } else { dev->caps.cqe_size = 32; } - if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { - dev->caps.eqe_size = hca_param.eqe_size; + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_EQE_STRIDE_ENABLED) { + dev->caps.eqe_size = hca_param->eqe_size; dev->caps.eqe_factor = 0; } - if (hca_param.dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { - dev->caps.cqe_size = hca_param.cqe_size; + if (hca_param->dev_cap_enabled & MLX4_DEV_CAP_CQE_STRIDE_ENABLED) { + dev->caps.cqe_size = hca_param->cqe_size; /* User still need to know when CQE > 32B */ dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } @@ -999,31 +1029,24 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_TS; mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); - slave_adjust_steering_mode(dev, &dev_cap, &hca_param); + slave_adjust_steering_mode(dev, dev_cap, hca_param); mlx4_dbg(dev, "RSS support for IP fragments is %s\n", - hca_param.rss_ip_frags ? "on" : "off"); + hca_param->rss_ip_frags ? "on" : "off"); - if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP && + if (func_cap->extra_flags & MLX4_QUERY_FUNC_FLAGS_BF_RES_QP && dev->caps.bf_reg_size) dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_ETH_BF_QP; - if (func_cap.extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP) + if (func_cap->extra_flags & MLX4_QUERY_FUNC_FLAGS_A0_RES_QP) dev->caps.alloc_res_qp_mask |= MLX4_RESERVE_A0_QP; - return 0; - err_mem: - kfree(dev->caps.qp0_qkey); - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - kfree(dev->caps.qp1_proxy); - dev->caps.qp0_qkey = NULL; - dev->caps.qp0_tunnel = NULL; - dev->caps.qp0_proxy = NULL; - dev->caps.qp1_tunnel = NULL; - dev->caps.qp1_proxy = NULL; - + if (err) + mlx4_slave_destroy_special_qp_cap(dev); +free_mem: + kfree(hca_param); + kfree(func_cap); + kfree(dev_cap); return err; } @@ -2407,13 +2430,8 @@ unmap_bf: unmap_internal_clock(dev); unmap_bf_area(dev); - if (mlx4_is_slave(dev)) { - kfree(dev->caps.qp0_qkey); - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - kfree(dev->caps.qp1_proxy); - } + if (mlx4_is_slave(dev)) + mlx4_slave_destroy_special_qp_cap(dev); err_close: if (mlx4_is_slave(dev)) @@ -3596,13 +3614,8 @@ err_master_mfunc: mlx4_multi_func_cleanup(dev); } - if (mlx4_is_slave(dev)) { - kfree(dev->caps.qp0_qkey); - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - kfree(dev->caps.qp1_proxy); - } + if (mlx4_is_slave(dev)) + mlx4_slave_destroy_special_qp_cap(dev); err_close: mlx4_close_hca(dev); @@ -3942,11 +3955,7 @@ static void mlx4_unload_one(struct pci_dev *pdev) if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); - kfree(dev->caps.qp0_qkey); - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - 
kfree(dev->caps.qp1_proxy); + mlx4_slave_destroy_special_qp_cap(dev); kfree(dev->dev_vfs); mlx4_clean_dev(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 2b067763a6bc..28c441c0d31f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -844,24 +844,20 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) /* In mfunc, calculate proxy and tunnel qp offsets for the PF here, * since the PF does not call mlx4_slave_caps */ - dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp1_tunnel = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); - dev->caps.qp1_proxy = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL); + dev->caps.spec_qps = kcalloc(dev->caps.num_ports, sizeof(dev->caps.spec_qps), GFP_KERNEL); - if (!dev->caps.qp0_tunnel || !dev->caps.qp0_proxy || - !dev->caps.qp1_tunnel || !dev->caps.qp1_proxy) { + if (!dev->caps.spec_qps) { err = -ENOMEM; goto err_mem; } for (k = 0; k < dev->caps.num_ports; k++) { - dev->caps.qp0_proxy[k] = dev->phys_caps.base_proxy_sqpn + + dev->caps.spec_qps[k].qp0_proxy = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev) + k; - dev->caps.qp0_tunnel[k] = dev->caps.qp0_proxy[k] + 8 * MLX4_MFUNC_MAX; - dev->caps.qp1_proxy[k] = dev->phys_caps.base_proxy_sqpn + + dev->caps.spec_qps[k].qp0_tunnel = dev->caps.spec_qps[k].qp0_proxy + 8 * MLX4_MFUNC_MAX; + dev->caps.spec_qps[k].qp1_proxy = dev->phys_caps.base_proxy_sqpn + 8 * mlx4_master_func_num(dev) + MLX4_MAX_PORTS + k; - dev->caps.qp1_tunnel[k] = dev->caps.qp1_proxy[k] + 8 * MLX4_MFUNC_MAX; + dev->caps.spec_qps[k].qp1_tunnel = dev->caps.spec_qps[k].qp1_proxy + 8 * MLX4_MFUNC_MAX; } } @@ -873,12 +869,8 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) return err; err_mem: - kfree(dev->caps.qp0_tunnel); - kfree(dev->caps.qp0_proxy); - kfree(dev->caps.qp1_tunnel); - kfree(dev->caps.qp1_proxy); - dev->caps.qp0_tunnel = dev->caps.qp0_proxy = - dev->caps.qp1_tunnel = dev->caps.qp1_proxy = NULL; + kfree(dev->caps.spec_qps); + dev->caps.spec_qps = NULL; mlx4_cleanup_qp_zones(dev); return err; } diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index b54517c05e9a..9844606f9491 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -518,6 +518,14 @@ struct mlx4_phys_caps { u32 base_tunnel_sqpn; }; +struct mlx4_spec_qps { + u32 qp0_qkey; + u32 qp0_proxy; + u32 qp0_tunnel; + u32 qp1_proxy; + u32 qp1_tunnel; +}; + struct mlx4_caps { u64 fw_ver; u32 function; @@ -547,11 +555,7 @@ struct mlx4_caps { int max_qp_init_rdma; int max_qp_dest_rdma; int max_tc_eth; - u32 *qp0_qkey; - u32 *qp0_proxy; - u32 *qp1_proxy; - u32 *qp0_tunnel; - u32 *qp1_tunnel; + struct mlx4_spec_qps *spec_qps; int num_srqs; int max_srq_wqes; int max_srq_sge; -- cgit v1.2.3 From be59960395f86991c6599c41d8c421fe4bf7a210 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 28 Aug 2017 16:38:23 +0300 Subject: net/mlx4: Add user mac FW update support Adding support for updating the FW on new port mac, when port mac change is requested by the user. This info is required by the FW as OEM management tools require this info directly from the NIC FW. Check device capability bit to verify the FW supports user mac. If the FW does support it, use set_port command to notify the FW on the new mac. The feature is relevant only to PF port mac. 
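Condensed from the diff that follows (error handling and the set-mac caller trimmed for illustration), the flow is: gate on the capability bit the FW advertises, then push the new MAC to the FW via a SET_PORT command:

/* Abbreviated from the patch below. */
static void mlx4_en_update_user_mac(struct mlx4_en_priv *priv,
				    unsigned char new_mac[ETH_ALEN + 2])
{
	struct mlx4_en_dev *mdev = priv->mdev;

	/* older FW: user-mac capability bit absent, silently skip the update */
	if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_USER_MAC_EN))
		return;

	/* the SET_PORT general context carries the user MAC to the FW */
	if (mlx4_SET_PORT_user_mac(mdev->dev, priv->port, new_mac))
		en_err(priv, "Failed to pass user MAC to FW\n");
}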
Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 23 +++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx4/fw.c | 5 +++++ drivers/net/ethernet/mellanox/mlx4/main.c | 3 +++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 ++ drivers/net/ethernet/mellanox/mlx4/port.c | 25 +++++++++++++++++++++++++ include/linux/mlx4/device.h | 2 ++ 6 files changed, 58 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 6e67ca7aa7f5..3753943aec0f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -732,6 +732,21 @@ static int mlx4_en_replace_mac(struct mlx4_en_priv *priv, int qpn, return __mlx4_replace_mac(dev, priv->port, qpn, new_mac_u64); } +static void mlx4_en_update_user_mac(struct mlx4_en_priv *priv, + unsigned char new_mac[ETH_ALEN + 2]) +{ + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + if (!(mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_USER_MAC_EN)) + return; + + err = mlx4_SET_PORT_user_mac(mdev->dev, priv->port, new_mac); + if (err) + en_err(priv, "Failed to pass user MAC(%pM) to Firmware for port %d, with error %d\n", + new_mac, priv->port, err); +} + static int mlx4_en_do_set_mac(struct mlx4_en_priv *priv, unsigned char new_mac[ETH_ALEN + 2]) { @@ -766,8 +781,12 @@ static int mlx4_en_set_mac(struct net_device *dev, void *addr) mutex_lock(&mdev->state_lock); memcpy(new_mac, saddr->sa_data, ETH_ALEN); err = mlx4_en_do_set_mac(priv, new_mac); - if (!err) - memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN); + if (err) + goto out; + + memcpy(dev->dev_addr, saddr->sa_data, ETH_ALEN); + mlx4_en_update_user_mac(priv, new_mac); +out: mutex_unlock(&mdev->state_lock); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 10cfb058a9de..d4c7c15cb4ad 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -162,6 +162,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [35] = "Diag counters per port", [36] = "QinQ VST mode support", [37] = "sl to vl mapping table change event support", + [38] = "user MAC support", }; int i; @@ -778,6 +779,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET 0x52 #define QUERY_DEV_CAP_MAX_SG_RQ_OFFSET 0x55 #define QUERY_DEV_CAP_MAX_DESC_SZ_RQ_OFFSET 0x56 +#define QUERY_DEV_CAP_USER_MAC_EN_OFFSET 0x5C #define QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET 0x5D #define QUERY_DEV_CAP_MAX_QP_MCG_OFFSET 0x61 #define QUERY_DEV_CAP_RSVD_MCG_OFFSET 0x62 @@ -949,6 +951,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(size, outbox, QUERY_DEV_CAP_MAX_DESC_SZ_SQ_OFFSET); dev_cap->max_sq_desc_sz = size; + MLX4_GET(field, outbox, QUERY_DEV_CAP_USER_MAC_EN_OFFSET); + if (field & (1 << 2)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_USER_MAC_EN; MLX4_GET(field, outbox, QUERY_DEV_CAP_SVLAN_BY_QP_OFFSET); if (field & 0x1) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index e6413a8a5f07..c631d157b97d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1029,6 +1029,9 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) dev->caps.flags2 &= 
~MLX4_DEV_CAP_FLAG2_TS; mlx4_warn(dev, "Timestamping is not supported in slave mode\n"); + dev->caps.flags2 &= ~MLX4_DEV_CAP_FLAG2_USER_MAC_EN; + mlx4_dbg(dev, "User MAC FW update is not supported in slave mode\n"); + slave_adjust_steering_mode(dev, dev_cap, hca_param); mlx4_dbg(dev, "RSS support for IP fragments is %s\n", hca_param->rss_ip_frags ? "on" : "off"); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 706d7f21ac5c..16f1e097d33a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -807,6 +807,8 @@ struct mlx4_set_port_general_context { u8 phv_en; u8 reserved6[5]; __be16 user_mtu; + u16 reserved7; + u8 user_mac[6]; }; struct mlx4_set_port_rqp_calc_context { diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 4e36e287d605..3ef3406ff4cb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -52,6 +52,7 @@ #define MLX4_FLAG2_V_IGNORE_FCS_MASK BIT(1) #define MLX4_FLAG2_V_USER_MTU_MASK BIT(5) +#define MLX4_FLAG2_V_USER_MAC_MASK BIT(6) #define MLX4_FLAG_V_MTU_MASK BIT(0) #define MLX4_FLAG_V_PPRX_MASK BIT(1) #define MLX4_FLAG_V_PPTX_MASK BIT(2) @@ -1700,6 +1701,30 @@ int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu) } EXPORT_SYMBOL(mlx4_SET_PORT_user_mtu); +int mlx4_SET_PORT_user_mac(struct mlx4_dev *dev, u8 port, u8 *user_mac) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + context->flags2 |= MLX4_FLAG2_V_USER_MAC_MASK; + memcpy(context->user_mac, user_mac, sizeof(context->user_mac)); + + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_user_mac); + int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, u8 ignore_fcs_value) { struct mlx4_cmd_mailbox *mailbox; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 9844606f9491..89c0e7f7cd9b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -224,6 +224,7 @@ enum { MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT = 1ULL << 37, + MLX4_DEV_CAP_FLAG2_USER_MAC_EN = 1ULL << 38, }; enum { @@ -1377,6 +1378,7 @@ int mlx4_get_base_qpn(struct mlx4_dev *dev, u8 port); int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac); int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); +int mlx4_SET_PORT_user_mac(struct mlx4_dev *dev, u8 port, u8 *user_mac); int mlx4_SET_PORT_user_mtu(struct mlx4_dev *dev, u8 port, u16 user_mtu); int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, u8 promisc); -- cgit v1.2.3 From 2804fd3af6ba5ae5737705b27146455eabe2e2f8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Aug 2017 15:03:13 -0400 Subject: if_ether: add forces ife lfb type This patch adds the forces IFE lfb type according to IEEE registered ethertypes. See http://standards-oui.ieee.org/ethertype/eth.txt for more information. Since there exists the IFE subsystem it can be used there. 
This patch also uses the correct word "ForCES" instead of "FoRCES", which is a spelling error inside the IEEE ethertype specification. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index efeb1190c2ca..f68f6bf4a253 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -104,6 +104,7 @@ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is less than this value -- cgit v1.2.3 From 155e6f649757c902901e599c268f8b575ddac1f8 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 28 Aug 2017 21:43:21 +0200 Subject: ether: add NSH ethertype The NSH draft says: An IEEE EtherType, 0x894F, has been allocated for NSH. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f68f6bf4a253..61f7ccce5b69 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -99,6 +99,7 @@ #define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ #define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */ #define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */ +#define ETH_P_NSH 0x894F /* Network Service Header */ #define ETH_P_LOOPBACK 0x9000 /* Ethernet loopback packet, per IEEE 802.3 */ #define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3 From fa20e0e32cb3dfc1760b6254b64977f2fb5bd851 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 28 Aug 2017 21:43:22 +0200 Subject: vxlan: factor out VXLAN-GPE next protocol The values are shared between VXLAN-GPE and NSH. Originally probably by coincidence but I notified both working groups about this last year and they seem to keep the values in sync since then. Hopefully they'll get a single IANA registry for the values, too. (I asked them for that.) Factor out the code to be shared by the NSH implementation. NSH and MPLS values are added in this patch, too. For MPLS, the drafts incorrectly assign only a single value, while we have two MPLS ethertypes. I raised the problem with both groups. For now, I assume the value is for unicast. Signed-off-by: Jiri Benc Signed-off-by: David S.
Miller --- drivers/net/vxlan.c | 32 +++++++------------------------- include/net/tun_proto.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ include/net/vxlan.h | 6 ------ 3 files changed, 56 insertions(+), 31 deletions(-) create mode 100644 include/net/tun_proto.h (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ae3a1da703c2..d7c49cf1d5e9 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) @@ -1261,19 +1262,9 @@ static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed, if (gpe->oam_flag) return false; - switch (gpe->next_protocol) { - case VXLAN_GPE_NP_IPV4: - *protocol = htons(ETH_P_IP); - break; - case VXLAN_GPE_NP_IPV6: - *protocol = htons(ETH_P_IPV6); - break; - case VXLAN_GPE_NP_ETHERNET: - *protocol = htons(ETH_P_TEB); - break; - default: + *protocol = tun_p_to_eth_p(gpe->next_protocol); + if (!*protocol) return false; - } unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS; return true; @@ -1799,19 +1790,10 @@ static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags, struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; gpe->np_applied = 1; - - switch (protocol) { - case htons(ETH_P_IP): - gpe->next_protocol = VXLAN_GPE_NP_IPV4; - return 0; - case htons(ETH_P_IPV6): - gpe->next_protocol = VXLAN_GPE_NP_IPV6; - return 0; - case htons(ETH_P_TEB): - gpe->next_protocol = VXLAN_GPE_NP_ETHERNET; - return 0; - } - return -EPFNOSUPPORT; + gpe->next_protocol = tun_p_from_eth_p(protocol); + if (!gpe->next_protocol) + return -EPFNOSUPPORT; + return 0; } static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, diff --git a/include/net/tun_proto.h b/include/net/tun_proto.h new file mode 100644 index 000000000000..2ea3deba4c99 --- /dev/null +++ b/include/net/tun_proto.h @@ -0,0 +1,49 @@ +#ifndef __NET_TUN_PROTO_H +#define __NET_TUN_PROTO_H + +#include + +/* One byte protocol values as defined by VXLAN-GPE and NSH. These will + * hopefully get a shared IANA registry. + */ +#define TUN_P_IPV4 0x01 +#define TUN_P_IPV6 0x02 +#define TUN_P_ETHERNET 0x03 +#define TUN_P_NSH 0x04 +#define TUN_P_MPLS_UC 0x05 + +static inline __be16 tun_p_to_eth_p(u8 proto) +{ + switch (proto) { + case TUN_P_IPV4: + return htons(ETH_P_IP); + case TUN_P_IPV6: + return htons(ETH_P_IPV6); + case TUN_P_ETHERNET: + return htons(ETH_P_TEB); + case TUN_P_NSH: + return htons(ETH_P_NSH); + case TUN_P_MPLS_UC: + return htons(ETH_P_MPLS_UC); + } + return 0; +} + +static inline u8 tun_p_from_eth_p(__be16 proto) +{ + switch (proto) { + case htons(ETH_P_IP): + return TUN_P_IPV4; + case htons(ETH_P_IPV6): + return TUN_P_IPV6; + case htons(ETH_P_TEB): + return TUN_P_ETHERNET; + case htons(ETH_P_NSH): + return TUN_P_NSH; + case htons(ETH_P_MPLS_UC): + return TUN_P_MPLS_UC; + } + return 0; +} + +#endif diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 3f430e38ab82..4e3876dde295 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -168,12 +168,6 @@ reserved_flags2:2; #define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ cpu_to_be32(0xff)) -/* VXLAN-GPE header Next Protocol. 
*/ -#define VXLAN_GPE_NP_IPV4 0x01 -#define VXLAN_GPE_NP_IPV6 0x02 -#define VXLAN_GPE_NP_ETHERNET 0x03 -#define VXLAN_GPE_NP_NSH 0x04 - struct vxlan_metadata { u32 gbp; }; -- cgit v1.2.3 From 1f0b7744c50573df464ca33d8e5275be509f852b Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Mon, 28 Aug 2017 21:43:23 +0200 Subject: net: add NSH header structures and helpers NSH (Network Service Header)[1] is a new protocol for service function chaining, it can be handled as a L3 protocol like IPv4 and IPv6, Eth + NSH + Inner packet or VxLAN-gpe + NSH + Inner packet are two typical use cases. This patch adds NSH header structures and helpers for NSH GSO support and Open vSwitch NSH support. [1] https://datatracker.ietf.org/doc/draft-ietf-sfc-nsh/ [Jiri: added nsh_hdr() helper and renamed the header struct to "struct nshhdr" to match the usual pattern. Removed packet type defines, these are now shared with VXLAN-GPE.] Signed-off-by: Yi Yang Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/nsh.h | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 include/net/nsh.h (limited to 'include') diff --git a/include/net/nsh.h b/include/net/nsh.h new file mode 100644 index 000000000000..a1eaea20be96 --- /dev/null +++ b/include/net/nsh.h @@ -0,0 +1,307 @@ +#ifndef __NET_NSH_H +#define __NET_NSH_H 1 + +#include + +/* + * Network Service Header: + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Ver|O|U| TTL | Length |U|U|U|U|MD Type| Next Protocol | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Service Path Identifier (SPI) | Service Index | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | + * ~ Mandatory/Optional Context Headers ~ + * | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Version: The version field is used to ensure backward compatibility + * going forward with future NSH specification updates. It MUST be set + * to 0x0 by the sender, in this first revision of NSH. Given the + * widespread implementation of existing hardware that uses the first + * nibble after an MPLS label stack for ECMP decision processing, this + * document reserves version 01b and this value MUST NOT be used in + * future versions of the protocol. Please see [RFC7325] for further + * discussion of MPLS-related forwarding requirements. + * + * O bit: Setting this bit indicates an Operations, Administration, and + * Maintenance (OAM) packet. The actual format and processing of SFC + * OAM packets is outside the scope of this specification (see for + * example [I-D.ietf-sfc-oam-framework] for one approach). + * + * The O bit MUST be set for OAM packets and MUST NOT be set for non-OAM + * packets. The O bit MUST NOT be modified along the SFP. + * + * SF/SFF/SFC Proxy/Classifier implementations that do not support SFC + * OAM procedures SHOULD discard packets with O bit set, but MAY support + * a configurable parameter to enable forwarding received SFC OAM + * packets unmodified to the next element in the chain. Forwarding OAM + * packets unmodified by SFC elements that do not support SFC OAM + * procedures may be acceptable for a subset of OAM functions, but can + * result in unexpected outcomes for others, thus it is recommended to + * analyze the impact of forwarding an OAM packet for all OAM functions + * prior to enabling this behavior. 
The configurable parameter MUST be + * disabled by default. + * + * TTL: Indicates the maximum SFF hops for an SFP. This field is used + * for service plane loop detection. The initial TTL value SHOULD be + * configurable via the control plane; the configured initial value can + * be specific to one or more SFPs. If no initial value is explicitly + * provided, the default initial TTL value of 63 MUST be used. Each SFF + * involved in forwarding an NSH packet MUST decrement the TTL value by + * 1 prior to NSH forwarding lookup. Decrementing by 1 from an incoming + * value of 0 shall result in a TTL value of 63. The packet MUST NOT be + * forwarded if TTL is, after decrement, 0. + * + * All other flag fields, marked U, are unassigned and available for + * future use, see Section 11.2.1. Unassigned bits MUST be set to zero + * upon origination, and MUST be ignored and preserved unmodified by + * other NSH supporting elements. Elements which do not understand the + * meaning of any of these bits MUST NOT modify their actions based on + * those unknown bits. + * + * Length: The total length, in 4-byte words, of NSH including the Base + * Header, the Service Path Header, the Fixed Length Context Header or + * Variable Length Context Header(s). The length MUST be 0x6 for MD + * Type equal to 0x1, and MUST be 0x2 or greater for MD Type equal to + * 0x2. The length of the NSH header MUST be an integer multiple of 4 + * bytes, thus variable length metadata is always padded out to a + * multiple of 4 bytes. + * + * MD Type: Indicates the format of NSH beyond the mandatory Base Header + * and the Service Path Header. MD Type defines the format of the + * metadata being carried. + * + * 0x0 - This is a reserved value. Implementations SHOULD silently + * discard packets with MD Type 0x0. + * + * 0x1 - This indicates that the format of the header includes a fixed + * length Context Header (see Figure 4 below). + * + * 0x2 - This does not mandate any headers beyond the Base Header and + * Service Path Header, but may contain optional variable length Context + * Header(s). The semantics of the variable length Context Header(s) + * are not defined in this document. The format of the optional + * variable length Context Headers is provided in Section 2.5.1. + * + * 0xF - This value is reserved for experimentation and testing, as per + * [RFC3692]. Implementations not explicitly configured to be part of + * an experiment SHOULD silently discard packets with MD Type 0xF. + * + * Next Protocol: indicates the protocol type of the encapsulated data. + * NSH does not alter the inner payload, and the semantics on the inner + * protocol remain unchanged due to NSH service function chaining. + * Please see the IANA Considerations section below, Section 11.2.5. + * + * This document defines the following Next Protocol values: + * + * 0x1: IPv4 + * 0x2: IPv6 + * 0x3: Ethernet + * 0x4: NSH + * 0x5: MPLS + * 0xFE: Experiment 1 + * 0xFF: Experiment 2 + * + * Packets with Next Protocol values not supported SHOULD be silently + * dropped by default, although an implementation MAY provide a + * configuration parameter to forward them. Additionally, an + * implementation not explicitly configured for a specific experiment + * [RFC3692] SHOULD silently drop packets with Next Protocol values 0xFE + * and 0xFF. + * + * Service Path Identifier (SPI): Identifies a service path. + * Participating nodes MUST use this identifier for Service Function + * Path selection. 
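To make the SPI/SI layout just described concrete, here is a minimal sketch of accessors built on struct nshhdr and the NSH_SPI_*/NSH_SI_* masks defined later in this header; the example_* helpers are illustrative only and are not part of this patch:

static inline u32 example_nsh_get_spi(const struct nshhdr *nsh)
{
        /* the upper 24 bits of path_hdr carry the Service Path Identifier */
        return (ntohl(nsh->path_hdr) & NSH_SPI_MASK) >> NSH_SPI_SHIFT;
}

static inline u8 example_nsh_get_si(const struct nshhdr *nsh)
{
        /* the low 8 bits of path_hdr carry the Service Index */
        return ntohl(nsh->path_hdr) & NSH_SI_MASK;
}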
The initial classifier MUST set the appropriate SPI + * for a given classification result. + * + * Service Index (SI): Provides location within the SFP. The initial + * classifier for a given SFP SHOULD set the SI to 255, however the + * control plane MAY configure the initial value of SI as appropriate + * (i.e., taking into account the length of the service function path). + * The Service Index MUST be decremented by a value of 1 by Service + * Functions or by SFC Proxy nodes after performing required services + * and the new decremented SI value MUST be used in the egress packet's + * NSH. The initial Classifier MUST send the packet to the first SFF in + * the identified SFP for forwarding along an SFP. If re-classification + * occurs, and that re-classification results in a new SPI, the + * (re)classifier is, in effect, the initial classifier for the + * resultant SPI. + * + * The SI is used in conjunction the with Service Path Identifier for + * Service Function Path Selection and for determining the next SFF/SF + * in the path. The SI is also valuable when troubleshooting or + * reporting service paths. Additionally, while the TTL field is the + * main mechanism for service plane loop detection, the SI can also be + * used for detecting service plane loops. + * + * When the Base Header specifies MD Type = 0x1, a Fixed Length Context + * Header (16-bytes) MUST be present immediately following the Service + * Path Header. The value of a Fixed Length Context + * Header that carries no metadata MUST be set to zero. + * + * When the base header specifies MD Type = 0x2, zero or more Variable + * Length Context Headers MAY be added, immediately following the + * Service Path Header (see Figure 5). Therefore, Length = 0x2, + * indicates that only the Base Header followed by the Service Path + * Header are present. The optional Variable Length Context Headers + * MUST be of an integer number of 4-bytes. The base header Length + * field MUST be used to determine the offset to locate the original + * packet or frame for SFC nodes that require access to that + * information. + * + * The format of the optional variable length Context Headers + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Metadata Class | Type |U| Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Variable Metadata | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Metadata Class (MD Class): Defines the scope of the 'Type' field to + * provide a hierarchical namespace. The IANA Considerations + * Section 11.2.4 defines how the MD Class values can be allocated to + * standards bodies, vendors, and others. + * + * Type: Indicates the explicit type of metadata being carried. The + * definition of the Type is the responsibility of the MD Class owner. + * + * Unassigned bit: One unassigned bit is available for future use. This + * bit MUST NOT be set, and MUST be ignored on receipt. + * + * Length: Indicates the length of the variable metadata, in bytes. In + * case the metadata length is not an integer number of 4-byte words, + * the sender MUST add pad bytes immediately following the last metadata + * byte to extend the metadata to an integer number of 4-byte words. + * The receiver MUST round up the length field to the nearest 4-byte + * word boundary, to locate and process the next field in the packet. 
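As a worked example of the padding rule above, a hypothetical helper (not part of this patch) that computes the on-wire size of one MD Type 2 TLV, assuming struct nsh_md2_tlv as defined later in this header and ALIGN() from linux/kernel.h:

static inline size_t example_nsh_md2_tlv_size(const struct nsh_md2_tlv *tlv)
{
        /* 4-byte TLV header plus metadata rounded up to a 4-byte word,
         * so a metadata length of 5 occupies 4 + 8 = 12 bytes in total
         */
        return sizeof(*tlv) + ALIGN(tlv->length, 4);
}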
+ * The receiver MUST access only those bytes in the metadata indicated + * by the length field (i.e., actual number of bytes) and MUST ignore + * the remaining bytes up to the nearest 4-byte word boundary. The + * Length may be 0 or greater. + * + * A value of 0 denotes a Context Header without a Variable Metadata + * field. + * + * [0] https://datatracker.ietf.org/doc/draft-ietf-sfc-nsh/ + */ + +/** + * struct nsh_md1_ctx - Keeps track of NSH context data + * @nshc<1-4>: NSH Contexts. + */ +struct nsh_md1_ctx { + __be32 context[4]; +}; + +struct nsh_md2_tlv { + __be16 md_class; + u8 type; + u8 length; + u8 md_value[]; +}; + +struct nshhdr { + __be16 ver_flags_ttl_len; + u8 mdtype; + u8 np; + __be32 path_hdr; + union { + struct nsh_md1_ctx md1; + struct nsh_md2_tlv md2; + }; +}; + +/* Masking NSH header fields. */ +#define NSH_VER_MASK 0xc000 +#define NSH_VER_SHIFT 14 +#define NSH_FLAGS_MASK 0x3000 +#define NSH_FLAGS_SHIFT 12 +#define NSH_TTL_MASK 0x0fc0 +#define NSH_TTL_SHIFT 6 +#define NSH_LEN_MASK 0x003f +#define NSH_LEN_SHIFT 0 + +#define NSH_MDTYPE_MASK 0x0f +#define NSH_MDTYPE_SHIFT 0 + +#define NSH_SPI_MASK 0xffffff00 +#define NSH_SPI_SHIFT 8 +#define NSH_SI_MASK 0x000000ff +#define NSH_SI_SHIFT 0 + +/* MD Type Registry. */ +#define NSH_M_TYPE1 0x01 +#define NSH_M_TYPE2 0x02 +#define NSH_M_EXP1 0xFE +#define NSH_M_EXP2 0xFF + +/* NSH Base Header Length */ +#define NSH_BASE_HDR_LEN 8 + +/* NSH MD Type 1 header Length. */ +#define NSH_M_TYPE1_LEN 24 + +/* NSH header maximum Length. */ +#define NSH_HDR_MAX_LEN 256 + +/* NSH context headers maximum Length. */ +#define NSH_CTX_HDRS_MAX_LEN 248 + +static inline struct nshhdr *nsh_hdr(struct sk_buff *skb) +{ + return (struct nshhdr *)skb_network_header(skb); +} + +static inline u16 nsh_hdr_len(const struct nshhdr *nsh) +{ + return ((ntohs(nsh->ver_flags_ttl_len) & NSH_LEN_MASK) + >> NSH_LEN_SHIFT) << 2; +} + +static inline u8 nsh_get_ver(const struct nshhdr *nsh) +{ + return (ntohs(nsh->ver_flags_ttl_len) & NSH_VER_MASK) + >> NSH_VER_SHIFT; +} + +static inline u8 nsh_get_flags(const struct nshhdr *nsh) +{ + return (ntohs(nsh->ver_flags_ttl_len) & NSH_FLAGS_MASK) + >> NSH_FLAGS_SHIFT; +} + +static inline u8 nsh_get_ttl(const struct nshhdr *nsh) +{ + return (ntohs(nsh->ver_flags_ttl_len) & NSH_TTL_MASK) + >> NSH_TTL_SHIFT; +} + +static inline void __nsh_set_xflag(struct nshhdr *nsh, u16 xflag, u16 xmask) +{ + nsh->ver_flags_ttl_len + = (nsh->ver_flags_ttl_len & ~htons(xmask)) | htons(xflag); +} + +static inline void nsh_set_flags_and_ttl(struct nshhdr *nsh, u8 flags, u8 ttl) +{ + __nsh_set_xflag(nsh, ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK) | + ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK), + NSH_FLAGS_MASK | NSH_TTL_MASK); +} + +static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags, + u8 ttl, u8 len) +{ + len = len >> 2; + __nsh_set_xflag(nsh, ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK) | + ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK) | + ((len << NSH_LEN_SHIFT) & NSH_LEN_MASK), + NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK); +} + +#endif /* __NET_NSH_H */ -- cgit v1.2.3 From 1b70d792cf6775fb5d0737524387893daeb5374a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 28 Aug 2017 13:53:34 -0700 Subject: ipv6: Use rt6i_idev index for echo replies to a local address Tariq repored local pings to linklocal address is failing: $ ifconfig ens8 ens8: flags=4163 mtu 1500 inet 11.141.16.6 netmask 255.255.0.0 broadcast 11.141.255.255 inet6 fe80::7efe:90ff:fecb:7502 prefixlen 64 scopeid 0x20 ether 7c:fe:90:cb:75:02 txqueuelen 1000 
(Ethernet) RX packets 12 bytes 1164 (1.1 KiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 30 bytes 2484 (2.4 KiB) TX errors 0 dropped 0 overruns 0 carrier 0 collisions 0 $ /bin/ping6 -c 3 fe80::7efe:90ff:fecb:7502%ens8 PING fe80::7efe:90ff:fecb:7502%ens8(fe80::7efe:90ff:fecb:7502) 56 data bytes Signed-off-by: David S. Miller --- include/net/ip6_route.h | 10 ++++++++++ net/ipv6/icmp.c | 33 ++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 882bc3c7ccde..ee96f402cb75 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -164,6 +164,16 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); +static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) +{ + const struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt6 = NULL; + + if (dst) + rt6 = container_of(dst, struct rt6_info, dst); + + return rt6; +} /* * Store a destination cache entry in a socket diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index dd7608cf1d72..5acb54405b10 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -399,6 +399,24 @@ relookup_failed: return ERR_PTR(err); } +static int icmp6_iif(const struct sk_buff *skb) +{ + int iif = skb->dev->ifindex; + + /* for local traffic to local address, skb dev is the loopback + * device. Check if there is a dst attached to the skb and if so + * get the real device index. + */ + if (unlikely(iif == LOOPBACK_IFINDEX)) { + const struct rt6_info *rt6 = skb_rt6_info(skb); + + if (rt6) + iif = rt6->rt6i_idev->dev->ifindex; + } + + return iif; +} + /* * Send an ICMP message in response to a packet in error */ @@ -460,18 +478,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, */ if (__ipv6_addr_needs_scope_id(addr_type)) { - iif = skb->dev->ifindex; - - /* for local packets, get the real device index */ - if (iif == LOOPBACK_IFINDEX) { - dst = skb_dst(skb); - if (dst) { - struct rt6_info *rt; - - rt = container_of(dst, struct rt6_info, dst); - iif = rt->rt6i_idev->dev->ifindex; - } - } + iif = icmp6_iif(skb); } else { dst = skb_dst(skb); iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); @@ -694,7 +701,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; - fl6.flowi6_oif = skb->dev->ifindex; + fl6.flowi6_oif = icmp6_iif(skb); fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); -- cgit v1.2.3 From 0dd5759dbb1c9a862e7d90c09d6cf398c45f1100 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 29 Aug 2017 13:17:51 -0700 Subject: net: remove dmaengine.h inclusion from netdevice.h Since the removal of NET_DMA, dmaengine.h header file shouldn't be needed by netdevice.h anymore. Signed-off-by: Dave Jiang Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c5475b37a631..29bf06ff157c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -35,7 +35,6 @@ #include #include -#include #include #include -- cgit v1.2.3 From eaa72dc47488d599439cd0fd0f8c4f1bcb3906bb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 29 Aug 2017 15:16:01 -0700 Subject: neigh: increase queue_len_bytes to match wmem_default Florian reported UDP xmit drops that could be root caused to the too small neigh limit. Current limit is 64 KB, meaning that even a single UDP socket would hit it, since its default sk_sndbuf comes from net.core.wmem_default (~212992 bytes on 64bit arches). Once ARP/ND resolution is in progress, we should allow a little more packets to be queued, at least for one producer. Once neigh arp_queue is filled, a rogue socket should hit its sk_sndbuf limit and either block in sendmsg() or return -EAGAIN. Signed-off-by: Eric Dumazet Reported-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++-- include/net/sock.h | 10 ++++++++++ net/core/sock.c | 10 ---------- net/decnet/dn_neigh.c | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv6/ndisc.c | 2 +- 7 files changed, 19 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6b0bc0f71534..b3345d0fe0a6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -109,7 +109,10 @@ neigh/default/unres_qlen_bytes - INTEGER queued for each unresolved address by other network layers. (added in linux 3.3) Setting negative value is meaningless and will return error. - Default: 65536 Bytes(64KB) + Default: SK_WMEM_MAX, (same as net.core.wmem_default). + Exact value depends on architecture and kernel options, + but should be enough to allow queuing 256 packets + of medium size. neigh/default/unres_qlen - INTEGER The maximum number of packets which may be queued for each @@ -119,7 +122,7 @@ neigh/default/unres_qlen - INTEGER unexpected packet loss. The current default value is calculated according to default value of unres_qlen_bytes and true size of packet. - Default: 31 + Default: 101 mtu_expires - INTEGER Time, in seconds, that cached PMTU information is kept. diff --git a/include/net/sock.h b/include/net/sock.h index 1c2912d433e8..03a362568357 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2368,6 +2368,16 @@ bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. 
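A rough worked example of where the new default lands, assuming a typical 64-bit layout in which SKB_DATA_ALIGN(sizeof(struct sk_buff)) comes to about 256 bytes and SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) to about 320 bytes:

        SKB_TRUESIZE(256) ~= 256 + 256 + 320 = 832 bytes
        SK_WMEM_MAX       ~= 832 * 256       = 212992 bytes

which lines up with the ~212992-byte net.core.wmem_default quoted in the changelog, so a single UDP socket's default sk_sndbuf now fits in the unresolved-neighbour queue.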
+ */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/net/core/sock.c b/net/core/sock.c index dfdd14cac775..9b7b6bbb2a23 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -307,16 +307,6 @@ static struct lock_class_key af_wlock_keys[AF_MAX]; static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; -/* Take into consideration the size of the struct sk_buff overhead in the - * determination of these values, since that is non-constant across - * platforms. This makes socket queueing behavior and performance - * not depend upon such differences. - */ -#define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) - /* Run time adjustable parameters. */ __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; EXPORT_SYMBOL(sysctl_wmem_max); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 21dedf6fd0f7..22bf0b95d6ed 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -94,7 +94,7 @@ struct neigh_table dn_neigh_table = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64*1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 0, [NEIGH_VAR_ANYCAST_DELAY] = 0, [NEIGH_VAR_PROXY_DELAY] = 0, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 8b52179ddc6e..7c45b8896709 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -171,7 +171,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 568ccfd6dd37..7616cd76f6f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6086,9 +6086,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; - struct dst_entry *dst = NULL; struct request_sock *req; bool want_cookie = false; + struct dst_entry *dst; struct flowi fl; /* TW buckets are converted to open requests without diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 5e338eb89509..266a530414d7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -127,7 +127,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, -- cgit v1.2.3 From dc2251bf98c66db3f4e055b751968f0871037ae4 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 23 Aug 2017 23:19:57 +0200 Subject: cpuidle: Eliminate the CPUIDLE_DRIVER_STATE_START symbol On some architectures the first (index 0) idle state is a polling one and it doesn't really save energy, so there is the CPUIDLE_DRIVER_STATE_START symbol allowing some pieces of cpuidle code to avoid using that state. However, this makes the code rather hard to follow. It is better to explicitly avoid the polling state, so add a new cpuidle state flag CPUIDLE_FLAG_POLLING to mark it and make the relevant code check that flag for the first state instead of using the CPUIDLE_DRIVER_STATE_START symbol. In the ACPI processor driver that cannot always rely on the state flags (like before the states table has been set up) define a new internal symbol ACPI_IDLE_STATE_START equivalent to the CPUIDLE_DRIVER_STATE_START one and drop the latter. Signed-off-by: Rafael J. Wysocki Tested-by: Sudeep Holla Acked-by: Daniel Lezcano --- drivers/acpi/processor_idle.c | 10 ++++++---- drivers/cpuidle/driver.c | 1 + drivers/cpuidle/governors/ladder.c | 14 ++++++++------ drivers/cpuidle/governors/menu.c | 13 +++++-------- include/linux/cpuidle.h | 7 +------ 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5c8aa9cf62d7..39a01ea7a46d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -48,6 +48,8 @@ #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); +#define ACPI_IDLE_STATE_START (IS_ENABLED(CONFIG_ARCH_HAS_CPU_RELAX) ? 1 : 0) + static unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER; module_param(max_cstate, uint, 0000); static unsigned int nocst __read_mostly; @@ -761,7 +763,7 @@ static int acpi_idle_enter(struct cpuidle_device *dev, if (cx->type != ACPI_STATE_C1) { if (acpi_idle_fallback_to_c1(pr) && num_online_cpus() > 1) { - index = CPUIDLE_DRIVER_STATE_START; + index = ACPI_IDLE_STATE_START; cx = per_cpu(acpi_cstate[index], dev->cpu); } else if (cx->type == ACPI_STATE_C3 && pr->flags.bm_check) { if (cx->bm_sts_skip || !acpi_idle_bm_check()) { @@ -813,7 +815,7 @@ static void acpi_idle_enter_freeze(struct cpuidle_device *dev, static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, struct cpuidle_device *dev) { - int i, count = CPUIDLE_DRIVER_STATE_START; + int i, count = ACPI_IDLE_STATE_START; struct acpi_processor_cx *cx; if (max_cstate == 0) @@ -840,7 +842,7 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, static int acpi_processor_setup_cstates(struct acpi_processor *pr) { - int i, count = CPUIDLE_DRIVER_STATE_START; + int i, count = ACPI_IDLE_STATE_START; struct acpi_processor_cx *cx; struct cpuidle_state *state; struct cpuidle_driver *drv = &acpi_idle_driver; @@ -1291,7 +1293,7 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr) return -EINVAL; drv->safe_state_index = -1; - for (i = CPUIDLE_DRIVER_STATE_START; i < CPUIDLE_STATE_MAX; i++) { + for (i = ACPI_IDLE_STATE_START; i < CPUIDLE_STATE_MAX; i++) { drv->states[i].name[0] = '\0'; drv->states[i].desc[0] = '\0'; } diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index e53fb861beb0..e942f8ef4309 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -204,6 +204,7 @@ static void poll_idle_init(struct cpuidle_driver *drv) state->power_usage = -1; state->enter = poll_idle; state->disabled = false; + state->flags = CPUIDLE_FLAG_POLLING; } #else static void 
poll_idle_init(struct cpuidle_driver *drv) {} diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index ac321f09e717..ce1a2ffffb2a 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -69,6 +69,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, struct ladder_device *ldev = this_cpu_ptr(&ladder_devices); struct ladder_device_state *last_state; int last_residency, last_idx = ldev->last_state_idx; + int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0; int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); /* Special case when user has set very strict latency requirement */ @@ -96,13 +97,13 @@ static int ladder_select_state(struct cpuidle_driver *drv, } /* consider demotion */ - if (last_idx > CPUIDLE_DRIVER_STATE_START && + if (last_idx > first_idx && (drv->states[last_idx].disabled || dev->states_usage[last_idx].disable || drv->states[last_idx].exit_latency > latency_req)) { int i; - for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) { + for (i = last_idx - 1; i > first_idx; i--) { if (drv->states[i].exit_latency <= latency_req) break; } @@ -110,7 +111,7 @@ static int ladder_select_state(struct cpuidle_driver *drv, return i; } - if (last_idx > CPUIDLE_DRIVER_STATE_START && + if (last_idx > first_idx && last_residency < last_state->threshold.demotion_time) { last_state->stats.demotion_count++; last_state->stats.promotion_count = 0; @@ -133,13 +134,14 @@ static int ladder_enable_device(struct cpuidle_driver *drv, struct cpuidle_device *dev) { int i; + int first_idx = drv->states[0].flags & CPUIDLE_FLAG_POLLING ? 1 : 0; struct ladder_device *ldev = &per_cpu(ladder_devices, dev->cpu); struct ladder_device_state *lstate; struct cpuidle_state *state; - ldev->last_state_idx = CPUIDLE_DRIVER_STATE_START; + ldev->last_state_idx = first_idx; - for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) { + for (i = first_idx; i < drv->state_count; i++) { state = &drv->states[i]; lstate = &ldev->states[i]; @@ -151,7 +153,7 @@ static int ladder_enable_device(struct cpuidle_driver *drv, if (i < drv->state_count - 1) lstate->threshold.promotion_time = state->exit_latency; - if (i > CPUIDLE_DRIVER_STATE_START) + if (i > first_idx) lstate->threshold.demotion_time = state->exit_latency; } diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 61b64c2b2cb8..48eaf2879228 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -324,8 +324,9 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) expected_interval = get_typical_interval(data); expected_interval = min(expected_interval, data->next_timer_us); - if (CPUIDLE_DRIVER_STATE_START > 0) { - struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START]; + first_idx = 0; + if (drv->states[0].flags & CPUIDLE_FLAG_POLLING) { + struct cpuidle_state *s = &drv->states[1]; unsigned int polling_threshold; /* @@ -336,12 +337,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) polling_threshold = max_t(unsigned int, 20, s->target_residency); if (data->next_timer_us > polling_threshold && latency_req > s->exit_latency && !s->disabled && - !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable) - first_idx = CPUIDLE_DRIVER_STATE_START; - else - first_idx = CPUIDLE_DRIVER_STATE_START - 1; - } else { - first_idx = 0; + !dev->states_usage[1].disable) + first_idx = 1; } /* diff --git a/include/linux/cpuidle.h 
b/include/linux/cpuidle.h index fc1e5d7fc1c7..b58d952a00e2 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -63,6 +63,7 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_NONE (0x00) +#define CPUIDLE_FLAG_POLLING (0x01) /* polling state */ #define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ #define CPUIDLE_FLAG_TIMER_STOP (0x04) /* timer is stopped on this state */ @@ -250,12 +251,6 @@ static inline int cpuidle_register_governor(struct cpuidle_governor *gov) {return 0;} #endif -#ifdef CONFIG_ARCH_HAS_CPU_RELAX -#define CPUIDLE_DRIVER_STATE_START 1 -#else -#define CPUIDLE_DRIVER_STATE_START 0 -#endif - #define CPU_PM_CPU_IDLE_ENTER(low_level_idle_enter, idx) \ ({ \ int __ret; \ -- cgit v1.2.3 From 34c2f65b718d44ea7d7b3cc10777f410677455ce Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Aug 2017 23:21:07 +0200 Subject: cpuidle: Move polling state initialization code to separate file Move the polling state initialization code to a separate file built conditionally on CONFIG_ARCH_HAS_CPU_RELAX to get rid of the #ifdef in driver.c. Signed-off-by: Rafael J. Wysocki Tested-by: Sudeep Holla Acked-by: Daniel Lezcano --- drivers/cpuidle/Makefile | 1 + drivers/cpuidle/driver.c | 31 ------------------------------- drivers/cpuidle/poll_state.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/cpuidle.h | 6 ++++++ 4 files changed, 43 insertions(+), 31 deletions(-) create mode 100644 drivers/cpuidle/poll_state.c (limited to 'include') diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile index 3ba81b1dffad..0b67a05a7aae 100644 --- a/drivers/cpuidle/Makefile +++ b/drivers/cpuidle/Makefile @@ -5,6 +5,7 @@ obj-y += cpuidle.o driver.o governor.o sysfs.o governors/ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o +obj-$(CONFIG_ARCH_HAS_CPU_RELAX) += poll_state.o ################################################################################## # ARM SoC drivers diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index e942f8ef4309..6f694c86f3fa 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -179,37 +179,6 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) } } -#ifdef CONFIG_ARCH_HAS_CPU_RELAX -static int __cpuidle poll_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) -{ - local_irq_enable(); - if (!current_set_polling_and_test()) { - while (!need_resched()) - cpu_relax(); - } - current_clr_polling(); - - return index; -} - -static void poll_idle_init(struct cpuidle_driver *drv) -{ - struct cpuidle_state *state = &drv->states[0]; - - snprintf(state->name, CPUIDLE_NAME_LEN, "POLL"); - snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); - state->exit_latency = 0; - state->target_residency = 0; - state->power_usage = -1; - state->enter = poll_idle; - state->disabled = false; - state->flags = CPUIDLE_FLAG_POLLING; -} -#else -static void poll_idle_init(struct cpuidle_driver *drv) {} -#endif /* !CONFIG_ARCH_HAS_CPU_RELAX */ - /** * __cpuidle_register_driver: register the driver * @drv: a valid pointer to a struct cpuidle_driver diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c new file mode 100644 index 000000000000..0db4f7273952 --- /dev/null +++ b/drivers/cpuidle/poll_state.c @@ -0,0 +1,36 @@ +/* + * poll_state.c - Polling idle state + * + * This file is released under the GPLv2. 
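Once the follow-up patch below hands this initializer to drivers as cpuidle_poll_state_init(), a driver that wants the polling state opts in explicitly. A minimal sketch for a hypothetical driver (the foo_* names are illustrative, not from this series):

#include <linux/cpuidle.h>
#include <linux/module.h>

static struct cpuidle_driver foo_idle_driver = {
        .name  = "foo_idle",
        .owner = THIS_MODULE,
};

static int __init foo_idle_init(void)
{
        cpuidle_poll_state_init(&foo_idle_driver);      /* fills states[0] */
        /* real low-power states would be filled in at states[1..n] here */
        foo_idle_driver.state_count = 1;
        return cpuidle_register_driver(&foo_idle_driver);
}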
+ */ + +#include +#include +#include + +static int __cpuidle poll_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + local_irq_enable(); + if (!current_set_polling_and_test()) { + while (!need_resched()) + cpu_relax(); + } + current_clr_polling(); + + return index; +} + +void poll_idle_init(struct cpuidle_driver *drv) +{ + struct cpuidle_state *state = &drv->states[0]; + + snprintf(state->name, CPUIDLE_NAME_LEN, "POLL"); + snprintf(state->desc, CPUIDLE_DESC_LEN, "CPUIDLE CORE POLL IDLE"); + state->exit_latency = 0; + state->target_residency = 0; + state->power_usage = -1; + state->enter = poll_idle; + state->disabled = false; + state->flags = CPUIDLE_FLAG_POLLING; +} diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index b58d952a00e2..561bc5365067 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -225,6 +225,12 @@ static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, } #endif +#ifdef CONFIG_ARCH_HAS_CPU_RELAX +void poll_idle_init(struct cpuidle_driver *drv); +#else +static void poll_idle_init(struct cpuidle_driver *drv) {} +#endif + /****************************** * CPUIDLE GOVERNOR INTERFACE * ******************************/ -- cgit v1.2.3 From 1b39e3f813b4685c7a30ae964d5529a1b0e3a286 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 29 Aug 2017 03:14:37 +0200 Subject: cpuidle: Make drivers initialize polling state Make the drivers that want to include the polling state into their states table initialize it explicitly and drop the initialization of it (which in fact is conditional, but that is not obvious from the code) from the core. Signed-off-by: Rafael J. Wysocki Tested-by: Sudeep Holla Acked-by: Daniel Lezcano --- drivers/acpi/processor_idle.c | 9 ++++++++- drivers/cpuidle/driver.c | 2 -- drivers/cpuidle/poll_state.c | 3 ++- drivers/idle/intel_idle.c | 1 + include/linux/cpuidle.h | 4 ++-- 5 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 39a01ea7a46d..df38e81cc672 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -842,7 +842,7 @@ static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr, static int acpi_processor_setup_cstates(struct acpi_processor *pr) { - int i, count = ACPI_IDLE_STATE_START; + int i, count; struct acpi_processor_cx *cx; struct cpuidle_state *state; struct cpuidle_driver *drv = &acpi_idle_driver; @@ -850,6 +850,13 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr) if (max_cstate == 0) max_cstate = 1; + if (IS_ENABLED(CONFIG_ARCH_HAS_CPU_RELAX)) { + cpuidle_poll_state_init(drv); + count = 1; + } else { + count = 0; + } + for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) { cx = &pr->power.states[i]; diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 6f694c86f3fa..dc32f34e68d9 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -216,8 +216,6 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv) on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer, (void *)1, 1); - poll_idle_init(drv); - return 0; } diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 0db4f7273952..7416b16287de 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -21,7 +21,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev, return index; } -void poll_idle_init(struct cpuidle_driver *drv) +void 
cpuidle_poll_state_init(struct cpuidle_driver *drv) { struct cpuidle_state *state = &drv->states[0]; @@ -34,3 +34,4 @@ void poll_idle_init(struct cpuidle_driver *drv) state->disabled = false; state->flags = CPUIDLE_FLAG_POLLING; } +EXPORT_SYMBOL_GPL(cpuidle_poll_state_init); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index c2ae819a871c..7bf8739e33bc 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1331,6 +1331,7 @@ static void __init intel_idle_cpuidle_driver_init(void) intel_idle_state_table_update(); + cpuidle_poll_state_init(drv); drv->state_count = 1; for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 561bc5365067..5baacd3a0559 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -226,9 +226,9 @@ static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, #endif #ifdef CONFIG_ARCH_HAS_CPU_RELAX -void poll_idle_init(struct cpuidle_driver *drv); +void cpuidle_poll_state_init(struct cpuidle_driver *drv); #else -static void poll_idle_init(struct cpuidle_driver *drv) {} +static inline void cpuidle_poll_state_init(struct cpuidle_driver *drv) {} #endif /****************************** -- cgit v1.2.3 From e931d0dab4393f7195f84f42ed6fd973c26f62f1 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Aug 2017 14:20:20 +0100 Subject: ACPI / APEI: Suppress message if HEST not present According to the ACPI specification, firmware is not required to provide the Hardware Error Source Table (HEST). When HEST is not present, the following superfluous message is printed to the kernel boot log - [ 3.460067] GHES: HEST is not enabled! Extend hest_disable variable to track whether the firmware provides this table and if it is not present skip any log output. The existing behaviour is preserved in all other cases. Suggested-by: Borislav Petkov Signed-off-by: Punit Agrawal Reviewed-by: Borislav Petkov Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/apei/ghes.c | 7 ++++++- drivers/acpi/apei/hest.c | 13 +++++++------ include/acpi/apei.h | 8 +++++++- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index eed09fc664e8..077f9bad6f44 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1266,9 +1266,14 @@ static int __init ghes_init(void) if (acpi_disabled) return -ENODEV; - if (hest_disable) { + switch (hest_disable) { + case HEST_NOT_FOUND: + return -ENODEV; + case HEST_DISABLED: pr_info(GHES_PFX "HEST is not enabled!\n"); return -EINVAL; + default: + break; } if (ghes_disable) { diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 456b488eb1df..9cb74115a43d 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -37,7 +37,7 @@ #define HEST_PFX "HEST: " -bool hest_disable; +int hest_disable; EXPORT_SYMBOL_GPL(hest_disable); /* HEST table parsing */ @@ -213,7 +213,7 @@ err: static int __init setup_hest_disable(char *str) { - hest_disable = 1; + hest_disable = HEST_DISABLED; return 0; } @@ -232,9 +232,10 @@ void __init acpi_hest_init(void) status = acpi_get_table(ACPI_SIG_HEST, 0, (struct acpi_table_header **)&hest_tab); - if (status == AE_NOT_FOUND) - goto err; - else if (ACPI_FAILURE(status)) { + if (status == AE_NOT_FOUND) { + hest_disable = HEST_NOT_FOUND; + return; + } else if (ACPI_FAILURE(status)) { const char *msg = acpi_format_exception(status); pr_err(HEST_PFX "Failed to get table, %s\n", msg); rc = -EINVAL; @@ -257,5 +258,5 @@ void __init acpi_hest_init(void) pr_info(HEST_PFX "Table parsing has been initialized.\n"); return; err: - hest_disable = 1; + hest_disable = HEST_DISABLED; } diff --git a/include/acpi/apei.h b/include/acpi/apei.h index 76284bb560a6..c46694abea28 100644 --- a/include/acpi/apei.h +++ b/include/acpi/apei.h @@ -16,7 +16,13 @@ #ifdef __KERNEL__ -extern bool hest_disable; +enum hest_status { + HEST_ENABLED, + HEST_DISABLED, + HEST_NOT_FOUND, +}; + +extern int hest_disable; extern int erst_disable; #ifdef CONFIG_ACPI_APEI_GHES extern bool ghes_disable; -- cgit v1.2.3 From 26e3e3cb05604073f2e9969fb0f06f35eb3b3313 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 29 Aug 2017 08:50:11 -0700 Subject: scsi: rcu: Introduce rcu_swap_protected() A common pattern in RCU code is to assign a new value to an RCU pointer after having read and stored the old value. Introduce a macro for this pattern. Signed-off-by: Bart Van Assche Acked-by: Paul E. McKenney Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Shane M Seymour Signed-off-by: Martin K. Petersen --- include/linux/rcupdate.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..8e920f0ecb07 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -407,6 +407,22 @@ static inline void rcu_preempt_sleep_check(void) { } _r_a_p__v; \ }) +/** + * rcu_swap_protected() - swap an RCU and a regular pointer + * @rcu_ptr: RCU pointer + * @ptr: regular pointer + * @c: the conditions under which the dereference will take place + * + * Perform swap(@rcu_ptr, @ptr) where @rcu_ptr is an RCU-annotated pointer and + * @c is the argument that is passed to the rcu_dereference_protected() call + * used to read that pointer. 
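Before the macro itself, a minimal usage sketch: it assumes an RCU-managed pointer updated under a mutex and an object embedding a struct rcu_head named rcu (the foo_* names are illustrative); the SCSI VPD rework later in this series follows exactly this shape:

static struct foo_cfg __rcu *foo_cfg;
static DEFINE_MUTEX(foo_mutex);

static void foo_replace_cfg(struct foo_cfg *new_cfg)
{
        mutex_lock(&foo_mutex);
        rcu_swap_protected(foo_cfg, new_cfg,
                           lockdep_is_held(&foo_mutex));
        mutex_unlock(&foo_mutex);
        /* new_cfg now holds the old pointer; free it after a grace period */
        if (new_cfg)
                kfree_rcu(new_cfg, rcu);
}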
+ */ +#define rcu_swap_protected(rcu_ptr, ptr, c) do { \ + typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ + rcu_assign_pointer((rcu_ptr), (ptr)); \ + (ptr) = __tmp; \ +} while (0) + /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read -- cgit v1.2.3 From ccf1e0045eea8f98d60fc9327bcb14c958d2e4c7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 29 Aug 2017 08:50:13 -0700 Subject: scsi: Rework handling of scsi_device.vpd_pg8[03] Introduce struct scsi_vpd for the VPD page length, data and the RCU head that will be used to free the VPD data. Use kfree_rcu() instead of kfree() to free VPD data. Move the VPD buffer pointer check inside the RCU read lock in the sysfs code. Only annotate pointers that are shared across threads with __rcu. Use rcu_dereference() when dereferencing an RCU pointer. This patch suppresses about twenty sparse complaints about the vpd_pg8[03] pointers. This patch also fixes a race condition, namely that updating of the VPD pointers and length variables in struct scsi_device was not atomic with reference to the code reading these variables. See also "Does the update code tolerate concurrent accesses?" in Documentation/RCU/checklist.txt. Fixes: commit 09e2b0b14690 ("scsi: rescan VPD attributes") Signed-off-by: Bart Van Assche Acked-by: Hannes Reinecke Reviewed-by: Shane Seymour Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Shane Seymour Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi.c | 44 ++++++++++++++++++-------------------------- drivers/scsi/scsi_lib.c | 16 ++++++++-------- drivers/scsi/scsi_sysfs.c | 29 ++++++++++++++++++++--------- include/scsi/scsi_device.h | 18 ++++++++++++++---- 4 files changed, 60 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index f3f4926a3e77..d201ebcf4544 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -415,22 +415,20 @@ EXPORT_SYMBOL_GPL(scsi_get_vpd_page); * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return - * @len: Upon success, the VPD length will be stored in *@len. * * Returns %NULL upon failure. 
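On the read side, length and data now travel in a single RCU-protected object, which is what closes the race described in the changelog. A minimal reader sketch (examine_vpd() stands in for any hypothetical consumer):

        const struct scsi_vpd *vpd;

        rcu_read_lock();
        vpd = rcu_dereference(sdev->vpd_pg83);
        if (vpd)
                examine_vpd(vpd->data, vpd->len);
        rcu_read_unlock();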
*/ -static unsigned char *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page, - int *len) +static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { - unsigned char *vpd_buf; + struct scsi_vpd *vpd_buf; int vpd_len = SCSI_VPD_PG_LEN, result; retry_pg: - vpd_buf = kmalloc(vpd_len, GFP_KERNEL); + vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; - result = scsi_vpd_inquiry(sdev, vpd_buf, page, vpd_len); + result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; @@ -441,31 +439,27 @@ retry_pg: goto retry_pg; } - *len = result; + vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, - unsigned char __rcu **sdev_vpd_buf, - int *sdev_vpd_len) + struct scsi_vpd __rcu **sdev_vpd_buf) { - unsigned char *vpd_buf; - int len; + struct scsi_vpd *vpd_buf; - vpd_buf = scsi_get_vpd_buf(sdev, page, &len); + vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); rcu_swap_protected(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); - *sdev_vpd_len = len; mutex_unlock(&sdev->inquiry_mutex); - synchronize_rcu(); - - kfree(vpd_buf); + if (vpd_buf) + kfree_rcu(vpd_buf, rcu); } /** @@ -479,24 +473,22 @@ static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, */ void scsi_attach_vpd(struct scsi_device *sdev) { - int i, vpd_len; - unsigned char *vpd_buf; + int i; + struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ - vpd_buf = scsi_get_vpd_buf(sdev, 0, &vpd_len); + vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; - for (i = 4; i < vpd_len; i++) { - if (vpd_buf[i] == 0x80) - scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80, - &sdev->vpd_pg80_len); - if (vpd_buf[i] == 0x83) - scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83, - &sdev->vpd_pg83_len); + for (i = 4; i < vpd_buf->len; i++) { + if (vpd_buf->data[i] == 0x80) + scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); + if (vpd_buf->data[i] == 0x83) + scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); } kfree(vpd_buf); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 696d2eae0ba6..938a7e398cd4 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -3272,8 +3272,8 @@ int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len) { u8 cur_id_type = 0xff; u8 cur_id_size = 0; - unsigned char *d, *cur_id_str; - unsigned char __rcu *vpd_pg83; + const unsigned char *d, *cur_id_str; + const struct scsi_vpd *vpd_pg83; int id_size = -EINVAL; rcu_read_lock(); @@ -3304,8 +3304,8 @@ int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len) } memset(id, 0, id_len); - d = vpd_pg83 + 4; - while (d < vpd_pg83 + sdev->vpd_pg83_len) { + d = vpd_pg83->data + 4; + while (d < vpd_pg83->data + vpd_pg83->len) { /* Skip designators not referring to the LUN */ if ((d[1] & 0x30) != 0x00) goto next_desig; @@ -3421,8 +3421,8 @@ EXPORT_SYMBOL(scsi_vpd_lun_id); */ int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id) { - unsigned char *d; - unsigned char __rcu *vpd_pg83; + const unsigned char *d; + const struct scsi_vpd *vpd_pg83; int group_id = -EAGAIN, rel_port = -1; rcu_read_lock(); @@ -3432,8 +3432,8 @@ int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id) return -ENXIO; } - d = sdev->vpd_pg83 + 4; - while (d < sdev->vpd_pg83 + sdev->vpd_pg83_len) { + d = vpd_pg83->data + 4; + while (d < vpd_pg83->data + 
vpd_pg83->len) { switch (d[1] & 0xf) { case 0x4: /* Relative target port */ diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 5ed473a87589..bf53356f41f0 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -428,6 +428,7 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) struct scsi_device *sdev; struct device *parent; struct list_head *this, *tmp; + struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; unsigned long flags; sdev = container_of(work, struct scsi_device, ew.work); @@ -456,8 +457,17 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) /* NULL queue means the device can't be used */ sdev->request_queue = NULL; - kfree(sdev->vpd_pg83); - kfree(sdev->vpd_pg80); + mutex_lock(&sdev->inquiry_mutex); + rcu_swap_protected(sdev->vpd_pg80, vpd_pg80, + lockdep_is_held(&sdev->inquiry_mutex)); + rcu_swap_protected(sdev->vpd_pg83, vpd_pg83, + lockdep_is_held(&sdev->inquiry_mutex)); + mutex_unlock(&sdev->inquiry_mutex); + + if (vpd_pg83) + kfree_rcu(vpd_pg83, rcu); + if (vpd_pg80) + kfree_rcu(vpd_pg80, rcu); kfree(sdev->inquiry); kfree(sdev); @@ -795,15 +805,16 @@ show_vpd_##_page(struct file *filp, struct kobject *kobj, \ { \ struct device *dev = container_of(kobj, struct device, kobj); \ struct scsi_device *sdev = to_scsi_device(dev); \ - int ret; \ - if (!sdev->vpd_##_page) \ - return -EINVAL; \ + struct scsi_vpd *vpd_page; \ + int ret = -EINVAL; \ + \ rcu_read_lock(); \ - ret = memory_read_from_buffer(buf, count, &off, \ - rcu_dereference(sdev->vpd_##_page), \ - sdev->vpd_##_page##_len); \ + vpd_page = rcu_dereference(sdev->vpd_##_page); \ + if (vpd_page) \ + ret = memory_read_from_buffer(buf, count, &off, \ + vpd_page->data, vpd_page->len); \ rcu_read_unlock(); \ - return ret; \ + return ret; \ } \ static struct bin_attribute dev_attr_vpd_##_page = { \ .attr = {.name = __stringify(vpd_##_page), .mode = S_IRUGO }, \ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index f054f3f43c75..82e93ee94708 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -80,6 +80,18 @@ struct scsi_event { */ }; +/** + * struct scsi_vpd - SCSI Vital Product Data + * @rcu: For kfree_rcu(). + * @len: Length in bytes of @data. + * @data: VPD data as defined in various T10 SCSI standard documents. + */ +struct scsi_vpd { + struct rcu_head rcu; + int len; + unsigned char data[]; +}; + struct scsi_device { struct Scsi_Host *host; struct request_queue *request_queue; @@ -122,10 +134,8 @@ struct scsi_device { const char * rev; /* ... "nullnullnullnull" before scan */ #define SCSI_VPD_PG_LEN 255 - int vpd_pg83_len; - unsigned char __rcu *vpd_pg83; - int vpd_pg80_len; - unsigned char __rcu *vpd_pg80; + struct scsi_vpd __rcu *vpd_pg83; + struct scsi_vpd __rcu *vpd_pg80; unsigned char current_tag; /* current tag */ struct scsi_target *sdev_target; /* used only for single_lun */ -- cgit v1.2.3 From c1225f01af085aea9c9d094febf157de9d07d861 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Aug 2017 17:37:38 +0200 Subject: scsi: bsg-lib: pass the release callback through bsg_setup_queue The SAS code will need it. Also mark the name argument const to match bsg_register_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. 
Petersen --- block/bsg-lib.c | 7 ++++--- drivers/scsi/scsi_transport_fc.c | 6 ++++-- drivers/scsi/scsi_transport_iscsi.c | 2 +- include/linux/bsg-lib.h | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index c4513b23f57a..4752dbc3dc49 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -226,8 +226,9 @@ static void bsg_request_fn(struct request_queue *q) * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job */ -struct request_queue *bsg_setup_queue(struct device *dev, char *name, - bsg_job_fn *job_fn, int dd_job_size) +struct request_queue *bsg_setup_queue(struct device *dev, const char *name, + bsg_job_fn *job_fn, int dd_job_size, + void (*release)(struct device *)) { struct request_queue *q; int ret; @@ -250,7 +251,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name, blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - ret = bsg_register_queue(q, dev, name, NULL); + ret = bsg_register_queue(q, dev, name, release); if (ret) { printk(KERN_ERR "%s: bsg interface failed to " "initialize - register queue\n", dev->kobj.name); diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 1118aa5f88cd..3c6bc0081fcb 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3784,7 +3784,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host) snprintf(bsg_name, sizeof(bsg_name), "fc_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size); + q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size, + NULL); if (IS_ERR(q)) { dev_err(dev, "fc_host%d: bsg interface failed to initialize - setup queue\n", @@ -3829,7 +3830,8 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport) if (!i->f->bsg_request) return -ENOTSUPP; - q = bsg_setup_queue(dev, NULL, fc_bsg_dispatch, i->f->dd_bsg_size); + q = bsg_setup_queue(dev, NULL, fc_bsg_dispatch, i->f->dd_bsg_size, + NULL); if (IS_ERR(q)) { dev_err(dev, "failed to setup bsg queue\n"); return PTR_ERR(q); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index b9513f82ec53..8934f19bce8e 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost) return -ENOTSUPP; snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no); - q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0); + q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0, NULL); if (IS_ERR(q)) { shost_printk(KERN_ERR, shost, "bsg interface failed to " "initialize - no request queue\n"); diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h index e34dde2da0ef..1062f08e1a55 100644 --- a/include/linux/bsg-lib.h +++ b/include/linux/bsg-lib.h @@ -66,8 +66,9 @@ struct bsg_job { void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len); -struct request_queue *bsg_setup_queue(struct device *dev, char *name, - bsg_job_fn *job_fn, int dd_job_size); +struct request_queue *bsg_setup_queue(struct device *dev, const char *name, + bsg_job_fn *job_fn, int dd_job_size, + void (*release)(struct device *)); void bsg_job_put(struct bsg_job *job); int __must_check bsg_job_get(struct bsg_job *job); -- cgit v1.2.3 From 651a013649943710a900551ec6e03d2084e1a65a Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig Date: Fri, 25 Aug 2017 17:37:41 +0200 Subject: scsi: scsi_transport_sas: switch to bsg-lib for SMP passthrough Simplify the SMP passthrough code by switching it to the generic bsg-lib helpers that abstract away the details of the request code, and gets drivers out of seeing struct scsi_request. For the libsas host SMP code there is a small behavior difference in that we now always clear the residual len for successful commands, similar to the three other SMP handler implementations. Given that there is no partial command handling in the host SMP handler this should not matter in practice. [mkp: typos and checkpatch fixes] Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptsas.c | 79 +++++------ drivers/scsi/libsas/Kconfig | 1 + drivers/scsi/libsas/sas_expander.c | 70 +++++----- drivers/scsi/libsas/sas_host_smp.c | 106 ++++++-------- drivers/scsi/libsas/sas_internal.h | 12 +- drivers/scsi/mpt3sas/mpt3sas_transport.c | 230 ++++++++++++------------------- drivers/scsi/scsi_transport_sas.c | 118 ++++------------ include/scsi/libsas.h | 3 - include/scsi/scsi_transport_sas.h | 4 +- 9 files changed, 243 insertions(+), 380 deletions(-) (limited to 'include') diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 42ee70c23d9f..345f6035599e 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2210,33 +2210,26 @@ mptsas_get_bay_identifier(struct sas_rphy *rphy) return rc; } -static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, - struct request *req) +static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) { MPT_ADAPTER *ioc = ((MPT_SCSI_HOST *) shost->hostdata)->ioc; MPT_FRAME_HDR *mf; SmpPassthroughRequest_t *smpreq; - struct request *rsp = req->next_rq; - int ret; int flagsLength; unsigned long timeleft; char *psge; - dma_addr_t dma_addr_in = 0; - dma_addr_t dma_addr_out = 0; u64 sas_address = 0; - - if (!rsp) { - printk(MYIOC_s_ERR_FMT "%s: the smp response space is missing\n", - ioc->name, __func__); - return -EINVAL; - } + unsigned int reslen = 0; + int ret = -EINVAL; /* do we need to support multiple segments? 
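The conversions below all follow the same bsg-lib shape; as a minimal sketch for a hypothetical LLD (foo_* names are illustrative), the handler now receives scatterlists in job->request_payload and job->reply_payload instead of a struct request pair, and completes via bsg_job_done():

static void foo_smp_handler(struct bsg_job *job, struct Scsi_Host *shost,
                            struct sas_rphy *rphy)
{
        unsigned int reslen = 0;
        int ret = -EINVAL;

        /* this hardware takes a single segment in each direction */
        if (job->request_payload.sg_cnt > 1 || job->reply_payload.sg_cnt > 1)
                goto done;

        /* dma_map_sg() the two payload sg_lists, issue the SMP frame,
         * then record the unused reply bytes in reslen */
        ret = 0;
done:
        bsg_job_done(job, ret, reslen);
}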
*/ - if (bio_multiple_segments(req->bio) || - bio_multiple_segments(rsp->bio)) { + if (job->request_payload.sg_cnt > 1 || + job->reply_payload.sg_cnt > 1) { printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u, rsp %u\n", - ioc->name, __func__, blk_rq_bytes(req), blk_rq_bytes(rsp)); - return -EINVAL; + ioc->name, __func__, job->request_payload.payload_len, + job->reply_payload.payload_len); + goto out; } ret = mutex_lock_interruptible(&ioc->sas_mgmt.mutex); @@ -2252,7 +2245,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, smpreq = (SmpPassthroughRequest_t *)mf; memset(smpreq, 0, sizeof(*smpreq)); - smpreq->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4); + smpreq->RequestDataLength = + cpu_to_le16(job->request_payload.payload_len - 4); smpreq->Function = MPI_FUNCTION_SMP_PASSTHROUGH; if (rphy) @@ -2278,13 +2272,14 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI_SGE_FLAGS_END_OF_BUFFER | MPI_SGE_FLAGS_DIRECTION) << MPI_SGE_FLAGS_SHIFT; - flagsLength |= (blk_rq_bytes(req) - 4); - dma_addr_out = pci_map_single(ioc->pcidev, bio_data(req->bio), - blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(ioc->pcidev, dma_addr_out)) + if (!dma_map_sg(&ioc->pcidev->dev, job->request_payload.sg_list, + 1, PCI_DMA_BIDIRECTIONAL)) goto put_mf; - ioc->add_sge(psge, flagsLength, dma_addr_out); + + flagsLength |= (sg_dma_len(job->request_payload.sg_list) - 4); + ioc->add_sge(psge, flagsLength, + sg_dma_address(job->request_payload.sg_list)); psge += ioc->SGE_size; /* response */ @@ -2294,12 +2289,13 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI_SGE_FLAGS_END_OF_BUFFER; flagsLength = flagsLength << MPI_SGE_FLAGS_SHIFT; - flagsLength |= blk_rq_bytes(rsp) + 4; - dma_addr_in = pci_map_single(ioc->pcidev, bio_data(rsp->bio), - blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(ioc->pcidev, dma_addr_in)) - goto unmap; - ioc->add_sge(psge, flagsLength, dma_addr_in); + + if (!dma_map_sg(&ioc->pcidev->dev, job->reply_payload.sg_list, + 1, PCI_DMA_BIDIRECTIONAL)) + goto unmap_out; + flagsLength |= sg_dma_len(job->reply_payload.sg_list) + 4; + ioc->add_sge(psge, flagsLength, + sg_dma_address(job->reply_payload.sg_list)); INITIALIZE_MGMT_STATUS(ioc->sas_mgmt.status) mpt_put_msg_frame(mptsasMgmtCtx, ioc, mf); @@ -2310,10 +2306,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, mpt_free_msg_frame(ioc, mf); mf = NULL; if (ioc->sas_mgmt.status & MPT_MGMT_STATUS_DID_IOCRESET) - goto unmap; + goto unmap_in; if (!timeleft) mpt_Soft_Hard_ResetHandler(ioc, CAN_SLEEP); - goto unmap; + goto unmap_in; } mf = NULL; @@ -2321,23 +2317,22 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, SmpPassthroughReply_t *smprep; smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply; - memcpy(scsi_req(req)->sense, smprep, sizeof(*smprep)); - scsi_req(req)->sense_len = sizeof(*smprep); - scsi_req(req)->resid_len = 0; - scsi_req(rsp)->resid_len -= smprep->ResponseDataLength; + memcpy(job->reply, smprep, sizeof(*smprep)); + job->reply_len = sizeof(*smprep); + reslen = smprep->ResponseDataLength; } else { printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n", ioc->name, __func__); ret = -ENXIO; } -unmap: - if (dma_addr_out) - pci_unmap_single(ioc->pcidev, dma_addr_out, blk_rq_bytes(req), - PCI_DMA_BIDIRECTIONAL); - if (dma_addr_in) - pci_unmap_single(ioc->pcidev, dma_addr_in, blk_rq_bytes(rsp), - 
PCI_DMA_BIDIRECTIONAL); + +unmap_in: + dma_unmap_sg(&ioc->pcidev->dev, job->reply_payload.sg_list, 1, + PCI_DMA_BIDIRECTIONAL); +unmap_out: + dma_unmap_sg(&ioc->pcidev->dev, job->request_payload.sg_list, 1, + PCI_DMA_BIDIRECTIONAL); put_mf: if (mf) mpt_free_msg_frame(ioc, mf); @@ -2345,7 +2340,7 @@ out_unlock: CLEAR_MGMT_STATUS(ioc->sas_mgmt.status) mutex_unlock(&ioc->sas_mgmt.mutex); out: - return ret; + bsg_job_done(job, ret, reslen); } static struct sas_function_template mptsas_transport_functions = { diff --git a/drivers/scsi/libsas/Kconfig b/drivers/scsi/libsas/Kconfig index 9dafe64e7c7a..13739bfacc67 100644 --- a/drivers/scsi/libsas/Kconfig +++ b/drivers/scsi/libsas/Kconfig @@ -26,6 +26,7 @@ config SCSI_SAS_LIBSAS tristate "SAS Domain Transport Attributes" depends on SCSI select SCSI_SAS_ATTRS + select BLK_DEV_BSGLIB help This provides transport specific helpers for SAS drivers which use the domain device construct (like the aic94xxx). diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 570b2cb2da43..6b4fd2375178 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -64,8 +64,8 @@ static void smp_task_done(struct sas_task *task) /* Give it some long enough timeout. In seconds. */ #define SMP_TIMEOUT 10 -static int smp_execute_task(struct domain_device *dev, void *req, int req_size, - void *resp, int resp_size) +static int smp_execute_task_sg(struct domain_device *dev, + struct scatterlist *req, struct scatterlist *resp) { int res, retry; struct sas_task *task = NULL; @@ -86,8 +86,8 @@ static int smp_execute_task(struct domain_device *dev, void *req, int req_size, } task->dev = dev; task->task_proto = dev->tproto; - sg_init_one(&task->smp_task.smp_req, req, req_size); - sg_init_one(&task->smp_task.smp_resp, resp, resp_size); + task->smp_task.smp_req = *req; + task->smp_task.smp_resp = *resp; task->task_done = smp_task_done; @@ -151,6 +151,17 @@ static int smp_execute_task(struct domain_device *dev, void *req, int req_size, return res; } +static int smp_execute_task(struct domain_device *dev, void *req, int req_size, + void *resp, int resp_size) +{ + struct scatterlist req_sg; + struct scatterlist resp_sg; + + sg_init_one(&req_sg, req, req_size); + sg_init_one(&resp_sg, resp, resp_size); + return smp_execute_task_sg(dev, &req_sg, &resp_sg); +} + /* ---------- Allocations ---------- */ static inline void *alloc_smp_req(int size) @@ -2130,57 +2141,50 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) return res; } -int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, - struct request *req) +void sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) { struct domain_device *dev; - int ret, type; - struct request *rsp = req->next_rq; - - if (!rsp) { - printk("%s: space for a smp response is missing\n", - __func__); - return -EINVAL; - } + unsigned int reslen = 0; + int ret = -EINVAL; /* no rphy means no smp target support (ie aic94xx host) */ if (!rphy) - return sas_smp_host_handler(shost, req, rsp); - - type = rphy->identify.device_type; + return sas_smp_host_handler(job, shost); - if (type != SAS_EDGE_EXPANDER_DEVICE && - type != SAS_FANOUT_EXPANDER_DEVICE) { + switch (rphy->identify.device_type) { + case SAS_EDGE_EXPANDER_DEVICE: + case SAS_FANOUT_EXPANDER_DEVICE: + break; + default: printk("%s: can we send a smp request to a device?\n", __func__); - return -EINVAL; + goto out; } dev = sas_find_dev_by_rphy(rphy); if (!dev) { printk("%s: fail to find a 
domain_device?\n", __func__); - return -EINVAL; + goto out; } /* do we need to support multiple segments? */ - if (bio_multiple_segments(req->bio) || - bio_multiple_segments(rsp->bio)) { + if (job->request_payload.sg_cnt > 1 || + job->reply_payload.sg_cnt > 1) { printk("%s: multiple segments req %u, rsp %u\n", - __func__, blk_rq_bytes(req), blk_rq_bytes(rsp)); - return -EINVAL; + __func__, job->request_payload.payload_len, + job->reply_payload.payload_len); + goto out; } - ret = smp_execute_task(dev, bio_data(req->bio), blk_rq_bytes(req), - bio_data(rsp->bio), blk_rq_bytes(rsp)); + ret = smp_execute_task_sg(dev, job->request_payload.sg_list, + job->reply_payload.sg_list); if (ret > 0) { /* positive number is the untransferred residual */ - scsi_req(rsp)->resid_len = ret; - scsi_req(req)->resid_len = 0; + reslen = ret; ret = 0; - } else if (ret == 0) { - scsi_req(rsp)->resid_len = 0; - scsi_req(req)->resid_len = 0; } - return ret; +out: + bsg_job_done(job, ret, reslen); } diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c index 45cbbc44f4d7..9ead93df3a6e 100644 --- a/drivers/scsi/libsas/sas_host_smp.c +++ b/drivers/scsi/libsas/sas_host_smp.c @@ -225,47 +225,36 @@ static void sas_phy_control(struct sas_ha_struct *sas_ha, u8 phy_id, resp_data[2] = SMP_RESP_FUNC_ACC; } -int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, - struct request *rsp) +void sas_smp_host_handler(struct bsg_job *job, struct Scsi_Host *shost) { - u8 *req_data = NULL, *resp_data = NULL, *buf; struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost); + u8 *req_data, *resp_data; + unsigned int reslen = 0; int error = -EINVAL; /* eight is the minimum size for request and response frames */ - if (blk_rq_bytes(req) < 8 || blk_rq_bytes(rsp) < 8) + if (job->request_payload.payload_len < 8 || + job->reply_payload.payload_len < 8) goto out; - if (bio_offset(req->bio) + blk_rq_bytes(req) > PAGE_SIZE || - bio_offset(rsp->bio) + blk_rq_bytes(rsp) > PAGE_SIZE) { - shost_printk(KERN_ERR, shost, - "SMP request/response frame crosses page boundary"); + error = -ENOMEM; + req_data = kzalloc(job->request_payload.payload_len, GFP_KERNEL); + if (!req_data) goto out; - } - - req_data = kzalloc(blk_rq_bytes(req), GFP_KERNEL); + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, req_data, + job->request_payload.payload_len); /* make sure frame can always be built ... we copy * back only the requested length */ - resp_data = kzalloc(max(blk_rq_bytes(rsp), 128U), GFP_KERNEL); - - if (!req_data || !resp_data) { - error = -ENOMEM; - goto out; - } - - local_irq_disable(); - buf = kmap_atomic(bio_page(req->bio)); - memcpy(req_data, buf, blk_rq_bytes(req)); - kunmap_atomic(buf - bio_offset(req->bio)); - local_irq_enable(); + resp_data = kzalloc(max(job->reply_payload.payload_len, 128U), + GFP_KERNEL); + if (!resp_data) + goto out_free_req; + error = -EINVAL; if (req_data[0] != SMP_REQUEST) - goto out; - - /* always succeeds ... 
even if we can't process the request - * the result is in the response frame */ - error = 0; + goto out_free_resp; /* set up default don't know response */ resp_data[0] = SMP_RESPONSE; @@ -274,20 +263,18 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, switch (req_data[1]) { case SMP_REPORT_GENERAL: - scsi_req(req)->resid_len -= 8; - scsi_req(rsp)->resid_len -= 32; resp_data[2] = SMP_RESP_FUNC_ACC; resp_data[9] = sas_ha->num_phys; + reslen = 32; break; case SMP_REPORT_MANUF_INFO: - scsi_req(req)->resid_len -= 8; - scsi_req(rsp)->resid_len -= 64; resp_data[2] = SMP_RESP_FUNC_ACC; memcpy(resp_data + 12, shost->hostt->name, SAS_EXPANDER_VENDOR_ID_LEN); memcpy(resp_data + 20, "libsas virt phy", SAS_EXPANDER_PRODUCT_ID_LEN); + reslen = 64; break; case SMP_READ_GPIO_REG: @@ -295,14 +282,10 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_DISCOVER: - scsi_req(req)->resid_len -= 16; - if ((int)scsi_req(req)->resid_len < 0) { - scsi_req(req)->resid_len = 0; - error = -EINVAL; - goto out; - } - scsi_req(rsp)->resid_len -= 56; + if (job->request_payload.payload_len < 16) + goto out_free_resp; sas_host_smp_discover(sas_ha, resp_data, req_data[9]); + reslen = 56; break; case SMP_REPORT_PHY_ERR_LOG: @@ -311,14 +294,10 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_REPORT_PHY_SATA: - scsi_req(req)->resid_len -= 16; - if ((int)scsi_req(req)->resid_len < 0) { - scsi_req(req)->resid_len = 0; - error = -EINVAL; - goto out; - } - scsi_req(rsp)->resid_len -= 60; + if (job->request_payload.payload_len < 16) + goto out_free_resp; sas_report_phy_sata(sas_ha, resp_data, req_data[9]); + reslen = 60; break; case SMP_REPORT_ROUTE_INFO: @@ -330,16 +309,15 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, const int base_frame_size = 11; int to_write = req_data[4]; - if (blk_rq_bytes(req) < base_frame_size + to_write * 4 || - scsi_req(req)->resid_len < base_frame_size + to_write * 4) { + if (job->request_payload.payload_len < + base_frame_size + to_write * 4) { resp_data[2] = SMP_RESP_INV_FRM_LEN; break; } to_write = sas_host_smp_write_gpio(sas_ha, resp_data, req_data[2], req_data[3], to_write, &req_data[8]); - scsi_req(req)->resid_len -= base_frame_size + to_write * 4; - scsi_req(rsp)->resid_len -= 8; + reslen = 8; break; } @@ -348,16 +326,12 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_PHY_CONTROL: - scsi_req(req)->resid_len -= 44; - if ((int)scsi_req(req)->resid_len < 0) { - scsi_req(req)->resid_len = 0; - error = -EINVAL; - goto out; - } - scsi_req(rsp)->resid_len -= 8; + if (job->request_payload.payload_len < 44) + goto out_free_resp; sas_phy_control(sas_ha, req_data[9], req_data[10], req_data[32] >> 4, req_data[33] >> 4, resp_data); + reslen = 8; break; case SMP_PHY_TEST_FUNCTION: @@ -369,15 +343,15 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; } - local_irq_disable(); - buf = kmap_atomic(bio_page(rsp->bio)); - memcpy(buf, resp_data, blk_rq_bytes(rsp)); - flush_kernel_dcache_page(bio_page(rsp->bio)); - kunmap_atomic(buf - bio_offset(rsp->bio)); - local_irq_enable(); + sg_copy_from_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, resp_data, + job->reply_payload.payload_len); - out: - kfree(req_data); + error = 0; +out_free_resp: kfree(resp_data); - return error; +out_free_req: + kfree(req_data); +out: + bsg_job_done(job, error, reslen); } diff --git a/drivers/scsi/libsas/sas_internal.h 
b/drivers/scsi/libsas/sas_internal.h index a216c957b639..c07e08136491 100644 --- a/drivers/scsi/libsas/sas_internal.h +++ b/drivers/scsi/libsas/sas_internal.h @@ -81,6 +81,8 @@ int sas_queue_work(struct sas_ha_struct *ha, struct sas_work *sw); int sas_notify_lldd_dev_found(struct domain_device *); void sas_notify_lldd_dev_gone(struct domain_device *); +void sas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy); int sas_smp_phy_control(struct domain_device *dev, int phy_id, enum phy_func phy_func, struct sas_phy_linkrates *); int sas_smp_get_phy_events(struct sas_phy *phy); @@ -98,16 +100,14 @@ void sas_hae_reset(struct work_struct *work); void sas_free_device(struct kref *kref); #ifdef CONFIG_SCSI_SAS_HOST_SMP -extern int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, - struct request *rsp); +extern void sas_smp_host_handler(struct bsg_job *job, struct Scsi_Host *shost); #else -static inline int sas_smp_host_handler(struct Scsi_Host *shost, - struct request *req, - struct request *rsp) +static inline void sas_smp_host_handler(struct bsg_job *job, + struct Scsi_Host *shost) { shost_printk(KERN_ERR, shost, "Cannot send SMP to a sas host (not enabled in CONFIG)\n"); - return -EINVAL; + bsg_job_done(job, -EINVAL, 0); } #endif diff --git a/drivers/scsi/mpt3sas/mpt3sas_transport.c b/drivers/scsi/mpt3sas/mpt3sas_transport.c index e7a7a704a315..d3940c5d079d 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_transport.c +++ b/drivers/scsi/mpt3sas/mpt3sas_transport.c @@ -1870,6 +1870,38 @@ _transport_phy_speed(struct sas_phy *phy, struct sas_phy_linkrates *rates) return rc; } +static int +_transport_map_smp_buffer(struct device *dev, struct bsg_buffer *buf, + dma_addr_t *dma_addr, size_t *dma_len, void **p) +{ + /* Check if the request is split across multiple segments */ + if (buf->sg_cnt > 1) { + *p = dma_alloc_coherent(dev, buf->payload_len, dma_addr, + GFP_KERNEL); + if (!*p) + return -ENOMEM; + *dma_len = buf->payload_len; + } else { + if (!dma_map_sg(dev, buf->sg_list, 1, DMA_BIDIRECTIONAL)) + return -ENOMEM; + *dma_addr = sg_dma_address(buf->sg_list); + *dma_len = sg_dma_len(buf->sg_list); + *p = NULL; + } + + return 0; +} + +static void +_transport_unmap_smp_buffer(struct device *dev, struct bsg_buffer *buf, + dma_addr_t dma_addr, void *p) +{ + if (p) + dma_free_coherent(dev, buf->payload_len, p, dma_addr); + else + dma_unmap_sg(dev, buf->sg_list, 1, DMA_BIDIRECTIONAL); +} + /** * _transport_smp_handler - transport portal for smp passthru * @shost: shost object @@ -1880,9 +1912,9 @@ _transport_phy_speed(struct sas_phy *phy, struct sas_phy_linkrates *rates) * Example: * smp_rep_general /sys/class/bsg/expander-5:0 */ -static int -_transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, - struct request *req) +static void +_transport_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, + struct sas_rphy *rphy) { struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); Mpi2SmpPassthroughRequest_t *mpi_request; @@ -1891,33 +1923,25 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, u16 smid; u32 ioc_state; void *psge; - u8 issue_reset = 0; - dma_addr_t dma_addr_in = 0; - dma_addr_t dma_addr_out = 0; - dma_addr_t pci_dma_in = 0; - dma_addr_t pci_dma_out = 0; - void *pci_addr_in = NULL; - void *pci_addr_out = NULL; + dma_addr_t dma_addr_in; + dma_addr_t dma_addr_out; + void *addr_in = NULL; + void *addr_out = NULL; + size_t dma_len_in; + size_t dma_len_out; u16 wait_state_count; - struct request *rsp = req->next_rq; - 
struct bio_vec bvec; - struct bvec_iter iter; - - if (!rsp) { - pr_err(MPT3SAS_FMT "%s: the smp response space is missing\n", - ioc->name, __func__); - return -EINVAL; - } + unsigned int reslen = 0; if (ioc->shost_recovery || ioc->pci_error_recovery) { pr_info(MPT3SAS_FMT "%s: host reset in progress!\n", __func__, ioc->name); - return -EFAULT; + rc = -EFAULT; + goto out; } rc = mutex_lock_interruptible(&ioc->transport_cmds.mutex); if (rc) - return rc; + goto out; if (ioc->transport_cmds.status != MPT3_CMD_NOT_USED) { pr_err(MPT3SAS_FMT "%s: transport_cmds in use\n", ioc->name, @@ -1927,58 +1951,20 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, } ioc->transport_cmds.status = MPT3_CMD_PENDING; - /* Check if the request is split across multiple segments */ - if (bio_multiple_segments(req->bio)) { - u32 offset = 0; - - /* Allocate memory and copy the request */ - pci_addr_out = pci_alloc_consistent(ioc->pdev, - blk_rq_bytes(req), &pci_dma_out); - if (!pci_addr_out) { - pr_info(MPT3SAS_FMT "%s(): PCI Addr out = NULL\n", - ioc->name, __func__); - rc = -ENOMEM; - goto out; - } - - bio_for_each_segment(bvec, req->bio, iter) { - memcpy(pci_addr_out + offset, - page_address(bvec.bv_page) + bvec.bv_offset, - bvec.bv_len); - offset += bvec.bv_len; - } - } else { - dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio), - blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(ioc->pdev, dma_addr_out)) { - pr_info(MPT3SAS_FMT "%s(): DMA Addr out = NULL\n", - ioc->name, __func__); - rc = -ENOMEM; - goto free_pci; - } + rc = _transport_map_smp_buffer(&ioc->pdev->dev, &job->request_payload, + &dma_addr_out, &dma_len_out, &addr_out); + if (rc) + goto out; + if (addr_out) { + sg_copy_to_buffer(job->request_payload.sg_list, + job->request_payload.sg_cnt, addr_out, + job->request_payload.payload_len); } - /* Check if the response needs to be populated across - * multiple segments */ - if (bio_multiple_segments(rsp->bio)) { - pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp), - &pci_dma_in); - if (!pci_addr_in) { - pr_info(MPT3SAS_FMT "%s(): PCI Addr in = NULL\n", - ioc->name, __func__); - rc = -ENOMEM; - goto unmap; - } - } else { - dma_addr_in = pci_map_single(ioc->pdev, bio_data(rsp->bio), - blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL); - if (pci_dma_mapping_error(ioc->pdev, dma_addr_in)) { - pr_info(MPT3SAS_FMT "%s(): DMA Addr in = NULL\n", - ioc->name, __func__); - rc = -ENOMEM; - goto unmap; - } - } + rc = _transport_map_smp_buffer(&ioc->pdev->dev, &job->reply_payload, + &dma_addr_in, &dma_len_in, &addr_in); + if (rc) + goto unmap_out; wait_state_count = 0; ioc_state = mpt3sas_base_get_iocstate(ioc, 1); @@ -1988,7 +1974,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, "%s: failed due to ioc not operational\n", ioc->name, __func__); rc = -EFAULT; - goto unmap; + goto unmap_in; } ssleep(1); ioc_state = mpt3sas_base_get_iocstate(ioc, 1); @@ -2005,7 +1991,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, pr_err(MPT3SAS_FMT "%s: failed obtaining a smid\n", ioc->name, __func__); rc = -EAGAIN; - goto unmap; + goto unmap_in; } rc = 0; @@ -2018,15 +2004,11 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, mpi_request->SASAddress = (rphy) ? 
cpu_to_le64(rphy->identify.sas_address) : cpu_to_le64(ioc->sas_hba.sas_address); - mpi_request->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4); + mpi_request->RequestDataLength = cpu_to_le16(dma_len_out - 4); psge = &mpi_request->SGL; - if (bio_multiple_segments(req->bio)) - ioc->build_sg(ioc, psge, pci_dma_out, (blk_rq_bytes(req) - 4), - pci_dma_in, (blk_rq_bytes(rsp) + 4)); - else - ioc->build_sg(ioc, psge, dma_addr_out, (blk_rq_bytes(req) - 4), - dma_addr_in, (blk_rq_bytes(rsp) + 4)); + ioc->build_sg(ioc, psge, dma_addr_out, dma_len_out - 4, dma_addr_in, + dma_len_in - 4); dtransportprintk(ioc, pr_info(MPT3SAS_FMT "%s - sending smp request\n", ioc->name, __func__)); @@ -2040,83 +2022,51 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, __func__, ioc->name); _debug_dump_mf(mpi_request, sizeof(Mpi2SmpPassthroughRequest_t)/4); - if (!(ioc->transport_cmds.status & MPT3_CMD_RESET)) - issue_reset = 1; - goto issue_host_reset; + if (!(ioc->transport_cmds.status & MPT3_CMD_RESET)) { + mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER); + rc = -ETIMEDOUT; + goto unmap_in; + } } dtransportprintk(ioc, pr_info(MPT3SAS_FMT "%s - complete\n", ioc->name, __func__)); - if (ioc->transport_cmds.status & MPT3_CMD_REPLY_VALID) { - - mpi_reply = ioc->transport_cmds.reply; - - dtransportprintk(ioc, pr_info(MPT3SAS_FMT - "%s - reply data transfer size(%d)\n", - ioc->name, __func__, - le16_to_cpu(mpi_reply->ResponseDataLength))); - - memcpy(scsi_req(req)->sense, mpi_reply, sizeof(*mpi_reply)); - scsi_req(req)->sense_len = sizeof(*mpi_reply); - scsi_req(req)->resid_len = 0; - scsi_req(rsp)->resid_len -= - le16_to_cpu(mpi_reply->ResponseDataLength); - - /* check if the resp needs to be copied from the allocated - * pci mem */ - if (bio_multiple_segments(rsp->bio)) { - u32 offset = 0; - u32 bytes_to_copy = - le16_to_cpu(mpi_reply->ResponseDataLength); - bio_for_each_segment(bvec, rsp->bio, iter) { - if (bytes_to_copy <= bvec.bv_len) { - memcpy(page_address(bvec.bv_page) + - bvec.bv_offset, pci_addr_in + - offset, bytes_to_copy); - break; - } else { - memcpy(page_address(bvec.bv_page) + - bvec.bv_offset, pci_addr_in + - offset, bvec.bv_len); - bytes_to_copy -= bvec.bv_len; - } - offset += bvec.bv_len; - } - } - } else { + if (!(ioc->transport_cmds.status & MPT3_CMD_REPLY_VALID)) { dtransportprintk(ioc, pr_info(MPT3SAS_FMT "%s - no reply\n", ioc->name, __func__)); rc = -ENXIO; + goto unmap_in; } - issue_host_reset: - if (issue_reset) { - mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER); - rc = -ETIMEDOUT; - } + mpi_reply = ioc->transport_cmds.reply; - unmap: - if (dma_addr_out) - pci_unmap_single(ioc->pdev, dma_addr_out, blk_rq_bytes(req), - PCI_DMA_BIDIRECTIONAL); - if (dma_addr_in) - pci_unmap_single(ioc->pdev, dma_addr_in, blk_rq_bytes(rsp), - PCI_DMA_BIDIRECTIONAL); + dtransportprintk(ioc, + pr_info(MPT3SAS_FMT "%s - reply data transfer size(%d)\n", + ioc->name, __func__, + le16_to_cpu(mpi_reply->ResponseDataLength))); - free_pci: - if (pci_addr_out) - pci_free_consistent(ioc->pdev, blk_rq_bytes(req), pci_addr_out, - pci_dma_out); + memcpy(job->reply, mpi_reply, sizeof(*mpi_reply)); + job->reply_len = sizeof(*mpi_reply); + reslen = le16_to_cpu(mpi_reply->ResponseDataLength); - if (pci_addr_in) - pci_free_consistent(ioc->pdev, blk_rq_bytes(rsp), pci_addr_in, - pci_dma_in); + if (addr_in) { + sg_copy_to_buffer(job->reply_payload.sg_list, + job->reply_payload.sg_cnt, addr_in, + job->reply_payload.payload_len); + } + rc = 0; + unmap_in: + 
_transport_unmap_smp_buffer(&ioc->pdev->dev, &job->reply_payload, + dma_addr_in, addr_in); + unmap_out: + _transport_unmap_smp_buffer(&ioc->pdev->dev, &job->request_payload, + dma_addr_out, addr_out); out: ioc->transport_cmds.status = MPT3_CMD_NOT_USED; mutex_unlock(&ioc->transport_cmds.mutex); - return rc; + bsg_job_done(job, rc, reslen); } struct sas_function_template mpt3sas_transport_functions = { diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index e2e948f1ce28..319dff970237 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -169,39 +169,22 @@ static struct sas_end_device *sas_sdev_to_rdev(struct scsi_device *sdev) return rdev; } -static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost, - struct sas_rphy *rphy) +static int sas_smp_dispatch(struct bsg_job *job) { - struct request *req; - blk_status_t ret; - int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *); + struct Scsi_Host *shost = dev_to_shost(job->dev); + struct sas_rphy *rphy = NULL; - while ((req = blk_fetch_request(q)) != NULL) { - spin_unlock_irq(q->queue_lock); + if (!scsi_is_host_device(job->dev)) + rphy = dev_to_rphy(job->dev); - scsi_req(req)->resid_len = blk_rq_bytes(req); - if (req->next_rq) - scsi_req(req->next_rq)->resid_len = - blk_rq_bytes(req->next_rq); - handler = to_sas_internal(shost->transportt)->f->smp_handler; - ret = handler(shost, rphy, req); - scsi_req(req)->result = ret; - - blk_end_request_all(req, 0); - - spin_lock_irq(q->queue_lock); + if (!job->req->next_rq) { + dev_warn(job->dev, "space for a smp response is missing\n"); + bsg_job_done(job, -EINVAL, 0); + return 0; } -} -static void sas_host_smp_request(struct request_queue *q) -{ - sas_smp_request(q, (struct Scsi_Host *)q->queuedata, NULL); -} - -static void sas_non_host_smp_request(struct request_queue *q) -{ - struct sas_rphy *rphy = q->queuedata; - sas_smp_request(q, rphy_to_shost(rphy), rphy); + to_sas_internal(shost->transportt)->f->smp_handler(job, shost, rphy); + return 0; } static void sas_host_release(struct device *dev) @@ -217,81 +200,36 @@ static void sas_host_release(struct device *dev) static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy) { struct request_queue *q; - int error; - struct device *dev; - char namebuf[20]; - const char *name; - void (*release)(struct device *); if (!to_sas_internal(shost->transportt)->f->smp_handler) { printk("%s can't handle SMP requests\n", shost->hostt->name); return 0; } - q = blk_alloc_queue(GFP_KERNEL); - if (!q) - return -ENOMEM; - q->initialize_rq_fn = scsi_initialize_rq; - q->cmd_size = sizeof(struct scsi_request); - if (rphy) { - q->request_fn = sas_non_host_smp_request; - dev = &rphy->dev; - name = dev_name(dev); - release = NULL; + q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), + sas_smp_dispatch, 0, NULL); + if (IS_ERR(q)) + return PTR_ERR(q); + rphy->q = q; } else { - q->request_fn = sas_host_smp_request; - dev = &shost->shost_gendev; - snprintf(namebuf, sizeof(namebuf), - "sas_host%d", shost->host_no); - name = namebuf; - release = sas_host_release; + char name[20]; + + snprintf(name, sizeof(name), "sas_host%d", shost->host_no); + q = bsg_setup_queue(&shost->shost_gendev, name, + sas_smp_dispatch, 0, sas_host_release); + if (IS_ERR(q)) + return PTR_ERR(q); + to_sas_host_attrs(shost)->q = q; } - error = blk_init_allocated_queue(q); - if (error) - goto out_cleanup_queue; /* * by default assume old behaviour and bounce for any highmem page */ 
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); - - error = bsg_register_queue(q, dev, name, release); - if (error) - goto out_cleanup_queue; - - if (rphy) - rphy->q = q; - else - to_sas_host_attrs(shost)->q = q; - - if (rphy) - q->queuedata = rphy; - else - q->queuedata = shost; - queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q); return 0; - -out_cleanup_queue: - blk_cleanup_queue(q); - return error; -} - -static void sas_bsg_remove(struct Scsi_Host *shost, struct sas_rphy *rphy) -{ - struct request_queue *q; - - if (rphy) - q = rphy->q; - else - q = to_sas_host_attrs(shost)->q; - - if (!q) - return; - - bsg_unregister_queue(q); } /* @@ -321,9 +259,10 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev, struct device *cdev) { struct Scsi_Host *shost = dev_to_shost(dev); + struct request_queue *q = to_sas_host_attrs(shost)->q; - sas_bsg_remove(shost, NULL); - + if (q) + bsg_unregister_queue(q); return 0; } @@ -1713,7 +1652,8 @@ sas_rphy_remove(struct sas_rphy *rphy) } sas_rphy_unlink(rphy); - sas_bsg_remove(NULL, rphy); + if (rphy->q) + bsg_unregister_queue(rphy->q); transport_remove_device(dev); device_del(dev); } diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index e7c012ce5ecd..6c0dc6155ee7 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -721,9 +721,6 @@ extern int sas_slave_alloc(struct scsi_device *); extern int sas_ioctl(struct scsi_device *sdev, int cmd, void __user *arg); extern int sas_drain_work(struct sas_ha_struct *ha); -extern int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, - struct request *req); - extern void sas_ssp_task_response(struct device *dev, struct sas_task *task, struct ssp_response_iu *iu); struct sas_phy *sas_get_local_phy(struct domain_device *dev); diff --git a/include/scsi/scsi_transport_sas.h b/include/scsi/scsi_transport_sas.h index 73d870918939..a23304b7fb2e 100644 --- a/include/scsi/scsi_transport_sas.h +++ b/include/scsi/scsi_transport_sas.h @@ -5,6 +5,7 @@ #include #include #include +#include struct scsi_transport_template; struct sas_rphy; @@ -176,7 +177,8 @@ struct sas_function_template { int (*phy_setup)(struct sas_phy *); void (*phy_release)(struct sas_phy *); int (*set_phy_speed)(struct sas_phy *, struct sas_phy_linkrates *); - int (*smp_handler)(struct Scsi_Host *, struct sas_rphy *, struct request *); + void (*smp_handler)(struct bsg_job *, struct Scsi_Host *, + struct sas_rphy *); }; -- cgit v1.2.3 From caf989c350e8e0b9584744b9005fc2c45ca30883 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 24 Aug 2017 12:51:30 +0530 Subject: rpmsg: glink: Introduce glink smem based transport The glink protocol supports different types of transports (shared memory). With the core protocol remaining the same, the way the transport's memory is probed and accessed is different. So add support for glink's smem based transports. Adding a new smem transport register function and the fifo accessors for the same. 
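As context for the FIFO accessors added below, the fill level of an smem ring is computed from head and tail indices that live in shared memory. A minimal sketch of that arithmetic follows (illustrative only; the real accessors below additionally latch the FIFO pointer lazily and convert the indices from little endian):

	/* Illustrative only: bytes available to read in a circular FIFO */
	static size_t fifo_rx_avail(u32 head, u32 tail, size_t length)
	{
		if (head < tail)
			return length - tail + head;	/* data wraps around */

		return head - tail;			/* contiguous data */
	}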
Acked-by: Arun Kumar Neelakantam Signed-off-by: Sricharan R Signed-off-by: Bjorn Andersson --- drivers/rpmsg/Kconfig | 10 ++ drivers/rpmsg/Makefile | 1 + drivers/rpmsg/qcom_glink_native.c | 5 + drivers/rpmsg/qcom_glink_native.h | 1 + drivers/rpmsg/qcom_glink_smem.c | 311 ++++++++++++++++++++++++++++++++++++++ include/linux/rpmsg/qcom_glink.h | 27 ++++ 6 files changed, 355 insertions(+) create mode 100644 drivers/rpmsg/qcom_glink_smem.c create mode 100644 include/linux/rpmsg/qcom_glink.h (limited to 'include') diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index 4218570279e0..0fe6eac46512 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -27,6 +27,16 @@ config RPMSG_QCOM_GLINK_RPM which serves as a channel for communication with the RPM in GLINK enabled systems. +config RPMSG_QCOM_GLINK_SMEM + tristate "Qualcomm SMEM Glink driver" + select RPMSG_QCOM_GLINK_NATIVE + depends on MAILBOX + depends on QCOM_SMEM + help + Say y here to enable support for the GLINK SMEM communication driver, + which provides support for using the GLINK communication protocol + over SMEM. + config RPMSG_QCOM_SMD tristate "Qualcomm Shared Memory Driver (SMD)" depends on QCOM_SMEM diff --git a/drivers/rpmsg/Makefile b/drivers/rpmsg/Makefile index 09a756c33e71..c71f4ab1ae17 100644 --- a/drivers/rpmsg/Makefile +++ b/drivers/rpmsg/Makefile @@ -2,5 +2,6 @@ obj-$(CONFIG_RPMSG) += rpmsg_core.o obj-$(CONFIG_RPMSG_CHAR) += rpmsg_char.o obj-$(CONFIG_RPMSG_QCOM_GLINK_RPM) += qcom_glink_rpm.o obj-$(CONFIG_RPMSG_QCOM_GLINK_NATIVE) += qcom_glink_native.o +obj-$(CONFIG_RPMSG_QCOM_GLINK_SMEM) += qcom_glink_smem.o obj-$(CONFIG_RPMSG_QCOM_SMD) += qcom_smd.o obj-$(CONFIG_RPMSG_VIRTIO) += virtio_rpmsg_bus.o diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c index 21adde34725b..50a800803688 100644 --- a/drivers/rpmsg/qcom_glink_native.c +++ b/drivers/rpmsg/qcom_glink_native.c @@ -1010,3 +1010,8 @@ void qcom_glink_native_remove(struct qcom_glink *glink) idr_destroy(&glink->rcids); mbox_free_channel(glink->mbox_chan); } + +void qcom_glink_native_unregister(struct qcom_glink *glink) +{ + device_unregister(glink->dev); +} diff --git a/drivers/rpmsg/qcom_glink_native.h b/drivers/rpmsg/qcom_glink_native.h index d5627a495c00..197bb9da2c65 100644 --- a/drivers/rpmsg/qcom_glink_native.h +++ b/drivers/rpmsg/qcom_glink_native.h @@ -35,4 +35,5 @@ struct qcom_glink *qcom_glink_native_probe(struct device *dev, struct qcom_glink_pipe *tx); void qcom_glink_native_remove(struct qcom_glink *glink); +void qcom_glink_native_unregister(struct qcom_glink *glink); #endif diff --git a/drivers/rpmsg/qcom_glink_smem.c b/drivers/rpmsg/qcom_glink_smem.c new file mode 100644 index 000000000000..19179a15b229 --- /dev/null +++ b/drivers/rpmsg/qcom_glink_smem.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2016, Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "qcom_glink_native.h" + +#define FIFO_FULL_RESERVE 8 +#define FIFO_ALIGNMENT 8 +#define TX_BLOCKED_CMD_RESERVE 8 /* size of struct read_notif_request */ + +#define SMEM_GLINK_NATIVE_XPRT_DESCRIPTOR 478 +#define SMEM_GLINK_NATIVE_XPRT_FIFO_0 479 +#define SMEM_GLINK_NATIVE_XPRT_FIFO_1 480 + +struct glink_smem_pipe { + struct qcom_glink_pipe native; + + __le32 *tail; + __le32 *head; + + void *fifo; + + int remote_pid; +}; + +#define to_smem_pipe(p) container_of(p, struct glink_smem_pipe, native) + +static size_t glink_smem_rx_avail(struct qcom_glink_pipe *np) +{ + struct glink_smem_pipe *pipe = to_smem_pipe(np); + size_t len; + void *fifo; + u32 head; + u32 tail; + + if (!pipe->fifo) { + fifo = qcom_smem_get(pipe->remote_pid, + SMEM_GLINK_NATIVE_XPRT_FIFO_1, &len); + if (IS_ERR(fifo)) { + pr_err("failed to acquire RX fifo handle: %ld\n", + PTR_ERR(fifo)); + return 0; + } + + pipe->fifo = fifo; + pipe->native.length = len; + } + + head = le32_to_cpu(*pipe->head); + tail = le32_to_cpu(*pipe->tail); + + if (head < tail) + return pipe->native.length - tail + head; + else + return head - tail; +} + +static void glink_smem_rx_peak(struct qcom_glink_pipe *np, + void *data, size_t count) +{ + struct glink_smem_pipe *pipe = to_smem_pipe(np); + size_t len; + u32 tail; + + tail = le32_to_cpu(*pipe->tail); + + len = min_t(size_t, count, pipe->native.length - tail); + if (len) { + __ioread32_copy(data, pipe->fifo + tail, + len / sizeof(u32)); + } + + if (len != count) { + __ioread32_copy(data + len, pipe->fifo, + (count - len) / sizeof(u32)); + } +} + +static void glink_smem_rx_advance(struct qcom_glink_pipe *np, + size_t count) +{ + struct glink_smem_pipe *pipe = to_smem_pipe(np); + u32 tail; + + tail = le32_to_cpu(*pipe->tail); + + tail += count; + if (tail > pipe->native.length) + tail -= pipe->native.length; + + *pipe->tail = cpu_to_le32(tail); +} + +static size_t glink_smem_tx_avail(struct qcom_glink_pipe *np) +{ + struct glink_smem_pipe *pipe = to_smem_pipe(np); + u32 head; + u32 tail; + u32 avail; + + head = le32_to_cpu(*pipe->head); + tail = le32_to_cpu(*pipe->tail); + + if (tail <= head) + avail = pipe->native.length - head + tail; + else + avail = tail - head; + + if (avail < (FIFO_FULL_RESERVE + TX_BLOCKED_CMD_RESERVE)) + avail = 0; + else + avail -= FIFO_FULL_RESERVE + TX_BLOCKED_CMD_RESERVE; + + return avail; +} + +static unsigned int glink_smem_tx_write_one(struct glink_smem_pipe *pipe, + unsigned int head, + const void *data, size_t count) +{ + size_t len; + + len = min_t(size_t, count, pipe->native.length - head); + if (len) + memcpy(pipe->fifo + head, data, len); + + if (len != count) + memcpy(pipe->fifo, data + len, count - len); + + head += count; + if (head >= pipe->native.length) + head -= pipe->native.length; + + return head; +} + +static void glink_smem_tx_write(struct qcom_glink_pipe *glink_pipe, + const void *hdr, size_t hlen, + const void *data, size_t dlen) +{ + struct glink_smem_pipe *pipe = to_smem_pipe(glink_pipe); + unsigned int head; + + head = le32_to_cpu(*pipe->head); + + head = glink_smem_tx_write_one(pipe, head, hdr, hlen); + head = glink_smem_tx_write_one(pipe, head, data, dlen); + + /* Ensure head is always aligned to 8 bytes */ + head = ALIGN(head, 8); + if (head >= pipe->native.length) + head -= pipe->native.length; + + *pipe->head = 
cpu_to_le32(head); +} + +static void qcom_glink_smem_release(struct device *dev) +{ + kfree(dev); +} + +struct qcom_glink *qcom_glink_smem_register(struct device *parent, + struct device_node *node) +{ + struct glink_smem_pipe *rx_pipe; + struct glink_smem_pipe *tx_pipe; + struct qcom_glink *glink; + struct device *dev; + u32 remote_pid; + __le32 *descs; + size_t size; + int ret; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->parent = parent; + dev->of_node = node; + dev->release = qcom_glink_smem_release; + dev_set_name(dev, "%s:%s", node->parent->name, node->name); + ret = device_register(dev); + if (ret) { + pr_err("failed to register glink edge\n"); + return ERR_PTR(ret); + } + + ret = of_property_read_u32(dev->of_node, "qcom,remote-pid", + &remote_pid); + if (ret) { + dev_err(dev, "failed to parse qcom,remote-pid\n"); + goto err_put_dev; + } + + rx_pipe = devm_kzalloc(dev, sizeof(*rx_pipe), GFP_KERNEL); + tx_pipe = devm_kzalloc(dev, sizeof(*tx_pipe), GFP_KERNEL); + if (!rx_pipe || !tx_pipe) { + ret = -ENOMEM; + goto err_put_dev; + } + + ret = qcom_smem_alloc(remote_pid, + SMEM_GLINK_NATIVE_XPRT_DESCRIPTOR, 32); + if (ret && ret != -EEXIST) { + dev_err(dev, "failed to allocate glink descriptors\n"); + goto err_put_dev; + } + + descs = qcom_smem_get(remote_pid, + SMEM_GLINK_NATIVE_XPRT_DESCRIPTOR, &size); + if (IS_ERR(descs)) { + dev_err(dev, "failed to acquire xprt descriptor\n"); + ret = PTR_ERR(descs); + goto err_put_dev; + } + + if (size != 32) { + dev_err(dev, "glink descriptor of invalid size\n"); + ret = -EINVAL; + goto err_put_dev; + } + + tx_pipe->tail = &descs[0]; + tx_pipe->head = &descs[1]; + rx_pipe->tail = &descs[2]; + rx_pipe->head = &descs[3]; + + ret = qcom_smem_alloc(remote_pid, SMEM_GLINK_NATIVE_XPRT_FIFO_0, + SZ_16K); + if (ret && ret != -EEXIST) { + dev_err(dev, "failed to allocate TX fifo\n"); + goto err_put_dev; + } + + tx_pipe->fifo = qcom_smem_get(remote_pid, SMEM_GLINK_NATIVE_XPRT_FIFO_0, + &tx_pipe->native.length); + if (IS_ERR(tx_pipe->fifo)) { + dev_err(dev, "failed to acquire TX fifo\n"); + ret = PTR_ERR(tx_pipe->fifo); + goto err_put_dev; + } + + rx_pipe->native.avail = glink_smem_rx_avail; + rx_pipe->native.peak = glink_smem_rx_peak; + rx_pipe->native.advance = glink_smem_rx_advance; + rx_pipe->remote_pid = remote_pid; + + tx_pipe->native.avail = glink_smem_tx_avail; + tx_pipe->native.write = glink_smem_tx_write; + tx_pipe->remote_pid = remote_pid; + + *rx_pipe->tail = 0; + *tx_pipe->head = 0; + + glink = qcom_glink_native_probe(dev, + &rx_pipe->native, &tx_pipe->native); + if (IS_ERR(glink)) { + ret = PTR_ERR(glink); + goto err_put_dev; + } + + return glink; + +err_put_dev: + put_device(dev); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(qcom_glink_smem_register); + +void qcom_glink_smem_unregister(struct qcom_glink *glink) +{ + qcom_glink_native_remove(glink); + qcom_glink_native_unregister(glink); +} +EXPORT_SYMBOL_GPL(qcom_glink_smem_unregister); + +MODULE_AUTHOR("Bjorn Andersson "); +MODULE_DESCRIPTION("Qualcomm GLINK SMEM driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/rpmsg/qcom_glink.h b/include/linux/rpmsg/qcom_glink.h new file mode 100644 index 000000000000..a622f029836e --- /dev/null +++ b/include/linux/rpmsg/qcom_glink.h @@ -0,0 +1,27 @@ +#ifndef _LINUX_RPMSG_QCOM_GLINK_H +#define _LINUX_RPMSG_QCOM_GLINK_H + +#include + +struct qcom_glink; + +#if IS_ENABLED(CONFIG_RPMSG_QCOM_GLINK_SMEM) + +struct qcom_glink *qcom_glink_smem_register(struct device *parent, + struct device_node 
*node); +void qcom_glink_smem_unregister(struct qcom_glink *glink); + +#else + +static inline struct qcom_glink * +qcom_glink_smem_register(struct device *parent, + struct device_node *node) +{ + return NULL; +} + +static inline void qcom_glink_smem_unregister(struct qcom_glink *glink) {} + +#endif + +#endif -- cgit v1.2.3 From 603aa14d3daaa7073bab4c472025c4963030e0cc Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 21 Jun 2017 16:00:27 +0200 Subject: mmc: tmio, renesas-sdhi: add max_{segs, blk_count} to tmio_mmc_data Allow TMIO and SDHI driver implementations to provide values for max_segs and max_blk_count. A follow-up patch will set these values for Renesas Gen3 SoCs using the SDHI driver. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Ai Kyuse Signed-off-by: Simon Horman Reviewed-by: Wolfram Sang Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi.h | 2 ++ drivers/mmc/host/renesas_sdhi_core.c | 2 ++ drivers/mmc/host/tmio_mmc_core.c | 6 +++--- include/linux/mfd/tmio.h | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/mmc/host/renesas_sdhi.h b/drivers/mmc/host/renesas_sdhi.h index ca83acc113b8..b9dfea5d8193 100644 --- a/drivers/mmc/host/renesas_sdhi.h +++ b/drivers/mmc/host/renesas_sdhi.h @@ -31,6 +31,8 @@ struct renesas_sdhi_of_data { int scc_offset; struct renesas_sdhi_scc *taps; int taps_num; + unsigned int max_blk_count; + unsigned short max_segs; }; int renesas_sdhi_probe(struct platform_device *pdev, diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index a4fb07d0ea91..569bcdd5e653 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -526,6 +526,8 @@ int renesas_sdhi_probe(struct platform_device *pdev, mmc_data->capabilities |= of_data->capabilities; mmc_data->capabilities2 |= of_data->capabilities2; mmc_data->dma_rx_offset = of_data->dma_rx_offset; + mmc_data->max_blk_count = of_data->max_blk_count; + mmc_data->max_segs = of_data->max_segs; dma_priv->dma_buswidth = of_data->dma_buswidth; host->bus_shift = of_data->bus_shift; } diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 88a94355ac90..56478fde7a09 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -1251,10 +1251,10 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host, mmc->caps |= MMC_CAP_4_BIT_DATA | pdata->capabilities; mmc->caps2 |= pdata->capabilities2; - mmc->max_segs = 32; + mmc->max_segs = pdata->max_segs ? : 32; mmc->max_blk_size = 512; - mmc->max_blk_count = (PAGE_SIZE / mmc->max_blk_size) * - mmc->max_segs; + mmc->max_blk_count = pdata->max_blk_count ?
: + (PAGE_SIZE / mmc->max_blk_size) * mmc->max_segs; mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; mmc->max_seg_size = mmc->max_req_size; diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 26e8f8c0a6db..18b17a39d465 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -128,6 +128,8 @@ struct tmio_mmc_data { unsigned int cd_gpio; int alignment_shift; dma_addr_t dma_rx_offset; + unsigned int max_blk_count; + unsigned short max_segs; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); }; -- cgit v1.2.3 From 1896f14006b28a7ed6881666d18a244827af0a5b Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Wed, 19 Jul 2017 15:50:56 +0800 Subject: mmc: core: remove check of host->removed for rescan routine The intention of this check was to prevent conflicts between hotplug and driver removal. Currently it doesn't improve anything, and a subsequent rescan can still safely perform the scan flow. This code is pointless now, so remove it. Signed-off-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 13 ------------- include/linux/mmc/host.h | 3 --- 2 files changed, 16 deletions(-) (limited to 'include') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 26431267a3e2..b311ec974c6a 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1769,13 +1769,6 @@ void mmc_detach_bus(struct mmc_host *host) static void _mmc_detect_change(struct mmc_host *host, unsigned long delay, bool cd_irq) { -#ifdef CONFIG_MMC_DEBUG - unsigned long flags; - spin_lock_irqsave(&host->lock, flags); - WARN_ON(host->removed); - spin_unlock_irqrestore(&host->lock, flags); -#endif - /* * If the device is configured as wakeup, we prevent a new sleep for * 5 s to give provision for user space to consume the event. @@ -2646,12 +2639,6 @@ void mmc_start_host(struct mmc_host *host) void mmc_stop_host(struct mmc_host *host) { -#ifdef CONFIG_MMC_DEBUG - unsigned long flags; - spin_lock_irqsave(&host->lock, flags); - host->removed = 1; - spin_unlock_irqrestore(&host->lock, flags); -#endif if (host->slot.cd_irq >= 0) { if (host->slot.cd_wake_enabled) disable_irq_wake(host->slot.cd_irq); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index ebd1cebbef0c..4617c21f97f7 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -328,9 +328,6 @@ struct mmc_host { unsigned int use_spi_crc:1; unsigned int claimed:1; /* host exclusively claimed */ unsigned int bus_dead:1; /* bus has been released */ -#ifdef CONFIG_MMC_DEBUG - unsigned int removed:1; /* host is being removed */ -#endif unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ unsigned int retune_now:1; /* do re-tuning at next req */ -- cgit v1.2.3 From 4406ae215b5a1dd59d941c1323b9f40d241357ac Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Wed, 2 Aug 2017 11:12:42 +0800 Subject: mmc: core: correct taac parameter according to the specification Per the spec of JESD84-B51, section 7.3, replace tacc with taac to fix the obvious typo.
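To make the renamed fields concrete, here is a worked decode of the CSD access time using the taac_exp/taac_mant tables visible in the diff below (the e/m values are an illustrative example, not from any particular card):

	/* e and m are unstuffed from CSD bits [114:112] and [118:115] */
	unsigned int e = 4;	/* exponent code: 10000 ns time unit */
	unsigned int m = 4;	/* mantissa code: 15, i.e. a factor of 1.5 */
	unsigned int taac_ns = (taac_exp[e] * taac_mant[m] + 9) / 10;
	/* (10000 * 15 + 9) / 10 = 15000 ns, so TAAC decodes to 15 us */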
Signed-off-by: Shawn Lin Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 14 +++++++------- drivers/mmc/core/mmc.c | 8 ++++---- drivers/mmc/core/sd.c | 12 ++++++------ include/linux/mmc/card.h | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 15623ccdcd8a..6177eb09bf1b 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -733,8 +733,8 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) if (data->flags & MMC_DATA_WRITE) mult <<= card->csd.r2w_factor; - data->timeout_ns = card->csd.tacc_ns * mult; - data->timeout_clks = card->csd.tacc_clks * mult; + data->timeout_ns = card->csd.taac_ns * mult; + data->timeout_clks = card->csd.taac_clks * mult; /* * SD cards also have an upper limit on the timeout. @@ -1859,14 +1859,14 @@ static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, } else { /* CSD Erase Group Size uses write timeout */ unsigned int mult = (10 << card->csd.r2w_factor); - unsigned int timeout_clks = card->csd.tacc_clks * mult; + unsigned int timeout_clks = card->csd.taac_clks * mult; unsigned int timeout_us; - /* Avoid overflow: e.g. tacc_ns=80000000 mult=1280 */ - if (card->csd.tacc_ns < 1000000) - timeout_us = (card->csd.tacc_ns * mult) / 1000; + /* Avoid overflow: e.g. taac_ns=80000000 mult=1280 */ + if (card->csd.taac_ns < 1000000) + timeout_us = (card->csd.taac_ns * mult) / 1000; else - timeout_us = (card->csd.tacc_ns / 1000) * mult; + timeout_us = (card->csd.taac_ns / 1000) * mult; /* * ios.clock is only a target. The real clock rate might be diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 2bae69e39544..a65f56bea4f3 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -41,11 +41,11 @@ static const unsigned char tran_mant[] = { 35, 40, 45, 50, 55, 60, 70, 80, }; -static const unsigned int tacc_exp[] = { +static const unsigned int taac_exp[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, }; -static const unsigned int tacc_mant[] = { +static const unsigned int taac_mant[] = { 0, 10, 12, 13, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80, }; @@ -153,8 +153,8 @@ static int mmc_decode_csd(struct mmc_card *card) csd->mmca_vsn = UNSTUFF_BITS(resp, 122, 4); m = UNSTUFF_BITS(resp, 115, 4); e = UNSTUFF_BITS(resp, 112, 3); - csd->tacc_ns = (tacc_exp[e] * tacc_mant[m] + 9) / 10; - csd->tacc_clks = UNSTUFF_BITS(resp, 104, 8) * 100; + csd->taac_ns = (taac_exp[e] * taac_mant[m] + 9) / 10; + csd->taac_clks = UNSTUFF_BITS(resp, 104, 8) * 100; m = UNSTUFF_BITS(resp, 99, 4); e = UNSTUFF_BITS(resp, 96, 3); diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index a1b0aa14d5e3..4fd1620b732d 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -39,11 +39,11 @@ static const unsigned char tran_mant[] = { 35, 40, 45, 50, 55, 60, 70, 80, }; -static const unsigned int tacc_exp[] = { +static const unsigned int taac_exp[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, }; -static const unsigned int tacc_mant[] = { +static const unsigned int taac_mant[] = { 0, 10, 12, 13, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70, 80, }; @@ -111,8 +111,8 @@ static int mmc_decode_csd(struct mmc_card *card) case 0: m = UNSTUFF_BITS(resp, 115, 4); e = UNSTUFF_BITS(resp, 112, 3); - csd->tacc_ns = (tacc_exp[e] * tacc_mant[m] + 9) / 10; - csd->tacc_clks = UNSTUFF_BITS(resp, 104, 8) * 100; + csd->taac_ns = (taac_exp[e] * taac_mant[m] + 9) / 10; + csd->taac_clks = UNSTUFF_BITS(resp, 104, 8) * 100; m 
= UNSTUFF_BITS(resp, 99, 4); e = UNSTUFF_BITS(resp, 96, 3); @@ -148,8 +148,8 @@ static int mmc_decode_csd(struct mmc_card *card) */ mmc_card_set_blockaddr(card); - csd->tacc_ns = 0; /* Unused */ - csd->tacc_clks = 0; /* Unused */ + csd->taac_ns = 0; /* Unused */ + csd->taac_clks = 0; /* Unused */ m = UNSTUFF_BITS(resp, 99, 4); e = UNSTUFF_BITS(resp, 96, 3); diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 46c73e97e61f..279b39008a33 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -29,8 +29,8 @@ struct mmc_csd { unsigned char structure; unsigned char mmca_vsn; unsigned short cmdclass; - unsigned short tacc_clks; - unsigned int tacc_ns; + unsigned short taac_clks; + unsigned int taac_ns; unsigned int c_size; unsigned int r2w_factor; unsigned int max_dtr; -- cgit v1.2.3 From f6f64ed868d32a121f1535da9f42791c91562e4a Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 24 Jul 2017 21:58:56 +0800 Subject: clk: sunxi-ng: Add interface to query or configure MMC timing modes. Starting with the A83T SoC, Allwinner introduced a new timing mode for its MMC clocks. The new mode changes how the MMC controller sample and output clocks are delayed to match chip and board specifics. There are two controls for this, one on the CCU side controlling how the clocks behave, and one in the MMC controller controlling what inputs to take and how to route them. In the old mode, the MMC clock had 2 child clocks providing the output and sample clocks, which could be delayed by a number of clock cycles measured from the MMC clock's parent. With the new mode, the 2 delay clocks are no longer active. Instead, the delays and associated controls are moved into the MMC controller. The output of the MMC clock is also halved. The difference in how things are wired between the modes means that the clock controls and the MMC controls must match. To achieve this in a clear, explicit way, we introduce two functions for the MMC driver to use: one queries the hardware for the current mode set, and the other allows the MMC driver to request a mode. 
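A hedged sketch of how an MMC host driver might use the pair of helpers; the function name, clock handle, and fallback policy are illustrative assumptions, not part of this patch:

	static int sunxi_mmc_enable_new_timing(struct clk *mmc_clk)
	{
		int ret;

		ret = sunxi_ccu_set_mmc_timing_mode(mmc_clk, true);
		if (ret == -ENOTSUPP)
			return 0;	/* old-mode-only clock: keep legacy delays */
		if (ret)
			return ret;

		/* confirm the CCU side before programming the MMC side */
		ret = sunxi_ccu_get_mmc_timing_mode(mmc_clk);
		if (ret < 0)
			return ret;

		return ret ? 0 : -EINVAL;	/* both sides must agree */
	}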
Signed-off-by: Chen-Yu Tsai Acked-by: Maxime Ripard Signed-off-by: Ulf Hansson --- drivers/clk/sunxi-ng/Makefile | 1 + drivers/clk/sunxi-ng/ccu_common.h | 4 ++ drivers/clk/sunxi-ng/ccu_mmc_timing.c | 70 +++++++++++++++++++++++++++++++++++ include/linux/clk/sunxi-ng.h | 35 ++++++++++++++++++ 4 files changed, 110 insertions(+) create mode 100644 drivers/clk/sunxi-ng/ccu_mmc_timing.c create mode 100644 include/linux/clk/sunxi-ng.h (limited to 'include') diff --git a/drivers/clk/sunxi-ng/Makefile b/drivers/clk/sunxi-ng/Makefile index 0c45fa50283d..45a5910379a5 100644 --- a/drivers/clk/sunxi-ng/Makefile +++ b/drivers/clk/sunxi-ng/Makefile @@ -1,5 +1,6 @@ # Common objects lib-$(CONFIG_SUNXI_CCU) += ccu_common.o +lib-$(CONFIG_SUNXI_CCU) += ccu_mmc_timing.o lib-$(CONFIG_SUNXI_CCU) += ccu_reset.o # Base clock types diff --git a/drivers/clk/sunxi-ng/ccu_common.h b/drivers/clk/sunxi-ng/ccu_common.h index d6fdd7a789aa..cadd1a9f93b6 100644 --- a/drivers/clk/sunxi-ng/ccu_common.h +++ b/drivers/clk/sunxi-ng/ccu_common.h @@ -23,6 +23,10 @@ #define CCU_FEATURE_FIXED_POSTDIV BIT(3) #define CCU_FEATURE_ALL_PREDIV BIT(4) #define CCU_FEATURE_LOCK_REG BIT(5) +#define CCU_FEATURE_MMC_TIMING_SWITCH BIT(6) + +/* MMC timing mode switch bit */ +#define CCU_MMC_NEW_TIMING_MODE BIT(30) struct device_node; diff --git a/drivers/clk/sunxi-ng/ccu_mmc_timing.c b/drivers/clk/sunxi-ng/ccu_mmc_timing.c new file mode 100644 index 000000000000..f9869f7353c0 --- /dev/null +++ b/drivers/clk/sunxi-ng/ccu_mmc_timing.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 Chen-Yu Tsai. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#include "ccu_common.h" + +/** + * sunxi_ccu_set_mmc_timing_mode: Configure the MMC clock timing mode + * @clk: clock to be configured + * @new_mode: true for new timing mode introduced in A83T and later + * + * Returns 0 on success, -ENOTSUPP if the clock does not support + * switching modes. + */ +int sunxi_ccu_set_mmc_timing_mode(struct clk *clk, bool new_mode) +{ + struct clk_hw *hw = __clk_get_hw(clk); + struct ccu_common *cm = hw_to_ccu_common(hw); + unsigned long flags; + u32 val; + + if (!(cm->features & CCU_FEATURE_MMC_TIMING_SWITCH)) + return -ENOTSUPP; + + spin_lock_irqsave(cm->lock, flags); + + val = readl(cm->base + cm->reg); + if (new_mode) + val |= CCU_MMC_NEW_TIMING_MODE; + else + val &= ~CCU_MMC_NEW_TIMING_MODE; + writel(val, cm->base + cm->reg); + + spin_unlock_irqrestore(cm->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(sunxi_ccu_set_mmc_timing_mode); + +/** + * sunxi_ccu_get_mmc_timing_mode: Get the current MMC clock timing mode + * @clk: clock to query + * + * Returns 0 if the clock is in old timing mode, > 0 if it is in + * new timing mode, and -ENOTSUPP if the clock does not support + * this function.
+ */ +int sunxi_ccu_get_mmc_timing_mode(struct clk *clk) +{ + struct clk_hw *hw = __clk_get_hw(clk); + struct ccu_common *cm = hw_to_ccu_common(hw); + + if (!(cm->features & CCU_FEATURE_MMC_TIMING_SWITCH)) + return -ENOTSUPP; + + return !!(readl(cm->base + cm->reg) & CCU_MMC_NEW_TIMING_MODE); +} +EXPORT_SYMBOL_GPL(sunxi_ccu_get_mmc_timing_mode); diff --git a/include/linux/clk/sunxi-ng.h b/include/linux/clk/sunxi-ng.h new file mode 100644 index 000000000000..990f760f70e5 --- /dev/null +++ b/include/linux/clk/sunxi-ng.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2017 Chen-Yu Tsai. All rights reserved. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_CLK_SUNXI_NG_H_ +#define _LINUX_CLK_SUNXI_NG_H_ + +#include + +#ifdef CONFIG_SUNXI_CCU +int sunxi_ccu_set_mmc_timing_mode(struct clk *clk, bool new_mode); +int sunxi_ccu_get_mmc_timing_mode(struct clk *clk); +#else +static inline int sunxi_ccu_set_mmc_timing_mode(struct clk *clk, + bool new_mode) +{ + return -ENOTSUPP; +} + +static inline int sunxi_ccu_get_mmc_timing_mode(struct clk *clk) +{ + return -ENOTSUPP; +} +#endif + +#endif -- cgit v1.2.3 From 5124b59202eb3118eba5ac2222dc00f3390549a8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 9 Aug 2017 21:00:41 +0200 Subject: mmc: renesas_sdhi: use extra flag for CBSY usage There is one SDHI instance on Gen2 which does not have the CBSY bit. So, turn CBSY usage into an extra flag and set it accordingly. This has the additional advantage that we can also set it for other incarnations later. 
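For context, a sketch of what the new flag gates in practice; the busy-wait helper below is an assumption for illustration and not part of this patch:

	/* Only variants with TMIO_MMC_HAVE_CBSY can poll command-busy */
	static int tmio_wait_cmd_idle(struct tmio_mmc_host *host)
	{
		unsigned long expire = jiffies + msecs_to_jiffies(10);

		if (!(host->pdata->flags & TMIO_MMC_HAVE_CBSY))
			return 0;	/* no CBSY bit: nothing to poll */

		while (sd_ctrl_read16_and_16_as_32(host, CTL_STATUS) &
		       TMIO_STAT_CMD_BUSY) {
			if (time_after(jiffies, expire))
				return -EBUSY;
			udelay(1);
		}

		return 0;
	}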
Signed-off-by: Wolfram Sang Tested-by: Chris Brandt Reviewed-by: Simon Horman Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 6 +++++- drivers/mmc/host/renesas_sdhi_internal_dmac.c | 3 ++- drivers/mmc/host/renesas_sdhi_sys_dmac.c | 6 ++++-- include/linux/mfd/tmio.h | 3 +++ 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index dd215723fa43..2dea4039c091 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -429,7 +429,7 @@ static int renesas_sdhi_write16_hook(struct tmio_mmc_host *host, int addr) case CTL_TRANSACTION_CTL: case CTL_DMA_ENABLE: case EXT_ACC: - if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) + if (host->pdata->flags & TMIO_MMC_HAVE_CBSY) bit = TMIO_STAT_CMD_BUSY; /* fallthrough */ case CTL_SD_CARD_CLK_CTL: @@ -588,6 +588,10 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (ret < 0) goto efree; + /* One Gen2 SDHI incarnation does NOT have a CBSY bit */ + if (sd_ctrl_read16(host, CTL_VERSION) == SDHI_VER_GEN2_SDR50) + mmc_data->flags &= ~TMIO_MMC_HAVE_CBSY; + /* Enable tuning iff we have an SCC and a supported mode */ if (of_data && of_data->scc_offset && (host->mmc->caps & MMC_CAP_UHS_SDR104 || diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 3a8257742040..f905f2361d12 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -72,7 +72,8 @@ static struct renesas_sdhi_scc rcar_gen3_scc_taps[] = { static const struct renesas_sdhi_of_data of_rcar_gen3_compatible = { .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE | - TMIO_MMC_CLK_ACTUAL | TMIO_MMC_MIN_RCAR2, + TMIO_MMC_CLK_ACTUAL | TMIO_MMC_HAVE_CBSY | + TMIO_MMC_MIN_RCAR2, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_CMD23, .bus_shift = 2, diff --git a/drivers/mmc/host/renesas_sdhi_sys_dmac.c b/drivers/mmc/host/renesas_sdhi_sys_dmac.c index 718cb8a9d2ce..9b77f521cd2c 100644 --- a/drivers/mmc/host/renesas_sdhi_sys_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_sys_dmac.c @@ -58,7 +58,8 @@ static struct renesas_sdhi_scc rcar_gen2_scc_taps[] = { static const struct renesas_sdhi_of_data of_rcar_gen2_compatible = { .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE | - TMIO_MMC_CLK_ACTUAL | TMIO_MMC_MIN_RCAR2, + TMIO_MMC_CLK_ACTUAL | TMIO_MMC_HAVE_CBSY | + TMIO_MMC_MIN_RCAR2, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_CMD23, .dma_buswidth = DMA_SLAVE_BUSWIDTH_4_BYTES, @@ -78,7 +79,8 @@ static struct renesas_sdhi_scc rcar_gen3_scc_taps[] = { static const struct renesas_sdhi_of_data of_rcar_gen3_compatible = { .tmio_flags = TMIO_MMC_HAS_IDLE_WAIT | TMIO_MMC_WRPROTECT_DISABLE | - TMIO_MMC_CLK_ACTUAL | TMIO_MMC_MIN_RCAR2, + TMIO_MMC_CLK_ACTUAL | TMIO_MMC_HAVE_CBSY | + TMIO_MMC_MIN_RCAR2, .capabilities = MMC_CAP_SD_HIGHSPEED | MMC_CAP_SDIO_IRQ | MMC_CAP_CMD23, .bus_shift = 2, diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 18b17a39d465..b572955e6de6 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -107,6 +107,9 @@ */ #define TMIO_MMC_CLK_ACTUAL BIT(10) +/* Some controllers have a CBSY bit */ +#define TMIO_MMC_HAVE_CBSY BIT(11) + int tmio_core_mmc_enable(void __iomem *cnf, int shift, unsigned long base); int tmio_core_mmc_resume(void __iomem *cnf, int shift, unsigned long base); void tmio_core_mmc_pwr(void __iomem *cnf, int 
shift, int state); -- cgit v1.2.3 From dc8d68bb6c6f7d7fb1aeb3450c0654598e4f4e52 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 10 Aug 2017 15:08:11 +0300 Subject: mmc: core: Remove unused MMC_CAP2_PACKED_CMD Packed command support was removed but some bits got left behind. Remove them. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 23 ----------------------- include/linux/mmc/host.h | 4 ---- 2 files changed, 27 deletions(-) (limited to 'include') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index a65f56bea4f3..a7eb623f8daa 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1790,29 +1790,6 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, */ card->reenable_cmdq = card->ext_csd.cmdq_en; - /* - * The mandatory minimum values are defined for packed command. - * read: 5, write: 3 - */ - if (card->ext_csd.max_packed_writes >= 3 && - card->ext_csd.max_packed_reads >= 5 && - host->caps2 & MMC_CAP2_PACKED_CMD) { - err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, - EXT_CSD_EXP_EVENTS_CTRL, - EXT_CSD_PACKED_EVENT_EN, - card->ext_csd.generic_cmd6_time); - if (err && err != -EBADMSG) - goto free_card; - if (err) { - pr_warn("%s: Enabling packed event failed\n", - mmc_hostname(card->host)); - card->ext_csd.packed_event_en = 0; - err = 0; - } else { - card->ext_csd.packed_event_en = 1; - } - } - if (!oldcard) host->card = card; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 4617c21f97f7..e92629518f68 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -291,10 +291,6 @@ struct mmc_host { MMC_CAP2_HS200_1_2V_SDR) #define MMC_CAP2_CD_ACTIVE_HIGH (1 << 10) /* Card-detect signal active high */ #define MMC_CAP2_RO_ACTIVE_HIGH (1 << 11) /* Write-protect signal active high */ -#define MMC_CAP2_PACKED_RD (1 << 12) /* Allow packed read */ -#define MMC_CAP2_PACKED_WR (1 << 13) /* Allow packed write */ -#define MMC_CAP2_PACKED_CMD (MMC_CAP2_PACKED_RD | \ - MMC_CAP2_PACKED_WR) #define MMC_CAP2_NO_PRESCAN_POWERUP (1 << 14) /* Don't power up before scan */ #define MMC_CAP2_HS400_1_8V (1 << 15) /* Can support HS400 1.8V */ #define MMC_CAP2_HS400_1_2V (1 << 16) /* Can support HS400 1.2V */ -- cgit v1.2.3 From d2f82254e4e862662e7820953b049b7d4d660ec7 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 10 Aug 2017 15:08:07 +0300 Subject: mmc: core: Add members to mmc_request and mmc_data for CQE's Most of the information needed to issue requests to a CQE is already in struct mmc_request and struct mmc_data. Add the data block address, some flags, and the task id (tag), and allow for cmd being NULL, which it is for CQE tasks.
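As a minimal sketch (not part of the patch itself), a CQE issuer might fill the new members like this; the helper name and the 512-byte block size are assumptions for the example:

/* Illustrative sketch only: populate the new CQE members for a task. */
static void cqe_prep_rw(struct mmc_request *mrq, struct mmc_data *data,
			unsigned int blk_addr, unsigned int blocks,
			int tag, bool write)
{
	data->blksz = 512;		/* assumed block size */
	data->blocks = blocks;
	data->blk_addr = blk_addr;	/* new member: data block address */
	data->flags = write ? MMC_DATA_WRITE : MMC_DATA_READ;

	mrq->cmd = NULL;		/* CQE tasks carry no command */
	mrq->data = data;
	mrq->tag = tag;			/* new member: CQE task id */
}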
Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 13 +++++++++++-- include/trace/events/mmc.h | 36 +++++++++++++++++++++++------------- 2 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index a0c63ea28796..bf1788a224e6 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -122,11 +122,18 @@ struct mmc_data { unsigned int timeout_clks; /* data timeout (in clocks) */ unsigned int blksz; /* data block size */ unsigned int blocks; /* number of blocks */ + unsigned int blk_addr; /* block address */ int error; /* data error */ unsigned int flags; -#define MMC_DATA_WRITE (1 << 8) -#define MMC_DATA_READ (1 << 9) +#define MMC_DATA_WRITE BIT(8) +#define MMC_DATA_READ BIT(9) +/* Extra flags used by CQE */ +#define MMC_DATA_QBR BIT(10) /* CQE queue barrier*/ +#define MMC_DATA_PRIO BIT(11) /* CQE high priority */ +#define MMC_DATA_REL_WR BIT(12) /* Reliable write */ +#define MMC_DATA_DAT_TAG BIT(13) /* Tag request */ +#define MMC_DATA_FORCED_PRG BIT(14) /* Forced programming */ unsigned int bytes_xfered; @@ -153,6 +160,8 @@ struct mmc_request { /* Allow other commands during this ongoing data transfer or busy wait */ bool cap_cmd_during_tfr; + + int tag; }; struct mmc_card; diff --git a/include/trace/events/mmc.h b/include/trace/events/mmc.h index a72f9b94c80b..f30a99ac65b6 100644 --- a/include/trace/events/mmc.h +++ b/include/trace/events/mmc.h @@ -29,8 +29,10 @@ TRACE_EVENT(mmc_request_start, __field(unsigned int, sbc_flags) __field(unsigned int, sbc_retries) __field(unsigned int, blocks) + __field(unsigned int, blk_addr) __field(unsigned int, blksz) __field(unsigned int, data_flags) + __field(int, tag) __field(unsigned int, can_retune) __field(unsigned int, doing_retune) __field(unsigned int, retune_now) @@ -42,10 +44,10 @@ TRACE_EVENT(mmc_request_start, ), TP_fast_assign( - __entry->cmd_opcode = mrq->cmd->opcode; - __entry->cmd_arg = mrq->cmd->arg; - __entry->cmd_flags = mrq->cmd->flags; - __entry->cmd_retries = mrq->cmd->retries; + __entry->cmd_opcode = mrq->cmd ? mrq->cmd->opcode : 0; + __entry->cmd_arg = mrq->cmd ? mrq->cmd->arg : 0; + __entry->cmd_flags = mrq->cmd ? mrq->cmd->flags : 0; + __entry->cmd_retries = mrq->cmd ? mrq->cmd->retries : 0; __entry->stop_opcode = mrq->stop ? mrq->stop->opcode : 0; __entry->stop_arg = mrq->stop ? mrq->stop->arg : 0; __entry->stop_flags = mrq->stop ? mrq->stop->flags : 0; @@ -56,7 +58,9 @@ TRACE_EVENT(mmc_request_start, __entry->sbc_retries = mrq->sbc ? mrq->sbc->retries : 0; __entry->blksz = mrq->data ? mrq->data->blksz : 0; __entry->blocks = mrq->data ? mrq->data->blocks : 0; + __entry->blk_addr = mrq->data ? mrq->data->blk_addr : 0; __entry->data_flags = mrq->data ? 
mrq->data->flags : 0; + __entry->tag = mrq->tag; __entry->can_retune = host->can_retune; __entry->doing_retune = host->doing_retune; __entry->retune_now = host->retune_now; @@ -71,8 +75,8 @@ TRACE_EVENT(mmc_request_start, "cmd_opcode=%u cmd_arg=0x%x cmd_flags=0x%x cmd_retries=%u " "stop_opcode=%u stop_arg=0x%x stop_flags=0x%x stop_retries=%u " "sbc_opcode=%u sbc_arg=0x%x sbc_flags=0x%x sbc_retires=%u " - "blocks=%u block_size=%u data_flags=0x%x " - "can_retune=%u doing_retune=%u retune_now=%u " + "blocks=%u block_size=%u blk_addr=%u data_flags=0x%x " + "tag=%d can_retune=%u doing_retune=%u retune_now=%u " "need_retune=%d hold_retune=%d retune_period=%u", __get_str(name), __entry->mrq, __entry->cmd_opcode, __entry->cmd_arg, @@ -81,7 +85,8 @@ TRACE_EVENT(mmc_request_start, __entry->stop_flags, __entry->stop_retries, __entry->sbc_opcode, __entry->sbc_arg, __entry->sbc_flags, __entry->sbc_retries, - __entry->blocks, __entry->blksz, __entry->data_flags, + __entry->blocks, __entry->blk_addr, + __entry->blksz, __entry->data_flags, __entry->tag, __entry->can_retune, __entry->doing_retune, __entry->retune_now, __entry->need_retune, __entry->hold_retune, __entry->retune_period) @@ -108,6 +113,7 @@ TRACE_EVENT(mmc_request_done, __field(unsigned int, sbc_retries) __field(unsigned int, bytes_xfered) __field(int, data_err) + __field(int, tag) __field(unsigned int, can_retune) __field(unsigned int, doing_retune) __field(unsigned int, retune_now) @@ -119,10 +125,13 @@ TRACE_EVENT(mmc_request_done, ), TP_fast_assign( - __entry->cmd_opcode = mrq->cmd->opcode; - __entry->cmd_err = mrq->cmd->error; - memcpy(__entry->cmd_resp, mrq->cmd->resp, 4); - __entry->cmd_retries = mrq->cmd->retries; + __entry->cmd_opcode = mrq->cmd ? mrq->cmd->opcode : 0; + __entry->cmd_err = mrq->cmd ? mrq->cmd->error : 0; + __entry->cmd_resp[0] = mrq->cmd ? mrq->cmd->resp[0] : 0; + __entry->cmd_resp[1] = mrq->cmd ? mrq->cmd->resp[1] : 0; + __entry->cmd_resp[2] = mrq->cmd ? mrq->cmd->resp[2] : 0; + __entry->cmd_resp[3] = mrq->cmd ? mrq->cmd->resp[3] : 0; + __entry->cmd_retries = mrq->cmd ? mrq->cmd->retries : 0; __entry->stop_opcode = mrq->stop ? mrq->stop->opcode : 0; __entry->stop_err = mrq->stop ? mrq->stop->error : 0; __entry->stop_resp[0] = mrq->stop ? mrq->stop->resp[0] : 0; @@ -139,6 +148,7 @@ TRACE_EVENT(mmc_request_done, __entry->sbc_retries = mrq->sbc ? mrq->sbc->retries : 0; __entry->bytes_xfered = mrq->data ? mrq->data->bytes_xfered : 0; __entry->data_err = mrq->data ? 
mrq->data->error : 0; + __entry->tag = mrq->tag; __entry->can_retune = host->can_retune; __entry->doing_retune = host->doing_retune; __entry->retune_now = host->retune_now; @@ -154,7 +164,7 @@ TRACE_EVENT(mmc_request_done, "cmd_retries=%u stop_opcode=%u stop_err=%d " "stop_resp=0x%x 0x%x 0x%x 0x%x stop_retries=%u " "sbc_opcode=%u sbc_err=%d sbc_resp=0x%x 0x%x 0x%x 0x%x " - "sbc_retries=%u bytes_xfered=%u data_err=%d " + "sbc_retries=%u bytes_xfered=%u data_err=%d tag=%d " "can_retune=%u doing_retune=%u retune_now=%u need_retune=%d " "hold_retune=%d retune_period=%u", __get_str(name), __entry->mrq, @@ -170,7 +180,7 @@ TRACE_EVENT(mmc_request_done, __entry->sbc_resp[0], __entry->sbc_resp[1], __entry->sbc_resp[2], __entry->sbc_resp[3], __entry->sbc_retries, - __entry->bytes_xfered, __entry->data_err, + __entry->bytes_xfered, __entry->data_err, __entry->tag, __entry->can_retune, __entry->doing_retune, __entry->retune_now, __entry->need_retune, __entry->hold_retune, __entry->retune_period) -- cgit v1.2.3 From c0020756315eebec58310aca42cf9fb73e1322eb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jul 2017 21:28:52 +0300 Subject: efi: switch to use new generic UUID API There are new types and helpers that are supposed to be used in new code. As a preparation to get rid of legacy types and API functions do the conversion here. Signed-off-by: Andy Shevchenko Acked-by: Ard Biesheuvel Signed-off-by: Christoph Hellwig --- drivers/firmware/efi/cper.c | 10 ++--- include/linux/cper.h | 94 ++++++++++++++++++++++----------------------- include/linux/efi.h | 4 +- 3 files changed, 54 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 48a8f69da42a..684e65c11dde 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -534,7 +534,7 @@ static void cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata, int sec_no) { - uuid_le *sec_type = (uuid_le *)gdata->section_type; + guid_t *sec_type = (guid_t *)gdata->section_type; __u16 severity; char newpfx[64]; @@ -545,12 +545,12 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata printk("%s""Error %d, type: %s\n", pfx, sec_no, cper_severity_str(severity)); if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) - printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id); + printk("%s""fru_id: %pUl\n", pfx, gdata->fru_id); if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text); snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP); - if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) { + if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) { struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata); printk("%s""section_type: general processor error\n", newpfx); @@ -558,7 +558,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata cper_print_proc_generic(newpfx, proc_err); else goto err_section_too_small; - } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { + } else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); printk("%s""section_type: memory error\n", newpfx); @@ -568,7 +568,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata gdata->error_data_length); else goto err_section_too_small; - } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) { + } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { 
struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata); printk("%s""section_type: PCIe error\n", newpfx); diff --git a/include/linux/cper.h b/include/linux/cper.h index 4c671fc2081e..723e952fde0d 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -74,36 +74,36 @@ enum { * Corrected Machine Check */ #define CPER_NOTIFY_CMC \ - UUID_LE(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ - 0xEB, 0xD4, 0xF8, 0x90) + GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ + 0xEB, 0xD4, 0xF8, 0x90) /* Corrected Platform Error */ #define CPER_NOTIFY_CPE \ - UUID_LE(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \ - 0xF2, 0x7E, 0xBE, 0xEE) + GUID_INIT(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \ + 0xF2, 0x7E, 0xBE, 0xEE) /* Machine Check Exception */ #define CPER_NOTIFY_MCE \ - UUID_LE(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ - 0xE1, 0x49, 0x13, 0xBB) + GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ + 0xE1, 0x49, 0x13, 0xBB) /* PCI Express Error */ #define CPER_NOTIFY_PCIE \ - UUID_LE(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \ - 0xAF, 0x67, 0xC1, 0x04) + GUID_INIT(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \ + 0xAF, 0x67, 0xC1, 0x04) /* INIT Record (for IPF) */ #define CPER_NOTIFY_INIT \ - UUID_LE(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \ - 0xD3, 0x9B, 0xC9, 0x8E) + GUID_INIT(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \ + 0xD3, 0x9B, 0xC9, 0x8E) /* Non-Maskable Interrupt */ #define CPER_NOTIFY_NMI \ - UUID_LE(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \ - 0x85, 0xD6, 0xE9, 0x8A) + GUID_INIT(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \ + 0x85, 0xD6, 0xE9, 0x8A) /* BOOT Error Record */ #define CPER_NOTIFY_BOOT \ - UUID_LE(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ - 0xD4, 0x64, 0xB3, 0x8F) + GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ + 0xD4, 0x64, 0xB3, 0x8F) /* DMA Remapping Error */ #define CPER_NOTIFY_DMAR \ - UUID_LE(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ - 0x72, 0x2D, 0xEB, 0x41) + GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ + 0x72, 0x2D, 0xEB, 0x41) /* * Flags bits definitions for flags in struct cper_record_header @@ -170,50 +170,50 @@ enum { * Processor Generic */ #define CPER_SEC_PROC_GENERIC \ - UUID_LE(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \ - 0x93, 0xC4, 0xF3, 0xDB) + GUID_INIT(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \ + 0x93, 0xC4, 0xF3, 0xDB) /* Processor Specific: X86/X86_64 */ #define CPER_SEC_PROC_IA \ - UUID_LE(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ - 0x24, 0x2B, 0x6E, 0x1D) + GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ + 0x24, 0x2B, 0x6E, 0x1D) /* Processor Specific: IA64 */ #define CPER_SEC_PROC_IPF \ - UUID_LE(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \ - 0x80, 0xC7, 0x3C, 0x88, 0x81) + GUID_INIT(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \ + 0x80, 0xC7, 0x3C, 0x88, 0x81) /* Processor Specific: ARM */ #define CPER_SEC_PROC_ARM \ - UUID_LE(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \ - 0x1D, 0x5D, 0x46, 0xB0) + GUID_INIT(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \ + 0x1D, 0x5D, 0x46, 0xB0) /* Platform Memory */ #define CPER_SEC_PLATFORM_MEM \ - UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \ - 0xED, 0x7C, 0x83, 0xB1) + GUID_INIT(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \ + 0xED, 0x7C, 0x83, 0xB1) #define CPER_SEC_PCIE \ - UUID_LE(0xD995E954, 0xBBC1, 0x430F, 0xAD, 
0x91, 0xB4, 0x4D, \ - 0xCB, 0x3C, 0x6F, 0x35) + GUID_INIT(0xD995E954, 0xBBC1, 0x430F, 0xAD, 0x91, 0xB4, 0x4D, \ + 0xCB, 0x3C, 0x6F, 0x35) /* Firmware Error Record Reference */ #define CPER_SEC_FW_ERR_REC_REF \ - UUID_LE(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \ - 0x9C, 0x8E, 0x69, 0xED) + GUID_INIT(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \ + 0x9C, 0x8E, 0x69, 0xED) /* PCI/PCI-X Bus */ #define CPER_SEC_PCI_X_BUS \ - UUID_LE(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \ - 0xD3, 0xF9, 0xC9, 0xDD) + GUID_INIT(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \ + 0xD3, 0xF9, 0xC9, 0xDD) /* PCI Component/Device */ #define CPER_SEC_PCI_DEV \ - UUID_LE(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \ - 0x8B, 0x00, 0x13, 0x26) + GUID_INIT(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \ + 0x8B, 0x00, 0x13, 0x26) #define CPER_SEC_DMAR_GENERIC \ - UUID_LE(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \ - 0xDE, 0x3E, 0x2C, 0x64) + GUID_INIT(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \ + 0xDE, 0x3E, 0x2C, 0x64) /* Intel VT for Directed I/O specific DMAr */ #define CPER_SEC_DMAR_VT \ - UUID_LE(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \ - 0xDD, 0x93, 0xE8, 0xCF) + GUID_INIT(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \ + 0xDD, 0x93, 0xE8, 0xCF) /* IOMMU specific DMAr */ #define CPER_SEC_DMAR_IOMMU \ - UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ - 0xDF, 0xAA, 0x84, 0xEC) + GUID_INIT(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ + 0xDF, 0xAA, 0x84, 0xEC) #define CPER_PROC_VALID_TYPE 0x0001 #define CPER_PROC_VALID_ISA 0x0002 @@ -290,10 +290,10 @@ struct cper_record_header { __u32 validation_bits; __u32 record_length; __u64 timestamp; - uuid_le platform_id; - uuid_le partition_id; - uuid_le creator_id; - uuid_le notification_type; + guid_t platform_id; + guid_t partition_id; + guid_t creator_id; + guid_t notification_type; __u64 record_id; __u32 flags; __u64 persistence_information; @@ -309,8 +309,8 @@ struct cper_section_descriptor { __u8 validation_bits; __u8 reserved; /* must be zero */ __u32 flags; - uuid_le section_type; - uuid_le fru_id; + guid_t section_type; + guid_t fru_id; __u32 section_severity; __u8 fru_text[20]; }; @@ -343,7 +343,7 @@ struct cper_sec_proc_ia { /* IA32/X64 Processor Error Information Structure */ struct cper_ia_err_info { - uuid_le err_type; + guid_t err_type; __u64 validation_bits; __u64 check_info; __u64 target_id; diff --git a/include/linux/efi.h b/include/linux/efi.h index 8269bcb8ccf7..7a322aed979f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -47,10 +47,10 @@ typedef u16 efi_char16_t; /* UNICODE character */ typedef u64 efi_physical_addr_t; typedef void *efi_handle_t; -typedef uuid_le efi_guid_t; +typedef guid_t efi_guid_t; #define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \ - UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) + GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) /* * Generic EFI table header -- cgit v1.2.3 From 223694b9ae8bfba99f3528d49d07a740af6ff95a Mon Sep 17 00:00:00 2001 From: Changpeng Liu Date: Thu, 31 Aug 2017 11:22:49 +0800 Subject: nvme: fix the definition of the doorbell buffer config support bit The NVMe 1.3 specification defines the Optional Admin Command Support (OACS) feature flags: if bit 8 is set to '1', the controller supports the Doorbell Buffer Config command. Bit 7 is used for the Virtualization Management command.
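As a minimal sketch of the intended check (not part of the patch; it assumes ctrl->oacs holds the Identify Controller OACS field, as the driver does elsewhere):

/* Sketch: gate Doorbell Buffer Config support on the corrected OACS bit. */
static bool nvme_dbbuf_supported(struct nvme_ctrl *ctrl)
{
	return ctrl->oacs & NVME_CTRL_OACS_DBBUF_SUPP;	/* now 1 << 8 */
}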
Signed-off-by: Changpeng Liu Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Fixes: f9f38e33 ("nvme: improve performance for virtual NVMe devices") Cc: stable@vger.kernel.org --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 25d8225dbd04..8efff888bd9b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -254,7 +254,7 @@ enum { NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, - NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, }; struct nvme_lbaf { -- cgit v1.2.3 From d3bf68ae04c7e29ed3c30b7f4b1f0c6a4a11c7f1 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 25 Aug 2017 15:43:46 +0300 Subject: mmc: host: Add CQE interface Add CQE host operations, capabilities, and host members. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 6 ++++++ include/linux/mmc/host.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'include') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index bf1788a224e6..c6d17f626234 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -156,6 +156,12 @@ struct mmc_request { struct completion completion; struct completion cmd_completion; void (*done)(struct mmc_request *);/* completion function */ + /* + * Notify uppers layers (e.g. mmc block driver) that recovery is needed + * due to an error associated with the mmc_request. Currently used only + * by CQE. + */ + void (*recovery_notifier)(struct mmc_request *); struct mmc_host *host; /* Allow other commands during this ongoing data transfer or busy wait */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e92629518f68..f3f2d07feb2a 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -162,6 +162,50 @@ struct mmc_host_ops { unsigned int direction, int blk_size); }; +struct mmc_cqe_ops { + /* Allocate resources, and make the CQE operational */ + int (*cqe_enable)(struct mmc_host *host, struct mmc_card *card); + /* Free resources, and make the CQE non-operational */ + void (*cqe_disable)(struct mmc_host *host); + /* + * Issue a read, write or DCMD request to the CQE. Also deal with the + * effect of ->cqe_off(). + */ + int (*cqe_request)(struct mmc_host *host, struct mmc_request *mrq); + /* Free resources (e.g. DMA mapping) associated with the request */ + void (*cqe_post_req)(struct mmc_host *host, struct mmc_request *mrq); + /* + * Prepare the CQE and host controller to accept non-CQ commands. There + * is no corresponding ->cqe_on(), instead ->cqe_request() is required + * to deal with that. + */ + void (*cqe_off)(struct mmc_host *host); + /* + * Wait for all CQE tasks to complete. Return an error if recovery + * becomes necessary. + */ + int (*cqe_wait_for_idle)(struct mmc_host *host); + /* + * Notify CQE that a request has timed out. Return false if the request + * completed or true if a timeout happened in which case indicate if + * recovery is needed. + */ + bool (*cqe_timeout)(struct mmc_host *host, struct mmc_request *mrq, + bool *recovery_needed); + /* + * Stop all CQE activity and prepare the CQE and host controller to + * accept recovery commands. + */ + void (*cqe_recovery_start)(struct mmc_host *host); + /* + * Clear the queue and call mmc_cqe_request_done() on all requests. 
+ * Requests that errored will have the error set on the mmc_request + * (data->error or cmd->error for DCMD). Requests that did not error + * will have zero data bytes transferred. + */ + void (*cqe_recovery_finish)(struct mmc_host *host); +}; + struct mmc_async_req { /* active mmc request */ struct mmc_request *mrq; @@ -303,6 +347,8 @@ struct mmc_host { #define MMC_CAP2_HS400_ES (1 << 20) /* Host supports enhanced strobe */ #define MMC_CAP2_NO_SD (1 << 21) /* Do not send SD commands during initialization */ #define MMC_CAP2_NO_MMC (1 << 22) /* Do not send (e)MMC commands during initialization */ +#define MMC_CAP2_CQE (1 << 23) /* Has eMMC command queue engine */ +#define MMC_CAP2_CQE_DCMD (1 << 24) /* CQE can issue a direct command */ mmc_pm_flag_t pm_caps; /* supported pm features */ @@ -386,6 +432,13 @@ struct mmc_host { int dsr_req; /* DSR value is valid */ u32 dsr; /* optional driver stage (DSR) value */ + /* Command Queue Engine (CQE) support */ + const struct mmc_cqe_ops *cqe_ops; + void *cqe_private; + int cqe_qdepth; + bool cqe_enabled; + bool cqe_on; + unsigned long private[0] ____cacheline_aligned; }; -- cgit v1.2.3 From 906d5ff6188953f4981df39e9999f858542df9ce Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 25 Aug 2017 15:43:44 +0300 Subject: mmc: core: Move mmc_start_areq() declaration mmc_start_areq() is an internal mmc core API. Move the declaration accordingly. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.h | 6 ++++++ include/linux/mmc/core.h | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 55f543fd37c4..ca861091a776 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -107,6 +107,12 @@ static inline void mmc_unregister_pm_notifier(struct mmc_host *host) { } void mmc_wait_for_req_done(struct mmc_host *host, struct mmc_request *mrq); bool mmc_is_req_done(struct mmc_host *host, struct mmc_request *mrq); +struct mmc_async_req; + +struct mmc_async_req *mmc_start_areq(struct mmc_host *host, + struct mmc_async_req *areq, + enum mmc_blk_status *ret_stat); + int mmc_erase(struct mmc_card *card, unsigned int from, unsigned int nr, unsigned int arg); int mmc_can_erase(struct mmc_card *card); diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index c6d17f626234..927519385482 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -171,11 +171,7 @@ struct mmc_request { }; struct mmc_card; -struct mmc_async_req; -struct mmc_async_req *mmc_start_areq(struct mmc_host *host, - struct mmc_async_req *areq, - enum mmc_blk_status *ret_stat); void mmc_wait_for_req(struct mmc_host *host, struct mmc_request *mrq); int mmc_wait_for_cmd(struct mmc_host *host, struct mmc_command *cmd, int retries); -- cgit v1.2.3 From a0aa309c39de58b86b704654434431aeb5a8bdf1 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:55 +0300 Subject: IB/core: Add a generic way to execute an operation on a uobject The ioctl infrastructure treats all user-objects in the same manner. It gets object ids from user-space and, by using the object type and type attributes mentioned in the object specification, executes the required method. Passing an object id from user-space as an attribute is carried out in three stages. The first is carried out before the actual handler and the last is carried out afterwards. The different supported operations are read, write, destroy and create.
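In outline, the flow looks like this (a sketch only; handler() stands in for the actual method, and error handling is trimmed):

/* Sketch: bracketing a write-access method with the two new helpers. */
static int run_write_method(const struct uverbs_obj_type *type_attrs,
			    struct ib_ucontext *ucontext, int id)
{
	struct ib_uobject *uobj;
	int ret;

	/* first stage: fetch the object by id and lock it for write */
	uobj = uverbs_get_uobject_from_context(type_attrs, ucontext,
					       UVERBS_ACCESS_WRITE, id);
	if (IS_ERR(uobj))
		return PTR_ERR(uobj);

	/* second stage: the handler carries out the modification */
	ret = handler(uobj);

	/* last stage: commit on success, roll back otherwise */
	uverbs_finalize_object(uobj, UVERBS_ACCESS_WRITE, !ret);
	return ret;
}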
In the first stage, the former three actions just fetch the object from the repository (by using its id) and lock it. The last action allocates a new uobject. Afterwards, the second stage is carried out when the handler itself carries out the required modification of the object. The last stage is carried out after the handler finishes and commits the result. The former two operations just unlock the object. Destroy calls the "free object" operation, taking into account the object's type, and releases the uobject as well. Creation just adds the new uobject to the repository, making the object visible to the application. In order to abstract these details from the ioctl infrastructure layer, we add the uverbs_get_uobject_from_context and uverbs_finalize_object functions, which correspond to the first and last stages respectively. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 58 +++++++++++++++++++++++++++++++++++++ drivers/infiniband/core/rdma_core.h | 17 +++++++++++ include/rdma/uverbs_ioctl.h | 52 +++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 include/rdma/uverbs_ioctl.h (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 41c31a2bf093..2bd58ff17bb8 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" @@ -625,3 +626,60 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .needs_kfree_rcu = false, }; +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id) +{ + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + case UVERBS_ACCESS_DESTROY: + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(type_attrs, ucontext); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit) +{ + int ret = 0; + + /* + * refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. + */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, false); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_DESTROY: + if (commit) + ret = rdma_remove_commit_uobject(uobj); + else + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_NEW: + if (commit) + ret = rdma_alloc_commit_uobject(uobj); + else + rdma_alloc_abort_uobject(uobj); + break; + default: + WARN_ON(true); + ret = -EOPNOTSUPP; + } + + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 1b82e7ff7fe8..97483d1a7336 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -75,4 +76,20 @@ void uverbs_uobject_put(struct ib_uobject *uobject); */ void uverbs_close_fd(struct file *f); +/* + * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * the object is from the given type.
Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object is called. + */ +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); + #endif /* RDMA_CORE_H */ diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h new file mode 100644 index 000000000000..6885b92db4a8 --- /dev/null +++ b/include/rdma/uverbs_ioctl.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_IOCTL_ +#define _UVERBS_IOCTL_ + +#include + +/* + * ======================================= + * Verbs action specifications + * ======================================= + */ + +enum uverbs_obj_access { + UVERBS_ACCESS_READ, + UVERBS_ACCESS_WRITE, + UVERBS_ACCESS_NEW, + UVERBS_ACCESS_DESTROY +}; + +#endif + -- cgit v1.2.3 From f43dbebfa32041826299bdccae0352887fa007ea Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:56 +0300 Subject: IB/core: Add support to finalize objects in one transaction The new ioctl based infrastructure either commits or rolls back all objects of the method as one transaction. In order to do that, we introduce a notion of dealing with a collection of objects that are related to a specific method. This also requires adding a notion of a method and attribute. A method contains a hash of attributes, where each bucket contains several attributes. The attributes are hashed according to their namespace which resides in the four upper bits of the id. For example, an object could be a CQ, which has an action of CREATE_CQ. This action has multiple attributes. For example, the CQ's new handle and the comp_channel. Each layer in this hierarchy - objects, methods and attributes - is split into namespaces.
The basic example for that is one namespace representing the default entities and another one representing the driver specific entities. When declaring these methods and attributes, we actually declare their specifications. When a method is executed, we actually allocate some space to hold auxiliary information. This auxiliary information contains meta-data about the required objects, such as pointers to their type information, pointers to the uobjects themselves (if they exist), etc. The specification, along with the auxiliary information we allocated and filled, is given to the finalize_objects function. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 40 ++++++++++++++++++++++++++++ drivers/infiniband/core/rdma_core.h | 22 ++++++++++++++- include/rdma/uverbs_ioctl.h | 53 +++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 2bd58ff17bb8..0fe8ef913387 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -683,3 +683,43 @@ int uverbs_finalize_object(struct ib_uobject *uobj, return ret; } + +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < num; i++) { + struct uverbs_attr_bundle_hash *curr_bundle = + &attrs_bundle->hash[i]; + const struct uverbs_attr_spec_hash *curr_spec_bucket = + spec_hash[i]; + unsigned int j; + + for (j = 0; j < curr_bundle->num_attrs; j++) { + struct uverbs_attr *attr; + const struct uverbs_attr_spec *spec; + + if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) + continue; + + attr = &curr_bundle->attrs[j]; + spec = &curr_spec_bucket->attrs[j]; + + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) { + int current_ret; + + current_ret = uverbs_finalize_object(attr->obj_attr.uobject, + spec->obj.access, + commit); + if (!ret) + ret = current_ret; + } + } + } + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 97483d1a7336..9ed6ad0324c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -82,7 +82,8 @@ void uverbs_close_fd(struct file *f); * applicable. * This function could create (access == NEW), destroy (access == DESTROY) * or unlock (access == READ || access == WRITE) objects if required. - * The action will be finalized only when uverbs_finalize_object is called. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. */ struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, struct ib_ucontext *ucontext, @@ -91,5 +92,24 @@ struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type int uverbs_finalize_object(struct ib_uobject *uobj, enum uverbs_obj_access access, bool commit); +/* + * Note that certain finalize stages could return a status: + * (a) alloc_commit could return a failure if the object is committed at the + * same time when the context is destroyed. + * (b) remove_commit could fail if the object wasn't destroyed successfully. + * Since multiple objects could be finalized in one transaction, it is very NOT + * recommended to have several finalize actions which have side effects.
+ * For example, it's NOT recommended to have a certain action which has both + * a commit action and a destroy action or two destroy objects in the same + * action. The rule of thumb is to have one destroy or commit action with + * multiple lookups. + * The first non zero return value of finalize_object is returned from this + * function. For example, this could happen when we couldn't destroy an + * object. + */ +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit); #endif /* RDMA_CORE_H */ diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 6885b92db4a8..d3ec02b7d937 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -41,6 +41,12 @@ * ======================================= */ +enum uverbs_attr_type { + UVERBS_ATTR_TYPE_NA, + UVERBS_ATTR_TYPE_IDR, + UVERBS_ATTR_TYPE_FD, +}; + enum uverbs_obj_access { UVERBS_ACCESS_READ, UVERBS_ACCESS_WRITE, @@ -48,5 +54,52 @@ enum uverbs_obj_access { UVERBS_ACCESS_DESTROY }; +struct uverbs_attr_spec { + enum uverbs_attr_type type; + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u8 access; + } obj; +}; + +struct uverbs_attr_spec_hash { + size_t num_attrs; + struct uverbs_attr_spec attrs[0]; +}; + +struct uverbs_obj_attr { + struct ib_uobject *uobject; +}; + +struct uverbs_attr { + struct uverbs_obj_attr obj_attr; +}; + +struct uverbs_attr_bundle_hash { + /* if bit i is set, it means attrs[i] contains valid information */ + unsigned long *valid_bitmap; + size_t num_attrs; + /* + * arrays of attributes, each element corresponds to the specification + * of the attribute in the same index. + */ + struct uverbs_attr *attrs; +}; + +struct uverbs_attr_bundle { + size_t num_buckets; + struct uverbs_attr_bundle_hash hash[]; +}; + +static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_hash *attrs_hash, + unsigned int idx) +{ + return test_bit(idx, attrs_hash->valid_bitmap); +} + #endif -- cgit v1.2.3 From 66b6bef2c4e06f8c7a0030445766bf868110c5a1 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 30 Aug 2017 11:48:11 +0200 Subject: power: supply: bq24190_charger: Export 5V boost converter as regulator Register the 5V boost converter as a regulator named "usb_otg_vbus". This commit also adds support for bq24190_platform_data, through which non device-tree platforms can pass the regulator_init_data (containing mappings for the consumer amongst other things). 
Signed-off-by: Hans de Goede Acked-by: Tony Lindgren Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 112 +++++++++++++++++++++++++++++++++ include/linux/power/bq24190_charger.h | 18 ++++++ 2 files changed, 130 insertions(+) create mode 100644 include/linux/power/bq24190_charger.h (limited to 'include') diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index fa711c106b63..e606a078b0dd 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -513,6 +516,111 @@ static int bq24190_sysfs_create_group(struct bq24190_dev_info *bdi) static inline void bq24190_sysfs_remove_group(struct bq24190_dev_info *bdi) {} #endif +#ifdef CONFIG_REGULATOR +static int bq24190_set_charge_mode(struct regulator_dev *dev, u8 val) +{ + struct bq24190_dev_info *bdi = rdev_get_drvdata(dev); + int ret; + + ret = pm_runtime_get_sync(bdi->dev); + if (ret < 0) { + dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); + pm_runtime_put_noidle(bdi->dev); + return ret; + } + + ret = bq24190_write_mask(bdi, BQ24190_REG_POC, + BQ24190_REG_POC_CHG_CONFIG_MASK, + BQ24190_REG_POC_CHG_CONFIG_SHIFT, val); + + pm_runtime_mark_last_busy(bdi->dev); + pm_runtime_put_autosuspend(bdi->dev); + + return ret; +} + +static int bq24190_vbus_enable(struct regulator_dev *dev) +{ + return bq24190_set_charge_mode(dev, BQ24190_REG_POC_CHG_CONFIG_OTG); +} + +static int bq24190_vbus_disable(struct regulator_dev *dev) +{ + return bq24190_set_charge_mode(dev, BQ24190_REG_POC_CHG_CONFIG_CHARGE); +} + +static int bq24190_vbus_is_enabled(struct regulator_dev *dev) +{ + struct bq24190_dev_info *bdi = rdev_get_drvdata(dev); + int ret; + u8 val; + + ret = pm_runtime_get_sync(bdi->dev); + if (ret < 0) { + dev_warn(bdi->dev, "pm_runtime_get failed: %i\n", ret); + pm_runtime_put_noidle(bdi->dev); + return ret; + } + + ret = bq24190_read_mask(bdi, BQ24190_REG_POC, + BQ24190_REG_POC_CHG_CONFIG_MASK, + BQ24190_REG_POC_CHG_CONFIG_SHIFT, &val); + + pm_runtime_mark_last_busy(bdi->dev); + pm_runtime_put_autosuspend(bdi->dev); + + return ret ? 
ret : val == BQ24190_REG_POC_CHG_CONFIG_OTG; +} + +static const struct regulator_ops bq24190_vbus_ops = { + .enable = bq24190_vbus_enable, + .disable = bq24190_vbus_disable, + .is_enabled = bq24190_vbus_is_enabled, +}; + +static const struct regulator_desc bq24190_vbus_desc = { + .name = "usb_otg_vbus", + .type = REGULATOR_VOLTAGE, + .owner = THIS_MODULE, + .ops = &bq24190_vbus_ops, + .fixed_uV = 5000000, + .n_voltages = 1, +}; + +static const struct regulator_init_data bq24190_vbus_init_data = { + .constraints = { + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, +}; + +static int bq24190_register_vbus_regulator(struct bq24190_dev_info *bdi) +{ + struct bq24190_platform_data *pdata = bdi->dev->platform_data; + struct regulator_config cfg = { }; + struct regulator_dev *reg; + int ret = 0; + + cfg.dev = bdi->dev; + if (pdata && pdata->regulator_init_data) + cfg.init_data = pdata->regulator_init_data; + else + cfg.init_data = &bq24190_vbus_init_data; + cfg.driver_data = bdi; + reg = devm_regulator_register(bdi->dev, &bq24190_vbus_desc, &cfg); + if (IS_ERR(reg)) { + ret = PTR_ERR(reg); + dev_err(bdi->dev, "Can't register regulator: %d\n", ret); + } + + return ret; +} +#else +static int bq24190_register_vbus_regulator(struct bq24190_dev_info *bdi) +{ + return 0; +} +#endif + static int bq24190_set_config(struct bq24190_dev_info *bdi) { int ret; @@ -1740,6 +1848,10 @@ static int bq24190_probe(struct i2c_client *client, goto out_sysfs; } + ret = bq24190_register_vbus_regulator(bdi); + if (ret < 0) + goto out_sysfs; + if (bdi->extcon) { INIT_DELAYED_WORK(&bdi->extcon_work, bq24190_extcon_work); bdi->extcon_nb.notifier_call = bq24190_extcon_event; diff --git a/include/linux/power/bq24190_charger.h b/include/linux/power/bq24190_charger.h new file mode 100644 index 000000000000..45ce7f116a91 --- /dev/null +++ b/include/linux/power/bq24190_charger.h @@ -0,0 +1,18 @@ +/* + * Platform data for the TI bq24190 battery charger driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _BQ24190_CHARGER_H_ +#define _BQ24190_CHARGER_H_ + +#include + +struct bq24190_platform_data { + const struct regulator_init_data *regulator_init_data; +}; + +#endif -- cgit v1.2.3 From add02cfdc9bc2987b0121861d5bb0c7392865be9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 23 Aug 2017 15:50:04 +0200 Subject: iommu: Introduce Interface for IOMMU TLB Flushing With the current IOMMU-API the hardware TLBs have to be flushed in every iommu_ops->unmap() call-back. For unmapping large amounts of address space, as happens when a KVM domain with assigned devices is destroyed, this causes thousands of unnecessary TLB flushes in the IOMMU hardware because the unmap call-back runs for every unmapped physical page. With the TLB Flush Interface and the new iommu_unmap_fast() function introduced here the need to clean the hardware TLBs is removed from the unmapping code-path. Users of iommu_unmap_fast() have to explicitly call the TLB-Flush functions to sync the page-table changes to the hardware. Three functions for TLB-Flushes are introduced: * iommu_flush_tlb_all() - Flushes all TLB entries associated with that domain. TLB entries are flushed when this function returns. * iommu_tlb_range_add() - This will add a given range to the flush queue for this domain. * iommu_tlb_sync() - Flushes all queued ranges from the hardware TLBs. Returns when the flush is finished.
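In outline, a caller batches the flush like this (a sketch only; the fixed page-size stepping is a simplification):

/* Sketch: unmap a large range with one TLB sync instead of one per page. */
static size_t unmap_range_fast(struct iommu_domain *domain,
			       unsigned long iova, size_t size, size_t pgsize)
{
	size_t unmapped = 0;

	while (unmapped < size) {
		size_t len = iommu_unmap_fast(domain, iova + unmapped, pgsize);

		if (!len)
			break;
		iommu_tlb_range_add(domain, iova + unmapped, len);
		unmapped += len;
	}
	iommu_tlb_sync(domain);	/* one hardware flush for the whole range */
	return unmapped;
}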
The semantic of this interface is intentionally similar to the iommu_gather_ops from the io-pgtable code. Cc: Alex Williamson Cc: Will Deacon Cc: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 32 ++++++++++++++++++++++++++++---- include/linux/iommu.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5499a0387349..31c2b1dc8cfd 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -527,6 +527,8 @@ static int iommu_group_create_direct_mappings(struct iommu_group *group, } + iommu_flush_tlb_all(domain); + out: iommu_put_resv_regions(dev, &mappings); @@ -1547,13 +1549,16 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map); -size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) +static size_t __iommu_unmap(struct iommu_domain *domain, + unsigned long iova, size_t size, + bool sync) { + const struct iommu_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; - unsigned int min_pagesz; unsigned long orig_iova = iova; + unsigned int min_pagesz; - if (unlikely(domain->ops->unmap == NULL || + if (unlikely(ops->unmap == NULL || domain->pgsize_bitmap == 0UL)) return -ENODEV; @@ -1583,10 +1588,13 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) while (unmapped < size) { size_t pgsize = iommu_pgsize(domain, iova, size - unmapped); - unmapped_page = domain->ops->unmap(domain, iova, pgsize); + unmapped_page = ops->unmap(domain, iova, pgsize); if (!unmapped_page) break; + if (sync && ops->iotlb_range_add) + ops->iotlb_range_add(domain, iova, pgsize); + pr_debug("unmapped: iova 0x%lx size 0x%zx\n", iova, unmapped_page); @@ -1594,11 +1602,27 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) unmapped += unmapped_page; } + if (sync && ops->iotlb_sync) + ops->iotlb_sync(domain); + trace_unmap(orig_iova, size, unmapped); return unmapped; } + +size_t iommu_unmap(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + return __iommu_unmap(domain, iova, size, true); +} EXPORT_SYMBOL_GPL(iommu_unmap); +size_t iommu_unmap_fast(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + return __iommu_unmap(domain, iova, size, false); +} +EXPORT_SYMBOL_GPL(iommu_unmap_fast); + size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f1ce8e517d8d..50be4fd338e4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -167,6 +167,10 @@ struct iommu_resv_region { * @map: map a physically contiguous memory region to an iommu domain * @unmap: unmap a physically contiguous memory region from an iommu domain * @map_sg: map a scatter-gather list of physically contiguous memory chunks + * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain + * @tlb_range_add: Add a given iova range to the flush queue for this domain + * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush + * queue * to an iommu domain * @iova_to_phys: translate iova to physical address * @add_device: add device to iommu grouping @@ -199,6 +203,10 @@ struct iommu_ops { size_t size); size_t (*map_sg)(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot); + void (*flush_iotlb_all)(struct 
iommu_domain *domain); + void (*iotlb_range_add)(struct iommu_domain *domain, + unsigned long iova, size_t size); + void (*iotlb_sync)(struct iommu_domain *domain); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*add_device)(struct device *dev); void (*remove_device)(struct device *dev); @@ -286,7 +294,9 @@ extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, - size_t size); + size_t size); +extern size_t iommu_unmap_fast(struct iommu_domain *domain, + unsigned long iova, size_t size); extern size_t default_iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg,unsigned int nents, int prot); @@ -343,6 +353,25 @@ extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags); +static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +{ + if (domain->ops->flush_iotlb_all) + domain->ops->flush_iotlb_all(domain); +} + +static inline void iommu_tlb_range_add(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + if (domain->ops->iotlb_range_add) + domain->ops->iotlb_range_add(domain, iova, size); +} + +static inline void iommu_tlb_sync(struct iommu_domain *domain) +{ + if (domain->ops->iotlb_sync) + domain->ops->iotlb_sync(domain); +} + static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) @@ -436,6 +465,12 @@ static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova, return -ENODEV; } +static inline int iommu_unmap_fast(struct iommu_domain *domain, unsigned long iova, + int gfp_order) +{ + return -ENODEV; +} + static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot) @@ -443,6 +478,19 @@ static inline size_t iommu_map_sg(struct iommu_domain *domain, return -ENODEV; } +static inline void iommu_flush_tlb_all(struct iommu_domain *domain) +{ +} + +static inline void iommu_tlb_range_add(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ +} + +static inline void iommu_tlb_sync(struct iommu_domain *domain) +{ +} + static inline int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, u64 size, int prot) -- cgit v1.2.3 From 78f35473508118df5ea04b9515ac3f1aaec0a980 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Aug 2017 09:16:38 -0700 Subject: dax: introduce a fs_dax_get_by_bdev() helper Add a helper that can replace the following common pattern: if (blk_queue_dax(bdev->bd_queue)) fs_dax_get_by_host(bdev->bd_disk->disk_name); This will be used to move dax_device lookup from iomap-operation time to fs-mount time. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Dan Williams --- drivers/dax/super.c | 10 ++++++++++ include/linux/dax.h | 6 ++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 938eb4868f7f..b699aac268a6 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -46,6 +46,8 @@ void dax_read_unlock(int id) EXPORT_SYMBOL_GPL(dax_read_unlock); #ifdef CONFIG_BLOCK +#include + int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, pgoff_t *pgoff) { @@ -59,6 +61,14 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, } EXPORT_SYMBOL(bdev_dax_pgoff); +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +{ + if (!blk_queue_dax(bdev->bd_queue)) + return NULL; + return fs_dax_get_by_host(bdev->bd_disk->disk_name); +} +EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); + /** * __bdev_dax_supported() - Check if the device supports dax for filesystem * @sb: The superblock of the device diff --git a/include/linux/dax.h b/include/linux/dax.h index df97b7af7e2c..ac8afa18f707 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -57,6 +57,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev) put_dax(dax_dev); } +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { @@ -71,6 +72,11 @@ static inline struct dax_device *fs_dax_get_by_host(const char *host) static inline void fs_put_dax(struct dax_device *dax_dev) { } + +static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +{ + return NULL; +} #endif int dax_read_lock(void); -- cgit v1.2.3 From c1d2b4c3e204e602c97680335d082b8d012d08cd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 30 Aug 2017 19:24:57 +0200 Subject: tcp: Revert "tcp: remove CA_ACK_SLOWPATH" This change was a followup to the header prediction removal, so first revert this as a prerequisite to back out hp removal. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/net/tcp.h | 5 +++-- net/ipv4/tcp_input.c | 35 +++++++++++++++++++---------------- net/ipv4/tcp_westwood.c | 31 +++++++++++++++++++++++++++---- 3 files changed, 49 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index c614ff135b66..c546d13ffbca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -910,8 +910,9 @@ enum tcp_ca_event { /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { - CA_ACK_WIN_UPDATE = (1 << 0), /* ACK updated window */ - CA_ACK_ECE = (1 << 1), /* ECE bit is set on ack */ + CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ + CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ + CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7616cd76f6f6..a0e436366d31 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3552,7 +3552,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - u32 ack_ev_flags = 0; sack_state.first_sackt = 0; sack_state.rate = &rs; @@ -3593,26 +3592,30 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - if (ack_seq != TCP_SKB_CB(skb)->end_seq) - flag |= FLAG_DATA; - else - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); + { + u32 ack_ev_flags = CA_ACK_SLOWPATH; - flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); - if (TCP_SKB_CB(skb)->sacked) - flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_state); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { - flag |= FLAG_ECE; - ack_ev_flags = CA_ACK_ECE; - } + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, + &sack_state); + + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + flag |= FLAG_ECE; + ack_ev_flags |= CA_ACK_ECE; + } - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; - tcp_in_ack_event(sk, ack_ev_flags); + tcp_in_ack_event(sk, ack_ev_flags); + } /* We passed data and got it acked, remove any soft error * log. Something worked... diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index e5de84310949..bec9cafbe3f9 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -153,6 +153,24 @@ static inline void update_rtt_min(struct westwood *w) w->rtt_min = min(w->rtt, w->rtt_min); } +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. 
+ */ +static inline void westwood_fast_bw(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); + + westwood_update_window(sk); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + update_rtt_min(w); +} + /* * @westwood_acked_count * This function evaluates cumul_ack for evaluating bk in case of @@ -205,12 +223,17 @@ static u32 tcp_westwood_bw_rttmin(const struct sock *sk) static void tcp_westwood_ack(struct sock *sk, u32 ack_flags) { - struct westwood *w = inet_csk_ca(sk); + if (ack_flags & CA_ACK_SLOWPATH) { + struct westwood *w = inet_csk_ca(sk); - westwood_update_window(sk); - w->bk += westwood_acked_count(sk); + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); - update_rtt_min(w); + update_rtt_min(w); + return; + } + + westwood_fast_bw(sk); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) -- cgit v1.2.3 From 31770e34e43d6c8dee129bfee77e56c34e61f0e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 30 Aug 2017 19:24:58 +0200 Subject: tcp: Revert "tcp: remove header prediction" This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78. Eric Dumazet says: We found at Google a significant regression caused by 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 tcp: remove header prediction In typical RPC (TCP_RR), when a TCP socket receives data, we now call tcp_ack() while we used to not call it. This touches enough cache lines to cause a slowdown. so problem does not seem to be HP removal itself but the tcp_ack() call. Therefore, it might be possible to remove HP after all, provided one finds a way to elide tcp_ack for most cases. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++ include/net/tcp.h | 23 ++++++ include/uapi/linux/snmp.h | 2 + net/ipv4/proc.c | 2 + net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 188 ++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 2 + 8 files changed, 223 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 267164a1d559..4aa40ef02d32 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -147,6 +147,12 @@ struct tcp_sock { u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ +/* + * Header prediction flags + * 0x5?10 << 16 + snd_wnd in net byte order + */ + __be32 pred_flags; + /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) 
diff --git a/include/net/tcp.h b/include/net/tcp.h index c546d13ffbca..9c3db054e47f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -634,6 +634,29 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +{ + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static inline void tcp_fast_path_on(struct tcp_sock *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); +} + +static inline void tcp_fast_path_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +} + /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b3f346fb9fe3..758f12b58541 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -184,7 +184,9 @@ enum LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */ LINUX_MIB_LISTENDROPS, /* ListenDrops */ + LINUX_MIB_TCPHPHITS, /* TCPHPHits */ LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */ + LINUX_MIB_TCPHPACKS, /* TCPHPAcks */ LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b6d3fe03feb3..127153f1ed8a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,7 +206,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), + SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), + SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 566083ee2654..21ca2df274c5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1963,8 +1963,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { tp->urg_data = 0; + tcp_fast_path_check(sk); + } if (used + offset < skb->len) continue; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a0e436366d31..c5d7656beeee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ @@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + tp->pred_flags = 0; + tcp_fast_path_check(sk); + if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk); @@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - { + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack_seq); + tcp_snd_una_update(tp, ack); + flag |= FLAG_WIN_UPDATE; + + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); + } else { u32 ack_ev_flags = CA_ACK_SLOWPATH; if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->has_rxtstamp) TCP_SKB_CB(skb)->swtstamp = skb->tstamp; + /* Disable header prediction. */ + tp->pred_flags = 0; inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4647,6 +4668,8 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); + tcp_fast_path_check(sk); + if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ + tp->pred_flags = 0; return -1; } @@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ @@ -5301,6 +5328,26 @@ discard: /* * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * - Unexpected TCP option. + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. 
*/ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) @@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); + /* + * Header prediction. + * The code loosely follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ tp->rx_opt.saw_tstamp = 0; + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_prediction is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive + * space for instance) + * PSH flag is ignored. + */ + + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt && + !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { + int tcp_header_len = tp->tcp_header_len; + + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + /* No? Slow path! */ + if (!tcp_parse_aligned_timestamp(tp, th)) + goto slow_path; + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + goto slow_path; + + /* DO NOT update ts_recent here, if checksum fails + * and timestamp was corrupted part, it will result + * in a hung connection since we will drop all + * future packets due to the PAWS test. + */ + } + + if (len <= tcp_header_len) { + /* Bulk data transfer: sender */ + if (len == tcp_header_len) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + /* We know that such packets are checksummed + * on entry. + */ + tcp_ack(sk, skb, 0); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return; + } else { /* Header too small */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + goto discard; + } + } else { + int eaten = 0; + bool fragstolen = false; + + if (tcp_checksum_complete(skb)) + goto csum_error; + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(sk, skb); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); + + /* Bulk data transfer: receiver */ + eaten = tcp_queue_rcv(sk, skb, tcp_header_len, + &fragstolen); + + tcp_event_data_recv(sk, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... 
*/ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!inet_csk_ack_scheduled(sk)) + goto no_ack; + } + + __tcp_ack_snd_check(sk, 0); +no_ack: + if (eaten) + kfree_skb_partial(skb, fragstolen); + sk->sk_data_ready(sk); + return; + } + } + +slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) goto discard; + /* + * Standard slow path. + */ + if (!tcp_validate_incoming(sk, skb, th, 1)) return; - if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) +step5: + if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, 0); + tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - - acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT | + acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK) > 0; if (!acceptable) { @@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); break; case TCP_FIN_WAIT1: { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1537b87c657f..188a6f31356d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -436,6 +436,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_sock *newtp = tcp_sk(newsk); /* Now setup tcp_sock */ + newtp->pred_flags = 0; + newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->segs_in = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3e0d19631534..5b6690d05abb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -295,7 +295,9 @@ static u16 tcp_select_window(struct sock *sk) /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; + /* If we advertise zero window, disable fast path. */ if (new_win == 0) { + tp->pred_flags = 0; if (old_win) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV); -- cgit v1.2.3 From b3cb5388499c5e219324bfe7da2e46cbad82bfcf Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 8 Aug 2017 13:17:00 -0500 Subject: net/mlx5: Skip mlx5_unload_one if mlx5_load_one fails When the firmware fails during mlx5_load_one, the health_care timer detects the failure and schedules a health_care call. mlx5_load_one then detects the failure itself, cleans up and quits; health_care subsequently starts and calls mlx5_unload_one to clean up resources that no longer exist, which causes a kernel panic. The root cause is that the bit MLX5_INTERFACE_STATE_DOWN is not set after mlx5_load_one fails. The solution is to remove the bit MLX5_INTERFACE_STATE_DOWN and to quit mlx5_unload_one early if the bit MLX5_INTERFACE_STATE_UP is not set. The bit MLX5_INTERFACE_STATE_DOWN is redundant; we can use MLX5_INTERFACE_STATE_UP instead.
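To make the fix concrete, here is a minimal sketch of the resulting single-bit pattern (illustration only: do_hw_init() and do_hw_teardown() are invented placeholders, and the real driver additionally serializes on intf_state_mutex):

static int load_one(struct mlx5_core_dev *dev)
{
	int err = do_hw_init(dev);	/* placeholder for the real bring-up */

	if (err)
		return err;		/* UP never set: a later unload is a NOP */
	set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
	return 0;
}

static void unload_one(struct mlx5_core_dev *dev)
{
	if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state))
		return;			/* load failed or already unloaded */
	clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
	do_hw_teardown(dev);		/* placeholder */
}

With a single bit, "not up" is by definition "down", so an error exit from the load path needs no extra bookkeeping for the health recovery work to do the right thing.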
Fixes: 5fc7197d3a25 ("net/mlx5: Add pci shutdown callback") Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 +--- include/linux/mlx5/driver.h | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c065132b956d..4cdb414aa2d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1186,7 +1186,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, } } - clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); out: mutex_unlock(&dev->intf_state_mutex); @@ -1261,7 +1260,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_drain_health_recovery(dev); mutex_lock(&dev->intf_state_mutex); - if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", __func__); if (cleanup) @@ -1270,7 +1269,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, } clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); - set_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); if (mlx5_device_registered(dev)) mlx5_detach_device(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..918f5e644506 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -673,9 +673,8 @@ enum mlx5_device_state { }; enum mlx5_interface_state { - MLX5_INTERFACE_STATE_DOWN = BIT(0), - MLX5_INTERFACE_STATE_UP = BIT(1), - MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2), + MLX5_INTERFACE_STATE_UP = BIT(0), + MLX5_INTERFACE_STATE_SHUTDOWN = BIT(1), }; enum mlx5_pci_status { -- cgit v1.2.3 From 10a8d00707082955b177164d4b4e758ffcbd4017 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 9 Aug 2017 10:03:40 -0500 Subject: net/mlx5: Remove the flag MLX5_INTERFACE_STATE_SHUTDOWN MLX5_INTERFACE_STATE_SHUTDOWN is not used in the code. 
Fixes: 5fc7197d3a25 ("net/mlx5: Add pci shutdown callback") Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 -- include/linux/mlx5/driver.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 4cdb414aa2d5..16885827367b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1563,8 +1563,6 @@ static void shutdown(struct pci_dev *pdev) int err; dev_info(&pdev->dev, "Shutdown was called\n"); - /* Notify mlx5 clients that the kernel is being shut down */ - set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state); err = mlx5_try_fast_unload(dev); if (err) mlx5_unload_one(dev, priv, false); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 918f5e644506..205d82d4c468 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -674,7 +674,6 @@ enum mlx5_device_state { enum mlx5_interface_state { MLX5_INTERFACE_STATE_UP = BIT(0), - MLX5_INTERFACE_STATE_SHUTDOWN = BIT(1), }; enum mlx5_pci_status { -- cgit v1.2.3 From 7373ae7e8f0bf2c0718422481da986db5058b005 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:16 -0600 Subject: net: ether: Add support for multiplexing and aggregation type Define the Qualcomm multiplexing and aggregation (MAP) ether type 0x00F9. This is needed for receiving data in the MAP protocol like RMNET. This is not an officially registered ID. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 61f7ccce5b69..9037065e23d0 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -140,6 +140,9 @@ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ #define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ +#define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and + * aggregation protocol + */ /* * This is an Ethernet frame header. -- cgit v1.2.3 From cdf4969c42a6c1a376dd03a9e846cf638d3cd4b1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:17 -0600 Subject: net: arp: Add support for raw IP device Define the raw IP type. This is needed for raw IP net devices like rmnet. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. 
Miller --- include/uapi/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index cf73510b9238..a2a635620600 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -59,6 +59,7 @@ #define ARPHRD_LAPB 516 /* LAPB */ #define ARPHRD_DDCMP 517 /* Digital's DDCMP protocol */ #define ARPHRD_RAWHDLC 518 /* Raw HDLC */ +#define ARPHRD_RAWIP 519 /* Raw IP */ #define ARPHRD_TUNNEL 768 /* IPIP tunnel */ #define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel */ -- cgit v1.2.3 From 7d8e8292013ab72ae1f1500cbc91f198ccb1826d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 30 Aug 2017 16:13:25 +0200 Subject: ALSA: Get rid of card power_lock Currently we're taking power_lock at each card component to assure the power-up sequence, but it doesn't help anything in the implementation at the moment: it just serializes the callers unnecessarily, and it doesn't protect the power state change itself. It used to have some usefulness in the early days when we managed PM manually. But now the suspend/resume core procedure is beyond our hands, and power_lock has lost its meaning. This patch drops the power_lock from all over the place. There shouldn't be any issues from this change, as the lock never helped with the power state change anyway. Rather, we'll get better performance by removing the serialization; that removal is the only slight behavior change of concern, but it can't be a showstopper, after all. Signed-off-by: Takashi Iwai --- include/sound/core.h | 13 ---------- sound/core/control.c | 58 ++++++++++++++++++++++++--------------------- sound/core/control_compat.c | 34 ++++++++++++-------------- sound/core/init.c | 5 ---- sound/core/pcm_native.c | 30 ++++++----------------- sound/soc/soc-core.c | 2 -- 6 files changed, 53 insertions(+), 89 deletions(-) (limited to 'include') diff --git a/include/sound/core.h b/include/sound/core.h index 357f36b5ee80..4104a9d1001f 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -136,7 +136,6 @@ struct snd_card { #ifdef CONFIG_PM unsigned int power_state; /* power state */ - struct mutex power_lock; /* power lock */ wait_queue_head_t power_sleep; #endif @@ -149,16 +148,6 @@ struct snd_card { #define dev_to_snd_card(p) container_of(p, struct snd_card, card_dev) #ifdef CONFIG_PM -static inline void snd_power_lock(struct snd_card *card) -{ - mutex_lock(&card->power_lock); -} - -static inline void snd_power_unlock(struct snd_card *card) -{ - mutex_unlock(&card->power_lock); -} - static inline unsigned int snd_power_get_state(struct snd_card *card) { return card->power_state; @@ -175,8 +164,6 @@ int snd_power_wait(struct snd_card *card, unsigned int power_state); #else /* !
CONFIG_PM */ -#define snd_power_lock(card) do { (void)(card); } while (0) -#define snd_power_unlock(card) do { (void)(card); } while (0) static inline int snd_power_wait(struct snd_card *card, unsigned int state) { return 0; } #define snd_power_get_state(card) ({ (void)(card); SNDRV_CTL_POWER_D0; }) #define snd_power_change_state(card, state) do { (void)(card); } while (0) diff --git a/sound/core/control.c b/sound/core/control.c index 51d4b4ad3e1d..56b3e2d49c82 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -864,14 +864,14 @@ static int snd_ctl_elem_info_user(struct snd_ctl_file *ctl, if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; - snd_power_lock(ctl->card); result = snd_power_wait(ctl->card, SNDRV_CTL_POWER_D0); - if (result >= 0) - result = snd_ctl_elem_info(ctl, &info); - snd_power_unlock(ctl->card); - if (result >= 0) - if (copy_to_user(_info, &info, sizeof(info))) - return -EFAULT; + if (result < 0) + return result; + result = snd_ctl_elem_info(ctl, &info); + if (result < 0) + return result; + if (copy_to_user(_info, &info, sizeof(info))) + return -EFAULT; return result; } @@ -905,17 +905,19 @@ static int snd_ctl_elem_read_user(struct snd_card *card, if (IS_ERR(control)) return PTR_ERR(control); - snd_power_lock(card); result = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (result >= 0) { - down_read(&card->controls_rwsem); - result = snd_ctl_elem_read(card, control); - up_read(&card->controls_rwsem); - } - snd_power_unlock(card); - if (result >= 0) - if (copy_to_user(_control, control, sizeof(*control))) - result = -EFAULT; + if (result < 0) + goto error; + + down_read(&card->controls_rwsem); + result = snd_ctl_elem_read(card, control); + up_read(&card->controls_rwsem); + if (result < 0) + goto error; + + if (copy_to_user(_control, control, sizeof(*control))) + result = -EFAULT; + error: kfree(control); return result; } @@ -964,17 +966,19 @@ static int snd_ctl_elem_write_user(struct snd_ctl_file *file, return PTR_ERR(control); card = file->card; - snd_power_lock(card); result = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (result >= 0) { - down_write(&card->controls_rwsem); - result = snd_ctl_elem_write(card, file, control); - up_write(&card->controls_rwsem); - } - snd_power_unlock(card); - if (result >= 0) - if (copy_to_user(_control, control, sizeof(*control))) - result = -EFAULT; + if (result < 0) + goto error; + + down_write(&card->controls_rwsem); + result = snd_ctl_elem_write(card, file, control); + up_write(&card->controls_rwsem); + if (result < 0) + goto error; + + if (copy_to_user(_control, control, sizeof(*control))) + result = -EFAULT; + error: kfree(control); return result; } diff --git a/sound/core/control_compat.c b/sound/core/control_compat.c index 1fa70766ffab..a848836a5de0 100644 --- a/sound/core/control_compat.c +++ b/sound/core/control_compat.c @@ -111,12 +111,10 @@ static int snd_ctl_elem_info_compat(struct snd_ctl_file *ctl, if (get_user(data->value.enumerated.item, &data32->value.enumerated.item)) goto error; - snd_power_lock(ctl->card); err = snd_power_wait(ctl->card, SNDRV_CTL_POWER_D0); - if (err >= 0) - err = snd_ctl_elem_info(ctl, data); - snd_power_unlock(ctl->card); - + if (err < 0) + goto error; + err = snd_ctl_elem_info(ctl, data); if (err < 0) goto error; /* restore info to 32bit */ @@ -315,14 +313,13 @@ static int ctl_elem_read_user(struct snd_card *card, if (err < 0) goto error; - snd_power_lock(card); err = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (err >= 0) - err = snd_ctl_elem_read(card, data); - 
snd_power_unlock(card); - if (err >= 0) - err = copy_ctl_value_to_user(userdata, valuep, data, - type, count); + if (err < 0) + goto error; + err = snd_ctl_elem_read(card, data); + if (err < 0) + goto error; + err = copy_ctl_value_to_user(userdata, valuep, data, type, count); error: kfree(data); return err; @@ -344,14 +341,13 @@ static int ctl_elem_write_user(struct snd_ctl_file *file, if (err < 0) goto error; - snd_power_lock(card); err = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (err >= 0) - err = snd_ctl_elem_write(card, file, data); - snd_power_unlock(card); - if (err >= 0) - err = copy_ctl_value_to_user(userdata, valuep, data, - type, count); + if (err < 0) + goto error; + err = snd_ctl_elem_write(card, file, data); + if (err < 0) + goto error; + err = copy_ctl_value_to_user(userdata, valuep, data, type, count); error: kfree(data); return err; diff --git a/sound/core/init.c b/sound/core/init.c index 6e219dc23f96..32ebe2f6bc59 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -253,7 +253,6 @@ int snd_card_new(struct device *parent, int idx, const char *xid, spin_lock_init(&card->files_lock); INIT_LIST_HEAD(&card->files_list); #ifdef CONFIG_PM - mutex_init(&card->power_lock); init_waitqueue_head(&card->power_sleep); #endif @@ -978,8 +977,6 @@ EXPORT_SYMBOL(snd_card_file_remove); * Waits until the power-state is changed. * * Return: Zero if successful, or a negative error code. - * - * Note: the power lock must be active before call. */ int snd_power_wait(struct snd_card *card, unsigned int power_state) { @@ -999,9 +996,7 @@ int snd_power_wait(struct snd_card *card, unsigned int power_state) if (snd_power_get_state(card) == power_state) break; set_current_state(TASK_UNINTERRUPTIBLE); - snd_power_unlock(card); schedule_timeout(30 * HZ); - snd_power_lock(card); } remove_wait_queue(&card->power_sleep, &wait); return result; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index cf0433f80067..621142ea9ec6 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -1830,7 +1830,6 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream, add_wait_queue(&to_check->sleep, &wait); snd_pcm_stream_unlock_irq(substream); up_read(&snd_pcm_link_rwsem); - snd_power_unlock(card); if (runtime->no_period_wakeup) tout = MAX_SCHEDULE_TIMEOUT; else { @@ -1842,7 +1841,6 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream, tout = msecs_to_jiffies(tout * 1000); } tout = schedule_timeout_interruptible(tout); - snd_power_lock(card); down_read(&snd_pcm_link_rwsem); snd_pcm_stream_lock_irq(substream); remove_wait_queue(&to_check->sleep, &wait); @@ -2764,11 +2762,16 @@ static int snd_pcm_tstamp(struct snd_pcm_substream *substream, int __user *_arg) return 0; } -static int snd_pcm_common_ioctl(struct file *file, +static int snd_pcm_common_ioctl1(struct file *file, struct snd_pcm_substream *substream, unsigned int cmd, void __user *arg) { struct snd_pcm_file *pcm_file = file->private_data; + int res; + + res = snd_power_wait(substream->pcm->card, SNDRV_CTL_POWER_D0); + if (res < 0) + return res; switch (cmd) { case SNDRV_PCM_IOCTL_PVERSION: @@ -2846,21 +2849,6 @@ static int snd_pcm_common_ioctl(struct file *file, return -ENOTTY; } -static int snd_pcm_common_ioctl1(struct file *file, - struct snd_pcm_substream *substream, - unsigned int cmd, void __user *arg) -{ - struct snd_card *card = substream->pcm->card; - int res; - - snd_power_lock(card); - res = snd_power_wait(card, SNDRV_CTL_POWER_D0); - if (res >= 0) - res = snd_pcm_common_ioctl(file, substream, cmd, 
arg); - snd_power_unlock(card); - return res; -} - static int snd_pcm_playback_ioctl1(struct file *file, struct snd_pcm_substream *substream, unsigned int cmd, void __user *arg) @@ -3064,7 +3052,6 @@ int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream, { snd_pcm_uframes_t *frames = arg; snd_pcm_sframes_t result; - int err; switch (cmd) { case SNDRV_PCM_IOCTL_FORWARD: @@ -3084,10 +3071,7 @@ int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream, case SNDRV_PCM_IOCTL_START: return snd_pcm_start_lock_irq(substream); case SNDRV_PCM_IOCTL_DRAIN: - snd_power_lock(substream->pcm->card); - err = snd_pcm_drain(substream, NULL); - snd_power_unlock(substream->pcm->card); - return err; + return snd_pcm_drain(substream, NULL); case SNDRV_PCM_IOCTL_DROP: return snd_pcm_drop(substream); case SNDRV_PCM_IOCTL_DELAY: diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 13c875e2392a..62c11e26ce5c 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -653,9 +653,7 @@ int snd_soc_suspend(struct device *dev) /* Due to the resume being scheduled into a workqueue we could * suspend before that's finished - wait for it to complete. */ - snd_power_lock(card->snd_card); snd_power_wait(card->snd_card, SNDRV_CTL_POWER_D0); - snd_power_unlock(card->snd_card); /* we're going to block userspace touching us until resume completes */ snd_power_change_state(card->snd_card, SNDRV_CTL_POWER_D3hot); -- cgit v1.2.3 From 71ccef0df533cd9c8c6cbf1483a636a092088ab9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 24 Aug 2017 00:57:35 +0000 Subject: ASoC: add Component level set_sysclk In the current ALSA SoC, only the Codec has a set_sysclk feature. The Codec will be merged into the Component in next-generation ALSA SoC, so the current Codec-specific feature needs to be merged into it. This is a glue patch for that.
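As a usage sketch (hypothetical driver: the "foo" names and the FOO_MCLK clock ID are invented for illustration), a component driver fills in the new callback, and machine-level code then calls the component-level helper, which also covers legacy codecs through the bridge added below:

#define FOO_MCLK 0	/* invented clk_id */

struct foo_priv {
	unsigned int sysclk;
};

static int foo_set_sysclk(struct snd_soc_component *component,
			  int clk_id, int source, unsigned int freq, int dir)
{
	struct foo_priv *priv = snd_soc_component_get_drvdata(component);

	priv->sysclk = freq;	/* consumed later, e.g. in hw_params() */
	return 0;
}

static const struct snd_soc_component_driver foo_component_driver = {
	.set_sysclk	= foo_set_sysclk,
};

/* machine driver side */
static int foo_dai_init(struct snd_soc_pcm_runtime *rtd)
{
	return snd_soc_component_set_sysclk(rtd->codec_dai->component,
					    FOO_MCLK, 0, 12288000,
					    SND_SOC_CLOCK_IN);
}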
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 11 +++++++++++ sound/soc/soc-core.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index fbf57ff75958..813cdb95e049 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -795,6 +795,10 @@ struct snd_soc_component_driver { int (*suspend)(struct snd_soc_component *); int (*resume)(struct snd_soc_component *); + /* component wide operations */ + int (*set_sysclk)(struct snd_soc_component *component, + int clk_id, int source, unsigned int freq, int dir); + /* DT */ int (*of_xlate_dai_name)(struct snd_soc_component *component, struct of_phandle_args *args, @@ -865,6 +869,9 @@ struct snd_soc_component { int (*suspend)(struct snd_soc_component *); int (*resume)(struct snd_soc_component *); + int (*set_sysclk)(struct snd_soc_component *component, + int clk_id, int source, unsigned int freq, int dir); + /* machine specific init */ int (*init)(struct snd_soc_component *component); @@ -1459,6 +1466,10 @@ void snd_soc_component_async_complete(struct snd_soc_component *component); int snd_soc_component_test_bits(struct snd_soc_component *component, unsigned int reg, unsigned int mask, unsigned int value); +/* component wide operations */ +int snd_soc_component_set_sysclk(struct snd_soc_component *component, + int clk_id, int source, unsigned int freq, int dir); + #ifdef CONFIG_REGMAP void snd_soc_component_init_regmap(struct snd_soc_component *component, diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 10efd15338fb..654db550039d 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2590,11 +2590,9 @@ int snd_soc_dai_set_sysclk(struct snd_soc_dai *dai, int clk_id, { if (dai->driver && dai->driver->ops->set_sysclk) return dai->driver->ops->set_sysclk(dai, clk_id, freq, dir); - else if (dai->codec && dai->codec->driver->set_sysclk) - return dai->codec->driver->set_sysclk(dai->codec, clk_id, 0, - freq, dir); - else - return -ENOTSUPP; + + return snd_soc_component_set_sysclk(dai->component, clk_id, 0, + freq, dir); } EXPORT_SYMBOL_GPL(snd_soc_dai_set_sysclk); @@ -2619,6 +2617,32 @@ int snd_soc_codec_set_sysclk(struct snd_soc_codec *codec, int clk_id, } EXPORT_SYMBOL_GPL(snd_soc_codec_set_sysclk); +/** + * snd_soc_component_set_sysclk - configure COMPONENT system or master clock. + * @component: COMPONENT + * @clk_id: DAI specific clock ID + * @source: Source for the clock + * @freq: new clock frequency in Hz + * @dir: new clock direction - input/output. + * + * Configures the CODEC master (MCLK) or system (SYSCLK) clocking. + */ +int snd_soc_component_set_sysclk(struct snd_soc_component *component, int clk_id, + int source, unsigned int freq, int dir) +{ + /* will be removed */ + if (component->set_sysclk) + return component->set_sysclk(component, clk_id, source, + freq, dir); + + if (component->driver->set_sysclk) + return component->driver->set_sysclk(component, clk_id, source, + freq, dir); + + return -ENOTSUPP; +} +EXPORT_SYMBOL_GPL(snd_soc_component_set_sysclk); + /** * snd_soc_dai_set_clkdiv - configure DAI clock dividers. 
* @dai: DAI @@ -3174,6 +3198,7 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, component->remove = component->driver->remove; component->suspend = component->driver->suspend; component->resume = component->driver->resume; + component->set_sysclk = component->driver->set_sysclk; dapm = &component->dapm; dapm->dev = dev; @@ -3557,6 +3582,14 @@ static int snd_soc_codec_drv_read(struct snd_soc_component *component, return 0; } +static int snd_soc_codec_set_sysclk_(struct snd_soc_component *component, + int clk_id, int source, unsigned int freq, int dir) +{ + struct snd_soc_codec *codec = snd_soc_component_to_codec(component); + + return snd_soc_codec_set_sysclk(codec, clk_id, source, freq, dir); +} + static int snd_soc_codec_set_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level) { @@ -3608,6 +3641,8 @@ int snd_soc_register_codec(struct device *dev, codec->component.write = snd_soc_codec_drv_write; if (codec_drv->read) codec->component.read = snd_soc_codec_drv_read; + if (codec_drv->set_sysclk) + codec->component.set_sysclk = snd_soc_codec_set_sysclk_; codec->component.ignore_pmdown_time = codec_drv->ignore_pmdown_time; dapm = snd_soc_codec_get_dapm(codec); -- cgit v1.2.3 From ef641e5d5e6c7a07748239036b786a90ba9b9a95 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 24 Aug 2017 00:57:51 +0000 Subject: ASoC: add Component level set_pll In the current ALSA SoC, only the Codec has a set_pll feature. The Codec will be merged into the Component in next-generation ALSA SoC, so the current Codec-specific feature needs to be merged into it. This is a glue patch for that. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 7 +++++++ sound/soc/soc-core.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 49 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 813cdb95e049..23681538c849 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -798,6 +798,8 @@ struct snd_soc_component_driver { /* component wide operations */ int (*set_sysclk)(struct snd_soc_component *component, int clk_id, int source, unsigned int freq, int dir); + int (*set_pll)(struct snd_soc_component *component, int pll_id, + int source, unsigned int freq_in, unsigned int freq_out); /* DT */ int (*of_xlate_dai_name)(struct snd_soc_component *component, @@ -871,6 +873,8 @@ struct snd_soc_component { int (*set_sysclk)(struct snd_soc_component *component, int clk_id, int source, unsigned int freq, int dir); + int (*set_pll)(struct snd_soc_component *component, int pll_id, + int source, unsigned int freq_in, unsigned int freq_out); /* machine specific init */ int (*init)(struct snd_soc_component *component); @@ -1469,6 +1473,9 @@ int snd_soc_component_test_bits(struct snd_soc_component *component, /* component wide operations */ int snd_soc_component_set_sysclk(struct snd_soc_component *component, int clk_id, int source, unsigned int freq, int dir); +int snd_soc_component_set_pll(struct snd_soc_component *component, int pll_id, + int source, unsigned int freq_in, + unsigned int freq_out); #ifdef CONFIG_REGMAP diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 654db550039d..ee641cd8aa22 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2679,11 +2679,9 @@ int snd_soc_dai_set_pll(struct snd_soc_dai *dai, int pll_id, int source, if (dai->driver && dai->driver->ops->set_pll) return dai->driver->ops->set_pll(dai, pll_id, source, freq_in, freq_out); - else if
(dai->codec && dai->codec->driver->set_pll) - return dai->codec->driver->set_pll(dai->codec, pll_id, source, - freq_in, freq_out); - else - return -EINVAL; + + return snd_soc_component_set_pll(dai->component, pll_id, source, + freq_in, freq_out); } EXPORT_SYMBOL_GPL(snd_soc_dai_set_pll); @@ -2708,6 +2706,33 @@ int snd_soc_codec_set_pll(struct snd_soc_codec *codec, int pll_id, int source, } EXPORT_SYMBOL_GPL(snd_soc_codec_set_pll); +/* + * snd_soc_component_set_pll - configure component PLL. + * @component: COMPONENT + * @pll_id: DAI specific PLL ID + * @source: DAI specific source for the PLL + * @freq_in: PLL input clock frequency in Hz + * @freq_out: requested PLL output clock frequency in Hz + * + * Configures and enables PLL to generate output clock based on input clock. + */ +int snd_soc_component_set_pll(struct snd_soc_component *component, int pll_id, + int source, unsigned int freq_in, + unsigned int freq_out) +{ + /* will be removed */ + if (component->set_pll) + return component->set_pll(component, pll_id, source, + freq_in, freq_out); + + if (component->driver->set_pll) + return component->driver->set_pll(component, pll_id, source, + freq_in, freq_out); + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(snd_soc_component_set_pll); + /** * snd_soc_dai_set_bclk_ratio - configure BCLK to sample rate ratio. * @dai: DAI @@ -3199,6 +3224,7 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, component->suspend = component->driver->suspend; component->resume = component->driver->resume; component->set_sysclk = component->driver->set_sysclk; + component->set_pll = component->driver->set_pll; dapm = &component->dapm; dapm->dev = dev; @@ -3590,6 +3616,15 @@ static int snd_soc_codec_set_sysclk_(struct snd_soc_component *component, return snd_soc_codec_set_sysclk(codec, clk_id, source, freq, dir); } +static int snd_soc_codec_set_pll_(struct snd_soc_component *component, + int pll_id, int source, unsigned int freq_in, + unsigned int freq_out) +{ + struct snd_soc_codec *codec = snd_soc_component_to_codec(component); + + return snd_soc_codec_set_pll(codec, pll_id, source, freq_in, freq_out); +} + static int snd_soc_codec_set_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level) { @@ -3643,6 +3678,8 @@ int snd_soc_register_codec(struct device *dev, codec->component.read = snd_soc_codec_drv_read; if (codec_drv->set_sysclk) codec->component.set_sysclk = snd_soc_codec_set_sysclk_; + if (codec_drv->set_pll) + codec->component.set_pll = snd_soc_codec_set_pll_; codec->component.ignore_pmdown_time = codec_drv->ignore_pmdown_time; dapm = snd_soc_codec_get_dapm(codec); -- cgit v1.2.3 From 44c07365e9e2c87c7e04d63293618c391d1480ec Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 24 Aug 2017 00:58:07 +0000 Subject: ASoC: add Component level set_jack In the current ALSA SoC, only the Codec has a set_jack feature. The Codec will be merged into the Component in next-generation ALSA SoC, so the current Codec-specific feature needs to be merged into it. This is a glue patch for that.
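A matching usage sketch (hypothetical machine driver; the "foo" names are invented): create a jack on the card and hand it to whichever component implements detection. snd_soc_component_set_jack() reaches legacy codecs through the bridge below and returns -ENOTSUPP when nothing implements the op:

static struct snd_soc_jack foo_headset;

static int foo_enable_jack(struct snd_soc_card *card,
			   struct snd_soc_component *component)
{
	int ret;

	ret = snd_soc_card_jack_new(card, "Headset", SND_JACK_HEADSET,
				    &foo_headset, NULL, 0);
	if (ret)
		return ret;

	/* component-level call: works for codecs and components alike */
	return snd_soc_component_set_jack(component, &foo_headset, NULL);
}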
Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- include/sound/soc.h | 4 ++++ sound/soc/soc-core.c | 11 +++++++++++ sound/soc/soc-jack.c | 22 ++++++++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 23681538c849..feb896815069 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -800,6 +800,8 @@ struct snd_soc_component_driver { int clk_id, int source, unsigned int freq, int dir); int (*set_pll)(struct snd_soc_component *component, int pll_id, int source, unsigned int freq_in, unsigned int freq_out); + int (*set_jack)(struct snd_soc_component *component, + struct snd_soc_jack *jack, void *data); /* DT */ int (*of_xlate_dai_name)(struct snd_soc_component *component, @@ -875,6 +877,8 @@ struct snd_soc_component { int clk_id, int source, unsigned int freq, int dir); int (*set_pll)(struct snd_soc_component *component, int pll_id, int source, unsigned int freq_in, unsigned int freq_out); + int (*set_jack)(struct snd_soc_component *component, + struct snd_soc_jack *jack, void *data); /* machine specific init */ int (*init)(struct snd_soc_component *component); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index ee641cd8aa22..3d7287858609 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -3225,6 +3225,7 @@ static int snd_soc_component_initialize(struct snd_soc_component *component, component->resume = component->driver->resume; component->set_sysclk = component->driver->set_sysclk; component->set_pll = component->driver->set_pll; + component->set_jack = component->driver->set_jack; dapm = &component->dapm; dapm->dev = dev; @@ -3625,6 +3626,14 @@ static int snd_soc_codec_set_pll_(struct snd_soc_component *component, return snd_soc_codec_set_pll(codec, pll_id, source, freq_in, freq_out); } +static int snd_soc_codec_set_jack_(struct snd_soc_component *component, + struct snd_soc_jack *jack, void *data) +{ + struct snd_soc_codec *codec = snd_soc_component_to_codec(component); + + return snd_soc_codec_set_jack(codec, jack, data); +} + static int snd_soc_codec_set_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level) { @@ -3680,6 +3689,8 @@ int snd_soc_register_codec(struct device *dev, codec->component.set_sysclk = snd_soc_codec_set_sysclk_; if (codec_drv->set_pll) codec->component.set_pll = snd_soc_codec_set_pll_; + if (codec_drv->set_jack) + codec->component.set_jack = snd_soc_codec_set_jack_; codec->component.ignore_pmdown_time = codec_drv->ignore_pmdown_time; dapm = snd_soc_codec_get_dapm(codec); diff --git a/sound/soc/soc-jack.c b/sound/soc/soc-jack.c index 7daf21fee355..2f9f496ab14f 100644 --- a/sound/soc/soc-jack.c +++ b/sound/soc/soc-jack.c @@ -40,6 +40,28 @@ int snd_soc_codec_set_jack(struct snd_soc_codec *codec, } EXPORT_SYMBOL_GPL(snd_soc_codec_set_jack); +/** + * snd_soc_component_set_jack - configure component jack. + * @component: COMPONENTs + * @jack: structure to use for the jack + * @data: can be used if codec driver need extra data for configuring jack + * + * Configures and enables jack detection function. 
+ */ +int snd_soc_component_set_jack(struct snd_soc_component *component, + struct snd_soc_jack *jack, void *data) +{ + /* will be removed */ + if (component->set_jack) + return component->set_jack(component, jack, data); + + if (component->driver->set_jack) + return component->driver->set_jack(component, jack, data); + + return -ENOTSUPP; +} +EXPORT_SYMBOL_GPL(snd_soc_component_set_jack); + /** * snd_soc_card_jack_new - Create a new jack * @card: ASoC card -- cgit v1.2.3 From 388f79fda74fd3d8700ed5d899573ec58c2e0253 Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 30 Aug 2017 02:31:57 -0400 Subject: idr: Add new APIs to support unsigned long The following new APIs are added: int idr_alloc_ext(struct idr *idr, void *ptr, unsigned long *index, unsigned long start, unsigned long end, gfp_t gfp); void *idr_remove_ext(struct idr *idr, unsigned long id); void *idr_find_ext(const struct idr *idr, unsigned long id); void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id); void *idr_get_next_ext(struct idr *idr, unsigned long *nextid); Signed-off-by: Chris Mi Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/idr.h | 69 ++++++++++++++++++++++++++++++++++++++++++++-- include/linux/radix-tree.h | 21 ++++++++++++-- lib/idr.c | 66 +++++++++++++++++++++++++------------------- lib/radix-tree.c | 6 ++-- 4 files changed, 125 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index bf70b3ef0a07..7c3a365f7e12 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -80,19 +80,75 @@ static inline void idr_set_cursor(struct idr *idr, unsigned int val) */ void idr_preload(gfp_t gfp_mask); -int idr_alloc(struct idr *, void *entry, int start, int end, gfp_t); + +int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index, + unsigned long start, unsigned long end, gfp_t gfp, + bool ext); + +/** + * idr_alloc - allocate an id + * @idr: idr handle + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive) + * @gfp: memory allocation flags + * + * Allocates an unused ID in the range [start, end). Returns -ENOSPC + * if there are no unused IDs in that range. + * + * Note that @end is treated as max when <= 0. This is to always allow + * using @start + N as @end as long as N is inside integer range. + * + * Simultaneous modifications to the @idr are not allowed and should be + * prevented by the user, usually with a lock. idr_alloc() may be called + * concurrently with read-only accesses to the @idr, such as idr_find() and + * idr_for_each_entry(). 
+ */ +static inline int idr_alloc(struct idr *idr, void *ptr, + int start, int end, gfp_t gfp) +{ + unsigned long id; + int ret; + + if (WARN_ON_ONCE(start < 0)) + return -EINVAL; + + ret = idr_alloc_cmn(idr, ptr, &id, start, end, gfp, false); + + if (ret) + return ret; + + return id; +} + +static inline int idr_alloc_ext(struct idr *idr, void *ptr, + unsigned long *index, + unsigned long start, + unsigned long end, + gfp_t gfp) +{ + return idr_alloc_cmn(idr, ptr, index, start, end, gfp, true); +} + int idr_alloc_cyclic(struct idr *, void *entry, int start, int end, gfp_t); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); +void *idr_get_next_ext(struct idr *idr, unsigned long *nextid); void *idr_replace(struct idr *, void *, int id); +void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id); void idr_destroy(struct idr *); -static inline void *idr_remove(struct idr *idr, int id) +static inline void *idr_remove_ext(struct idr *idr, unsigned long id) { return radix_tree_delete_item(&idr->idr_rt, id, NULL); } +static inline void *idr_remove(struct idr *idr, int id) +{ + return idr_remove_ext(idr, id); +} + static inline void idr_init(struct idr *idr) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); @@ -128,11 +184,16 @@ static inline void idr_preload_end(void) * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. */ -static inline void *idr_find(const struct idr *idr, int id) +static inline void *idr_find_ext(const struct idr *idr, unsigned long id) { return radix_tree_lookup(&idr->idr_rt, id); } +static inline void *idr_find(const struct idr *idr, int id) +{ + return idr_find_ext(idr, id); +} + /** * idr_for_each_entry - iterate over an idr's elements of a given type * @idr: idr handle @@ -145,6 +206,8 @@ static inline void *idr_find(const struct idr *idr, int id) */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id) +#define idr_for_each_entry_ext(idr, entry, id) \ + for (id = 0; ((entry) = idr_get_next_ext(idr, &(id))) != NULL; ++id) /** * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 3e5735064b71..567ebb5eaab0 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -357,8 +357,25 @@ int radix_tree_split(struct radix_tree_root *, unsigned long index, unsigned new_order); int radix_tree_join(struct radix_tree_root *, unsigned long index, unsigned new_order, void *); -void __rcu **idr_get_free(struct radix_tree_root *, struct radix_tree_iter *, - gfp_t, int end); + +void __rcu **idr_get_free_cmn(struct radix_tree_root *root, + struct radix_tree_iter *iter, gfp_t gfp, + unsigned long max); +static inline void __rcu **idr_get_free(struct radix_tree_root *root, + struct radix_tree_iter *iter, + gfp_t gfp, + int end) +{ + return idr_get_free_cmn(root, iter, gfp, end > 0 ? 
end - 1 : INT_MAX); +} + +static inline void __rcu **idr_get_free_ext(struct radix_tree_root *root, + struct radix_tree_iter *iter, + gfp_t gfp, + unsigned long end) +{ + return idr_get_free_cmn(root, iter, gfp, end - 1); +} enum { RADIX_TREE_ITER_TAG_MASK = 0x0f, /* tag index in lower nybble */ diff --git a/lib/idr.c b/lib/idr.c index b13682bb0a1c..082778cf883e 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -7,45 +7,32 @@ DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap); static DEFINE_SPINLOCK(simple_ida_lock); -/** - * idr_alloc - allocate an id - * @idr: idr handle - * @ptr: pointer to be associated with the new id - * @start: the minimum id (inclusive) - * @end: the maximum id (exclusive) - * @gfp: memory allocation flags - * - * Allocates an unused ID in the range [start, end). Returns -ENOSPC - * if there are no unused IDs in that range. - * - * Note that @end is treated as max when <= 0. This is to always allow - * using @start + N as @end as long as N is inside integer range. - * - * Simultaneous modifications to the @idr are not allowed and should be - * prevented by the user, usually with a lock. idr_alloc() may be called - * concurrently with read-only accesses to the @idr, such as idr_find() and - * idr_for_each_entry(). - */ -int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) +int idr_alloc_cmn(struct idr *idr, void *ptr, unsigned long *index, + unsigned long start, unsigned long end, gfp_t gfp, + bool ext) { - void __rcu **slot; struct radix_tree_iter iter; + void __rcu **slot; - if (WARN_ON_ONCE(start < 0)) - return -EINVAL; if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) return -EINVAL; radix_tree_iter_init(&iter, start); - slot = idr_get_free(&idr->idr_rt, &iter, gfp, end); + if (ext) + slot = idr_get_free_ext(&idr->idr_rt, &iter, gfp, end); + else + slot = idr_get_free(&idr->idr_rt, &iter, gfp, end); if (IS_ERR(slot)) return PTR_ERR(slot); radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); - return iter.index; + + if (index) + *index = iter.index; + return 0; } -EXPORT_SYMBOL_GPL(idr_alloc); +EXPORT_SYMBOL_GPL(idr_alloc_cmn); /** * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion @@ -134,6 +121,20 @@ void *idr_get_next(struct idr *idr, int *nextid) } EXPORT_SYMBOL(idr_get_next); +void *idr_get_next_ext(struct idr *idr, unsigned long *nextid) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + slot = radix_tree_iter_find(&idr->idr_rt, &iter, *nextid); + if (!slot) + return NULL; + + *nextid = iter.index; + return rcu_dereference_raw(*slot); +} +EXPORT_SYMBOL(idr_get_next_ext); + /** * idr_replace - replace pointer for given id * @idr: idr handle @@ -149,13 +150,20 @@ EXPORT_SYMBOL(idr_get_next); * %-EINVAL indicates that @id or @ptr were not valid. 
*/ void *idr_replace(struct idr *idr, void *ptr, int id) +{ + if (WARN_ON_ONCE(id < 0)) + return ERR_PTR(-EINVAL); + + return idr_replace_ext(idr, ptr, id); +} +EXPORT_SYMBOL(idr_replace); + +void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id) { struct radix_tree_node *node; void __rcu **slot = NULL; void *entry; - if (WARN_ON_ONCE(id < 0)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) return ERR_PTR(-EINVAL); @@ -167,7 +175,7 @@ void *idr_replace(struct idr *idr, void *ptr, int id) return entry; } -EXPORT_SYMBOL(idr_replace); +EXPORT_SYMBOL(idr_replace_ext); /** * DOC: IDA description diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 898e87998417..c191b42d1213 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2137,13 +2137,13 @@ int ida_pre_get(struct ida *ida, gfp_t gfp) } EXPORT_SYMBOL(ida_pre_get); -void __rcu **idr_get_free(struct radix_tree_root *root, - struct radix_tree_iter *iter, gfp_t gfp, int end) +void __rcu **idr_get_free_cmn(struct radix_tree_root *root, + struct radix_tree_iter *iter, gfp_t gfp, + unsigned long max) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->rnode; unsigned long maxindex, start = iter->next_index; - unsigned long max = end > 0 ? end - 1 : INT_MAX; unsigned int shift, offset = 0; grow: -- cgit v1.2.3 From 65a206c01e8e7ffe971477a36419422099216eff Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 30 Aug 2017 02:31:59 -0400 Subject: net/sched: Change act_api and act_xxx modules to use IDR Typically, each TC filter has its own action. All the actions of the same type are saved in its hash table. But the hash buckets are so small that the table degrades to a list, and performance is greatly affected. For example, it takes about 0m11.914s to insert 64K rules. If we convert the hash table to an IDR, it only takes about 0m1.500s. The improvement is huge. But please note that the test result is based on the previous patch that converts cls_flower to use IDR. Signed-off-by: Chris Mi Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- include/net/act_api.h | 76 +++++--------- net/sched/act_api.c | 251 ++++++++++++++++++++++----------------------- net/sched/act_bpf.c | 17 ++- net/sched/act_connmark.c | 16 ++- net/sched/act_csum.c | 16 ++- net/sched/act_gact.c | 16 ++- net/sched/act_ife.c | 18 ++-- net/sched/act_ipt.c | 26 +++-- net/sched/act_mirred.c | 19 ++-- net/sched/act_nat.c | 16 ++- net/sched/act_pedit.c | 18 ++-- net/sched/act_police.c | 18 ++-- net/sched/act_sample.c | 17 ++- net/sched/act_simple.c | 20 ++-- net/sched/act_skbedit.c | 18 ++-- net/sched/act_skbmod.c | 18 ++-- net/sched/act_tunnel_key.c | 20 ++-- net/sched/act_vlan.c | 22 ++-- 18 files changed, 278 insertions(+), 344 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 26ffd8333f50..8f3d5d8b5ae0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -10,12 +10,9 @@ #include #include - -struct tcf_hashinfo { - struct hlist_head *htab; - unsigned int hmask; - spinlock_t lock; - u32 index; +struct tcf_idrinfo { + spinlock_t lock; + struct idr action_idr; }; struct tc_action_ops; @@ -25,9 +22,8 @@ struct tc_action { __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ __u32 order; struct list_head list; - struct tcf_hashinfo *hinfo; + struct tcf_idrinfo *idrinfo; - struct hlist_node tcfa_head; u32 tcfa_index; int tcfa_refcnt; int tcfa_bindcnt; @@ -44,7 +40,6 @@ struct tc_action { struct tc_cookie *act_cookie; struct tcf_chain *goto_chain; }; -#define tcf_head common.tcfa_head #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt #define tcf_bindcnt common.tcfa_bindcnt @@ -57,27 +52,6 @@ struct tc_action { #define tcf_lock common.tcfa_lock #define tcf_rcu common.tcfa_rcu -static inline unsigned int tcf_hash(u32 index, unsigned int hmask) -{ - return index & hmask; -} - -static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask) -{ - int i; - - spin_lock_init(&hf->lock); - hf->index = 0; - hf->hmask = mask; - hf->htab = kzalloc((mask + 1) * sizeof(struct hlist_head), - GFP_KERNEL); - if (!hf->htab) - return -ENOMEM; - for (i = 0; i < mask + 1; i++) - INIT_HLIST_HEAD(&hf->htab[i]); - return 0; -} - /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. 
*/ @@ -126,53 +100,51 @@ struct tc_action_ops { }; struct tc_action_net { - struct tcf_hashinfo *hinfo; + struct tcf_idrinfo *idrinfo; const struct tc_action_ops *ops; }; static inline int tc_action_net_init(struct tc_action_net *tn, - const struct tc_action_ops *ops, unsigned int mask) + const struct tc_action_ops *ops) { int err = 0; - tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL); - if (!tn->hinfo) + tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL); + if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; - err = tcf_hashinfo_init(tn->hinfo, mask); - if (err) - kfree(tn->hinfo); + spin_lock_init(&tn->idrinfo->lock); + idr_init(&tn->idrinfo->action_idr); return err; } -void tcf_hashinfo_destroy(const struct tc_action_ops *ops, - struct tcf_hashinfo *hinfo); +void tcf_idrinfo_destroy(const struct tc_action_ops *ops, + struct tcf_idrinfo *idrinfo); static inline void tc_action_net_exit(struct tc_action_net *tn) { - tcf_hashinfo_destroy(tn->ops, tn->hinfo); - kfree(tn->hinfo); + tcf_idrinfo_destroy(tn->ops, tn->idrinfo); + kfree(tn->idrinfo); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops); -int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index); -u32 tcf_hash_new_index(struct tc_action_net *tn); -bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, +int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); +bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind); -int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - struct tc_action **a, const struct tc_action_ops *ops, int bind, - bool cpustats); -void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); -void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a); +int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action **a, const struct tc_action_ops *ops, + int bind, bool cpustats); +void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est); +void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); -int __tcf_hash_release(struct tc_action *a, bool bind, bool strict); +int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); -static inline int tcf_hash_release(struct tc_action *a, bool bind) +static inline int tcf_idr_release(struct tc_action *a, bool bind) { - return __tcf_hash_release(a, bind, false); + return __tcf_idr_release(a, bind, false); } int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 02fcb0c78a28..0eb545bcb247 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -70,11 +70,11 @@ static void free_tcf(struct rcu_head *head) kfree(p); } -static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p) +static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) { - spin_lock_bh(&hinfo->lock); - hlist_del(&p->tcfa_head); - spin_unlock_bh(&hinfo->lock); + spin_lock_bh(&idrinfo->lock); + idr_remove_ext(&idrinfo->action_idr, p->tcfa_index); + spin_unlock_bh(&idrinfo->lock); gen_kill_estimator(&p->tcfa_rate_est); /* * gen_estimator est_timer() might access p->tcfa_lock @@ -83,7 +83,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p) call_rcu(&p->tcfa_rcu, free_tcf); } -int __tcf_hash_release(struct tc_action *p, bool bind, bool strict) +int __tcf_idr_release(struct 
tc_action *p, bool bind, bool strict) { int ret = 0; @@ -97,64 +97,60 @@ int __tcf_hash_release(struct tc_action *p, bool bind, bool strict) if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { if (p->ops->cleanup) p->ops->cleanup(p, bind); - tcf_hash_destroy(p->hinfo, p); + tcf_idr_remove(p->idrinfo, p); ret = ACT_P_DELETED; } } return ret; } -EXPORT_SYMBOL(__tcf_hash_release); +EXPORT_SYMBOL(__tcf_idr_release); -static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, +static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { - int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + int err = 0, index = -1, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; + struct idr *idr = &idrinfo->action_idr; + struct tc_action *p; + unsigned long id = 1; - spin_lock_bh(&hinfo->lock); + spin_lock_bh(&idrinfo->lock); s_i = cb->args[0]; - for (i = 0; i < (hinfo->hmask + 1); i++) { - struct hlist_head *head; - struct tc_action *p; - - head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - - hlist_for_each_entry_rcu(p, head, tcfa_head) { - index++; - if (index < s_i) - continue; - - if (jiffy_since && - time_after(jiffy_since, - (unsigned long)p->tcfa_tm.lastuse)) - continue; - - nest = nla_nest_start(skb, n_i); - if (nest == NULL) - goto nla_put_failure; - err = tcf_action_dump_1(skb, p, 0, 0); - if (err < 0) { - index--; - nlmsg_trim(skb, nest); - goto done; - } - nla_nest_end(skb, nest); - n_i++; - if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && - n_i >= TCA_ACT_MAX_PRIO) - goto done; + idr_for_each_entry_ext(idr, p, id) { + index++; + if (index < s_i) + continue; + + if (jiffy_since && + time_after(jiffy_since, + (unsigned long)p->tcfa_tm.lastuse)) + continue; + + nest = nla_nest_start(skb, n_i); + if (!nest) + goto nla_put_failure; + err = tcf_action_dump_1(skb, p, 0, 0); + if (err < 0) { + index--; + nlmsg_trim(skb, nest); + goto done; } + nla_nest_end(skb, nest); + n_i++; + if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && + n_i >= TCA_ACT_MAX_PRIO) + goto done; } done: if (index >= 0) cb->args[0] = index + 1; - spin_unlock_bh(&hinfo->lock); + spin_unlock_bh(&idrinfo->lock); if (n_i) { if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; @@ -166,31 +162,29 @@ nla_put_failure: goto done; } -static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, +static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops) { struct nlattr *nest; - int i = 0, n_i = 0; + int n_i = 0; int ret = -EINVAL; + struct idr *idr = &idrinfo->action_idr; + struct tc_action *p; + unsigned long id = 1; nest = nla_nest_start(skb, 0); if (nest == NULL) goto nla_put_failure; if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; - for (i = 0; i < (hinfo->hmask + 1); i++) { - struct hlist_head *head; - struct hlist_node *n; - struct tc_action *p; - - head = &hinfo->htab[tcf_hash(i, hinfo->hmask)]; - hlist_for_each_entry_safe(p, n, head, tcfa_head) { - ret = __tcf_hash_release(p, false, true); - if (ret == ACT_P_DELETED) { - module_put(p->ops->owner); - n_i++; - } else if (ret < 0) - goto nla_put_failure; + + idr_for_each_entry_ext(idr, p, id) { + ret = __tcf_idr_release(p, false, true); + if (ret == ACT_P_DELETED) { + module_put(p->ops->owner); + n_i++; + } else if (ret < 0) { + goto nla_put_failure; } } if (nla_put_u32(skb, TCA_FCNT, n_i)) @@ -207,12 +201,12 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct 
netlink_callback *cb, int type, const struct tc_action_ops *ops) { - struct tcf_hashinfo *hinfo = tn->hinfo; + struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { - return tcf_del_walker(hinfo, skb, ops); + return tcf_del_walker(idrinfo, skb, ops); } else if (type == RTM_GETACTION) { - return tcf_dump_walker(hinfo, skb, cb); + return tcf_dump_walker(idrinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown action %d\n", type); return -EINVAL; @@ -220,40 +214,21 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tc_action *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo) +static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo) { struct tc_action *p = NULL; - struct hlist_head *head; - spin_lock_bh(&hinfo->lock); - head = &hinfo->htab[tcf_hash(index, hinfo->hmask)]; - hlist_for_each_entry_rcu(p, head, tcfa_head) - if (p->tcfa_index == index) - break; - spin_unlock_bh(&hinfo->lock); + spin_lock_bh(&idrinfo->lock); + p = idr_find_ext(&idrinfo->action_idr, index); + spin_unlock_bh(&idrinfo->lock); return p; } -u32 tcf_hash_new_index(struct tc_action_net *tn) -{ - struct tcf_hashinfo *hinfo = tn->hinfo; - u32 val = hinfo->index; - - do { - if (++val == 0) - val = 1; - } while (tcf_hash_lookup(val, hinfo)); - - hinfo->index = val; - return val; -} -EXPORT_SYMBOL(tcf_hash_new_index); - -int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) +int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { - struct tcf_hashinfo *hinfo = tn->hinfo; - struct tc_action *p = tcf_hash_lookup(index, hinfo); + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p = tcf_idr_lookup(index, idrinfo); if (p) { *a = p; @@ -261,15 +236,15 @@ int tcf_hash_search(struct tc_action_net *tn, struct tc_action **a, u32 index) } return 0; } -EXPORT_SYMBOL(tcf_hash_search); +EXPORT_SYMBOL(tcf_idr_search); -bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, - int bind) +bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, + int bind) { - struct tcf_hashinfo *hinfo = tn->hinfo; - struct tc_action *p = NULL; + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p = tcf_idr_lookup(index, idrinfo); - if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) { + if (index && p) { if (bind) p->tcfa_bindcnt++; p->tcfa_refcnt++; @@ -278,23 +253,25 @@ bool tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action **a, } return false; } -EXPORT_SYMBOL(tcf_hash_check); +EXPORT_SYMBOL(tcf_idr_check); -void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) +void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est) { if (est) gen_kill_estimator(&a->tcfa_rate_est); call_rcu(&a->tcfa_rcu, free_tcf); } -EXPORT_SYMBOL(tcf_hash_cleanup); +EXPORT_SYMBOL(tcf_idr_cleanup); -int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, - struct tc_action **a, const struct tc_action_ops *ops, - int bind, bool cpustats) +int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action **a, const struct tc_action_ops *ops, + int bind, bool cpustats) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); - struct tcf_hashinfo *hinfo = tn->hinfo; + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct idr *idr = &idrinfo->action_idr; int err = -ENOMEM; + unsigned long idr_index; if (unlikely(!p)) return -ENOMEM; @@ -317,8 +294,28 @@ err2: } } 
spin_lock_init(&p->tcfa_lock); - INIT_HLIST_NODE(&p->tcfa_head); - p->tcfa_index = index ? index : tcf_hash_new_index(tn); + /* user doesn't specify an index */ + if (!index) { + spin_lock_bh(&idrinfo->lock); + err = idr_alloc_ext(idr, NULL, &idr_index, 1, 0, + GFP_KERNEL); + spin_unlock_bh(&idrinfo->lock); + if (err) { +err3: + free_percpu(p->cpu_qstats); + goto err2; + } + p->tcfa_index = idr_index; + } else { + spin_lock_bh(&idrinfo->lock); + err = idr_alloc_ext(idr, NULL, NULL, index, index + 1, + GFP_KERNEL); + spin_unlock_bh(&idrinfo->lock); + if (err) + goto err3; + p->tcfa_index = index; + } + p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; @@ -327,52 +324,46 @@ err2: &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) { - free_percpu(p->cpu_qstats); - goto err2; + goto err3; } } - p->hinfo = hinfo; + p->idrinfo = idrinfo; p->ops = ops; INIT_LIST_HEAD(&p->list); *a = p; return 0; } -EXPORT_SYMBOL(tcf_hash_create); +EXPORT_SYMBOL(tcf_idr_create); -void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a) +void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { - struct tcf_hashinfo *hinfo = tn->hinfo; - unsigned int h = tcf_hash(a->tcfa_index, hinfo->hmask); + struct tcf_idrinfo *idrinfo = tn->idrinfo; - spin_lock_bh(&hinfo->lock); - hlist_add_head(&a->tcfa_head, &hinfo->htab[h]); - spin_unlock_bh(&hinfo->lock); + spin_lock_bh(&idrinfo->lock); + idr_replace_ext(&idrinfo->action_idr, a, a->tcfa_index); + spin_unlock_bh(&idrinfo->lock); } -EXPORT_SYMBOL(tcf_hash_insert); +EXPORT_SYMBOL(tcf_idr_insert); -void tcf_hashinfo_destroy(const struct tc_action_ops *ops, - struct tcf_hashinfo *hinfo) +void tcf_idrinfo_destroy(const struct tc_action_ops *ops, + struct tcf_idrinfo *idrinfo) { - int i; - - for (i = 0; i < hinfo->hmask + 1; i++) { - struct tc_action *p; - struct hlist_node *n; - - hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfa_head) { - int ret; + struct idr *idr = &idrinfo->action_idr; + struct tc_action *p; + int ret; + unsigned long id = 1; - ret = __tcf_hash_release(p, false, true); - if (ret == ACT_P_DELETED) - module_put(ops->owner); - else if (ret < 0) - return; - } + idr_for_each_entry_ext(idr, p, id) { + ret = __tcf_idr_release(p, false, true); + if (ret == ACT_P_DELETED) + module_put(ops->owner); + else if (ret < 0) + return; } - kfree(hinfo->htab); + idr_destroy(&idrinfo->action_idr); } -EXPORT_SYMBOL(tcf_hashinfo_destroy); +EXPORT_SYMBOL(tcf_idrinfo_destroy); static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); @@ -524,7 +515,7 @@ int tcf_action_destroy(struct list_head *actions, int bind) int ret = 0; list_for_each_entry_safe(a, tmp, actions, list) { - ret = __tcf_hash_release(a, bind, true); + ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) module_put(a->ops->owner); else if (ret < 0) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9afe1337cfd1..c0c707eb2c96 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -21,7 +21,6 @@ #include #include -#define BPF_TAB_MASK 15 #define ACT_BPF_NAME_LEN 256 struct tcf_bpf_cfg { @@ -295,9 +294,9 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_hash_check(tn, parm->index, act, bind)) { - ret = tcf_hash_create(tn, parm->index, est, act, - &act_bpf_ops, bind, true); + if (!tcf_idr_check(tn, parm->index, act, bind)) { + ret = tcf_idr_create(tn, parm->index, est, act, + &act_bpf_ops, bind, true); if (ret < 0) return ret; @@ -307,7 +306,7 @@ static int 
tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - tcf_hash_release(*act, bind); + tcf_idr_release(*act, bind); if (!replace) return -EEXIST; } @@ -343,7 +342,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(prog->filter, cfg.filter); if (res == ACT_P_CREATED) { - tcf_hash_insert(tn, *act); + tcf_idr_insert(tn, *act); } else { /* make sure the program being replaced is no longer executing */ synchronize_rcu(); @@ -353,7 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: if (res == ACT_P_CREATED) - tcf_hash_cleanup(*act, est); + tcf_idr_cleanup(*act, est); return ret; } @@ -379,7 +378,7 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_bpf_ops __read_mostly = { @@ -399,7 +398,7 @@ static __net_init int bpf_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK); + return tc_action_net_init(tn, &act_bpf_ops); } static void __net_exit bpf_exit_net(struct net *net) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2155bc6c6a1e..10b7a8855a6c 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -28,8 +28,6 @@ #include #include -#define CONNMARK_TAB_MASK 3 - static unsigned int connmark_net_id; static struct tc_action_ops act_connmark_ops; @@ -119,9 +117,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_hash_check(tn, parm->index, a, bind)) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_connmark_ops, bind, false); + if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_create(tn, parm->index, est, a, + &act_connmark_ops, bind, false); if (ret) return ret; @@ -130,13 +128,13 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci->net = net; ci->zone = parm->zone; - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; } else { ci = to_connmark(*a); if (bind) return 0; - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; /* replacing action and zone */ @@ -189,7 +187,7 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_connmark_ops = { @@ -208,7 +206,7 @@ static __net_init int connmark_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK); + return tc_action_net_init(tn, &act_connmark_ops); } static void __net_exit connmark_exit_net(struct net *net) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 67afc12df88b..1c40caadcff9 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,8 +37,6 @@ #include #include -#define CSUM_TAB_MASK 15 - static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; @@ -67,16 +65,16 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_hash_check(tn, parm->index, a, bind)) { - ret = tcf_hash_create(tn, parm->index, est, a, - 
&act_csum_ops, bind, false); + if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_create(tn, parm->index, est, a, + &act_csum_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -88,7 +86,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -609,7 +607,7 @@ static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_csum_ops = { @@ -628,7 +626,7 @@ static __net_init int csum_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK); + return tc_action_net_init(tn, &act_csum_ops); } static void __net_exit csum_exit_net(struct net *net) diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 99afe8b1f1fb..e29a48ef7fc3 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -23,8 +23,6 @@ #include #include -#define GACT_TAB_MASK 15 - static unsigned int gact_net_id; static struct tc_action_ops act_gact_ops; @@ -92,16 +90,16 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_hash_check(tn, parm->index, a, bind)) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_gact_ops, bind, true); + if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_create(tn, parm->index, est, a, + &act_gact_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -122,7 +120,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -214,7 +212,7 @@ static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_gact_ops = { @@ -234,7 +232,7 @@ static __net_init int gact_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK); + return tc_action_net_init(tn, &act_gact_ops); } static void __net_exit gact_exit_net(struct net *net) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 7ed1be80ee86..8ccd35825b6b 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -34,8 +34,6 @@ #include #include -#define IFE_TAB_MASK 15 - static unsigned int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; @@ -452,18 +450,18 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_IFE_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, &act_ife_ops, - bind, false); + ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, + bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); 
+ tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -507,7 +505,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (err) { metadata_parse_err: if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (ret == ACT_P_CREATED) _tcf_ife_cleanup(*a, bind); @@ -541,7 +539,7 @@ metadata_parse_err: spin_unlock_bh(&ife->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -800,7 +798,7 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_ife_ops = { @@ -820,7 +818,7 @@ static __net_init int ife_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK); + return tc_action_net_init(tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct net *net) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 541707802a23..d9e399a7e3d5 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -28,8 +28,6 @@ #include -#define IPT_TAB_MASK 15 - static unsigned int ipt_net_id; static struct tc_action_ops act_ipt_ops; @@ -118,33 +116,33 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - exists = tcf_hash_check(tn, index, a, bind); + exists = tcf_idr_check(tn, index, a, bind); if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -EINVAL; } td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -EINVAL; } if (!exists) { - ret = tcf_hash_create(tn, index, est, a, ops, bind, - false); + ret = tcf_idr_create(tn, index, est, a, ops, bind, + false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ return 0; - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; @@ -180,7 +178,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, ipt->tcfi_hook = hook; spin_unlock_bh(&ipt->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; err3: @@ -189,7 +187,7 @@ err2: kfree(tname); err1: if (ret == ACT_P_CREATED) - tcf_hash_cleanup(*a, est); + tcf_idr_cleanup(*a, est); return err; } @@ -316,7 +314,7 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_ipt_ops = { @@ -336,7 +334,7 @@ static __net_init int ipt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK); + return tc_action_net_init(tn, &act_ipt_ops); } static void __net_exit ipt_exit_net(struct net *net) @@ -366,7 +364,7 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_xt_ops = { @@ -386,7 +384,7 @@ static 
__net_init int xt_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK); + return tc_action_net_init(tn, &act_xt_ops); } static void __net_exit xt_exit_net(struct net *net) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1b5549ababd4..416627c66f08 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -28,7 +28,6 @@ #include #include -#define MIRRED_TAB_MASK 7 static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); @@ -94,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_MIRRED_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; @@ -106,14 +105,14 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -EINVAL; } if (parm->ifindex) { dev = __dev_get_by_index(net, parm->ifindex); if (dev == NULL) { if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); @@ -124,13 +123,13 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!exists) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(tn, parm->index, est, a, - &act_mirred_ops, bind, true); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_mirred_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -152,7 +151,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, spin_lock_bh(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock_bh(&mirred_list_lock); - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); } return ret; @@ -283,7 +282,7 @@ static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static int mirred_device_event(struct notifier_block *unused, @@ -344,7 +343,7 @@ static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK); + return tc_action_net_init(tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9016ab8a0649..c365d01b99c8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -29,8 +29,6 @@ #include -#define NAT_TAB_MASK 15 - static unsigned int nat_net_id; static struct tc_action_ops act_nat_ops; @@ -58,16 +56,16 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_hash_check(tn, parm->index, a, bind)) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_nat_ops, bind, false); + if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_create(tn, parm->index, est, a, + &act_nat_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { if (bind) return 0; - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -83,7 +81,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - 
tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -290,7 +288,7 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_nat_ops = { @@ -309,7 +307,7 @@ static __net_init int nat_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK); + return tc_action_net_init(tn, &act_nat_ops); } static void __net_exit nat_exit_net(struct net *net) diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 7dc5892671c8..491fe5deb09e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -24,8 +24,6 @@ #include #include -#define PEDIT_TAB_MASK 15 - static unsigned int pedit_net_id; static struct tc_action_ops act_pedit_ops; @@ -168,17 +166,17 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); - if (!tcf_hash_check(tn, parm->index, a, bind)) { + if (!tcf_idr_check(tn, parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(tn, parm->index, est, a, - &act_pedit_ops, bind, false); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_pedit_ops, bind, false); if (ret) return ret; p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { - tcf_hash_cleanup(*a, est); + tcf_idr_cleanup(*a, est); kfree(keys_ex); return -ENOMEM; } @@ -186,7 +184,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } else { if (bind) return 0; - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; p = to_pedit(*a); @@ -214,7 +212,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -432,7 +430,7 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_pedit_ops = { @@ -452,7 +450,7 @@ static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK); + return tc_action_net_init(tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct net *net) diff --git a/net/sched/act_police.c b/net/sched/act_police.c index b062bc80c7cb..3bb2ebf9e9ae 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -40,8 +40,6 @@ struct tcf_police { #define to_police(pc) ((struct tcf_police *)pc) -#define POL_TAB_MASK 15 - /* old policer structure from before tc actions */ struct tc_police_compat { u32 index; @@ -101,18 +99,18 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; if (!exists) { - ret = tcf_hash_create(tn, parm->index, NULL, a, - &act_police_ops, bind, false); + ret = tcf_idr_create(tn, parm->index, NULL, a, + &act_police_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -188,7 +186,7 @@ static int 
tcf_act_police_init(struct net *net, struct nlattr *nla, return ret; police->tcfp_t_c = ktime_get_ns(); - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; @@ -196,7 +194,7 @@ failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - tcf_hash_cleanup(*a, est); + tcf_idr_cleanup(*a, est); return err; } @@ -310,7 +308,7 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, police_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } MODULE_AUTHOR("Alexey Kuznetsov"); @@ -333,7 +331,7 @@ static __net_init int police_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, police_net_id); - return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK); + return tc_action_net_init(tn, &act_police_ops); } static void __net_exit police_exit_net(struct net *net) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 59d6645a4007..ec986ae52808 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -25,7 +25,6 @@ #include -#define SAMPLE_TAB_MASK 7 static unsigned int sample_net_id; static struct tc_action_ops act_sample_ops; @@ -59,18 +58,18 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SAMPLE_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_sample_ops, bind, false); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_sample_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -82,7 +81,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { if (ret == ACT_P_CREATED) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } RCU_INIT_POINTER(s->psample_group, psample_group); @@ -93,7 +92,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -221,7 +220,7 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, sample_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_sample_ops = { @@ -241,7 +240,7 @@ static __net_init int sample_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, sample_net_id); - return tc_action_net_init(tn, &act_sample_ops, SAMPLE_TAB_MASK); + return tc_action_net_init(tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct net *net) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 43605e7ce051..e7b57e5071a3 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -24,8 +24,6 @@ #include #include -#define SIMP_TAB_MASK 7 - static unsigned int simp_net_id; static struct tc_action_ops act_simp_ops; @@ -102,28 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; if (tb[TCA_DEF_DATA] == NULL) { if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, 
bind); return -EINVAL; } defdata = nla_data(tb[TCA_DEF_DATA]); if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_simp_ops, bind, false); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_simp_ops, bind, false); if (ret) return ret; d = to_defact(*a); ret = alloc_defdata(d, defdata); if (ret < 0) { - tcf_hash_cleanup(*a, est); + tcf_idr_cleanup(*a, est); return ret; } d->tcf_action = parm->action; @@ -131,7 +129,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(*a); - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; @@ -139,7 +137,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -183,7 +181,7 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_simp_ops = { @@ -203,7 +201,7 @@ static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK); + return tc_action_net_init(tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct net *net) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6b3e65d7de0c..59949d61f20d 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -27,8 +27,6 @@ #include #include -#define SKBEDIT_TAB_MASK 15 - static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; @@ -118,18 +116,18 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; if (!flags) { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -EINVAL; } if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_skbedit_ops, bind, false); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_skbedit_ops, bind, false); if (ret) return ret; @@ -137,7 +135,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -163,7 +161,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&d->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -221,7 +219,7 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_skbedit_ops = { @@ -240,7 +238,7 @@ static __net_init int skbedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK); + return tc_action_net_init(tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct net *net) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index a73c4bbcada2..b642ad3d39dd 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -20,8 +20,6 @@ #include #include -#define SKBMOD_TAB_MASK 15 - static unsigned int skbmod_net_id; static 
struct tc_action_ops act_skbmod_ops; @@ -129,7 +127,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; @@ -137,14 +135,14 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, return -EINVAL; if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_skbmod_ops, bind, true); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_skbmod_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -155,7 +153,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { if (ovr) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } @@ -182,7 +180,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, kfree_rcu(p_old, rcu); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -245,7 +243,7 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_skbmod_ops = { @@ -265,7 +263,7 @@ static __net_init int skbmod_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); - return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK); + return tc_action_net_init(tn, &act_skbmod_ops); } static void __net_exit skbmod_exit_net(struct net *net) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index fd7e75679c69..30c96274c638 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -20,8 +20,6 @@ #include #include -#define TUNNEL_KEY_TAB_MASK 15 - static unsigned int tunnel_key_net_id; static struct tc_action_ops act_tunnel_key_ops; @@ -100,7 +98,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; @@ -159,14 +157,14 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_tunnel_key_ops, bind, true); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_tunnel_key_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -177,7 +175,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { if (ret == ACT_P_CREATED) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } @@ -193,13 +191,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; err_out: if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return ret; } @@ -304,7 +302,7 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - return 
tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_tunnel_key_ops = { @@ -324,7 +322,7 @@ static __net_init int tunnel_key_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK); + return tc_action_net_init(tn, &act_tunnel_key_ops); } static void __net_exit tunnel_key_exit_net(struct net *net) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 13ba3a89f675..16eb067a8d8f 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -19,8 +19,6 @@ #include #include -#define VLAN_TAB_MASK 15 - static unsigned int vlan_net_id; static struct tc_action_ops act_vlan_ops; @@ -128,7 +126,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); - exists = tcf_hash_check(tn, parm->index, a, bind); + exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; @@ -139,13 +137,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -ERANGE; } @@ -167,20 +165,20 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, break; default: if (exists) - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); return -EINVAL; } action = parm->v_action; if (!exists) { - ret = tcf_hash_create(tn, parm->index, est, a, - &act_vlan_ops, bind, false); + ret = tcf_idr_create(tn, parm->index, est, a, + &act_vlan_ops, bind, false); if (ret) return ret; ret = ACT_P_CREATED; } else { - tcf_hash_release(*a, bind); + tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } @@ -199,7 +197,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&v->tcf_lock); if (ret == ACT_P_CREATED) - tcf_hash_insert(tn, *a); + tcf_idr_insert(tn, *a); return ret; } @@ -252,7 +250,7 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tcf_hash_search(tn, a, index); + return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_vlan_ops = { @@ -271,7 +269,7 @@ static __net_init int vlan_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK); + return tc_action_net_init(tn, &act_vlan_ops); } static void __net_exit vlan_exit_net(struct net *net) -- cgit v1.2.3 From 5c23f2dc8eeb5a6010cb66119d942361bc8ec833 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 30 Aug 2017 10:29:12 +0200 Subject: phy: add sgmii and 10gkr modes to the phy_mode enum This patch adds more generic PHY modes to the phy_mode enum, to allow configuring generic PHYs to the SGMII and/or the 10GKR mode by using the set_mode callback. Signed-off-by: Antoine Tenart Acked-by: Kishon Vijay Abraham I Signed-off-by: David S. 
Miller --- include/linux/phy/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 78bb0d7f6b11..e694d4008c4a 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -27,6 +27,8 @@ enum phy_mode { PHY_MODE_USB_HOST, PHY_MODE_USB_DEVICE, PHY_MODE_USB_OTG, + PHY_MODE_SGMII, + PHY_MODE_10GKR, }; /** -- cgit v1.2.3 From 2729984149e6a23e849a40e16fc3efdc07dd3668 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 13 Aug 2017 13:34:42 +0300 Subject: net/mlx5e: Support TSO and TX checksum offloads for GRE tunnels Add TX offloads support for GRE tunneled packets by reporting the needed netdev features. Signed-off-by: Gal Pressman Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 51 +++++++++++++++-------- include/linux/mlx5/mlx5_ifc.h | 2 +- 2 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index fdc2b92f020b..9475fb89a744 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3499,13 +3499,13 @@ static void mlx5e_del_vxlan_port(struct net_device *netdev, mlx5e_vxlan_queue_work(priv, ti->sa_family, be16_to_cpu(ti->port), 0); } -static netdev_features_t mlx5e_vxlan_features_check(struct mlx5e_priv *priv, - struct sk_buff *skb, - netdev_features_t features) +static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv, + struct sk_buff *skb, + netdev_features_t features) { struct udphdr *udph; - u16 proto; - u16 port = 0; + u8 proto; + u16 port; switch (vlan_get_protocol(skb)) { case htons(ETH_P_IP): @@ -3518,14 +3518,17 @@ static netdev_features_t mlx5e_vxlan_features_check(struct mlx5e_priv *priv, goto out; } - if (proto == IPPROTO_UDP) { + switch (proto) { + case IPPROTO_GRE: + return features; + case IPPROTO_UDP: udph = udp_hdr(skb); port = be16_to_cpu(udph->dest); - } - /* Verify if UDP port is being offloaded by HW */ - if (port && mlx5e_vxlan_lookup_port(priv, port)) - return features; + /* Verify if UDP port is being offloaded by HW */ + if (mlx5e_vxlan_lookup_port(priv, port)) + return features; + } out: /* Disable CSUM and GSO if the udp dport is not offloaded by HW */ @@ -3549,7 +3552,7 @@ static netdev_features_t mlx5e_features_check(struct sk_buff *skb, /* Validate if the tunneled packet is being offloaded by HW */ if (skb->encapsulation && (features & NETIF_F_CSUM_MASK || features & NETIF_F_GSO_MASK)) - return mlx5e_vxlan_features_check(priv, skb, features); + return mlx5e_tunnel_features_check(priv, skb, features); return features; } @@ -4014,20 +4017,32 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_FILTER; - if (mlx5e_vxlan_allowed(mdev)) { - netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_UDP_TUNNEL_CSUM | - NETIF_F_GSO_PARTIAL; + if (mlx5e_vxlan_allowed(mdev) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { + netdev->hw_features |= NETIF_F_GSO_PARTIAL; netdev->hw_enc_features |= NETIF_F_IP_CSUM; netdev->hw_enc_features |= NETIF_F_IPV6_CSUM; netdev->hw_enc_features |= NETIF_F_TSO; netdev->hw_enc_features |= NETIF_F_TSO6; - netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL; - netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL_CSUM | - NETIF_F_GSO_PARTIAL; + netdev->hw_enc_features |= 
NETIF_F_GSO_PARTIAL; + } + + if (mlx5e_vxlan_allowed(mdev)) { + netdev->hw_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_UDP_TUNNEL | + NETIF_F_GSO_UDP_TUNNEL_CSUM; netdev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM; } + if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) { + netdev->hw_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->hw_enc_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + netdev->gso_partial_features |= NETIF_F_GSO_GRE | + NETIF_F_GSO_GRE_CSUM; + } + mlx5_query_port_fcs(mdev, &fcs_supported, &fcs_enabled); if (fcs_supported) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ae7d09b9c52f..3d5d32e5446c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -602,7 +602,7 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 reserved_at_1a[0x1]; u8 tunnel_lso_const_out_ip_id[0x1]; u8 reserved_at_1c[0x2]; - u8 tunnel_statless_gre[0x1]; + u8 tunnel_stateless_gre[0x1]; u8 tunnel_stateless_vxlan[0x1]; u8 swp[0x1]; -- cgit v1.2.3 From e3bfed1df379c18f20feb06427d952b766e2c00f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 25 Aug 2017 19:53:39 +1000 Subject: KVM: PPC: Book3S HV: Report storage key support to userspace This adds information about storage keys to the struct returned by the KVM_PPC_GET_SMMU_INFO ioctl. The new fields replace a pad field, which was zeroed by previous kernel versions. Thus userspace that knows about the new fields will see zeroes when running on an older kernel, indicating that storage keys are not supported. The size of the structure has not changed. The number of keys is hard-coded for the CPUs supported by HV KVM, which is just POWER7, POWER8 and POWER9. Signed-off-by: Paul Mackerras Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ include/uapi/linux/kvm.h | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1182cfd79857..f62ad2e9085f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3331,6 +3331,14 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, if (radix_enabled()) return -EINVAL; + /* + * POWER7, POWER8 and POWER9 all support 32 storage keys for data. + * POWER7 doesn't support keys for instruction accesses, + * POWER8 and POWER9 do. + */ + info->data_keys = 32; + info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 
32 : 0; + info->flags = KVM_PPC_PAGE_SIZES_REAL; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) info->flags |= KVM_PPC_1T_SEGMENTS; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6cd63c18708a..838887587411 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -711,7 +711,8 @@ struct kvm_ppc_one_seg_page_size { struct kvm_ppc_smmu_info { __u64 flags; __u32 slb_size; - __u32 pad; + __u16 data_keys; /* # storage keys supported for data */ + __u16 instr_keys; /* # storage keys supported for instructions */ struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; }; -- cgit v1.2.3 From 47ebcc0bb1d5eb7f1b1eeab675409ea7f67b4a5c Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Wed, 30 Aug 2017 11:30:39 +0300 Subject: xfrm: Add support for network devices capable of removing the ESP trailer In conjunction with crypto offload [1], removing the ESP trailer by hardware can potentially improve the performance by avoiding (1) a cache miss incurred by reading the nexthdr field and (2) the necessity to calculate the csum value of the trailer in order to keep skb->csum valid. This patch introduces the changes to the xfrm stack and merely serves as an infrastructure. Subsequent patch to mlx5 driver will put this to a good use. [1] https://www.mail-archive.com/netdev@vger.kernel.org/msg175733.html Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/ipv4/esp4.c | 70 ++++++++++++++++++++++++++++++++++----------------- net/ipv6/esp6.c | 51 ++++++++++++++++++++++++++----------- net/xfrm/xfrm_input.c | 5 ++++ 4 files changed, 89 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9c7b70cce6d6..f002a2c5e33c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1019,6 +1019,7 @@ struct xfrm_offload { #define CRYPTO_FALLBACK 8 #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 +#define XFRM_ESP_NO_TRAILER 64 __u32 status; #define CRYPTO_SUCCESS 1 diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 741acd7b9646..319000573bc7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -499,19 +499,59 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) return esp_output_tail(x, skb, &esp); } +static inline int esp_remove_trailer(struct sk_buff *skb) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + struct crypto_aead *aead = x->data; + int alen, hlen, elen; + int padlen, trimlen; + __wsum csumdiff; + u8 nexthdr[2]; + int ret; + + alen = crypto_aead_authsize(aead); + hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + elen = skb->len - hlen; + + if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) { + ret = xo->proto; + goto out; + } + + if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) + BUG(); + + ret = -EINVAL; + padlen = nexthdr[0]; + if (padlen + 2 + alen >= elen) { + net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n", + padlen + 2, elen - alen); + goto out; + } + + trimlen = alen + padlen + 2; + if (skb->ip_summed == CHECKSUM_COMPLETE) { + csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0); + skb->csum = csum_block_sub(skb->csum, csumdiff, + skb->len - trimlen); + } + pskb_trim(skb, skb->len - trimlen); + + ret = nexthdr[1]; + +out: + return ret; +} + int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; - int alen 
= crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); - int elen = skb->len - hlen; int ihl; - u8 nexthdr[2]; - int padlen, trimlen; - __wsum csumdiff; if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) kfree(ESP_SKB_CB(skb)->tmp); @@ -519,16 +559,10 @@ int esp_input_done2(struct sk_buff *skb, int err) if (unlikely(err)) goto out; - if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) - BUG(); - - err = -EINVAL; - padlen = nexthdr[0]; - if (padlen + 2 + alen >= elen) + err = esp_remove_trailer(skb); + if (unlikely(err < 0)) goto out; - /* ... check padding bits here. Silly. :-) */ - iph = ip_hdr(skb); ihl = iph->ihl * 4; @@ -569,22 +603,12 @@ int esp_input_done2(struct sk_buff *skb, int err) skb->ip_summed = CHECKSUM_UNNECESSARY; } - trimlen = alen + padlen + 2; - if (skb->ip_summed == CHECKSUM_COMPLETE) { - csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0); - skb->csum = csum_block_sub(skb->csum, csumdiff, - skb->len - trimlen); - } - pskb_trim(skb, skb->len - trimlen); - skb_pull_rcsum(skb, hlen); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); - err = nexthdr[1]; - /* RFC4303: Drop dummy packets without any error */ if (err == IPPROTO_NONE) err = -EINVAL; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 74bde202eb9a..7fb41b0ad437 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -461,29 +461,30 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) return esp6_output_tail(x, skb, &esp); } -int esp6_input_done2(struct sk_buff *skb, int err) +static inline int esp_remove_trailer(struct sk_buff *skb) { struct xfrm_state *x = xfrm_input_state(skb); struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; - int alen = crypto_aead_authsize(aead); - int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); - int elen = skb->len - hlen; - int hdr_len = skb_network_header_len(skb); + int alen, hlen, elen; int padlen, trimlen; __wsum csumdiff; u8 nexthdr[2]; + int ret; - if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) - kfree(ESP_SKB_CB(skb)->tmp); + alen = crypto_aead_authsize(aead); + hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + elen = skb->len - hlen; - if (unlikely(err)) + if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) { + ret = xo->proto; goto out; + } if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) BUG(); - err = -EINVAL; + ret = -EINVAL; padlen = nexthdr[0]; if (padlen + 2 + alen >= elen) { net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n", @@ -491,26 +492,46 @@ int esp6_input_done2(struct sk_buff *skb, int err) goto out; } - /* ... check padding bits here. Silly. 
:-) */ - trimlen = alen + padlen + 2; if (skb->ip_summed == CHECKSUM_COMPLETE) { - skb_postpull_rcsum(skb, skb_network_header(skb), - skb_network_header_len(skb)); csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0); skb->csum = csum_block_sub(skb->csum, csumdiff, skb->len - trimlen); } pskb_trim(skb, skb->len - trimlen); + ret = nexthdr[1]; + +out: + return ret; +} + +int esp6_input_done2(struct sk_buff *skb, int err) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + struct crypto_aead *aead = x->data; + int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + int hdr_len = skb_network_header_len(skb); + + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); + + if (unlikely(err)) + goto out; + + err = esp_remove_trailer(skb); + if (unlikely(err < 0)) + goto out; + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); - err = nexthdr[1]; - /* RFC4303: Drop dummy packets without any error */ if (err == IPPROTO_NONE) err = -EINVAL; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index f07eec59dcae..2515cd2bc5db 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -247,6 +247,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } + if (xo->status & CRYPTO_INVALID_PROTOCOL) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop; + } + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } -- cgit v1.2.3 From 6606bc9dee63ad8cda2cc310d2ad5992673a785a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 17 Aug 2017 14:50:36 +0800 Subject: pinctrl: Add sleep related state to indicate sleep related configs In some scenarios, we should set some pins as input/output/pullup/pulldown before the system goes into deep sleep mode, so that the hardware can apply these settings automatically once deep sleep is entered. That means some pins are not controlled by any specific driver in the OS, but need to be controlled when entering sleep mode. Thus we introduce a sleep state config into pinconf-generic for users to configure. Signed-off-by: Baolin Wang Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt | 2 ++ drivers/pinctrl/pinconf-generic.c | 2 ++ include/linux/pinctrl/pinconf-generic.h | 2 ++ 3 files changed, 6 insertions(+) (limited to 'include') diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt index 62d0f33fa65e..4483cc31e531 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt @@ -268,6 +268,8 @@ output-enable - enable output on a pin without actively driving it (such as enabling an output buffer) output-low - set the pin to output mode with low level output-high - set the pin to output mode with high level +sleep-hardware-state - indicate this is sleep related state which will be programmed + into the registers for the sleep state.
slew-rate - set the slew rate For example: diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 4cf901c78130..8eaa25c3384f 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -47,6 +47,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_OUTPUT_ENABLE, "output enabled", NULL, false), PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true), + PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), PCONFDUMP(PIN_CONFIG_SLEW_RATE, "slew rate", NULL, true), }; @@ -178,6 +179,7 @@ static const struct pinconf_generic_params dt_params[] = { { "output-high", PIN_CONFIG_OUTPUT, 1, }, { "output-low", PIN_CONFIG_OUTPUT, 0, }, { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, + { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, }; diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index e91d1b6a260d..5d8bc7f21c2a 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -86,6 +86,7 @@ * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power * supplies, the argument to this parameter (on a custom format) tells * the driver which alternative power source to use. + * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to * this parameter (on a custom format) tells the driver which alternative * slew rate to use. @@ -114,6 +115,7 @@ enum pin_config_param { PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT, PIN_CONFIG_POWER_SOURCE, + PIN_CONFIG_SLEEP_HARDWARE_STATE, PIN_CONFIG_SLEW_RATE, PIN_CONFIG_END = 0x7F, PIN_CONFIG_MAX = 0xFF, -- cgit v1.2.3 From 72f9b089ecd2cc2194d27cbb14fd80a0b1472e89 Mon Sep 17 00:00:00 2001 From: Aditya Sarwade Date: Tue, 29 Aug 2017 15:51:29 -0700 Subject: RDMA/vmw_pvrdma: Report network header type in WC We should report the network header type in the work completion so that the kernel can infer the right RoCE type headers. 
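For a consumer, the reported type surfaces through the generic work completion. A minimal
sketch of reading it (hedged: this uses the common ib_wc field and flag names from
include/rdma/ib_verbs.h, and the fallback value is an assumption, not part of this patch):

	static enum rdma_network_type wc_network_type(const struct ib_wc *wc)
	{
		/* Only meaningful when the provider reported the type. */
		if (!(wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE))
			return RDMA_NETWORK_IB;	/* assumed default */

		return (enum rdma_network_type)wc->network_hdr_type;
	}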
Reviewed-by: Bryan Tan Signed-off-by: Aditya Sarwade Signed-off-by: Adit Ranadive Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 1 + include/uapi/rdma/vmw_pvrdma-abi.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 90aa326fd7c0..8a12dc73b68e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -389,6 +389,7 @@ retry: wc->dlid_path_bits = cqe->dlid_path_bits; wc->port_num = cqe->port_num; wc->vendor_err = cqe->vendor_err; + wc->network_hdr_type = cqe->network_hdr_type; /* Update shared ring state */ pvrdma_idx_ring_inc(&cq->ring_state->rx.cons_head, cq->ibcq.cqe); diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index c8c1d2d6df4d..c6569b0032ec 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -125,7 +125,8 @@ enum pvrdma_wc_flags { PVRDMA_WC_IP_CSUM_OK = 1 << 3, PVRDMA_WC_WITH_SMAC = 1 << 4, PVRDMA_WC_WITH_VLAN = 1 << 5, - PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_VLAN, + PVRDMA_WC_WITH_NETWORK_HDR_TYPE = 1 << 6, + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, }; struct pvrdma_alloc_ucontext_resp { @@ -283,7 +284,8 @@ struct pvrdma_cqe { __u8 dlid_path_bits; __u8 port_num; __u8 smac[6]; - __u8 reserved2[7]; /* Pad to next power of 2 (64). */ + __u8 network_hdr_type; + __u8 reserved2[6]; /* Pad to next power of 2 (64). */ }; #endif /* __VMW_PVRDMA_ABI_H__ */ -- cgit v1.2.3 From fac9658cabb98afb68ef1630c558864e6f559c07 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:57 +0300 Subject: IB/core: Add new ioctl interface In this ioctl interface, processing the command starts from properties of the command and fetching the appropriate user objects before calling the handler. Parsing and validation is done according to a specifier declared by the driver's code. In the driver, all supported objects are declared. These objects are separated into different object namespaces. Dividing objects into namespaces is done at initialization by using the higher bits of the object ids. This initialization can mix objects declared in different places into one parsing tree used by this ioctl interface. For each object we list all supported methods. Similarly to objects, methods are separated into method namespaces too. Namespacing is done similarly to the objects case. This could be used in order to add methods to an existing object. Each method has a specific handler, which could be either a default handler or a driver specific handler. Along with the handler, a bunch of attributes are specified as well. Similarly to objects and methods, attributes are namespaced and hashed by their ids at initialization too. All supported attributes are subject to automatic fetching and validation. These attributes include the command, response and the method's related objects' ids. When these entities (objects, methods and attributes) are used, the high bits of the entities' ids are used in order to calculate the hash bucket index. Then, these high bits are masked out in order to have a zero based index. Since we use these high bits for both bucketing and namespacing, we get a compact representation and O(1) array access. This is mandatory for efficient dispatching. Each attribute has a type (PTR_IN, PTR_OUT, IDR and FD) and a length. 
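As an illustration of the bucket math, with the UVERBS_ID_NS_MASK and
UVERBS_ID_NS_SHIFT constants added below (0xF000 and 12), splitting an id is a
mask-and-shift pair; this is a sketch of what the uverbs_ns_idx() helper in
this patch computes:

	u16 bucket = (id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT;
	u16 index  = id & ~UVERBS_ID_NS_MASK;	/* e.g. id 0x3007: bucket 3, index 7 */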
Attributes could be validated through some attributes, like: (*) Minimum size / Exact size (*) Fops for FD (*) Object type for IDR If an IDR/fd attribute is specified, the kernel also states the object type and the required access (NEW, WRITE, READ or DESTROY). All uobject/fd management is done automatically by the infrastructure, meaning - the infrastructure will fail concurrent commands that at least one of them requires concurrent access (WRITE/DESTROY), synchronize actions with device removals (dissociate context events) and take care of reference counting (increase/decrease) for concurrent actions invocation. The reference counts on the actual kernel objects shall be handled by the handlers. objects +--------+ | | | | methods +--------+ | | ns method method_spec +-----+ |len | +--------+ +------+[d]+-------+ +----------------+[d]+------------+ |attr1+-> |type | | object +> |method+-> | spec +-> + attr_buckets +-> |default_chain+--> +-----+ |idr_type| +--------+ +------+ |handler| | | +------------+ |attr2| |access | | | | | +-------+ +----------------+ |driver chain| +-----+ +--------+ | | | | +------------+ | | +------+ | | | | | | | | | | | | | | | | | | | | +--------+ [d] = Hash ids to groups using the high order bits The right types table is also chosen by using the high bits from the ids. Currently we have either default or driver specific groups. Once validation and object fetching (or creation) completed, we call the handler: int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, struct uverbs_attr_bundle *ctx); ctx bundles attributes of different namespaces. Each element there is an array of attributes which corresponds to one namespaces of attributes. For example, in the usually used case: ctx core +----------------------------+ +------------+ | core: +---> | valid | +----------------------------+ | cmd_attr | | driver: | +------------+ |----------------------------+--+ | valid | | | cmd_attr | | +------------+ | | valid | | | obj_attr | | +------------+ | | drivers | +------------+ +> | valid | | cmd_attr | +------------+ | valid | | cmd_attr | +------------+ | valid | | obj_attr | +------------+ Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/rdma_core.c | 46 +++++ drivers/infiniband/core/rdma_core.h | 5 + drivers/infiniband/core/uverbs_ioctl.c | 364 +++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 2 + include/rdma/uverbs_ioctl.h | 101 ++++++++- include/uapi/rdma/rdma_user_ioctl.h | 33 +++ 7 files changed, 543 insertions(+), 10 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_ioctl.c (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 920609a0872e..746756dc9877 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -32,4 +32,4 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 0fe8ef913387..2a2f002ac7cb 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -36,10 +36,56 @@ #include #include #include +#include #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" +int uverbs_ns_idx(u16 *id, unsigned int ns_count) +{ + int ret = (*id & UVERBS_ID_NS_MASK) >> 
UVERBS_ID_NS_SHIFT; + + if (ret >= ns_count) + return -EINVAL; + + *id &= ~UVERBS_ID_NS_MASK; + return ret; +} + +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object) +{ + const struct uverbs_root_spec *object_hash = ibdev->specs_root; + const struct uverbs_object_spec_hash *objects; + int ret = uverbs_ns_idx(&object, object_hash->num_buckets); + + if (ret < 0) + return NULL; + + objects = object_hash->object_buckets[ret]; + + if (object >= objects->num_objects) + return NULL; + + return objects->objects[object]; +} + +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method) +{ + const struct uverbs_method_spec_hash *methods; + int ret = uverbs_ns_idx(&method, object->num_buckets); + + if (ret < 0) + return NULL; + + methods = object->method_buckets[ret]; + if (method >= methods->num_methods) + return NULL; + + return methods->methods[method]; +} + void uverbs_uobject_get(struct ib_uobject *uobject) { kref_get(&uobject->ref); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 9ed6ad0324c7..1efcf93238dd 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -43,6 +43,11 @@ #include #include +int uverbs_ns_idx(u16 *id, unsigned int ns_count); +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object); +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method); /* * These functions initialize the context and cleanups its uobjects. * The context has a list of objects which is protected by a mutex diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 000000000000..5286ad57d903 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_process_attr(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattr, + u16 attr_id, + const struct uverbs_attr_spec_hash *attr_spec_bucket, + struct uverbs_attr_bundle_hash *attr_bundle_h, + struct ib_uverbs_attr __user *uattr_ptr) +{ + const struct uverbs_attr_spec *spec; + struct uverbs_attr *e; + const struct uverbs_object_spec *object; + struct uverbs_obj_attr *o_attr; + struct uverbs_attr *elements = attr_bundle_h->attrs; + + if (uattr->reserved) + return -EINVAL; + + if (attr_id >= attr_spec_bucket->num_attrs) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EINVAL; + else + return 0; + } + + spec = &attr_spec_bucket->attrs[attr_id]; + e = &elements[attr_id]; + e->uattr = uattr_ptr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_PTR_IN: + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < spec->len || + (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && + uattr->len > spec->len)) + return -EINVAL; + + e->ptr_attr.data = uattr->data; + e->ptr_attr.len = uattr->len; + e->ptr_attr.flags = uattr->flags; + break; + + case UVERBS_ATTR_TYPE_IDR: + if (uattr->data >> 32) + return -EINVAL; + /* fall through */ + case UVERBS_ATTR_TYPE_FD: + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + return -EINVAL; + + o_attr = &e->obj_attr; + object = uverbs_get_object(ibdev, spec->obj.obj_type); + if (!object) + return -EINVAL; + o_attr->type = object->type_attrs; + + o_attr->id = (int)uattr->data; + o_attr->uobject = uverbs_get_uobject_from_context( + o_attr->type, + ucontext, + spec->obj.access, + o_attr->id); + + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + + if (spec->obj.access == UVERBS_ACCESS_NEW) { + u64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &e->uattr->data)) { + uverbs_finalize_object(o_attr->uobject, + UVERBS_ACCESS_NEW, + false); + return -EFAULT; + } + } + + break; + default: + return -EOPNOTSUPP; + } + + set_bit(attr_id, attr_bundle_h->valid_bitmap); + return 0; +} + +static int uverbs_uattrs_process(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + const struct uverbs_method_spec *method, + struct uverbs_attr_bundle *attr_bundle, + struct ib_uverbs_attr __user *uattr_ptr) +{ + size_t i; + int ret = 0; + int num_given_buckets = 0; + + for (i = 0; i < num_uattrs; i++) { + const struct ib_uverbs_attr *uattr = &uattrs[i]; + u16 attr_id = uattr->attr_id; + struct uverbs_attr_spec_hash *attr_spec_bucket; + + ret = uverbs_ns_idx(&attr_id, method->num_buckets); + if (ret < 0) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + continue; + } + + /* + * ret is the found ns, so increase num_given_buckets if + * necessary. 
+ */ + if (ret >= num_given_buckets) + num_given_buckets = ret + 1; + + attr_spec_bucket = method->attr_buckets[ret]; + ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, + attr_spec_bucket, &attr_bundle->hash[ret], + uattr_ptr++); + if (ret) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + } + + return num_given_buckets; +} + +static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + unsigned int i; + + for (i = 0; i < attr_bundle->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, + attr_bundle->hash[i].valid_bitmap, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + return 0; +} + +static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + struct ib_device *ibdev, + struct ib_uverbs_file *ufile, + const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + int ret; + int finalize_ret; + int num_given_buckets; + + num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, + num_uattrs, method_spec, + attr_bundle, uattr_ptr); + if (num_given_buckets <= 0) + return -EINVAL; + + attr_bundle->num_buckets = num_given_buckets; + ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); + if (ret) + goto cleanup; + + ret = method_spec->handler(ibdev, ufile, attr_bundle); +cleanup: + finalize_ret = uverbs_finalize_objects(attr_bundle, + method_spec->attr_buckets, + attr_bundle->num_buckets, + !ret); + + return ret ? ret : finalize_ret; +} + +#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 +static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct ib_uverbs_ioctl_hdr *hdr, + void __user *buf) +{ + const struct uverbs_object_spec *object_spec; + const struct uverbs_method_spec *method_spec; + long err = 0; + unsigned int i; + struct { + struct ib_uverbs_attr *uattrs; + struct uverbs_attr_bundle *uverbs_attr_bundle; + } *ctx = NULL; + struct uverbs_attr *curr_attr; + unsigned long *curr_bitmap; + size_t ctx_size; +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; +#endif + + if (hdr->reserved) + return -EINVAL; + + object_spec = uverbs_get_object(ib_dev, hdr->object_id); + if (!object_spec) + return -EOPNOTSUPP; + + method_spec = uverbs_get_method(object_spec, hdr->method_id); + if (!method_spec) + return -EOPNOTSUPP; + + if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) + return -EINVAL; + + ctx_size = sizeof(*ctx) + + sizeof(struct uverbs_attr_bundle) + + sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + + sizeof(*ctx->uattrs) * hdr->num_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * + method_spec->num_child_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * + (method_spec->num_child_attrs / BITS_PER_LONG + + method_spec->num_buckets); + +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) + ctx = (void *)data; + + if (!ctx) +#endif + ctx = kmalloc(ctx_size, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); + ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + + (sizeof(ctx->uverbs_attr_bundle->hash[0]) * + method_spec->num_buckets); + 
curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); + curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + + /* + * We just fill the pointers and num_attrs here. The data itself will be + * filled at a later stage (uverbs_process_attr) + */ + for (i = 0; i < method_spec->num_buckets; i++) { + unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; + + ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; + curr_attr += curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; + bitmap_zero(curr_bitmap, curr_num_attrs); + curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + } + + err = copy_from_user(ctx->uattrs, buf, + sizeof(*ctx->uattrs) * hdr->num_attrs); + if (err) { + err = -EFAULT; + goto out; + } + + err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, + file, method_spec, ctx->uverbs_attr_bundle); +out: +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size > UVERBS_OPTIMIZE_USING_STACK_SZ) +#endif + kfree(ctx); + return err; +} + +#define IB_UVERBS_MAX_CMD_SZ 4096 + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + struct ib_device *ib_dev; + int srcu_key; + long err; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + err = -EIO; + goto out; + } + + if (cmd == RDMA_VERBS_IOCTL) { + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + + if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || + hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { + err = -EINVAL; + goto out; + } + + if (hdr.reserved) { + err = -EOPNOTSUPP; + goto out; + } + + err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, + (__user void *)arg + sizeof(hdr)); + } else { + err = -ENOIOCTLCMD; + } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + return err; +} diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1b4bb8743969..e6df68048517 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2348,6 +2348,8 @@ struct ib_device { void (*get_dev_fw_str)(struct ib_device *, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); + + struct uverbs_root_spec *specs_root; }; struct ib_client { diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index d3ec02b7d937..f83f56329761 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -43,6 +43,8 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_NA, + UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_TYPE_PTR_OUT, UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, }; @@ -54,29 +56,110 @@ enum uverbs_obj_access { UVERBS_ACCESS_DESTROY }; +enum { + UVERBS_ATTR_SPEC_F_MANDATORY = 1U << 0, + /* Support extending attributes by length */ + UVERBS_ATTR_SPEC_F_MIN_SZ = 1U << 1, +}; + struct uverbs_attr_spec { enum uverbs_attr_type type; - struct { - /* - * higher bits mean the namespace and lower bits mean - * the type id within the namespace. - */ - u16 obj_type; - u8 access; - } obj; + union { + u16 len; + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. 
+ */ + u16 obj_type; + u8 access; + } obj; + }; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; }; struct uverbs_attr_spec_hash { size_t num_attrs; + unsigned long *mandatory_attrs_bitmask; struct uverbs_attr_spec attrs[0]; }; +struct uverbs_attr_bundle; +struct ib_uverbs_file; + +enum { + /* + * Action marked with this flag creates a context (or root for all + * objects). + */ + UVERBS_ACTION_FLAG_CREATE_ROOT = 1U << 0, +}; + +struct uverbs_method_spec { + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_buckets; + size_t num_child_attrs; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); + struct uverbs_attr_spec_hash *attr_buckets[0]; +}; + +struct uverbs_method_spec_hash { + size_t num_methods; + struct uverbs_method_spec *methods[0]; +}; + +struct uverbs_object_spec { + const struct uverbs_obj_type *type_attrs; + size_t num_buckets; + struct uverbs_method_spec_hash *method_buckets[0]; +}; + +struct uverbs_object_spec_hash { + size_t num_objects; + struct uverbs_object_spec *objects[0]; +}; + +struct uverbs_root_spec { + size_t num_buckets; + struct uverbs_object_spec_hash *object_buckets[0]; +}; + +/* ================================================= + * Parsing infrastructure + * ================================================= + */ + +struct uverbs_ptr_attr { + union { + u64 data; + void __user *ptr; + }; + u16 len; + /* Combination of bits from enum UVERBS_ATTR_F_XXXX */ + u16 flags; +}; + struct uverbs_obj_attr { + /* pointer to the kernel descriptor -> type, access, etc */ + const struct uverbs_obj_type *type; struct ib_uobject *uobject; + /* fd or id in idr of this object */ + int id; }; struct uverbs_attr { - struct uverbs_obj_attr obj_attr; + /* + * pointer to the user-space given attribute, in order to write the + * new uobject's id or update flags. + */ + struct ib_uverbs_attr __user *uattr; + union { + struct uverbs_ptr_attr ptr_attr; + struct uverbs_obj_attr obj_attr; + }; }; struct uverbs_attr_bundle_hash { diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index 9388125ad51b..165a27e969d5 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -43,6 +43,39 @@ /* Legacy name, for user space application which already use it */ #define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC +#define RDMA_VERBS_IOCTL \ + _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) + +#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +enum { + /* User input */ + UVERBS_ATTR_F_MANDATORY = 1U << 0, + /* + * Valid output bit should be ignored and considered set in + * mandatory fields. This bit is kernel output. 
+ */ + UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, +}; + +struct ib_uverbs_attr { + __u16 attr_id; /* command specific type attribute */ + __u16 len; /* only for pointers */ + __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ + __u16 reserved; + __u64 data; /* ptr to command, inline data or idr/fd */ +}; + +struct ib_uverbs_ioctl_hdr { + __u16 length; + __u16 object_id; + __u16 method_id; + __u16 num_attrs; + __u64 reserved; + struct ib_uverbs_attr attrs[0]; +}; + /* * General blocks assignments * It is closed on purpose do not expose it it user space -- cgit v1.2.3 From 5009010fbf54bdc27e57baca490e1f9d6a4609e0 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:58 +0300 Subject: IB/core: Declare an object instead of declaring only type attributes Switch all uverbs_type_attrs_xxxx with DECLARE_UVERBS_OBJECT macros. This will be later used in order to embed the object specific methods in the objects as well. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 112 +++++++++++++---------------- include/rdma/uverbs_ioctl.h | 16 +++++ include/rdma/uverbs_std_types.h | 40 +++++------ include/rdma/uverbs_types.h | 38 ++++++---- 4 files changed, 107 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index ef293379f37a..b75c7da0d0a4 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -209,67 +209,51 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; -const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = { - .type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0), - .context_closed = uverbs_hot_unplug_completion_event_file, - .fops = &uverbs_event_fops, - .name = "[infinibandevent]", - .flags = O_RDONLY, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_cq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0), - .destroy_object = uverbs_free_cq, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_qp = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0), - .destroy_object = uverbs_free_qp, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_mw = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_mw, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_mr = { - /* 1 is used in order to free the MR after all the MWs */ - .type = UVERBS_TYPE_ALLOC_IDR(1), - .destroy_object = uverbs_free_mr, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_srq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0), - .destroy_object = uverbs_free_srq, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_ah = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_ah, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_flow = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_flow, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_wq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0), - .destroy_object = uverbs_free_wq, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_rwq_ind_tbl, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0), - 
.destroy_object = uverbs_free_xrcd, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_pd = { - /* 2 is used in order to free the PD after MRs */ - .type = UVERBS_TYPE_ALLOC_IDR(2), - .destroy_object = uverbs_free_pd, -}; +DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, + UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); + +DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mw, UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mr, UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); + +DECLARE_UVERBS_OBJECT(uverbs_object_srq, UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_ah, UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + +DECLARE_UVERBS_OBJECT(uverbs_object_flow, UVERBS_OBJECT_FLOW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); + +DECLARE_UVERBS_OBJECT(uverbs_object_wq, UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + +DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index f83f56329761..99130083615e 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -127,6 +127,22 @@ struct uverbs_root_spec { struct uverbs_object_spec_hash *object_buckets[0]; }; +/* + * ======================================= + * Verbs definitions + * ======================================= + */ + +struct uverbs_object_def { + const struct uverbs_obj_type *type_attrs; +}; + +#define _UVERBS_OBJECT(_id, _type_attrs, ...) \ + ((const struct uverbs_object_def) { \ + .type_attrs = _type_attrs}) +#define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) 
\ + const struct uverbs_object_def _name = \ + _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) /* ================================================= * Parsing infrastructure * ================================================= diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 7771ce966952..eda271b4aa6c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -35,18 +35,18 @@ #include -extern const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_cq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_qp; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_wq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_srq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_ah; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_flow; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_mr; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_mw; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_pd; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd; +extern const struct uverbs_object_def uverbs_object_comp_channel; +extern const struct uverbs_object_def uverbs_object_cq; +extern const struct uverbs_object_def uverbs_object_qp; +extern const struct uverbs_object_def uverbs_object_rwq_ind_table; +extern const struct uverbs_object_def uverbs_object_wq; +extern const struct uverbs_object_def uverbs_object_srq; +extern const struct uverbs_object_def uverbs_object_ah; +extern const struct uverbs_object_def uverbs_object_flow; +extern const struct uverbs_object_def uverbs_object_mr; +extern const struct uverbs_object_def uverbs_object_mw; +extern const struct uverbs_object_def uverbs_object_pd; +extern const struct uverbs_object_def uverbs_object_xrcd; static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, bool write, @@ -56,22 +56,22 @@ static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, return rdma_lookup_get_uobject(type, ucontext, id, write); } -#define uobj_get_type(_type) uverbs_type_attrs_##_type.type +#define uobj_get_type(_object) uverbs_object_##_object.type_attrs #define uobj_get_read(_type, _id, _ucontext) \ - __uobj_get(&(_type), false, _ucontext, _id) + __uobj_get(_type, false, _ucontext, _id) -#define uobj_get_obj_read(_type, _id, _ucontext) \ +#define uobj_get_obj_read(_object, _id, _ucontext) \ ({ \ - struct ib_uobject *uobj = \ - __uobj_get(&uobj_get_type(_type), \ + struct ib_uobject *__uobj = \ + __uobj_get(uverbs_object_##_object.type_attrs, \ false, _ucontext, _id); \ \ - (struct ib_##_type *)(IS_ERR(uobj) ? NULL : uobj->object); \ + (struct ib_##_object *)(IS_ERR(__uobj) ? 
NULL : __uobj->object);\ }) #define uobj_get_write(_type, _id, _ucontext) \ - __uobj_get(&(_type), true, _ucontext, _id) + __uobj_get(_type, true, _ucontext, _id) static inline void uobj_put_read(struct ib_uobject *uobj) { @@ -108,7 +108,7 @@ static inline struct ib_uobject *__uobj_alloc(const struct uverbs_obj_type *type } #define uobj_alloc(_type, ucontext) \ - __uobj_alloc(&(_type), ucontext) + __uobj_alloc(_type, ucontext) #endif diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 351ea185df44..9760b6d70744 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -151,22 +151,30 @@ extern const struct uverbs_obj_type_class uverbs_fd_class; #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) -#define UVERBS_TYPE_ALLOC_FD(_size, _order) \ - { \ - .destroy_order = _order, \ - .type_class = &uverbs_fd_class, \ - .obj_size = (_size) + \ - UVERBS_BUILD_BUG_ON((_size) < \ - sizeof(struct ib_uobject_file)),\ - } -#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order) \ - { \ +#define UVERBS_TYPE_ALLOC_FD(_order, _obj_size, _context_closed, _fops, _name, _flags)\ + ((&((const struct uverbs_obj_fd_type) \ + {.type = { \ + .destroy_order = _order, \ + .type_class = &uverbs_fd_class, \ + .obj_size = (_obj_size) + \ + UVERBS_BUILD_BUG_ON((_obj_size) < sizeof(struct ib_uobject_file)), \ + }, \ + .context_closed = _context_closed, \ + .fops = _fops, \ + .name = _name, \ + .flags = _flags}))->type) +#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order, _destroy_object) \ + ((&((const struct uverbs_obj_idr_type) \ + {.type = { \ .destroy_order = _order, \ .type_class = &uverbs_idr_class, \ .obj_size = (_size) + \ - UVERBS_BUILD_BUG_ON((_size) < \ - sizeof(struct ib_uobject)), \ - } -#define UVERBS_TYPE_ALLOC_IDR(_order) \ - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order) + UVERBS_BUILD_BUG_ON((_size) < \ + sizeof(struct ib_uobject)) \ + }, \ + .destroy_object = _destroy_object,}))->type) +#define UVERBS_TYPE_ALLOC_IDR(_order, _destroy_object) \ + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order, \ + _destroy_object) + #endif -- cgit v1.2.3 From 09e3ebf8c193d3f154c4ffb7cb18995df0243bc6 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:59 +0300 Subject: IB/core: Add DEVICE object and root tree structure This adds the DEVICE object. This object supports creating the context that all objects are created from. Moreover, it supports executing methods which are related to the device itself, such as QUERY_DEVICE. This is a singleton object (per file instance). All standard objects are put in the root structure. This root will later on be used in drivers as the source for their whole parsing tree. Later on, when new features are added, these drivers could mix this root with other customized objects. 
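As a sketch, a hypothetical driver that only needs the singleton DEVICE object
plus PDs and CQs could build its own root with the DECLARE_UVERBS_OBJECT_TREE()
macro added below (the tree name here is made up):

	DECLARE_UVERBS_OBJECT_TREE(hypothetical_min_objects,
				   &uverbs_object_device,
				   &uverbs_object_pd,
				   &uverbs_object_cq);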
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 17 +++++++++++++++ include/rdma/uverbs_ioctl.h | 35 ++++++++++++++++++++++++++++++ include/rdma/uverbs_std_types.h | 18 +++++++++++++++ 3 files changed, 70 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index b75c7da0d0a4..5f90978bda8d 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -257,3 +257,20 @@ DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, /* 2 is used in order to free the PD after MRs */ &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_device, UVERBS_OBJECT_DEVICE, NULL); + +DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &uverbs_object_device, + &uverbs_object_pd, + &uverbs_object_mr, + &uverbs_object_comp_channel, + &uverbs_object_cq, + &uverbs_object_qp, + &uverbs_object_ah, + &uverbs_object_mw, + &uverbs_object_srq, + &uverbs_object_flow, + &uverbs_object_wq, + &uverbs_object_rwq_ind_table, + &uverbs_object_xrcd); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 99130083615e..2e8925434d74 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -133,16 +133,51 @@ struct uverbs_root_spec { * ======================================= */ +struct uverbs_attr_def { + u16 id; + struct uverbs_attr_spec attr; +}; + +struct uverbs_method_def { + u16 id; + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_attrs; + const struct uverbs_attr_def * const (*attrs)[]; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); +}; + struct uverbs_object_def { + u16 id; const struct uverbs_obj_type *type_attrs; + size_t num_methods; + const struct uverbs_method_def * const (*methods)[]; +}; + +struct uverbs_object_tree_def { + size_t num_objects; + const struct uverbs_object_def * const (*objects)[]; }; #define _UVERBS_OBJECT(_id, _type_attrs, ...) \ ((const struct uverbs_object_def) { \ + .id = _id, \ .type_attrs = _type_attrs}) #define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) \ const struct uverbs_object_def _name = \ _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) +#define _UVERBS_TREE_OBJECTS_SZ(...) \ + (sizeof((const struct uverbs_object_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_object_def *)) +#define _UVERBS_OBJECT_TREE(...) \ + ((const struct uverbs_object_tree_def) { \ + .num_objects = _UVERBS_TREE_OBJECTS_SZ(__VA_ARGS__), \ + .objects = &(const struct uverbs_object_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_OBJECT_TREE(_name, ...) 
\ + const struct uverbs_object_tree_def _name = \ + _UVERBS_OBJECT_TREE(__VA_ARGS__) + /* ================================================= * Parsing infrastructure * ================================================= diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index eda271b4aa6c..bef74099b7c5 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -35,6 +35,23 @@ #include +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_LAST, +}; + extern const struct uverbs_object_def uverbs_object_comp_channel; extern const struct uverbs_object_def uverbs_object_cq; extern const struct uverbs_object_def uverbs_object_qp; @@ -47,6 +64,7 @@ extern const struct uverbs_object_def uverbs_object_mr; extern const struct uverbs_object_def uverbs_object_mw; extern const struct uverbs_object_def uverbs_object_pd; extern const struct uverbs_object_def uverbs_object_xrcd; +extern const struct uverbs_object_def uverbs_object_device; static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, bool write, -- cgit v1.2.3 From 118620d3686b2d624f9a5019f2f14c64cf50d21a Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:00 +0300 Subject: IB/core: Add uverbs merge trees functionality Different drivers support different features and even a subset of the common uverbs implementation. Currently, this is handled as a bitmask in every driver that represents which kinds of methods it supports, but it doesn't go down to attribute granularity. Moreover, drivers might want to add their specific types, methods and attributes to let their user-space counterparts be exposed to some more efficient abstractions. This means that the existence of different features is validated syntactically via the parsing infrastructure rather than using complex in-handler logic. In order to do that, we allow defining features and abstractions as parsing trees. These per-feature parsing trees can be merged into an efficient (perfect-hash based) parsing tree, which is later used by the parsing infrastructure. To sum it up, this makes a parse tree unique for a device, representing only the features this particular device supports. This is done by having a root specification tree per feature. Before a device registers itself as an IB device, it merges all these trees into one parsing tree. This parsing tree is used to parse all user-space commands. A future user-space application could read this parse tree. This tree represents which objects, methods and attributes are supported by this device. 
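As a rough registration-time sketch (the driver tree name is hypothetical;
uverbs_alloc_spec_tree()/uverbs_free_spec_tree() and ib_dev->specs_root are the
pieces added by this series):

	static const struct uverbs_object_tree_def *trees[] = {
		&uverbs_default_objects,	/* common objects and methods */
		&hypothetical_drv_tree,		/* driver specific extensions */
	};

	ib_dev->specs_root = uverbs_alloc_spec_tree(ARRAY_SIZE(trees), trees);
	if (IS_ERR(ib_dev->specs_root))
		return PTR_ERR(ib_dev->specs_root);
	/* on teardown: uverbs_free_spec_tree(ib_dev->specs_root); */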
This is based on the idea of Jason Gunthorpe Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/uverbs_ioctl_merge.c | 665 +++++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 40 +- 3 files changed, 706 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_ioctl_merge.c (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 746756dc9877..b4df164f71a6 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -32,4 +32,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o uverbs_ioctl.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_ioctl_merge.o diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c new file mode 100644 index 000000000000..76ddb6564578 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "uverbs.h" + +#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) +#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) +#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) + +#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ + buckets_offset) \ + for (tmpj = 0, \ + elem = (*(const void ***)((hashes)[tmpi] + \ + (buckets_offset)))[0]; \ + tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ + tmpj++) \ + if ((elem = ((*(const void ***)(hashes[tmpi] + \ + (buckets_offset)))[tmpj]))) + +/* + * Iterate all elements of a few @hashes. The number of given hashes is + * indicated by @num_hashes. The offset of the number of buckets in the hash is + * represented by @num_buckets_offset, while the offset of the buckets array in + * the hash structure is represented by @buckets_offset. tmpi and tmpj are two + * short (or int) based indices that are given by the user. tmpi iterates over + * the different hashes. 
@elem points the current element in the hashes[tmpi] + * bucket we are looping on. To be honest, @hashes representation isn't exactly + * a hash, but more a collection of elements. These elements' ids are treated + * in a hash like manner, where the first upper bits are the bucket number. + * These elements are later mapped into a perfect-hash. + */ +#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ + num_buckets_offset, buckets_offset) \ + for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ + _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ + buckets_offset) + +#define get_elements_iterators_entry_above(iters, num_elements, elements, \ + num_objects_fld, objects_fld, bucket,\ + min_id) \ + get_elements_above_id((const void **)iters, num_elements, \ + (const void **)(elements), \ + offsetof(typeof(**elements), \ + num_objects_fld), \ + offsetof(typeof(**elements), objects_fld),\ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket, min_id) + +#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ + get_elements_iterators_entry_above(iters, num_trees, trees, \ + num_objects, objects, bucket, min_id) + +#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(method_iters, num_iters, iters, \ + num_methods, methods, bucket, min_id) + +#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ + num_attrs, attrs, bucket, min_id) + +/* + * get_elements_above_id get a few hashes represented by @elements and + * @num_elements. The hashes fields are described by @num_offset, @data_offset + * and @id_offset in the same way as required by for_each_element. The function + * returns an array of @iters, represents an array of elements in the hashes + * buckets, which their ids are the smallest ids in all hashes but are all + * larger than the id given by min_id. Elements are only added to the iters + * array if their id belongs to the bucket @bucket. The number of elements in + * the returned array is returned by the function. @min_id is also updated to + * reflect the new min_id of all elements in iters. + */ +static size_t get_elements_above_id(const void **iters, + unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket, + short *min_id) +{ + size_t num_iters = 0; + short min = SHRT_MAX; + const void *elem; + int i, j, last_stored = -1; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) != bucket) + continue; + + if (GET_ID(id) < *min_id || + (min != SHRT_MAX && GET_ID(id) > min)) + continue; + + /* + * We first iterate all hashes represented by @elements. When + * we do, we try to find an element @elem in the bucket @bucket + * which its id is min. Since we can't ensure the user sorted + * the elements in increasing order, we override this hash's + * minimal id element we found, if a new element with a smaller + * id was just found. + */ + iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; + last_stored = i; + min = GET_ID(id); + } + + /* + * We only insert to our iters array an element, if its id is smaller + * than all previous ids. Therefore, the final iters array is sorted so + * that smaller ids are in the end of the array. 
+ * Therefore, we need to clean the beginning of the array to make sure + * all ids of final elements are equal to min. + */ + for (i = num_iters - 1; i >= 0 && + GET_ID(*(u16 *)(iters[i] + id_offset)) == min; i--) + ; + + num_iters -= i + 1; + memmove(iters, iters + i + 1, sizeof(*iters) * num_iters); + + *min_id = min; + return num_iters; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +static short find_max_element_ns_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset) +{ + short max_ns = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) > max_ns) + max_ns = GET_NS_ID(id); + } + + return max_ns; +} + +static short find_max_element_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket) +{ + short max_id = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) == bucket && + GET_ID(id) > max_id) + max_id = GET_ID(id); + } + return max_id; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +#define find_max_element_ns_entry_id(num_elements, elements, \ + num_objects_fld, objects_fld) \ + find_max_element_ns_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld),\ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id)) + +/* + * find_max_xxxx_ns_id gets a few elements. Each element is described by an id + * which its upper bits represents a namespace. It finds the max namespace. This + * could be used in order to know how many buckets do we need to allocate. If no + * elements exist, SHRT_MIN is returned. Namespace represents here different + * buckets. The common example is "common bucket" and "driver bucket". + * + * find_max_xxxx_id gets a few elements and a bucket. Each element is described + * by an id which its upper bits represent a namespace. It returns the max id + * which is contained in the same namespace defined in @bucket. This could be + * used in order to know how many elements do we need to allocate in the bucket. + * If no elements exist, SHRT_MIN is returned. 
+ */ + +#define find_max_object_id(num_trees, trees, bucket) \ + find_max_element_entry_id(num_trees, trees, num_objects,\ + objects, bucket) +#define find_max_object_ns_id(num_trees, trees) \ + find_max_element_ns_entry_id(num_trees, trees, \ + num_objects, objects) + +#define find_max_method_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_methods,\ + methods, bucket) +#define find_max_method_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_methods, methods) + +#define find_max_attr_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_attrs, \ + attrs, bucket) +#define find_max_attr_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_attrs, attrs) + +static void free_method(struct uverbs_method_spec *method) +{ + unsigned int i; + + if (!method) + return; + + for (i = 0; i < method->num_buckets; i++) + kfree(method->attr_buckets[i]); + + kfree(method); +} + +#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ + (attr)->type == UVERBS_ATTR_TYPE_FD) + +/* + * This function gets array of size @num_method_defs which contains pointers to + * method definitions @method_defs. The function allocates an + * uverbs_method_spec structure and initializes its number of buckets and the + * elements in buckets to the correct attributes. While doing that, it + * validates that there aren't conflicts between attributes of different + * method_defs. + */ +static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, + size_t num_method_defs) +{ + int bucket_idx; + int max_attr_buckets = 0; + size_t num_attr_buckets = 0; + int res = 0; + struct uverbs_method_spec *method = NULL; + const struct uverbs_attr_def **attr_defs; + unsigned int num_of_singularities = 0; + + max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); + if (max_attr_buckets >= 0) + num_attr_buckets = max_attr_buckets + 1; + + method = kzalloc(sizeof(*method) + + num_attr_buckets * sizeof(*method->attr_buckets), + GFP_KERNEL); + if (!method) + return ERR_PTR(-ENOMEM); + + method->num_buckets = num_attr_buckets; + attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); + if (!attr_defs) { + res = -ENOMEM; + goto free_method; + } + for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int attr_max_bucket = 0; + struct uverbs_attr_spec_hash *hash = NULL; + + attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, + bucket_idx); + if (attr_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), + sizeof(long)) + + BITS_TO_LONGS(attr_max_bucket) * sizeof(long), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_attrs = attr_max_bucket + 1; + method->num_child_attrs += hash->num_attrs; + hash->mandatory_attrs_bitmask = (void *)(hash + 1) + + ALIGN(sizeof(*hash->attrs) * + (attr_max_bucket + 1), + sizeof(long)); + + method->attr_buckets[bucket_idx] = hash; + + do { + size_t num_attr_defs; + struct uverbs_attr_spec *attr; + bool attr_obj_with_special_access; + + num_attr_defs = + get_attrs_above_id(attr_defs, + num_method_defs, + method_defs, + bucket_idx, + &min_id); + /* Last attr in bucket */ + if (!num_attr_defs) + break; + + if (num_attr_defs > 1) { + /* + * We don't allow two attribute definitions for + * the same attribute. This is usually a + * programmer error. 
If required, it's better to + * just add a new attribute to capture the new + * semantics. + */ + res = -EEXIST; + goto free; + } + + attr = &hash->attrs[min_id]; + memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); + + attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && + (attr->obj.access == UVERBS_ACCESS_NEW || + attr->obj.access == UVERBS_ACCESS_DESTROY); + num_of_singularities += !!attr_obj_with_special_access; + if (WARN(num_of_singularities > 1, + "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", + min_id) || + WARN(attr_obj_with_special_access && + !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), + "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy aceess but isn't mandatory\n", + min_id) || + WARN(IS_ATTR_OBJECT(attr) && + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ, + "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", + min_id)) { + res = -EINVAL; + goto free; + } + + if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) + set_bit(min_id, hash->mandatory_attrs_bitmask); + min_id++; + + } while (1); + } + kfree(attr_defs); + return method; + +free: + kfree(attr_defs); +free_method: + free_method(method); + return ERR_PTR(res); +} + +static void free_object(struct uverbs_object_spec *object) +{ + unsigned int i, j; + + if (!object) + return; + + for (i = 0; i < object->num_buckets; i++) { + struct uverbs_method_spec_hash *method_buckets = + object->method_buckets[i]; + + if (!method_buckets) + continue; + + for (j = 0; j < method_buckets->num_methods; j++) + free_method(method_buckets->methods[j]); + + kfree(method_buckets); + } + + kfree(object); +} + +/* + * This function gets array of size @num_object_defs which contains pointers to + * object definitions @object_defs. The function allocated an + * uverbs_object_spec structure and initialize its number of buckets and the + * elements in buckets to the correct methods. While doing that, it + * sorts out the correct relationship between conflicts in the same method. 
+ */ +static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, + size_t num_object_defs) +{ + u16 bucket_idx; + int max_method_buckets = 0; + u16 num_method_buckets = 0; + int res = 0; + struct uverbs_object_spec *object = NULL; + const struct uverbs_method_def **method_defs; + + max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); + if (max_method_buckets >= 0) + num_method_buckets = max_method_buckets + 1; + + object = kzalloc(sizeof(*object) + + num_method_buckets * + sizeof(*object->method_buckets), GFP_KERNEL); + if (!object) + return ERR_PTR(-ENOMEM); + + object->num_buckets = num_method_buckets; + method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); + if (!method_defs) { + res = -ENOMEM; + goto free_object; + } + + for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int methods_max_bucket = 0; + struct uverbs_method_spec_hash *hash = NULL; + + methods_max_bucket = find_max_method_id(num_object_defs, object_defs, + bucket_idx); + if (methods_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->methods) * (methods_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + + hash->num_methods = methods_max_bucket + 1; + object->method_buckets[bucket_idx] = hash; + + do { + size_t num_method_defs; + struct uverbs_method_spec *method; + int i; + + num_method_defs = + get_methods_above_id(method_defs, + num_object_defs, + object_defs, + bucket_idx, + &min_id); + /* Last method in bucket */ + if (!num_method_defs) + break; + + method = build_method_with_attrs(method_defs, + num_method_defs); + if (IS_ERR(method)) { + res = PTR_ERR(method); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous method handler. + * Therefore, we iterate backwards and search for the + * first handler which != NULL. This also defines the + * set of flags used for this handler. + */ + for (i = num_object_defs - 1; + i >= 0 && !method_defs[i]->handler; i--) + ; + hash->methods[min_id++] = method; + /* NULL handler isn't allowed */ + if (WARN(i < 0, + "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", + min_id)) { + res = -EINVAL; + goto free; + } + method->handler = method_defs[i]->handler; + method->flags = method_defs[i]->flags; + + } while (1); + } + kfree(method_defs); + return object; + +free: + kfree(method_defs); +free_object: + free_object(object); + return ERR_PTR(res); +} + +void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ + unsigned int i, j; + + if (!root) + return; + + for (i = 0; i < root->num_buckets; i++) { + struct uverbs_object_spec_hash *object_hash = + root->object_buckets[i]; + + if (!object_hash) + continue; + + for (j = 0; j < object_hash->num_objects; j++) + free_object(object_hash->objects[j]); + + kfree(object_hash); + } + + kfree(root); +} +EXPORT_SYMBOL(uverbs_free_spec_tree); + +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + u16 bucket_idx; + short max_object_buckets = 0; + size_t num_objects_buckets = 0; + struct uverbs_root_spec *root_spec = NULL; + const struct uverbs_object_def **object_defs; + int i; + int res = 0; + + max_object_buckets = find_max_object_ns_id(num_trees, trees); + /* + * Devices which don't want to support ib_uverbs, should just allocate + * an empty parsing tree. 
Every user-space command won't hit any valid + * entry in the parsing tree and thus will fail. + */ + if (max_object_buckets >= 0) + num_objects_buckets = max_object_buckets + 1; + + root_spec = kzalloc(sizeof(*root_spec) + + num_objects_buckets * sizeof(*root_spec->object_buckets), + GFP_KERNEL); + if (!root_spec) + return ERR_PTR(-ENOMEM); + root_spec->num_buckets = num_objects_buckets; + + object_defs = kcalloc(num_trees, sizeof(*object_defs), + GFP_KERNEL); + if (!object_defs) { + res = -ENOMEM; + goto free_root; + } + + for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + short objects_max_bucket; + struct uverbs_object_spec_hash *hash = NULL; + + objects_max_bucket = find_max_object_id(num_trees, trees, + bucket_idx); + if (objects_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->objects) * (objects_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_objects = objects_max_bucket + 1; + root_spec->object_buckets[bucket_idx] = hash; + + do { + size_t num_object_defs; + struct uverbs_object_spec *object; + + num_object_defs = get_objects_above_id(object_defs, + num_trees, + trees, + bucket_idx, + &min_id); + /* Last object in bucket */ + if (!num_object_defs) + break; + + object = build_object_with_methods(object_defs, + num_object_defs); + if (IS_ERR(object)) { + res = PTR_ERR(object); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous object's type_attrs. + * Therefore, we iterate backwards and search for the + * first type_attrs which != NULL. + */ + for (i = num_object_defs - 1; + i >= 0 && !object_defs[i]->type_attrs; i--) + ; + /* + * NULL is a valid type_attrs. It means an object we + * can't instantiate (like DEVICE). + */ + object->type_attrs = i < 0 ? NULL : + object_defs[i]->type_attrs; + + hash->objects[min_id++] = object; + } while (1); + } + + kfree(object_defs); + return root_spec; + +free: + kfree(object_defs); +free_root: + uverbs_free_spec_tree(root_spec); + return ERR_PTR(res); +} +EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 2e8925434d74..cf5b238d2d81 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -235,5 +235,43 @@ static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_ return test_bit(idx, attrs_hash->valid_bitmap); } -#endif +/* ================================================= + * Definitions -> Specs infrastructure + * ================================================= + */ + +/* + * uverbs_alloc_spec_tree - Merges different common and driver specific feature + * into one parsing tree that every uverbs command will be parsed upon. + * + * @num_trees: Number of trees in the array @trees. + * @trees: Array of pointers to tree root definitions to merge. Each such tree + * possibly contains objects, methods and attributes definitions. + * + * Returns: + * uverbs_root_spec *: The root of the merged parsing tree. + * On error, we return an error code. Error is checked via IS_ERR. + * + * The following merges could take place: + * a. Two trees representing the same method with different handler + * -> We take the handler of the tree that its handler != NULL + * and its index in the trees array is greater. The incentive for that + * is that developers are expected to first merge common trees and then + * merge trees that gives specialized the behaviour. + * b. 
Two trees representing the same object with different + * type_attrs (struct uverbs_obj_type): + * -> We take the type_attrs of the tree that its type_attr != NULL + * and its index in the trees array is greater. This could be used + * in order to override the free function, allocation size, etc. + * c. Two trees representing the same method attribute (same id but possibly + * different attributes): + * -> ERROR (-ENOENT), we believe that's not the programmer's intent. + * + * An object without any methods is considered invalid and will abort the + * function with -ENOENT error. + */ +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees); +void uverbs_free_spec_tree(struct uverbs_root_spec *root); +#endif -- cgit v1.2.3 From 3541030650c0ddb5d52163082fee427b2a453799 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:01 +0300 Subject: IB/core: Add macros for declaring methods and attributes This patch adds macros for declaring objects, methods and attributes. These definitions are later used by downstream patches to declare some of the default types. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- include/rdma/uverbs_ioctl.h | 105 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index cf5b238d2d81..9a8d217cdc1d 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -34,6 +34,8 @@ #define _UVERBS_IOCTL_ #include +#include +#include /* * ======================================= @@ -160,10 +162,99 @@ struct uverbs_object_tree_def { const struct uverbs_object_def * const (*objects)[]; }; +#define UA_FLAGS(_flags) .flags = _flags +#define __UVERBS_ATTR0(_id, _len, _type, ...) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {.type = _type, {.len = _len}, .flags = 0, } }) +#define __UVERBS_ATTR1(_id, _len, _type, _flags) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {.type = _type, {.len = _len}, _flags, } }) +#define __UVERBS_ATTR(_id, _len, _type, _flags, _n, ...) \ + __UVERBS_ATTR##_n(_id, _len, _type, _flags) +/* + * In new compiler, UVERBS_ATTR could be simplified by declaring it as + * [_id] = {.type = _type, .len = _len, ##__VA_ARGS__} + * But since we support older compilers too, we need the more complex code. + */ +#define UVERBS_ATTR(_id, _len, _type, ...) \ + __UVERBS_ATTR(_id, _len, _type, ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_PTR_IN_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_IN, ##__VA_ARGS__) +/* If sizeof(_type) <= sizeof(u64), this will be inlined rather than a pointer */ +#define UVERBS_ATTR_PTR_IN(_id, _type, ...) \ + UVERBS_ATTR_PTR_IN_SZ(_id, sizeof(_type), ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_OUT, ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT(_id, _type, ...) \ + UVERBS_ATTR_PTR_OUT_SZ(_id, sizeof(_type), ##__VA_ARGS__) + +/* + * In new compiler, UVERBS_ATTR_IDR (and FD) could be simplified by declaring + * it as + * {.id = _id, \ + * .attr {.type = __obj_class, \ + * .obj = {.obj_type = _idr_type, \ + * .access = _access \ + * }, ##__VA_ARGS__ } } + * But since we support older compilers too, we need the more complex code. 
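+ *
+ * As a purely illustrative sketch (the attribute id below is made up), an
+ * IDR-based handle attribute is declared along the lines of:
+ *
+ *	UVERBS_ATTR_IDR(EXAMPLE_FOO_HANDLE, UVERBS_OBJECT_PD,
+ *			UVERBS_ACCESS_READ,
+ *			UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))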
+ */ +#define ___UVERBS_ATTR_OBJ0(_id, _obj_class, _obj_type, _access, ...)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = {.type = _obj_class, \ + {.obj = {.obj_type = _obj_type, .access = _access } },\ + .flags = 0} }) +#define ___UVERBS_ATTR_OBJ1(_id, _obj_class, _obj_type, _access, _flags)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = {.type = _obj_class, \ + {.obj = {.obj_type = _obj_type, .access = _access} }, \ + _flags} }) +#define ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, _flags, \ + _n, ...) \ + ___UVERBS_ATTR_OBJ##_n(_id, _obj_class, _obj_type, _access, _flags) +#define __UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, ...) \ + ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, \ + ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_IDR(_id, _idr_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_IDR, _idr_type, _access,\ + ##__VA_ARGS__) +#define UVERBS_ATTR_FD(_id, _fd_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_FD, _fd_type, \ + (_access) + BUILD_BUG_ON_ZERO( \ + (_access) != UVERBS_ACCESS_NEW && \ + (_access) != UVERBS_ACCESS_READ), \ + ##__VA_ARGS__) +#define DECLARE_UVERBS_ATTR_SPEC(_name, ...) \ + const struct uverbs_attr_def _name = __VA_ARGS__ + +#define _UVERBS_METHOD_ATTRS_SZ(...) \ + (sizeof((const struct uverbs_attr_def * const []){__VA_ARGS__}) /\ + sizeof(const struct uverbs_attr_def *)) +#define _UVERBS_METHOD(_id, _handler, _flags, ...) \ + ((const struct uverbs_method_def) { \ + .id = _id, \ + .flags = _flags, \ + .handler = _handler, \ + .num_attrs = _UVERBS_METHOD_ATTRS_SZ(__VA_ARGS__), \ + .attrs = &(const struct uverbs_attr_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_METHOD(_name, _id, _handler, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, 0, ##__VA_ARGS__) +#define DECLARE_UVERBS_CTX_METHOD(_name, _id, _handler, _flags, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, \ + UVERBS_ACTION_FLAG_CREATE_ROOT, \ + ##__VA_ARGS__) +#define _UVERBS_OBJECT_METHODS_SZ(...) \ + (sizeof((const struct uverbs_method_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_method_def *)) #define _UVERBS_OBJECT(_id, _type_attrs, ...) \ ((const struct uverbs_object_def) { \ .id = _id, \ - .type_attrs = _type_attrs}) + .type_attrs = _type_attrs, \ + .num_methods = _UVERBS_OBJECT_METHODS_SZ(__VA_ARGS__), \ + .methods = &(const struct uverbs_method_def * const []){__VA_ARGS__} }) #define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) 
\
	const struct uverbs_object_def _name =			\
		_UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__)
@@ -235,6 +326,18 @@ static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_
 	return test_bit(idx, attrs_hash->valid_bitmap);
 }
 
+static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_bundle,
+					unsigned int idx)
+{
+	u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT;
+
+	if (attrs_bundle->num_buckets <= idx_bucket)
+		return false;
+
+	return uverbs_attr_is_valid_in_hash(&attrs_bundle->hash[idx_bucket],
+					    idx & ~UVERBS_ID_NS_MASK);
+}
+
 /* =================================================
  * Definitions -> Specs infrastructure
  * =================================================
--
cgit v1.2.3

From 4da70da23e9ba03f7f9e067fbe0eec6ebbfee401 Mon Sep 17 00:00:00 2001
From: Matan Barak
Date: Thu, 3 Aug 2017 16:07:02 +0300
Subject: IB/core: Explicitly destroy an object while keeping uobject

When some objects are destroyed, we need to extract their status at
destruction. After the object's destruction, this status (e.g.
events_reported) resides in the uobject. In order to have the latest and
correct status, the underlying object should be destroyed, but we should
keep the uobject alive and read this information off the uobject.

We introduce an rdma_explicit_destroy function. This function destroys
the class type object (for example, the IDR class type which destroys
the underlying object as well) and then converts the uobject to be of a
null class type. This uobject will then be destroyed as any other
uobject once uverbs_finalize_object[s] is called.

Signed-off-by: Matan Barak
Reviewed-by: Yishai Hadas
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/rdma_core.c | 35 +++++++++++++++++++++++++++++++++++
 include/rdma/uverbs_types.h | 1 +
 2 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 2a2f002ac7cb..85b5ee4defa4 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -451,6 +451,41 @@ int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj)
 	return ret;
 }
 
+static int null_obj_type_class_remove_commit(struct ib_uobject *uobj,
+					     enum rdma_remove_reason why)
+{
+	return 0;
+}
+
+static const struct uverbs_obj_type null_obj_type = {
+	.type_class = &((const struct uverbs_obj_type_class){
+			.remove_commit = null_obj_type_class_remove_commit,
+			/* be cautious */
+			.needs_kfree_rcu = true}),
+};
+
+int rdma_explicit_destroy(struct ib_uobject *uobject)
+{
+	int ret;
+	struct ib_ucontext *ucontext = uobject->context;
+
+	/* Cleanup is running.
Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + lockdep_check(uobject, true); + ret = uobject->type->type_class->remove_commit(uobject, + RDMA_REMOVE_DESTROY); + if (ret) + return ret; + + uobject->type = &null_obj_type; + + up_read(&ucontext->cleanup_rwsem); + return 0; +} + static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { uverbs_uobject_add(uobj); diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 9760b6d70744..cc04ec65588d 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -129,6 +129,7 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, void rdma_alloc_abort_uobject(struct ib_uobject *uobj); int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj); int rdma_alloc_commit_uobject(struct ib_uobject *uobj); +int rdma_explicit_destroy(struct ib_uobject *uobject); struct uverbs_obj_fd_type { /* -- cgit v1.2.3 From 64b19e1323e96c34af7ca90d1954e70890c7a98e Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:03 +0300 Subject: IB/core: Export ioctl enum types to user-space Add a new ib_user_ioctl_verbs.h which exports all required ABI enums and structs to the user-space. Export the default types to user-space through this file. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- include/rdma/uverbs_std_types.h | 18 +---------- include/uapi/rdma/ib_user_ioctl_verbs.h | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 include/uapi/rdma/ib_user_ioctl_verbs.h (limited to 'include') diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index bef74099b7c5..400efe2a4d3c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,23 +34,7 @@ #define _UVERBS_STD_TYPES__ #include - -enum uverbs_default_objects { - UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ - UVERBS_OBJECT_PD, - UVERBS_OBJECT_COMP_CHANNEL, - UVERBS_OBJECT_CQ, - UVERBS_OBJECT_QP, - UVERBS_OBJECT_SRQ, - UVERBS_OBJECT_AH, - UVERBS_OBJECT_MR, - UVERBS_OBJECT_MW, - UVERBS_OBJECT_FLOW, - UVERBS_OBJECT_XRCD, - UVERBS_OBJECT_RWQ_IND_TBL, - UVERBS_OBJECT_WQ, - UVERBS_OBJECT_LAST, -}; +#include extern const struct uverbs_object_def uverbs_object_comp_channel; extern const struct uverbs_object_def uverbs_object_cq; diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h new file mode 100644 index 000000000000..78a2e5be4d6e --- /dev/null +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_USER_IOCTL_VERBS_H +#define IB_USER_IOCTL_VERBS_H + +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_LAST, +}; + +#endif + -- cgit v1.2.3 From d70724f149b107f8e4062320270d3d8b6713a1bb Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:04 +0300 Subject: IB/core: Add legacy driver's user-data In this phase, we don't want to change all the drivers to use flexible driver's specific attributes. Therefore, we add two default attributes: UHW_IN and UHW_OUT. These attributes are optional in some methods and they encode the driver specific command data. We add a function that extract this data and creates the legacy udata over it. Driver's data should start from UVERBS_UDATA_DRIVER_DATA_FLAG. This turns on the first bit of the namespace, indicating this attribute belongs to the driver's namespace. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 40 ++++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 46 ++++++++++++++++++++++++++++++ include/uapi/rdma/ib_user_ioctl_verbs.h | 10 +++++++ 3 files changed, 96 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 5f90978bda8d..db66c18857e4 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -209,6 +209,46 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; +/* + * This spec is used in order to pass information to the hardware driver in a + * legacy way. Every verb that could get driver specific data should get this + * spec. + */ +static const struct uverbs_attr_def uverbs_uhw_compat_in = + UVERBS_ATTR_PTR_IN_SZ(UVERBS_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); +static const struct uverbs_attr_def uverbs_uhw_compat_out = + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); + +static void create_udata(struct uverbs_attr_bundle *ctx, + struct ib_udata *udata) +{ + /* + * This is for ease of conversion. The purpose is to convert all drivers + * to use uverbs_attr_bundle instead of ib_udata. + * Assume attr == 0 is input and attr == 1 is output. 
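+	 * In practice these are the UVERBS_UHW_IN and UVERBS_UHW_OUT ids in
+	 * the driver-data namespace, fetched via uverbs_attr_get() below.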
+ */ + void __user *inbuf; + size_t inbuf_len = 0; + void __user *outbuf; + size_t outbuf_len = 0; + const struct uverbs_attr *uhw_in = + uverbs_attr_get(ctx, UVERBS_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(ctx, UVERBS_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + inbuf = uhw_in->ptr_attr.ptr; + inbuf_len = uhw_in->ptr_attr.len; + } + + if (!IS_ERR(uhw_out)) { + outbuf = uhw_out->ptr_attr.ptr; + outbuf_len = uhw_out->ptr_attr.len; + } + + INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); +} + DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, UVERBS_OBJECT_COMP_CHANNEL, &UVERBS_TYPE_ALLOC_FD(0, diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9a8d217cdc1d..759afa0621ea 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -36,6 +36,7 @@ #include #include #include +#include /* * ======================================= @@ -338,6 +339,51 @@ static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_b idx & ~UVERBS_ID_NS_MASK); } +static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (!uverbs_attr_is_valid(attrs_bundle, idx)) + return ERR_PTR(-ENOENT); + + return &attrs_bundle->hash[idx_bucket].attrs[idx & ~UVERBS_ID_NS_MASK]; +} + +static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, const void *from) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + u16 flags; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + flags = attr->ptr_attr.flags | UVERBS_ATTR_F_VALID_OUTPUT; + return (!copy_to_user(attr->ptr_attr.ptr, from, attr->ptr_attr.len) && + !put_user(flags, &attr->uattr->flags)) ? 0 : -EFAULT; +} + +static inline int _uverbs_copy_from(void *to, size_t to_size, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + if (to_size <= sizeof(((struct ib_uverbs_attr *)0)->data)) + memcpy(to, &attr->ptr_attr.data, attr->ptr_attr.len); + else if (copy_from_user(to, attr->ptr_attr.ptr, attr->ptr_attr.len)) + return -EFAULT; + + return 0; +} + +#define uverbs_copy_from(to, attrs_bundle, idx) \ + _uverbs_copy_from(to, sizeof(*(to)), attrs_bundle, idx) + /* ================================================= * Definitions -> Specs infrastructure * ================================================= diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 78a2e5be4d6e..90f81eeca35b 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -33,6 +33,11 @@ #ifndef IB_USER_IOCTL_VERBS_H #define IB_USER_IOCTL_VERBS_H +#include + +#define UVERBS_UDATA_DRIVER_DATA_NS 1 +#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) + enum uverbs_default_objects { UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ UVERBS_OBJECT_PD, @@ -50,5 +55,10 @@ enum uverbs_default_objects { UVERBS_OBJECT_LAST, }; +enum { + UVERBS_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_UHW_OUT, +}; + #endif -- cgit v1.2.3 From 9ee79fce364216df35ec46e26d20780c3c1644cc Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:05 +0300 Subject: IB/core: Add completion queue (cq) object actions Adding CQ ioctl actions: 1. create_cq 2. destroy_cq This requires adding the following: 1. 
A specification describing the method a. Handler b. Attributes specification Each attribute is one of the following: a. PTR_IN - input data Note: This could be encoded inlined for data < 64bit b. PTR_OUT - response data c. IDR - idr based object d. FD - fd based object Blobs attributes (clauses a and b) contain their type, while objects specifications (clauses c and d) contains the expected object type (for example, the given id should be UVERBS_TYPE_PD) and the required access (READ, WRITE, NEW or DESTROY). If a NEW is required, the new object's id will be assigned to this attribute. All attributes could get UA_FLAGS attribute. Currently we support stating that an attribute is mandatory or that the specification size corresponds to a lower bound (and that this attribute could be extended). We currently add both default attributes and the two generic UHW_IN and UHW_OUT driver specific attributes. 2. Handler A handler gets a uverbs_attr_bundle. The handler developer uses uverbs_attr_get to fetch an attribute of a given id. Each of these attribute groups correspond to the specification group defined in the action (clauses 1.b and 1.c respectively). The indices of these arrays corresponds to the attribute ids declared in the specifications (clause 2). The handler is quite simple. It assumes the infrastructure fetched all objects and locked, created or destroyed them as required by the specification. Pointer (or blob) attributes were validated to match their required sizes. After the handler finished, the infrastructure commits or rollbacks the objects. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 138 ++++++++++++++++++++++++++++- include/uapi/rdma/ib_user_ioctl_verbs.h | 20 +++++ 2 files changed, 157 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index db66c18857e4..0a98579700ec 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -249,6 +249,140 @@ static void create_udata(struct uverbs_attr_bundle *ctx, INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); } +static int uverbs_create_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, CREATE_CQ_USER_HANDLE); + if (ret) + return ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (uverbs_copy_from(&attr.flags, attrs, CREATE_CQ_FLAGS) == -EFAULT) + return -EFAULT; + + ev_file_attr = uverbs_attr_get(attrs, CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; + + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } + + if (attr.comp_vector >= 
ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } + + obj = container_of(uverbs_attr_get(attrs, CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } + + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = &ev_file->ev_queue; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + + ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); + if (ret) + goto err_cq; + + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; +}; + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_create, UVERBS_CQ_CREATE, uverbs_create_cq_handler, + &UVERBS_ATTR_IDR(CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_USER_HANDLE, u64, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_COMP_VECTOR, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_FLAGS, u32), + &UVERBS_ATTR_PTR_OUT(CREATE_CQ_RESP_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, DESTROY_CQ_RESP, &resp); +} + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_destroy, UVERBS_CQ_DESTROY, uverbs_destroy_cq_handler, + &UVERBS_ATTR_IDR(DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(DESTROY_CQ_RESP, struct ib_uverbs_destroy_cq_resp, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, UVERBS_OBJECT_COMP_CHANNEL, &UVERBS_TYPE_ALLOC_FD(0, @@ -259,7 +393,9 @@ DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, - uverbs_free_cq)); + uverbs_free_cq), + &uverbs_method_cq_create, + &uverbs_method_cq_destroy); DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 
90f81eeca35b..842792eae383 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -60,5 +60,25 @@ enum { UVERBS_UHW_OUT, }; +enum uverbs_create_cq_cmd_attr_ids { + CREATE_CQ_HANDLE, + CREATE_CQ_CQE, + CREATE_CQ_USER_HANDLE, + CREATE_CQ_COMP_CHANNEL, + CREATE_CQ_COMP_VECTOR, + CREATE_CQ_FLAGS, + CREATE_CQ_RESP_CQE, +}; + +enum uverbs_destroy_cq_cmd_attr_ids { + DESTROY_CQ_HANDLE, + DESTROY_CQ_RESP, +}; + +enum uverbs_actions_cq_ops { + UVERBS_CQ_CREATE, + UVERBS_CQ_DESTROY, +}; + #endif -- cgit v1.2.3 From 524271129401ed896dc76e49acdbafc506cb41ac Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:06 +0300 Subject: IB/core: Assign root to all drivers In order to use the parsing tree, we need to assign the root to all drivers. Currently, we just assign the default parsing tree via ib_uverbs_add_one. The driver could override this by assigning a parsing tree prior to registering the device. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_main.c | 18 ++++++++++++++++++ include/rdma/uverbs_ioctl.h | 12 ++++++++++++ include/rdma/uverbs_std_types.h | 14 ++++++++++++++ 4 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 64d494a64daf..0f6f768f687e 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,6 +100,7 @@ struct ib_uverbs_device { struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; struct list_head uverbs_events_file_list; + struct uverbs_root_spec *specs_root; }; struct ib_uverbs_event_queue { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index defeda33e27f..872fec910c16 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -49,6 +49,7 @@ #include #include +#include #include "uverbs.h" #include "core_priv.h" @@ -1097,6 +1098,18 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + if (!device->specs_root) { + const struct uverbs_object_tree_def *default_root[] = { + uverbs_default_get_objects()}; + + uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, + default_root); + if (IS_ERR(uverbs_dev->specs_root)) + goto err_class; + + device->specs_root = uverbs_dev->specs_root; + } + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1228,6 +1241,11 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); + if (uverbs_dev->specs_root) { + uverbs_free_spec_tree(uverbs_dev->specs_root); + device->specs_root = NULL; + } + kobject_put(&uverbs_dev->kobj); } diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 759afa0621ea..6da44079aa58 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -419,8 +419,20 @@ static inline int _uverbs_copy_from(void *to, size_t to_size, * An object without any methods is considered invalid and will abort the * function with -ENOENT error. 
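 *
 * A minimal usage sketch (with a hypothetical trees array, mirroring the
 * call made in ib_uverbs_add_one() above):
 *
 *	root = uverbs_alloc_spec_tree(num_trees, trees);
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *	...
 *	uverbs_free_spec_tree(root);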
*/ +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, const struct uverbs_object_tree_def **trees); void uverbs_free_spec_tree(struct uverbs_root_spec *root); +#else +static inline struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + return NULL; +} + +static inline void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ +} +#endif #endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 400efe2a4d3c..5f8e20bbd67c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,8 +34,10 @@ #define _UVERBS_STD_TYPES__ #include +#include #include +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) extern const struct uverbs_object_def uverbs_object_comp_channel; extern const struct uverbs_object_def uverbs_object_cq; extern const struct uverbs_object_def uverbs_object_qp; @@ -50,6 +52,18 @@ extern const struct uverbs_object_def uverbs_object_pd; extern const struct uverbs_object_def uverbs_object_xrcd; extern const struct uverbs_object_def uverbs_object_device; +extern const struct uverbs_object_tree_def uverbs_default_objects; +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return &uverbs_default_objects; +} +#else +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return NULL; +} +#endif + static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, bool write, struct ib_ucontext *ucontext, -- cgit v1.2.3 From 75e3ea5ba7d7a6c171c62c710990f2a5fb07e19a Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 5 Jul 2017 13:08:30 -0700 Subject: xen: introduce the pvcalls interface header Introduce the C header file which defines the PV Calls interface. It is imported from xen/include/public/io/pvcalls.h. 
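As a rough usage illustration only (the helper name is made up, and ring
setup, index wrapping checks and event-channel notification are omitted),
a frontend queues a socket-creation request like this:

	/* Illustrative sketch; assumes an already-mapped command ring. */
	static void example_queue_socket_request(struct xen_pvcalls_front_ring *ring,
						 uint32_t req_id, uint64_t sock_id)
	{
		RING_IDX idx = ring->req_prod_pvt++;
		struct xen_pvcalls_request *req = RING_GET_REQUEST(ring, idx);

		req->req_id = req_id;		/* echoed back in the response */
		req->cmd = PVCALLS_SOCKET;
		req->u.socket.id = sock_id;
		req->u.socket.domain = AF_INET;
		req->u.socket.type = SOCK_STREAM;
		req->u.socket.protocol = 0;
	}

The backend answers with a struct xen_pvcalls_response carrying the same
req_id and the return value in ret.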
Signed-off-by: Stefano Stabellini Reviewed-by: Boris Ostrovsky Reviewed-by: Juergen Gross CC: konrad.wilk@oracle.com CC: boris.ostrovsky@oracle.com CC: jgross@suse.com Signed-off-by: Boris Ostrovsky --- include/xen/interface/io/pvcalls.h | 121 +++++++++++++++++++++++++++++++++++++ include/xen/interface/io/ring.h | 2 + 2 files changed, 123 insertions(+) create mode 100644 include/xen/interface/io/pvcalls.h (limited to 'include') diff --git a/include/xen/interface/io/pvcalls.h b/include/xen/interface/io/pvcalls.h new file mode 100644 index 000000000000..ccf97b817e72 --- /dev/null +++ b/include/xen/interface/io/pvcalls.h @@ -0,0 +1,121 @@ +#ifndef __XEN_PUBLIC_IO_XEN_PVCALLS_H__ +#define __XEN_PUBLIC_IO_XEN_PVCALLS_H__ + +#include +#include +#include + +/* "1" means socket, connect, release, bind, listen, accept and poll */ +#define XENBUS_FUNCTIONS_CALLS "1" + +/* + * See docs/misc/pvcalls.markdown in xen.git for the full specification: + * https://xenbits.xen.org/docs/unstable/misc/pvcalls.html + */ +struct pvcalls_data_intf { + RING_IDX in_cons, in_prod, in_error; + + uint8_t pad1[52]; + + RING_IDX out_cons, out_prod, out_error; + + uint8_t pad2[52]; + + RING_IDX ring_order; + grant_ref_t ref[]; +}; +DEFINE_XEN_FLEX_RING(pvcalls); + +#define PVCALLS_SOCKET 0 +#define PVCALLS_CONNECT 1 +#define PVCALLS_RELEASE 2 +#define PVCALLS_BIND 3 +#define PVCALLS_LISTEN 4 +#define PVCALLS_ACCEPT 5 +#define PVCALLS_POLL 6 + +struct xen_pvcalls_request { + uint32_t req_id; /* private to guest, echoed in response */ + uint32_t cmd; /* command to execute */ + union { + struct xen_pvcalls_socket { + uint64_t id; + uint32_t domain; + uint32_t type; + uint32_t protocol; + } socket; + struct xen_pvcalls_connect { + uint64_t id; + uint8_t addr[28]; + uint32_t len; + uint32_t flags; + grant_ref_t ref; + uint32_t evtchn; + } connect; + struct xen_pvcalls_release { + uint64_t id; + uint8_t reuse; + } release; + struct xen_pvcalls_bind { + uint64_t id; + uint8_t addr[28]; + uint32_t len; + } bind; + struct xen_pvcalls_listen { + uint64_t id; + uint32_t backlog; + } listen; + struct xen_pvcalls_accept { + uint64_t id; + uint64_t id_new; + grant_ref_t ref; + uint32_t evtchn; + } accept; + struct xen_pvcalls_poll { + uint64_t id; + } poll; + /* dummy member to force sizeof(struct xen_pvcalls_request) + * to match across archs */ + struct xen_pvcalls_dummy { + uint8_t dummy[56]; + } dummy; + } u; +}; + +struct xen_pvcalls_response { + uint32_t req_id; + uint32_t cmd; + int32_t ret; + uint32_t pad; + union { + struct _xen_pvcalls_socket { + uint64_t id; + } socket; + struct _xen_pvcalls_connect { + uint64_t id; + } connect; + struct _xen_pvcalls_release { + uint64_t id; + } release; + struct _xen_pvcalls_bind { + uint64_t id; + } bind; + struct _xen_pvcalls_listen { + uint64_t id; + } listen; + struct _xen_pvcalls_accept { + uint64_t id; + } accept; + struct _xen_pvcalls_poll { + uint64_t id; + } poll; + struct _xen_pvcalls_dummy { + uint8_t dummy[8]; + } dummy; + } u; +}; + +DEFINE_RING_TYPES(xen_pvcalls, struct xen_pvcalls_request, + struct xen_pvcalls_response); + +#endif diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index c79456855539..e547088ceb0e 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -9,6 +9,8 @@ #ifndef __XEN_PUBLIC_IO_RING_H__ #define __XEN_PUBLIC_IO_RING_H__ +#include + typedef unsigned int RING_IDX; /* Round a 32-bit unsigned constant down to the nearest power of two. 
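 * (e.g. __RD32(12) evaluates to 8, while __RD32(16) stays 16)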
*/ -- cgit v1.2.3 From b4feaeb036b2e0f715e0d05ae84da01c6696a75f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 27 Jul 2017 17:11:58 +0200 Subject: xen: cleanup xen.h The macros for testing domain types are more complicated then they need to. Simplify them. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/xen/xen.h | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/xen/xen.h b/include/xen/xen.h index 6e8b7fc79801..28c59ca529d7 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -13,11 +13,16 @@ extern enum xen_domain_type xen_domain_type; #define xen_domain_type XEN_NATIVE #endif +#ifdef CONFIG_XEN_PVH +extern bool xen_pvh; +#else +#define xen_pvh 0 +#endif + #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain() && \ - xen_domain_type == XEN_PV_DOMAIN) -#define xen_hvm_domain() (xen_domain() && \ - xen_domain_type == XEN_HVM_DOMAIN) +#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) +#define xen_pvh_domain() (xen_pvh) #ifdef CONFIG_XEN_DOM0 #include @@ -29,11 +34,4 @@ extern enum xen_domain_type xen_domain_type; #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ -#ifdef CONFIG_XEN_PVH -extern bool xen_pvh; -#define xen_pvh_domain() (xen_hvm_domain() && xen_pvh) -#else -#define xen_pvh_domain() (0) -#endif - #endif /* _XEN_XEN_H */ -- cgit v1.2.3 From 882bbe56aed0cbb00b374454f6c069919c9228dd Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 4 Aug 2017 13:36:12 +0200 Subject: xen: remove unused function xen_set_domain_pte() The function xen_set_domain_pte() is used nowhere in the kernel. Remove it. 
Signed-off-by: Juergen Gross Acked-by: Steven Rostedt (VMware) Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/page.h | 2 -- arch/x86/xen/mmu_pv.c | 20 -------------------- include/trace/events/xen.h | 18 ------------------ 3 files changed, 40 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 497f7d28c1d6..07b6531813c4 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -314,8 +314,6 @@ static inline pte_t __pte_ma(pteval_t x) #define p4d_val_ma(x) ((x).p4d) #endif -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); - xmaddr_t arbitrary_virt_to_machine(void *address); unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index cab28cf2cffb..0422ee7e70b3 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -162,26 +162,6 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) -{ - struct multicall_space mcs; - struct mmu_update *u; - - trace_xen_mmu_set_domain_pte(ptep, pteval, domid); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - - /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = virt_to_machine(ptep).maddr; - u->val = pte_val_ma(pteval); - - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -EXPORT_SYMBOL_GPL(xen_set_domain_pte); - static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index b70a38b7fa84..677e8ac2bb81 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -149,24 +149,6 @@ DECLARE_EVENT_CLASS(xen_mmu__set_pte, DEFINE_XEN_MMU_SET_PTE(xen_mmu_set_pte); DEFINE_XEN_MMU_SET_PTE(xen_mmu_set_pte_atomic); -TRACE_EVENT(xen_mmu_set_domain_pte, - TP_PROTO(pte_t *ptep, pte_t pteval, unsigned domid), - TP_ARGS(ptep, pteval, domid), - TP_STRUCT__entry( - __field(pte_t *, ptep) - __field(pteval_t, pteval) - __field(unsigned, domid) - ), - TP_fast_assign(__entry->ptep = ptep; - __entry->pteval = pteval.pte; - __entry->domid = domid), - TP_printk("ptep %p pteval %0*llx (raw %0*llx) domid %u", - __entry->ptep, - (int)sizeof(pteval_t) * 2, (unsigned long long)pte_val(native_make_pte(__entry->pteval)), - (int)sizeof(pteval_t) * 2, (unsigned long long)__entry->pteval, - __entry->domid) - ); - TRACE_EVENT(xen_mmu_set_pte_at, TP_PROTO(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval), -- cgit v1.2.3 From a0e4fd14ba0b90b6fcea9c0f7f6de6886479427d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 4 Aug 2017 13:36:13 +0200 Subject: xen: remove not used trace functions There are some Xen specific trace functions defined in include/trace/events/xen.h. Remove them. 
Signed-off-by: Juergen Gross Acked-by: Steven Rostedt (VMware) Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- include/trace/events/xen.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include') diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index 677e8ac2bb81..1b4fed72f573 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -248,16 +248,6 @@ TRACE_EVENT(xen_mmu_set_p4d, (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)), (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval) ); - -TRACE_EVENT(xen_mmu_pud_clear, - TP_PROTO(pud_t *pudp), - TP_ARGS(pudp), - TP_STRUCT__entry( - __field(pud_t *, pudp) - ), - TP_fast_assign(__entry->pudp = pudp), - TP_printk("pudp %p", __entry->pudp) - ); #else TRACE_EVENT(xen_mmu_set_pud, @@ -277,16 +267,6 @@ TRACE_EVENT(xen_mmu_set_pud, #endif -TRACE_EVENT(xen_mmu_pgd_clear, - TP_PROTO(pgd_t *pgdp), - TP_ARGS(pgdp), - TP_STRUCT__entry( - __field(pgd_t *, pgdp) - ), - TP_fast_assign(__entry->pgdp = pgdp), - TP_printk("pgdp %p", __entry->pgdp) - ); - DECLARE_EVENT_CLASS(xen_mmu_ptep_modify_prot, TP_PROTO(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval), -- cgit v1.2.3 From de29faa0d8ac925534749bc56d539bf936ce122b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 19 Dec 2016 19:25:00 +0000 Subject: irqchip/gic-v4: Add management structure definitions Add a bunch of GICv4-specific data structures that will get used in subsequent patches. Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v4.h | 92 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 include/linux/irqchip/arm-gic-v4.h (limited to 'include') diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h new file mode 100644 index 000000000000..d499538dd86f --- /dev/null +++ b/include/linux/irqchip/arm-gic-v4.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2016,2017 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __LINUX_IRQCHIP_ARM_GIC_V4_H +#define __LINUX_IRQCHIP_ARM_GIC_V4_H + +struct its_vpe; + +/* Embedded in kvm.arch */ +struct its_vm { + struct fwnode_handle *fwnode; + struct irq_domain *domain; + struct page *vprop_page; + struct its_vpe **vpes; + int nr_vpes; + irq_hw_number_t db_lpi_base; + unsigned long *db_bitmap; + int nr_db_lpis; +}; + +/* Embedded in kvm_vcpu.arch */ +struct its_vpe { + struct page *vpt_page; + struct its_vm *its_vm; + /* Doorbell interrupt */ + int irq; + irq_hw_number_t vpe_db_lpi; + /* + * This collection ID is used to indirect the target + * redistributor for this VPE. The ID itself isn't involved in + * programming of the ITS. + */ + u16 col_idx; + /* Unique (system-wide) VPE identifier */ + u16 vpe_id; + /* Implementation Defined Area Invalid */ + bool idai; + /* Pending VLPIs on schedule out? 
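+	 * (set from GICR_VPENDBASER.PendingLast when the VPE is
+	 * descheduled)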
*/ + bool pending_last; +}; + +/* + * struct its_vlpi_map: structure describing the mapping of a + * VLPI. Only to be interpreted in the context of a physical interrupt + * it complements. To be used as the vcpu_info passed to + * irq_set_vcpu_affinity(). + * + * @vm: Pointer to the GICv4 notion of a VM + * @vpe: Pointer to the GICv4 notion of a virtual CPU (VPE) + * @vintid: Virtual LPI number + * @db_enabled: Is the VPE doorbell to be generated? + */ +struct its_vlpi_map { + struct its_vm *vm; + struct its_vpe *vpe; + u32 vintid; + bool db_enabled; +}; + +enum its_vcpu_info_cmd_type { + MAP_VLPI, + GET_VLPI, + PROP_UPDATE_VLPI, + PROP_UPDATE_AND_INV_VLPI, + SCHEDULE_VPE, + DESCHEDULE_VPE, + INVALL_VPE, +}; + +struct its_cmd_info { + enum its_vcpu_info_cmd_type cmd_type; + union { + struct its_vlpi_map *map; + u8 config; + }; +}; + +#endif -- cgit v1.2.3 From d7276b80e752acaaf30f04e62d0e986b5bced2df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2016 15:11:47 +0000 Subject: irqchip/gic-v3-its: Add GICv4 ITS command definitions Add the new GICv4 ITS command definitions, most of them, being defined in terms of their physical counterparts. Reviewed-by: Eric Auger Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 2 +- include/linux/irqchip/arm-gic-v3.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 19ce99b78042..dd92a8a4572a 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013, 2014 ARM Limited, All Rights Reserved. + * Copyright (C) 2013-2017 ARM Limited, All Rights Reserved. * Author: Marc Zyngier * * This program is free software; you can redistribute it and/or modify diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index af8c55105fc2..7c6fd8f3e36c 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -347,6 +347,18 @@ #define GITS_CMD_CLEAR 0x04 #define GITS_CMD_SYNC 0x05 +/* + * GICv4 ITS specific commands + */ +#define GITS_CMD_GICv4(x) ((x) | 0x20) +#define GITS_CMD_VINVALL GITS_CMD_GICv4(GITS_CMD_INVALL) +#define GITS_CMD_VMAPP GITS_CMD_GICv4(GITS_CMD_MAPC) +#define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) +#define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) +#define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) +/* VMOVP is the odd one, as it doesn't have a physical counterpart */ +#define GITS_CMD_VMOVP GITS_CMD_GICv4(2) + /* * ITS error numbers */ -- cgit v1.2.3 From 3ca63f363f3f8fe457482c53d5c86d83bff21e64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jan 2017 13:39:52 +0000 Subject: irqchip/gic-v3-its: Add VPENDBASER/VPROPBASER accessors V{PEND,PROP}BASER being 64bit registers, they need some ad-hoc accessors on 32bit, specially given that VPENDBASER contains a Valid bit, making the access a bit convoluted. 
Reviewed-by: Thomas Gleixner Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- arch/arm/include/asm/arch_gicv3.h | 28 ++++++++++++++++++++++++++++ arch/arm64/include/asm/arch_gicv3.h | 5 +++++ include/linux/irqchip/arm-gic-v3.h | 5 +++++ 3 files changed, 38 insertions(+) (limited to 'include') diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index 27475904e096..8d45e88feac9 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -291,5 +291,33 @@ static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) */ #define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c) +/* + * GITS_VPROPBASER - hi and lo bits may be accessed independently. + */ +#define gits_write_vpropbaser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GITS_VPENDBASER - the Valid bit must be cleared before changing + * anything else. + */ +static inline void gits_write_vpendbaser(u64 val, void * __iomem addr) +{ + u32 tmp; + + tmp = readl_relaxed(addr + 4); + if (tmp & (GICR_VPENDBASER_Valid >> 32)) { + tmp &= ~(GICR_VPENDBASER_Valid >> 32); + writel_relaxed(tmp, addr + 4); + } + + /* + * Use the fact that __gic_writeq_nonatomic writes the second + * half of the 64bit quantity after the first. + */ + __gic_writeq_nonatomic(val, addr); +} + +#define gits_read_vpendbaser(c) __gic_readq_nonatomic(c) + #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index 8cef47fa2218..0d2a53457c30 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -133,5 +133,10 @@ static inline void gic_write_bpr1(u32 val) #define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) #define gicr_read_pendbaser(c) readq_relaxed(c) +#define gits_write_vpropbaser(v, c) writeq_relaxed(v, c) + +#define gits_write_vpendbaser(v, c) writeq_relaxed(v, c) +#define gits_read_vpendbaser(c) readq_relaxed(c) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 7c6fd8f3e36c..17ba0d732f12 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -212,6 +212,11 @@ #define LPI_PROP_GROUP1 (1 << 1) #define LPI_PROP_ENABLED (1 << 0) +#define GICR_VPENDBASER_Dirty (1ULL << 60) +#define GICR_VPENDBASER_PendingLast (1ULL << 61) +#define GICR_VPENDBASER_IDAI (1ULL << 62) +#define GICR_VPENDBASER_Valid (1ULL << 63) + /* * ITS registers, offsets from ITS_base */ -- cgit v1.2.3 From e643d80340363c9d172abfbe437537196cfc1643 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2016 15:09:31 +0000 Subject: irqchip/gic-v3-its: Add VPE scheduling When a VPE is scheduled to run, the corresponding redistributor must be told so, by setting VPROPBASER to the VM's property table, and VPENDBASER to the vcpu's pending table. When scheduled out, we preserve the IDAI and PendingLast bits. The latter is specially important, as it tells the hypervisor that there are pending interrupts for this vcpu. 
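For reference, the expected caller-side pattern (a sketch only; the helper
name is made up and the eventual KVM glue is not part of this patch)
funnels through irq_set_vcpu_affinity() on the VPE's doorbell interrupt:

	/* Illustrative caller; 'on' selects schedule vs deschedule. */
	static int example_set_vpe_residency(struct its_vpe *vpe, bool on)
	{
		struct its_cmd_info info = {
			.cmd_type = on ? SCHEDULE_VPE : DESCHEDULE_VPE,
		};

		/*
		 * The doorbell irq is owned by the GICv4-vpe irq_chip, so
		 * this ends up in its_vpe_set_vcpu_affinity() below.
		 */
		return irq_set_vcpu_affinity(vpe->irq, &info);
	}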
Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 85 ++++++++++++++++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v3.h | 58 ++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 128740b87806..f4827040a788 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -152,6 +152,7 @@ static DEFINE_IDA(its_vpeid_ida); #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) +#define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) static struct its_collection *dev_event_to_col(struct its_device *its_dev, u32 event) @@ -2153,8 +2154,92 @@ static const struct irq_domain_ops its_domain_ops = { .deactivate = its_irq_domain_deactivate, }; +static void its_vpe_schedule(struct its_vpe *vpe) +{ + void * __iomem vlpi_base = gic_data_rdist_vlpi_base(); + u64 val; + + /* Schedule the VPE */ + val = virt_to_phys(page_address(vpe->its_vm->vprop_page)) & + GENMASK_ULL(51, 12); + val |= (LPI_NRBITS - 1) & GICR_VPROPBASER_IDBITS_MASK; + val |= GICR_VPROPBASER_RaWb; + val |= GICR_VPROPBASER_InnerShareable; + gits_write_vpropbaser(val, vlpi_base + GICR_VPROPBASER); + + val = virt_to_phys(page_address(vpe->vpt_page)) & + GENMASK_ULL(51, 16); + val |= GICR_VPENDBASER_RaWaWb; + val |= GICR_VPENDBASER_NonShareable; + /* + * There is no good way of finding out if the pending table is + * empty as we can race against the doorbell interrupt very + * easily. So in the end, vpe->pending_last is only an + * indication that the vcpu has something pending, not one + * that the pending table is empty. A good implementation + * would be able to read its coarse map pretty quickly anyway, + * making this a tolerable issue. + */ + val |= GICR_VPENDBASER_PendingLast; + val |= vpe->idai ? GICR_VPENDBASER_IDAI : 0; + val |= GICR_VPENDBASER_Valid; + gits_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); +} + +static void its_vpe_deschedule(struct its_vpe *vpe) +{ + void * __iomem vlpi_base = gic_data_rdist_vlpi_base(); + u32 count = 1000000; /* 1s! 
*/ + bool clean; + u64 val; + + /* We're being scheduled out */ + val = gits_read_vpendbaser(vlpi_base + GICR_VPENDBASER); + val &= ~GICR_VPENDBASER_Valid; + gits_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); + + do { + val = gits_read_vpendbaser(vlpi_base + GICR_VPENDBASER); + clean = !(val & GICR_VPENDBASER_Dirty); + if (!clean) { + count--; + cpu_relax(); + udelay(1); + } + } while (!clean && count); + + if (unlikely(!clean && !count)) { + pr_err_ratelimited("ITS virtual pending table not cleaning\n"); + vpe->idai = false; + vpe->pending_last = true; + } else { + vpe->idai = !!(val & GICR_VPENDBASER_IDAI); + vpe->pending_last = !!(val & GICR_VPENDBASER_PendingLast); + } +} + +static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) +{ + struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + struct its_cmd_info *info = vcpu_info; + + switch (info->cmd_type) { + case SCHEDULE_VPE: + its_vpe_schedule(vpe); + return 0; + + case DESCHEDULE_VPE: + its_vpe_deschedule(vpe); + return 0; + + default: + return -EINVAL; + } +} + static struct irq_chip its_vpe_irq_chip = { .name = "GICv4-vpe", + .irq_set_vcpu_affinity = its_vpe_set_vcpu_affinity, }; static int its_vpe_id_alloc(void) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 17ba0d732f12..6bc142cfa616 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -212,6 +212,64 @@ #define LPI_PROP_GROUP1 (1 << 1) #define LPI_PROP_ENABLED (1 << 0) +/* + * Re-Distributor registers, offsets from VLPI_base + */ +#define GICR_VPROPBASER 0x0070 + +#define GICR_VPROPBASER_IDBITS_MASK 0x1f + +#define GICR_VPROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_SHIFT (56) + +#define GICR_VPROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, SHAREABILITY_MASK) +#define GICR_VPROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, MASK) +#define GICR_VPROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPROPBASER, OUTER, MASK) +#define GICR_VPROPBASER_CACHEABILITY_MASK \ + GICR_VPROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_VPROPBASER, InnerShareable) + +#define GICR_VPROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nCnB) +#define GICR_VPROPBASER_nC GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, nC) +#define GICR_VPROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWt) +#define GICR_VPROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWt) +#define GICR_VPROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, WaWb) +#define GICR_VPROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWt) +#define GICR_VPROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPROPBASER, INNER, RaWaWb) + +#define GICR_VPENDBASER 0x0078 + +#define GICR_VPENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_VPENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_VPENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, SHAREABILITY_MASK) +#define GICR_VPENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, MASK) +#define GICR_VPENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_VPENDBASER, OUTER, MASK) +#define GICR_VPENDBASER_CACHEABILITY_MASK \ + 
GICR_VPENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_VPENDBASER_NonShareable \ + GIC_BASER_SHAREABILITY(GICR_VPENDBASER, NonShareable) + +#define GICR_VPENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nCnB) +#define GICR_VPENDBASER_nC GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, nC) +#define GICR_VPENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWt) +#define GICR_VPENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWt) +#define GICR_VPENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, WaWb) +#define GICR_VPENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWt) +#define GICR_VPENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_VPENDBASER, INNER, RaWaWb) + #define GICR_VPENDBASER_Dirty (1ULL << 60) #define GICR_VPENDBASER_PendingLast (1ULL << 61) #define GICR_VPENDBASER_IDAI (1ULL << 62) -- cgit v1.2.3 From 20b3d54ecba51c5fe476eea94ffdc463559c5c85 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2016 15:23:22 +0000 Subject: irqchip/gic-v3-its: Add device proxy for VPE management if !DirectLpi When we don't have the DirectLPI feature, we must work around the architecture shortcomings to be able to perform the required maintenance (interrupt masking, clearing and injection). For this, we create a fake device whose sole purpose is to provide a way to issue commands as if we were dealing with LPIs coming from that device (while they actually originate from the ITS). This fake device doesn't have LPIs allocated to it, but instead uses the VPE LPIs. Of course, this could be a real bottleneck, and a naive implementation would require 6 commands to issue an invalidation. Instead, let's allocate at least one event per physical CPU (rounded up to the next power of 2), and opportunistically map the VPE doorbell to an event. This doorbell will be mapped until we roll over and need to reallocate this slot. This ensures that most of the time, we only need 2 commands to issue an INV, INT or CLEAR, making the performance a lot better, given that we always issue a CLEAR on entry, and an INV on each side of a trapped WFI. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 151 +++++++++++++++++++++++++++++++++++-- include/linux/irqchip/arm-gic-v4.h | 2 + 2 files changed, 147 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 017a6efa3747..06f14f20368b 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -136,6 +136,13 @@ struct its_device { u32 device_id; }; +static struct { + raw_spinlock_t lock; + struct its_device *dev; + struct its_vpe **vpes; + int next_victim; +} vpe_proxy; + static LIST_HEAD(its_nodes); static DEFINE_SPINLOCK(its_lock); static struct rdists *gic_rdists; @@ -2090,6 +2097,16 @@ static int its_msi_prepare(struct irq_domain *domain, struct device *dev, msi_info = msi_get_domain_info(domain); its = msi_info->data; + if (!gic_rdists->has_direct_lpi && + vpe_proxy.dev && + vpe_proxy.dev->its == its && + dev_id == vpe_proxy.dev->device_id) { + /* Bad luck. Get yourself a better implementation */ + WARN_ONCE(1, "DevId %x clashes with GICv4 VPE proxy device\n", + dev_id); + return -EINVAL; + } + its_dev = its_find_device(its, dev_id); if (its_dev) { /* @@ -2237,6 +2254,70 @@ static const struct irq_domain_ops its_domain_ops = { .deactivate = its_irq_domain_deactivate, }; +/* + * This is insane. 
+ * + * If a GICv4 doesn't implement Direct LPIs (which is extremely + * likely), the only way to perform an invalidate is to use a fake + * device to issue an INV command, implying that the LPI has first + * been mapped to some event on that device. Since this is not exactly + * cheap, we try to keep that mapping around as long as possible, and + * only issue an UNMAP if we're short on available slots. + * + * Broken by design(tm). + */ +static void its_vpe_db_proxy_unmap_locked(struct its_vpe *vpe) +{ + /* Already unmapped? */ + if (vpe->vpe_proxy_event == -1) + return; + + its_send_discard(vpe_proxy.dev, vpe->vpe_proxy_event); + vpe_proxy.vpes[vpe->vpe_proxy_event] = NULL; + + /* + * We don't track empty slots at all, so let's move the + * next_victim pointer if we can quickly reuse that slot + * instead of nuking an existing entry. Not clear that this is + * always a win though, and this might just generate a ripple + * effect... Let's just hope VPEs don't migrate too often. + */ + if (vpe_proxy.vpes[vpe_proxy.next_victim]) + vpe_proxy.next_victim = vpe->vpe_proxy_event; + + vpe->vpe_proxy_event = -1; +} + +static void its_vpe_db_proxy_unmap(struct its_vpe *vpe) +{ + if (!gic_rdists->has_direct_lpi) { + unsigned long flags; + + raw_spin_lock_irqsave(&vpe_proxy.lock, flags); + its_vpe_db_proxy_unmap_locked(vpe); + raw_spin_unlock_irqrestore(&vpe_proxy.lock, flags); + } +} + +static void its_vpe_db_proxy_map_locked(struct its_vpe *vpe) +{ + /* Already mapped? */ + if (vpe->vpe_proxy_event != -1) + return; + + /* This slot was already allocated. Kick the other VPE out. */ + if (vpe_proxy.vpes[vpe_proxy.next_victim]) + its_vpe_db_proxy_unmap_locked(vpe_proxy.vpes[vpe_proxy.next_victim]); + + /* Map the new VPE instead */ + vpe_proxy.vpes[vpe_proxy.next_victim] = vpe; + vpe->vpe_proxy_event = vpe_proxy.next_victim; + vpe_proxy.next_victim = (vpe_proxy.next_victim + 1) % vpe_proxy.dev->nr_ites; + + vpe_proxy.dev->event_map.col_map[vpe->vpe_proxy_event] = vpe->col_idx; + its_send_mapti(vpe_proxy.dev, vpe->vpe_db_lpi, vpe->vpe_proxy_event); +} + static int its_vpe_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) @@ -2246,9 +2327,11 @@ static int its_vpe_set_affinity(struct irq_data *d, /* * Changing affinity is mega expensive, so let's be as lazy as - * we can and only do it if we really have to. + * we can and only do it if we really have to. Also, if mapped + * into the proxy device, we need to nuke that mapping. 
*/ if (vpe->col_idx != cpu) { + its_vpe_db_proxy_unmap(vpe); vpe->col_idx = cpu; its_send_vmovp(vpe); } @@ -2343,15 +2426,33 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) } } +static void its_vpe_send_cmd(struct its_vpe *vpe, + void (*cmd)(struct its_device *, u32)) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&vpe_proxy.lock, flags); + + its_vpe_db_proxy_map_locked(vpe); + cmd(vpe_proxy.dev, vpe->vpe_proxy_event); + + raw_spin_unlock_irqrestore(&vpe_proxy.lock, flags); +} + static void its_vpe_send_inv(struct irq_data *d) { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); - void __iomem *rdbase; - rdbase = per_cpu_ptr(gic_rdists->rdist, vpe->col_idx)->rd_base; - gic_write_lpir(vpe->vpe_db_lpi, rdbase + GICR_INVLPIR); - while (gic_read_lpir(rdbase + GICR_SYNCR) & 1) - cpu_relax(); + if (gic_rdists->has_direct_lpi) { + void __iomem *rdbase; + + rdbase = per_cpu_ptr(gic_rdists->rdist, vpe->col_idx)->rd_base; + gic_write_lpir(vpe->vpe_db_lpi, rdbase + GICR_INVLPIR); + while (gic_read_lpir(rdbase + GICR_SYNCR) & 1) + cpu_relax(); + } else { + its_vpe_send_cmd(vpe, its_send_inv); + } } static void its_vpe_mask_irq(struct irq_data *d) @@ -2417,12 +2518,14 @@ static int its_vpe_init(struct its_vpe *vpe) vpe->vpe_id = vpe_id; vpe->vpt_page = vpt_page; + vpe->vpe_proxy_event = -1; return 0; } static void its_vpe_teardown(struct its_vpe *vpe) { + its_vpe_db_proxy_unmap(vpe); its_vpe_id_free(vpe->vpe_id); its_free_pending_table(vpe->vpt_page); } @@ -2653,6 +2756,42 @@ static int its_init_domain(struct fwnode_handle *handle, struct its_node *its) static int its_init_vpe_domain(void) { + struct its_node *its; + u32 devid; + int entries; + + if (gic_rdists->has_direct_lpi) { + pr_info("ITS: Using DirectLPI for VPE invalidation\n"); + return 0; + } + + /* Any ITS will do, even if not v4 */ + its = list_first_entry(&its_nodes, struct its_node, entry); + + entries = roundup_pow_of_two(nr_cpu_ids); + vpe_proxy.vpes = kzalloc(sizeof(*vpe_proxy.vpes) * entries, + GFP_KERNEL); + if (!vpe_proxy.vpes) { + pr_err("ITS: Can't allocate GICv4 proxy device array\n"); + return -ENOMEM; + } + + /* Use the last possible DevID */ + devid = GENMASK(its->device_ids - 1, 0); + vpe_proxy.dev = its_create_device(its, devid, entries, false); + if (!vpe_proxy.dev) { + kfree(vpe_proxy.vpes); + pr_err("ITS: Can't allocate GICv4 proxy device\n"); + return -ENOMEM; + } + + BUG_ON(entries != vpe_proxy.dev->nr_ites); + + raw_spin_lock_init(&vpe_proxy.lock); + vpe_proxy.next_victim = 0; + pr_info("ITS: Allocated DevID %x as GICv4 proxy device (%d slots)\n", + devid, vpe_proxy.dev->nr_ites); + return 0; } diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index d499538dd86f..e7a93ad4fe97 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -39,6 +39,8 @@ struct its_vpe { /* Doorbell interrupt */ int irq; irq_hw_number_t vpe_db_lpi; + /* VPE proxy mapping */ + int vpe_proxy_event; /* * This collection ID is used to indirect the target * redistributor for this VPE. The ID itself isn't involved in -- cgit v1.2.3 From d51c4b4da7f8fae8c884e3b89fdab906f66da28a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 27 Jun 2017 21:24:25 +0100 Subject: irqchip/gic-v3-its: Set implementation defined bit to enable VLPIs A long time ago, GITS_CTLR[1] used to be called GITS_CTLR.EnableVLPI. It has been subsequently deprecated and is now an "Implementation Defined" bit that may or may not be set for GICv4. Brilliant. 
And the current crop of the FastModel requires that bit for VLPIs to be enabled. Oh well... Let's set it and find out what breaks. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its.c | 7 +++++-- include/linux/irqchip/arm-gic-v3.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 21ee33e81c74..d79bfdbd506b 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2714,7 +2714,7 @@ static int its_force_quiescent(void __iomem *base) return 0; /* Disable the generation of all interrupts to this ITS */ - val &= ~GITS_CTLR_ENABLE; + val &= ~(GITS_CTLR_ENABLE | GITS_CTLR_ImDe); writel_relaxed(val, base + GITS_CTLR); /* Poll GITS_CTLR and wait until ITS becomes quiescent */ @@ -2998,7 +2998,10 @@ static int __init its_probe_one(struct resource *res, gits_write_cwriter(0, its->base + GITS_CWRITER); ctlr = readl_relaxed(its->base + GITS_CTLR); - writel_relaxed(ctlr | GITS_CTLR_ENABLE, its->base + GITS_CTLR); + ctlr |= GITS_CTLR_ENABLE; + if (its->is_v4) + ctlr |= GITS_CTLR_ImDe; + writel_relaxed(ctlr, its->base + GITS_CTLR); err = its_init_domain(handle, its); if (err) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 6bc142cfa616..1ea576c8126f 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -298,6 +298,7 @@ #define GITS_TRANSLATER 0x10040 #define GITS_CTLR_ENABLE (1U << 0) +#define GITS_CTLR_ImDe (1U << 1) #define GITS_CTLR_ITS_NUMBER_SHIFT 4 #define GITS_CTLR_ITS_NUMBER (0xFU << GITS_CTLR_ITS_NUMBER_SHIFT) #define GITS_CTLR_QUIESCENT (1U << 31) -- cgit v1.2.3 From 7de5c0af9c7c717f9052e6d75b24f90050e6a56e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2016 15:27:52 +0000 Subject: irqchip/gic-v4: Add per-VM VPE domain creation When creating a VM, it is very convenient to have an irq domain containing all the doorbell interrupts associated with that VM (each interrupt representing a VPE). Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v4.c | 74 ++++++++++++++++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v4.h | 3 ++ 2 files changed, 77 insertions(+) create mode 100644 drivers/irqchip/irq-gic-v4.c (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c new file mode 100644 index 000000000000..3d4bedea4e78 --- /dev/null +++ b/drivers/irqchip/irq-gic-v4.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016,2017 ARM Limited, All Rights Reserved. + * Author: Marc Zyngier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include +#include +#include + +#include + +static struct irq_domain *gic_domain; +static const struct irq_domain_ops *vpe_domain_ops; + +int its_alloc_vcpu_irqs(struct its_vm *vm) +{ + int vpe_base_irq, i; + + vm->fwnode = irq_domain_alloc_named_id_fwnode("GICv4-vpe", + task_pid_nr(current)); + if (!vm->fwnode) + goto err; + + vm->domain = irq_domain_create_hierarchy(gic_domain, 0, vm->nr_vpes, + vm->fwnode, vpe_domain_ops, + vm); + if (!vm->domain) + goto err; + + for (i = 0; i < vm->nr_vpes; i++) { + vm->vpes[i]->its_vm = vm; + vm->vpes[i]->idai = true; + } + + vpe_base_irq = __irq_domain_alloc_irqs(vm->domain, -1, vm->nr_vpes, + NUMA_NO_NODE, vm, + false, NULL); + if (vpe_base_irq <= 0) + goto err; + + for (i = 0; i < vm->nr_vpes; i++) + vm->vpes[i]->irq = vpe_base_irq + i; + + return 0; + +err: + if (vm->domain) + irq_domain_remove(vm->domain); + if (vm->fwnode) + irq_domain_free_fwnode(vm->fwnode); + + return -ENOMEM; +} + +void its_free_vcpu_irqs(struct its_vm *vm) +{ + irq_domain_free_irqs(vm->vpes[0]->irq, vm->nr_vpes); + irq_domain_remove(vm->domain); + irq_domain_free_fwnode(vm->fwnode); +} diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index e7a93ad4fe97..3dc811dc53da 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -91,4 +91,7 @@ struct its_cmd_info { }; }; +int its_alloc_vcpu_irqs(struct its_vm *vm); +void its_free_vcpu_irqs(struct its_vm *vm); + #endif -- cgit v1.2.3 From eab84318c2811e3f38c080efcc7f709f51bb8370 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2016 15:31:02 +0000 Subject: irqchip/gic-v4: Add VPE command interface Add the required interfaces to schedule a VPE and perform a VINVALL command. Reviewed-by: Thomas Gleixner Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v4.c | 25 +++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v4.h | 2 ++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 3d4bedea4e78..d3f8b7a13d2c 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -72,3 +72,28 @@ void its_free_vcpu_irqs(struct its_vm *vm) irq_domain_remove(vm->domain); irq_domain_free_fwnode(vm->fwnode); } + +static int its_send_vpe_cmd(struct its_vpe *vpe, struct its_cmd_info *info) +{ + return irq_set_vcpu_affinity(vpe->irq, info); +} + +int its_schedule_vpe(struct its_vpe *vpe, bool on) +{ + struct its_cmd_info info; + + WARN_ON(preemptible()); + + info.cmd_type = on ? SCHEDULE_VPE : DESCHEDULE_VPE; + + return its_send_vpe_cmd(vpe, &info); +} + +int its_invall_vpe(struct its_vpe *vpe) +{ + struct its_cmd_info info = { + .cmd_type = INVALL_VPE, + }; + + return its_send_vpe_cmd(vpe, &info); +} diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 3dc811dc53da..6450f3ed101f 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -93,5 +93,7 @@ struct its_cmd_info { int its_alloc_vcpu_irqs(struct its_vm *vm); void its_free_vcpu_irqs(struct its_vm *vm); +int its_schedule_vpe(struct its_vpe *vpe, bool on); +int its_invall_vpe(struct its_vpe *vpe); #endif -- cgit v1.2.3 From f2eac75de435871d5a497f8b557874a2a8a7b264 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 21 Dec 2016 21:50:32 +0000 Subject: irqchip/gic-v4: Add VLPI configuration interface Add the required interfaces to map, unmap and update a VLPI. 
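As a usage illustration only (the its_vlpi_map initialisers below are assumptions made for this sketch, not an API guarantee from this patch), a hypervisor could route a host LPI into a guest and make it deliverable like this:

static int forward_irq_to_guest(int irq, struct its_vm *vm, u32 vintid)
{
	struct its_vlpi_map map = {
		.vm	= vm,		/* assumed field: owning VM */
		.vintid	= vintid,	/* assumed field: virtual INTID */
	};
	int ret;

	ret = its_map_vlpi(irq, &map);	/* also sets IRQ_DISABLE_UNLAZY */
	if (ret)
		return ret;

	/* Enable the VLPI, forcing an INV in the same call */
	ret = its_prop_update_vlpi(irq, LPI_PROP_GROUP1 | LPI_PROP_ENABLED,
				   true);
	if (ret)
		its_unmap_vlpi(irq);

	return ret;
}

Handing the interrupt back to the host is then a single its_unmap_vlpi(irq) call, which also clears IRQ_DISABLE_UNLAZY.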
Reviewed-by: Eric Auger Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v4.c | 42 ++++++++++++++++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v4.h | 4 ++++ 2 files changed, 46 insertions(+) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index d3f8b7a13d2c..6d33828a84a7 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -97,3 +97,45 @@ int its_invall_vpe(struct its_vpe *vpe) return its_send_vpe_cmd(vpe, &info); } + +int its_map_vlpi(int irq, struct its_vlpi_map *map) +{ + struct its_cmd_info info = { + .cmd_type = MAP_VLPI, + .map = map, + }; + + /* + * The host will never see that interrupt firing again, so it + * is vital that we don't do any lazy masking. + */ + irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY); + + return irq_set_vcpu_affinity(irq, &info); +} + +int its_get_vlpi(int irq, struct its_vlpi_map *map) +{ + struct its_cmd_info info = { + .cmd_type = GET_VLPI, + .map = map, + }; + + return irq_set_vcpu_affinity(irq, &info); +} + +int its_unmap_vlpi(int irq) +{ + irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY); + return irq_set_vcpu_affinity(irq, NULL); +} + +int its_prop_update_vlpi(int irq, u8 config, bool inv) +{ + struct its_cmd_info info = { + .cmd_type = inv ? PROP_UPDATE_AND_INV_VLPI : PROP_UPDATE_VLPI, + .config = config, + }; + + return irq_set_vcpu_affinity(irq, &info); +} diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 6450f3ed101f..e22f878ad017 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -95,5 +95,9 @@ int its_alloc_vcpu_irqs(struct its_vm *vm); void its_free_vcpu_irqs(struct its_vm *vm); int its_schedule_vpe(struct its_vpe *vpe, bool on); int its_invall_vpe(struct its_vpe *vpe); +int its_map_vlpi(int irq, struct its_vlpi_map *map); +int its_get_vlpi(int irq, struct its_vlpi_map *map); +int its_unmap_vlpi(int irq); +int its_prop_update_vlpi(int irq, u8 config, bool inv); #endif -- cgit v1.2.3 From 3d63cb53e221d8ab347e94aeac0b5511857beb7f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Dec 2016 15:31:54 +0000 Subject: irqchip/gic-v4: Enable low-level GICv4 operations Get the show on the road... 
Reviewed-by: Thomas Gleixner Signed-off-by: Marc Zyngier --- drivers/irqchip/Makefile | 2 +- drivers/irqchip/irq-gic-v3-its.c | 3 ++- drivers/irqchip/irq-gic-v4.c | 13 +++++++++++++ include/linux/irqchip/arm-gic-v4.h | 2 ++ 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 2c630574986f..845abc107ad5 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -28,7 +28,7 @@ obj-$(CONFIG_ARM_GIC_PM) += irq-gic-pm.o obj-$(CONFIG_ARCH_REALVIEW) += irq-gic-realview.o obj-$(CONFIG_ARM_GIC_V2M) += irq-gic-v2m.o obj-$(CONFIG_ARM_GIC_V3) += irq-gic-v3.o irq-gic-common.o -obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-gic-v3-its-platform-msi.o +obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-gic-v3-its-platform-msi.o irq-gic-v4.o obj-$(CONFIG_PARTITION_PERCPU) += irq-partition-percpu.o obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o obj-$(CONFIG_ARM_NVIC) += irq-nvic.o diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index d79bfdbd506b..a93816cb2a98 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3226,7 +3226,8 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, has_v4 |= its->is_v4; if (has_v4 & rdists->has_vlpis) { - if (its_init_vpe_domain()) { + if (its_init_vpe_domain() || + its_init_v4(parent_domain, &its_vpe_domain_ops)) { rdists->has_vlpis = false; pr_err("ITS: Disabling GICv4 support\n"); } diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 8eb2ad39322e..2370e6d9e603 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -210,3 +210,16 @@ int its_prop_update_vlpi(int irq, u8 config, bool inv) return irq_set_vcpu_affinity(irq, &info); } + +int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *ops) +{ + if (domain) { + pr_info("ITS: Enabling GICv4 support\n"); + gic_domain = domain; + vpe_domain_ops = ops; + return 0; + } + + pr_err("ITS: No GICv4 VPE domain allocated\n"); + return -ENODEV; +} diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index e22f878ad017..58a4d89aa82c 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -100,4 +100,6 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map); int its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); +int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *ops); + #endif -- cgit v1.2.3 From 4bdf502517288662d883fbaa915874790f51a2cd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 25 Jun 2017 14:10:46 +0100 Subject: irqchip/gic-v3: Advertise GICv4 support to KVM As KVM needs to know about the availability of GICv4 to enable direct injection of interrupts, let's advertise the feature in the gic_kvm_info structure. 
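On the consuming side, KVM (or any other interested party) can then gate direct VLPI injection on this flag. A minimal sketch, with the surrounding KVM plumbing assumed rather than shown:

static bool gicv4_can_direct_inject(void)
{
	const struct gic_kvm_info *info = gic_get_kvm_info();

	/* has_v4 mirrors the redistributors' VLPI capability */
	return info && info->has_v4;
}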
Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 2 ++ include/linux/irqchip/arm-gic-common.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 65fabd5f2ec6..cc968eae6c36 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1159,6 +1159,7 @@ static void __init gic_of_setup_kvm_info(struct device_node *node) if (!ret) gic_v3_kvm_info.vcpu = r; + gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; gic_set_kvm_info(&gic_v3_kvm_info); } @@ -1452,6 +1453,7 @@ static void __init gic_acpi_setup_kvm_info(void) vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1; } + gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; gic_set_kvm_info(&gic_v3_kvm_info); } diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h index c647b0547bcd..0a83b4379f34 100644 --- a/include/linux/irqchip/arm-gic-common.h +++ b/include/linux/irqchip/arm-gic-common.h @@ -27,6 +27,8 @@ struct gic_kvm_info { unsigned int maint_irq; /* Virtual control interface */ struct resource vctrl; + /* vlpi support */ + bool has_v4; }; const struct gic_kvm_info *gic_get_kvm_info(void); -- cgit v1.2.3 From b5f515735bea4ae71c248aea3e049073f8852889 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Thu, 31 Aug 2017 11:09:45 -0400 Subject: ext4: avoid Y2038 overflow in recently_deleted() Avoid a 32-bit time overflow in recently_deleted() since i_dtime (inode deletion time) is stored only as a 32-bit value on disk. Since i_dtime isn't used for much beyond a boolean value in e2fsck and is otherwise only used in this function in the kernel, there is no benefit to use more space in the inode for this field on disk. Instead, compare only the relative deletion time with the low 32 bits of the time using the newly-added time_before32() helper, which is similar to time_before() and time_after() for jiffies. Increase RECENTCY_DIRTY to 300s based on Ted's comments about usage experience at Google. Signed-off-by: Andreas Dilger Signed-off-by: Theodore Ts'o Reviewed-by: Arnd Bergmann --- fs/ext4/ialloc.c | 19 +++++++++++++------ include/linux/time.h | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fb83a36b9723..71e93a23cec3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -692,16 +692,17 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * somewhat arbitrary...) */ #define RECENTCY_MIN 5 -#define RECENTCY_DIRTY 30 +#define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; - unsigned long dtime, now; - int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - int offset, ret = 0, recentcy = RECENTCY_MIN; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) @@ -718,12 +719,18 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. 
long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); - now = get_seconds(); + now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; - if (dtime && (dtime < now) && (now < dtime + recentcy)) + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); diff --git a/include/linux/time.h b/include/linux/time.h index 4abb32d4c6b8..3877136bbdf8 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -285,4 +285,19 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its) return true; } +/** + * time_after32 - compare two 32-bit relative times + * @a: the time which may be after @b + * @b: the time which may be before @a + * + * time_after32(a, b) returns true if the time @a is after time @b. + * time_before32(b, a) returns true if the time @b is before time @a. + * + * Similar to time_after(), compare two 32-bit timestamps for relative + * times. This is useful for comparing 32-bit seconds values that can't + * be converted to 64-bit values (e.g. due to disk format or wire protocol + * issues) when it is known that the times are less than 68 years apart. + */ +#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0) +#define time_before32(b, a) time_after32(a, b) #endif -- cgit v1.2.3 From a27bfcab5c1c3a0df61b68e85fc5e4cade83559f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 31 Aug 2017 09:47:22 -0600 Subject: genalloc: Fix an incorrect kerneldoc comment The kerneldoc comment for the genpool_algo_t typedef was incomplete and incorrectly formatted, leading to a raft of warnings during the docs build. Fix it appropriately. Signed-off-by: Jonathan Corbet --- include/linux/genalloc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 29d4385903d4..6dfec4d638df 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -38,12 +38,13 @@ struct device_node; struct gen_pool; /** - * Allocation callback function type definition + * typedef genpool_algo_t: Allocation callback function type definition * @map: Pointer to bitmap * @size: The bitmap size in bits * @start: The bitnumber to start searching at * @nr: The number of zeroed bits we're looking for - * @data: optional additional data used by @genpool_algo_t + * @data: optional additional data used by the callback + * @pool: the pool being allocated from */ typedef unsigned long (*genpool_algo_t)(unsigned long *map, unsigned long size, -- cgit v1.2.3 From eb3c74de28b24f5a36d12d6c84f1fceb25d12c4f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 28 Aug 2017 19:02:41 +0800 Subject: usb: phy: Avoid unchecked dereference warning Move the USB phy NULL checking before issuing usb_phy_set_charger_current() to avoid unchecked dereference warning. 
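The effect of the reordering is that a NULL phy now bails out before the charger-current update can dereference it. Illustrative calls only (500 mA is just an example budget):

/* Both calls are safe after this patch, with or without a phy */
usb_phy_set_power(NULL, 500);	/* no phy: returns 0 immediately */
usb_phy_set_power(x, 500);	/* updates charger current, then set_power() */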
Signed-off-by: Baolin Wang Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/phy.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index de881b171ba9..8c6914873a16 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -322,9 +322,12 @@ static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { + if (!x) + return 0; + usb_phy_set_charger_current(x, mA); - if (x && x->set_power) + if (x->set_power) return x->set_power(x, mA); return 0; } -- cgit v1.2.3 From 2eb7954809bf26de27bc3a2fea4eef606bbf4482 Mon Sep 17 00:00:00 2001 From: Jaghathiswari Rankappagounder Natarajan Date: Wed, 30 Aug 2017 16:34:33 -0700 Subject: drivers: w1: add hwmon support structures This patch has changes to w1.h/w1.c generic files to add (optional) hwmon support structures. Signed-off-by: Jaghathiswari Rankappagounder Natarajan Acked-by: Evgeniy Polyakov Acked-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/w1/w1.c | 18 +++++++++++++++++- include/linux/w1.h | 4 ++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index 3c76e1ca4b83..0c2a5a8327bd 100644 --- a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -649,9 +650,24 @@ static int w1_family_notify(unsigned long action, struct w1_slave *sl) return err; } } - + if (IS_REACHABLE(CONFIG_HWMON) && fops->chip_info) { + struct device *hwmon + = hwmon_device_register_with_info(&sl->dev, + "w1_slave_temp", sl, + fops->chip_info, + NULL); + if (IS_ERR(hwmon)) { + dev_warn(&sl->dev, + "could not create hwmon device\n"); + } else { + sl->hwmon = hwmon; + } + } break; case BUS_NOTIFY_DEL_DEVICE: + if (IS_REACHABLE(CONFIG_HWMON) && fops->chip_info && + sl->hwmon) + hwmon_device_unregister(sl->hwmon); if (fops->remove_slave) sl->family->fops->remove_slave(sl); if (fops->groups) diff --git a/include/linux/w1.h b/include/linux/w1.h index 90cbe7e65059..5b2972946dda 100644 --- a/include/linux/w1.h +++ b/include/linux/w1.h @@ -68,6 +68,7 @@ struct w1_reg_num { * @family: module for device family type * @family_data: pointer for use by the family module * @dev: kernel device identifier + * @hwmon: pointer to hwmon device * */ struct w1_slave { @@ -83,6 +84,7 @@ struct w1_slave { struct w1_family *family; void *family_data; struct device dev; + struct device *hwmon; }; typedef void (*w1_slave_found_callback)(struct w1_master *, u64); @@ -250,11 +252,13 @@ void w1_remove_master_device(struct w1_bus_master *master); * @add_slave: add_slave * @remove_slave: remove_slave * @groups: sysfs group + * @chip_info: pointer to struct hwmon_chip_info */ struct w1_family_ops { int (*add_slave)(struct w1_slave *sl); void (*remove_slave)(struct w1_slave *sl); const struct attribute_group **groups; + const struct hwmon_chip_info *chip_info; }; /** -- cgit v1.2.3 From e1eb899b45781b9bb77e6d7772d6e67bb0bc1a18 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 25 Aug 2017 09:14:43 +0200 Subject: drm/amdgpu: add IOCTL interface for per VM BOs v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the IOCTL interface so that applications can allocate per VM BOs. Still WIP since not all corner cases are tested yet, but this reduces average CS overhead for 10K BOs from 21ms down to 48us. 
v2: add some extra checks, remove the WIP tag v3: rename new flag to AMDGPU_GEM_CREATE_VM_ALWAYS_VALID Signed-off-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 63 ++++++++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c | 3 +- include/uapi/drm/amdgpu_drm.h | 2 + 5 files changed, 55 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 103635ab784c..5809f55e0d9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -455,9 +455,10 @@ struct amdgpu_sa_bo { */ void amdgpu_gem_force_release(struct amdgpu_device *adev); int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, - int alignment, u32 initial_domain, - u64 flags, bool kernel, - struct drm_gem_object **obj); + int alignment, u32 initial_domain, + u64 flags, bool kernel, + struct reservation_object *resv, + struct drm_gem_object **obj); int amdgpu_mode_dumb_create(struct drm_file *file_priv, struct drm_device *dev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c index 9afa9c097e1f..b6cb276f0a70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c @@ -149,7 +149,7 @@ static int amdgpufb_create_pinned_object(struct amdgpu_fbdev *rfbdev, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | AMDGPU_GEM_CREATE_VRAM_CLEARED, - true, &gobj); + true, NULL, &gobj); if (ret) { pr_err("failed to allocate framebuffer (%d)\n", aligned_size); return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index e32a2b55b54f..f1e61b3df640 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -44,11 +44,12 @@ void amdgpu_gem_object_free(struct drm_gem_object *gobj) } int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, - int alignment, u32 initial_domain, - u64 flags, bool kernel, - struct drm_gem_object **obj) + int alignment, u32 initial_domain, + u64 flags, bool kernel, + struct reservation_object *resv, + struct drm_gem_object **obj) { - struct amdgpu_bo *robj; + struct amdgpu_bo *bo; int r; *obj = NULL; @@ -59,7 +60,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, retry: r = amdgpu_bo_create(adev, size, alignment, kernel, initial_domain, - flags, NULL, NULL, 0, &robj); + flags, NULL, resv, 0, &bo); if (r) { if (r != -ERESTARTSYS) { if (initial_domain == AMDGPU_GEM_DOMAIN_VRAM) { @@ -71,7 +72,7 @@ retry: } return r; } - *obj = &robj->gem_base; + *obj = &bo->gem_base; return 0; } @@ -119,6 +120,10 @@ int amdgpu_gem_object_open(struct drm_gem_object *obj, if (mm && mm != current->mm) return -EPERM; + if (abo->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID && + abo->tbo.resv != vm->root.base.bo->tbo.resv) + return -EPERM; + r = amdgpu_bo_reserve(abo, false); if (r) return r; @@ -142,13 +147,14 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj, struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_list_entry vm_pd; - struct list_head list; + struct list_head list, duplicates; struct ttm_validate_buffer tv; struct ww_acquire_ctx ticket; struct amdgpu_bo_va *bo_va; int r; INIT_LIST_HEAD(&list); + INIT_LIST_HEAD(&duplicates); tv.bo = &bo->tbo; tv.shared = true; @@ 
-156,7 +162,7 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj, amdgpu_vm_get_pd_bo(vm, &list, &vm_pd); - r = ttm_eu_reserve_buffers(&ticket, &list, false, NULL); + r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates); if (r) { dev_err(adev->dev, "leaking bo va because " "we fail to reserve bo (%d)\n", r); @@ -191,9 +197,12 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_vm *vm = &fpriv->vm; union drm_amdgpu_gem_create *args = data; uint64_t flags = args->in.domain_flags; uint64_t size = args->in.bo_size; + struct reservation_object *resv = NULL; struct drm_gem_object *gobj; uint32_t handle; int r; @@ -202,7 +211,8 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | AMDGPU_GEM_CREATE_NO_CPU_ACCESS | AMDGPU_GEM_CREATE_CPU_GTT_USWC | - AMDGPU_GEM_CREATE_VRAM_CLEARED)) + AMDGPU_GEM_CREATE_VRAM_CLEARED | + AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)) return -EINVAL; /* reject invalid gem domains */ @@ -229,9 +239,25 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, } size = roundup(size, PAGE_SIZE); + if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) { + r = amdgpu_bo_reserve(vm->root.base.bo, false); + if (r) + return r; + + resv = vm->root.base.bo->tbo.resv; + } + r = amdgpu_gem_object_create(adev, size, args->in.alignment, (u32)(0xffffffff & args->in.domains), - flags, false, &gobj); + flags, false, resv, &gobj); + if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) { + if (!r) { + struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj); + + abo->parent = amdgpu_bo_ref(vm->root.base.bo); + } + amdgpu_bo_unreserve(vm->root.base.bo); + } if (r) return r; @@ -273,9 +299,8 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, } /* create a gem object to contain this object in */ - r = amdgpu_gem_object_create(adev, args->size, 0, - AMDGPU_GEM_DOMAIN_CPU, 0, - 0, &gobj); + r = amdgpu_gem_object_create(adev, args->size, 0, AMDGPU_GEM_DOMAIN_CPU, + 0, 0, NULL, &gobj); if (r) return r; @@ -527,7 +552,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, struct amdgpu_bo_list_entry vm_pd; struct ttm_validate_buffer tv; struct ww_acquire_ctx ticket; - struct list_head list; + struct list_head list, duplicates; uint64_t va_flags; int r = 0; @@ -563,6 +588,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, } INIT_LIST_HEAD(&list); + INIT_LIST_HEAD(&duplicates); if ((args->operation != AMDGPU_VA_OP_CLEAR) && !(args->flags & AMDGPU_VM_PAGE_PRT)) { gobj = drm_gem_object_lookup(filp, args->handle); @@ -579,7 +605,7 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, amdgpu_vm_get_pd_bo(&fpriv->vm, &list, &vm_pd); - r = ttm_eu_reserve_buffers(&ticket, &list, true, NULL); + r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates); if (r) goto error_unref; @@ -645,6 +671,7 @@ error_unref: int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { + struct amdgpu_device *adev = dev->dev_private; struct drm_amdgpu_gem_op *args = data; struct drm_gem_object *gobj; struct amdgpu_bo *robj; @@ -692,6 +719,9 @@ int amdgpu_gem_op_ioctl(struct drm_device *dev, void *data, if (robj->allowed_domains == AMDGPU_GEM_DOMAIN_VRAM) robj->allowed_domains |= AMDGPU_GEM_DOMAIN_GTT; + if (robj->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) + amdgpu_vm_bo_invalidate(adev, robj, true); + amdgpu_bo_unreserve(robj); break; 
default: @@ -721,8 +751,7 @@ int amdgpu_mode_dumb_create(struct drm_file *file_priv, r = amdgpu_gem_object_create(adev, args->size, 0, AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED, - ttm_bo_type_device, - &gobj); + false, NULL, &gobj); if (r) return -ENOMEM; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c index 5b3f92891f89..7e0826469b5e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c @@ -136,7 +136,8 @@ struct dma_buf *amdgpu_gem_prime_export(struct drm_device *dev, { struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj); - if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) + if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) || + bo->flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) return ERR_PTR(-EPERM); return drm_gem_prime_export(dev, gobj, flags); diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 7b8fa11c2285..e055776f2f4c 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -87,6 +87,8 @@ extern "C" { #define AMDGPU_GEM_CREATE_SHADOW (1 << 4) /* Flag that allocating the BO should use linear VRAM */ #define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) +/* Flag that BO is always valid in this VM */ +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) struct drm_amdgpu_gem_create_in { /** the requested memory size */ -- cgit v1.2.3 From 07d79fc7d94e3f884b8b1c95aa615b202bb5e4c1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 30 Aug 2017 14:30:36 -0700 Subject: net_sched: add reverse binding for tc class TC filters when used as classifiers are bound to TC classes. However, there is a hidden difference when adding them in different orders: 1. If we add tc classes before its filters, everything is fine. Logically, the classes exist before we specify their ID's in filters, it is easy to bind them together, just as in the current code base. 2. If we add tc filters before the tc classes they bind, we have to do dynamic lookup in fast path. What's worse, this happens all the time not just once, because on fast path tcf_result is passed on stack, there is no way to propagate back to the one in tc filters. This hidden difference hurts performance silently if we have many tc classes in hierarchy. This patch intends to close this gap by doing the reverse binding when we create a new class, in this case we can actually search all the filters in its parent, match and fixup by classid. And because tcf_result is specific to each type of tc filter, we have to introduce a new ops for each filter to tell how to bind the class. Note, we still can NOT totally get rid of those class lookup in ->enqueue() because cgroup and flow filters have no way to determine the classid at setup time, they still have to go through dynamic lookup. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 1 + net/sched/cls_basic.c | 9 +++++++ net/sched/cls_bpf.c | 9 +++++++ net/sched/cls_flower.c | 9 +++++++ net/sched/cls_fw.c | 9 +++++++ net/sched/cls_matchall.c | 9 +++++++ net/sched/cls_route.c | 9 +++++++ net/sched/cls_rsvp.h | 9 +++++++ net/sched/cls_tcindex.c | 9 +++++++ net/sched/cls_u32.c | 9 +++++++ net/sched/sch_api.c | 68 +++++++++++++++++++++++++++++++++++++++++++++-- 11 files changed, 148 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c30b634c5f82..d6247a3c40df 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -217,6 +217,7 @@ struct tcf_proto_ops { void **, bool); int (*delete)(struct tcf_proto*, void *, bool*); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); + void (*bind_class)(void *, u32, unsigned long); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 73cc7f167a38..d89ebafd2239 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -235,6 +235,14 @@ skip: } } +static void basic_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct basic_filter *f = fh; + + if (f && f->res.classid == classid) + f->res.class = cl; +} + static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { @@ -280,6 +288,7 @@ static struct tcf_proto_ops cls_basic_ops __read_mostly = { .delete = basic_delete, .walk = basic_walk, .dump = basic_dump, + .bind_class = basic_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 6f2dffe30f25..520c5027646a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -607,6 +607,14 @@ nla_put_failure: return -1; } +static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct cls_bpf_prog *prog = fh; + + if (prog && prog->res.classid == classid) + prog->res.class = cl; +} + static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -635,6 +643,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .delete = cls_bpf_delete, .walk = cls_bpf_walk, .dump = cls_bpf_dump, + .bind_class = cls_bpf_bind_class, }; static int __init cls_bpf_init_mod(void) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 3d041d279b92..1a267e77c6de 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1351,6 +1351,14 @@ nla_put_failure: return -1; } +static void fl_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct cls_fl_filter *f = fh; + + if (f && f->res.classid == classid) + f->res.class = cl; +} + static struct tcf_proto_ops cls_fl_ops __read_mostly = { .kind = "flower", .classify = fl_classify, @@ -1361,6 +1369,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .delete = fl_delete, .walk = fl_walk, .dump = fl_dump, + .bind_class = fl_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 192255ec50bd..941245ad07fd 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -412,6 +412,14 @@ nla_put_failure: return -1; } +static void fw_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct fw_filter *f = fh; + + if (f && f->res.classid == classid) + f->res.class = cl; +} + static struct tcf_proto_ops cls_fw_ops __read_mostly = { .kind = "fw", .classify = fw_classify, @@ -422,6 +430,7 @@ static struct tcf_proto_ops cls_fw_ops 
__read_mostly = { .delete = fw_delete, .walk = fw_walk, .dump = fw_dump, + .bind_class = fw_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index d4dc387f7a56..21cc45caf842 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -251,6 +251,14 @@ nla_put_failure: return -1; } +static void mall_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct cls_mall_head *head = fh; + + if (head && head->res.classid == classid) + head->res.class = cl; +} + static struct tcf_proto_ops cls_mall_ops __read_mostly = { .kind = "matchall", .classify = mall_classify, @@ -261,6 +269,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = { .delete = mall_delete, .walk = mall_walk, .dump = mall_dump, + .bind_class = mall_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 3b70982394ce..9ddde65915d2 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -624,6 +624,14 @@ nla_put_failure: return -1; } +static void route4_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct route4_filter *f = fh; + + if (f && f->res.classid == classid) + f->res.class = cl; +} + static struct tcf_proto_ops cls_route4_ops __read_mostly = { .kind = "route", .classify = route4_classify, @@ -634,6 +642,7 @@ static struct tcf_proto_ops cls_route4_ops __read_mostly = { .delete = route4_delete, .walk = route4_walk, .dump = route4_dump, + .bind_class = route4_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 26203ff817f3..98c05db85bcb 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -723,6 +723,14 @@ nla_put_failure: return -1; } +static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct rsvp_filter *f = fh; + + if (f && f->res.classid == classid) + f->res.class = cl; +} + static struct tcf_proto_ops RSVP_OPS __read_mostly = { .kind = RSVP_ID, .classify = rsvp_classify, @@ -733,6 +741,7 @@ static struct tcf_proto_ops RSVP_OPS __read_mostly = { .delete = rsvp_delete, .walk = rsvp_walk, .dump = rsvp_dump, + .bind_class = rsvp_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index fb281b9b2c52..14a7e08b2fa9 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -606,6 +606,14 @@ nla_put_failure: return -1; } +static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct tcindex_filter_result *r = fh; + + if (r && r->res.classid == classid) + r->res.class = cl; +} + static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { .kind = "tcindex", .classify = tcindex_classify, @@ -616,6 +624,7 @@ static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { .delete = tcindex_delete, .walk = tcindex_walk, .dump = tcindex_dump, + .bind_class = tcindex_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 99ea4c74dd5b..10b8d851fc6b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1112,6 +1112,14 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } +static void u32_bind_class(void *fh, u32 classid, unsigned long cl) +{ + struct tc_u_knode *n = fh; + + if (n && n->res.classid == classid) + n->res.class = cl; +} + static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { @@ -1242,6 +1250,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .delete = u32_delete, .walk = u32_walk, .dump 
= u32_dump, + .bind_class = u32_bind_class, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index e7f8e4bfd4ec..929b024f41ba 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -35,6 +35,7 @@ #include #include #include +#include /* @@ -1648,6 +1649,64 @@ static int tclass_del_notify(struct net *net, n->nlmsg_flags & NLM_F_ECHO); } +#ifdef CONFIG_NET_CLS + +struct tcf_bind_args { + struct tcf_walker w; + u32 classid; + unsigned long cl; +}; + +static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) +{ + struct tcf_bind_args *a = (void *)arg; + + if (tp->ops->bind_class) { + tcf_tree_lock(tp); + tp->ops->bind_class(n, a->classid, a->cl); + tcf_tree_unlock(tp); + } + return 0; +} + +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, + unsigned long new_cl) +{ + const struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct tcf_block *block; + struct tcf_chain *chain; + unsigned long cl; + + cl = cops->find(q, portid); + if (!cl) + return; + block = cops->tcf_block(q, cl); + if (!block) + return; + list_for_each_entry(chain, &block->chain_list, list) { + struct tcf_proto *tp; + + for (tp = rtnl_dereference(chain->filter_chain); + tp; tp = rtnl_dereference(tp->next)) { + struct tcf_bind_args arg = {}; + + arg.w.fn = tcf_node_bind; + arg.classid = clid; + arg.cl = new_cl; + tp->ops->walk(tp, &arg.w); + } + } +} + +#else + +static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, + unsigned long new_cl) +{ +} + +#endif + static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -1753,6 +1812,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, break; case RTM_DELTCLASS: err = tclass_del_notify(net, cops, skb, n, q, cl); + /* Unbind the class with flilters with 0 */ + tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); @@ -1767,9 +1828,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, err = -EOPNOTSUPP; if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl); - if (err == 0) + if (err == 0) { tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); - + /* We just create a new class, need to do reverse binding. */ + if (cl != new_cl) + tc_bind_tclass(q, portid, clid, new_cl); + } out: return err; } -- cgit v1.2.3 From e3cfddd577e763496c25ddb855cfff48b1174b63 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 30 Aug 2017 22:18:13 -0700 Subject: bridge: add tracepoint in br_fdb_update This extends bridge fdb table tracepoints to also cover learned fdb entries in the br_fdb_update path. Note that unlike other tracepoints I have moved this to when the fdb is modified because this is in the datapath and can generate a lot of noise in the trace output. br_fdb_update is also called from added_by_user context in the NTF_USE case which is already traced ..hence the !added_by_user check. Signed-off-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/trace/events/bridge.h | 31 +++++++++++++++++++++++++++++++ net/bridge/br_fdb.c | 5 ++++- net/core/net-traces.c | 1 + 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h index 0f1cde00af11..1bee3e7fdf32 100644 --- a/include/trace/events/bridge.h +++ b/include/trace/events/bridge.h @@ -92,6 +92,37 @@ TRACE_EVENT(fdb_delete, __entry->addr[4], __entry->addr[5], __entry->vid) ); +TRACE_EVENT(br_fdb_update, + + TP_PROTO(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr, u16 vid, bool added_by_user), + + TP_ARGS(br, source, addr, vid, added_by_user), + + TP_STRUCT__entry( + __string(br_dev, br->dev->name) + __string(dev, source->dev->name) + __array(unsigned char, addr, ETH_ALEN) + __field(u16, vid) + __field(bool, added_by_user) + ), + + TP_fast_assign( + __assign_str(br_dev, br->dev->name); + __assign_str(dev, source->dev->name); + memcpy(__entry->addr, addr, ETH_ALEN); + __entry->vid = vid; + __entry->added_by_user = added_by_user; + ), + + TP_printk("br_dev %s source %s addr %02x:%02x:%02x:%02x:%02x:%02x vid %u added_by_user %d", + __get_str(br_dev), __get_str(dev), __entry->addr[0], + __entry->addr[1], __entry->addr[2], __entry->addr[3], + __entry->addr[4], __entry->addr[5], __entry->vid, + __entry->added_by_user) +); + + #endif /* _TRACE_BRIDGE_H */ /* This part must be outside protection */ diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index be5e1da6d824..4ea5c8bbe286 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -583,8 +583,10 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, fdb->updated = now; if (unlikely(added_by_user)) fdb->added_by_user = 1; - if (unlikely(fdb_modified)) + if (unlikely(fdb_modified)) { + trace_br_fdb_update(br, source, addr, vid, added_by_user); fdb_notify(br, fdb, RTM_NEWNEIGH); + } } } else { spin_lock(&br->hash_lock); @@ -593,6 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, if (fdb) { if (unlikely(added_by_user)) fdb->added_by_user = 1; + trace_br_fdb_update(br, source, addr, vid, added_by_user); fdb_notify(br, fdb, RTM_NEWNEIGH); } } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 4a0292c97070..1132820c8e62 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -42,6 +42,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); +EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); #endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); -- cgit v1.2.3 From f16ded5948a5e6a399161bd71e866b198952ea5f Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Thu, 31 Aug 2017 13:41:40 +0300 Subject: net: fix two typos in net_device_ops documentation. This patch fixes two trivial typos in net_device_ops documentation, related to ndo_xdp_flush callback. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 29bf06ff157c..35de8312e0b5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1124,8 +1124,8 @@ struct xfrmdev_ops { * This function is used to submit a XDP packet for transmit on a * netdevice. 
* void (*ndo_xdp_flush)(struct net_device *dev); - * This function is used to inform the driver to flush a paticular - * xpd tx queue. Must be called on same CPU as xdp_xmit. + * This function is used to inform the driver to flush a particular + * xdp tx queue. Must be called on same CPU as xdp_xmit. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); -- cgit v1.2.3 From 8fc614c0ae5cb5df11d6aa9559e63baacf20a840 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 31 Aug 2017 14:12:39 -0500 Subject: PCI/AER: Reformat AER register definitions Reformat so comments fit on same line as definition. No functional change intended. Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index c22d3ebaca20..46632aaee1c0 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -733,23 +733,17 @@ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */ #define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */ -/* Correctable Err Reporting Enable */ -#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 -/* Non-fatal Err Reporting Enable */ -#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 -/* Fatal Err Reporting Enable */ -#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 +#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 /* Non-Fatal Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 /* Fatal Err Reporting Enable */ #define PCI_ERR_ROOT_STATUS 48 -#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ -/* Multi ERR_COR Received */ -#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 -/* ERR_FATAL/NONFATAL Received */ -#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 -/* Multi ERR_FATAL/NONFATAL Received */ -#define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008 -#define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First Fatal */ -#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ -#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ +#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ +#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 /* Multiple ERR_COR */ +#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 /* ERR_FATAL/NONFATAL */ +#define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008 /* Multiple FATAL/NONFATAL */ +#define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First UNC is Fatal */ +#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ +#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ #define PCI_ERR_ROOT_ERR_SRC 52 /* Error Source Identification */ /* Virtual Channel */ -- cgit v1.2.3 From 60361e12d01676e23a8de89a5ef4a349ae97f616 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Wed, 30 Aug 2017 05:36:38 -0500 Subject: ftrace: Fix debug preempt config name in stack_tracer_{en,dis}able stack_tracer_disable()/stack_tracer_enable() had been using the wrong name for the config symbol to enable their preempt-debugging checks -- fix with a word swap. 
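The misspelling was silent because IS_ENABLED() does not require its argument to be a known Kconfig symbol: a defined symbol (y or m) evaluates to 1, and anything else, including a name that matches no Kconfig entry at all, evaluates to 0, so the guarded check was simply compiled out. A minimal sketch of the pattern in kernel context (my_debug_check() is a hypothetical helper, not from the patch):

#include <linux/kconfig.h>	/* IS_ENABLED() */
#include <linux/bug.h>		/* WARN_ON_ONCE() */
#include <linux/preempt.h>	/* preempt_count() */
#include <linux/irqflags.h>	/* irqs_disabled() */

/* IS_ENABLED(CONFIG_FOO) is 1 for FOO=y or FOO=m, else 0 -- even when
 * CONFIG_FOO is a typo that exists nowhere in Kconfig, which is what
 * hid this bug: the WARN_ON_ONCE() below vanished with no diagnostic. */
static inline void my_debug_check(void)
{
	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT))	/* must name a real symbol */
		WARN_ON_ONCE(!preempt_count() || !irqs_disabled());
}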
Link: http://lkml.kernel.org/r/20170831154036.4xldyakmmhuts5x7@hatter.bewilderbeest.net Cc: stable@vger.kernel.org Fixes: 8aaf1ee70e ("tracing: Rename trace_active to disable_stack_tracer and inline its modification") Signed-off-by: Zev Weiss Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6383115e9d2c..2e028854bac7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -307,7 +307,7 @@ DECLARE_PER_CPU(int, disable_stack_tracer); static inline void stack_tracer_disable(void) { /* Preemption or interupts must be disabled */ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); this_cpu_inc(disable_stack_tracer); } @@ -320,7 +320,7 @@ static inline void stack_tracer_disable(void) */ static inline void stack_tracer_enable(void) { - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); this_cpu_dec(disable_stack_tracer); } -- cgit v1.2.3 From 89e4fdecb51cf5535867026274bc97de9480ade5 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 24 Aug 2017 00:03:43 -0700 Subject: loop: add ioctl for changing logical block size This is a different approach from the first attempt in f2c6df7dbf9a ("loop: support 4k physical blocksize"). Rather than extending LOOP_{GET,SET}_STATUS, add a separate ioctl just for setting the block size. Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- drivers/block/loop.c | 24 ++++++++++++++++++++++++ include/uapi/linux/loop.h | 1 + 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e3f190016d4f..ac106b287d75 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1047,6 +1047,7 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); + blk_queue_logical_block_size(lo->lo_queue, 512); if (bdev) { bdput(bdev); invalidate_bdev(bdev); @@ -1330,6 +1331,24 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) return error; } +static int loop_set_block_size(struct loop_device *lo, unsigned long arg) +{ + if (lo->lo_state != Lo_bound) + return -ENXIO; + + if (arg < 512 || arg > PAGE_SIZE || !is_power_of_2(arg)) + return -EINVAL; + + blk_mq_freeze_queue(lo->lo_queue); + + blk_queue_logical_block_size(lo->lo_queue, arg); + loop_update_dio(lo); + + blk_mq_unfreeze_queue(lo->lo_queue); + + return 0; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1378,6 +1397,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_dio(lo, arg); break; + case LOOP_SET_BLOCK_SIZE: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_block_size(lo, arg); + break; default: err = lo->ioctl ? 
lo->ioctl(lo, cmd, arg) : -EINVAL; } diff --git a/include/uapi/linux/loop.h b/include/uapi/linux/loop.h index c8125ec1f4f2..23158dbe2424 100644 --- a/include/uapi/linux/loop.h +++ b/include/uapi/linux/loop.h @@ -88,6 +88,7 @@ struct loop_info64 { #define LOOP_CHANGE_FD 0x4C06 #define LOOP_SET_CAPACITY 0x4C07 #define LOOP_SET_DIRECT_IO 0x4C08 +#define LOOP_SET_BLOCK_SIZE 0x4C09 /* /dev/loop-control interface */ #define LOOP_CTL_ADD 0x4C80 -- cgit v1.2.3 From 4f59c718521a0f00b6589da6b8fcea2dc296026d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Jul 2017 11:40:39 -0400 Subject: teach SYSCALL_DEFINE/COMPAT_SYSCALL_DEFINE to handle __bitwise arguments Signed-off-by: Al Viro --- arch/s390/include/asm/compat.h | 5 +++-- include/linux/compat.h | 2 +- include/linux/syscalls.h | 9 +++++---- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index b9300f8aee10..07a82bc933a7 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -8,11 +8,12 @@ #include #include -#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p(typeof(0?(t)0:0ULL), u64)) +#define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ + typeof(0?(__force t)0:0ULL), u64)) #define __SC_DELOUSE(t,v) ({ \ BUILD_BUG_ON(sizeof(t) > 4 && !__TYPE_IS_PTR(t)); \ - (t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ + (__force t)(__TYPE_IS_PTR(t) ? ((v) & 0x7fffffff) : (v)); \ }) #define PSW32_MASK_PER 0x40000000UL diff --git a/include/linux/compat.h b/include/linux/compat.h index 5a6a109b4a50..e5d3fbe24f7d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -27,7 +27,7 @@ #endif #ifndef __SC_DELOUSE -#define __SC_DELOUSE(t,v) ((t)(unsigned long)(v)) +#define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v)) #endif #define COMPAT_SYSCALL_DEFINE0(name) \ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3cb15ea48aee..0bc1d2e8cc17 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -100,11 +100,12 @@ union bpf_attr; #define __MAP(n,...) __MAP##n(__VA_ARGS__) #define __SC_DECL(t, a) t a -#define __TYPE_IS_L(t) (__same_type((t)0, 0L)) -#define __TYPE_IS_UL(t) (__same_type((t)0, 0UL)) -#define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL)) +#define __TYPE_AS(t, v) __same_type((__force t)0, v) +#define __TYPE_IS_L(t) (__TYPE_AS(t, 0L)) +#define __TYPE_IS_UL(t) (__TYPE_AS(t, 0UL)) +#define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL)) #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a -#define __SC_CAST(t, a) (t) a +#define __SC_CAST(t, a) (__force t) a #define __SC_ARGS(t, a) a #define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long)) -- cgit v1.2.3 From ddef7ed2b5cbafae692d1d580bb5a07808926a9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Jul 2017 18:58:37 +0200 Subject: annotate RWF_... 
flags [AV: added missing annotations in syscalls.h/compat.h] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/nfsd/vfs.c | 2 +- fs/read_write.c | 50 ++++++++++++++++++++++---------------------- include/linux/compat.h | 16 ++++++++++++-- include/linux/fs.h | 12 ++++++----- include/linux/syscalls.h | 4 ++-- include/uapi/linux/aio_abi.h | 21 ++++++++++--------- include/uapi/linux/fs.h | 28 ++++++++++++++++++------- 7 files changed, 80 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 38d0383dc7f9..bc69d40c4e8b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -969,7 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; - int flags = 0; + rwf_t flags = 0; if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* diff --git a/fs/read_write.c b/fs/read_write.c index 0cc7033aa413..61b58c7b6531 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -33,7 +33,7 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -static inline int unsigned_offsets(struct file *file) +static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } @@ -633,7 +633,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; @@ -655,7 +655,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; @@ -871,7 +871,7 @@ out: #endif static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -899,7 +899,7 @@ out: } ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->read_iter) return -EINVAL; @@ -908,7 +908,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -935,7 +935,7 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, } ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->write_iter) return -EINVAL; @@ -944,7 +944,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_write); ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -962,7 +962,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_readv); ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -981,7 +981,7 @@ ssize_t vfs_writev(struct 
file *file, const struct iovec __user *vec, EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1001,7 +1001,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1027,7 +1027,7 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1050,7 +1050,7 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1094,7 +1094,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1114,7 +1114,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1127,7 +1127,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, #ifdef CONFIG_COMPAT static size_t compat_readv(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1147,7 +1147,7 @@ static size_t compat_readv(struct file *file, static size_t do_compat_readv(compat_ulong_t fd, const struct compat_iovec __user *vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1173,7 +1173,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, static long do_compat_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1211,7 +1211,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_preadv64(fd, vec, vlen, pos, flags); } @@ -1220,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, - int, flags) + rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; @@ -1232,7 +1232,7 @@ COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, - unsigned 
long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1254,7 +1254,7 @@ static size_t compat_writev(struct file *file, static size_t do_compat_writev(compat_ulong_t fd, const struct compat_iovec __user* vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1279,7 +1279,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, static long do_compat_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1317,7 +1317,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_pwritev64(fd, vec, vlen, pos, flags); } @@ -1325,7 +1325,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; diff --git a/include/linux/compat.h b/include/linux/compat.h index e5d3fbe24f7d..3fc433303d7a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -365,10 +365,10 @@ asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd, compat_ulong_t vlen, u32 pos_low, u32 pos_high); asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd, const struct compat_iovec __user *vec, - compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags); + compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd, const struct compat_iovec __user *vec, - compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags); + compat_ulong_t vlen, u32 pos_low, u32 pos_high, rwf_t flags); #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 asmlinkage long compat_sys_preadv64(unsigned long fd, @@ -382,6 +382,18 @@ asmlinkage long compat_sys_pwritev64(unsigned long fd, unsigned long vlen, loff_t pos); #endif +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 +asmlinkage long compat_sys_readv64v2(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos, rwf_t flags); +#endif + +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +asmlinkage long compat_sys_pwritev64v2(unsigned long fd, + const struct compat_iovec __user *vec, + unsigned long vlen, loff_t pos, rwf_t flags); +#endif + asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, diff --git a/include/linux/fs.h b/include/linux/fs.h index cbfe127bccf8..2625fc47c7e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -72,6 +72,8 @@ extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; +typedef __kernel_rwf_t rwf_t; + struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); @@ -1758,9 +1760,9 @@ extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *) extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t 
*); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, - unsigned long, loff_t *, int); + unsigned long, loff_t *, rwf_t); extern ssize_t vfs_writev(struct file *, const struct iovec __user *, - unsigned long, loff_t *, int); + unsigned long, loff_t *, rwf_t); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, @@ -2874,9 +2876,9 @@ extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags); + rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags); + rwf_t flags); /* fs/block_dev.c */ extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to); @@ -3143,7 +3145,7 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) { if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0bc1d2e8cc17..138c94535864 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -579,12 +579,12 @@ asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, unsigned long pos_l, unsigned long pos_h); asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, unsigned long pos_l, unsigned long pos_h, - int flags); + rwf_t flags); asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, unsigned long pos_l, unsigned long pos_h); asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, unsigned long pos_l, unsigned long pos_h, - int flags); + rwf_t flags); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode); asmlinkage long sys_chdir(const char __user *filename); diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h index a2d4a8ac94ca..a04adbc70ddf 100644 --- a/include/uapi/linux/aio_abi.h +++ b/include/uapi/linux/aio_abi.h @@ -28,6 +28,7 @@ #define __LINUX__AIO_ABI_H #include +#include #include typedef __kernel_ulong_t aio_context_t; @@ -62,14 +63,6 @@ struct io_event { __s64 res2; /* secondary result */ }; -#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) -#define PADDED(x,y) x, y -#elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) -#define PADDED(x,y) y, x -#else -#error edit for your odd byteorder. -#endif - /* * we always use a 64bit off_t when communicating * with userland. its up to libraries to do the @@ -79,8 +72,16 @@ struct io_event { struct iocb { /* these are internal to the kernel/libc. */ __u64 aio_data; /* data to be returned in event's data */ - __u32 PADDED(aio_key, aio_rw_flags); - /* the kernel sets aio_key to the req # */ + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) + __u32 aio_key; /* the kernel sets aio_key to the req # */ + __kernel_rwf_t aio_rw_flags; /* RWF_* flags */ +#elif defined(__BYTE_ORDER) ? 
__BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) + __kernel_rwf_t aio_rw_flags; /* RWF_* flags */ + __u32 aio_key; /* the kernel sets aio_key to the req # */ +#else +#error edit for your odd byteorder. +#endif /* common fields */ __u16 aio_lio_opcode; /* see IOCB_CMD_ above */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7495d05e8de..56235dddea7d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -358,13 +358,25 @@ struct fscrypt_key { #define SYNC_FILE_RANGE_WRITE 2 #define SYNC_FILE_RANGE_WAIT_AFTER 4 -/* flags for preadv2/pwritev2: */ -#define RWF_HIPRI 0x00000001 /* high priority request, poll if possible */ -#define RWF_DSYNC 0x00000002 /* per-IO O_DSYNC */ -#define RWF_SYNC 0x00000004 /* per-IO O_SYNC */ -#define RWF_NOWAIT 0x00000008 /* per-IO, return -EAGAIN if operation would block */ - -#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC |\ - RWF_NOWAIT) +/* + * Flags for preadv2/pwritev2: + */ + +typedef int __bitwise __kernel_rwf_t; + +/* high priority request, poll if possible */ +#define RWF_HIPRI ((__force __kernel_rwf_t)0x00000001) + +/* per-IO O_DSYNC */ +#define RWF_DSYNC ((__force __kernel_rwf_t)0x00000002) + +/* per-IO O_SYNC */ +#define RWF_SYNC ((__force __kernel_rwf_t)0x00000004) + +/* per-IO, return -EAGAIN if operation would block */ +#define RWF_NOWAIT ((__force __kernel_rwf_t)0x00000008) + +/* mask of flags supported by the kernel */ +#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT) #endif /* _UAPI_LINUX_FS_H */ -- cgit v1.2.3 From f58e76c1c551c7577b25a6fe493d82f5214331b7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Aug 2017 15:29:10 -0700 Subject: <linux/uaccess.h>: Fix copy_in_user() declaration copy_in_user() copies data from user-space address @from to user-space address @to. Hence declare both @from and @to as user-space pointers. Fixes: commit d597580d3737 ("generic ...copy_..._user primitives") Signed-off-by: Bart Van Assche Cc: Signed-off-by: Al Viro --- include/linux/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index acdd6f915a8d..20ef8e6ec2db 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -156,7 +156,7 @@ copy_to_user(void __user *to, const void *from, unsigned long n) } #ifdef CONFIG_COMPAT static __always_inline unsigned long __must_check -copy_in_user(void __user *to, const void *from, unsigned long n) +copy_in_user(void __user *to, const void __user *from, unsigned long n) { might_fault(); if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n)) -- cgit v1.2.3 From 1797f5b3cf0b3a73c42b89f7a8fd897417373730 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 31 Aug 2017 17:59:12 +0200 Subject: devlink: Add IPv6 header for dpipe This will be used by the IPv6 host table which will be introduced in the following patches. The fields in the header are added as they are used. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index aaf7178127a2..b9654e133599 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -330,6 +330,7 @@ int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match); extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; +extern struct devlink_dpipe_header devlink_dpipe_header_ipv6; #else diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6c172548589d..0cbca96c66b9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -234,9 +234,14 @@ enum devlink_dpipe_field_ipv4_id { DEVLINK_DPIPE_FIELD_IPV4_DST_IP, }; +enum devlink_dpipe_field_ipv6_id { + DEVLINK_DPIPE_FIELD_IPV6_DST_IP, +}; + enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_ETHERNET, DEVLINK_DPIPE_HEADER_IPV4, + DEVLINK_DPIPE_HEADER_IPV6, }; #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index cbc4b0461b0f..7d430c1d9c3e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -63,6 +63,23 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { }; EXPORT_SYMBOL(devlink_dpipe_header_ipv4); +static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { + { + .name = "destination ip", + .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP, + .bitwidth = 128, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { + .name = "ipv6", + .id = DEVLINK_DPIPE_HEADER_IPV6, + .fields = devlink_dpipe_fields_ipv6, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), + .global = true, +}; +EXPORT_SYMBOL(devlink_dpipe_header_ipv6); + EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From 065e63f951432068ba89a844fcbff68ea16ee186 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 31 Aug 2017 17:03:47 -0400 Subject: tracing: Only have rmmod clear buffers that its events were active in Currently, if any of a module's events were ever enabled, removing that module clears all ring buffers. This is to prevent another module from being loaded and having one of its trace event IDs reuse a trace event ID of the removed module. This could cause undesirable effects, as the trace event of the new module would be using its own processing algorithms to process raw data of another event. To prevent this, when a module is removed, if any of its events have been used (signified by the WAS_ENABLED event call flag, which is never cleared), all ring buffers are cleared, just in case any one of them contains event data of the removed event. The problem is, there is no reason to clear all ring buffers if only one buffer (or fewer than all of them) recorded one of the events. Instead, only clear the ring buffers that recorded the events of the module that is being removed. To do this, instead of keeping the WAS_ENABLED flag with the trace event call, move it to the per-instance (per ring buffer) event file descriptor. The event file descriptor maps each event to a separate ring buffer instance. Then, when the module is removed, only the ring buffers that activated one of the module's events get cleared. The rest are not touched.
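In sketch form (condensed from the hunks below; locking and the event-unregistration details are omitted), the rmmod path now marks only the trace instances whose event files saw the module's events, and the shared reset helper skips everything else:

/* On module removal: flag each trace instance (ring buffer) in which
 * one of the module's events was ever enabled. */
do_for_each_event_file(tr, file) {
	if (file->event_call != call)
		continue;
	if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
		tr->clear_trace = true;	/* this instance holds module data */
	ftrace_event_enable_disable(file, 0);
} while_for_each_event_file();

/* Later, tracing_reset_all_online_cpus() resets only flagged instances. */
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
	if (!tr->clear_trace)
		continue;
	tr->clear_trace = false;
	tracing_reset_online_cpus(&tr->trace_buffer);
}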
Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 8 +++----- kernel/trace/trace.c | 3 +++ kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 15 +++++++-------- 4 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 536c80ff7ad9..3702b9cb5dc8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -217,7 +217,6 @@ enum { TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, - TRACE_EVENT_FL_WAS_ENABLED_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, @@ -229,9 +228,6 @@ enum { * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For trace internal events, do not enable with debugfs file - * WAS_ENABLED - Set and stays set when an event was ever enabled - * (used for module unloading, if a module event is enabled, - * it is best to clear the buffers that used it). * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe @@ -241,7 +237,6 @@ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), - TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), @@ -306,6 +301,7 @@ enum { EVENT_FILE_FL_TRIGGER_MODE_BIT, EVENT_FILE_FL_TRIGGER_COND_BIT, EVENT_FILE_FL_PID_FILTER_BIT, + EVENT_FILE_FL_WAS_ENABLED_BIT, }; /* @@ -321,6 +317,7 @@ enum { * TRIGGER_MODE - When set, invoke the triggers associated with the event * TRIGGER_COND - When set, one or more triggers has an associated filter * PID_FILTER - When set, the event is filtered based on pid + * WAS_ENABLED - Set when enabled to know to clear trace on module removal */ enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), @@ -333,6 +330,7 @@ enum { EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT), EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT), EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT), + EVENT_FILE_FL_WAS_ENABLED = (1 << EVENT_FILE_FL_WAS_ENABLED_BIT), }; struct trace_event_file { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 44004d8aa3b3..30338a835a51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1702,6 +1702,9 @@ void tracing_reset_all_online_cpus(void) struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->clear_trace) + continue; + tr->clear_trace = false; tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE tracing_reset_online_cpus(&tr->max_buffer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 490ba229931d..fb5d54d0d1b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -245,6 +245,7 @@ struct trace_array { int stop_count; int clock_id; int nr_topts; + bool clear_trace; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 36132f9280e6..c93540c5df21 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -466,7 
+466,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ - call->flags |= TRACE_EVENT_FL_WAS_ENABLED; + set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); } break; } @@ -2058,6 +2058,10 @@ static void event_remove(struct trace_event_call *call) do_for_each_event_file(tr, file) { if (file->event_call != call) continue; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + ftrace_event_enable_disable(file, 0); /* * The do_for_each_event_file() is @@ -2396,15 +2400,11 @@ static void trace_module_add_events(struct module *mod) static void trace_module_remove_events(struct module *mod) { struct trace_event_call *call, *p; - bool clear_trace = false; down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { - if (call->mod == mod) { - if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) - clear_trace = true; + if (call->mod == mod) __trace_remove_event_call(call); - } } up_write(&trace_event_sem); @@ -2416,8 +2416,7 @@ static void trace_module_remove_events(struct module *mod) * over from this module may be passed to the new module events and * unexpected results may occur. */ - if (clear_trace) - tracing_reset_all_online_cpus(); + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, -- cgit v1.2.3 From 5deb67f77a266010e2c10fb124b7516d0d258ce8 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 31 Aug 2017 12:27:09 +0100 Subject: libnvdimm, nd_blk: remove mmio_flush_range() mmio_flush_range() suffers from a lack of clearly-defined semantics, and is somewhat ambiguous to port to other architectures where the scope of the writeback implied by "flush" and ordering might matter, but MMIO would tend to imply non-cacheable anyway. Per the rationale in 67a3e8fe9015 ("nd_blk: change aperture mapping from WC to WB"), the only existing use is actually to invalidate clean cache lines for ARCH_MEMREMAP_PMEM type mappings *without* writeback. Since the recent cleanup of the pmem API, that also now happens to be the exact purpose of arch_invalidate_pmem(), which would be a far more well-defined tool for the job. Rather than risk potentially inconsistent implementations of mmio_flush_range() for the sake of one callsite, streamline things by removing it entirely and instead move the ARCH_MEMREMAP_PMEM related definitions up to the libnvdimm level, so they can be shared by NFIT as well. This allows NFIT to be enabled for arm64. 
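The net effect on a BLK-style read path, in rough sketch form (phys, size, off, len and buf are placeholders; ARCH_MEMREMAP_PMEM and arch_invalidate_pmem() are the names the patch moves into linux/libnvdimm.h):

/* With CONFIG_ARCH_HAS_PMEM_API the aperture is mapped write-back and
 * possibly-stale clean lines are invalidated before the copy; without
 * it the mapping falls back to write-through and the invalidate is a
 * stub, so the same code is correct either way. */
void *aperture = memremap(phys, size, ARCH_MEMREMAP_PMEM);

if (aperture) {
	arch_invalidate_pmem(aperture + off, len);
	memcpy(buf, aperture + off, len);
}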
Signed-off-by: Robin Murphy Signed-off-by: Dan Williams --- arch/x86/Kconfig | 1 - arch/x86/include/asm/cacheflush.h | 2 -- drivers/acpi/nfit/Kconfig | 2 +- drivers/acpi/nfit/core.c | 2 +- drivers/nvdimm/pmem.h | 14 -------------- include/linux/libnvdimm.h | 15 +++++++++++++++ lib/Kconfig | 3 --- tools/testing/nvdimm/test/nfit.c | 4 ++-- 8 files changed, 19 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..5f3b756ec0d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -53,7 +53,6 @@ config X86 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 8b4140f6724f..cb9a1af109b4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -7,6 +7,4 @@ void clflush_cache_range(void *addr, unsigned int size); -#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig index 6d3351452ea2..929ba4da0b30 100644 --- a/drivers/acpi/nfit/Kconfig +++ b/drivers/acpi/nfit/Kconfig @@ -2,7 +2,7 @@ config ACPI_NFIT tristate "ACPI NVDIMM Firmware Interface Table (NFIT)" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV - depends on ARCH_HAS_MMIO_FLUSH + depends on ARCH_HAS_PMEM_API select LIBNVDIMM help Infrastructure to probe ACPI 6 compliant platforms for diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 03105648f9b1..c20124a6eb49 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1964,7 +1964,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c); else { if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH) - mmio_flush_range((void __force *) + arch_invalidate_pmem((void __force *) mmio->addr.aperture + offset, c); memcpy(iobuf + copied, mmio->addr.aperture + offset, c); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 5434321cad67..c5917f040fa7 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -5,20 +5,6 @@ #include #include -#ifdef CONFIG_ARCH_HAS_PMEM_API -#define ARCH_MEMREMAP_PMEM MEMREMAP_WB -void arch_wb_cache_pmem(void *addr, size_t size); -void arch_invalidate_pmem(void *addr, size_t size); -#else -#define ARCH_MEMREMAP_PMEM MEMREMAP_WT -static inline void arch_wb_cache_pmem(void *addr, size_t size) -{ -} -static inline void arch_invalidate_pmem(void *addr, size_t size) -{ -} -#endif - /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 9b8d81a7b80e..3eaad2fbf284 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -174,4 +174,19 @@ u64 nd_fletcher64(void *addr, size_t len, bool le); void nvdimm_flush(struct nd_region *nd_region); int nvdimm_has_flush(struct nd_region *nd_region); int nvdimm_has_cache(struct nd_region *nd_region); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#define ARCH_MEMREMAP_PMEM MEMREMAP_WB +void arch_wb_cache_pmem(void *addr, size_t size); +void arch_invalidate_pmem(void *addr, size_t size); +#else +#define ARCH_MEMREMAP_PMEM MEMREMAP_WT +static inline void 
arch_wb_cache_pmem(void *addr, size_t size) +{ +} +static inline void arch_invalidate_pmem(void *addr, size_t size) +{ +} +#endif + #endif /* __LIBNVDIMM_H__ */ diff --git a/lib/Kconfig b/lib/Kconfig index 6762529ad9e4..527da69e3be1 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -559,9 +559,6 @@ config ARCH_HAS_PMEM_API config ARCH_HAS_UACCESS_FLUSHCACHE bool -config ARCH_HAS_MMIO_FLUSH - bool - config STACKDEPOT bool select STACKTRACE diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 4c2fa98ef39d..d20791c3f499 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1546,8 +1546,8 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, else { memcpy(iobuf, mmio->addr.base + dpa, len); - /* give us some some coverage of the mmio_flush_range() API */ - mmio_flush_range(mmio->addr.base + dpa, len); + /* give us some coverage of the arch_invalidate_pmem() API */ + arch_invalidate_pmem(mmio->addr.base + dpa, len); } nd_region_release_lane(nd_region, lane); -- cgit v1.2.3 From a4d1a885251382250ec315482bdd8ca52dd61e6a Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Thu, 31 Aug 2017 17:17:26 -0400 Subject: dax: update to new mmu_notifier semantic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace all mmu_notifier_invalidate_page() calls by *_invalidate_range() and make sure it is bracketed by calls to *_invalidate_range_start()/end(). Note that because we cannot presume the pmd value or pte value, we have to assume the worst and unconditionally report an invalidation as happening. Signed-off-by: Jérôme Glisse Cc: Dan Williams Cc: Ross Zwisler Cc: Bernhard Held Cc: Adam Borowski Cc: Andrea Arcangeli Cc: Radim Krčmář Cc: Wanpeng Li Cc: Paolo Bonzini Cc: Takashi Iwai Cc: Nadav Amit Cc: Mike Galbraith Cc: Kirill A. Shutemov Cc: axie Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 19 +++++++++++-------- include/linux/mm.h | 1 + mm/memory.c | 26 +++++++++++++++++++++----- 3 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 865d42c63e23..ab925dc6647a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -646,11 +646,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pte_t pte, *ptep = NULL; pmd_t *pmdp = NULL; spinlock_t *ptl; - bool changed; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address; + unsigned long address, start, end; cond_resched(); @@ -658,8 +657,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, continue; address = pgoff_address(index, vma); - changed = false; - if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + + /* + * Note because we provide start/end to follow_pte_pmd it will + * call mmu_notifier_invalidate_range_start() on our behalf + * before taking any lock.
+ */ + if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; if (pmdp) { @@ -676,7 +680,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -691,13 +695,12 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } - if (changed) - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); } i_mmap_unlock_read(mapping); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 46b9ac5e8569..c1f6c95f3496 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1260,6 +1260,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); diff --git a/mm/memory.c b/mm/memory.c index fe2fba27ded2..56e48e4593cb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4008,7 +4008,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + unsigned long *start, unsigned long *end, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4035,17 +4036,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; + if (start && end) { + *start = address & PMD_MASK; + *end = *start + PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { *pmdpp = pmd; return 0; } spin_unlock(*ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; + if (start && end) { + *start = address & PAGE_MASK; + *end = *start + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; @@ -4053,6 +4066,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); out: return -EINVAL; } @@ -4064,20 +4079,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, NULL, - ptlp))); + !(res = __follow_pte_pmd(mm, address, NULL, NULL, + ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, - ptlp))); + !(res = __follow_pte_pmd(mm, address, start, end, + ptepp, pmdpp, ptlp))); 
return res; } EXPORT_SYMBOL(follow_pte_pmd); -- cgit v1.2.3 From 5f32b265400de723ab0db23101a75ac073bdd980 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Thu, 31 Aug 2017 17:17:38 -0400 Subject: mm/mmu_notifier: kill invalidate_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The invalidate_page callback suffered from two pitfalls. First, it used to happen after the page table lock was released, and thus a new page might have been set up before the call to invalidate_page() happened. This is fixed, in a weird way, by commit c7ab0d2fdc84 ("mm: convert try_to_unmap_one() to use page_vma_mapped_walk()"), which moved the callback under the page table lock, but this also broke several existing users of the mmu_notifier API that assumed they could sleep inside this callback. The second pitfall was that invalidate_page() was the only callback not taking an address range for the invalidation; it was given a single address and a page instead. Lots of the callback implementers assumed this could never be THP and thus failed to invalidate the appropriate range for THP. By killing this callback we unify the mmu_notifier callback API to always take a virtual address range as input. Finally, this also simplifies life for end users, as there are now two clear choices: - invalidate_range_start()/end() callback (which allows you to sleep) - invalidate_range(), where you cannot sleep but which happens right after the page table update, under the page table lock Signed-off-by: Jérôme Glisse Cc: Bernhard Held Cc: Adam Borowski Cc: Andrea Arcangeli Cc: Radim Krčmář Cc: Wanpeng Li Cc: Paolo Bonzini Cc: Takashi Iwai Cc: Nadav Amit Cc: Mike Galbraith Cc: Kirill A. Shutemov Cc: axie Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 25 ------------------------- mm/mmu_notifier.c | 14 -------------- 2 files changed, 39 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c91b3bcd158f..7b2e31b1745a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -94,17 +94,6 @@ struct mmu_notifier_ops { unsigned long address, pte_t pte); - /* - * Before this is invoked any secondary MMU is still ok to - * read/write to the page previously pointed to by the Linux - * pte because the page hasn't been freed yet and it won't be - * freed until this returns. If required set_page_dirty has to - * be called internally to this method.
- */ - void (*invalidate_page)(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address); - /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_sem and/or the @@ -220,8 +209,6 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address); extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -268,13 +255,6 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, __mmu_notifier_change_pte(mm, address, pte); } -static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ - if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_page(mm, address); -} - static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { @@ -442,11 +422,6 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, { } -static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ -} - static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 54ca54562928..314285284e6e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ - struct mmu_notifier *mn; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); - } - srcu_read_unlock(&srcu, id); -} - void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { -- cgit v1.2.3 From c03567a8e8d5cf2aaca40e605c48f319dc2ead57 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 31 Aug 2017 16:15:33 -0700 Subject: include/linux/compiler.h: don't perform compiletime_assert with -O0 Commit c7acec713d14 ("kernel.h: handle pointers to arrays better in container_of()") made use of __compiletime_assert() from container_of() thus increasing the usage of this macro, allowing developers to notice type conflicts in usage of container_of() at compile time. However, the implementation of __compiletime_assert relies on compiler optimizations to report an error. This means that if a developer uses "-O0" with any code that performs container_of(), the compiler will always report an error regardless of whether there is an actual problem in the code. This patch disables compile_time_assert when optimizations are disabled to allow such code to compile with CFLAGS="-O0". 
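Any type-correct container_of() call is enough to reproduce the problem; a minimal example (hypothetical structs, not from the patch):

#include <linux/kernel.h>	/* container_of() */

struct inner { int x; };
struct outer { struct inner in; };

static struct outer *to_outer(struct inner *p)
{
	/* Perfectly valid, yet at -O0 the never-executed branch that
	 * carries __compiletime_assert's error attribute is still
	 * emitted, so the build fails as shown below. */
	return container_of(p, struct outer, in);
}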
Example compilation failure: ./include/linux/compiler.h:547:38: error: call to `__compiletime_assert_94' declared with attribute error: pointer type mismatch in container_of() _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) ^ ./include/linux/compiler.h:530:4: note: in definition of macro `__compiletime_assert' prefix ## suffix(); \ ^~~~~~ ./include/linux/compiler.h:547:2: note: in expansion of macro `_compiletime_assert' _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__) ^~~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:46:37: note: in expansion of macro `compiletime_assert' #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) ^~~~~~~~~~~~~~~~~~ ./include/linux/kernel.h:860:2: note: in expansion of macro `BUILD_BUG_ON_MSG' BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \ ^~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: use do{}while(0), per Michal] Link: http://lkml.kernel.org/r/20170829230114.11662-1-joe@ovn.org Fixes: c7acec713d14c6c ("kernel.h: handle pointers to arrays better in container_of()") Signed-off-by: Joe Stringer Cc: Ian Abbott Cc: Arnd Bergmann Cc: Michal Nazarewicz Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index eca8ad75e28b..043b60de041e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -517,7 +517,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s # define __compiletime_error_fallback(condition) do { } while (0) #endif -#define __compiletime_assert(condition, msg, prefix, suffix) \ +#ifdef __OPTIMIZE__ +# define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ bool __cond = !(condition); \ extern void prefix ## suffix(void) __compiletime_error(msg); \ @@ -525,6 +526,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s prefix ## suffix(); \ __compiletime_error_fallback(__cond); \ } while (0) +#else +# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +#endif #define _compiletime_assert(condition, msg, prefix, suffix) \ __compiletime_assert(condition, msg, prefix, suffix) -- cgit v1.2.3 From 0a9c869d5c568054a828a38357f30d77659e5b1e Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Mon, 21 Aug 2017 13:59:01 +0200 Subject: clk: gate: expose clk_gate_ops::is_enabled This patch exposes clk_gate_ops::is_enabled as functions that can be directly called and assigned in places like this so we don't need wrapper functions that do nothing besides forward the call. Signed-off-by: Gabriel Fernandez Suggested-by: Stephen Boyd Signed-off-by: Stephen Boyd --- drivers/clk/clk-gate.c | 3 ++- include/linux/clk-provider.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c index 4e0c054a787c..dd82485e09a1 100644 --- a/drivers/clk/clk-gate.c +++ b/drivers/clk/clk-gate.c @@ -86,7 +86,7 @@ static void clk_gate_disable(struct clk_hw *hw) clk_gate_endisable(hw, 0); } -static int clk_gate_is_enabled(struct clk_hw *hw) +int clk_gate_is_enabled(struct clk_hw *hw) { u32 reg; struct clk_gate *gate = to_clk_gate(hw); @@ -101,6 +101,7 @@ static int clk_gate_is_enabled(struct clk_hw *hw) return reg ? 
1 : 0; } +EXPORT_SYMBOL_GPL(clk_gate_is_enabled); const struct clk_ops clk_gate_ops = { .enable = clk_gate_enable, diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1fc113fbf955..5100ec1b5d55 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -343,6 +343,7 @@ struct clk_hw *clk_hw_register_gate(struct device *dev, const char *name, u8 clk_gate_flags, spinlock_t *lock); void clk_unregister_gate(struct clk *clk); void clk_hw_unregister_gate(struct clk_hw *hw); +int clk_gate_is_enabled(struct clk_hw *hw); struct clk_div_table { unsigned int val; -- cgit v1.2.3 From 3e4d618b0722b64c551c3f2fc4c4f9cb3558ed93 Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Mon, 21 Aug 2017 13:59:02 +0200 Subject: clk: stm32h7: Add stm32h743 clock driver This patch enables clocks for STM32H743 boards. Signed-off-by: Gabriel Fernandez for MFD changes: Acked-by: Lee Jones for DT-Bindings Acked-by: Rob Herring Signed-off-by: Stephen Boyd --- .../devicetree/bindings/clock/st,stm32h7-rcc.txt | 71 + drivers/clk/Makefile | 1 + drivers/clk/clk-stm32h7.c | 1410 ++++++++++++++++++++ include/dt-bindings/clock/stm32h7-clks.h | 165 +++ include/dt-bindings/mfd/stm32h7-rcc.h | 136 ++ 5 files changed, 1783 insertions(+) create mode 100644 Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt create mode 100644 drivers/clk/clk-stm32h7.c create mode 100644 include/dt-bindings/clock/stm32h7-clks.h create mode 100644 include/dt-bindings/mfd/stm32h7-rcc.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt b/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt new file mode 100644 index 000000000000..a135504c7d57 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/st,stm32h7-rcc.txt @@ -0,0 +1,71 @@ +STMicroelectronics STM32H7 Reset and Clock Controller +===================================================== + +The RCC IP is both a reset and a clock controller. + +Please refer to clock-bindings.txt for common clock controller binding usage. +Please also refer to reset.txt for common reset controller binding usage. + +Required properties: +- compatible: Should be: + "st,stm32h743-rcc" + +- reg: should be register base and length as documented in the + datasheet + +- #reset-cells: 1, see below + +- #clock-cells : from common clock binding; shall be set to 1 + +- clocks: External oscillator clock phandle + - high speed external clock signal (HSE) + - low speed external clock signal (LSE) + - external I2S clock (I2S_CKIN) + +Optional properties: +- st,syscfg: phandle for pwrcfg, mandatory to disable/enable backup domain + write protection (RTC clock). + +Example: + + rcc: reset-clock-controller@58024400 { + compatible = "st,stm32h743-rcc", "st,stm32-rcc"; + reg = <0x58024400 0x400>; + #reset-cells = <1>; + #clock-cells = <2>; + clocks = <&clk_hse>, <&clk_lse>, <&clk_i2s_ckin>; + + st,syscfg = <&pwrcfg>; +}; + +The peripheral clock consumer should specify the desired clock by +having the clock ID in its "clocks" phandle cell. + +Example: + + timer5: timer@40000c00 { + compatible = "st,stm32-timer"; + reg = <0x40000c00 0x400>; + interrupts = <50>; + clocks = <&rcc TIM5_CK>; + }; + +Specifying softreset control of devices +======================================= + +Device nodes should specify the reset channel required in their "resets" +property, containing a phandle to the reset device node and an index specifying +which channel to use. 
+The index is the bit number within the RCC registers bank, counting from the RCC +base address. +It is calculated as: index = register_offset / 4 * 32 + bit_offset, +where bit_offset is the bit offset of the reset bit within its register. + +For example, for CRC reset: + crc = AHB4RSTR_offset / 4 * 32 + CRCRST_bit_offset = 0x88 / 4 * 32 + 19 = 1107 + +Example: + + timer2 { + resets = <&rcc STM32H7_APB1L_RESET(TIM2)>; + }; diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index 72e7c0eef32f..f87d085805a9 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_COMMON_CLK_SI5351) += clk-si5351.o obj-$(CONFIG_COMMON_CLK_SI514) += clk-si514.o obj-$(CONFIG_COMMON_CLK_SI570) += clk-si570.o obj-$(CONFIG_ARCH_STM32) += clk-stm32f4.o +obj-$(CONFIG_ARCH_STM32) += clk-stm32h7.o obj-$(CONFIG_ARCH_TANGO) += clk-tango4.o obj-$(CONFIG_CLK_TWL6040) += clk-twl6040.o obj-$(CONFIG_ARCH_U300) += clk-u300.o diff --git a/drivers/clk/clk-stm32h7.c b/drivers/clk/clk-stm32h7.c new file mode 100644 index 000000000000..a94c3f56c590 --- /dev/null +++ b/drivers/clk/clk-stm32h7.c @@ -0,0 +1,1410 @@ +/* + * Copyright (C) Gabriel Fernandez 2017 + * Author: Gabriel Fernandez + * + * License terms: GPL V2.0. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Reset Clock Control Registers */ +#define RCC_CR 0x00 +#define RCC_CFGR 0x10 +#define RCC_D1CFGR 0x18 +#define RCC_D2CFGR 0x1C +#define RCC_D3CFGR 0x20 +#define RCC_PLLCKSELR 0x28 +#define RCC_PLLCFGR 0x2C +#define RCC_PLL1DIVR 0x30 +#define RCC_PLL1FRACR 0x34 +#define RCC_PLL2DIVR 0x38 +#define RCC_PLL2FRACR 0x3C +#define RCC_PLL3DIVR 0x40 +#define RCC_PLL3FRACR 0x44 +#define RCC_D1CCIPR 0x4C +#define RCC_D2CCIP1R 0x50 +#define RCC_D2CCIP2R 0x54 +#define RCC_D3CCIPR 0x58 +#define RCC_BDCR 0x70 +#define RCC_CSR 0x74 +#define RCC_AHB3ENR 0xD4 +#define RCC_AHB1ENR 0xD8 +#define RCC_AHB2ENR 0xDC +#define RCC_AHB4ENR 0xE0 +#define RCC_APB3ENR 0xE4 +#define RCC_APB1LENR 0xE8 +#define RCC_APB1HENR 0xEC +#define RCC_APB2ENR 0xF0 +#define RCC_APB4ENR 0xF4 + +static DEFINE_SPINLOCK(stm32rcc_lock); + +static void __iomem *base; +static struct clk_hw **hws; + +/* System clock parent */ +static const char * const sys_src[] = { + "hsi_ck", "csi_ck", "hse_ck", "pll1_p" }; + +static const char * const tracein_src[] = { + "hsi_ck", "csi_ck", "hse_ck", "pll1_r" }; + +static const char * const per_src[] = { + "hsi_ker", "csi_ker", "hse_ck", "disabled" }; + +static const char * const pll_src[] = { + "hsi_ck", "csi_ck", "hse_ck", "no clock" }; + +static const char * const sdmmc_src[] = { "pll1_q", "pll2_r" }; + +static const char * const dsi_src[] = { "ck_dsi_phy", "pll2_q" }; + +static const char * const qspi_src[] = { + "hclk", "pll1_q", "pll2_r", "per_ck" }; + +static const char * const fmc_src[] = { + "hclk", "pll1_q", "pll2_r", "per_ck" }; + +/* Kernel clock parent */ +static const char * const swp_src[] = { "pclk1", "hsi_ker" }; + +static const char * const fdcan_src[] = { "hse_ck", "pll1_q", "pll2_q" }; + +static const char * const dfsdm1_src[] = { "pclk2", "sys_ck" }; + +static const char * const spdifrx_src[] = { + "pll1_q", "pll2_r", "pll3_r", "hsi_ker" }; + +static const char *spi_src1[5] = { + "pll1_q", "pll2_p", "pll3_p", NULL, "per_ck" }; + +static const char * const spi_src2[] = { + "pclk2", "pll2_q", "pll3_q", "hsi_ker", "csi_ker", "hse_ck" }; + +static const char * const spi_src3[] = { + "pclk4", "pll2_q", "pll3_q", "hsi_ker", "csi_ker", "hse_ck" }; + +static const char * const lptim_src1[] = { + "pclk1", "pll2_p", "pll3_r", "lse_ck", "lsi_ck", "per_ck" }; + +static const char * const lptim_src2[] = { + "pclk4", "pll2_p", "pll3_r", "lse_ck", "lsi_ck", "per_ck" }; + +static const char * const cec_src[] = {"lse_ck", "lsi_ck", "csi_ker_div122" }; + +static const char * const usbotg_src[] = {"pll1_q", "pll3_q", "rc48_ck" }; + +/* i2c 1,2,3 src */ +static const char * const i2c_src1[] = { + "pclk1", "pll3_r", "hsi_ker", "csi_ker" }; + +static const char * const i2c_src2[] = { + "pclk4", "pll3_r", "hsi_ker", "csi_ker" }; + +static const char * const rng_src[] = { + "rc48_ck", "pll1_q", "lse_ck", "lsi_ck" }; + +/* usart 1,6 src */ +static const char * const usart_src1[] = { + "pclk2", "pll2_q", "pll3_q", "hsi_ker", "csi_ker", "lse_ck" }; + +/* usart 2,3,4,5,7,8 src */ +static const char * const usart_src2[] = { + "pclk1", "pll2_q", "pll3_q", "hsi_ker", "csi_ker", "lse_ck" }; + +static const char *sai_src[5] = { + "pll1_q", "pll2_p", "pll3_p", NULL, "per_ck" }; + +static const char * const adc_src[] = { "pll2_p", "pll3_r", "per_ck" }; + +/* lptim 2,3,4,5 src */ +static const char * const lpuart1_src[] = { + "pclk3", "pll2_q", "pll3_q", "csi_ker", "lse_ck" }; + +static const 
char * const hrtim_src[] = { "tim2_ker", "d1cpre" }; + +/* RTC clock parent */ +static const char * const rtc_src[] = { "off", "lse_ck", "lsi_ck", "hse_1M" }; + +/* Micro-controller output clock parent */ +static const char * const mco_src1[] = { + "hsi_ck", "lse_ck", "hse_ck", "pll1_q", "rc48_ck" }; + +static const char * const mco_src2[] = { + "sys_ck", "pll2_p", "hse_ck", "pll1_p", "csi_ck", "lsi_ck" }; + +/* LCD clock */ +static const char * const ltdc_src[] = {"pll3_r"}; + +/* Gate clock with ready bit and backup domain management */ +struct stm32_ready_gate { + struct clk_gate gate; + u8 bit_rdy; +}; + +#define to_ready_gate_clk(_rgate) container_of(_rgate, struct stm32_ready_gate,\ + gate) + +#define RGATE_TIMEOUT 10000 + +static int ready_gate_clk_enable(struct clk_hw *hw) +{ + struct clk_gate *gate = to_clk_gate(hw); + struct stm32_ready_gate *rgate = to_ready_gate_clk(gate); + int bit_status; + unsigned int timeout = RGATE_TIMEOUT; + + if (clk_gate_ops.is_enabled(hw)) + return 0; + + clk_gate_ops.enable(hw); + + /* We can't use readl_poll_timeout() because we could be blocked if + * someone enables this clock before the clocksource changes. + * Only the jiffies counter is available; jiffies are incremented by + * interrupts, and the enable op is not allowed to be interrupted. + */ + do { + bit_status = !(readl(gate->reg) & BIT(rgate->bit_rdy)); + + if (bit_status) + udelay(100); + + } while (bit_status && --timeout); + + return bit_status; +} + +static void ready_gate_clk_disable(struct clk_hw *hw) +{ + struct clk_gate *gate = to_clk_gate(hw); + struct stm32_ready_gate *rgate = to_ready_gate_clk(gate); + int bit_status; + unsigned int timeout = RGATE_TIMEOUT; + + if (!clk_gate_ops.is_enabled(hw)) + return; + + clk_gate_ops.disable(hw); + + do { + bit_status = !!(readl(gate->reg) & BIT(rgate->bit_rdy)); + + if (bit_status) + udelay(100); + + } while (bit_status && --timeout); +} + +static const struct clk_ops ready_gate_clk_ops = { + .enable = ready_gate_clk_enable, + .disable = ready_gate_clk_disable, + .is_enabled = clk_gate_is_enabled, +}; + +static struct clk_hw *clk_register_ready_gate(struct device *dev, + const char *name, const char *parent_name, + void __iomem *reg, u8 bit_idx, u8 bit_rdy, + unsigned long flags, spinlock_t *lock) +{ + struct stm32_ready_gate *rgate; + struct clk_init_data init = { NULL }; + struct clk_hw *hw; + int ret; + + rgate = kzalloc(sizeof(*rgate), GFP_KERNEL); + if (!rgate) + return ERR_PTR(-ENOMEM); + + init.name = name; + init.ops = &ready_gate_clk_ops; + init.flags = flags; + init.parent_names = &parent_name; + init.num_parents = 1; + + rgate->bit_rdy = bit_rdy; + rgate->gate.lock = lock; + rgate->gate.reg = reg; + rgate->gate.bit_idx = bit_idx; + rgate->gate.hw.init = &init; + + hw = &rgate->gate.hw; + ret = clk_hw_register(dev, hw); + if (ret) { + kfree(rgate); + hw = ERR_PTR(ret); + } + + return hw; +} + +struct gate_cfg { + u32 offset; + u8 bit_idx; +}; + +struct muxdiv_cfg { + u32 offset; + u8 shift; + u8 width; +}; + +struct composite_clk_cfg { + struct gate_cfg *gate; + struct muxdiv_cfg *mux; + struct muxdiv_cfg *div; + const char *name; + const char * const *parent_name; + int num_parents; + u32 flags; +}; + +struct composite_clk_gcfg_t { + u8 flags; + const struct clk_ops *ops; +}; + +/* + * General config definition of a composite clock (only a clock divider for the rate) + */ +struct composite_clk_gcfg { + struct composite_clk_gcfg_t *mux; + struct composite_clk_gcfg_t *div; + struct composite_clk_gcfg_t *gate; +}; + +#define M_CFG_MUX(_mux_ops,
_mux_flags)\ + .mux = &(struct composite_clk_gcfg_t) { _mux_flags, _mux_ops} + +#define M_CFG_DIV(_rate_ops, _rate_flags)\ + .div = &(struct composite_clk_gcfg_t) {_rate_flags, _rate_ops} + +#define M_CFG_GATE(_gate_ops, _gate_flags)\ + .gate = &(struct composite_clk_gcfg_t) { _gate_flags, _gate_ops} + +static struct clk_mux *_get_cmux(void __iomem *reg, u8 shift, u8 width, + u32 flags, spinlock_t *lock) +{ + struct clk_mux *mux; + + mux = kzalloc(sizeof(*mux), GFP_KERNEL); + if (!mux) + return ERR_PTR(-ENOMEM); + + mux->reg = reg; + mux->shift = shift; + mux->mask = (1 << width) - 1; + mux->flags = flags; + mux->lock = lock; + + return mux; +} + +static struct clk_divider *_get_cdiv(void __iomem *reg, u8 shift, u8 width, + u32 flags, spinlock_t *lock) +{ + struct clk_divider *div; + + div = kzalloc(sizeof(*div), GFP_KERNEL); + + if (!div) + return ERR_PTR(-ENOMEM); + + div->reg = reg; + div->shift = shift; + div->width = width; + div->flags = flags; + div->lock = lock; + + return div; +} + +static struct clk_gate *_get_cgate(void __iomem *reg, u8 bit_idx, u32 flags, + spinlock_t *lock) +{ + struct clk_gate *gate; + + gate = kzalloc(sizeof(*gate), GFP_KERNEL); + if (!gate) + return ERR_PTR(-ENOMEM); + + gate->reg = reg; + gate->bit_idx = bit_idx; + gate->flags = flags; + gate->lock = lock; + + return gate; +} + +struct composite_cfg { + struct clk_hw *mux_hw; + struct clk_hw *div_hw; + struct clk_hw *gate_hw; + + const struct clk_ops *mux_ops; + const struct clk_ops *div_ops; + const struct clk_ops *gate_ops; +}; + +static void get_cfg_composite_div(const struct composite_clk_gcfg *gcfg, + const struct composite_clk_cfg *cfg, + struct composite_cfg *composite, spinlock_t *lock) +{ + struct clk_mux *mux = NULL; + struct clk_divider *div = NULL; + struct clk_gate *gate = NULL; + const struct clk_ops *mux_ops, *div_ops, *gate_ops; + struct clk_hw *mux_hw; + struct clk_hw *div_hw; + struct clk_hw *gate_hw; + + mux_ops = div_ops = gate_ops = NULL; + mux_hw = div_hw = gate_hw = NULL; + + if (gcfg->mux && cfg->mux) { + mux = _get_cmux(base + cfg->mux->offset, + cfg->mux->shift, + cfg->mux->width, + gcfg->mux->flags, lock); + + if (!IS_ERR(mux)) { + mux_hw = &mux->hw; + mux_ops = gcfg->mux->ops ? + gcfg->mux->ops : &clk_mux_ops; + } + } + + if (gcfg->div && cfg->div) { + div = _get_cdiv(base + cfg->div->offset, + cfg->div->shift, + cfg->div->width, + gcfg->div->flags, lock); + + if (!IS_ERR(div)) { + div_hw = &div->hw; + div_ops = gcfg->div->ops ? + gcfg->div->ops : &clk_divider_ops; + } + } + + if (gcfg->gate && cfg->gate) { + gate = _get_cgate(base + cfg->gate->offset, + cfg->gate->bit_idx, + gcfg->gate->flags, lock); + + if (!IS_ERR(gate)) { + gate_hw = &gate->hw; + gate_ops = gcfg->gate->ops ?
+ gcfg->gate->ops : &clk_gate_ops; + } + } + + composite->mux_hw = mux_hw; + composite->mux_ops = mux_ops; + + composite->div_hw = div_hw; + composite->div_ops = div_ops; + + composite->gate_hw = gate_hw; + composite->gate_ops = gate_ops; +} + +/* Kernel Timer */ +struct timer_ker { + u8 dppre_shift; + struct clk_hw hw; + spinlock_t *lock; +}; + +#define to_timer_ker(_hw) container_of(_hw, struct timer_ker, hw) + +static unsigned long timer_ker_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct timer_ker *clk_elem = to_timer_ker(hw); + u32 timpre; + u32 dppre_shift = clk_elem->dppre_shift; + u32 prescaler; + u32 mul; + + timpre = (readl(base + RCC_CFGR) >> 15) & 0x01; + + prescaler = (readl(base + RCC_D2CFGR) >> dppre_shift) & 0x03; + + mul = 2; + + if (prescaler < 4) + mul = 1; + + else if (timpre && prescaler > 4) + mul = 4; + + return parent_rate * mul; +} + +static const struct clk_ops timer_ker_ops = { + .recalc_rate = timer_ker_recalc_rate, +}; + +static struct clk_hw *clk_register_stm32_timer_ker(struct device *dev, + const char *name, const char *parent_name, + unsigned long flags, + u8 dppre_shift, + spinlock_t *lock) +{ + struct timer_ker *element; + struct clk_init_data init; + struct clk_hw *hw; + int err; + + element = kzalloc(sizeof(*element), GFP_KERNEL); + if (!element) + return ERR_PTR(-ENOMEM); + + init.name = name; + init.ops = &timer_ker_ops; + init.flags = flags; + init.parent_names = &parent_name; + init.num_parents = 1; + + element->hw.init = &init; + element->lock = lock; + element->dppre_shift = dppre_shift; + + hw = &element->hw; + err = clk_hw_register(dev, hw); + + if (err) { + kfree(element); + return ERR_PTR(err); + } + + return hw; +} + +static const struct clk_div_table d1cpre_div_table[] = { + { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1}, + { 4, 1 }, { 5, 1 }, { 6, 1 }, { 7, 1}, + { 8, 2 }, { 9, 4 }, { 10, 8 }, { 11, 16 }, + { 12, 64 }, { 13, 128 }, { 14, 256 }, + { 15, 512 }, + { 0 }, +}; + +static const struct clk_div_table ppre_div_table[] = { + { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1}, + { 4, 2 }, { 5, 4 }, { 6, 8 }, { 7, 16 }, + { 0 }, +}; + +static void register_core_and_bus_clocks(void) +{ + /* CORE AND BUS */ + hws[SYS_D1CPRE] = clk_hw_register_divider_table(NULL, "d1cpre", + "sys_ck", CLK_IGNORE_UNUSED, base + RCC_D1CFGR, 8, 4, 0, + d1cpre_div_table, &stm32rcc_lock); + + hws[HCLK] = clk_hw_register_divider_table(NULL, "hclk", "d1cpre", + CLK_IGNORE_UNUSED, base + RCC_D1CFGR, 0, 4, 0, + d1cpre_div_table, &stm32rcc_lock); + + /* D1 DOMAIN */ + /* * CPU Systick */ + hws[CPU_SYSTICK] = clk_hw_register_fixed_factor(NULL, "systick", + "d1cpre", 0, 1, 8); + + /* * APB3 peripheral */ + hws[PCLK3] = clk_hw_register_divider_table(NULL, "pclk3", "hclk", 0, + base + RCC_D1CFGR, 4, 3, 0, + ppre_div_table, &stm32rcc_lock); + + /* D2 DOMAIN */ + /* * APB1 peripheral */ + hws[PCLK1] = clk_hw_register_divider_table(NULL, "pclk1", "hclk", 0, + base + RCC_D2CFGR, 4, 3, 0, + ppre_div_table, &stm32rcc_lock); + + /* Timers prescaler clocks */ + clk_register_stm32_timer_ker(NULL, "tim1_ker", "pclk1", 0, + 4, &stm32rcc_lock); + + /* * APB2 peripheral */ + hws[PCLK2] = clk_hw_register_divider_table(NULL, "pclk2", "hclk", 0, + base + RCC_D2CFGR, 8, 3, 0, ppre_div_table, + &stm32rcc_lock); + + clk_register_stm32_timer_ker(NULL, "tim2_ker", "pclk2", 0, 8, + &stm32rcc_lock); + + /* D3 DOMAIN */ + /* * APB4 peripheral */ + hws[PCLK4] = clk_hw_register_divider_table(NULL, "pclk4", "hclk", 0, + base + RCC_D3CFGR, 4, 3, 0, + ppre_div_table, &stm32rcc_lock); +} + +/* MUX 
clock configuration */ +struct stm32_mux_clk { + const char *name; + const char * const *parents; + u8 num_parents; + u32 offset; + u8 shift; + u8 width; + u32 flags; +}; + +#define M_MCLOCF(_name, _parents, _mux_offset, _mux_shift, _mux_width, _flags)\ +{\ + .name = _name,\ + .parents = _parents,\ + .num_parents = ARRAY_SIZE(_parents),\ + .offset = _mux_offset,\ + .shift = _mux_shift,\ + .width = _mux_width,\ + .flags = _flags,\ +} + +#define M_MCLOC(_name, _parents, _mux_offset, _mux_shift, _mux_width)\ + M_MCLOCF(_name, _parents, _mux_offset, _mux_shift, _mux_width, 0)\ + +static const struct stm32_mux_clk stm32_mclk[] __initconst = { + M_MCLOC("per_ck", per_src, RCC_D1CCIPR, 28, 3), + M_MCLOC("pllsrc", pll_src, RCC_PLLCKSELR, 0, 3), + M_MCLOC("sys_ck", sys_src, RCC_CFGR, 0, 3), + M_MCLOC("tracein_ck", tracein_src, RCC_CFGR, 0, 3), +}; + +/* Oscillary clock configuration */ +struct stm32_osc_clk { + const char *name; + const char *parent; + u32 gate_offset; + u8 bit_idx; + u8 bit_rdy; + u32 flags; +}; + +#define OSC_CLKF(_name, _parent, _gate_offset, _bit_idx, _bit_rdy, _flags)\ +{\ + .name = _name,\ + .parent = _parent,\ + .gate_offset = _gate_offset,\ + .bit_idx = _bit_idx,\ + .bit_rdy = _bit_rdy,\ + .flags = _flags,\ +} + +#define OSC_CLK(_name, _parent, _gate_offset, _bit_idx, _bit_rdy)\ + OSC_CLKF(_name, _parent, _gate_offset, _bit_idx, _bit_rdy, 0) + +static const struct stm32_osc_clk stm32_oclk[] __initconst = { + OSC_CLKF("hsi_ck", "hsidiv", RCC_CR, 0, 2, CLK_IGNORE_UNUSED), + OSC_CLKF("hsi_ker", "hsidiv", RCC_CR, 1, 2, CLK_IGNORE_UNUSED), + OSC_CLKF("csi_ck", "clk-csi", RCC_CR, 7, 8, CLK_IGNORE_UNUSED), + OSC_CLKF("csi_ker", "clk-csi", RCC_CR, 9, 8, CLK_IGNORE_UNUSED), + OSC_CLKF("rc48_ck", "clk-rc48", RCC_CR, 12, 13, CLK_IGNORE_UNUSED), + OSC_CLKF("lsi_ck", "clk-lsi", RCC_CSR, 0, 1, CLK_IGNORE_UNUSED), +}; + +/* PLL configuration */ +struct st32h7_pll_cfg { + u8 bit_idx; + u32 offset_divr; + u8 bit_frac_en; + u32 offset_frac; + u8 divm; +}; + +struct stm32_pll_data { + const char *name; + const char *parent_name; + unsigned long flags; + const struct st32h7_pll_cfg *cfg; +}; + +static const struct st32h7_pll_cfg stm32h7_pll1 = { + .bit_idx = 24, + .offset_divr = RCC_PLL1DIVR, + .bit_frac_en = 0, + .offset_frac = RCC_PLL1FRACR, + .divm = 4, +}; + +static const struct st32h7_pll_cfg stm32h7_pll2 = { + .bit_idx = 26, + .offset_divr = RCC_PLL2DIVR, + .bit_frac_en = 4, + .offset_frac = RCC_PLL2FRACR, + .divm = 12, +}; + +static const struct st32h7_pll_cfg stm32h7_pll3 = { + .bit_idx = 28, + .offset_divr = RCC_PLL3DIVR, + .bit_frac_en = 8, + .offset_frac = RCC_PLL3FRACR, + .divm = 20, +}; + +static const struct stm32_pll_data stm32_pll[] = { + { "vco1", "pllsrc", CLK_IGNORE_UNUSED, &stm32h7_pll1 }, + { "vco2", "pllsrc", 0, &stm32h7_pll2 }, + { "vco3", "pllsrc", 0, &stm32h7_pll3 }, +}; + +struct stm32_fractional_divider { + void __iomem *mreg; + u8 mshift; + u8 mwidth; + u32 mmask; + + void __iomem *nreg; + u8 nshift; + u8 nwidth; + + void __iomem *freg_status; + u8 freg_bit; + void __iomem *freg_value; + u8 fshift; + u8 fwidth; + + u8 flags; + struct clk_hw hw; + spinlock_t *lock; +}; + +struct stm32_pll_obj { + spinlock_t *lock; + struct stm32_fractional_divider div; + struct stm32_ready_gate rgate; + struct clk_hw hw; +}; + +#define to_pll(_hw) container_of(_hw, struct stm32_pll_obj, hw) + +static int pll_is_enabled(struct clk_hw *hw) +{ + struct stm32_pll_obj *clk_elem = to_pll(hw); + struct clk_hw *_hw = &clk_elem->rgate.gate.hw; + + __clk_hw_set_clk(_hw, hw); + + return 
ready_gate_clk_ops.is_enabled(_hw); +} + +static int pll_enable(struct clk_hw *hw) +{ + struct stm32_pll_obj *clk_elem = to_pll(hw); + struct clk_hw *_hw = &clk_elem->rgate.gate.hw; + + __clk_hw_set_clk(_hw, hw); + + return ready_gate_clk_ops.enable(_hw); +} + +static void pll_disable(struct clk_hw *hw) +{ + struct stm32_pll_obj *clk_elem = to_pll(hw); + struct clk_hw *_hw = &clk_elem->rgate.gate.hw; + + __clk_hw_set_clk(_hw, hw); + + ready_gate_clk_ops.disable(_hw); +} + +static int pll_frac_is_enabled(struct clk_hw *hw) +{ + struct stm32_pll_obj *clk_elem = to_pll(hw); + struct stm32_fractional_divider *fd = &clk_elem->div; + + return (readl(fd->freg_status) >> fd->freg_bit) & 0x01; +} + +static unsigned long pll_read_frac(struct clk_hw *hw) +{ + struct stm32_pll_obj *clk_elem = to_pll(hw); + struct stm32_fractional_divider *fd = &clk_elem->div; + + return (readl(fd->freg_value) >> fd->fshift) & + GENMASK(fd->fwidth - 1, 0); +} + +static unsigned long pll_fd_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct stm32_pll_obj *clk_elem = to_pll(hw); + struct stm32_fractional_divider *fd = &clk_elem->div; + unsigned long m, n; + u32 val, mask; + u64 rate, rate1 = 0; + + val = readl(fd->mreg); + mask = GENMASK(fd->mwidth - 1, 0) << fd->mshift; + m = (val & mask) >> fd->mshift; + + val = readl(fd->nreg); + mask = GENMASK(fd->nwidth - 1, 0) << fd->nshift; + n = ((val & mask) >> fd->nshift) + 1; + + if (!n || !m) + return parent_rate; + + rate = (u64)parent_rate * n; + do_div(rate, m); + + if (pll_frac_is_enabled(hw)) { + val = pll_read_frac(hw); + rate1 = (u64)parent_rate * (u64)val; + do_div(rate1, (m * 8191)); + } + + return rate + rate1; +} + +static const struct clk_ops pll_ops = { + .enable = pll_enable, + .disable = pll_disable, + .is_enabled = pll_is_enabled, + .recalc_rate = pll_fd_recalc_rate, +}; + +static struct clk_hw *clk_register_stm32_pll(struct device *dev, + const char *name, + const char *parent, + unsigned long flags, + const struct st32h7_pll_cfg *cfg, + spinlock_t *lock) +{ + struct stm32_pll_obj *pll; + struct clk_init_data init = { NULL }; + struct clk_hw *hw; + int ret; + struct stm32_fractional_divider *div = NULL; + struct stm32_ready_gate *rgate; + + pll = kzalloc(sizeof(*pll), GFP_KERNEL); + if (!pll) + return ERR_PTR(-ENOMEM); + + init.name = name; + init.ops = &pll_ops; + init.flags = flags; + init.parent_names = &parent; + init.num_parents = 1; + pll->hw.init = &init; + + hw = &pll->hw; + rgate = &pll->rgate; + + rgate->bit_rdy = cfg->bit_idx + 1; + rgate->gate.lock = lock; + rgate->gate.reg = base + RCC_CR; + rgate->gate.bit_idx = cfg->bit_idx; + + div = &pll->div; + div->flags = 0; + div->mreg = base + RCC_PLLCKSELR; + div->mshift = cfg->divm; + div->mwidth = 6; + div->nreg = base + cfg->offset_divr; + div->nshift = 0; + div->nwidth = 9; + + div->freg_status = base + RCC_PLLCFGR; + div->freg_bit = cfg->bit_frac_en; + div->freg_value = base + cfg->offset_frac; + div->fshift = 3; + div->fwidth = 13; + + div->lock = lock; + + ret = clk_hw_register(dev, hw); + if (ret) { + kfree(pll); + hw = ERR_PTR(ret); + } + + return hw; +} + +/* ODF CLOCKS */ +static unsigned long odf_divider_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + return clk_divider_ops.recalc_rate(hw, parent_rate); +} + +static long odf_divider_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) +{ + return clk_divider_ops.round_rate(hw, rate, prate); +} + +static int odf_divider_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long 
parent_rate) +{ + struct clk_hw *hwp; + int pll_status; + int ret; + + hwp = clk_hw_get_parent(hw); + + pll_status = pll_is_enabled(hwp); + + if (pll_status) + pll_disable(hwp); + + ret = clk_divider_ops.set_rate(hw, rate, parent_rate); + + if (pll_status) + pll_enable(hwp); + + return ret; +} + +static const struct clk_ops odf_divider_ops = { + .recalc_rate = odf_divider_recalc_rate, + .round_rate = odf_divider_round_rate, + .set_rate = odf_divider_set_rate, +}; + +static int odf_gate_enable(struct clk_hw *hw) +{ + struct clk_hw *hwp; + int pll_status; + int ret; + + if (clk_gate_ops.is_enabled(hw)) + return 0; + + hwp = clk_hw_get_parent(hw); + + pll_status = pll_is_enabled(hwp); + + if (pll_status) + pll_disable(hwp); + + ret = clk_gate_ops.enable(hw); + + if (pll_status) + pll_enable(hwp); + + return ret; +} + +static void odf_gate_disable(struct clk_hw *hw) +{ + struct clk_hw *hwp; + int pll_status; + + if (!clk_gate_ops.is_enabled(hw)) + return; + + hwp = clk_hw_get_parent(hw); + + pll_status = pll_is_enabled(hwp); + + if (pll_status) + pll_disable(hwp); + + clk_gate_ops.disable(hw); + + if (pll_status) + pll_enable(hwp); +} + +static const struct clk_ops odf_gate_ops = { + .enable = odf_gate_enable, + .disable = odf_gate_disable, + .is_enabled = clk_gate_is_enabled, +}; + +static struct composite_clk_gcfg odf_clk_gcfg = { + M_CFG_DIV(&odf_divider_ops, 0), + M_CFG_GATE(&odf_gate_ops, 0), +}; + +#define M_ODF_F(_name, _parent, _gate_offset, _bit_idx, _rate_offset,\ + _rate_shift, _rate_width, _flags)\ +{\ + .mux = NULL,\ + .div = &(struct muxdiv_cfg) {_rate_offset, _rate_shift, _rate_width},\ + .gate = &(struct gate_cfg) {_gate_offset, _bit_idx },\ + .name = _name,\ + .parent_name = &(const char *) {_parent},\ + .num_parents = 1,\ + .flags = _flags,\ +} + +#define M_ODF(_name, _parent, _gate_offset, _bit_idx, _rate_offset,\ + _rate_shift, _rate_width)\ +M_ODF_F(_name, _parent, _gate_offset, _bit_idx, _rate_offset,\ + _rate_shift, _rate_width, 0)\ + +static const struct composite_clk_cfg stm32_odf[3][3] = { + { + M_ODF_F("pll1_p", "vco1", RCC_PLLCFGR, 16, RCC_PLL1DIVR, 9, 7, + CLK_IGNORE_UNUSED), + M_ODF_F("pll1_q", "vco1", RCC_PLLCFGR, 17, RCC_PLL1DIVR, 16, 7, + CLK_IGNORE_UNUSED), + M_ODF_F("pll1_r", "vco1", RCC_PLLCFGR, 18, RCC_PLL1DIVR, 24, 7, + CLK_IGNORE_UNUSED), + }, + + { + M_ODF("pll2_p", "vco2", RCC_PLLCFGR, 19, RCC_PLL2DIVR, 9, 7), + M_ODF("pll2_q", "vco2", RCC_PLLCFGR, 20, RCC_PLL2DIVR, 16, 7), + M_ODF("pll2_r", "vco2", RCC_PLLCFGR, 21, RCC_PLL2DIVR, 24, 7), + }, + { + M_ODF("pll3_p", "vco3", RCC_PLLCFGR, 22, RCC_PLL3DIVR, 9, 7), + M_ODF("pll3_q", "vco3", RCC_PLLCFGR, 23, RCC_PLL3DIVR, 16, 7), + M_ODF("pll3_r", "vco3", RCC_PLLCFGR, 24, RCC_PLL3DIVR, 24, 7), + } +}; + +/* PERIF CLOCKS */ +struct pclk_t { + u32 gate_offset; + u8 bit_idx; + const char *name; + const char *parent; + u32 flags; +}; + +#define PER_CLKF(_gate_offset, _bit_idx, _name, _parent, _flags)\ +{\ + .gate_offset = _gate_offset,\ + .bit_idx = _bit_idx,\ + .name = _name,\ + .parent = _parent,\ + .flags = _flags,\ +} + +#define PER_CLK(_gate_offset, _bit_idx, _name, _parent)\ + PER_CLKF(_gate_offset, _bit_idx, _name, _parent, 0) + +static const struct pclk_t pclk[] = { + PER_CLK(RCC_AHB3ENR, 31, "d1sram1", "hclk"), + PER_CLK(RCC_AHB3ENR, 30, "itcm", "hclk"), + PER_CLK(RCC_AHB3ENR, 29, "dtcm2", "hclk"), + PER_CLK(RCC_AHB3ENR, 28, "dtcm1", "hclk"), + PER_CLK(RCC_AHB3ENR, 8, "flitf", "hclk"), + PER_CLK(RCC_AHB3ENR, 5, "jpgdec", "hclk"), + PER_CLK(RCC_AHB3ENR, 4, "dma2d", "hclk"), + PER_CLK(RCC_AHB3ENR, 0, 
"mdma", "hclk"), + PER_CLK(RCC_AHB1ENR, 28, "usb2ulpi", "hclk"), + PER_CLK(RCC_AHB1ENR, 26, "usb1ulpi", "hclk"), + PER_CLK(RCC_AHB1ENR, 17, "eth1rx", "hclk"), + PER_CLK(RCC_AHB1ENR, 16, "eth1tx", "hclk"), + PER_CLK(RCC_AHB1ENR, 15, "eth1mac", "hclk"), + PER_CLK(RCC_AHB1ENR, 14, "art", "hclk"), + PER_CLK(RCC_AHB1ENR, 1, "dma2", "hclk"), + PER_CLK(RCC_AHB1ENR, 0, "dma1", "hclk"), + PER_CLK(RCC_AHB2ENR, 31, "d2sram3", "hclk"), + PER_CLK(RCC_AHB2ENR, 30, "d2sram2", "hclk"), + PER_CLK(RCC_AHB2ENR, 29, "d2sram1", "hclk"), + PER_CLK(RCC_AHB2ENR, 5, "hash", "hclk"), + PER_CLK(RCC_AHB2ENR, 4, "crypt", "hclk"), + PER_CLK(RCC_AHB2ENR, 0, "camitf", "hclk"), + PER_CLK(RCC_AHB4ENR, 28, "bkpram", "hclk"), + PER_CLK(RCC_AHB4ENR, 25, "hsem", "hclk"), + PER_CLK(RCC_AHB4ENR, 21, "bdma", "hclk"), + PER_CLK(RCC_AHB4ENR, 19, "crc", "hclk"), + PER_CLK(RCC_AHB4ENR, 10, "gpiok", "hclk"), + PER_CLK(RCC_AHB4ENR, 9, "gpioj", "hclk"), + PER_CLK(RCC_AHB4ENR, 8, "gpioi", "hclk"), + PER_CLK(RCC_AHB4ENR, 7, "gpioh", "hclk"), + PER_CLK(RCC_AHB4ENR, 6, "gpiog", "hclk"), + PER_CLK(RCC_AHB4ENR, 5, "gpiof", "hclk"), + PER_CLK(RCC_AHB4ENR, 4, "gpioe", "hclk"), + PER_CLK(RCC_AHB4ENR, 3, "gpiod", "hclk"), + PER_CLK(RCC_AHB4ENR, 2, "gpioc", "hclk"), + PER_CLK(RCC_AHB4ENR, 1, "gpiob", "hclk"), + PER_CLK(RCC_AHB4ENR, 0, "gpioa", "hclk"), + PER_CLK(RCC_APB3ENR, 6, "wwdg1", "pclk3"), + PER_CLK(RCC_APB1LENR, 29, "dac12", "pclk1"), + PER_CLK(RCC_APB1LENR, 11, "wwdg2", "pclk1"), + PER_CLK(RCC_APB1LENR, 8, "tim14", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 7, "tim13", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 6, "tim12", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 5, "tim7", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 4, "tim6", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 3, "tim5", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 2, "tim4", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 1, "tim3", "tim1_ker"), + PER_CLK(RCC_APB1LENR, 0, "tim2", "tim1_ker"), + PER_CLK(RCC_APB1HENR, 5, "mdios", "pclk1"), + PER_CLK(RCC_APB1HENR, 4, "opamp", "pclk1"), + PER_CLK(RCC_APB1HENR, 1, "crs", "pclk1"), + PER_CLK(RCC_APB2ENR, 18, "tim17", "tim2_ker"), + PER_CLK(RCC_APB2ENR, 17, "tim16", "tim2_ker"), + PER_CLK(RCC_APB2ENR, 16, "tim15", "tim2_ker"), + PER_CLK(RCC_APB2ENR, 1, "tim8", "tim2_ker"), + PER_CLK(RCC_APB2ENR, 0, "tim1", "tim2_ker"), + PER_CLK(RCC_APB4ENR, 26, "tmpsens", "pclk4"), + PER_CLK(RCC_APB4ENR, 16, "rtcapb", "pclk4"), + PER_CLK(RCC_APB4ENR, 15, "vref", "pclk4"), + PER_CLK(RCC_APB4ENR, 14, "comp12", "pclk4"), + PER_CLK(RCC_APB4ENR, 1, "syscfg", "pclk4"), +}; + +/* KERNEL CLOCKS */ +#define KER_CLKF(_gate_offset, _bit_idx,\ + _mux_offset, _mux_shift, _mux_width,\ + _name, _parent_name,\ + _flags) \ +{ \ + .gate = &(struct gate_cfg) {_gate_offset, _bit_idx},\ + .mux = &(struct muxdiv_cfg) {_mux_offset, _mux_shift, _mux_width },\ + .name = _name, \ + .parent_name = _parent_name, \ + .num_parents = ARRAY_SIZE(_parent_name),\ + .flags = _flags,\ +} + +#define KER_CLK(_gate_offset, _bit_idx, _mux_offset, _mux_shift, _mux_width,\ + _name, _parent_name) \ +KER_CLKF(_gate_offset, _bit_idx, _mux_offset, _mux_shift, _mux_width,\ + _name, _parent_name, 0)\ + +#define KER_CLKF_NOMUX(_gate_offset, _bit_idx,\ + _name, _parent_name,\ + _flags) \ +{ \ + .gate = &(struct gate_cfg) {_gate_offset, _bit_idx},\ + .mux = NULL,\ + .name = _name, \ + .parent_name = _parent_name, \ + .num_parents = 1,\ + .flags = _flags,\ +} + +static const struct composite_clk_cfg kclk[] = { + KER_CLK(RCC_AHB3ENR, 16, RCC_D1CCIPR, 16, 1, "sdmmc1", sdmmc_src), + KER_CLKF(RCC_AHB3ENR, 14, RCC_D1CCIPR, 4, 2, "quadspi", qspi_src, + 
CLK_IGNORE_UNUSED), + KER_CLKF(RCC_AHB3ENR, 12, RCC_D1CCIPR, 0, 2, "fmc", fmc_src, + CLK_IGNORE_UNUSED), + KER_CLK(RCC_AHB1ENR, 27, RCC_D2CCIP2R, 20, 2, "usb2otg", usbotg_src), + KER_CLK(RCC_AHB1ENR, 25, RCC_D2CCIP2R, 20, 2, "usb1otg", usbotg_src), + KER_CLK(RCC_AHB1ENR, 5, RCC_D3CCIPR, 16, 2, "adc12", adc_src), + KER_CLK(RCC_AHB2ENR, 9, RCC_D1CCIPR, 16, 1, "sdmmc2", sdmmc_src), + KER_CLK(RCC_AHB2ENR, 6, RCC_D2CCIP2R, 8, 2, "rng", rng_src), + KER_CLK(RCC_AHB4ENR, 24, RCC_D3CCIPR, 16, 2, "adc3", adc_src), + KER_CLKF(RCC_APB3ENR, 4, RCC_D1CCIPR, 8, 1, "dsi", dsi_src, + CLK_SET_RATE_PARENT), + KER_CLKF_NOMUX(RCC_APB3ENR, 3, "ltdc", ltdc_src, CLK_SET_RATE_PARENT), + KER_CLK(RCC_APB1LENR, 31, RCC_D2CCIP2R, 0, 3, "usart8", usart_src2), + KER_CLK(RCC_APB1LENR, 30, RCC_D2CCIP2R, 0, 3, "usart7", usart_src2), + KER_CLK(RCC_APB1LENR, 27, RCC_D2CCIP2R, 22, 2, "hdmicec", cec_src), + KER_CLK(RCC_APB1LENR, 23, RCC_D2CCIP2R, 12, 2, "i2c3", i2c_src1), + KER_CLK(RCC_APB1LENR, 22, RCC_D2CCIP2R, 12, 2, "i2c2", i2c_src1), + KER_CLK(RCC_APB1LENR, 21, RCC_D2CCIP2R, 12, 2, "i2c1", i2c_src1), + KER_CLK(RCC_APB1LENR, 20, RCC_D2CCIP2R, 0, 3, "uart5", usart_src2), + KER_CLK(RCC_APB1LENR, 19, RCC_D2CCIP2R, 0, 3, "uart4", usart_src2), + KER_CLK(RCC_APB1LENR, 18, RCC_D2CCIP2R, 0, 3, "usart3", usart_src2), + KER_CLK(RCC_APB1LENR, 17, RCC_D2CCIP2R, 0, 3, "usart2", usart_src2), + KER_CLK(RCC_APB1LENR, 16, RCC_D2CCIP1R, 20, 2, "spdifrx", spdifrx_src), + KER_CLK(RCC_APB1LENR, 15, RCC_D2CCIP1R, 16, 3, "spi3", spi_src1), + KER_CLK(RCC_APB1LENR, 14, RCC_D2CCIP1R, 16, 3, "spi2", spi_src1), + KER_CLK(RCC_APB1LENR, 9, RCC_D2CCIP2R, 28, 3, "lptim1", lptim_src1), + KER_CLK(RCC_APB1HENR, 8, RCC_D2CCIP1R, 28, 2, "fdcan", fdcan_src), + KER_CLK(RCC_APB1HENR, 2, RCC_D2CCIP1R, 31, 1, "swp", swp_src), + KER_CLK(RCC_APB2ENR, 29, RCC_CFGR, 14, 1, "hrtim", hrtim_src), + KER_CLK(RCC_APB2ENR, 28, RCC_D2CCIP1R, 24, 1, "dfsdm1", dfsdm1_src), + KER_CLKF(RCC_APB2ENR, 24, RCC_D2CCIP1R, 6, 3, "sai3", sai_src, + CLK_SET_RATE_PARENT | CLK_SET_RATE_NO_REPARENT), + KER_CLKF(RCC_APB2ENR, 23, RCC_D2CCIP1R, 6, 3, "sai2", sai_src, + CLK_SET_RATE_PARENT | CLK_SET_RATE_NO_REPARENT), + KER_CLKF(RCC_APB2ENR, 22, RCC_D2CCIP1R, 0, 3, "sai1", sai_src, + CLK_SET_RATE_PARENT | CLK_SET_RATE_NO_REPARENT), + KER_CLK(RCC_APB2ENR, 20, RCC_D2CCIP1R, 16, 3, "spi5", spi_src2), + KER_CLK(RCC_APB2ENR, 13, RCC_D2CCIP1R, 16, 3, "spi4", spi_src2), + KER_CLK(RCC_APB2ENR, 12, RCC_D2CCIP1R, 16, 3, "spi1", spi_src1), + KER_CLK(RCC_APB2ENR, 5, RCC_D2CCIP2R, 3, 3, "usart6", usart_src1), + KER_CLK(RCC_APB2ENR, 4, RCC_D2CCIP2R, 3, 3, "usart1", usart_src1), + KER_CLK(RCC_APB4ENR, 21, RCC_D3CCIPR, 24, 3, "sai4b", sai_src), + KER_CLK(RCC_APB4ENR, 21, RCC_D3CCIPR, 21, 3, "sai4a", sai_src), + KER_CLK(RCC_APB4ENR, 12, RCC_D3CCIPR, 13, 3, "lptim5", lptim_src2), + KER_CLK(RCC_APB4ENR, 11, RCC_D3CCIPR, 13, 3, "lptim4", lptim_src2), + KER_CLK(RCC_APB4ENR, 10, RCC_D3CCIPR, 13, 3, "lptim3", lptim_src2), + KER_CLK(RCC_APB4ENR, 9, RCC_D3CCIPR, 10, 3, "lptim2", lptim_src2), + KER_CLK(RCC_APB4ENR, 7, RCC_D3CCIPR, 8, 2, "i2c4", i2c_src2), + KER_CLK(RCC_APB4ENR, 5, RCC_D3CCIPR, 28, 3, "spi6", spi_src3), + KER_CLK(RCC_APB4ENR, 3, RCC_D3CCIPR, 0, 3, "lpuart1", lpuart1_src), +}; + +static struct composite_clk_gcfg kernel_clk_cfg = { + M_CFG_MUX(NULL, 0), + M_CFG_GATE(NULL, 0), +}; + +/* RTC clock */ +/* + * RTC & LSE registers are protected against parasitic write access. + * PWR_CR_DBP bit must be set to enable write access to RTC registers. 
+ */ +/* STM32_PWR_CR */ +#define PWR_CR 0x00 +/* STM32_PWR_CR bit field */ +#define PWR_CR_DBP BIT(8) + +static struct composite_clk_gcfg rtc_clk_cfg = { + M_CFG_MUX(NULL, 0), + M_CFG_GATE(NULL, 0), +}; + +static const struct composite_clk_cfg rtc_clk = + KER_CLK(RCC_BDCR, 15, RCC_BDCR, 8, 2, "rtc_ck", rtc_src); + +/* Micro-controller output clock */ +static struct composite_clk_gcfg mco_clk_cfg = { + M_CFG_MUX(NULL, 0), + M_CFG_DIV(NULL, CLK_DIVIDER_ONE_BASED | CLK_DIVIDER_ALLOW_ZERO), +}; + +#define M_MCO_F(_name, _parents, _mux_offset, _mux_shift, _mux_width,\ + _rate_offset, _rate_shift, _rate_width,\ + _flags)\ +{\ + .mux = &(struct muxdiv_cfg) {_mux_offset, _mux_shift, _mux_width },\ + .div = &(struct muxdiv_cfg) {_rate_offset, _rate_shift, _rate_width},\ + .gate = NULL,\ + .name = _name,\ + .parent_name = _parents,\ + .num_parents = ARRAY_SIZE(_parents),\ + .flags = _flags,\ +} + +static const struct composite_clk_cfg mco_clk[] = { + M_MCO_F("mco1", mco_src1, RCC_CFGR, 22, 4, RCC_CFGR, 18, 4, 0), + M_MCO_F("mco2", mco_src2, RCC_CFGR, 29, 3, RCC_CFGR, 25, 4, 0), +}; + +static void __init stm32h7_rcc_init(struct device_node *np) +{ + struct clk_hw_onecell_data *clk_data; + struct composite_cfg c_cfg; + int n; + const char *hse_clk, *lse_clk, *i2s_clk; + struct regmap *pdrm; + + clk_data = kzalloc(sizeof(*clk_data) + + sizeof(*clk_data->hws) * STM32H7_MAX_CLKS, + GFP_KERNEL); + if (!clk_data) + return; + + clk_data->num = STM32H7_MAX_CLKS; + + hws = clk_data->hws; + + for (n = 0; n < STM32H7_MAX_CLKS; n++) + hws[n] = ERR_PTR(-ENOENT); + + /* get the RCC base address from DT */ + base = of_iomap(np, 0); + if (!base) { + pr_err("%s: unable to map resource\n", np->name); + goto err_free_clks; + } + + pdrm = syscon_regmap_lookup_by_phandle(np, "st,syscfg"); + if (IS_ERR(pdrm)) + pr_warn("%s: Unable to get syscfg\n", __func__); + else + /* Unconditionally disable backup domain write protection; + * it is never re-enabled here. + * Needed by the LSE & RTC clocks. + */ + regmap_update_bits(pdrm, PWR_CR, PWR_CR_DBP, PWR_CR_DBP); + + /* Get parent names from DT */ + hse_clk = of_clk_get_parent_name(np, 0); + lse_clk = of_clk_get_parent_name(np, 1); + i2s_clk = of_clk_get_parent_name(np, 2); + + sai_src[3] = i2s_clk; + spi_src1[3] = i2s_clk; + + /* Register Internal oscillators */ + clk_hw_register_fixed_rate(NULL, "clk-hsi", NULL, 0, 64000000); + clk_hw_register_fixed_rate(NULL, "clk-csi", NULL, 0, 4000000); + clk_hw_register_fixed_rate(NULL, "clk-lsi", NULL, 0, 32000); + clk_hw_register_fixed_rate(NULL, "clk-rc48", NULL, 0, 48000000); + + /* This clock is coming from outside.
Frequencies unknown */ + hws[CK_DSI_PHY] = clk_hw_register_fixed_rate(NULL, "ck_dsi_phy", NULL, + 0, 0); + + hws[HSI_DIV] = clk_hw_register_divider(NULL, "hsidiv", "clk-hsi", 0, + base + RCC_CR, 3, 2, CLK_DIVIDER_POWER_OF_TWO, + &stm32rcc_lock); + + hws[HSE_1M] = clk_hw_register_divider(NULL, "hse_1M", "hse_ck", 0, + base + RCC_CFGR, 8, 6, CLK_DIVIDER_ONE_BASED | + CLK_DIVIDER_ALLOW_ZERO, + &stm32rcc_lock); + + /* Mux system clocks */ + for (n = 0; n < ARRAY_SIZE(stm32_mclk); n++) + hws[MCLK_BANK + n] = clk_hw_register_mux(NULL, + stm32_mclk[n].name, + stm32_mclk[n].parents, + stm32_mclk[n].num_parents, + stm32_mclk[n].flags, + stm32_mclk[n].offset + base, + stm32_mclk[n].shift, + stm32_mclk[n].width, + 0, + &stm32rcc_lock); + + register_core_and_bus_clocks(); + + /* Oscillary clocks */ + for (n = 0; n < ARRAY_SIZE(stm32_oclk); n++) + hws[OSC_BANK + n] = clk_register_ready_gate(NULL, + stm32_oclk[n].name, + stm32_oclk[n].parent, + stm32_oclk[n].gate_offset + base, + stm32_oclk[n].bit_idx, + stm32_oclk[n].bit_rdy, + stm32_oclk[n].flags, + &stm32rcc_lock); + + hws[HSE_CK] = clk_register_ready_gate(NULL, + "hse_ck", + hse_clk, + RCC_CR + base, + 16, 17, + 0, + &stm32rcc_lock); + + hws[LSE_CK] = clk_register_ready_gate(NULL, + "lse_ck", + lse_clk, + RCC_BDCR + base, + 0, 1, + 0, + &stm32rcc_lock); + + hws[CSI_KER_DIV122 + n] = clk_hw_register_fixed_factor(NULL, + "csi_ker_div122", "csi_ker", 0, 1, 122); + + /* PLLs */ + for (n = 0; n < ARRAY_SIZE(stm32_pll); n++) { + int odf; + + /* Register the VCO */ + clk_register_stm32_pll(NULL, stm32_pll[n].name, + stm32_pll[n].parent_name, stm32_pll[n].flags, + stm32_pll[n].cfg, + &stm32rcc_lock); + + /* Register the 3 output dividers */ + for (odf = 0; odf < 3; odf++) { + int idx = n * 3 + odf; + + get_cfg_composite_div(&odf_clk_gcfg, &stm32_odf[n][odf], + &c_cfg, &stm32rcc_lock); + + hws[ODF_BANK + idx] = clk_hw_register_composite(NULL, + stm32_odf[n][odf].name, + stm32_odf[n][odf].parent_name, + stm32_odf[n][odf].num_parents, + c_cfg.mux_hw, c_cfg.mux_ops, + c_cfg.div_hw, c_cfg.div_ops, + c_cfg.gate_hw, c_cfg.gate_ops, + stm32_odf[n][odf].flags); + } + } + + /* Peripheral clocks */ + for (n = 0; n < ARRAY_SIZE(pclk); n++) + hws[PERIF_BANK + n] = clk_hw_register_gate(NULL, pclk[n].name, + pclk[n].parent, + pclk[n].flags, base + pclk[n].gate_offset, + pclk[n].bit_idx, pclk[n].flags, &stm32rcc_lock); + + /* Kernel clocks */ + for (n = 0; n < ARRAY_SIZE(kclk); n++) { + get_cfg_composite_div(&kernel_clk_cfg, &kclk[n], &c_cfg, + &stm32rcc_lock); + + hws[KERN_BANK + n] = clk_hw_register_composite(NULL, + kclk[n].name, + kclk[n].parent_name, + kclk[n].num_parents, + c_cfg.mux_hw, c_cfg.mux_ops, + c_cfg.div_hw, c_cfg.div_ops, + c_cfg.gate_hw, c_cfg.gate_ops, + kclk[n].flags); + } + + /* RTC clock (default state is off) */ + clk_hw_register_fixed_rate(NULL, "off", NULL, 0, 0); + + get_cfg_composite_div(&rtc_clk_cfg, &rtc_clk, &c_cfg, &stm32rcc_lock); + + hws[RTC_CK] = clk_hw_register_composite(NULL, + rtc_clk.name, + rtc_clk.parent_name, + rtc_clk.num_parents, + c_cfg.mux_hw, c_cfg.mux_ops, + c_cfg.div_hw, c_cfg.div_ops, + c_cfg.gate_hw, c_cfg.gate_ops, + rtc_clk.flags); + + /* Micro-controller clocks */ + for (n = 0; n < ARRAY_SIZE(mco_clk); n++) { + get_cfg_composite_div(&mco_clk_cfg, &mco_clk[n], &c_cfg, + &stm32rcc_lock); + + hws[MCO_BANK + n] = clk_hw_register_composite(NULL, + mco_clk[n].name, + mco_clk[n].parent_name, + mco_clk[n].num_parents, + c_cfg.mux_hw, c_cfg.mux_ops, + c_cfg.div_hw, c_cfg.div_ops, + c_cfg.gate_hw, c_cfg.gate_ops, + 
mco_clk[n].flags); + } + + of_clk_add_hw_provider(np, of_clk_hw_onecell_get, clk_data); + + return; + +err_free_clks: + kfree(clk_data); +} + +/* The RCC node is a clock and reset controller, and these + * functionalities are supported by different drivers that + * matches the same compatible strings. + */ +CLK_OF_DECLARE_DRIVER(stm32h7_rcc, "st,stm32h743-rcc", stm32h7_rcc_init); diff --git a/include/dt-bindings/clock/stm32h7-clks.h b/include/dt-bindings/clock/stm32h7-clks.h new file mode 100644 index 000000000000..6637272b3242 --- /dev/null +++ b/include/dt-bindings/clock/stm32h7-clks.h @@ -0,0 +1,165 @@ +/* SYS, CORE AND BUS CLOCKS */ +#define SYS_D1CPRE 0 +#define HCLK 1 +#define PCLK1 2 +#define PCLK2 3 +#define PCLK3 4 +#define PCLK4 5 +#define HSI_DIV 6 +#define HSE_1M 7 +#define I2S_CKIN 8 +#define CK_DSI_PHY 9 +#define HSE_CK 10 +#define LSE_CK 11 +#define CSI_KER_DIV122 12 +#define RTC_CK 13 +#define CPU_SYSTICK 14 + +/* OSCILLATOR BANK */ +#define OSC_BANK 18 +#define HSI_CK 18 +#define HSI_KER_CK 19 +#define CSI_CK 20 +#define CSI_KER_CK 21 +#define RC48_CK 22 +#define LSI_CK 23 + +/* MCLOCK BANK */ +#define MCLK_BANK 28 +#define PER_CK 28 +#define PLLSRC 29 +#define SYS_CK 30 +#define TRACEIN_CK 31 + +/* ODF BANK */ +#define ODF_BANK 32 +#define PLL1_P 32 +#define PLL1_Q 33 +#define PLL1_R 34 +#define PLL2_P 35 +#define PLL2_Q 36 +#define PLL2_R 37 +#define PLL3_P 38 +#define PLL3_Q 39 +#define PLL3_R 40 + +/* MCO BANK */ +#define MCO_BANK 41 +#define MCO1 41 +#define MCO2 42 + +/* PERIF BANK */ +#define PERIF_BANK 50 +#define D1SRAM1_CK 50 +#define ITCM_CK 51 +#define DTCM2_CK 52 +#define DTCM1_CK 53 +#define FLITF_CK 54 +#define JPGDEC_CK 55 +#define DMA2D_CK 56 +#define MDMA_CK 57 +#define USB2ULPI_CK 58 +#define USB1ULPI_CK 59 +#define ETH1RX_CK 60 +#define ETH1TX_CK 61 +#define ETH1MAC_CK 62 +#define ART_CK 63 +#define DMA2_CK 64 +#define DMA1_CK 65 +#define D2SRAM3_CK 66 +#define D2SRAM2_CK 67 +#define D2SRAM1_CK 68 +#define HASH_CK 69 +#define CRYPT_CK 70 +#define CAMITF_CK 71 +#define BKPRAM_CK 72 +#define HSEM_CK 73 +#define BDMA_CK 74 +#define CRC_CK 75 +#define GPIOK_CK 76 +#define GPIOJ_CK 77 +#define GPIOI_CK 78 +#define GPIOH_CK 79 +#define GPIOG_CK 80 +#define GPIOF_CK 81 +#define GPIOE_CK 82 +#define GPIOD_CK 83 +#define GPIOC_CK 84 +#define GPIOB_CK 85 +#define GPIOA_CK 86 +#define WWDG1_CK 87 +#define DAC12_CK 88 +#define WWDG2_CK 89 +#define TIM14_CK 90 +#define TIM13_CK 91 +#define TIM12_CK 92 +#define TIM7_CK 93 +#define TIM6_CK 94 +#define TIM5_CK 95 +#define TIM4_CK 96 +#define TIM3_CK 97 +#define TIM2_CK 98 +#define MDIOS_CK 99 +#define OPAMP_CK 100 +#define CRS_CK 101 +#define TIM17_CK 102 +#define TIM16_CK 103 +#define TIM15_CK 104 +#define TIM8_CK 105 +#define TIM1_CK 106 +#define TMPSENS_CK 107 +#define RTCAPB_CK 108 +#define VREF_CK 109 +#define COMP12_CK 110 +#define SYSCFG_CK 111 + +/* KERNEL BANK */ +#define KERN_BANK 120 +#define SDMMC1_CK 120 +#define QUADSPI_CK 121 +#define FMC_CK 122 +#define USB2OTG_CK 123 +#define USB1OTG_CK 124 +#define ADC12_CK 125 +#define SDMMC2_CK 126 +#define RNG_CK 127 +#define ADC3_CK 128 +#define DSI_CK 129 +#define LTDC_CK 130 +#define USART8_CK 131 +#define USART7_CK 132 +#define HDMICEC_CK 133 +#define I2C3_CK 134 +#define I2C2_CK 135 +#define I2C1_CK 136 +#define UART5_CK 137 +#define UART4_CK 138 +#define USART3_CK 139 +#define USART2_CK 140 +#define SPDIFRX_CK 141 +#define SPI3_CK 142 +#define SPI2_CK 143 +#define LPTIM1_CK 144 +#define FDCAN_CK 145 +#define SWP_CK 146 +#define HRTIM_CK 147 +#define DFSDM1_CK 
148 +#define SAI3_CK 149 +#define SAI2_CK 150 +#define SAI1_CK 151 +#define SPI5_CK 152 +#define SPI4_CK 153 +#define SPI1_CK 154 +#define USART6_CK 155 +#define USART1_CK 156 +#define SAI4B_CK 157 +#define SAI4A_CK 158 +#define LPTIM5_CK 159 +#define LPTIM4_CK 160 +#define LPTIM3_CK 161 +#define LPTIM2_CK 162 +#define I2C4_CK 163 +#define SPI6_CK 164 +#define LPUART1_CK 165 + +#define STM32H7_MAX_CLKS 166 diff --git a/include/dt-bindings/mfd/stm32h7-rcc.h b/include/dt-bindings/mfd/stm32h7-rcc.h new file mode 100644 index 000000000000..461a8e04453a --- /dev/null +++ b/include/dt-bindings/mfd/stm32h7-rcc.h @@ -0,0 +1,136 @@ +/* + * This header provides constants for the STM32H7 RCC IP + */ + +#ifndef _DT_BINDINGS_MFD_STM32H7_RCC_H +#define _DT_BINDINGS_MFD_STM32H7_RCC_H + +/* AHB3 */ +#define STM32H7_RCC_AHB3_MDMA 0 +#define STM32H7_RCC_AHB3_DMA2D 4 +#define STM32H7_RCC_AHB3_JPGDEC 5 +#define STM32H7_RCC_AHB3_FMC 12 +#define STM32H7_RCC_AHB3_QUADSPI 14 +#define STM32H7_RCC_AHB3_SDMMC1 16 +#define STM32H7_RCC_AHB3_CPU 31 + +#define STM32H7_AHB3_RESET(bit) (STM32H7_RCC_AHB3_##bit + (0x7C * 8)) + +/* AHB1 */ +#define STM32H7_RCC_AHB1_DMA1 0 +#define STM32H7_RCC_AHB1_DMA2 1 +#define STM32H7_RCC_AHB1_ADC12 5 +#define STM32H7_RCC_AHB1_ART 14 +#define STM32H7_RCC_AHB1_ETH1MAC 15 +#define STM32H7_RCC_AHB1_USB1OTG 25 +#define STM32H7_RCC_AHB1_USB2OTG 27 + +#define STM32H7_AHB1_RESET(bit) (STM32H7_RCC_AHB1_##bit + (0x80 * 8)) + +/* AHB2 */ +#define STM32H7_RCC_AHB2_CAMITF 0 +#define STM32H7_RCC_AHB2_CRYPT 4 +#define STM32H7_RCC_AHB2_HASH 5 +#define STM32H7_RCC_AHB2_RNG 6 +#define STM32H7_RCC_AHB2_SDMMC2 9 + +#define STM32H7_AHB2_RESET(bit) (STM32H7_RCC_AHB2_##bit + (0x84 * 8)) + +/* AHB4 */ +#define STM32H7_RCC_AHB4_GPIOA 0 +#define STM32H7_RCC_AHB4_GPIOB 1 +#define STM32H7_RCC_AHB4_GPIOC 2 +#define STM32H7_RCC_AHB4_GPIOD 3 +#define STM32H7_RCC_AHB4_GPIOE 4 +#define STM32H7_RCC_AHB4_GPIOF 5 +#define STM32H7_RCC_AHB4_GPIOG 6 +#define STM32H7_RCC_AHB4_GPIOH 7 +#define STM32H7_RCC_AHB4_GPIOI 8 +#define STM32H7_RCC_AHB4_GPIOJ 9 +#define STM32H7_RCC_AHB4_GPIOK 10 +#define STM32H7_RCC_AHB4_CRC 19 +#define STM32H7_RCC_AHB4_BDMA 21 +#define STM32H7_RCC_AHB4_ADC3 24 +#define STM32H7_RCC_AHB4_HSEM 25 + +#define STM32H7_AHB4_RESET(bit) (STM32H7_RCC_AHB4_##bit + (0x88 * 8)) + +/* APB3 */ +#define STM32H7_RCC_APB3_LTDC 3 +#define STM32H7_RCC_APB3_DSI 4 + +#define STM32H7_APB3_RESET(bit) (STM32H7_RCC_APB3_##bit + (0x8C * 8)) + +/* APB1L */ +#define STM32H7_RCC_APB1L_TIM2 0 +#define STM32H7_RCC_APB1L_TIM3 1 +#define STM32H7_RCC_APB1L_TIM4 2 +#define STM32H7_RCC_APB1L_TIM5 3 +#define STM32H7_RCC_APB1L_TIM6 4 +#define STM32H7_RCC_APB1L_TIM7 5 +#define STM32H7_RCC_APB1L_TIM12 6 +#define STM32H7_RCC_APB1L_TIM13 7 +#define STM32H7_RCC_APB1L_TIM14 8 +#define STM32H7_RCC_APB1L_LPTIM1 9 +#define STM32H7_RCC_APB1L_SPI2 14 +#define STM32H7_RCC_APB1L_SPI3 15 +#define STM32H7_RCC_APB1L_SPDIF_RX 16 +#define STM32H7_RCC_APB1L_USART2 17 +#define STM32H7_RCC_APB1L_USART3 18 +#define STM32H7_RCC_APB1L_UART4 19 +#define STM32H7_RCC_APB1L_UART5 20 +#define STM32H7_RCC_APB1L_I2C1 21 +#define STM32H7_RCC_APB1L_I2C2 22 +#define STM32H7_RCC_APB1L_I2C3 23 +#define STM32H7_RCC_APB1L_HDMICEC 27 +#define STM32H7_RCC_APB1L_DAC12 29 +#define STM32H7_RCC_APB1L_USART7 30 +#define STM32H7_RCC_APB1L_USART8 31 + +#define STM32H7_APB1L_RESET(bit) (STM32H7_RCC_APB1L_##bit + (0x90 * 8)) + +/* APB1H */ +#define STM32H7_RCC_APB1H_CRS 1 +#define STM32H7_RCC_APB1H_SWP 2 +#define STM32H7_RCC_APB1H_OPAMP 4 +#define STM32H7_RCC_APB1H_MDIOS 5 +#define 
STM32H7_RCC_APB1H_FDCAN 8 + +#define STM32H7_APB1H_RESET(bit) (STM32H7_RCC_APB1H_##bit + (0x94 * 8)) + +/* APB2 */ +#define STM32H7_RCC_APB2_TIM1 0 +#define STM32H7_RCC_APB2_TIM8 1 +#define STM32H7_RCC_APB2_USART1 4 +#define STM32H7_RCC_APB2_USART6 5 +#define STM32H7_RCC_APB2_SPI1 12 +#define STM32H7_RCC_APB2_SPI4 13 +#define STM32H7_RCC_APB2_TIM15 16 +#define STM32H7_RCC_APB2_TIM16 17 +#define STM32H7_RCC_APB2_TIM17 18 +#define STM32H7_RCC_APB2_SPI5 20 +#define STM32H7_RCC_APB2_SAI1 22 +#define STM32H7_RCC_APB2_SAI2 23 +#define STM32H7_RCC_APB2_SAI3 24 +#define STM32H7_RCC_APB2_DFSDM1 28 +#define STM32H7_RCC_APB2_HRTIM 29 + +#define STM32H7_APB2_RESET(bit) (STM32H7_RCC_APB2_##bit + (0x98 * 8)) + +/* APB4 */ +#define STM32H7_RCC_APB4_SYSCFG 1 +#define STM32H7_RCC_APB4_LPUART1 3 +#define STM32H7_RCC_APB4_SPI6 5 +#define STM32H7_RCC_APB4_I2C4 7 +#define STM32H7_RCC_APB4_LPTIM2 9 +#define STM32H7_RCC_APB4_LPTIM3 10 +#define STM32H7_RCC_APB4_LPTIM4 11 +#define STM32H7_RCC_APB4_LPTIM5 12 +#define STM32H7_RCC_APB4_COMP12 14 +#define STM32H7_RCC_APB4_VREF 15 +#define STM32H7_RCC_APB4_SAI4 21 +#define STM32H7_RCC_APB4_TMPSENS 26 + +#define STM32H7_APB4_RESET(bit) (STM32H7_RCC_APB4_##bit + (0x9C * 8)) + +#endif /* _DT_BINDINGS_MFD_STM32H7_RCC_H */ -- cgit v1.2.3 From 64104f703212ff50e855bb2e2fa80d71db62c521 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Aug 2017 16:58:39 -0700 Subject: scsi: Call scsi_initialize_rq() for filesystem requests If a pass-through request is submitted then blk_get_request() initializes that request by calling scsi_initialize_rq(). Also call this function for filesystem requests. Introduce SCMD_INITIALIZED to keep track of whether or not a request has already been initialized. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Brian King Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 26 ++++++++++++++++++++++---- include/scsi/scsi_cmnd.h | 3 +++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 938a7e398cd4..4bfe0df35823 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -642,6 +642,11 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_queue_add_random(q)) add_disk_randomness(req->rq_disk); + if (!blk_rq_is_scsi(req)) { + WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); + cmd->flags &= ~SCMD_INITIALIZED; + } + if (req->mq_ctx) { /* * In the MQ case the command gets freed by __blk_mq_end_request, @@ -1110,7 +1115,8 @@ EXPORT_SYMBOL(scsi_init_io); * scsi_initialize_rq - initialize struct scsi_cmnd.req * @rq: Request associated with the SCSI command to be initialized. * - * Called from inside blk_get_request(). + * Called from inside blk_get_request() for pass-through requests and from + * inside scsi_init_command() for filesystem requests.
*/ void scsi_initialize_rq(struct request *rq) { @@ -1154,7 +1160,13 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) { void *buf = cmd->sense_buffer; void *prot = cmd->prot_sdb; - unsigned int unchecked_isa_dma = cmd->flags & SCMD_UNCHECKED_ISA_DMA; + struct request *rq = blk_mq_rq_from_pdu(cmd); + unsigned int flags = cmd->flags & SCMD_PRESERVED_FLAGS; + + if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) { + flags |= SCMD_INITIALIZED; + scsi_initialize_rq(rq); + } /* zero out the cmd, except for the embedded scsi_request */ memset((char *)cmd + sizeof(cmd->req), 0, @@ -1163,7 +1175,7 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) cmd->device = dev; cmd->sense_buffer = buf; cmd->prot_sdb = prot; - cmd->flags = unchecked_isa_dma; + cmd->flags = flags; INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler); cmd->jiffies_at_alloc = jiffies; @@ -1350,6 +1362,8 @@ static int scsi_prep_fn(struct request_queue *q, struct request *req) ret = scsi_setup_cmnd(sdev, req); out: + if (ret != BLKPREP_OK) + cmd->flags &= ~SCMD_INITIALIZED; return scsi_prep_return(q, req, ret); } @@ -1869,6 +1883,7 @@ static int scsi_mq_prep_fn(struct request *req) struct scsi_device *sdev = req->q->queuedata; struct Scsi_Host *shost = sdev->host; struct scatterlist *sg; + int ret; scsi_init_command(sdev, cmd); @@ -1902,7 +1917,10 @@ static int scsi_mq_prep_fn(struct request *req) blk_mq_start_request(req); - return scsi_setup_cmnd(sdev, req); + ret = scsi_setup_cmnd(sdev, req); + if (ret != BLK_STS_OK) + cmd->flags &= ~SCMD_INITIALIZED; + return ret; } static void scsi_mq_done(struct scsi_cmnd *cmd) diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index f5afcff8d76f..a9f8f7e79d83 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -57,6 +57,9 @@ struct scsi_pointer { /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_UNCHECKED_ISA_DMA (1 << 1) +#define SCMD_INITIALIZED (1 << 3) +/* flags preserved across unprep / reprep */ +#define SCMD_PRESERVED_FLAGS (SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED) struct scsi_cmnd { struct scsi_request req; -- cgit v1.2.3 From 482dca939fb7ee35ba20b944b4c2476133dbf0df Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 31 Aug 2017 15:05:44 -0700 Subject: bpf: Add mark and priority to sock options that can be set Add socket mark and priority to fields that can be set by ebpf program when a socket is created. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 08c206a863e1..ba848b761cfb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -758,6 +758,8 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 diff --git a/net/core/filter.c b/net/core/filter.c index c6a37fe0285b..f51b9690adf3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size, switch (off) { case offsetof(struct bpf_sock, bound_dev_if): break; + case offsetof(struct bpf_sock, mark): + break; + case offsetof(struct bpf_sock, priority): + break; default: return false; } @@ -3958,6 +3962,28 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, offsetof(struct sock, sk_bound_dev_if)); break; + case offsetof(struct bpf_sock, mark): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + break; + + case offsetof(struct bpf_sock, priority): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + break; + case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); -- cgit v1.2.3 From abcc61537e3566cae7f1fd225f2dcb82b3595fe3 Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Thu, 31 Aug 2017 10:04:24 +0200 Subject: ANDROID: binder: Add BINDER_GET_NODE_DEBUG_INFO ioctl The BINDER_GET_NODE_DEBUG_INFO ioctl will return debug info on a node. Each successive call reusing the previous return value will return the next node. The data will be used by libmemunreachable to mark the pointers with kernel references as reachable. 
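As a rough sketch of the usage pattern this ioctl enables (the open binder fd and error reporting are assumed; dump_binder_nodes() is an illustrative name, not part of this patch):

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/android/binder.h>

	/* Walk every binder node of the process behind 'fd': start with
	 * ptr == 0, feed the returned ptr back into the next call, and
	 * stop once the kernel leaves ptr at 0 because no node with a
	 * greater ptr exists.
	 */
	static void dump_binder_nodes(int fd)
	{
		struct binder_node_debug_info info;

		memset(&info, 0, sizeof(info));
		for (;;) {
			if (ioctl(fd, BINDER_GET_NODE_DEBUG_INFO, &info) < 0)
				break;
			if (!info.ptr)
				break;	/* no more nodes */
			printf("node %llx cookie %llx strong %u weak %u\n",
			       (unsigned long long)info.ptr,
			       (unsigned long long)info.cookie,
			       info.has_strong_ref, info.has_weak_ref);
		}
	}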
Signed-off-by: Colin Cross Signed-off-by: Martijn Coenen Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 43 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/android/binder.h | 14 ++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 8f2031c52ea4..eb8bd8d7c4c9 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4389,6 +4389,31 @@ out: return ret; } +static int binder_ioctl_get_node_debug_info(struct binder_proc *proc, + struct binder_node_debug_info *info) +{ + struct rb_node *n; + binder_uintptr_t ptr = info->ptr; + + memset(info, 0, sizeof(*info)); + + binder_inner_proc_lock(proc); + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { + struct binder_node *node = rb_entry(n, struct binder_node, + rb_node); + if (node->ptr > ptr) { + info->ptr = node->ptr; + info->cookie = node->cookie; + info->has_strong_ref = node->has_strong_ref; + info->has_weak_ref = node->has_weak_ref; + break; + } + } + binder_inner_proc_unlock(proc); + + return 0; +} + static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int ret; @@ -4458,6 +4483,24 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } break; } + case BINDER_GET_NODE_DEBUG_INFO: { + struct binder_node_debug_info info; + + if (copy_from_user(&info, ubuf, sizeof(info))) { + ret = -EFAULT; + goto err; + } + + ret = binder_ioctl_get_node_debug_info(proc, &info); + if (ret < 0) + goto err; + + if (copy_to_user(ubuf, &info, sizeof(info))) { + ret = -EFAULT; + goto err; + } + break; + } default: ret = -EINVAL; goto err; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 7668b5791c91..84a9a0944e13 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -186,6 +186,19 @@ struct binder_version { #define BINDER_CURRENT_PROTOCOL_VERSION 8 #endif +/* + * Use with BINDER_GET_NODE_DEBUG_INFO, driver reads ptr, writes to all fields. + * Set ptr to NULL for the first call to get the info for the first node, and + * then repeat the call passing the previously returned value to get the next + * nodes. ptr will be 0 when there are no more nodes. + */ +struct binder_node_debug_info { + binder_uintptr_t ptr; + binder_uintptr_t cookie; + __u32 has_strong_ref; + __u32 has_weak_ref; +}; + #define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) #define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) #define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) @@ -193,6 +206,7 @@ struct binder_version { #define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) #define BINDER_THREAD_EXIT _IOW('b', 8, __s32) #define BINDER_VERSION _IOWR('b', 9, struct binder_version) +#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) /* * NOTE: Two special error codes you should check for when calling -- cgit v1.2.3 From b32dbc1e0bf0eaa8bdd725491b361d5fc2f57a85 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Aug 2017 17:12:18 +0200 Subject: dma-coherent: remove the DMA_MEMORY_INCLUDES_CHILDREN flag This flag was never implemented or used. 
Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- Documentation/DMA-API.txt | 4 ---- drivers/base/dma-coherent.c | 2 -- drivers/char/virtio_console.c | 3 --- include/linux/dma-mapping.h | 3 +-- 4 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index ef3a04fcad65..dddf52e768d6 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -600,10 +600,6 @@ flags can be ORed together and are: One or both of these flags must be present. -- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by - dma_alloc_coherent of any child devices of this one (for memory residing - on a bridge). - - DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions. Do not allow dma_alloc_coherent() to fall back to system memory when it's out of memory in the declared region. diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index 1c152aed6b82..6f6dc4d41788 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -109,8 +109,6 @@ static int dma_assign_coherent_memory(struct device *dev, return -EBUSY; dev->dma_mem = mem; - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - return 0; } diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index ad843eb02ae7..48e86ce9fac8 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -451,9 +451,6 @@ static struct port_buffer *alloc_buf(struct virtqueue *vq, size_t buf_size, * device is created by remoteproc, the DMA memory is * associated with the grandparent device: * vdev => rproc => platform-dev. - * The code here would have been less quirky if - * DMA_MEMORY_INCLUDES_CHILDREN had been supported - * in dma-coherent.c */ if (!vq->vdev->dev.parent || !vq->vdev->dev.parent->parent) goto free_buf; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4c98cc96971f..abf369b11f54 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -696,8 +696,7 @@ static inline int dma_get_cache_alignment(void) /* flags for the coherent memory api */ #define DMA_MEMORY_MAP 0x01 #define DMA_MEMORY_IO 0x02 -#define DMA_MEMORY_INCLUDES_CHILDREN 0x04 -#define DMA_MEMORY_EXCLUSIVE 0x08 +#define DMA_MEMORY_EXCLUSIVE 0x04 #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, -- cgit v1.2.3 From 2436bdcda53ff4abb7897c87fa29ef3de8055344 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Aug 2017 17:13:09 +0200 Subject: dma-coherent: remove the DMA_MEMORY_MAP and DMA_MEMORY_IO flags DMA_MEMORY_IO was never used in the tree, so remove it. That means there is no need for the DMA_MEMORY_MAP flag either now, so remove it as well and change dma_declare_coherent_memory to return a normal errno value. 
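A hedged caller sketch under the new convention (foo_probe, phys_base and region_size are placeholders, not taken from this series):

	#include <linux/dma-mapping.h>
	#include <linux/platform_device.h>

	static int foo_probe(struct platform_device *pdev)
	{
		int ret;

		/* Previously this returned the DMA_MEMORY_MAP/DMA_MEMORY_IO
		 * bitmask on success and 0 on failure; now it returns 0 or a
		 * negative errno, which can be propagated directly. */
		ret = dma_declare_coherent_memory(&pdev->dev, phys_base,
						  phys_base, region_size,
						  DMA_MEMORY_EXCLUSIVE);
		if (ret)
			return ret;	/* e.g. -ENOMEM or -EBUSY */

		/* ... rest of probe ... */
		return 0;
	}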
Signed-off-by: Christoph Hellwig Reviewed-by: Marek Szyprowski --- Documentation/DMA-API.txt | 21 +--------- arch/arm/mach-imx/mach-imx27_visstrim_m10.c | 44 ++++++++------------- arch/arm/mach-imx/mach-mx31moboard.c | 12 +++--- arch/sh/drivers/pci/fixups-dreamcast.c | 3 +- drivers/base/dma-coherent.c | 46 +++++++--------------- drivers/base/dma-mapping.c | 7 +--- .../platform/soc_camera/sh_mobile_ceu_camera.c | 5 +-- drivers/scsi/NCR_Q720.c | 3 +- drivers/usb/host/ohci-sm501.c | 7 ++-- drivers/usb/host/ohci-tmio.c | 9 ++--- include/linux/dma-mapping.h | 6 +-- 11 files changed, 52 insertions(+), 111 deletions(-) (limited to 'include') diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt index dddf52e768d6..ac66ae2509a9 100644 --- a/Documentation/DMA-API.txt +++ b/Documentation/DMA-API.txt @@ -592,30 +592,11 @@ size is the size of the area (must be multiples of PAGE_SIZE). flags can be ORed together and are: -- DMA_MEMORY_MAP - request that the memory returned from - dma_alloc_coherent() be directly writable. - -- DMA_MEMORY_IO - request that the memory returned from - dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc. - -One or both of these flags must be present. - - DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions. Do not allow dma_alloc_coherent() to fall back to system memory when it's out of memory in the declared region. -The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and -must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO -if only DMA_MEMORY_MAP were passed in) for success or zero for -failure. - -Note, for DMA_MEMORY_IO returns, all subsequent memory returned by -dma_alloc_coherent() may no longer be accessed directly, but instead -must be accessed using the correct bus functions. If your driver -isn't prepared to handle this contingency, it should not specify -DMA_MEMORY_IO in the input flags. - -As a simplification for the platforms, only **one** such region of +As a simplification for the platforms, only *one* such region of memory may be declared per device. 
For reasons of efficiency, most platforms choose to track the declared diff --git a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c index dd75a4756761..5169dfba9718 100644 --- a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c +++ b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c @@ -245,7 +245,6 @@ static phys_addr_t mx2_camera_base __initdata; static void __init visstrim_analog_camera_init(void) { struct platform_device *pdev; - int dma; gpio_set_value(TVP5150_PWDN, 1); ndelay(1); @@ -258,12 +257,9 @@ static void __init visstrim_analog_camera_init(void) if (IS_ERR(pdev)) return; - dma = dma_declare_coherent_memory(&pdev->dev, - mx2_camera_base, mx2_camera_base, - MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) - return; + dma_declare_coherent_memory(&pdev->dev, mx2_camera_base, + mx2_camera_base, MX2_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); } static void __init visstrim_reserve(void) @@ -444,16 +440,13 @@ static const struct imx_ssi_platform_data visstrim_m10_ssi_pdata __initconst = { static void __init visstrim_coda_init(void) { struct platform_device *pdev; - int dma; pdev = imx27_add_coda(); - dma = dma_declare_coherent_memory(&pdev->dev, - mx2_camera_base + MX2_CAMERA_BUF_SIZE, - mx2_camera_base + MX2_CAMERA_BUF_SIZE, - MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) - return; + dma_declare_coherent_memory(&pdev->dev, + mx2_camera_base + MX2_CAMERA_BUF_SIZE, + mx2_camera_base + MX2_CAMERA_BUF_SIZE, + MX2_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); } /* DMA deinterlace */ @@ -466,24 +459,21 @@ static void __init visstrim_deinterlace_init(void) { int ret = -ENOMEM; struct platform_device *pdev = &visstrim_deinterlace; - int dma; ret = platform_device_register(pdev); - dma = dma_declare_coherent_memory(&pdev->dev, - mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, - mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, - MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) - return; + dma_declare_coherent_memory(&pdev->dev, + mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, + mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, + MX2_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); } /* Emma-PrP for format conversion */ static void __init visstrim_emmaprp_init(void) { struct platform_device *pdev; - int dma; + int ret; pdev = imx27_add_mx2_emmaprp(); if (IS_ERR(pdev)) @@ -493,11 +483,11 @@ static void __init visstrim_emmaprp_init(void) * Use the same memory area as the analog camera since both * devices are, by nature, exclusive. 
*/ - dma = dma_declare_coherent_memory(&pdev->dev, + ret = dma_declare_coherent_memory(&pdev->dev, mx2_camera_base, mx2_camera_base, MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) + DMA_MEMORY_EXCLUSIVE); + if (ret) pr_err("Failed to declare memory for emmaprp\n"); } diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c index bde9a9af6714..3cd030960c98 100644 --- a/arch/arm/mach-imx/mach-mx31moboard.c +++ b/arch/arm/mach-imx/mach-mx31moboard.c @@ -475,7 +475,7 @@ static phys_addr_t mx3_camera_base __initdata; static int __init mx31moboard_init_cam(void) { - int dma, ret = -ENOMEM; + int dma, ret; struct platform_device *pdev; imx31_add_ipu_core(); @@ -484,11 +484,11 @@ static int __init mx31moboard_init_cam(void) if (IS_ERR(pdev)) return PTR_ERR(pdev); - dma = dma_declare_coherent_memory(&pdev->dev, - mx3_camera_base, mx3_camera_base, - MX3_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) + ret = dma_declare_coherent_memory(&pdev->dev, + mx3_camera_base, mx3_camera_base, + MX3_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); + if (ret) goto err; ret = platform_device_add(pdev); diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c index 1d1c5a227e50..c931d5872ffe 100644 --- a/arch/sh/drivers/pci/fixups-dreamcast.c +++ b/arch/sh/drivers/pci/fixups-dreamcast.c @@ -63,11 +63,10 @@ static void gapspci_fixup_resources(struct pci_dev *dev) res.end = GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1; res.flags = IORESOURCE_MEM; pcibios_resource_to_bus(dev->bus, ®ion, &res); - BUG_ON(!dma_declare_coherent_memory(&dev->dev, + BUG_ON(dma_declare_coherent_memory(&dev->dev, res.start, region.start, resource_size(&res), - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)); break; default: diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index 6f6dc4d41788..b275eeb739f1 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -46,15 +46,10 @@ static bool dma_init_coherent_memory( int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; if (!size) goto out; - if (flags & DMA_MEMORY_MAP) - mem_base = memremap(phys_addr, size, MEMREMAP_WC); - else - mem_base = ioremap(phys_addr, size); + mem_base = memremap(phys_addr, size, MEMREMAP_WC); if (!mem_base) goto out; @@ -77,12 +72,8 @@ static bool dma_init_coherent_memory( out: kfree(dma_mem); - if (mem_base) { - if (flags & DMA_MEMORY_MAP) - memunmap(mem_base); - else - iounmap(mem_base); - } + if (mem_base) + memunmap(mem_base); return false; } @@ -91,10 +82,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem) if (!mem) return; - if (mem->flags & DMA_MEMORY_MAP) - memunmap(mem->virt_base); - else - iounmap(mem->virt_base); + memunmap(mem->virt_base); kfree(mem->bitmap); kfree(mem); } @@ -116,16 +104,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) { struct dma_coherent_mem *mem; + int ret; - if (!dma_init_coherent_memory(phys_addr, device_addr, size, flags, - &mem)) - return 0; - - if (dma_assign_coherent_memory(dev, mem) == 0) - return flags & DMA_MEMORY_MAP ? 
DMA_MEMORY_MAP : DMA_MEMORY_IO; + ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + if (ret) + return ret; - dma_release_coherent_memory(mem); - return 0; + ret = dma_assign_coherent_memory(dev, mem); + if (ret) + dma_release_coherent_memory(mem); + return ret; } EXPORT_SYMBOL(dma_declare_coherent_memory); @@ -186,15 +174,9 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, */ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); - dma_memory_map = (mem->flags & DMA_MEMORY_MAP); spin_unlock_irqrestore(&mem->spinlock, flags); - if (dma_memory_map) - memset(ret, 0, size); - else - memset_io(ret, 0, size); - + memset(ret, 0, size); return ret; - err: spin_unlock_irqrestore(&mem->spinlock, flags); return NULL; @@ -360,7 +342,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) if (!mem && !dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE, + DMA_MEMORY_EXCLUSIVE, &mem)) { pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index b555ff9dd8fc..e584eddef0a7 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -176,13 +176,10 @@ int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, flags); - if (rc) { + if (!rc) devres_add(dev, res); - rc = 0; - } else { + else devres_free(res); - rc = -ENOMEM; - } return rc; } diff --git a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c index 96dc01750bc0..36762ec954e7 100644 --- a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c +++ b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c @@ -1708,11 +1708,10 @@ static int sh_mobile_ceu_probe(struct platform_device *pdev) err = dma_declare_coherent_memory(&pdev->dev, res->start, res->start, resource_size(res), - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!err) { + if (err) { dev_err(&pdev->dev, "Unable to declare CEU memory.\n"); - return -ENXIO; + return err; } pcdev->video_limit = resource_size(res); diff --git a/drivers/scsi/NCR_Q720.c b/drivers/scsi/NCR_Q720.c index 05835bf1bf9c..54e7d26908ee 100644 --- a/drivers/scsi/NCR_Q720.c +++ b/drivers/scsi/NCR_Q720.c @@ -217,8 +217,7 @@ NCR_Q720_probe(struct device *dev) } if (dma_declare_coherent_memory(dev, base_addr, base_addr, - mem_size, DMA_MEMORY_MAP) - != DMA_MEMORY_MAP) { + mem_size, 0)) { printk(KERN_ERR "NCR_Q720: DMA declare memory failed\n"); goto out_release_region; } diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c index a8b8d8b8d9f3..d4e0f7cd96fa 100644 --- a/drivers/usb/host/ohci-sm501.c +++ b/drivers/usb/host/ohci-sm501.c @@ -123,13 +123,12 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev) * regular memory. The HCD_LOCAL_MEM flag does just that. 
*/ - if (!dma_declare_coherent_memory(dev, mem->start, + retval = dma_declare_coherent_memory(dev, mem->start, mem->start - mem->parent->start, resource_size(mem), - DMA_MEMORY_MAP | - DMA_MEMORY_EXCLUSIVE)) { + DMA_MEMORY_EXCLUSIVE); + if (retval) { dev_err(dev, "cannot declare coherent memory\n"); - retval = -ENXIO; goto err1; } diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c index cfcfadfc94fc..16d081a093bb 100644 --- a/drivers/usb/host/ohci-tmio.c +++ b/drivers/usb/host/ohci-tmio.c @@ -227,13 +227,10 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev) goto err_ioremap_regs; } - if (!dma_declare_coherent_memory(&dev->dev, sram->start, - sram->start, - resource_size(sram), - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)) { - ret = -EBUSY; + ret = dma_declare_coherent_memory(&dev->dev, sram->start, sram->start, + resource_size(sram), DMA_MEMORY_EXCLUSIVE); + if (ret) goto err_dma_declare; - } if (cell->enable) { ret = cell->enable(dev); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index abf369b11f54..b7dd11c02a45 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -694,9 +694,7 @@ static inline int dma_get_cache_alignment(void) #endif /* flags for the coherent memory api */ -#define DMA_MEMORY_MAP 0x01 -#define DMA_MEMORY_IO 0x02 -#define DMA_MEMORY_EXCLUSIVE 0x04 +#define DMA_MEMORY_EXCLUSIVE 0x01 #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, @@ -709,7 +707,7 @@ static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) { - return 0; + return -ENOSYS; } static inline void -- cgit v1.2.3 From a728f56094e7cf60a1dc0642fe86901d1a4dfb4e Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Thu, 17 Aug 2017 13:42:36 +0200 Subject: ASoC: make clock direction configurable in asoc-simple Some CPU drivers (e.g. davinci-mcasp) may require the system clock to be configured as OUT, while there's currently no good way to set SND_SOC_CLOCK_OUT in the simple-card driver if the clock is fixed-rate. This patch makes asoc_simple_card_init_dai() initialize the clock to SND_SOC_CLOCK_OUT if explicitly stated in the relevant dts file. This change is transparent and doesn't change the default behavior. Signed-off-by: Vitaly Wool Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/simple-card.txt | 3 +++ include/sound/simple_card_utils.h | 1 + sound/soc/generic/simple-card-utils.c | 9 +++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/simple-card.txt b/Documentation/devicetree/bindings/sound/simple-card.txt index c7a93931fad2..166f2290233b 100644 --- a/Documentation/devicetree/bindings/sound/simple-card.txt +++ b/Documentation/devicetree/bindings/sound/simple-card.txt @@ -86,6 +86,9 @@ Optional CPU/CODEC subnodes properties: in dai startup() and disabled with clk_disable_unprepare() in dai shutdown(). +- system-clock-direction-out : specifies clock direction as 'out' on + initialization. It is useful for some CPUs with + fixed clocks.
Example 1 - single DAI link: diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 42c6a6ac3ce6..7e25afce6566 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -15,6 +15,7 @@ struct asoc_simple_dai { const char *name; unsigned int sysclk; + int clk_direction; int slots; int slot_width; unsigned int tx_slot_mask; diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c index 26d64fa40c9c..af44c8c52a19 100644 --- a/sound/soc/generic/simple-card-utils.c +++ b/sound/soc/generic/simple-card-utils.c @@ -196,7 +196,11 @@ int asoc_simple_card_parse_clk(struct device *dev, simple_dai->sysclk = clk_get_rate(clk); } - dev_dbg(dev, "%s : sysclk = %d\n", name, simple_dai->sysclk); + if (of_property_read_bool(node, "system-clock-direction-out")) + simple_dai->clk_direction = SND_SOC_CLOCK_OUT; + + dev_dbg(dev, "%s : sysclk = %d, direction %d\n", name, + simple_dai->sysclk, simple_dai->clk_direction); return 0; } @@ -310,7 +314,8 @@ int asoc_simple_card_init_dai(struct snd_soc_dai *dai, int ret; if (simple_dai->sysclk) { - ret = snd_soc_dai_set_sysclk(dai, 0, simple_dai->sysclk, 0); + ret = snd_soc_dai_set_sysclk(dai, 0, simple_dai->sysclk, + simple_dai->clk_direction); if (ret && ret != -ENOTSUPP) { dev_err(dai->dev, "simple-card: set_sysclk error\n"); return ret; -- cgit v1.2.3 From 25cc72a33835ed8a6f53180a822cadab855852ac Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 10:52:31 +0200 Subject: mlxsw: spectrum: Forbid linking to devices that have uppers The mlxsw driver relies on NETDEV_CHANGEUPPER events to configure the device in case a port is enslaved to a master netdev such as a bridge or a bond. Since the driver ignores events unrelated to its ports and their uppers, it's possible to engineer situations in which the device's data path differs from the kernel's. One example of such a situation is when a port is enslaved to a bond that is already enslaved to a bridge. When the bond was enslaved, the driver ignored the event - as the bond wasn't one of its uppers - and therefore a bridge port instance wasn't created in the device. Until such configurations are supported, forbid them by checking that the upper device doesn't have uppers of its own. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Ido Schimmel Reported-by: Nogah Frankel Tested-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 6 ++++++ include/linux/netdevice.h | 2 ++ net/core/dev.c | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 60bf8f27cc00..c6a3e61b53bd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4139,6 +4139,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, return -EINVAL; if (!info->linking) break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; if (netif_is_lag_master(upper_dev) && !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev, info->upper_info)) @@ -4258,6 +4260,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, upper_dev = info->upper_dev; if (!netif_is_bridge_master(upper_dev)) return -EINVAL; + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 779b23595596..c99ba7914c0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3866,6 +3866,8 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev, bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); +bool netdev_has_any_upper_dev(struct net_device *dev); + void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 818dfa6e7ab5..86b4b0a79e7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5668,12 +5668,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -static bool netdev_has_any_upper_dev(struct net_device *dev) +bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device -- cgit v1.2.3 From 2093ea2e098b602e0ecefcd30211193b888da5d3 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 1 Sep 2017 16:40:41 +0200 Subject: drm/vtables: Fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The callback is named .atomic_check, not .atomc_check. Reviewed-by: Ville Syrjälä Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20170901144042.6023-1-thierry.reding@gmail.com --- include/drm/drm_modeset_helper_vtables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_modeset_helper_vtables.h b/include/drm/drm_modeset_helper_vtables.h index c55cf3ff6847..16646c44b7df 100644 --- a/include/drm/drm_modeset_helper_vtables.h +++ b/include/drm/drm_modeset_helper_vtables.h @@ -314,7 +314,7 @@ struct drm_crtc_helper_funcs { * implementation in drm_atomic_helper_check(). * * When using drm_atomic_helper_check_planes() this hook is called - * after the &drm_plane_helper_funcs.atomc_check hook for planes, which + * after the &drm_plane_helper_funcs.atomic_check hook for planes, which * allows drivers to assign shared resources requested by planes in this * callback here. 
For more complicated dependencies the driver can call * the provided check helpers multiple times until the computed state -- cgit v1.2.3 From 46ad42a3756d5e2078ed05bc99aa197815afac5d Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 1 Sep 2017 16:40:42 +0200 Subject: drm/atomic: Fix typo in kerneldoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The for_each_crtc_in_state() is used to iterate over CRTCs rather than connectors. Reviewed-by: Ville Syrjälä Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20170901144042.6023-2-thierry.reding@gmail.com --- include/drm/drm_atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 8a5808eb5628..f73b663c1f76 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -643,7 +643,7 @@ void drm_state_dump(struct drm_device *dev, struct drm_printer *p); for_each_if (connector) /** - * for_each_crtc_in_state - iterate over all connectors in an atomic update + * for_each_crtc_in_state - iterate over all CRTCs in an atomic update * @__state: &struct drm_atomic_state pointer * @crtc: &struct drm_crtc iteration cursor * @crtc_state: &struct drm_crtc_state iteration cursor -- cgit v1.2.3 From 799ea9e9c59949008770aab4e1da87f10e99dbe4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 18 Aug 2017 18:08:25 -0700 Subject: xfs: evict all inodes involved with log redo item When we introduced the bmap redo log items, we set MS_ACTIVE on the mountpoint and XFS_IRECOVERY on the inode to prevent unlinked inodes from being truncated prematurely during log recovery. This also had the effect of putting linked inodes on the lru instead of evicting them. Unfortunately, we neglected to find all those unreferenced lru inodes and evict them after finishing log recovery, which means that we leak them if anything goes wrong in the rest of xfs_mountfs, because the lru is only cleaned out on unmount. Therefore, evict unreferenced inodes in the lru list immediately after clearing MS_ACTIVE. Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") Signed-off-by: Darrick J. Wong Cc: viro@ZenIV.linux.org.uk Reviewed-by: Brian Foster --- fs/inode.c | 1 + fs/internal.h | 1 - fs/xfs/xfs_log.c | 12 ++++++++++++ include/linux/fs.h | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/inode.c b/fs/inode.c index 50370599e371..6a1626e0edaf 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -637,6 +637,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock diff --git a/fs/internal.h b/fs/internal.h index 9676fe11c093..fedfe94d84ba 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f467f1230444..bcb2f860e508 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -761,12 +761,24 @@ xfs_log_mount_finish( * inodes. Turn it off immediately after recovery finishes * so that we don't leak the quota inodes if subsequent mount * activities fail. 
+ * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. */ mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..00f46e018370 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2831,6 +2831,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern unsigned int get_next_ino(void); +extern void evict_inodes(struct super_block *sb); extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); -- cgit v1.2.3 From 8db6c34f1dbc8e06aa016a9b829b06902c3e1340 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 8 May 2017 13:11:56 -0500 Subject: Introduce v3 namespaced file capabilities Root in a non-initial user ns cannot be trusted to write a traditional security.capability xattr. If it were allowed to do so, then any unprivileged user on the host could map his own uid to root in a private namespace, write the xattr, and execute the file with privilege on the host. However supporting file capabilities in a user namespace is very desirable. Not doing so means that any programs designed to run with limited privilege must continue to support other methods of gaining and dropping privilege. For instance a program installer must detect whether file capabilities can be assigned, and assign them if so but set setuid-root otherwise. The program in turn must know how to drop partial capabilities, and do so only if setuid-root. This patch introduces v3 of the security.capability xattr. It builds a vfs_ns_cap_data struct by appending a uid_t rootid to struct vfs_cap_data. This is the absolute uid_t (that is, the uid_t in user namespace which mounted the filesystem, usually init_user_ns) of the root id in whose namespaces the file capabilities may take effect. When a task asks to write a v2 security.capability xattr, if it is privileged with respect to the userns which mounted the filesystem, then nothing should change. Otherwise, the kernel will transparently rewrite the xattr as a v3 with the appropriate rootid. This is done during the execution of setxattr() to catch user-space-initiated capability writes. Subsequently, any task executing the file which has the noted kuid as its root uid, or which is in a descendent user_ns of such a user_ns, will run the file with capabilities. Similarly when asking to read file capabilities, a v3 capability will be presented as v2 if it applies to the caller's namespace. If a task writes a v3 security.capability, then it can provide a uid for the xattr so long as the uid is valid in its own user namespace, and it is privileged with CAP_SETFCAP over its namespace. The kernel will translate that rootid to an absolute uid, and write that to disk. 
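As a sketch of such a write from user space (hypothetical path and rootid; little-endian fields built with htole32(); assumes the updated uapi header; error handling omitted):

	#include <endian.h>
	#include <string.h>
	#include <sys/xattr.h>
	#include <linux/capability.h>

	/* Grant cap_net_raw (permitted + effective), taking effect only in
	 * namespaces where uid 1000 maps to root. */
	static int write_v3_cap(const char *path)
	{
		struct vfs_ns_cap_data cap;

		memset(&cap, 0, sizeof(cap));
		cap.magic_etc = htole32(VFS_CAP_REVISION_3 |
					VFS_CAP_FLAGS_EFFECTIVE);
		cap.data[0].permitted = htole32(1U << CAP_NET_RAW);
		cap.rootid = htole32(1000);	/* uid valid in writer's ns */

		/* The kernel translates rootid to an absolute uid on disk. */
		return setxattr(path, "security.capability", &cap,
				sizeof(cap), 0);
	}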
After this, a task in the writer's namespace will not be able to use those capabilities (unless rootid was 0), but a task in a namespace where the given uid is root will. Only a single security.capability xattr may exist at a time for a given file. A task may overwrite an existing xattr so long as it is privileged over the inode. Note this is a departure from previous semantics, which required privilege to remove a security.capability xattr. This check can be re-added if deemed useful. This allows a simple setxattr to work, allows tar/untar to work, and allows us to tar in one namespace and untar in another while preserving the capability, without risking leaking privilege into a parent namespace. Example using tar: $ cp /bin/sleep sleepx $ mkdir b1 b2 $ lxc-usernsexec -m b:0:100000:1 -m b:1:$(id -u):1 -- chown 0:0 b1 $ lxc-usernsexec -m b:0:100001:1 -m b:1:$(id -u):1 -- chown 0:0 b2 $ lxc-usernsexec -m b:0:100000:1000 -- tar --xattrs-include=security.capability --xattrs -cf b1/sleepx.tar sleepx $ lxc-usernsexec -m b:0:100001:1000 -- tar --xattrs-include=security.capability --xattrs -C b2 -xf b1/sleepx.tar $ lxc-usernsexec -m b:0:100001:1000 -- getcap b2/sleepx b2/sleepx = cap_sys_admin+ep # /opt/ltp/testcases/bin/getv3xattr b2/sleepx v3 xattr, rootid is 100001 A patch to linux-test-project adding a new set of tests for this functionality is in the nsfscaps branch at github.com/hallyn/ltp Changelog: Nov 02 2016: fix invalid check at refuse_fcap_overwrite() Nov 07 2016: convert rootid from and to fs user_ns (From ebiederm: mar 28 2017) commoncap.c: fix typos - s/v4/v3 get_vfs_caps_from_disk: clarify the fs_ns root access check nsfscaps: change the code split for cap_inode_setxattr() Apr 09 2017: don't return v3 cap for caps owned by current root. return a v2 cap for a true v2 cap in non-init ns Apr 18 2017: . Change the flow of fscap writing to support s_user_ns writing. . Remove refuse_fcap_overwrite(). The value of the previous xattr doesn't matter. Apr 24 2017: . incorporate Eric's incremental diff . move cap_convert_nscap to setxattr and simplify its usage May 8, 2017: . fix leaking dentry refcount in cap_inode_getsecurity Signed-off-by: Serge Hallyn Signed-off-by: Eric W. 
Biederman --- fs/xattr.c | 6 + include/linux/capability.h | 2 + include/linux/security.h | 2 + include/uapi/linux/capability.h | 22 +++- security/commoncap.c | 270 +++++++++++++++++++++++++++++++++++++--- 5 files changed, 280 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/fs/xattr.c b/fs/xattr.c index 464c94bf65f9..7b03df6b8be2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -441,6 +441,12 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_from_user(kvalue, size); + else if (strcmp(kname, XATTR_NAME_CAPS) == 0) { + error = cap_convert_nscap(d, &kvalue, size); + if (error < 0) + goto out; + size = error; + } } error = vfs_setxattr(d, kname, kvalue, size, flags); diff --git a/include/linux/capability.h b/include/linux/capability.h index 6ffb67e10c06..b52e278e4744 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -248,4 +248,6 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); + #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/security.h b/include/linux/security.h index b6ea1dc9cc9d..6fff8c924718 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -91,6 +91,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name, extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); +extern int cap_inode_getsecurity(struct inode *inode, const char *name, + void **buffer, bool alloc); extern int cap_mmap_addr(unsigned long addr); extern int cap_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 6fe14d001f68..230e05d35191 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32_2 2 #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) -#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 -#define VFS_CAP_U32 VFS_CAP_U32_2 -#define VFS_CAP_REVISION VFS_CAP_REVISION_2 +#define VFS_CAP_REVISION_3 0x03000000 +#define VFS_CAP_U32_3 2 +#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3)) + +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3 +#define VFS_CAP_U32 VFS_CAP_U32_3 +#define VFS_CAP_REVISION VFS_CAP_REVISION_3 struct vfs_cap_data { __le32 magic_etc; /* Little endian */ @@ -72,6 +76,18 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; +/* + * same as vfs_cap_data but with a rootid at the end + */ +struct vfs_ns_cap_data { + __le32 magic_etc; + struct { + __le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } data[VFS_CAP_U32]; + __le32 rootid; +}; + #ifndef __KERNEL__ /* diff --git a/security/commoncap.c b/security/commoncap.c index d59320282294..c37d27dd1e2c 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -335,6 +335,209 @@ int cap_inode_killpriv(struct dentry *dentry) return error; } +static bool rootid_owns_currentns(kuid_t kroot) +{ + struct user_namespace *ns; + + if (!uid_valid(kroot)) + 
return false; + + for (ns = current_user_ns(); ; ns = ns->parent) { + if (from_kuid(ns, kroot) == 0) + return true; + if (ns == &init_user_ns) + break; + } + + return false; +} + +static __u32 sansflags(__u32 m) +{ + return m & ~VFS_CAP_FLAGS_EFFECTIVE; +} + +static bool is_v2header(size_t size, __le32 magic) +{ + __u32 m = le32_to_cpu(magic); + if (size != XATTR_CAPS_SZ_2) + return false; + return sansflags(m) == VFS_CAP_REVISION_2; +} + +static bool is_v3header(size_t size, __le32 magic) +{ + __u32 m = le32_to_cpu(magic); + + if (size != XATTR_CAPS_SZ_3) + return false; + return sansflags(m) == VFS_CAP_REVISION_3; +} + +/* + * getsecurity: We are called for security.* before any attempt to read the + * xattr from the inode itself. + * + * This gives us a chance to read the on-disk value and convert it. If we + * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. + * + * Note we are not called by vfs_getxattr_alloc(), but that is only called + * by the integrity subsystem, which really wants the unconverted values - + * so that's good. + */ +int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, + bool alloc) +{ + int size, ret; + kuid_t kroot; + uid_t root, mappedroot; + char *tmpbuf = NULL; + struct vfs_cap_data *cap; + struct vfs_ns_cap_data *nscap; + struct dentry *dentry; + struct user_namespace *fs_ns; + + if (strcmp(name, "capability") != 0) + return -EOPNOTSUPP; + + dentry = d_find_alias(inode); + if (!dentry) + return -EINVAL; + + size = sizeof(struct vfs_ns_cap_data); + ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, + &tmpbuf, size, GFP_NOFS); + dput(dentry); + + if (ret < 0) + return ret; + + fs_ns = inode->i_sb->s_user_ns; + cap = (struct vfs_cap_data *) tmpbuf; + if (is_v2header((size_t) ret, cap->magic_etc)) { + /* If this is sizeof(vfs_cap_data) then we're ok with the + * on-disk value, so return that. */ + if (alloc) + *buffer = tmpbuf; + else + kfree(tmpbuf); + return ret; + } else if (!is_v3header((size_t) ret, cap->magic_etc)) { + kfree(tmpbuf); + return -EINVAL; + } + + nscap = (struct vfs_ns_cap_data *) tmpbuf; + root = le32_to_cpu(nscap->rootid); + kroot = make_kuid(fs_ns, root); + + /* If the root kuid maps to a valid uid in current ns, then return + * this as a nscap. */ + mappedroot = from_kuid(current_user_ns(), kroot); + if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { + if (alloc) { + *buffer = tmpbuf; + nscap->rootid = cpu_to_le32(mappedroot); + } else + kfree(tmpbuf); + return size; + } + + if (!rootid_owns_currentns(kroot)) { + kfree(tmpbuf); + return -EOPNOTSUPP; + } + + /* This comes from a parent namespace. 
Return as a v2 capability */ + size = sizeof(struct vfs_cap_data); + if (alloc) { + *buffer = kmalloc(size, GFP_ATOMIC); + if (*buffer) { + struct vfs_cap_data *cap = *buffer; + __le32 nsmagic, magic; + magic = VFS_CAP_REVISION_2; + nsmagic = le32_to_cpu(nscap->magic_etc); + if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) + magic |= VFS_CAP_FLAGS_EFFECTIVE; + memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + cap->magic_etc = cpu_to_le32(magic); + } + } + kfree(tmpbuf); + return size; +} + +static kuid_t rootid_from_xattr(const void *value, size_t size, + struct user_namespace *task_ns) +{ + const struct vfs_ns_cap_data *nscap = value; + uid_t rootid = 0; + + if (size == XATTR_CAPS_SZ_3) + rootid = le32_to_cpu(nscap->rootid); + + return make_kuid(task_ns, rootid); +} + +static bool validheader(size_t size, __le32 magic) +{ + return is_v2header(size, magic) || is_v3header(size, magic); +} + +/* + * User requested a write of security.capability. If needed, update the + * xattr to change from v2 to v3, or to fixup the v3 rootid. + * + * If all is ok, we return the new size, on error return < 0. + */ +int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) +{ + struct vfs_ns_cap_data *nscap; + uid_t nsrootid; + const struct vfs_cap_data *cap = *ivalue; + __u32 magic, nsmagic; + struct inode *inode = d_backing_inode(dentry); + struct user_namespace *task_ns = current_user_ns(), + *fs_ns = inode->i_sb->s_user_ns; + kuid_t rootid; + size_t newsize; + + if (!*ivalue) + return -EINVAL; + if (!validheader(size, cap->magic_etc)) + return -EINVAL; + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) + return -EPERM; + if (size == XATTR_CAPS_SZ_2) + if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) + /* user is privileged, just write the v2 */ + return size; + + rootid = rootid_from_xattr(*ivalue, size, task_ns); + if (!uid_valid(rootid)) + return -EINVAL; + + nsrootid = from_kuid(fs_ns, rootid); + if (nsrootid == -1) + return -EINVAL; + + newsize = sizeof(struct vfs_ns_cap_data); + nscap = kmalloc(newsize, GFP_ATOMIC); + if (!nscap) + return -ENOMEM; + nscap->rootid = cpu_to_le32(nsrootid); + nsmagic = VFS_CAP_REVISION_3; + magic = le32_to_cpu(cap->magic_etc); + if (magic & VFS_CAP_FLAGS_EFFECTIVE) + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; + nscap->magic_etc = cpu_to_le32(nsmagic); + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + + kvfree(*ivalue); + *ivalue = nscap; + return newsize; +} + /* * Calculate the new process capability sets from the capability sets attached * to a file. 
@@ -388,7 +591,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data __u32 magic_etc; unsigned tocopy, i; int size; - struct vfs_cap_data caps; + struct vfs_ns_cap_data data, *nscaps = &data; + struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; + kuid_t rootkuid; + struct user_namespace *fs_ns = inode->i_sb->s_user_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); @@ -396,18 +602,20 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return -ENODATA; size = __vfs_getxattr((struct dentry *)dentry, inode, - XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ); + XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; + if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; - cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc); + cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); + rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) @@ -419,15 +627,27 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return -EINVAL; tocopy = VFS_CAP_U32_2; break; + case VFS_CAP_REVISION_3: + if (size != XATTR_CAPS_SZ_3) + return -EINVAL; + tocopy = VFS_CAP_U32_3; + rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); + break; + default: return -EINVAL; } + /* Limit the caps to the mounter of the filesystem + * or the more limited uid specified in the xattr. + */ + if (!rootid_owns_currentns(rootkuid)) + return -ENODATA; CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; - cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted); - cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable); + cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); + cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; @@ -465,8 +685,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n", - __func__, rc, bprm->filename); + printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", + bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; @@ -663,15 +883,19 @@ int cap_bprm_secureexec(struct linux_binprm *bprm) int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (!strcmp(name, XATTR_NAME_CAPS)) { - if (!capable(CAP_SETFCAP)) - return -EPERM; + /* Ignore non-security xattrs */ + if (strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0) + return 0; + + /* + * For XATTR_NAME_CAPS the check will be done in + * cap_convert_nscap(), called by setxattr() + */ + if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; - } - if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -689,15 +913,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, */ int cap_inode_removexattr(struct dentry *dentry, const char *name) { - if (!strcmp(name, XATTR_NAME_CAPS)) { - if (!capable(CAP_SETFCAP)) + /* Ignore non-security xattrs */ + if (strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0) + return 0; + + if 
(strcmp(name, XATTR_NAME_CAPS) == 0) { + /* security.capability gets namespaced */ + struct inode *inode = d_backing_inode(dentry); + if (!inode) + return -EINVAL; + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; return 0; } - if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -1085,6 +1316,7 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), + LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(mmap_file, cap_mmap_file), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), -- cgit v1.2.3 From d897246df9fc0a5df97a784bf7b072be4a6ae479 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 31 Aug 2017 13:47:35 -0700 Subject: fsmap: fix documentation of FMR_OF_LAST The FMR_OF_LAST flag is set on the last fsmap record being returned for the dataset requested, contrary to what the header file says. Fix the docs to reflect the behavior of all fsmap implementations. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- include/uapi/linux/fsmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h index 7e8e5f0bd6d2..e5213c3e38b2 100644 --- a/include/uapi/linux/fsmap.h +++ b/include/uapi/linux/fsmap.h @@ -96,7 +96,7 @@ fsmap_advance( #define FMR_OF_EXTENT_MAP 0x4 /* segment = extent map */ #define FMR_OF_SHARED 0x8 /* segment = shared with another file */ #define FMR_OF_SPECIAL_OWNER 0x10 /* owner is a special value */ -#define FMR_OF_LAST 0x20 /* segment is the last in the FS */ +#define FMR_OF_LAST 0x20 /* segment is the last in the dataset */ /* Each FS gets to define its own special owner codes. 
*/ #define FMR_OWNER(type, code) (((__u64)type << 32) | \ -- cgit v1.2.3 From 65bce46298d064dff9db1282e17bb26602715819 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Fri, 1 Sep 2017 13:41:17 -0700 Subject: Bluetooth: make baswap src const Signed-off-by: Loic Poulain Signed-off-by: Bjorn Andersson Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 2 +- net/bluetooth/lib.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 01487192f628..020142bb9735 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -233,7 +233,7 @@ static inline void bacpy(bdaddr_t *dst, const bdaddr_t *src) memcpy(dst, src, sizeof(bdaddr_t)); } -void baswap(bdaddr_t *dst, bdaddr_t *src); +void baswap(bdaddr_t *dst, const bdaddr_t *src); /* Common socket structures and functions */ diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index aa4cf64e32a6..63e65d9b4b24 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -30,10 +30,10 @@ #include -void baswap(bdaddr_t *dst, bdaddr_t *src) +void baswap(bdaddr_t *dst, const bdaddr_t *src) { - unsigned char *d = (unsigned char *) dst; - unsigned char *s = (unsigned char *) src; + const unsigned char *s = (const unsigned char *)src; + unsigned char *d = (unsigned char *)dst; unsigned int i; for (i = 0; i < 6; i++) -- cgit v1.2.3 From 0142626d08223b0f6ad04859301b53178f11c317 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Fri, 1 Sep 2017 16:35:20 -0500 Subject: PCI: Correct kernel-doc of pci_vpd_srdt_size(), pci_vpd_srdt_tag() The kernel-doc comments don't match the arguments, so fix the comments. Signed-off-by: Shawn Lin Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..7950fc697394 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2059,7 +2059,7 @@ static inline u16 pci_vpd_lrdt_tag(const u8 *lrdt) /** * pci_vpd_srdt_size - Extracts the Small Resource Data Type length - * @lrdt: Pointer to the beginning of the Small Resource Data Type tag + * @srdt: Pointer to the beginning of the Small Resource Data Type tag * * Returns the extracted Small Resource Data Type length. */ @@ -2070,7 +2070,7 @@ static inline u8 pci_vpd_srdt_size(const u8 *srdt) /** * pci_vpd_srdt_tag - Extracts the Small Resource Data Type Tag Item - * @lrdt: Pointer to the beginning of the Small Resource Data Type tag + * @srdt: Pointer to the beginning of the Small Resource Data Type tag * * Returns the extracted Small Resource Data Type Tag Item. */ -- cgit v1.2.3 From 96291d565550c1fd363e488cc17cb3189d2e4cc2 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 1 Sep 2017 16:35:50 -0500 Subject: PCI: Fix typos and whitespace errors Fix various typos and whitespace errors: s/Synopsis/Synopsys/ s/Designware/DesignWare/ s/Keystine/Keystone/ s/gpio/GPIO/ s/pcie/PCIe/ s/phy/PHY/ s/confgiruation/configuration/ No functional change intended. 
Signed-off-by: Bjorn Helgaas --- CREDITS | 2 +- .../devicetree/bindings/pci/83xx-512x-pci.txt | 6 +++--- .../devicetree/bindings/pci/altera-pcie.txt | 18 ++++++++-------- .../devicetree/bindings/pci/axis,artpec6-pcie.txt | 2 +- .../devicetree/bindings/pci/designware-pcie.txt | 24 ++++++++++------------ .../devicetree/bindings/pci/fsl,imx6q-pcie.txt | 2 +- .../devicetree/bindings/pci/hisilicon-pcie.txt | 4 ++-- .../devicetree/bindings/pci/kirin-pcie.txt | 8 ++++---- .../devicetree/bindings/pci/layerscape-pci.txt | 2 +- .../devicetree/bindings/pci/mvebu-pci.txt | 2 +- .../devicetree/bindings/pci/pci-armada8k.txt | 2 +- .../devicetree/bindings/pci/pci-keystone.txt | 15 +++++++------- .../devicetree/bindings/pci/qcom,pcie.txt | 4 ++-- .../devicetree/bindings/pci/ralink,rt3883-pci.txt | 2 +- Documentation/devicetree/bindings/pci/rcar-pci.txt | 7 +++---- .../devicetree/bindings/pci/rockchip-pcie.txt | 2 +- .../bindings/pci/samsung,exynos5440-pcie.txt | 22 ++++++++++---------- .../devicetree/bindings/pci/spear13xx-pcie.txt | 6 +++--- Documentation/devicetree/bindings/pci/ti-pci.txt | 8 ++++---- .../devicetree/bindings/pci/versatile.txt | 2 +- .../devicetree/bindings/pci/xgene-pci-msi.txt | 5 +++-- .../devicetree/bindings/pci/xgene-pci.txt | 8 ++++---- .../devicetree/bindings/pci/xilinx-nwl-pcie.txt | 7 ++++--- MAINTAINERS | 2 +- drivers/pci/dwc/Kconfig | 12 +++++------ drivers/pci/dwc/pci-dra7xx.c | 1 - drivers/pci/dwc/pci-keystone-dw.c | 2 +- drivers/pci/dwc/pcie-designware-ep.c | 2 +- drivers/pci/dwc/pcie-designware-host.c | 2 +- drivers/pci/dwc/pcie-designware.c | 2 +- drivers/pci/dwc/pcie-designware.h | 2 +- drivers/pci/host/pcie-rockchip.c | 2 +- drivers/pci/host/pcie-xilinx.c | 2 +- drivers/pci/pcie/aer/aerdrv_core.c | 4 ++-- drivers/pci/quirks.c | 2 +- include/linux/aer.h | 5 ++--- include/linux/pcieport_if.h | 2 +- 37 files changed, 99 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/CREDITS b/CREDITS index 5d09c26d69cd..0d2d60de5a25 100644 --- a/CREDITS +++ b/CREDITS @@ -2090,7 +2090,7 @@ S: Kuala Lumpur, Malaysia N: Mohit Kumar D: ST Microelectronics SPEAr13xx PCI host bridge driver -D: Synopsys Designware PCI host bridge driver +D: Synopsys DesignWare PCI host bridge driver N: Gabor Kuti E: seasons@falcon.sch.bme.hu diff --git a/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt b/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt index 35a465362408..b9165b72473c 100644 --- a/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt +++ b/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt @@ -1,11 +1,11 @@ * Freescale 83xx and 512x PCI bridges -Freescale 83xx and 512x SOCs include the same pci bridge core. +Freescale 83xx and 512x SOCs include the same PCI bridge core. 83xx/512x specific notes: - reg: should contain two address length tuples - The first is for the internal pci bridge registers - The second is for the pci config space access registers + The first is for the internal PCI bridge registers + The second is for the PCI config space access registers Example (MPC8313ERDB) pci0: pci@e0008500 { diff --git a/Documentation/devicetree/bindings/pci/altera-pcie.txt b/Documentation/devicetree/bindings/pci/altera-pcie.txt index 2951a6a50704..495880193adc 100644 --- a/Documentation/devicetree/bindings/pci/altera-pcie.txt +++ b/Documentation/devicetree/bindings/pci/altera-pcie.txt @@ -7,21 +7,21 @@ Required properties: "Txs": TX slave port region "Cra": Control register access region - interrupt-parent: interrupt source phandle. 
-- interrupts: specifies the interrupt source of the parent interrupt controller. - The format of the interrupt specifier depends on the parent interrupt - controller. +- interrupts: specifies the interrupt source of the parent interrupt + controller. The format of the interrupt specifier depends + on the parent interrupt controller. - device_type: must be "pci" - #address-cells: set to <3> -- #size-cells: set to <2> +- #size-cells: set to <2> - #interrupt-cells: set to <1> -- ranges: describes the translation of addresses for root ports and standard - PCI regions. +- ranges: describes the translation of addresses for root ports and + standard PCI regions. - interrupt-map-mask and interrupt-map: standard PCI properties to define the mapping of the PCIe interface to interrupt numbers. Optional properties: -- msi-parent: Link to the hardware entity that serves as the MSI controller for this PCIe - controller. +- msi-parent: Link to the hardware entity that serves as the MSI controller + for this PCIe controller. - bus-range: PCI bus numbers covered Example @@ -45,5 +45,5 @@ Example <0 0 0 3 &pcie_0 3>, <0 0 0 4 &pcie_0 4>; ranges = <0x82000000 0x00000000 0x00000000 0xc0000000 0x00000000 0x10000000 - 0x82000000 0x00000000 0x10000000 0xd0000000 0x00000000 0x10000000>; + 0x82000000 0x00000000 0x10000000 0xd0000000 0x00000000 0x10000000>; }; diff --git a/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt index 5ecaea1e6eee..4e4aee4439ea 100644 --- a/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt +++ b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt @@ -6,7 +6,7 @@ and thus inherits all the common properties defined in designware-pcie.txt. Required properties: - compatible: "axis,artpec6-pcie", "snps,dw-pcie" - reg: base addresses and lengths of the PCIe controller (DBI), - the phy controller, and configuration address space. + the PHY controller, and configuration address space. - reg-names: Must include the following entries: - "dbi" - "phy" diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index b2480dd38c11..1da7ade3183c 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -1,4 +1,4 @@ -* Synopsys Designware PCIe interface +* Synopsys DesignWare PCIe interface Required properties: - compatible: should contain "snps,dw-pcie" to identify the core. @@ -17,29 +17,27 @@ RC mode: properties to define the mapping of the PCIe interface to interrupt numbers. EP mode: -- num-ib-windows: number of inbound address translation - windows -- num-ob-windows: number of outbound address translation - windows +- num-ib-windows: number of inbound address translation windows +- num-ob-windows: number of outbound address translation windows Optional properties: - num-lanes: number of lanes to use (this property should be specified unless the link is brought already up in BIOS) -- reset-gpio: gpio pin number of power good signal +- reset-gpio: GPIO pin number of power good signal - clocks: Must contain an entry for each entry in clock-names. See ../clocks/clock-bindings.txt for details. - clock-names: Must include the following entries: - "pcie" - "pcie_bus" RC mode: -- num-viewport: number of view ports configured in - hardware. If a platform does not specify it, the driver assumes 2. 
-- bus-range: PCI bus numbers covered (it is recommended - for new devicetrees to specify this property, to keep backwards - compatibility a range of 0x00-0xff is assumed if not present) +- num-viewport: number of view ports configured in hardware. If a platform + does not specify it, the driver assumes 2. +- bus-range: PCI bus numbers covered (it is recommended for new devicetrees + to specify this property, to keep backwards compatibility a range of + 0x00-0xff is assumed if not present) + EP mode: -- max-functions: maximum number of functions that can be - configured +- max-functions: maximum number of functions that can be configured Example configuration: diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index cf92d3ba5a26..7b1e48bf172b 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -1,6 +1,6 @@ * Freescale i.MX6 PCIe interface -This PCIe host controller is based on the Synopsis Designware PCIe IP +This PCIe host controller is based on the Synopsys DesignWare PCIe IP and thus inherits all the common properties defined in designware-pcie.txt. Required properties: diff --git a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt index a339dbb15493..b0fd3ba66ec9 100644 --- a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt +++ b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt @@ -1,7 +1,7 @@ HiSilicon Hip05 and Hip06 PCIe host bridge DT description -HiSilicon PCIe host controller is based on Designware PCI core. -It shares common functions with PCIe Designware core driver and inherits +HiSilicon PCIe host controller is based on the Synopsys DesignWare PCI core. +It shares common functions with the PCIe DesignWare core driver and inherits common properties defined in Documentation/devicetree/bindings/pci/designware-pci.txt. diff --git a/Documentation/devicetree/bindings/pci/kirin-pcie.txt b/Documentation/devicetree/bindings/pci/kirin-pcie.txt index 68ffa0fbcd73..6e217c63123d 100644 --- a/Documentation/devicetree/bindings/pci/kirin-pcie.txt +++ b/Documentation/devicetree/bindings/pci/kirin-pcie.txt @@ -1,8 +1,8 @@ HiSilicon Kirin SoCs PCIe host DT description -Kirin PCIe host controller is based on Designware PCI core. -It shares common functions with PCIe Designware core driver -and inherits common properties defined in +Kirin PCIe host controller is based on the Synopsys DesignWare PCI core. +It shares common functions with the PCIe DesignWare core driver and +inherits common properties defined in Documentation/devicetree/bindings/pci/designware-pci.txt. Additional properties are described here: @@ -16,7 +16,7 @@ Required properties "apb": apb Ctrl register defined by Kirin; "phy": apb PHY register defined by Kirin; "config": PCIe configuration space registers. -- reset-gpios: The gpio to generate PCIe perst assert and deassert signal. +- reset-gpios: The GPIO to generate PCIe PERST# assert and deassert signal. 
Optional properties: diff --git a/Documentation/devicetree/bindings/pci/layerscape-pci.txt b/Documentation/devicetree/bindings/pci/layerscape-pci.txt index ee1c72d5162e..e00c706f85d9 100644 --- a/Documentation/devicetree/bindings/pci/layerscape-pci.txt +++ b/Documentation/devicetree/bindings/pci/layerscape-pci.txt @@ -16,7 +16,7 @@ Required properties: "fsl,ls1021a-pcie", "snps,dw-pcie" "fsl,ls2080a-pcie", "fsl,ls2085a-pcie", "snps,dw-pcie" "fsl,ls1046a-pcie" -- reg: base addresses and lengths of the PCIe controller +- reg: base addresses and lengths of the PCIe controller register blocks. - interrupts: A list of interrupt outputs of the controller. Must contain an entry for each entry in the interrupt-names property. - interrupt-names: Must include the following entries: diff --git a/Documentation/devicetree/bindings/pci/mvebu-pci.txt b/Documentation/devicetree/bindings/pci/mvebu-pci.txt index 2de6f65ecfb1..196e034f6879 100644 --- a/Documentation/devicetree/bindings/pci/mvebu-pci.txt +++ b/Documentation/devicetree/bindings/pci/mvebu-pci.txt @@ -77,7 +77,7 @@ and the following optional properties: - marvell,pcie-lane: the physical PCIe lane number, for ports having multiple lanes. If this property is not found, we assume that the value is 0. -- reset-gpios: optional gpio to PERST# +- reset-gpios: optional GPIO to PERST# - reset-delay-us: delay in us to wait after reset de-assertion, if not specified will default to 100ms, as required by the PCIe specification. diff --git a/Documentation/devicetree/bindings/pci/pci-armada8k.txt b/Documentation/devicetree/bindings/pci/pci-armada8k.txt index 598533a57d79..c54a84350ec8 100644 --- a/Documentation/devicetree/bindings/pci/pci-armada8k.txt +++ b/Documentation/devicetree/bindings/pci/pci-armada8k.txt @@ -1,6 +1,6 @@ * Marvell Armada 7K/8K PCIe interface -This PCIe host controller is based on the Synopsis Designware PCIe IP +This PCIe host controller is based on the Synopsys DesignWare PCIe IP and thus inherits all the common properties defined in designware-pcie.txt. Required properties: diff --git a/Documentation/devicetree/bindings/pci/pci-keystone.txt b/Documentation/devicetree/bindings/pci/pci-keystone.txt index d08a4d51108f..7e05487544ed 100644 --- a/Documentation/devicetree/bindings/pci/pci-keystone.txt +++ b/Documentation/devicetree/bindings/pci/pci-keystone.txt @@ -1,12 +1,12 @@ TI Keystone PCIe interface -Keystone PCI host Controller is based on Designware PCI h/w version 3.65. -It shares common functions with PCIe Designware core driver and inherit -common properties defined in +Keystone PCI host Controller is based on the Synopsys DesignWare PCI +hardware version 3.65. It shares common functions with the PCIe DesignWare +core driver and inherits common properties defined in Documentation/devicetree/bindings/pci/designware-pci.txt Please refer to Documentation/devicetree/bindings/pci/designware-pci.txt -for the details of Designware DT bindings. Additional properties are +for the details of DesignWare DT bindings. Additional properties are described here as well as properties that are not applicable. 
Required Properties:- @@ -52,13 +52,12 @@ pcie_intc: Interrupt controller device node for Legacy IRQ chip }; Optional properties:- - phys: phandle to Generic Keystone SerDes phy for PCI - phy-names: name of the Generic Keystine SerDes phy for PCI + phys: phandle to generic Keystone SerDes PHY for PCI + phy-names: name of the generic Keystone SerDes PHY for PCI - If boot loader already does PCI link establishment, then phys and phy-names shouldn't be present. interrupts: platform interrupt for error interrupts. -Designware DT Properties not applicable for Keystone PCI +DesignWare DT Properties not applicable for Keystone PCI 1. pcie_bus clock-names not used. Instead, a phandle to phys is used. - diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt index 9d418b71774f..1425bb639300 100644 --- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt +++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt @@ -20,7 +20,7 @@ Value type: Definition: Must include the following entries - "parf" Qualcomm specific registers - - "dbi" Designware PCIe registers + - "dbi" DesignWare PCIe registers - "elbi" External local bus interface registers - "config" PCIe configuration space @@ -180,7 +180,7 @@ - -gpios: Usage: optional Value type: - Definition: List of phandle and gpio specifier pairs. Should contain + Definition: List of phandle and GPIO specifier pairs. Should contain - "perst-gpios" PCIe endpoint reset signal line - "wake-gpios" PCIe endpoint wake signal line diff --git a/Documentation/devicetree/bindings/pci/ralink,rt3883-pci.txt b/Documentation/devicetree/bindings/pci/ralink,rt3883-pci.txt index 8e0a1eb0acbb..a04ab1b76211 100644 --- a/Documentation/devicetree/bindings/pci/ralink,rt3883-pci.txt +++ b/Documentation/devicetree/bindings/pci/ralink,rt3883-pci.txt @@ -71,7 +71,7 @@ - interrupt-map: standard PCI properties to define the mapping of the PCI interface to interrupt numbers. - The PCI host bridge node migh have additional sub-nodes representing + The PCI host bridge node might have additional sub-nodes representing the onboard PCI devices/PCI slots. Each such sub-node must have the following mandatory properties: diff --git a/Documentation/devicetree/bindings/pci/rcar-pci.txt b/Documentation/devicetree/bindings/pci/rcar-pci.txt index bd27428dda61..6b5b388fbc99 100644 --- a/Documentation/devicetree/bindings/pci/rcar-pci.txt +++ b/Documentation/devicetree/bindings/pci/rcar-pci.txt @@ -14,7 +14,7 @@ compatible: "renesas,pcie-r8a7779" for the R8A7779 SoC; SoC-specific version corresponding to the platform first followed by the generic version. -- reg: base address and length of the pcie controller registers. +- reg: base address and length of the PCIe controller registers. - #address-cells: set to <3> - #size-cells: set to <2> - bus-range: PCI bus numbers covered @@ -25,15 +25,14 @@ compatible: "renesas,pcie-r8a7779" for the R8A7779 SoC; source for hardware related interrupts (e.g. link speed change). - #interrupt-cells: set to <1> - interrupt-map-mask and interrupt-map: standard PCI properties - to define the mapping of the PCIe interface to interrupt - numbers. + to define the mapping of the PCIe interface to interrupt numbers. - clocks: from common clock binding: clock specifiers for the PCIe controller and PCIe bus clocks. - clock-names: from common clock binding: should be "pcie" and "pcie_bus". 
Example: -SoC specific DT Entry: +SoC-specific DT Entry: pcie: pcie@fe000000 { compatible = "renesas,pcie-r8a7791", "renesas,pcie-rcar-gen2"; diff --git a/Documentation/devicetree/bindings/pci/rockchip-pcie.txt b/Documentation/devicetree/bindings/pci/rockchip-pcie.txt index 1453a734c2f5..1136e9282108 100644 --- a/Documentation/devicetree/bindings/pci/rockchip-pcie.txt +++ b/Documentation/devicetree/bindings/pci/rockchip-pcie.txt @@ -45,7 +45,7 @@ Required properties: Optional Property: - aspm-no-l0s: RC won't support ASPM L0s. This property is needed if using 24MHz OSC for RC's PHY. -- ep-gpios: contain the entry for pre-reset gpio +- ep-gpios: contain the entry for pre-reset GPIO - num-lanes: number of lanes to use - vpcie3v3-supply: The phandle to the 3.3v regulator to use for PCIe. - vpcie1v8-supply: The phandle to the 1.8v regulator to use for PCIe. diff --git a/Documentation/devicetree/bindings/pci/samsung,exynos5440-pcie.txt b/Documentation/devicetree/bindings/pci/samsung,exynos5440-pcie.txt index 7d3b09474657..34a11bfbfb60 100644 --- a/Documentation/devicetree/bindings/pci/samsung,exynos5440-pcie.txt +++ b/Documentation/devicetree/bindings/pci/samsung,exynos5440-pcie.txt @@ -1,29 +1,29 @@ * Samsung Exynos 5440 PCIe interface -This PCIe host controller is based on the Synopsis Designware PCIe IP +This PCIe host controller is based on the Synopsys DesignWare PCIe IP and thus inherits all the common properties defined in designware-pcie.txt. Required properties: - compatible: "samsung,exynos5440-pcie" -- reg: base addresses and lengths of the pcie controller, - the phy controller, additional register for the phy controller. - (Registers for the phy controller are DEPRECATED. +- reg: base addresses and lengths of the PCIe controller, + the PHY controller, additional register for the PHY controller. + (Registers for the PHY controller are DEPRECATED. Use the PHY framework.) - reg-names : First name should be set to "elbi". - And use the "config" instead of getting the confgiruation address space + And use the "config" instead of getting the configuration address space from "ranges". - NOTE: When use the "config" property, reg-names must be set. + NOTE: When using the "config" property, reg-names must be set. - interrupts: A list of interrupt outputs for level interrupt, pulse interrupt, special interrupt. -- phys: From PHY binding. Phandle for the Generic PHY. +- phys: From PHY binding. Phandle for the generic PHY. Refer to Documentation/devicetree/bindings/phy/samsung-phy.txt -Other common properties refer to - Documentation/devicetree/binding/pci/designware-pcie.txt +For other common properties, refer to + Documentation/devicetree/bindings/pci/designware-pcie.txt Example: -SoC specific DT Entry: +SoC-specific DT Entry: pcie@290000 { compatible = "samsung,exynos5440-pcie", "snps,dw-pcie"; @@ -83,7 +83,7 @@ With using PHY framework: ... }; -Board specific DT Entry: +Board-specific DT Entry: pcie@290000 { reset-gpio = <&pin_ctrl 5 0>; diff --git a/Documentation/devicetree/bindings/pci/spear13xx-pcie.txt b/Documentation/devicetree/bindings/pci/spear13xx-pcie.txt index 49ea76da7718..d5a14f5dad46 100644 --- a/Documentation/devicetree/bindings/pci/spear13xx-pcie.txt +++ b/Documentation/devicetree/bindings/pci/spear13xx-pcie.txt @@ -1,12 +1,12 @@ SPEAr13XX PCIe DT detail: ================================ -SPEAr13XX uses synopsis designware PCIe controller and ST MiPHY as phy +SPEAr13XX uses the Synopsys DesignWare PCIe controller and ST MiPHY as PHY controller. 
Required properties: -- compatible : should be "st,spear1340-pcie", "snps,dw-pcie". -- phys : phandle to phy node associated with pcie controller +- compatible : should be "st,spear1340-pcie", "snps,dw-pcie". +- phys : phandle to PHY node associated with PCIe controller - phy-names : must be "pcie-phy" - All other definitions as per generic PCI bindings diff --git a/Documentation/devicetree/bindings/pci/ti-pci.txt b/Documentation/devicetree/bindings/pci/ti-pci.txt index 6a07c96227e0..7f7af3044016 100644 --- a/Documentation/devicetree/bindings/pci/ti-pci.txt +++ b/Documentation/devicetree/bindings/pci/ti-pci.txt @@ -1,6 +1,6 @@ TI PCI Controllers -PCIe Designware Controller +PCIe DesignWare Controller - compatible: Should be "ti,dra7-pcie" for RC Should be "ti,dra7-pcie-ep" for EP - phys : list of PHY specifiers (used by generic PHY framework) @@ -13,7 +13,7 @@ PCIe Designware Controller HOST MODE ========= - reg : Two register ranges as listed in the reg-names property - - reg-names : The first entry must be "ti-conf" for the TI specific registers + - reg-names : The first entry must be "ti-conf" for the TI-specific registers The second entry must be "rc-dbics" for the DesignWare PCIe registers The third entry must be "config" for the PCIe configuration space @@ -30,7 +30,7 @@ HOST MODE DEVICE MODE =========== - reg : Four register ranges as listed in the reg-names property - - reg-names : "ti-conf" for the TI specific registers + - reg-names : "ti-conf" for the TI-specific registers "ep_dbics" for the standard configuration registers as they are locally accessed within the DIF CS space "ep_dbics2" for the standard configuration registers as @@ -46,7 +46,7 @@ DEVICE MODE access. Optional Property: - - gpios : Should be added if a gpio line is required to drive PERST# line + - gpios : Should be added if a GPIO line is required to drive PERST# line NOTE: Two DT nodes may be added for each PCI controller; one for host mode and another for device mode. So in order for PCI to diff --git a/Documentation/devicetree/bindings/pci/versatile.txt b/Documentation/devicetree/bindings/pci/versatile.txt index ebd1e7d0403e..0a702b13d2ac 100644 --- a/Documentation/devicetree/bindings/pci/versatile.txt +++ b/Documentation/devicetree/bindings/pci/versatile.txt @@ -5,7 +5,7 @@ PCI host controller found on the ARM Versatile PB board's FPGA. Required properties: - compatible: should contain "arm,versatile-pci" to identify the Versatile PCI controller. -- reg: base addresses and lengths of the pci controller. There must be 3 +- reg: base addresses and lengths of the PCI controller. There must be 3 entries: - Versatile-specific registers - Self Config space diff --git a/Documentation/devicetree/bindings/pci/xgene-pci-msi.txt b/Documentation/devicetree/bindings/pci/xgene-pci-msi.txt index 36d881c8e6d4..09ac2dc3afc1 100644 --- a/Documentation/devicetree/bindings/pci/xgene-pci-msi.txt +++ b/Documentation/devicetree/bindings/pci/xgene-pci-msi.txt @@ -4,7 +4,7 @@ Required properties: - compatible: should be "apm,xgene1-msi" to identify X-Gene v1 PCIe MSI controller block. -- msi-controller: indicates that this is X-Gene v1 PCIe MSI controller node +- msi-controller: indicates that this is an X-Gene v1 PCIe MSI controller node - reg: physical base address (0x79000000) and length (0x900000) for controller registers. These registers include the MSI termination address and data registers as well as the MSI interrupt status registers. @@ -13,7 +13,8 @@ Required properties: interrupt number 0x10 to 0x1f. 
- interrupt-names: not required -Each PCIe node needs to have property msi-parent that points to msi controller node +Each PCIe node needs to have property msi-parent that points to an MSI +controller node Examples: diff --git a/Documentation/devicetree/bindings/pci/xgene-pci.txt b/Documentation/devicetree/bindings/pci/xgene-pci.txt index 1070b068c7c6..6fd2decfa66c 100644 --- a/Documentation/devicetree/bindings/pci/xgene-pci.txt +++ b/Documentation/devicetree/bindings/pci/xgene-pci.txt @@ -8,7 +8,7 @@ Required properties: property. - reg-names: Must include the following entries: "csr": controller configuration registers. - "cfg": pcie configuration space registers. + "cfg": PCIe configuration space registers. - #address-cells: set to <3> - #size-cells: set to <2> - ranges: ranges for the outbound memory, I/O regions. @@ -21,11 +21,11 @@ Required properties: Optional properties: - status: Either "ok" or "disabled". -- dma-coherent: Present if dma operations are coherent +- dma-coherent: Present if DMA operations are coherent Example: -SoC specific DT Entry: +SoC-specific DT Entry: pcie0: pcie@1f2b0000 { status = "disabled"; @@ -51,7 +51,7 @@ SoC specific DT Entry: }; -Board specific DT Entry: +Board-specific DT Entry: &pcie0 { status = "ok"; }; diff --git a/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt b/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt index 3259798a1192..01bf7fdf4c19 100644 --- a/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt +++ b/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt @@ -15,9 +15,9 @@ Required properties: - device_type: must be "pci" - interrupts: Should contain NWL PCIe interrupt - interrupt-names: Must include the following entries: - "msi1, msi0": interrupt asserted when MSI is received + "msi1, msi0": interrupt asserted when an MSI is received "intx": interrupt asserted when a legacy interrupt is received - "misc": interrupt asserted when miscellaneous is received + "misc": interrupt asserted when miscellaneous interrupt is received - interrupt-map-mask and interrupt-map: standard PCI properties to define the mapping of the PCI interface to interrupt numbers. - ranges: ranges for the PCI memory regions (I/O space region is not @@ -26,7 +26,8 @@ Required properties: detailed explanation - msi-controller: indicates that this is MSI controller node - msi-parent: MSI parent of the root complex itself -- legacy-interrupt-controller: Interrupt controller device node for Legacy interrupts +- legacy-interrupt-controller: Interrupt controller device node for Legacy + interrupts - interrupt-controller: identifies the node as an interrupt controller - #interrupt-cells: should be set to 1 - #address-cells: specifies the number of cells needed to encode an diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..4c003c9be237 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10136,7 +10136,7 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers) S: Maintained F: drivers/pci/dwc/pci-exynos.c -PCI DRIVER FOR SYNOPSIS DESIGNWARE +PCI DRIVER FOR SYNOPSYS DESIGNWARE M: Jingoo Han M: Joao Pinto L: linux-pci@vger.kernel.org diff --git a/drivers/pci/dwc/Kconfig b/drivers/pci/dwc/Kconfig index d275aadc47ee..22ec82fcdea2 100644 --- a/drivers/pci/dwc/Kconfig +++ b/drivers/pci/dwc/Kconfig @@ -25,7 +25,7 @@ config PCI_DRA7XX work either as EP or RC. In order to enable host-specific features PCI_DRA7XX_HOST must be selected and in order to enable device- specific features PCI_DRA7XX_EP must be selected. 
This uses - the Designware core. + the DesignWare core. if PCI_DRA7XX @@ -97,8 +97,8 @@ config PCI_KEYSTONE select PCIE_DW_HOST help Say Y here if you want to enable PCI controller support on Keystone - SoCs. The PCI controller on Keystone is based on Designware hardware - and therefore the driver re-uses the Designware core functions to + SoCs. The PCI controller on Keystone is based on DesignWare hardware + and therefore the driver re-uses the DesignWare core functions to implement the driver. config PCI_LAYERSCAPE @@ -132,7 +132,7 @@ config PCIE_QCOM select PCIE_DW_HOST help Say Y here to enable PCIe controller support on Qualcomm SoCs. The - PCIe controller uses the Designware core plus Qualcomm-specific + PCIe controller uses the DesignWare core plus Qualcomm-specific hardware wrappers. config PCIE_ARMADA_8K @@ -145,8 +145,8 @@ config PCIE_ARMADA_8K help Say Y here if you want to enable PCIe controller support on Armada-8K SoCs. The PCIe controller on Armada-8K is based on - Designware hardware and therefore the driver re-uses the - Designware core functions to implement the driver. + DesignWare hardware and therefore the driver re-uses the + DesignWare core functions to implement the driver. config PCIE_ARTPEC6 bool "Axis ARTPEC-6 PCIe controller" diff --git a/drivers/pci/dwc/pci-dra7xx.c b/drivers/pci/dwc/pci-dra7xx.c index f2fc5f47064e..a6f972e2e6f2 100644 --- a/drivers/pci/dwc/pci-dra7xx.c +++ b/drivers/pci/dwc/pci-dra7xx.c @@ -275,7 +275,6 @@ static irqreturn_t dra7xx_pcie_msi_irq_handler(int irq, void *arg) return IRQ_HANDLED; } - static irqreturn_t dra7xx_pcie_irq_handler(int irq, void *arg) { struct dra7xx_pcie *dra7xx = arg; diff --git a/drivers/pci/dwc/pci-keystone-dw.c b/drivers/pci/dwc/pci-keystone-dw.c index 8bc626e640c8..6eb21aa1a99f 100644 --- a/drivers/pci/dwc/pci-keystone-dw.c +++ b/drivers/pci/dwc/pci-keystone-dw.c @@ -1,5 +1,5 @@ /* - * Designware application register space functions for Keystone PCI controller + * DesignWare application register space functions for Keystone PCI controller * * Copyright (C) 2013-2014 Texas Instruments., Ltd. * http://www.ti.com diff --git a/drivers/pci/dwc/pcie-designware-ep.c b/drivers/pci/dwc/pcie-designware-ep.c index 398406393f37..e38747c6718c 100644 --- a/drivers/pci/dwc/pcie-designware-ep.c +++ b/drivers/pci/dwc/pcie-designware-ep.c @@ -1,5 +1,5 @@ /** - * Synopsys Designware PCIe Endpoint controller driver + * Synopsys DesignWare PCIe Endpoint controller driver * * Copyright (C) 2017 Texas Instruments * Author: Kishon Vijay Abraham I diff --git a/drivers/pci/dwc/pcie-designware-host.c b/drivers/pci/dwc/pcie-designware-host.c index d29c020da082..e71450b9aabe 100644 --- a/drivers/pci/dwc/pcie-designware-host.c +++ b/drivers/pci/dwc/pcie-designware-host.c @@ -1,5 +1,5 @@ /* - * Synopsys Designware PCIe host controller driver + * Synopsys DesignWare PCIe host controller driver * * Copyright (C) 2013 Samsung Electronics Co., Ltd. * http://www.samsung.com diff --git a/drivers/pci/dwc/pcie-designware.c b/drivers/pci/dwc/pcie-designware.c index 0e03af279259..daae8dd62d39 100644 --- a/drivers/pci/dwc/pcie-designware.c +++ b/drivers/pci/dwc/pcie-designware.c @@ -1,5 +1,5 @@ /* - * Synopsys Designware PCIe host controller driver + * Synopsys DesignWare PCIe host controller driver * * Copyright (C) 2013 Samsung Electronics Co., Ltd. 
* http://www.samsung.com diff --git a/drivers/pci/dwc/pcie-designware.h b/drivers/pci/dwc/pcie-designware.h index b4d2a89f8e58..cca0b93c97ac 100644 --- a/drivers/pci/dwc/pcie-designware.h +++ b/drivers/pci/dwc/pcie-designware.h @@ -1,5 +1,5 @@ /* - * Synopsys Designware PCIe host controller driver + * Synopsys DesignWare PCIe host controller driver * * Copyright (C) 2013 Samsung Electronics Co., Ltd. * http://www.samsung.com diff --git a/drivers/pci/host/pcie-rockchip.c b/drivers/pci/host/pcie-rockchip.c index 7bb9870f6d8c..5fbdc38f94dd 100644 --- a/drivers/pci/host/pcie-rockchip.c +++ b/drivers/pci/host/pcie-rockchip.c @@ -6,7 +6,7 @@ * Author: Shawn Lin * Wenrui Li * - * Bits taken from Synopsys Designware Host controller driver and + * Bits taken from Synopsys DesignWare Host controller driver and * ARM PCI Host generic driver. * * This program is free software: you can redistribute it and/or modify diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c index f63fa5e0278c..168db0477671 100644 --- a/drivers/pci/host/pcie-xilinx.c +++ b/drivers/pci/host/pcie-xilinx.c @@ -5,7 +5,7 @@ * * Based on the Tegra PCIe driver * - * Bits taken from Synopsys Designware Host controller driver and + * Bits taken from Synopsys DesignWare Host controller driver and * ARM PCI Host generic driver. * * This program is free software: you can redistribute it and/or modify diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index b1303b32053f..890efcc574cb 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -5,10 +5,10 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * This file implements the core part of PCI-Express AER. When an pci-express + * This file implements the core part of PCIe AER. When a PCIe * error is delivered, an error message will be collected and printed to * console, then, an error recovery procedure will be executed by following - * the pci error recovery rules. + * the PCI error recovery rules. * * Copyright (C) 2006 Intel Corp. * Tom Long Nguyen (tom.l.nguyen@intel.com) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 6967c6b4cf6b..8ae7ee75952b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2061,7 +2061,7 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, /* * The 82575 and 82598 may experience data corruption issues when transitioning - * out of L0S. To prevent this we need to disable L0S on the pci-e link + * out of L0S. To prevent this we need to disable L0S on the PCIe link. 
*/ static void quirk_disable_aspm_l0s(struct pci_dev *dev) { diff --git a/include/linux/aer.h b/include/linux/aer.h index 04602cbe85dc..43799bd17a02 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -39,7 +39,7 @@ struct aer_capability_regs { }; #if defined(CONFIG_PCIEAER) -/* pci-e port driver needs this function to enable aer */ +/* PCIe port driver needs this function to enable AER */ int pci_enable_pcie_error_reporting(struct pci_dev *dev); int pci_disable_pcie_error_reporting(struct pci_dev *dev); int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); @@ -67,7 +67,6 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, struct aer_capability_regs *aer); int cper_severity_to_aer(int cper_severity); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity, - struct aer_capability_regs *aer_regs); + int severity, struct aer_capability_regs *aer_regs); #endif //_AER_H_ diff --git a/include/linux/pcieport_if.h b/include/linux/pcieport_if.h index afcd130ab3a9..12542f5ac83d 100644 --- a/include/linux/pcieport_if.h +++ b/include/linux/pcieport_if.h @@ -38,7 +38,7 @@ static inline void set_service_data(struct pcie_device *dev, void *data) dev->priv_data = data; } -static inline void* get_service_data(struct pcie_device *dev) +static inline void *get_service_data(struct pcie_device *dev) { return dev->priv_data; } -- cgit v1.2.3 From 0865805d82d4c822647ee35ab2629c48cc40706b Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Thu, 10 Aug 2017 08:34:03 +0200 Subject: clk: at91: add audio pll clock drivers This new clock driver set allows to have a fractional divided clock that would generate a precise clock particularly suitable for audio applications. The main audio pll clock has two children clocks: one that is connected to the PMC, the other that can directly drive a pad. As these two routes have different enable bits and different dividers and divider formulas, they are handled by two different drivers. Each of them could modify the rate of the main audio pll parent. The main audio pll clock can output 620MHz to 700MHz. Signed-off-by: Nicolas Ferre Signed-off-by: Quentin Schulz Acked-by: Boris Brezillon Signed-off-by: Stephen Boyd --- arch/arm/mach-at91/Kconfig | 4 + drivers/clk/at91/Makefile | 1 + drivers/clk/at91/clk-audio-pll.c | 536 +++++++++++++++++++++++++++++++++++++++ include/linux/clk/at91_pmc.h | 25 ++ 4 files changed, 566 insertions(+) create mode 100644 drivers/clk/at91/clk-audio-pll.c (limited to 'include') diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig index d735e5fc4772..9ae14d59a9ce 100644 --- a/arch/arm/mach-at91/Kconfig +++ b/arch/arm/mach-at91/Kconfig @@ -26,6 +26,7 @@ config SOC_SAMA5D2 select HAVE_AT91_USB_CLK select HAVE_AT91_H32MX select HAVE_AT91_GENERATED_CLK + select HAVE_AT91_AUDIO_PLL select PINCTRL_AT91PIO4 help Select this if ou are using one of Atmel's SAMA5D2 family SoC. 
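As a side note on the fractional rate formula quoted in the commit message above
(clk->rate = parent->rate * ((nd + 1) + (fracr / 2^22))), here is a worked
instance. The 12 MHz parent rate is an assumption for illustration only and is
not something this patch specifies; the arithmetic mirrors what
clk_audio_pll_frac_compute_frac() below does with do_div() and
DIV_ROUND_CLOSEST_ULL(), here with exact (truncating) divisions:

	/*
	 * Worked sketch of fout = parent * ((nd + 1) + fracr / 2^22).
	 * The 12 MHz parent is assumed, not taken from the patch.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long parent = 12000000ULL;  /* assumed parent rate */
		unsigned long long rate = 661500000ULL;   /* target, inside 620-700 MHz */
		unsigned long long nd = rate / parent - 1;        /* 55 - 1 = 54 */
		unsigned long long rem = rate % parent;           /* 1500000 */
		unsigned long long fracr = (rem << 22) / parent;  /* 524288 = 2^22 / 8 */
		unsigned long long fout = parent * (nd + 1) +
					  ((parent * fracr) >> 22);

		/* prints: nd=54 fracr=524288 fout=661500000 */
		printf("nd=%llu fracr=%llu fout=%llu\n", nd, fracr, fout);
		return 0;
	}

So a 661.5 MHz request decomposes into nd = 54 and fracr = 524288, and
plugging those back into the formula reproduces the requested rate exactly.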
@@ -125,6 +126,9 @@ config HAVE_AT91_H32MX config HAVE_AT91_GENERATED_CLK bool +config HAVE_AT91_AUDIO_PLL + bool + config SOC_SAM_V4_V5 bool diff --git a/drivers/clk/at91/Makefile b/drivers/clk/at91/Makefile index 13e67bd35cff..c68947b65a4c 100644 --- a/drivers/clk/at91/Makefile +++ b/drivers/clk/at91/Makefile @@ -6,6 +6,7 @@ obj-y += pmc.o sckc.o obj-y += clk-slow.o clk-main.o clk-pll.o clk-plldiv.o clk-master.o obj-y += clk-system.o clk-peripheral.o clk-programmable.o +obj-$(CONFIG_HAVE_AT91_AUDIO_PLL) += clk-audio-pll.o obj-$(CONFIG_HAVE_AT91_UTMI) += clk-utmi.o obj-$(CONFIG_HAVE_AT91_USB_CLK) += clk-usb.o obj-$(CONFIG_HAVE_AT91_SMD) += clk-smd.o diff --git a/drivers/clk/at91/clk-audio-pll.c b/drivers/clk/at91/clk-audio-pll.c new file mode 100644 index 000000000000..da7bafcfbe70 --- /dev/null +++ b/drivers/clk/at91/clk-audio-pll.c @@ -0,0 +1,536 @@ +/* + * Copyright (C) 2016 Atmel Corporation, + * Songjun Wu , + * Nicolas Ferre + * Copyright (C) 2017 Free Electrons, + * Quentin Schulz + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * The Sama5d2 SoC has two audio PLLs (PMC and PAD) that shares the same parent + * (FRAC). FRAC can output between 620 and 700MHz and only multiply the rate of + * its own parent. PMC and PAD can then divide the FRAC rate to best match the + * asked rate. + * + * Traits of FRAC clock: + * enable - clk_enable writes nd, fracr parameters and enables PLL + * rate - rate is adjustable. + * clk->rate = parent->rate * ((nd + 1) + (fracr / 2^22)) + * parent - fixed parent. No clk_set_parent support + * + * Traits of PMC clock: + * enable - clk_enable writes qdpmc, and enables PMC output + * rate - rate is adjustable. + * clk->rate = parent->rate / (qdpmc + 1) + * parent - fixed parent. No clk_set_parent support + * + * Traits of PAD clock: + * enable - clk_enable writes divisors and enables PAD output + * rate - rate is adjustable. + * clk->rate = parent->rate / (qdaudio * div)) + * parent - fixed parent. 
No clk_set_parent support + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define AUDIO_PLL_DIV_FRAC BIT(22) +#define AUDIO_PLL_ND_MAX (AT91_PMC_AUDIO_PLL_ND_MASK >> \ + AT91_PMC_AUDIO_PLL_ND_OFFSET) + +#define AUDIO_PLL_QDPAD(qd, div) ((AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV(qd) & \ + AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_MASK) | \ + (AT91_PMC_AUDIO_PLL_QDPAD_DIV(div) & \ + AT91_PMC_AUDIO_PLL_QDPAD_DIV_MASK)) + +#define AUDIO_PLL_QDPMC_MAX (AT91_PMC_AUDIO_PLL_QDPMC_MASK >> \ + AT91_PMC_AUDIO_PLL_QDPMC_OFFSET) + +#define AUDIO_PLL_FOUT_MIN 620000000UL +#define AUDIO_PLL_FOUT_MAX 700000000UL + +struct clk_audio_frac { + struct clk_hw hw; + struct regmap *regmap; + u32 fracr; + u8 nd; +}; + +struct clk_audio_pad { + struct clk_hw hw; + struct regmap *regmap; + u8 qdaudio; + u8 div; +}; + +struct clk_audio_pmc { + struct clk_hw hw; + struct regmap *regmap; + u8 qdpmc; +}; + +#define to_clk_audio_frac(hw) container_of(hw, struct clk_audio_frac, hw) +#define to_clk_audio_pad(hw) container_of(hw, struct clk_audio_pad, hw) +#define to_clk_audio_pmc(hw) container_of(hw, struct clk_audio_pmc, hw) + +static int clk_audio_pll_frac_enable(struct clk_hw *hw) +{ + struct clk_audio_frac *frac = to_clk_audio_frac(hw); + + regmap_update_bits(frac->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_RESETN, 0); + regmap_update_bits(frac->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_RESETN, + AT91_PMC_AUDIO_PLL_RESETN); + regmap_update_bits(frac->regmap, AT91_PMC_AUDIO_PLL1, + AT91_PMC_AUDIO_PLL_FRACR_MASK, frac->fracr); + + /* + * reset and enable have to be done in 2 separated writes + * for AT91_PMC_AUDIO_PLL0 + */ + regmap_update_bits(frac->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_PLLEN | + AT91_PMC_AUDIO_PLL_ND_MASK, + AT91_PMC_AUDIO_PLL_PLLEN | + AT91_PMC_AUDIO_PLL_ND(frac->nd)); + + return 0; +} + +static int clk_audio_pll_pad_enable(struct clk_hw *hw) +{ + struct clk_audio_pad *apad_ck = to_clk_audio_pad(hw); + + regmap_update_bits(apad_ck->regmap, AT91_PMC_AUDIO_PLL1, + AT91_PMC_AUDIO_PLL_QDPAD_MASK, + AUDIO_PLL_QDPAD(apad_ck->qdaudio, apad_ck->div)); + regmap_update_bits(apad_ck->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_PADEN, AT91_PMC_AUDIO_PLL_PADEN); + + return 0; +} + +static int clk_audio_pll_pmc_enable(struct clk_hw *hw) +{ + struct clk_audio_pmc *apmc_ck = to_clk_audio_pmc(hw); + + regmap_update_bits(apmc_ck->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_PMCEN | + AT91_PMC_AUDIO_PLL_QDPMC_MASK, + AT91_PMC_AUDIO_PLL_PMCEN | + AT91_PMC_AUDIO_PLL_QDPMC(apmc_ck->qdpmc)); + return 0; +} + +static void clk_audio_pll_frac_disable(struct clk_hw *hw) +{ + struct clk_audio_frac *frac = to_clk_audio_frac(hw); + + regmap_update_bits(frac->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_PLLEN, 0); + /* do it in 2 separated writes */ + regmap_update_bits(frac->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_RESETN, 0); +} + +static void clk_audio_pll_pad_disable(struct clk_hw *hw) +{ + struct clk_audio_pad *apad_ck = to_clk_audio_pad(hw); + + regmap_update_bits(apad_ck->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_PADEN, 0); +} + +static void clk_audio_pll_pmc_disable(struct clk_hw *hw) +{ + struct clk_audio_pmc *apmc_ck = to_clk_audio_pmc(hw); + + regmap_update_bits(apmc_ck->regmap, AT91_PMC_AUDIO_PLL0, + AT91_PMC_AUDIO_PLL_PMCEN, 0); +} + +static unsigned long clk_audio_pll_fout(unsigned long parent_rate, + unsigned long nd, unsigned long fracr) +{ + unsigned long long fr = (unsigned long long)parent_rate * fracr; + + pr_debug("A PLL: %s, fr = 
%llu\n", __func__, fr); + + fr = DIV_ROUND_CLOSEST_ULL(fr, AUDIO_PLL_DIV_FRAC); + + pr_debug("A PLL: %s, fr = %llu\n", __func__, fr); + + return parent_rate * (nd + 1) + fr; +} + +static unsigned long clk_audio_pll_frac_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct clk_audio_frac *frac = to_clk_audio_frac(hw); + unsigned long fout; + + fout = clk_audio_pll_fout(parent_rate, frac->nd, frac->fracr); + + pr_debug("A PLL: %s, fout = %lu (nd = %u, fracr = %lu)\n", __func__, + fout, frac->nd, (unsigned long)frac->fracr); + + return fout; +} + +static unsigned long clk_audio_pll_pad_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct clk_audio_pad *apad_ck = to_clk_audio_pad(hw); + unsigned long apad_rate = 0; + + if (apad_ck->qdaudio && apad_ck->div) + apad_rate = parent_rate / (apad_ck->qdaudio * apad_ck->div); + + pr_debug("A PLL/PAD: %s, apad_rate = %lu (div = %u, qdaudio = %u)\n", + __func__, apad_rate, apad_ck->div, apad_ck->qdaudio); + + return apad_rate; +} + +static unsigned long clk_audio_pll_pmc_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct clk_audio_pmc *apmc_ck = to_clk_audio_pmc(hw); + unsigned long apmc_rate = 0; + + apmc_rate = parent_rate / (apmc_ck->qdpmc + 1); + + pr_debug("A PLL/PMC: %s, apmc_rate = %lu (qdpmc = %u)\n", __func__, + apmc_rate, apmc_ck->qdpmc); + + return apmc_rate; +} + +static int clk_audio_pll_frac_compute_frac(unsigned long rate, + unsigned long parent_rate, + unsigned long *nd, + unsigned long *fracr) +{ + unsigned long long tmp, rem; + + if (!rate) + return -EINVAL; + + tmp = rate; + rem = do_div(tmp, parent_rate); + if (!tmp || tmp >= AUDIO_PLL_ND_MAX) + return -EINVAL; + + *nd = tmp - 1; + + tmp = rem * AUDIO_PLL_DIV_FRAC; + tmp = DIV_ROUND_CLOSEST_ULL(tmp, parent_rate); + if (tmp > AT91_PMC_AUDIO_PLL_FRACR_MASK) + return -EINVAL; + + /* we can cast here as we verified the bounds just above */ + *fracr = (unsigned long)tmp; + + return 0; +} + +static int clk_audio_pll_frac_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + unsigned long fracr, nd; + int ret; + + pr_debug("A PLL: %s, rate = %lu (parent_rate = %lu)\n", __func__, + req->rate, req->best_parent_rate); + + req->rate = clamp(req->rate, AUDIO_PLL_FOUT_MIN, AUDIO_PLL_FOUT_MAX); + + req->min_rate = max(req->min_rate, AUDIO_PLL_FOUT_MIN); + req->max_rate = min(req->max_rate, AUDIO_PLL_FOUT_MAX); + + ret = clk_audio_pll_frac_compute_frac(req->rate, req->best_parent_rate, + &nd, &fracr); + if (ret) + return ret; + + req->rate = clk_audio_pll_fout(req->best_parent_rate, nd, fracr); + + req->best_parent_hw = clk_hw_get_parent(hw); + + pr_debug("A PLL: %s, best_rate = %lu (nd = %lu, fracr = %lu)\n", + __func__, req->rate, nd, fracr); + + return 0; +} + +static long clk_audio_pll_pad_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate) +{ + struct clk_hw *pclk = clk_hw_get_parent(hw); + long best_rate = -EINVAL; + unsigned long best_parent_rate; + unsigned long tmp_qd; + u32 div; + long tmp_rate; + int tmp_diff; + int best_diff = -1; + + pr_debug("A PLL/PAD: %s, rate = %lu (parent_rate = %lu)\n", __func__, + rate, *parent_rate); + + /* + * Rate divisor is actually made of two different divisors, multiplied + * between themselves before dividing the rate. + * tmp_qd goes from 1 to 31 and div is either 2 or 3. + * In order to avoid testing twice the rate divisor (e.g. 
divisor 12 can + * be found with (tmp_qd, div) = (2, 6) or (3, 4)), we remove any loop + * for a rate divisor when div is 2 and tmp_qd is a multiple of 3. + * We cannot inverse it (condition div is 3 and tmp_qd is even) or we + * would miss some rate divisor that aren't reachable with div being 2 + * (e.g. rate divisor 90 is made with div = 3 and tmp_qd = 30, thus + * tmp_qd is even so we skip it because we think div 2 could make this + * rate divisor which isn't possible since tmp_qd has to be <= 31). + */ + for (tmp_qd = 1; tmp_qd < AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_MAX; tmp_qd++) + for (div = 2; div <= 3; div++) { + if (div == 2 && tmp_qd % 3 == 0) + continue; + + best_parent_rate = clk_hw_round_rate(pclk, + rate * tmp_qd * div); + tmp_rate = best_parent_rate / (div * tmp_qd); + tmp_diff = abs(rate - tmp_rate); + + if (best_diff < 0 || best_diff > tmp_diff) { + *parent_rate = best_parent_rate; + best_rate = tmp_rate; + best_diff = tmp_diff; + } + } + + pr_debug("A PLL/PAD: %s, best_rate = %ld, best_parent_rate = %lu\n", + __func__, best_rate, best_parent_rate); + + return best_rate; +} + +static long clk_audio_pll_pmc_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate) +{ + struct clk_hw *pclk = clk_hw_get_parent(hw); + long best_rate = -EINVAL; + unsigned long best_parent_rate = 0; + u32 tmp_qd = 0, div; + long tmp_rate; + int tmp_diff; + int best_diff = -1; + + pr_debug("A PLL/PMC: %s, rate = %lu (parent_rate = %lu)\n", __func__, + rate, *parent_rate); + + for (div = 1; div <= AUDIO_PLL_QDPMC_MAX; div++) { + best_parent_rate = clk_round_rate(pclk->clk, rate * div); + tmp_rate = best_parent_rate / div; + tmp_diff = abs(rate - tmp_rate); + + if (best_diff < 0 || best_diff > tmp_diff) { + *parent_rate = best_parent_rate; + best_rate = tmp_rate; + best_diff = tmp_diff; + tmp_qd = div; + } + } + + pr_debug("A PLL/PMC: %s, best_rate = %ld, best_parent_rate = %lu (qd = %d)\n", + __func__, best_rate, *parent_rate, tmp_qd - 1); + + return best_rate; +} + +static int clk_audio_pll_frac_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct clk_audio_frac *frac = to_clk_audio_frac(hw); + unsigned long fracr, nd; + int ret; + + pr_debug("A PLL: %s, rate = %lu (parent_rate = %lu)\n", __func__, rate, + parent_rate); + + if (rate < AUDIO_PLL_FOUT_MIN || rate > AUDIO_PLL_FOUT_MAX) + return -EINVAL; + + ret = clk_audio_pll_frac_compute_frac(rate, parent_rate, &nd, &fracr); + if (ret) + return ret; + + frac->nd = nd; + frac->fracr = fracr; + + return 0; +} + +static int clk_audio_pll_pad_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct clk_audio_pad *apad_ck = to_clk_audio_pad(hw); + u8 tmp_div; + + pr_debug("A PLL/PAD: %s, rate = %lu (parent_rate = %lu)\n", __func__, + rate, parent_rate); + + if (!rate) + return -EINVAL; + + tmp_div = parent_rate / rate; + if (tmp_div % 3 == 0) { + apad_ck->qdaudio = tmp_div / 3; + apad_ck->div = 3; + } else { + apad_ck->qdaudio = tmp_div / 2; + apad_ck->div = 2; + } + + return 0; +} + +static int clk_audio_pll_pmc_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct clk_audio_pmc *apmc_ck = to_clk_audio_pmc(hw); + + if (!rate) + return -EINVAL; + + pr_debug("A PLL/PMC: %s, rate = %lu (parent_rate = %lu)\n", __func__, + rate, parent_rate); + + apmc_ck->qdpmc = parent_rate / rate - 1; + + return 0; +} + +static const struct clk_ops audio_pll_frac_ops = { + .enable = clk_audio_pll_frac_enable, + .disable = clk_audio_pll_frac_disable, + 
.recalc_rate = clk_audio_pll_frac_recalc_rate, + .determine_rate = clk_audio_pll_frac_determine_rate, + .set_rate = clk_audio_pll_frac_set_rate, +}; + +static const struct clk_ops audio_pll_pad_ops = { + .enable = clk_audio_pll_pad_enable, + .disable = clk_audio_pll_pad_disable, + .recalc_rate = clk_audio_pll_pad_recalc_rate, + .round_rate = clk_audio_pll_pad_round_rate, + .set_rate = clk_audio_pll_pad_set_rate, +}; + +static const struct clk_ops audio_pll_pmc_ops = { + .enable = clk_audio_pll_pmc_enable, + .disable = clk_audio_pll_pmc_disable, + .recalc_rate = clk_audio_pll_pmc_recalc_rate, + .round_rate = clk_audio_pll_pmc_round_rate, + .set_rate = clk_audio_pll_pmc_set_rate, +}; + +static int of_sama5d2_clk_audio_pll_setup(struct device_node *np, + struct clk_init_data *init, + struct clk_hw *hw, + struct regmap **clk_audio_regmap) +{ + struct regmap *regmap; + const char *parent_names[1]; + int ret; + + regmap = syscon_node_to_regmap(of_get_parent(np)); + if (IS_ERR(regmap)) + return PTR_ERR(regmap); + + init->name = np->name; + of_clk_parent_fill(np, parent_names, 1); + init->parent_names = parent_names; + init->num_parents = 1; + + hw->init = init; + *clk_audio_regmap = regmap; + + ret = clk_hw_register(NULL, hw); + if (ret) + return ret; + + return of_clk_add_hw_provider(np, of_clk_hw_simple_get, hw); +} + +static void __init of_sama5d2_clk_audio_pll_frac_setup(struct device_node *np) +{ + struct clk_audio_frac *frac_ck; + struct clk_init_data init = {}; + + frac_ck = kzalloc(sizeof(*frac_ck), GFP_KERNEL); + if (!frac_ck) + return; + + init.ops = &audio_pll_frac_ops; + init.flags = CLK_SET_RATE_GATE; + + if (of_sama5d2_clk_audio_pll_setup(np, &init, &frac_ck->hw, + &frac_ck->regmap)) + kfree(frac_ck); +} + +static void __init of_sama5d2_clk_audio_pll_pad_setup(struct device_node *np) +{ + struct clk_audio_pad *apad_ck; + struct clk_init_data init = {}; + + apad_ck = kzalloc(sizeof(*apad_ck), GFP_KERNEL); + if (!apad_ck) + return; + + init.ops = &audio_pll_pad_ops; + init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE | + CLK_SET_RATE_PARENT; + + if (of_sama5d2_clk_audio_pll_setup(np, &init, &apad_ck->hw, + &apad_ck->regmap)) + kfree(apad_ck); +} + +static void __init of_sama5d2_clk_audio_pll_pmc_setup(struct device_node *np) +{ + struct clk_audio_pad *apmc_ck; + struct clk_init_data init = {}; + + apmc_ck = kzalloc(sizeof(*apmc_ck), GFP_KERNEL); + if (!apmc_ck) + return; + + init.ops = &audio_pll_pmc_ops; + init.flags = CLK_SET_RATE_GATE | CLK_SET_PARENT_GATE | + CLK_SET_RATE_PARENT; + + if (of_sama5d2_clk_audio_pll_setup(np, &init, &apmc_ck->hw, + &apmc_ck->regmap)) + kfree(apmc_ck); +} + +CLK_OF_DECLARE(of_sama5d2_clk_audio_pll_frac_setup, + "atmel,sama5d2-clk-audio-pll-frac", + of_sama5d2_clk_audio_pll_frac_setup); +CLK_OF_DECLARE(of_sama5d2_clk_audio_pll_pad_setup, + "atmel,sama5d2-clk-audio-pll-pad", + of_sama5d2_clk_audio_pll_pad_setup); +CLK_OF_DECLARE(of_sama5d2_clk_audio_pll_pmc_setup, + "atmel,sama5d2-clk-audio-pll-pmc", + of_sama5d2_clk_audio_pll_pmc_setup); diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h index 17f413bbbedf..6aca5ce8a99a 100644 --- a/include/linux/clk/at91_pmc.h +++ b/include/linux/clk/at91_pmc.h @@ -185,4 +185,29 @@ #define AT91_PMC_PCR_EN (0x1 << 28) /* Enable */ #define AT91_PMC_PCR_GCKEN (0x1 << 29) /* GCK Enable */ +#define AT91_PMC_AUDIO_PLL0 0x14c +#define AT91_PMC_AUDIO_PLL_PLLEN (1 << 0) +#define AT91_PMC_AUDIO_PLL_PADEN (1 << 1) +#define AT91_PMC_AUDIO_PLL_PMCEN (1 << 2) +#define AT91_PMC_AUDIO_PLL_RESETN (1 << 3) 
+#define AT91_PMC_AUDIO_PLL_ND_OFFSET 8 +#define AT91_PMC_AUDIO_PLL_ND_MASK (0x7f << AT91_PMC_AUDIO_PLL_ND_OFFSET) +#define AT91_PMC_AUDIO_PLL_ND(n) ((n) << AT91_PMC_AUDIO_PLL_ND_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPMC_OFFSET 16 +#define AT91_PMC_AUDIO_PLL_QDPMC_MASK (0x7f << AT91_PMC_AUDIO_PLL_QDPMC_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPMC(n) ((n) << AT91_PMC_AUDIO_PLL_QDPMC_OFFSET) + +#define AT91_PMC_AUDIO_PLL1 0x150 +#define AT91_PMC_AUDIO_PLL_FRACR_MASK 0x3fffff +#define AT91_PMC_AUDIO_PLL_QDPAD_OFFSET 24 +#define AT91_PMC_AUDIO_PLL_QDPAD_MASK (0x7f << AT91_PMC_AUDIO_PLL_QDPAD_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPAD(n) ((n) << AT91_PMC_AUDIO_PLL_QDPAD_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPAD_DIV_OFFSET AT91_PMC_AUDIO_PLL_QDPAD_OFFSET +#define AT91_PMC_AUDIO_PLL_QDPAD_DIV_MASK (0x3 << AT91_PMC_AUDIO_PLL_QDPAD_DIV_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPAD_DIV(n) ((n) << AT91_PMC_AUDIO_PLL_QDPAD_DIV_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_OFFSET 26 +#define AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_MAX 0x1f +#define AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_MASK (AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_MAX << AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_OFFSET) +#define AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV(n) ((n) << AT91_PMC_AUDIO_PLL_QDPAD_EXTDIV_OFFSET) + #endif -- cgit v1.2.3 From 7c89717f82bd305e3102963485f3da160d11bcf6 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Sun, 27 Aug 2017 22:34:53 -0700 Subject: remoteproc: Introduce rproc handle accessor for children In certain circumstances rpmsg devices needs to acquire a handle to the ancestor remoteproc instance, e.g. to invoke rproc_report_crash() when a fatal error is detected. Introduce an interface that walks the device tree in search for a remoteproc instance and return this. Tested-by: Suman Anna Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 18 ++++++++++++++++++ include/linux/remoteproc.h | 2 ++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 8072df1f0f94..eab14b414bf0 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1431,6 +1431,7 @@ struct rproc *rproc_alloc(struct device *dev, const char *name, rproc->dev.parent = dev; rproc->dev.type = &rproc_type; rproc->dev.class = &rproc_class; + rproc->dev.driver_data = rproc; /* Assign a unique device index and name */ rproc->index = ida_simple_get(&rproc_dev_index, 0, 0, GFP_KERNEL); @@ -1569,6 +1570,23 @@ void rproc_remove_subdev(struct rproc *rproc, struct rproc_subdev *subdev) } EXPORT_SYMBOL(rproc_remove_subdev); +/** + * rproc_get_by_child() - acquire rproc handle of @dev's ancestor + * @dev: child device to find ancestor of + * + * Returns the ancestor rproc instance, or NULL if not found. 
+ */ +struct rproc *rproc_get_by_child(struct device *dev) +{ + for (dev = dev->parent; dev; dev = dev->parent) { + if (dev->type == &rproc_type) + return dev->driver_data; + } + + return NULL; +} +EXPORT_SYMBOL(rproc_get_by_child); + /** * rproc_report_crash() - rproc crash reporter function * @rproc: remote processor diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 81da49564ff4..44e630eb3d94 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -510,6 +510,8 @@ struct rproc_vdev { }; struct rproc *rproc_get_by_phandle(phandle phandle); +struct rproc *rproc_get_by_child(struct device *dev); + struct rproc *rproc_alloc(struct device *dev, const char *name, const struct rproc_ops *ops, const char *firmware, int len); -- cgit v1.2.3 From b37e88407c1d78f157778d73427cd7e9e1d6369d Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 31 Aug 2017 09:59:38 -0700 Subject: inet_diag: allow protocols to provide additional data Extend inet_diag_handler to allow individual protocols to report additional data on INET_DIAG_INFO through idiag_get_aux. The size can be dynamic and is computed by idiag_get_aux_size. Signed-off-by: Ivan Delalande Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 7 +++++++ net/ipv4/inet_diag.c | 22 ++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 65da430e260f..ee251c585854 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -25,6 +25,13 @@ struct inet_diag_handler { struct inet_diag_msg *r, void *info); + int (*idiag_get_aux)(struct sock *sk, + bool net_admin, + struct sk_buff *skb); + + size_t (*idiag_get_aux_size)(struct sock *sk, + bool net_admin); + int (*destroy)(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 67325d5832d7..c9c35b61a027 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -93,8 +93,17 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(void) +static size_t inet_sk_attr_size(struct sock *sk, + const struct inet_diag_req_v2 *req, + bool net_admin) { + const struct inet_diag_handler *handler; + size_t aux = 0; + + handler = inet_diag_table[req->sdiag_protocol]; + if (handler && handler->idiag_get_aux_size) + aux = handler->idiag_get_aux_size(sk, net_admin); + return nla_total_size(sizeof(struct tcp_info)) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ @@ -105,6 +114,7 @@ static size_t inet_sk_attr_size(void) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + nla_total_size(TCP_CA_NAME_MAX) + nla_total_size(sizeof(struct tcpvegas_info)) + + aux + 64; } @@ -260,6 +270,10 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, handler->idiag_get_info(sk, r, info); + if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux) + if (handler->idiag_get_aux(sk, net_admin, skb) < 0) + goto errout; + if (sk->sk_state < TCP_TIME_WAIT) { union tcp_cc_info info; size_t sz = 0; @@ -449,6 +463,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, const struct nlmsghdr *nlh, const struct inet_diag_req_v2 *req) { + bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; struct sock *sk; @@ -458,7 +473,7 @@ int 
inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, if (IS_ERR(sk)) return PTR_ERR(sk); - rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; @@ -467,8 +482,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, err = sk_diag_fill(sk, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, - netlink_net_capable(in_skb, CAP_NET_ADMIN)); + nlh->nlmsg_seq, 0, nlh, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); -- cgit v1.2.3 From c03fa9bcacd9ac04595cc13f34f3445f0a5ecf13 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 31 Aug 2017 09:59:39 -0700 Subject: tcp_diag: report TCP MD5 signing keys and addresses Report TCP MD5 (RFC2385) signing keys, addresses and address prefixes to processes with CAP_NET_ADMIN requesting INET_DIAG_INFO. Currently it is not possible to retrieve these from the kernel once they have been configured on sockets. Signed-off-by: Ivan Delalande Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/inet_diag.h | 1 + include/uapi/linux/tcp.h | 9 ++++ net/ipv4/tcp_diag.c | 109 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 113 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 678496897a68..f52ff62bfabe 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -143,6 +143,7 @@ enum { INET_DIAG_MARK, INET_DIAG_BBRINFO, INET_DIAG_CLASS_ID, + INET_DIAG_MD5SIG, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 030e594bab45..15c25eccab2b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -256,4 +256,13 @@ struct tcp_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ }; +/* INET_DIAG_MD5SIG */ +struct tcp_diag_md5sig { + __u8 tcpm_family; + __u8 tcpm_prefixlen; + __u16 tcpm_keylen; + __be32 tcpm_addr[4]; + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; +}; + #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a748c74aa8b7..abbf0edcf6c2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -16,6 +16,7 @@ #include +#include #include static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -36,6 +37,100 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, tcp_get_info(sk, info); } +#ifdef CONFIG_TCP_MD5SIG +static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info, + const struct tcp_md5sig_key *key) +{ + info->tcpm_family = key->family; + info->tcpm_prefixlen = key->prefixlen; + info->tcpm_keylen = key->keylen; + memcpy(info->tcpm_key, key->key, key->keylen); + + if (key->family == AF_INET) + info->tcpm_addr[0] = key->addr.a4.s_addr; + #if IS_ENABLED(CONFIG_IPV6) + else if (key->family == AF_INET6) + memcpy(&info->tcpm_addr, &key->addr.a6, + sizeof(info->tcpm_addr)); + #endif +} + +static int tcp_diag_put_md5sig(struct sk_buff *skb, + const struct tcp_md5sig_info *md5sig) +{ + const struct tcp_md5sig_key *key; + struct tcp_diag_md5sig *info; + struct nlattr *attr; + int md5sig_count = 0; + + hlist_for_each_entry_rcu(key, &md5sig->head, node) + md5sig_count++; + if (md5sig_count == 0) + return 0; + + attr = nla_reserve(skb, INET_DIAG_MD5SIG, + md5sig_count * sizeof(struct tcp_diag_md5sig)); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + memset(info, 0, md5sig_count * sizeof(struct 
tcp_diag_md5sig)); + hlist_for_each_entry_rcu(key, &md5sig->head, node) { + tcp_diag_md5sig_fill(info++, key); + if (--md5sig_count == 0) + break; + } + + return 0; +} +#endif + +static int tcp_diag_get_aux(struct sock *sk, bool net_admin, + struct sk_buff *skb) +{ +#ifdef CONFIG_TCP_MD5SIG + if (net_admin) { + struct tcp_md5sig_info *md5sig; + int err = 0; + + rcu_read_lock(); + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); + if (md5sig) + err = tcp_diag_put_md5sig(skb, md5sig); + rcu_read_unlock(); + if (err < 0) + return err; + } +#endif + + return 0; +} + +static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) +{ + size_t size = 0; + +#ifdef CONFIG_TCP_MD5SIG + if (net_admin && sk_fullsock(sk)) { + const struct tcp_md5sig_info *md5sig; + const struct tcp_md5sig_key *key; + size_t md5sig_count = 0; + + rcu_read_lock(); + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); + if (md5sig) { + hlist_for_each_entry_rcu(key, &md5sig->head, node) + md5sig_count++; + } + rcu_read_unlock(); + size += nla_total_size(md5sig_count * + sizeof(struct tcp_diag_md5sig)); + } +#endif + + return size; +} + static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { @@ -68,13 +163,15 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler tcp_diag_handler = { - .dump = tcp_diag_dump, - .dump_one = tcp_diag_dump_one, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = IPPROTO_TCP, - .idiag_info_size = sizeof(struct tcp_info), + .dump = tcp_diag_dump, + .dump_one = tcp_diag_dump_one, + .idiag_get_info = tcp_diag_get_info, + .idiag_get_aux = tcp_diag_get_aux, + .idiag_get_aux_size = tcp_diag_get_aux_size, + .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY - .destroy = tcp_diag_destroy, + .destroy = tcp_diag_destroy, #endif }; -- cgit v1.2.3 From c1d1b437816f0afa99202be3cb650c9d174667bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 31 Aug 2017 16:48:22 -0700 Subject: net: convert (struct ubuf_info)->refcnt to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. v2: added the change in drivers/vhost/net.c as spotted by Willem. Signed-off-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- drivers/vhost/net.c | 2 +- include/linux/skbuff.h | 5 +++-- net/core/skbuff.c | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index ba08b78ed630..8d2bcae53a2e 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -533,7 +533,7 @@ static void handle_tx(struct vhost_net *net) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; - atomic_set(&ubuf->refcnt, 1); + refcount_set(&ubuf->refcnt, 1); msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f93cc01064cb..f751f3b93039 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -456,7 +457,7 @@ struct ubuf_info { u32 bytelen; }; }; - atomic_t refcnt; + refcount_t refcnt; struct mmpin { struct user_struct *user; @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, static inline void sock_zerocopy_get(struct ubuf_info *uarg) { - atomic_inc(&uarg->refcnt); + refcount_inc(&uarg->refcnt); } void sock_zerocopy_put(struct ubuf_info *uarg); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a003f953a0a6..68065d7d383f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - atomic_set(&uarg->refcnt, 1); + refcount_set(&uarg->refcnt, 1); sock_hold(sk); return uarg; @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); void sock_zerocopy_put(struct ubuf_info *uarg) { - if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + if (uarg && refcount_dec_and_test(&uarg->refcnt)) { if (uarg->callback) uarg->callback(uarg, uarg->zerocopy); else @@ -1483,7 +1483,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; if (skb_zcopy(skb)) - atomic_inc(&skb_uarg(skb)->refcnt); + refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); -- cgit v1.2.3 From 864150dfa31dceab6ec5ca4579a2d35ede985cb7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 12:15:17 +0300 Subject: net: Add module reference to FIB notifiers When a listener registers to the FIB notification chain it receives a dump of the FIB entries and rules from existing address families by invoking their dump operations. While we call into these modules we need to make sure they aren't removed. Do that by increasing their reference count before invoking their dump operations and decrease it afterwards. Fixes: 04b1d4e50e82 ("net: core: Make the FIB notification chain generic") Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/fib_notifier.h | 2 ++ net/core/fib_notifier.c | 13 +++++++++++-- net/ipv4/fib_notifier.c | 2 ++ net/ipv6/fib6_notifier.c | 2 ++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 241475224f74..669b9716dc7a 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -2,6 +2,7 @@ #define __NET_FIB_NOTIFIER_H #include +#include #include #include @@ -26,6 +27,7 @@ struct fib_notifier_ops { struct list_head list; unsigned int (*fib_seq_read)(struct net *net); int (*fib_dump)(struct net *net, struct notifier_block *nb); + struct module *owner; struct rcu_head rcu; }; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 292aab83702f..4fc202dbdfb6 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,12 @@ static unsigned int fib_seq_sum(void) rtnl_lock(); for_each_net(net) { - list_for_each_entry(ops, &net->fib_notifier_ops, list) + list_for_each_entry(ops, &net->fib_notifier_ops, list) { + if (!try_module_get(ops->owner)) + continue; fib_seq += ops->fib_seq_read(net); + module_put(ops->owner); + } } rtnl_unlock(); @@ -46,8 +51,12 @@ static int fib_net_dump(struct net *net, struct notifier_block *nb) struct fib_notifier_ops *ops; list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { - int err = ops->fib_dump(net, nb); + int err; + if (!try_module_get(ops->owner)) + continue; + err = ops->fib_dump(net, nb); + module_put(ops->owner); if (err) return err; } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 5d7afb145562..cfd420b0572c 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ static const struct fib_notifier_ops fib4_notifier_ops_template = { .family = AF_INET, .fib_seq_read = fib4_seq_read, .fib_dump = fib4_dump, + .owner = THIS_MODULE, }; int __net_init fib4_notifier_init(struct net *net) diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index 66a103ef7e86..05f82baaa99e 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ static const struct fib_notifier_ops fib6_notifier_ops_template = { .family = AF_INET6, .fib_seq_read = fib6_seq_read, .fib_dump = fib6_dump, + .owner = THIS_MODULE, }; int __net_init fib6_notifier_init(struct net *net) -- cgit v1.2.3 From 604acb193b8383a3774673c324e576e4c6b4ffcd Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 5 Jun 2017 11:17:20 +0300 Subject: net/mlx5e: Refactor data-path lro header function Refactor function mlx5e_lro_update_hdr() to reduce the number of branches.
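The shape of the refactor, sketched (an illustrative stand-in with toy types, not the mlx5e code itself): keep one untyped pointer to the L3 header, branch a single time on the protocol, do all version-specific fixups inside that branch, and let the common TCP fixups run unconditionally afterwards.

/* Toy stand-ins; the real code uses struct iphdr/ipv6hdr and mlx5 CQE data. */
struct v4hdr { unsigned char ttl; };
struct v6hdr { unsigned char hop_limit; };

static void lro_update_sketch(void *ip_p, int is_ipv4, unsigned char min_ttl)
{
	if (is_ipv4) {
		struct v4hdr *ip4 = ip_p;	/* typed view lives only in this branch */

		ip4->ttl = min_ttl;		/* IPv4-only fixups */
	} else {
		struct v6hdr *ip6 = ip_p;

		ip6->hop_limit = min_ttl;	/* IPv6-only fixups */
	}
	/* common TCP fixups (PSH, ACK, window) follow here, branch-free */
}

The old layout kept both an ipv4 and an ipv6 pointer alive, NULLed one of them, and re-tested the pointers further down; that repeated testing is the extra branching the patch removes.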
Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 45 +++++++++++-------------- include/linux/mlx5/device.h | 2 +- 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index ab1213a3615e..9d9c13ae6b83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -496,56 +496,51 @@ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, u32 cqe_bcnt) { struct ethhdr *eth = (struct ethhdr *)(skb->data); - struct iphdr *ipv4; - struct ipv6hdr *ipv6; struct tcphdr *tcp; int network_depth = 0; __be16 proto; u16 tot_len; + void *ip_p; u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); - int tcp_ack = ((l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) || - (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA)); + u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) || + (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA); skb->mac_len = ETH_HLEN; proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth); - ipv4 = (struct iphdr *)(skb->data + network_depth); - ipv6 = (struct ipv6hdr *)(skb->data + network_depth); tot_len = cqe_bcnt - network_depth; + ip_p = skb->data + network_depth; if (proto == htons(ETH_P_IP)) { - tcp = (struct tcphdr *)(skb->data + network_depth + - sizeof(struct iphdr)); - ipv6 = NULL; - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - } else { - tcp = (struct tcphdr *)(skb->data + network_depth + - sizeof(struct ipv6hdr)); - ipv4 = NULL; - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; - } - - if (get_cqe_lro_tcppsh(cqe)) - tcp->psh = 1; + struct iphdr *ipv4 = ip_p; - if (tcp_ack) { - tcp->ack = 1; - tcp->ack_seq = cqe->lro_ack_seq_num; - tcp->window = cqe->lro_tcp_win; - } + tcp = ip_p + sizeof(struct iphdr); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - if (ipv4) { ipv4->ttl = cqe->lro_min_ttl; ipv4->tot_len = cpu_to_be16(tot_len); ipv4->check = 0; ipv4->check = ip_fast_csum((unsigned char *)ipv4, ipv4->ihl); } else { + struct ipv6hdr *ipv6 = ip_p; + + tcp = ip_p + sizeof(struct ipv6hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + ipv6->hop_limit = cqe->lro_min_ttl; ipv6->payload_len = cpu_to_be16(tot_len - sizeof(struct ipv6hdr)); } + + tcp->psh = get_cqe_lro_tcppsh(cqe); + + if (tcp_ack) { + tcp->ack = 1; + tcp->ack_seq = cqe->lro_ack_seq_num; + tcp->window = cqe->lro_tcp_win; + } } static inline void mlx5e_skb_set_hash(struct mlx5_cqe64 *cqe, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 3c7442b56460..7031d655ec32 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -709,7 +709,7 @@ static inline int mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) return (cqe->op_own >> 2) & 0x3; } -static inline int get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) +static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro_tcppsh_abort_dupack >> 6) & 1; } -- cgit v1.2.3 From 64327fc811268d4a24de03dac242ea29de6be75f Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 31 Aug 2017 18:11:41 +0200 Subject: ipv4: Don't override return code from ip_route_input_noref() After ip_route_input() calls ip_route_input_noref(), another check on skb_dst() is done, but if this fails, we shouldn't override the return code from ip_route_input_noref(), as it could have been more specific (i.e. -EHOSTUNREACH). 
This also saves one call to skb_dst_force_safe() and one to skb_dst() in case the ip_route_input_noref() check fails. Reported-by: Sabrina Dubroca Fixes: 9df16efadd2a ("ipv4: call dst_hold_safe() properly") Signed-off-by: Stefano Brivio Acked-by: Wei Wang Acked-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/route.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index cb0a76d9dde1..1b09a9368c68 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -189,10 +189,11 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); - if (!err) + if (!err) { skb_dst_force_safe(skb); - if (!skb_dst(skb)) - err = -EINVAL; + if (!skb_dst(skb)) + err = -EINVAL; + } rcu_read_unlock(); return err; -- cgit v1.2.3 From fb452a1aa3fd4034d7999e309c5466ff2d7005aa Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 1 Sep 2017 11:26:08 +0200 Subject: Revert "net: use lib/percpu_counter API for fragmentation mem accounting" This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2. There is a bug in the fragmentation code's use of the percpu_counter API that can cause issues on systems with many CPUs. frag_mem_limit() just reads the global counter (fbc->count) without considering that other CPUs can each hold up to the batch size (130K) that hasn't been subtracted yet. Due to the 3MBytes lower thresh limit, this becomes dangerous at >=24 CPUs (3*1024*1024/130000=24). The correct API usage would be __percpu_counter_compare(), which does the right thing: it takes into account the number of (online) CPUs and the batch size, and calls __percpu_counter_sum() when needed. We choose to revert the use of the lib/percpu_counter API for frag memory accounting for several reasons: 1) On systems with more than 24 CPUs, the heavier fully locked __percpu_counter_sum() is always invoked, which will be more expensive than the atomic_t that is reverted to. Given that systems with more than 24 CPUs are becoming common, this doesn't seem like a good option. To mitigate this, the batch size could be decreased and the thresh increased. 2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX CPU, before SKBs are pushed into sockets on remote CPUs. Given NICs can only hash on the L2 part of the IP-header, the NIC RX queues will likely be limited. Thus, there is a fair chance that the atomic add+dec happen on the same CPU. Note on the revert: commit 1d6119baf061 ("net: fix percpu memory leaks") removed init_frag_mem_limit() and instead used inet_frags_init_net(). After this revert, inet_frags_uninit_net() becomes empty. Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") Fixes: 1d6119baf061 ("net: fix percpu memory leaks") Signed-off-by: Jesper Dangaard Brouer Acked-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 30 +++++++++--------------------- net/ipv4/inet_fragment.c | 4 +--- 2 files changed, 10 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 6fdcd2427776..fa635aa6d0b9 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -1,14 +1,9 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include - struct netns_frags { - /* The percpu_counter "mem" need to be cacheline aligned.
- * mem.count must not share cacheline with other writers - */ - struct percpu_counter mem ____cacheline_aligned_in_smp; - + /* Keep atomic mem on separate cachelines in structs that include it */ + atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ int timeout; int high_thresh; @@ -110,11 +105,11 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); + atomic_set(&nf->mem, 0); + return 0; } static inline void inet_frags_uninit_net(struct netns_frags *nf) { - percpu_counter_destroy(&nf->mem); } void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); @@ -140,31 +135,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) /* Memory Tracking Functions. */ -/* The default percpu_counter batch size is not big enough to scale to - * fragmentation mem acct sizes. - * The mem size of a 64K fragment is approx: - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes - */ -static unsigned int frag_percpu_counter_batch = 130000; - static inline int frag_mem_limit(struct netns_frags *nf) { - return percpu_counter_read(&nf->mem); + return atomic_read(&nf->mem); } static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) { - percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch); + atomic_sub(i, &nf->mem); } static inline void add_frag_mem_limit(struct netns_frags *nf, int i) { - percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch); + atomic_add(i, &nf->mem); } -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) +static inline int sum_frag_mem_limit(struct netns_frags *nf) { - return percpu_counter_sum_positive(&nf->mem); + return atomic_read(&nf->mem); } /* RFC 3168 support : diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 96e95e83cc61..af74d0433453 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -234,10 +234,8 @@ evict_again: cond_resched(); if (read_seqretry(&f->rnd_seqlock, seq) || - percpu_counter_sum(&nf->mem)) + sum_frag_mem_limit(nf)) goto evict_again; - - percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); -- cgit v1.2.3 From 5a63643e583b6a9789d7a225ae076fb4e603991c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 1 Sep 2017 11:26:13 +0200 Subject: Revert "net: fix percpu memory leaks" This reverts commit 1d6119baf0610f813eb9d9580eb4fd16de5b4ceb. After reverting commit 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") there is no longer any need for this fix-up patch. As percpu_counter is no longer used, it can no longer leak memory. Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") Fixes: 1d6119baf061 ("net: fix percpu memory leaks") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- include/net/inet_frag.h | 7 +------ net/ieee802154/6lowpan/reassembly.c | 11 +++-------- net/ipv4/ip_fragment.c | 12 +++--------- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++--------- net/ipv6/reassembly.c | 12 +++--------- 5 files changed, 13 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index fa635aa6d0b9..fc59e0775e00 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -103,15 +103,10 @@ struct inet_frags { int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); -static inline int inet_frags_init_net(struct netns_frags *nf) +static inline void inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); - return 0; } -static inline void inet_frags_uninit_net(struct netns_frags *nf) -{ -} - void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 30d875dff6b5..f85b08baff16 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); - int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&ieee802154_lowpan->frags); - if (res) - return res; - res = lowpan_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&ieee802154_lowpan->frags); - return res; + inet_frags_init_net(&ieee802154_lowpan->frags); + + return lowpan_frags_ns_sysctl_register(net); } static void __net_exit lowpan_frags_exit_net(struct net *net) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 9a8cfac503dc..46408c220d9d 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -844,8 +844,6 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - int res; - /* Fragment cache limits. 
* * The fragment memory accounting code, (tries to) account for @@ -871,13 +869,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.max_dist = 64; - res = inet_frags_init_net(&net->ipv4.frags); - if (res) - return res; - res = ip4_frags_ns_ctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv4.frags); - return res; + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ns_ctl_register(net); } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 986d4ca38832..b263bf3a19f7 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { - int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->nf_frag.frags); - if (res) - return res; - res = nf_ct_frag6_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->nf_frag.frags); - return res; + inet_frags_init_net(&net->nf_frag.frags); + + return nf_ct_frag6_sysctl_register(net); } static void nf_ct_net_exit(struct net *net) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e1da5b888cc4..846012eae526 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { - int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->ipv6.frags); - if (res) - return res; - res = ip6_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv6.frags); - return res; + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_ns_sysctl_register(net); } static void __net_exit ipv6_frags_exit_net(struct net *net) -- cgit v1.2.3 From b9047726386bb538cf5e4f52a9f04eb556eebc67 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 2 Aug 2017 19:51:11 -0700 Subject: ipc: mqueue: Replace timespec with timespec64 struct timespec is not y2038 safe. Replace all uses of timespec by y2038 safe struct timespec64. Even though timespec is used here to represent timeouts, replace these with timespec64 so as to facilitate verification by creating a y2038 safe kernel image that is free of timespec. The syscall interfaces themselves are not changed as part of the patch. They will be part of a different series.
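For background, the overflow being designed out, as a small standalone illustration (not part of the patch): on 32-bit targets the legacy tv_sec is a signed 32-bit quantity that runs out on 2038-01-19 03:14:07 UTC, while timespec64 carries a 64-bit tv_sec on all architectures.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int32_t sec32 = INT32_MAX;		/* 2038-01-19 03:14:07 UTC: the last representable second */
	int64_t sec64 = (int64_t)INT32_MAX + 1;	/* one second later, still representable in 64 bits */

	printf("32-bit tv_sec stops at %d\n", sec32);
	printf("64-bit tv_sec continues: %lld\n", (long long)sec64);
	return 0;
}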
Signed-off-by: Deepa Dinamani Cc: Paul Moore Cc: Richard Guy Briggs Reviewed-by: Richard Guy Briggs Reviewed-by: Arnd Bergmann Acked-by: Paul Moore Signed-off-by: Al Viro --- include/linux/audit.h | 6 +++--- ipc/mqueue.c | 28 ++++++++++++++-------------- kernel/audit.h | 2 +- kernel/auditsc.c | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/audit.h b/include/linux/audit.h index 2150bdccfbab..74d4d4e8e3db 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -351,7 +351,7 @@ extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); -extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout); +extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, @@ -412,7 +412,7 @@ static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } -static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout) +static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); @@ -549,7 +549,7 @@ static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec *abs_timeout) + const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index eb1391b52c6f..d24025626310 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -668,11 +668,11 @@ static void __do_notify(struct mqueue_inode_info *info) } static int prepare_timeout(const struct timespec __user *u_abs_timeout, - struct timespec *ts) + struct timespec64 *ts) { - if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec))) + if (get_timespec64(ts, u_abs_timeout)) return -EFAULT; - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; return 0; } @@ -962,7 +962,7 @@ static inline void pipelined_receive(struct wake_q_head *wake_q, static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, size_t msg_len, unsigned int msg_prio, - struct timespec *ts) + struct timespec64 *ts) { struct fd f; struct inode *inode; @@ -979,7 +979,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, return -EINVAL; if (ts) { - expires = timespec_to_ktime(*ts); + expires = timespec64_to_ktime(*ts); timeout = &expires; } @@ -1080,7 +1080,7 @@ out: static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, - struct timespec *ts) + struct timespec64 *ts) { ssize_t ret; struct msg_msg *msg_ptr; @@ -1092,7 +1092,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, struct posix_msg_tree_node *new_leaf = NULL; if (ts) { - expires = 
timespec_to_ktime(*ts); + expires = timespec64_to_ktime(*ts); timeout = &expires; } @@ -1184,7 +1184,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, size_t, msg_len, unsigned int, msg_prio, const struct timespec __user *, u_abs_timeout) { - struct timespec ts, *p = NULL; + struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &ts); if (res) @@ -1198,7 +1198,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, size_t, msg_len, unsigned int __user *, u_msg_prio, const struct timespec __user *, u_abs_timeout) { - struct timespec ts, *p = NULL; + struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &ts); if (res) @@ -1475,11 +1475,11 @@ COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name, } static int compat_prepare_timeout(const struct compat_timespec __user *p, - struct timespec *ts) + struct timespec64 *ts) { - if (compat_get_timespec(ts, p)) + if (compat_get_timespec64(ts, p)) return -EFAULT; - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; return 0; } @@ -1489,7 +1489,7 @@ COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, compat_size_t, msg_len, unsigned int, msg_prio, const struct compat_timespec __user *, u_abs_timeout) { - struct timespec ts, *p = NULL; + struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = compat_prepare_timeout(u_abs_timeout, &ts); if (res) @@ -1504,7 +1504,7 @@ COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, compat_size_t, msg_len, unsigned int __user *, u_msg_prio, const struct compat_timespec __user *, u_abs_timeout) { - struct timespec ts, *p = NULL; + struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = compat_prepare_timeout(u_abs_timeout, &ts); if (res) diff --git a/kernel/audit.h b/kernel/audit.h index b331d9b83f63..9b110ae17ee3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -182,7 +182,7 @@ struct audit_context { mqd_t mqdes; size_t msg_len; unsigned int msg_prio; - struct timespec abs_timeout; + struct timespec64 abs_timeout; } mq_sendrecv; struct { int oflag; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3260ba2312a9..daee2d5bd03a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1235,11 +1235,11 @@ static void show_special(struct audit_context *context, int *call_panic) case AUDIT_MQ_SENDRECV: audit_log_format(ab, "mqdes=%d msg_len=%zd msg_prio=%u " - "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + "abs_timeout_sec=%lld abs_timeout_nsec=%ld", context->mq_sendrecv.mqdes, context->mq_sendrecv.msg_len, context->mq_sendrecv.msg_prio, - context->mq_sendrecv.abs_timeout.tv_sec, + (long long) context->mq_sendrecv.abs_timeout.tv_sec, context->mq_sendrecv.abs_timeout.tv_nsec); break; case AUDIT_MQ_NOTIFY: @@ -2083,15 +2083,15 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) * */ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, - const struct timespec *abs_timeout) + const struct timespec64 *abs_timeout) { struct audit_context *context = current->audit_context; - struct timespec *p = &context->mq_sendrecv.abs_timeout; + struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) - memcpy(p, abs_timeout, sizeof(struct timespec)); + memcpy(p, abs_timeout, sizeof(*p)); else - memset(p, 0, sizeof(struct timespec)); + memset(p, 0, sizeof(*p)); context->mq_sendrecv.mqdes = mqdes; context->mq_sendrecv.msg_len = msg_len; -- cgit v1.2.3 From 50578ea97a2a352c109bd1657e667b212faf2cbb Mon Sep 17 
00:00:00 2001 From: Deepa Dinamani Date: Wed, 2 Aug 2017 19:51:12 -0700 Subject: ipc: msg: Make msg_queue timestamps y2038 safe time_t is not y2038 safe. Replace all uses of time_t by y2038 safe time64_t. Similarly, replace the calls to get_seconds() with y2038 safe ktime_get_real_seconds(). Note that this preserves fast access on 64 bit systems, but 32 bit systems need sequence counters. The syscall interfaces themselves are not changed as part of the patch. They will be part of a different series. Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Signed-off-by: Al Viro --- include/linux/msg.h | 7 ++++--- ipc/msg.c | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/msg.h b/include/linux/msg.h index 4e5ec3cbf464..05115342daa3 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -2,6 +2,7 @@ #define _LINUX_MSG_H #include +#include #include /* one msg_msg structure for each message */ @@ -17,9 +18,9 @@ struct msg_msg { /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; - time_t q_stime; /* last msgsnd time */ - time_t q_rtime; /* last msgrcv time */ - time_t q_ctime; /* last change time */ + time64_t q_stime; /* last msgsnd time */ + time64_t q_rtime; /* last msgrcv time */ + time64_t q_ctime; /* last change time */ unsigned long q_cbytes; /* current number of bytes on queue */ unsigned long q_qnum; /* number of messages in queue */ unsigned long q_qbytes; /* max number of bytes on queue */ diff --git a/ipc/msg.c b/ipc/msg.c index 855da19c765a..0e7ccfc0700b 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -133,7 +133,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) } msq->q_stime = msq->q_rtime = 0; - msq->q_ctime = get_seconds(); + msq->q_ctime = ktime_get_real_seconds(); msq->q_cbytes = msq->q_qnum = 0; msq->q_qbytes = ns->msg_ctlmnb; msq->q_lspid = msq->q_lrpid = 0; @@ -406,7 +406,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, msq->q_qbytes = msqid64->msg_qbytes; - msq->q_ctime = get_seconds(); + msq->q_ctime = ktime_get_real_seconds(); /* * Sleeping receivers might be excluded by * stricter permissions. @@ -1181,7 +1181,7 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) struct msg_queue *msq = it; seq_printf(s, - "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n", + "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n", msq->q_perm.key, msq->q_perm.id, msq->q_perm.mode, -- cgit v1.2.3 From e54d02b23c5eed3aa0ffe54e659dfe1c9084c262 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 2 Aug 2017 19:51:13 -0700 Subject: ipc: sem: Make sem_array timestamps y2038 safe time_t is not y2038 safe. Replace all uses of time_t by y2038 safe time64_t. Similarly, replace the calls to get_seconds() with y2038 safe ktime_get_real_seconds(). Note that this preserves fast access on 64 bit systems, but 32 bit systems need sequence counters. The syscall interfaces themselves are not changed as part of the patch. They will be part of a different series.
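On the "sequence counters" remark: a time64_t load is a single instruction on 64-bit systems but two loads on 32-bit ones, so a reader there can observe a torn value. A hedged sketch of one way to guard such a read (illustrative only; ts_seq and stamp are hypothetical names, not from this patch):

#include <linux/seqlock.h>
#include <linux/time64.h>

static seqcount_t ts_seq;	/* hypothetical: protects 'stamp' on 32-bit */
static time64_t stamp;

static time64_t read_stamp(void)
{
	unsigned int seq;
	time64_t t;

	do {				/* retry if a writer raced with us */
		seq = read_seqcount_begin(&ts_seq);
		t = stamp;
	} while (read_seqcount_retry(&ts_seq, seq));

	return t;
}

The writer side would bracket its update with write_seqcount_begin()/write_seqcount_end() so readers can detect the race.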
Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Signed-off-by: Al Viro --- include/linux/sem.h | 3 ++- ipc/sem.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sem.h b/include/linux/sem.h index be5cf2ea14ad..867f86f1a20b 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -4,6 +4,7 @@ #include #include #include +#include #include struct task_struct; @@ -30,7 +31,7 @@ struct sem { /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ - time_t sem_ctime; /* create/last semctl() time */ + time64_t sem_ctime; /* create/last semctl() time */ struct list_head pending_alter; /* pending operations */ /* that alter the array */ struct list_head pending_const; /* pending complex operations */ diff --git a/ipc/sem.c b/ipc/sem.c index feea26f897e7..f25ea86774c0 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -511,7 +511,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) INIT_LIST_HEAD(&sma->pending_const); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; - sma->sem_ctime = get_seconds(); + sma->sem_ctime = ktime_get_real_seconds(); retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { @@ -1162,14 +1162,14 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, } } -static time_t get_semotime(struct sem_array *sma) +static time64_t get_semotime(struct sem_array *sma) { int i; - time_t res; + time64_t res; res = sma->sems[0].sem_otime; for (i = 1; i < sma->sem_nsems; i++) { - time_t to = sma->sems[i].sem_otime; + time64_t to = sma->sems[i].sem_otime; if (to > res) res = to; @@ -1309,7 +1309,7 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, curr->semval = val; curr->sempid = task_tgid_vnr(current); - sma->sem_ctime = get_seconds(); + sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); sem_unlock(sma, -1); @@ -1437,7 +1437,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, for (i = 0; i < nsems; i++) un->semadj[i] = 0; } - sma->sem_ctime = get_seconds(); + sma->sem_ctime = ktime_get_real_seconds(); /* maybe some queued-up processes were waiting for this */ do_smart_update(sma, NULL, 0, 0, &wake_q); err = 0; @@ -1547,7 +1547,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, err = ipc_update_perm(&semid64->sem_perm, ipcp); if (err) goto out_unlock0; - sma->sem_ctime = get_seconds(); + sma->sem_ctime = ktime_get_real_seconds(); break; default: err = -EINVAL; @@ -2292,7 +2292,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) { struct user_namespace *user_ns = seq_user_ns(s); struct sem_array *sma = it; - time_t sem_otime; + time64_t sem_otime; /* * The proc interface isn't aware of sem_lock(), it calls @@ -2305,7 +2305,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) sem_otime = get_semotime(sma); seq_printf(s, - "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n", + "%10d %10d %4o %10u %5u %5u %5u %5u %10llu %10llu\n", sma->sem_perm.key, sma->sem_perm.id, sma->sem_perm.mode, -- cgit v1.2.3 From 7ff2819e8dd5b528887dfbe4ff395f5d2142edff Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 2 Aug 2017 19:51:14 -0700 Subject: ipc: shm: Make shmid_kernel timestamps y2038 safe time_t is not y2038 safe. 
Replace all uses of time_t by y2038 safe time64_t. Similarly, replace the calls to get_seconds() with y2038 safe ktime_get_real_seconds(). Note that this preserves fast access on 64 bit systems, but 32 bit systems need sequence counters. The syscall interfaces themselves are not changed as part of the patch. They will be part of a different series. Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Signed-off-by: Al Viro --- include/linux/shm.h | 6 +++--- ipc/shm.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/shm.h b/include/linux/shm.h index 04e881829625..7bb897b25309 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -12,9 +12,9 @@ struct shmid_kernel /* private to the kernel */ struct file *shm_file; unsigned long shm_nattch; unsigned long shm_segsz; - time_t shm_atim; - time_t shm_dtim; - time_t shm_ctim; + time64_t shm_atim; + time64_t shm_dtim; + time64_t shm_ctim; pid_t shm_cprid; pid_t shm_lprid; struct user_struct *mlock_user; diff --git a/ipc/shm.c b/ipc/shm.c index 342024de3b9d..f3d6408d6de1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -200,7 +200,7 @@ static int __shm_open(struct vm_area_struct *vma) if (IS_ERR(shp)) return PTR_ERR(shp); - shp->shm_atim = get_seconds(); + shp->shm_atim = ktime_get_real_seconds(); shp->shm_lprid = task_tgid_vnr(current); shp->shm_nattch++; shm_unlock(shp); @@ -287,7 +287,7 @@ static void shm_close(struct vm_area_struct *vma) goto done; /* no-op */ shp->shm_lprid = task_tgid_vnr(current); - shp->shm_dtim = get_seconds(); + shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); @@ -592,7 +592,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_cprid = task_tgid_vnr(current); shp->shm_lprid = 0; shp->shm_atim = shp->shm_dtim = 0; - shp->shm_ctim = get_seconds(); + shp->shm_ctim = ktime_get_real_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; shp->shm_file = file; @@ -846,7 +846,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, err = ipc_update_perm(&shmid64->shm_perm, ipcp); if (err) goto out_unlock0; - shp->shm_ctim = get_seconds(); + shp->shm_ctim = ktime_get_real_seconds(); break; default: err = -EINVAL; @@ -1586,7 +1586,7 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) seq_printf(s, "%10d %10d %4o " SIZE_SPEC " %5u %5u " - "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " + "%5lu %5u %5u %5u %5u %10llu %10llu %10llu " SIZE_SPEC " " SIZE_SPEC "\n", shp->shm_perm.key, shp->shm_perm.id, -- cgit v1.2.3 From aaed2dd8a31359e5767ee099ecbb078d55be4d29 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Wed, 2 Aug 2017 19:51:15 -0700 Subject: utimes: Make utimes y2038 safe struct timespec is not y2038 safe on 32 bit machines. Replace timespec with y2038 safe struct timespec64. Note that the patch only changes the internals without modifying the syscall interfaces. This will be part of a separate series. 
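The internals-only conversion follows a widen-at-the-boundary pattern: the user-visible struct stays exactly as the ABI defines it, and the kernel copies it into a 64-bit representation on syscall entry. A minimal standalone sketch of the idea (toy types; the kernel's real helper for this is get_timespec64(), whose internals differ):

#include <stdint.h>
#include <stdio.h>

struct abi_timespec { int32_t tv_sec; int32_t tv_nsec; };	/* 32-bit ABI view */
struct ts64 { int64_t tv_sec; long tv_nsec; };			/* widened internal view */

static int widen(struct ts64 *dst, const struct abi_timespec *src)
{
	dst->tv_sec = src->tv_sec;	/* sign-extended; internal math is now 64-bit */
	dst->tv_nsec = src->tv_nsec;
	/* basic validation in the spirit of timespec64_valid() */
	return (dst->tv_nsec >= 0 && dst->tv_nsec < 1000000000L) ? 0 : -1;
}

int main(void)
{
	struct abi_timespec in = { 1600000000, 500 };
	struct ts64 out;

	printf("widen ok: %d\n", widen(&out, &in) == 0);
	return 0;
}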
Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Signed-off-by: Al Viro --- fs/utimes.c | 23 ++++++++++++----------- include/linux/time.h | 2 +- init/initramfs.c | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/utimes.c b/fs/utimes.c index 6571d8c848a0..51edb9f9507c 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -22,7 +22,7 @@ */ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, ×->actime) || @@ -44,7 +44,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(const struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; @@ -115,7 +115,7 @@ out: * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -long do_utimes(int dfd, const char __user *filename, struct timespec *times, +long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { int error = -EINVAL; @@ -167,10 +167,11 @@ out: SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct timespec __user *, utimes, int, flags) { - struct timespec tstimes[2]; + struct timespec64 tstimes[2]; if (utimes) { - if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) + if ((get_timespec64(&tstimes[0], &utimes[0]) || + get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. */ @@ -186,7 +187,7 @@ SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct timeval __user *, utimes) { struct timeval times[2]; - struct timespec tstimes[2]; + struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(×, utimes, sizeof(times))) @@ -224,7 +225,7 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, struct compat_utimbuf __user *, t) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || @@ -238,11 +239,11 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { - if (compat_get_timespec(&tv[0], &t[0]) || - compat_get_timespec(&tv[1], &t[1])) + if (compat_get_timespec64(&tv[0], &t[0]) || + compat_get_timespec64(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -253,7 +254,7 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || diff --git a/include/linux/time.h b/include/linux/time.h index 4abb32d4c6b8..3d0cd017f0d7 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -178,7 +178,7 @@ extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue); extern int do_getitimer(int which, struct itimerval *value); -extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); +extern long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags); /* * Similar to the struct tm 
in userspace, but it needs to be here so diff --git a/init/initramfs.c b/init/initramfs.c index 8a532050043f..e64bf7b4c1ca 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -110,7 +110,7 @@ static void __init free_hash(void) static long __init do_utime(char *filename, time_t mtime) { - struct timespec t[2]; + struct timespec64 t[2]; t[0].tv_sec = mtime; t[0].tv_nsec = 0; -- cgit v1.2.3 From 6b8ed8720011f703bd9be63c905b592666e1bf62 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 31 Aug 2017 16:52:14 -0300 Subject: drm: Fix example comment of format modifier blob To represent formats 98-102, the supported formats mask must be 0x7c00000000 and not 0x3c00000000. Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Ben Widawsky Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170831195215.13302-1-krisman@collabora.co.uk --- include/uapi/drm/drm_mode.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index a2bb7161f020..e040c952accc 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -749,9 +749,9 @@ struct drm_format_modifier { * If the number formats grew to 128, and formats 98-102 are * supported with the modifier: * - * 0x0000003c00000000 0000000000000000 + * 0x0000007c00000000 0000000000000000 * ^ - * |__offset = 64, formats = 0x3c00000000 + * |__offset = 64, formats = 0x7c00000000 * */ __u64 formats; -- cgit v1.2.3 From b24a5f293058b512f1685930f2983a20ee3e15ab Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Sep 2017 19:15:07 +0300 Subject: ALSA: atmel: Remove leftovers of AVR32 removal The ALSA-related include header files are leftovers from commit 020c5260c2b1 ("ALSA: atmel: Remove AVR32 bits from the driver"). Fixes: 020c5260c2b1 ("ALSA: atmel: Remove AVR32 bits from the driver") Signed-off-by: Andy Shevchenko Signed-off-by: Takashi Iwai --- include/sound/atmel-abdac.h | 23 ----------------------- include/sound/atmel-ac97c.h | 38 -------------------------------------- 2 files changed, 61 deletions(-) delete mode 100644 include/sound/atmel-abdac.h delete mode 100644 include/sound/atmel-ac97c.h (limited to 'include') diff --git a/include/sound/atmel-abdac.h b/include/sound/atmel-abdac.h deleted file mode 100644 index a8f735d677fa..000000000000 --- a/include/sound/atmel-abdac.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Driver for the Atmel Audio Bitstream DAC (ABDAC) - * - * Copyright (C) 2009 Atmel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - */ -#ifndef __INCLUDE_SOUND_ATMEL_ABDAC_H -#define __INCLUDE_SOUND_ATMEL_ABDAC_H - -#include - -/** - * struct atmel_abdac_pdata - board specific ABDAC configuration - * @dws: DMA slave interface to use for sound playback.
- */ -struct atmel_abdac_pdata { - struct dw_dma_slave dws; -}; - -#endif /* __INCLUDE_SOUND_ATMEL_ABDAC_H */ diff --git a/include/sound/atmel-ac97c.h b/include/sound/atmel-ac97c.h deleted file mode 100644 index f2a1cdc37661..000000000000 --- a/include/sound/atmel-ac97c.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Driver for the Atmel AC97C controller - * - * Copyright (C) 2005-2009 Atmel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - */ -#ifndef __INCLUDE_SOUND_ATMEL_AC97C_H -#define __INCLUDE_SOUND_ATMEL_AC97C_H - -#include - -#define AC97C_CAPTURE 0x01 -#define AC97C_PLAYBACK 0x02 -#define AC97C_BOTH (AC97C_CAPTURE | AC97C_PLAYBACK) - -/** - * struct atmel_ac97c_pdata - board specific AC97C configuration - * @rx_dws: DMA slave interface to use for sound capture. - * @tx_dws: DMA slave interface to use for sound playback. - * @reset_pin: GPIO pin wired to the reset input on the external AC97 codec, - * optional to use, set to -ENODEV if not in use. AC97 layer will - * try to do a software reset of the external codec anyway. - * - * If the user do not want to use a DMA channel for playback or capture, i.e. - * only one feature is required on the board. The slave for playback or capture - * can be set to NULL. The AC97C driver will take use of this when setting up - * the sound streams. - */ -struct ac97c_platform_data { - struct dw_dma_slave rx_dws; - struct dw_dma_slave tx_dws; - int reset_pin; -}; - -#endif /* __INCLUDE_SOUND_ATMEL_AC97C_H */ -- cgit v1.2.3 From bea74641e3786d51dcf1175527cc1781420961c9 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Fri, 18 Aug 2017 20:58:59 +0200 Subject: netfilter: xt_hashlimit: add rate match mode This patch adds a new feature to hashlimit that allows matching on the current packet/byte rate without rate limiting. This can be enabled with a new flag --hashlimit-rate-match. The match returns true if the current rate of packets is above/below the user specified value. The main difference between the existing algorithm and the new one is that the existing algorithm rate-limits the flow whereas the new algorithm does not. Instead it *classifies* the flow based on whether it is above or below a certain rate. I will demonstrate this with an example below. Let us assume this rule: iptables -A INPUT -m hashlimit --hashlimit-above 10/s -j new_chain If the packet rate is 15/s, the existing algorithm would ACCEPT 10 packets every second and send 5 packets to "new_chain". But with the new algorithm, as long as the rate of 15/s is sustained, all packets will continue to match and every packet is sent to new_chain. This new functionality will let us classify different flows based on their current rate, so that further decisions can be made on them based on what the current rate is. This is how the new algorithm works: We divide time into intervals of 1 (sec/min/hour) as specified by the user. We keep track of the number of packets/bytes processed in the current interval. After each interval we reset the counter to 0. 
When we receive a packet for match, we look at the packet rate during the current interval and the previous interval to make a decision: if [ prev_rate < user and cur_rate < user ] return Below; else return Above. Here cur_rate is the number of packets/bytes seen in the current interval, prev_rate is the number of packets/bytes seen in the previous interval, and 'user' is the rate specified by the user. We also provide flexibility to the user for choosing the time interval, using the option --hashlimit-rate-interval. For example, the user can keep a low rate like x/hour but still keep the interval as small as 1 second. To preserve backwards compatibility we have to add this feature in a new revision, so I've created revision 3 for hashlimit. The two new options we add are: --hashlimit-rate-match and --hashlimit-rate-interval. I have updated the help text to add these new options. Also added a few tests for the new options. Suggested-by: Igor Lubashev Reviewed-by: Josh Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_hashlimit.h | 3 +- include/uapi/linux/netfilter/xt_hashlimit.h | 36 +++- net/netfilter/xt_hashlimit.c | 277 +++++++++++++++++++++++++--- 3 files changed, 285 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h index 074790c0cf74..0fc458bde80b 100644 --- a/include/linux/netfilter/xt_hashlimit.h +++ b/include/linux/netfilter/xt_hashlimit.h @@ -5,5 +5,6 @@ #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \ - XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES) + XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\ + XT_HASHLIMIT_RATE_MATCH) #endif /*_XT_HASHLIMIT_H*/ diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 79da349f1060..aa98573248b1 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -19,12 +19,13 @@ struct xt_hashlimit_htable; enum { - XT_HASHLIMIT_HASH_DIP = 1 << 0, - XT_HASHLIMIT_HASH_DPT = 1 << 1, - XT_HASHLIMIT_HASH_SIP = 1 << 2, - XT_HASHLIMIT_HASH_SPT = 1 << 3, - XT_HASHLIMIT_INVERT = 1 << 4, - XT_HASHLIMIT_BYTES = 1 << 5, + XT_HASHLIMIT_HASH_DIP = 1 << 0, + XT_HASHLIMIT_HASH_DPT = 1 << 1, + XT_HASHLIMIT_HASH_SIP = 1 << 2, + XT_HASHLIMIT_HASH_SPT = 1 << 3, + XT_HASHLIMIT_INVERT = 1 << 4, + XT_HASHLIMIT_BYTES = 1 << 5, + XT_HASHLIMIT_RATE_MATCH = 1 << 6, }; struct hashlimit_cfg { @@ -79,6 +80,21 @@ struct hashlimit_cfg2 { __u8 srcmask, dstmask; }; +struct hashlimit_cfg3 { + __u64 avg; /* Average secs between packets * scale */ + __u64 burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + + /* user specified */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire?
*/ + + __u32 interval; + __u8 srcmask, dstmask; +}; + struct xt_hashlimit_mtinfo1 { char name[IFNAMSIZ]; struct hashlimit_cfg1 cfg; @@ -95,4 +111,12 @@ struct xt_hashlimit_mtinfo2 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +struct xt_hashlimit_mtinfo3 { + char name[NAME_MAX]; + struct hashlimit_cfg3 cfg; + + /* Used internally by the kernel */ + struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_HASHLIMIT_H */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ffdb611e54a2..10d48234f5f4 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -56,6 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net) } /* need to declare this at the top */ +static const struct file_operations dl_file_ops_v2; static const struct file_operations dl_file_ops_v1; static const struct file_operations dl_file_ops; @@ -87,8 +88,19 @@ struct dsthash_ent { unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ - u_int64_t credit; - u_int64_t credit_cap, cost; + union { + struct { + u_int64_t credit; + u_int64_t credit_cap; + u_int64_t cost; + }; + struct { + u_int32_t interval, prev_window; + u_int64_t current_rate; + u_int64_t rate; + int64_t burst; + }; + }; } rateinfo; struct rcu_head rcu; }; @@ -99,7 +111,7 @@ struct xt_hashlimit_htable { u_int8_t family; bool rnd_initialized; - struct hashlimit_cfg2 cfg; /* config */ + struct hashlimit_cfg3 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ @@ -116,10 +128,10 @@ struct xt_hashlimit_htable { }; static int -cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) +cfg_copy(struct hashlimit_cfg3 *to, const void *from, int revision) { if (revision == 1) { - struct hashlimit_cfg1 *cfg = from; + struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; to->mode = cfg->mode; to->avg = cfg->avg; @@ -131,7 +143,19 @@ cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) to->srcmask = cfg->srcmask; to->dstmask = cfg->dstmask; } else if (revision == 2) { - memcpy(to, from, sizeof(struct hashlimit_cfg2)); + struct hashlimit_cfg2 *cfg = (struct hashlimit_cfg2 *)from; + + to->mode = cfg->mode; + to->avg = cfg->avg; + to->burst = cfg->burst; + to->size = cfg->size; + to->max = cfg->max; + to->gc_interval = cfg->gc_interval; + to->expire = cfg->expire; + to->srcmask = cfg->srcmask; + to->dstmask = cfg->dstmask; + } else if (revision == 3) { + memcpy(to, from, sizeof(struct hashlimit_cfg3)); } else { return -EINVAL; } @@ -240,13 +264,14 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(struct work_struct *work); -static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, +static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, const char *name, u_int8_t family, struct xt_hashlimit_htable **out_hinfo, int revision) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; + const struct file_operations *fops; unsigned int size, i; int ret; @@ -268,7 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, *out_hinfo = hinfo; /* copy match config into hashtable config */ - ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2); + ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); if (ret) return ret; @@ -293,11 +318,21 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, } spin_lock_init(&hinfo->lock); + switch 
(revision) { + case 1: + fops = &dl_file_ops_v1; + break; + case 2: + fops = &dl_file_ops_v2; + break; + default: + fops = &dl_file_ops; + } + hinfo->pde = proc_create_data(name, 0, (family == NFPROTO_IPV4) ? hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, - (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops, - hinfo); + fops, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); @@ -482,6 +517,25 @@ static u32 user2credits_byte(u32 user) return (u32) (us >> 32); } +static u64 user2rate(u64 user) +{ + if (user != 0) { + return div64_u64(XT_HASHLIMIT_SCALE_v2, user); + } else { + pr_warn("invalid rate from userspace: %llu\n", user); + return 0; + } +} + +static u64 user2rate_bytes(u64 user) +{ + u64 r; + + r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; + r = (r - 1) << 4; + return r; +} + static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode, int revision) { @@ -491,6 +545,21 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, if (delta == 0) return; + if (revision >= 3 && mode & XT_HASHLIMIT_RATE_MATCH) { + u64 interval = dh->rateinfo.interval * HZ; + + if (delta < interval) + return; + + dh->rateinfo.prev = now; + dh->rateinfo.prev_window = + ((dh->rateinfo.current_rate * interval) > + (delta * dh->rateinfo.rate)); + dh->rateinfo.current_rate = 0; + + return; + } + dh->rateinfo.prev = now; if (mode & XT_HASHLIMIT_BYTES) { @@ -515,7 +584,23 @@ static void rateinfo_init(struct dsthash_ent *dh, struct xt_hashlimit_htable *hinfo, int revision) { dh->rateinfo.prev = jiffies; - if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { + if (revision >= 3 && hinfo->cfg.mode & XT_HASHLIMIT_RATE_MATCH) { + dh->rateinfo.prev_window = 0; + dh->rateinfo.current_rate = 0; + if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { + dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); + if (hinfo->cfg.burst) + dh->rateinfo.burst = + hinfo->cfg.burst * dh->rateinfo.rate; + else + dh->rateinfo.burst = dh->rateinfo.rate; + } else { + dh->rateinfo.rate = user2rate(hinfo->cfg.avg); + dh->rateinfo.burst = + hinfo->cfg.burst + dh->rateinfo.rate; + } + dh->rateinfo.interval = hinfo->cfg.interval; + } else if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg); dh->rateinfo.credit_cap = hinfo->cfg.burst; @@ -648,7 +733,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) static bool hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, struct xt_hashlimit_htable *hinfo, - const struct hashlimit_cfg2 *cfg, int revision) + const struct hashlimit_cfg3 *cfg, int revision) { unsigned long now = jiffies; struct dsthash_ent *dh; @@ -680,6 +765,20 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } + if (cfg->mode & XT_HASHLIMIT_RATE_MATCH) { + cost = (cfg->mode & XT_HASHLIMIT_BYTES) ? 
skb->len : 1; + dh->rateinfo.current_rate += cost; + + if (!dh->rateinfo.prev_window && + (dh->rateinfo.current_rate <= dh->rateinfo.burst)) { + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + return !(cfg->mode & XT_HASHLIMIT_INVERT); + } else { + goto overlimit; + } + } + if (cfg->mode & XT_HASHLIMIT_BYTES) cost = hashlimit_byte_cost(skb->len, dh); else @@ -693,6 +792,7 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, return !(cfg->mode & XT_HASHLIMIT_INVERT); } +overlimit: spin_unlock(&dh->lock); local_bh_enable(); /* default match is underlimit - so over the limit, we need to invert */ @@ -708,7 +808,7 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; - struct hashlimit_cfg2 cfg = {}; + struct hashlimit_cfg3 cfg = {}; int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); @@ -720,17 +820,33 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) } static bool -hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; + struct hashlimit_cfg3 cfg = {}; + int ret; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 2); + + if (ret) + return ret; + + return hashlimit_mt_common(skb, par, hinfo, &cfg, 2); +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo3 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; - return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2); + return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3); } static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, struct xt_hashlimit_htable **hinfo, - struct hashlimit_cfg2 *cfg, + struct hashlimit_cfg3 *cfg, const char *name, int revision) { struct net *net = par->net; @@ -753,7 +869,17 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, } /* Check for overflow. 
*/ - if (cfg->mode & XT_HASHLIMIT_BYTES) { + if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { + if (cfg->avg == 0) { + pr_info("hashlimit invalid rate\n"); + return -ERANGE; + } + + if (cfg->interval == 0) { + pr_info("hashlimit invalid interval\n"); + return -EINVAL; + } + } else if (cfg->mode & XT_HASHLIMIT_BYTES) { if (user2credits_byte(cfg->avg) == 0) { pr_info("overflow, rate too high: %llu\n", cfg->avg); return -EINVAL; @@ -784,7 +910,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo1 *info = par->matchinfo; - struct hashlimit_cfg2 cfg = {}; + struct hashlimit_cfg3 cfg = {}; int ret; if (info->name[sizeof(info->name) - 1] != '\0') @@ -799,15 +925,40 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) &cfg, info->name, 1); } -static int hashlimit_mt_check(const struct xt_mtchk_param *par) +static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + struct hashlimit_cfg3 cfg = {}; + int ret; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 2); + + if (ret) + return ret; + + return hashlimit_mt_check_common(par, &info->hinfo, + &cfg, info->name, 2); +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo3 *info = par->matchinfo; if (info->name[sizeof(info->name) - 1] != '\0') return -EINVAL; return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, - info->name, 2); + info->name, 3); +} + +static void hashlimit_mt_destroy_v2(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + htable_put(info->hinfo); } static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) @@ -819,7 +970,7 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + const struct xt_hashlimit_mtinfo3 *info = par->matchinfo; htable_put(info->hinfo); } @@ -840,9 +991,20 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 2, .family = NFPROTO_IPV4, - .match = hashlimit_mt, + .match = hashlimit_mt_v2, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), + .checkentry = hashlimit_mt_check_v2, + .destroy = hashlimit_mt_destroy_v2, + .me = THIS_MODULE, + }, + { + .name = "hashlimit", + .revision = 3, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo3), + .usersize = offsetof(struct xt_hashlimit_mtinfo3, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, @@ -863,9 +1025,20 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 2, .family = NFPROTO_IPV6, - .match = hashlimit_mt, + .match = hashlimit_mt_v2, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), + .checkentry = hashlimit_mt_check_v2, + .destroy = hashlimit_mt_destroy_v2, + .me = THIS_MODULE, + }, + { + .name = "hashlimit", + .revision = 3, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo3), + .usersize = offsetof(struct xt_hashlimit_mtinfo3, hinfo), .checkentry = hashlimit_mt_check, .destroy = 
hashlimit_mt_destroy, .me = THIS_MODULE, @@ -947,6 +1120,21 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, } } +static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + + dl_seq_print(ent, family, s); + + spin_unlock(&ent->lock); + return seq_has_overflowed(s); +} + static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { @@ -969,7 +1157,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, spin_lock(&ent->lock); /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 3); dl_seq_print(ent, family, s); @@ -977,6 +1165,20 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, return seq_has_overflowed(s); } +static int dl_seq_show_v2(struct seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, &htable->hash[*bucket], node) + if (dl_seq_real_show_v2(ent, htable->family, s)) + return -1; + } + return 0; +} + static int dl_seq_show_v1(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = s->private; @@ -1012,6 +1214,13 @@ static const struct seq_operations dl_seq_ops_v1 = { .show = dl_seq_show_v1 }; +static const struct seq_operations dl_seq_ops_v2 = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show_v2 +}; + static const struct seq_operations dl_seq_ops = { .start = dl_seq_start, .next = dl_seq_next, @@ -1019,6 +1228,18 @@ static const struct seq_operations dl_seq_ops = { .show = dl_seq_show }; +static int dl_proc_open_v2(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops_v2); + + if (!ret) { + struct seq_file *sf = file->private_data; + + sf->private = PDE_DATA(inode); + } + return ret; +} + static int dl_proc_open_v1(struct inode *inode, struct file *file) { int ret = seq_open(file, &dl_seq_ops_v1); @@ -1042,6 +1263,14 @@ static int dl_proc_open(struct inode *inode, struct file *file) return ret; } +static const struct file_operations dl_file_ops_v2 = { + .owner = THIS_MODULE, + .open = dl_proc_open_v2, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + static const struct file_operations dl_file_ops_v1 = { .owner = THIS_MODULE, .open = dl_proc_open_v1, -- cgit v1.2.3 From dfc46034b54af3abf594de75a1ee43ef2ec2a60a Mon Sep 17 00:00:00 2001 From: "Pablo M. Bermudo Garay" Date: Wed, 23 Aug 2017 22:41:23 +0200 Subject: netfilter: nf_tables: add select_ops for stateful objects This patch adds support for overloading stateful objects operations through the select_ops() callback, just as it is implemented for expressions. This change is needed for upcoming additions to the stateful objects infrastructure. Signed-off-by: Pablo M. 
Bermudo Garay Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 35 ++++++++++++++++++++++++----------- net/netfilter/nf_tables_api.c | 36 ++++++++++++++++++++++++------------ net/netfilter/nft_counter.c | 20 +++++++++++++------- net/netfilter/nft_ct.c | 18 ++++++++++++------ net/netfilter/nft_objref.c | 7 ++++--- net/netfilter/nft_quota.c | 20 +++++++++++++------- 6 files changed, 90 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f9795fe394f3..0f5b12a4ad09 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1007,12 +1007,12 @@ int nft_verdict_dump(struct sk_buff *skb, int type, * * @list: table stateful object list node * @table: table this object belongs to - * @type: pointer to object type - * @data: pointer to object data * @name: name of this stateful object * @genmask: generation mask * @use: number of references to this stateful object * @data: object data, layout depends on type + * @ops: object operations + * @data: pointer to object data */ struct nft_object { struct list_head list; @@ -1021,7 +1021,7 @@ struct nft_object { u32 genmask:2, use:30; /* runtime data below here */ - const struct nft_object_type *type ____cacheline_aligned; + const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; @@ -1044,27 +1044,39 @@ void nft_obj_notify(struct net *net, struct nft_table *table, /** * struct nft_object_type - stateful object type * - * @eval: stateful object evaluation function + * @select_ops: function to select nft_object_ops + * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type - * @size: stateful object size * @owner: module owner * @maxattr: maximum netlink attribute * @policy: netlink attribute policy + */ +struct nft_object_type { + const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, + const struct nlattr * const tb[]); + const struct nft_object_ops *ops; + struct list_head list; + u32 type; + unsigned int maxattr; + struct module *owner; + const struct nla_policy *policy; +}; + +/** + * struct nft_object_ops - stateful object operations + * + * @eval: stateful object evaluation function + * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object */ -struct nft_object_type { +struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); - struct list_head list; - u32 type; unsigned int size; - unsigned int maxattr; - struct module *owner; - const struct nla_policy *policy; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); @@ -1072,6 +1084,7 @@ struct nft_object_type { int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); + const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 149785ff1c7b..b37c178897f3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4248,7 +4248,7 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, list_for_each_entry(obj, &table->objects, list) { if (!nla_strcmp(nla, obj->name) && - objtype == obj->type->type && + objtype == 
obj->ops->type->type && nft_active_genmask(obj, genmask)) return obj; } @@ -4270,6 +4270,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, const struct nlattr *attr) { struct nlattr *tb[type->maxattr + 1]; + const struct nft_object_ops *ops; struct nft_object *obj; int err; @@ -4282,16 +4283,27 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); } + if (type->select_ops) { + ops = type->select_ops(ctx, (const struct nlattr * const *)tb); + if (IS_ERR(ops)) { + err = PTR_ERR(ops); + goto err1; + } + } else { + ops = type->ops; + } + err = -ENOMEM; - obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL); + obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL); if (obj == NULL) goto err1; - err = type->init(ctx, (const struct nlattr * const *)tb, obj); + err = ops->init(ctx, (const struct nlattr * const *)tb, obj); if (err < 0) goto err2; - obj->type = type; + obj->ops = ops; + return obj; err2: kfree(obj); @@ -4307,7 +4319,7 @@ static int nft_object_dump(struct sk_buff *skb, unsigned int attr, nest = nla_nest_start(skb, attr); if (!nest) goto nla_put_failure; - if (obj->type->dump(skb, obj, reset) < 0) + if (obj->ops->dump(skb, obj, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -4418,8 +4430,8 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, err3: kfree(obj->name); err2: - if (obj->type->destroy) - obj->type->destroy(obj); + if (obj->ops->destroy) + obj->ops->destroy(obj); kfree(obj); err1: module_put(type->owner); @@ -4446,7 +4458,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->name) || - nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) || + nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) goto nla_put_failure; @@ -4500,7 +4512,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) goto cont; if (filter && filter->type != NFT_OBJECT_UNSPEC && - obj->type->type != filter->type) + obj->ops->type->type != filter->type) goto cont; if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid, @@ -4628,10 +4640,10 @@ err: static void nft_obj_destroy(struct nft_object *obj) { - if (obj->type->destroy) - obj->type->destroy(obj); + if (obj->ops->destroy) + obj->ops->destroy(obj); - module_put(obj->type->owner); + module_put(obj->ops->type->owner); kfree(obj->name); kfree(obj); } diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 67a710ebde09..eefe3b409925 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -175,15 +175,21 @@ static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = { [NFTA_COUNTER_BYTES] = { .type = NLA_U64 }, }; -static struct nft_object_type nft_counter_obj __read_mostly = { - .type = NFT_OBJECT_COUNTER, +static struct nft_object_type nft_counter_obj_type; +static const struct nft_object_ops nft_counter_obj_ops = { + .type = &nft_counter_obj_type, .size = sizeof(struct nft_counter_percpu_priv), - .maxattr = NFTA_COUNTER_MAX, - .policy = nft_counter_policy, .eval = nft_counter_obj_eval, .init = nft_counter_obj_init, .destroy = nft_counter_obj_destroy, .dump = nft_counter_obj_dump, +}; + +static struct nft_object_type nft_counter_obj_type __read_mostly = { + .type = NFT_OBJECT_COUNTER, + .ops = 
&nft_counter_obj_ops, + .maxattr = NFTA_COUNTER_MAX, + .policy = nft_counter_policy, .owner = THIS_MODULE, }; @@ -271,7 +277,7 @@ static int __init nft_counter_module_init(void) for_each_possible_cpu(cpu) seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); - err = nft_register_obj(&nft_counter_obj); + err = nft_register_obj(&nft_counter_obj_type); if (err < 0) return err; @@ -281,14 +287,14 @@ static int __init nft_counter_module_init(void) return 0; err1: - nft_unregister_obj(&nft_counter_obj); + nft_unregister_obj(&nft_counter_obj_type); return err; } static void __exit nft_counter_module_exit(void) { nft_unregister_expr(&nft_counter_type); - nft_unregister_obj(&nft_counter_obj); + nft_unregister_obj(&nft_counter_obj_type); } module_init(nft_counter_module_init); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 1678e9e75e8e..bd0975d7dd6f 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -904,15 +904,21 @@ static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, }; -static struct nft_object_type nft_ct_helper_obj __read_mostly = { - .type = NFT_OBJECT_CT_HELPER, +static struct nft_object_type nft_ct_helper_obj_type; +static const struct nft_object_ops nft_ct_helper_obj_ops = { + .type = &nft_ct_helper_obj_type, .size = sizeof(struct nft_ct_helper_obj), - .maxattr = NFTA_CT_HELPER_MAX, - .policy = nft_ct_helper_policy, .eval = nft_ct_helper_obj_eval, .init = nft_ct_helper_obj_init, .destroy = nft_ct_helper_obj_destroy, .dump = nft_ct_helper_obj_dump, +}; + +static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_HELPER, + .ops = &nft_ct_helper_obj_ops, + .maxattr = NFTA_CT_HELPER_MAX, + .policy = nft_ct_helper_policy, .owner = THIS_MODULE, }; @@ -930,7 +936,7 @@ static int __init nft_ct_module_init(void) if (err < 0) goto err1; - err = nft_register_obj(&nft_ct_helper_obj); + err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; @@ -945,7 +951,7 @@ err1: static void __exit nft_ct_module_exit(void) { - nft_unregister_obj(&nft_ct_helper_obj); + nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 1dd428fbaaa3..7bcdc48f3d73 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -22,7 +22,7 @@ static void nft_objref_eval(const struct nft_expr *expr, { struct nft_object *obj = nft_objref_priv(expr); - obj->type->eval(obj, regs, pkt); + obj->ops->eval(obj, regs, pkt); } static int nft_objref_init(const struct nft_ctx *ctx, @@ -54,7 +54,8 @@ static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) const struct nft_object *obj = nft_objref_priv(expr); if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) || - nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type))) + nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, + htonl(obj->ops->type->type))) goto nla_put_failure; return 0; @@ -104,7 +105,7 @@ static void nft_objref_map_eval(const struct nft_expr *expr, return; } obj = *nft_set_ext_obj(ext); - obj->type->eval(obj, regs, pkt); + obj->ops->eval(obj, regs, pkt); } static int nft_objref_map_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 25e33159be57..0ed124a93fcf 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -151,14 +151,20 @@ static int nft_quota_obj_dump(struct sk_buff 
*skb, struct nft_object *obj, return nft_quota_do_dump(skb, priv, reset); } -static struct nft_object_type nft_quota_obj __read_mostly = { - .type = NFT_OBJECT_QUOTA, +static struct nft_object_type nft_quota_obj_type; +static const struct nft_object_ops nft_quota_obj_ops = { + .type = &nft_quota_obj_type, .size = sizeof(struct nft_quota), - .maxattr = NFTA_QUOTA_MAX, - .policy = nft_quota_policy, .init = nft_quota_obj_init, .eval = nft_quota_obj_eval, .dump = nft_quota_obj_dump, +}; + +static struct nft_object_type nft_quota_obj_type __read_mostly = { + .type = NFT_OBJECT_QUOTA, + .ops = &nft_quota_obj_ops, + .maxattr = NFTA_QUOTA_MAX, + .policy = nft_quota_policy, .owner = THIS_MODULE, }; @@ -209,7 +215,7 @@ static int __init nft_quota_module_init(void) { int err; - err = nft_register_obj(&nft_quota_obj); + err = nft_register_obj(&nft_quota_obj_type); if (err < 0) return err; @@ -219,14 +225,14 @@ static int __init nft_quota_module_init(void) return 0; err1: - nft_unregister_obj(&nft_quota_obj); + nft_unregister_obj(&nft_quota_obj_type); return err; } static void __exit nft_quota_module_exit(void) { nft_unregister_expr(&nft_quota_type); - nft_unregister_obj(&nft_quota_obj); + nft_unregister_obj(&nft_quota_obj_type); } module_init(nft_quota_module_init); -- cgit v1.2.3 From a691205571723cb0544110ca91653ac4b0eb5b17 Mon Sep 17 00:00:00 2001 From: "Pablo M. Bermudo Garay" Date: Wed, 23 Aug 2017 22:41:25 +0200 Subject: netfilter: nft_limit: add stateful object type Register a new limit stateful object type into the stateful object infrastructure. Signed-off-by: Pablo M. Bermudo Garay Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +- net/netfilter/nft_limit.c | 122 ++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b49da72efa68..871afa4871bf 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1282,7 +1282,8 @@ enum nft_ct_helper_attributes { #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 #define NFT_OBJECT_CT_HELPER 3 -#define __NFT_OBJECT_MAX 4 +#define NFT_OBJECT_LIMIT 4 +#define __NFT_OBJECT_MAX 5 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index aae2d1ec27f3..a9fc298ef4c3 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -229,14 +229,133 @@ static struct nft_expr_type nft_limit_type __read_mostly = { .owner = THIS_MODULE, }; +static void nft_limit_obj_pkts_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit_pkts *priv = nft_obj_data(obj); + + if (nft_limit_eval(&priv->limit, priv->cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_limit_pkts *priv = nft_obj_data(obj); + int err; + + err = nft_limit_init(&priv->limit, tb); + if (err < 0) + return err; + + priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate); + return 0; +} + +static int nft_limit_obj_pkts_dump(struct sk_buff *skb, + struct nft_object *obj, + bool reset) +{ + const struct nft_limit_pkts *priv = nft_obj_data(obj); + + return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); +} + +static struct nft_object_type nft_limit_obj_type; +static const struct nft_object_ops 
nft_limit_obj_pkts_ops = { + .type = &nft_limit_obj_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .init = nft_limit_obj_pkts_init, + .eval = nft_limit_obj_pkts_eval, + .dump = nft_limit_obj_pkts_dump, +}; + +static void nft_limit_obj_bytes_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_obj_data(obj); + u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); + + if (nft_limit_eval(priv, cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_limit *priv = nft_obj_data(obj); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_obj_bytes_dump(struct sk_buff *skb, + struct nft_object *obj, + bool reset) +{ + const struct nft_limit *priv = nft_obj_data(obj); + + return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); +} + +static struct nft_object_type nft_limit_obj_type; +static const struct nft_object_ops nft_limit_obj_bytes_ops = { + .type = &nft_limit_obj_type, + .size = sizeof(struct nft_limit), + .init = nft_limit_obj_bytes_init, + .eval = nft_limit_obj_bytes_eval, + .dump = nft_limit_obj_bytes_dump, +}; + +static const struct nft_object_ops * +nft_limit_obj_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (!tb[NFTA_LIMIT_TYPE]) + return &nft_limit_obj_pkts_ops; + + switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { + case NFT_LIMIT_PKTS: + return &nft_limit_obj_pkts_ops; + case NFT_LIMIT_PKT_BYTES: + return &nft_limit_obj_bytes_ops; + } + return ERR_PTR(-EOPNOTSUPP); +} + +static struct nft_object_type nft_limit_obj_type __read_mostly = { + .select_ops = nft_limit_obj_select_ops, + .type = NFT_OBJECT_LIMIT, + .maxattr = NFTA_LIMIT_MAX, + .policy = nft_limit_policy, + .owner = THIS_MODULE, +}; + static int __init nft_limit_module_init(void) { - return nft_register_expr(&nft_limit_type); + int err; + + err = nft_register_obj(&nft_limit_obj_type); + if (err < 0) + return err; + + err = nft_register_expr(&nft_limit_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_limit_obj_type); + return err; } static void __exit nft_limit_module_exit(void) { nft_unregister_expr(&nft_limit_type); + nft_unregister_obj(&nft_limit_obj_type); } module_init(nft_limit_module_init); @@ -245,3 +364,4 @@ module_exit(nft_limit_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("limit"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT); -- cgit v1.2.3 From d1c1e39de8357d66163da39e893e38ea1410e8f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 29 Aug 2017 12:04:10 +0200 Subject: netfilter: remove unused hooknum arg from packet functions tested with allmodconfig build. 
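
For illustration, the change is purely mechanical: no ->packet() implementation ever read the hooknum argument, so it is dropped from the callback signature and from the lone caller in nf_conntrack_core.c. A before/after sketch of the prototype, condensed from the nf_conntrack_l4proto.h hunk below:

    /* before: hooknum was accepted but unused by every tracker */
    int (*packet)(struct nf_conn *ct, const struct sk_buff *skb,
                  unsigned int dataoff, enum ip_conntrack_info ctinfo,
                  u_int8_t pf, unsigned int hooknum,
                  unsigned int *timeouts);

    /* after: one argument fewer for the icmp, icmpv6, dccp, generic,
     * gre, sctp, tcp and udp trackers alike
     */
    int (*packet)(struct nf_conn *ct, const struct sk_buff *skb,
                  unsigned int dataoff, enum ip_conntrack_info ctinfo,
                  u_int8_t pf, unsigned int *timeouts);
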
Signed-off-by: Florian Westphal --- include/net/netfilter/nf_conntrack_l4proto.h | 1 - net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 1 - net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 6 ++---- net/netfilter/nf_conntrack_core.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 2 +- net/netfilter/nf_conntrack_proto_generic.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_proto_sctp.c | 1 - net/netfilter/nf_conntrack_proto_tcp.c | 1 - net/netfilter/nf_conntrack_proto_udp.c | 1 - 10 files changed, 4 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index d4933d56809d..738a0307a96b 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -43,7 +43,6 @@ struct nf_conntrack_l4proto { unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeouts); /* Called when a new connection for this protocol found; diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 434b4e20f6db..ce108a996316 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -82,7 +82,6 @@ static int icmp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeout) { /* Do not immediately delete the connection after the first diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 43544b975eae..30e34c4de003 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -95,7 +95,6 @@ static int icmpv6_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeout) { /* Do not immediately delete the connection after the first @@ -129,8 +128,7 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, static int icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int icmp6off, - unsigned int hooknum) + unsigned int icmp6off) { struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; @@ -214,7 +212,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum); + return icmpv6_error_message(net, tmpl, skb, dataoff); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c23df7c9cd59..ee5555dd7ebc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1414,7 +1414,7 @@ repeat: /* Decide what timeout policy we want to apply to this flow. 
*/ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); - ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, timeouts); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 188347571fc7..0f5a4d79f6b8 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -469,7 +469,7 @@ static unsigned int *dccp_get_timeouts(struct net *net) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int hooknum, + u_int8_t pf, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 2993995b690d..9cd40700842e 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -61,7 +61,6 @@ static int generic_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeout) { nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index c0e3a23ac23a..09a90484c27d 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -245,7 +245,6 @@ static int gre_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeouts) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 890b5c73368d..6303a88af12b 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -307,7 +307,6 @@ static int sctp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeouts) { enum sctp_conntrack new_state, old_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 33c52d9ab2f5..cba1c6ffe51a 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -803,7 +803,6 @@ static int tcp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeouts) { struct net *net = nf_ct_net(ct); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index dcf3030d2226..8af734cd1a94 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -74,7 +74,6 @@ static int udp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, - unsigned int hooknum, unsigned int *timeouts) { /* If we've seen traffic both ways, this is some kind of UDP -- cgit v1.2.3 From 44d6e2f27328b254111dd716fde45b3b59b8a4f7 Mon Sep 17 00:00:00 2001 From: Varsha Rao Date: Wed, 30 Aug 2017 13:37:11 +0530 Subject: net: Replace NF_CT_ASSERT() with WARN_ON(). This patch removes NF_CT_ASSERT() and instead uses WARN_ON(). 
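
Note the logical inversion the conversion entails: NF_CT_ASSERT(x) was defined as WARN_ON(!(x)) (see the follow-up removal patch below), so each replacement negates its condition. Unlike NF_CT_ASSERT, which compiled away without CONFIG_NETFILTER_DEBUG, WARN_ON() is always active. The recurring pattern in this patch:

    /* old assertion: complain unless the conntrack pointer is valid */
    NF_CT_ASSERT(ct);               /* expanded to WARN_ON(!(ct)) */

    /* open-coded equivalent after this patch */
    WARN_ON(!ct);

    /* compound conditions invert via De Morgan's laws, e.g. */
    NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
    /* becomes */
    WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
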
Signed-off-by: Varsha Rao --- include/net/netfilter/nf_conntrack.h | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 6 +++--- net/ipv4/netfilter/nf_nat_masquerade_ipv4.c | 8 ++++---- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 6 +++--- net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 4 ++-- net/netfilter/nf_conntrack_core.c | 11 +++++------ net/netfilter/nf_conntrack_expect.c | 4 ++-- net/netfilter/nf_conntrack_extend.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 6 +++--- net/netfilter/nf_nat_core.c | 4 ++-- net/netfilter/nf_nat_redirect.c | 6 +++--- net/netfilter/xt_NETMAP.c | 8 ++++---- net/netfilter/xt_nat.c | 20 ++++++++++---------- 15 files changed, 45 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 6e6f678aaac7..0385cb08c478 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -159,7 +159,7 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { - NF_CT_ASSERT(ct); + WARN_ON(!ct); nf_conntrack_put(&ct->ct_general); } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index ce108a996316..a046c298413a 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -126,7 +126,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; - NF_CT_ASSERT(!skb_nfct(skb)); + WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? 
*/ diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index feedd759ca80..a0f37b208268 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -190,7 +190,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conntrack_tuple target; unsigned long statusbit; - NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; @@ -306,8 +306,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, default: /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); + WARN_ON(ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index f39037fca923..0c366aad89cb 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -34,12 +34,12 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct rtable *rt; __be32 newsrc, nh; - NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); + WARN_ON(hooknum != NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); + WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. @@ -96,7 +96,7 @@ static int masq_device_event(struct notifier_block *this, * conntracks which were associated with that device, * and forget them. */ - NF_CT_ASSERT(dev->ifindex != 0); + WARN_ON(dev->ifindex == 0); nf_ct_iterate_cleanup_net(net, device_cmp, (void *)(long)dev->ifindex, 0, 0); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 30e34c4de003..a9e1fd1a8536 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -136,7 +136,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; - NF_CT_ASSERT(!skb_nfct(skb)); + WARN_ON(skb_nfct(skb)); /* Are they talking about one of our connections? 
*/ if (!nf_ct_get_tuplepr(skb, diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index b2b4f031b3a1..46d6dba50698 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -196,7 +196,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conntrack_tuple target; unsigned long statusbit; - NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY); + WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; @@ -319,8 +319,8 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, default: /* ESTABLISHED */ - NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED || - ctinfo == IP_CT_ESTABLISHED_REPLY); + WARN_ON(ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index d7b679037bae..98f61fcb9108 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -36,8 +36,8 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, struct nf_nat_range newrange; ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); + WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); if (ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ee5555dd7ebc..99c753c485ee 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -403,7 +403,7 @@ destroy_conntrack(struct nf_conntrack *nfct) const struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); - NF_CT_ASSERT(atomic_read(&nfct->use) == 0); + WARN_ON(atomic_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -756,12 +756,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) * connections for unconfirmed conns. But packet copies and * REJECT will give spurious warnings here. */ - /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ /* No external references means no one else could have * confirmed us. 
*/ - NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + WARN_ON(nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from @@ -1160,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct) /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ - NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); + WARN_ON(atomic_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); @@ -1468,7 +1467,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, struct nf_conn_help *help = nfct_help(ct); /* Should be unconfirmed, so not in hash table yet */ - NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + WARN_ON(nf_ct_is_confirmed(ct)); pr_debug("Altering reply tuple of %p to ", ct); nf_ct_dump_tuple(newreply); @@ -1490,7 +1489,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, unsigned long extra_jiffies, int do_acct) { - NF_CT_ASSERT(skb); + WARN_ON(!skb); /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index dad2c0c22ad5..64778f9a8548 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -51,8 +51,8 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); - NF_CT_ASSERT(master_help); - NF_CT_ASSERT(!timer_pending(&exp->timeout)); + WARN_ON(!master_help); + WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); net->ct.expect_count--; diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 6c605e88ebae..9fe0ddc333fb 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -47,7 +47,7 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. */ - NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + WARN_ON(nf_ct_is_confirmed(ct)); old = ct->ext; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9eb85858d764..5a101caa3e12 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -287,7 +287,7 @@ static int ct_seq_show(struct seq_file *s, void *v) struct net *net = seq_file_net(s); int ret = 0; - NF_CT_ASSERT(ct); + WARN_ON(!ct); if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) return 0; @@ -304,9 +304,9 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - NF_CT_ASSERT(l3proto); + WARN_ON(!l3proto); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - NF_CT_ASSERT(l4proto); + WARN_ON(!l4proto); ret = -ENOSPC; seq_printf(s, "%-8s %u %-8s %u %ld ", diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b1d3740ae36a..40573aa6c133 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -414,8 +414,8 @@ nf_nat_setup_info(struct nf_conn *ct, if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; - NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || - maniptype == NF_NAT_MANIP_DST); + WARN_ON(maniptype != NF_NAT_MANIP_SRC && + maniptype != NF_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); /* What we've got will look like inverse of reply. 
Normally diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 86067560a318..25b06b959118 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -38,11 +38,11 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, __be32 newdst; struct nf_nat_range newrange; - NF_CT_ASSERT(hooknum == NF_INET_PRE_ROUTING || - hooknum == NF_INET_LOCAL_OUT); + WARN_ON(hooknum != NF_INET_PRE_ROUTING && + hooknum != NF_INET_LOCAL_OUT); ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); /* Local packets: make them go to loopback */ if (hooknum == NF_INET_LOCAL_OUT) { diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c index e45a01255e70..58aa9dd3c5b7 100644 --- a/net/netfilter/xt_NETMAP.c +++ b/net/netfilter/xt_NETMAP.c @@ -77,10 +77,10 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range newrange; - NF_CT_ASSERT(xt_hooknum(par) == NF_INET_PRE_ROUTING || - xt_hooknum(par) == NF_INET_POST_ROUTING || - xt_hooknum(par) == NF_INET_LOCAL_OUT || - xt_hooknum(par) == NF_INET_LOCAL_IN); + WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && + xt_hooknum(par) != NF_INET_POST_ROUTING && + xt_hooknum(par) != NF_INET_LOCAL_OUT && + xt_hooknum(par) != NF_INET_LOCAL_IN); ct = nf_ct_get(skb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index 8107b3eb865f..0fd14d1eb09d 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -58,9 +58,9 @@ xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct != NULL && - (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); @@ -75,8 +75,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct != NULL && - (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); @@ -90,9 +90,9 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct != NULL && - (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || - ctinfo == IP_CT_RELATED_REPLY)); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); } @@ -105,8 +105,8 @@ xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - NF_CT_ASSERT(ct != NULL && - (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); } -- cgit v1.2.3 From 9efdb14f76f4d7591cd4d7a436ebd716b19703b6 Mon Sep 17 00:00:00 2001 From: Varsha Rao Date: Wed, 30 Aug 2017 13:37:12 +0530 Subject: net: Remove CONFIG_NETFILTER_DEBUG and 
_ASSERT() macros. This patch removes CONFIG_NETFILTER_DEBUG and _ASSERT() macros as they are no longer required. Replace _ASSERT() macros with WARN_ON(). Signed-off-by: Varsha Rao Signed-off-by: Pablo Neira Ayuso --- arch/parisc/configs/c3000_defconfig | 1 - arch/sh/configs/se7751_defconfig | 1 - include/net/netfilter/nf_conntrack.h | 6 ------ net/Kconfig | 7 ------- net/bridge/netfilter/ebtables.c | 20 ++++++++------------ net/ipv4/netfilter/ip_tables.c | 12 +++--------- net/ipv6/netfilter/ip6_tables.c | 12 +++--------- 7 files changed, 14 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig index 0764d3971cf6..8d41a73bd71b 100644 --- a/arch/parisc/configs/c3000_defconfig +++ b/arch/parisc/configs/c3000_defconfig @@ -31,7 +31,6 @@ CONFIG_IP_PNP_BOOTP=y CONFIG_INET6_IPCOMP=m CONFIG_IPV6_TUNNEL=m CONFIG_NETFILTER=y -CONFIG_NETFILTER_DEBUG=y CONFIG_NET_PKTGEN=m CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig index 75c92fc1876b..56b5e4ce8d4a 100644 --- a/arch/sh/configs/se7751_defconfig +++ b/arch/sh/configs/se7751_defconfig @@ -28,7 +28,6 @@ CONFIG_IP_PNP_RARP=y # CONFIG_INET_LRO is not set # CONFIG_IPV6 is not set CONFIG_NETFILTER=y -CONFIG_NETFILTER_DEBUG=y CONFIG_IP_NF_QUEUE=y CONFIG_MTD=y CONFIG_MTD_PARTITIONS=y diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 0385cb08c478..fdc9c64a1c94 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -44,12 +44,6 @@ union nf_conntrack_expect_proto { #include #include -#ifdef CONFIG_NETFILTER_DEBUG -#define NF_CT_ASSERT(x) WARN_ON(!(x)) -#else -#define NF_CT_ASSERT(x) -#endif - #include #include diff --git a/net/Kconfig b/net/Kconfig index e0e7c62c88f0..9dba2715919d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -166,13 +166,6 @@ menuconfig NETFILTER if NETFILTER -config NETFILTER_DEBUG - bool "Network packet filtering debugging" - depends on NETFILTER - help - You can say Y here if you want to get additional messages useful in - debugging the netfilter code. 
- config NETFILTER_ADVANCED bool "Advanced netfilter configuration" depends on NETFILTER diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 54c7ef4e970e..83951f978445 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -252,13 +252,11 @@ unsigned int ebt_do_table(struct sk_buff *skb, } if (verdict == EBT_RETURN) { letsreturn: -#ifdef CONFIG_NETFILTER_DEBUG - if (sp == 0) { - BUGPRINT("RETURN on base chain"); + if (WARN(sp == 0, "RETURN on base chain")) { /* act like this is EBT_CONTINUE */ goto letscontinue; } -#endif + sp--; /* put all the local variables right */ i = cs[sp].n; @@ -271,26 +269,24 @@ letsreturn: } if (verdict == EBT_CONTINUE) goto letscontinue; -#ifdef CONFIG_NETFILTER_DEBUG - if (verdict < 0) { - BUGPRINT("bogus standard verdict\n"); + + if (WARN(verdict < 0, "bogus standard verdict\n")) { read_unlock_bh(&table->lock); return NF_DROP; } -#endif + /* jump to a udc */ cs[sp].n = i + 1; cs[sp].chaininfo = chaininfo; cs[sp].e = ebt_next_entry(point); i = 0; chaininfo = (struct ebt_entries *) (base + verdict); -#ifdef CONFIG_NETFILTER_DEBUG - if (chaininfo->distinguisher) { - BUGPRINT("jump to non-chain\n"); + + if (WARN(chaininfo->distinguisher, "jump to non-chain\n")) { read_unlock_bh(&table->lock); return NF_DROP; } -#endif + nentries = chaininfo->nentries; point = (struct ebt_entry *)chaininfo->data; counter_base = cb_base + chaininfo->counter_offset; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ce1d97579ce8..576cba2b57e9 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -35,12 +35,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("IPv4 packet filter"); -#ifdef CONFIG_NETFILTER_DEBUG -#define IP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define IP_NF_ASSERT(x) -#endif - void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); @@ -263,7 +257,7 @@ ipt_do_table(struct sk_buff *skb, acpar.hotdrop = false; acpar.state = state; - IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = table->private; @@ -293,7 +287,7 @@ ipt_do_table(struct sk_buff *skb, const struct xt_entry_match *ematch; struct xt_counters *counter; - IP_NF_ASSERT(e); + WARN_ON(!e); if (!ip_packet_match(ip, indev, outdev, &e->ip, acpar.fragoff)) { no_match: @@ -312,7 +306,7 @@ ipt_do_table(struct sk_buff *skb, ADD_COUNTER(*counter, skb->len, 1); t = ipt_get_target(e); - IP_NF_ASSERT(t->u.kernel.target); + WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 9f6644958e5e..54b1e75eded1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -39,12 +39,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("IPv6 packet filter"); -#ifdef CONFIG_NETFILTER_DEBUG -#define IP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define IP_NF_ASSERT(x) -#endif - void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); @@ -284,7 +278,7 @@ ip6t_do_table(struct sk_buff *skb, acpar.hotdrop = false; acpar.state = state; - IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); @@ -315,7 
+309,7 @@ ip6t_do_table(struct sk_buff *skb, const struct xt_entry_match *ematch; struct xt_counters *counter; - IP_NF_ASSERT(e); + WARN_ON(!e); acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { @@ -335,7 +329,7 @@ ip6t_do_table(struct sk_buff *skb, ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); - IP_NF_ASSERT(t->u.kernel.target); + WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ -- cgit v1.2.3 From 095a7e388b5fbf8958686a90a04fe9387f6aa50b Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:12 -0700 Subject: irqchip: mips-gic: Remove counter access functions The MIPS GIC clocksource driver is no longer using the accessor functions provided by the irqchip driver, so remove them. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17022/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 95 ---------------------------------------- include/linux/irqchip/mips-gic.h | 22 ---------- 2 files changed, 117 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 9102a69a9ebf..e41ff59ceb32 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -138,101 +138,6 @@ static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe) GIC_SH_MAP_TO_VPE_REG_BIT(vpe)); } -#ifdef CONFIG_CLKSRC_MIPS_GIC -u64 notrace gic_read_count(void) -{ - unsigned int hi, hi2, lo; - - if (mips_cm_is64) - return (u64)gic_read(GIC_REG(SHARED, GIC_SH_COUNTER)); - - do { - hi = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_63_32)); - lo = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_31_00)); - hi2 = gic_read32(GIC_REG(SHARED, GIC_SH_COUNTER_63_32)); - } while (hi2 != hi); - - return (((u64) hi) << 32) + lo; -} - -unsigned int gic_get_count_width(void) -{ - unsigned int bits, config; - - config = gic_read(GIC_REG(SHARED, GIC_SH_CONFIG)); - bits = 32 + 4 * ((config & GIC_SH_CONFIG_COUNTBITS_MSK) >> - GIC_SH_CONFIG_COUNTBITS_SHF); - - return bits; -} - -void notrace gic_write_compare(u64 cnt) -{ - if (mips_cm_is64) { - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE), cnt); - } else { - gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI), - (int)(cnt >> 32)); - gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO), - (int)(cnt & 0xffffffff)); - } -} - -void notrace gic_write_cpu_compare(u64 cnt, int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), mips_cm_vp_id(cpu)); - - if (mips_cm_is64) { - gic_write(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE), cnt); - } else { - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_HI), - (int)(cnt >> 32)); - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_LO), - (int)(cnt & 0xffffffff)); - } - - local_irq_restore(flags); -} - -u64 gic_read_compare(void) -{ - unsigned int hi, lo; - - if (mips_cm_is64) - return (u64)gic_read(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE)); - - hi = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_HI)); - lo = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_COMPARE_LO)); - - return (((u64) hi) << 32) + lo; -} - -void gic_start_count(void) -{ - u32 gicconfig; - - /* Start the counter */ - gicconfig = gic_read(GIC_REG(SHARED, GIC_SH_CONFIG)); - gicconfig &= ~(1 << GIC_SH_CONFIG_COUNTSTOP_SHF); - gic_write(GIC_REG(SHARED, GIC_SH_CONFIG), gicconfig); -} - -void 
gic_stop_count(void) -{ - u32 gicconfig; - - /* Stop the counter */ - gicconfig = gic_read(GIC_REG(SHARED, GIC_SH_CONFIG)); - gicconfig |= 1 << GIC_SH_CONFIG_COUNTSTOP_SHF; - gic_write(GIC_REG(SHARED, GIC_SH_CONFIG), gicconfig); -} - -#endif - unsigned gic_read_local_vp_id(void) { unsigned long ident; diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 2b0e56619e53..c9e1c993cf5b 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -40,11 +40,6 @@ #define GIC_SH_CONFIG_OFS 0x0000 -/* Shared Global Counter */ -#define GIC_SH_COUNTER_31_00_OFS 0x0010 -/* 64-bit counter register for CM3 */ -#define GIC_SH_COUNTER_OFS GIC_SH_COUNTER_31_00_OFS -#define GIC_SH_COUNTER_63_32_OFS 0x0014 #define GIC_SH_REVISIONID_OFS 0x0020 /* Convert an interrupt number to a byte offset/bit for multi-word registers */ @@ -107,10 +102,6 @@ #define GIC_VPE_WD_CONFIG0_OFS 0x0090 #define GIC_VPE_WD_COUNT0_OFS 0x0094 #define GIC_VPE_WD_INITIAL0_OFS 0x0098 -#define GIC_VPE_COMPARE_LO_OFS 0x00a0 -/* 64-bit Compare register on CM3 */ -#define GIC_VPE_COMPARE_OFS GIC_VPE_COMPARE_LO_OFS -#define GIC_VPE_COMPARE_HI_OFS 0x00a4 #define GIC_VPE_EIC_SHADOW_SET_BASE_OFS 0x0100 #define GIC_VPE_EIC_SS(intr) (4 * (intr)) @@ -128,12 +119,6 @@ #define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 /* Masks */ -#define GIC_SH_CONFIG_COUNTSTOP_SHF 28 -#define GIC_SH_CONFIG_COUNTSTOP_MSK (MSK(1) << GIC_SH_CONFIG_COUNTSTOP_SHF) - -#define GIC_SH_CONFIG_COUNTBITS_SHF 24 -#define GIC_SH_CONFIG_COUNTBITS_MSK (MSK(4) << GIC_SH_CONFIG_COUNTBITS_SHF) - #define GIC_SH_CONFIG_NUMINTRS_SHF 16 #define GIC_SH_CONFIG_NUMINTRS_MSK (MSK(8) << GIC_SH_CONFIG_NUMINTRS_SHF) @@ -258,13 +243,6 @@ extern unsigned int gic_present; extern void gic_init(unsigned long gic_base_addr, unsigned long gic_addrspace_size, unsigned int cpu_vec, unsigned int irqbase); -extern u64 gic_read_count(void); -extern unsigned int gic_get_count_width(void); -extern u64 gic_read_compare(void); -extern void gic_write_compare(u64 cnt); -extern void gic_write_cpu_compare(u64 cnt, int cpu); -extern void gic_start_count(void); -extern void gic_stop_count(void); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -- cgit v1.2.3 From 9762d2e6d329a500dc6c07156e3c56aab3992471 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:14 -0700 Subject: irqchip: mips-gic: Remove gic_read_local_vp_id() Nothing needs gic_read_local_vp_id() any longer, so remove the dead code. 
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17024/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 8 -------- include/linux/irqchip/mips-gic.h | 17 ----------------- 2 files changed, 25 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index e41ff59ceb32..5193e6cf87ab 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -138,14 +138,6 @@ static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe) GIC_SH_MAP_TO_VPE_REG_BIT(vpe)); } -unsigned gic_read_local_vp_id(void) -{ - unsigned long ident; - - ident = gic_read(GIC_REG(VPE_LOCAL, GIC_VP_IDENT)); - return ident & GIC_VP_IDENT_VCNUM_MSK; -} - static bool gic_local_irq_is_routable(int intr) { u32 vpe_ctl; diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index c9e1c993cf5b..29453bdc06e2 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -98,7 +98,6 @@ #define GIC_VPE_SWINT0_MAP_OFS 0x0054 #define GIC_VPE_SWINT1_MAP_OFS 0x0058 #define GIC_VPE_OTHER_ADDR_OFS 0x0080 -#define GIC_VP_IDENT_OFS 0x0088 #define GIC_VPE_WD_CONFIG0_OFS 0x0090 #define GIC_VPE_WD_COUNT0_OFS 0x0094 #define GIC_VPE_WD_INITIAL0_OFS 0x0098 @@ -197,10 +196,6 @@ #define GIC_VPE_SMASK_FDC_SHF 6 #define GIC_VPE_SMASK_FDC_MSK (MSK(1) << GIC_VPE_SMASK_FDC_SHF) -/* GIC_VP_IDENT fields */ -#define GIC_VP_IDENT_VCNUM_SHF 0 -#define GIC_VP_IDENT_VCNUM_MSK (MSK(6) << GIC_VP_IDENT_VCNUM_SHF) - /* GIC nomenclature for Core Interrupt Pins. */ #define GIC_CPU_INT0 0 /* Core Interrupt 2 */ #define GIC_CPU_INT1 1 /* . */ @@ -260,16 +255,4 @@ static inline int gic_get_usm_range(struct resource *gic_usm_res) #endif /* CONFIG_MIPS_GIC */ -/** - * gic_read_local_vp_id() - read the local VPs VCNUM - * - * Read the VCNUM of the local VP from the GIC_VP_IDENT register and - * return it to the caller. This ID should be used to refer to the VP - * via the GICs VP-other region, or when calculating an offset to a - * bit representing the VP in interrupt masks. - * - * Return: The VCNUM value for the local VP. - */ -extern unsigned gic_read_local_vp_id(void); - #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From e98fcb2a8cc003c37c24713d2303667a8f624a30 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:16 -0700 Subject: irqchip: mips-gic: Simplify shared interrupt pending/mask reads Simplify the reads of the bitmaps indicating pending & masked interrupts in gic_handle_shared_int() using the __ioread32_copy() & __ioread64_copy() helper functions. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17026/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 29 +++++++++++------------------ include/linux/irqchip/mips-gic.h | 6 ------ 2 files changed, 11 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 5193e6cf87ab..7445c3b58c44 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -225,31 +225,24 @@ int gic_get_usm_range(struct resource *gic_usm_res) static void gic_handle_shared_int(bool chained) { - unsigned int i, intr, virq, gic_reg_step = mips_cm_is64 ? 
8 : 4; + unsigned int intr, virq; unsigned long *pcpu_mask; - unsigned long pending_reg, intrmask_reg; DECLARE_BITMAP(pending, GIC_MAX_INTRS); DECLARE_BITMAP(intrmask, GIC_MAX_INTRS); /* Get per-cpu bitmaps */ pcpu_mask = pcpu_masks[smp_processor_id()].pcpu_mask; - pending_reg = GIC_REG(SHARED, GIC_SH_PEND); - intrmask_reg = GIC_REG(SHARED, GIC_SH_MASK); - - for (i = 0; i < BITS_TO_LONGS(gic_shared_intrs); i++) { - pending[i] = gic_read(pending_reg); - intrmask[i] = gic_read(intrmask_reg); - pending_reg += gic_reg_step; - intrmask_reg += gic_reg_step; - - if (!IS_ENABLED(CONFIG_64BIT) || mips_cm_is64) - continue; - - pending[i] |= (u64)gic_read(pending_reg) << 32; - intrmask[i] |= (u64)gic_read(intrmask_reg) << 32; - pending_reg += gic_reg_step; - intrmask_reg += gic_reg_step; + if (mips_cm_is64) { + __ioread64_copy(pending, addr_gic_pend(), + DIV_ROUND_UP(gic_shared_intrs, 64)); + __ioread64_copy(intrmask, addr_gic_mask(), + DIV_ROUND_UP(gic_shared_intrs, 64)); + } else { + __ioread32_copy(pending, addr_gic_pend(), + DIV_ROUND_UP(gic_shared_intrs, 32)); + __ioread32_copy(intrmask, addr_gic_mask(), + DIV_ROUND_UP(gic_shared_intrs, 32)); } bitmap_and(pending, pending, intrmask, gic_shared_intrs); diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 29453bdc06e2..835e25506660 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -68,12 +68,6 @@ #define GIC_SH_RMASK_OFS 0x0300 #define GIC_SH_SMASK_OFS 0x0380 -/* Global Interrupt Mask Register (RO) - Bit Set == Interrupt enabled */ -#define GIC_SH_MASK_OFS 0x0400 - -/* Pending Global Interrupts (RO) */ -#define GIC_SH_PEND_OFS 0x0480 - /* Maps Interrupt X to a Pin */ #define GIC_SH_INTR_MAP_TO_PIN_BASE_OFS 0x0500 #define GIC_SH_MAP_TO_PIN(intr) (4 * (intr)) -- cgit v1.2.3 From a0dc5cb5e31bdbcf1f1dddf62a62d06d6b82d53a Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:17 -0700 Subject: irqchip: mips-gic: Simplify gic_local_irq_domain_map() Simplify gic_local_irq_domain_map() by: - Moving the check for invalid IRQs outside of the loop. - Moving the decision about whether to use gic_cpu_pin or timer_cpu_pin outside of the loop. - Using the new write_gic_vo_map() accessor function to avoid the need to handle each map register separately. 
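
With those steps applied, the body of the per-VP loop collapses to two accessor calls. A condensed view of the result, copied from the hunk below (not runnable in isolation; it relies on the driver's gic_lock and accessor helpers):

    spin_lock_irqsave(&gic_lock, flags);
    for (i = 0; i < gic_vpes; i++) {
        /* select which VP the "other" register region aliases */
        write_gic_vl_other(mips_cm_vp_id(i));
        /* one indexed map write replaces the per-interrupt switch */
        write_gic_vo_map(intr, val);
    }
    spin_unlock_irqrestore(&gic_lock, flags);
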
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17027/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 57 +++++++++++----------------------------- include/linux/irqchip/mips-gic.h | 6 ----- 2 files changed, 16 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 7445c3b58c44..4b6c4e55562d 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -498,58 +498,33 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw) { int intr = GIC_HWIRQ_TO_LOCAL(hw); - int ret = 0; int i; unsigned long flags; + u32 val; if (!gic_local_irq_is_routable(intr)) return -EPERM; - spin_lock_irqsave(&gic_lock, flags); - for (i = 0; i < gic_vpes; i++) { - u32 val = GIC_MAP_TO_PIN_MSK | gic_cpu_pin; + if (intr > GIC_LOCAL_INT_FDC) { + pr_err("Invalid local IRQ %d\n", intr); + return -EINVAL; + } - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), - mips_cm_vp_id(i)); + if (intr == GIC_LOCAL_INT_TIMER) { + /* CONFIG_MIPS_CMP workaround (see __gic_init) */ + val = GIC_MAP_PIN_MAP_TO_PIN | timer_cpu_pin; + } else { + val = GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin; + } - switch (intr) { - case GIC_LOCAL_INT_WD: - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_WD_MAP), val); - break; - case GIC_LOCAL_INT_COMPARE: - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_COMPARE_MAP), - val); - break; - case GIC_LOCAL_INT_TIMER: - /* CONFIG_MIPS_CMP workaround (see __gic_init) */ - val = GIC_MAP_TO_PIN_MSK | timer_cpu_pin; - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_TIMER_MAP), - val); - break; - case GIC_LOCAL_INT_PERFCTR: - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_PERFCTR_MAP), - val); - break; - case GIC_LOCAL_INT_SWINT0: - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_SWINT0_MAP), - val); - break; - case GIC_LOCAL_INT_SWINT1: - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_SWINT1_MAP), - val); - break; - case GIC_LOCAL_INT_FDC: - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_FDC_MAP), val); - break; - default: - pr_err("Invalid local IRQ %d\n", intr); - ret = -EINVAL; - break; - } + spin_lock_irqsave(&gic_lock, flags); + for (i = 0; i < gic_vpes; i++) { + write_gic_vl_other(mips_cm_vp_id(i)); + write_gic_vo_map(intr, val); } spin_unlock_irqrestore(&gic_lock, flags); - return ret; + return 0; } static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 835e25506660..1342b17b6812 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -84,13 +84,7 @@ #define GIC_VPE_MASK_OFS 0x0008 #define GIC_VPE_RMASK_OFS 0x000c #define GIC_VPE_SMASK_OFS 0x0010 -#define GIC_VPE_WD_MAP_OFS 0x0040 -#define GIC_VPE_COMPARE_MAP_OFS 0x0044 #define GIC_VPE_TIMER_MAP_OFS 0x0048 -#define GIC_VPE_FDC_MAP_OFS 0x004c -#define GIC_VPE_PERFCTR_MAP_OFS 0x0050 -#define GIC_VPE_SWINT0_MAP_OFS 0x0054 -#define GIC_VPE_SWINT1_MAP_OFS 0x0058 #define GIC_VPE_OTHER_ADDR_OFS 0x0080 #define GIC_VPE_WD_CONFIG0_OFS 0x0090 #define GIC_VPE_WD_COUNT0_OFS 0x0094 -- cgit v1.2.3 From 87554b0ef3884143563e090375269730780c6617 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:18 -0700 Subject: irqchip: mips-gic: Drop gic_(re)set_mask() functions The gic_set_mask() & gic_reset_mask() functions are now no more convenient to call than the write_gic_smask() or write_gic_rmask() 
accessor functions. Remove the layer of abstraction. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17028/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 18 +++--------------- include/linux/irqchip/mips-gic.h | 4 ---- 2 files changed, 3 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 4b6c4e55562d..7eb998c61d1e 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -92,18 +92,6 @@ static inline void gic_update_bits(unsigned int reg, unsigned long mask, gic_write(reg, regval); } -static inline void gic_reset_mask(unsigned int intr) -{ - gic_write(GIC_REG(SHARED, GIC_SH_RMASK) + GIC_INTR_OFS(intr), - 1ul << GIC_INTR_BIT(intr)); -} - -static inline void gic_set_mask(unsigned int intr) -{ - gic_write(GIC_REG(SHARED, GIC_SH_SMASK) + GIC_INTR_OFS(intr), - 1ul << GIC_INTR_BIT(intr)); -} - static inline void gic_set_polarity(unsigned int intr, unsigned int pol) { gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_POLARITY) + @@ -260,12 +248,12 @@ static void gic_handle_shared_int(bool chained) static void gic_mask_irq(struct irq_data *d) { - gic_reset_mask(GIC_HWIRQ_TO_SHARED(d->hwirq)); + write_gic_rmask(BIT(GIC_HWIRQ_TO_SHARED(d->hwirq))); } static void gic_unmask_irq(struct irq_data *d) { - gic_set_mask(GIC_HWIRQ_TO_SHARED(d->hwirq)); + write_gic_smask(BIT(GIC_HWIRQ_TO_SHARED(d->hwirq))); } static void gic_ack_irq(struct irq_data *d) @@ -478,7 +466,7 @@ static void __init gic_basic_init(void) for (i = 0; i < gic_shared_intrs; i++) { gic_set_polarity(i, GIC_POL_POS); gic_set_trigger(i, GIC_TRIG_LEVEL); - gic_reset_mask(i); + write_gic_rmask(BIT(i)); } for (i = 0; i < gic_vpes; i++) { diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 1342b17b6812..8160cc8b677d 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -64,10 +64,6 @@ /* Set/Clear corresponding bit in Edge Detect Register */ #define GIC_SH_WEDGE_OFS 0x0280 -/* Mask manipulation */ -#define GIC_SH_RMASK_OFS 0x0300 -#define GIC_SH_SMASK_OFS 0x0380 - /* Maps Interrupt X to a Pin */ #define GIC_SH_INTR_MAP_TO_PIN_BASE_OFS 0x0500 #define GIC_SH_MAP_TO_PIN(intr) (4 * (intr)) -- cgit v1.2.3 From 80e5f9c9e295b08c5c588083a4ad35e1e5f78731 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:19 -0700 Subject: irqchip: mips-gic: Remove gic_set_polarity() Remove the gic_set_polarity() function in favour of using the new change_gic_pol() accessor function which provides equivalent functionality. 
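For reference, change_gic_pol() is one of the accessor helpers generated in asm/mips-gic.h; it behaves, roughly, as a read-modify-write of the one polarity bit belonging to the given shared interrupt. A usage sketch with the named values it accepts:

	/* Level-sensitive lines use the ACTIVE_* names ... */
	change_gic_pol(irq, GIC_POL_ACTIVE_HIGH);
	change_gic_pol(irq, GIC_POL_ACTIVE_LOW);

	/* ... edge-triggered lines use the *_EDGE names for the same bit */
	change_gic_pol(irq, GIC_POL_RISING_EDGE);
	change_gic_pol(irq, GIC_POL_FALLING_EDGE);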
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17029/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 17 +++++------------ include/linux/irqchip/mips-gic.h | 5 ----- 2 files changed, 5 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 7eb998c61d1e..987289558024 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -92,13 +92,6 @@ static inline void gic_update_bits(unsigned int reg, unsigned long mask, gic_write(reg, regval); } -static inline void gic_set_polarity(unsigned int intr, unsigned int pol) -{ - gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_POLARITY) + - GIC_INTR_OFS(intr), 1ul << GIC_INTR_BIT(intr), - (unsigned long)pol << GIC_INTR_BIT(intr)); -} - static inline void gic_set_trigger(unsigned int intr, unsigned int trig) { gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_TRIGGER) + @@ -272,13 +265,13 @@ static int gic_set_type(struct irq_data *d, unsigned int type) spin_lock_irqsave(&gic_lock, flags); switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_FALLING: - gic_set_polarity(irq, GIC_POL_NEG); + change_gic_pol(irq, GIC_POL_FALLING_EDGE); gic_set_trigger(irq, GIC_TRIG_EDGE); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = true; break; case IRQ_TYPE_EDGE_RISING: - gic_set_polarity(irq, GIC_POL_POS); + change_gic_pol(irq, GIC_POL_RISING_EDGE); gic_set_trigger(irq, GIC_TRIG_EDGE); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = true; @@ -290,14 +283,14 @@ static int gic_set_type(struct irq_data *d, unsigned int type) is_edge = true; break; case IRQ_TYPE_LEVEL_LOW: - gic_set_polarity(irq, GIC_POL_NEG); + change_gic_pol(irq, GIC_POL_ACTIVE_LOW); gic_set_trigger(irq, GIC_TRIG_LEVEL); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = false; break; case IRQ_TYPE_LEVEL_HIGH: default: - gic_set_polarity(irq, GIC_POL_POS); + change_gic_pol(irq, GIC_POL_ACTIVE_HIGH); gic_set_trigger(irq, GIC_TRIG_LEVEL); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = false; @@ -464,7 +457,7 @@ static void __init gic_basic_init(void) /* Setup defaults */ for (i = 0; i < gic_shared_intrs; i++) { - gic_set_polarity(i, GIC_POL_POS); + change_gic_pol(i, GIC_POL_ACTIVE_HIGH); gic_set_trigger(i, GIC_TRIG_LEVEL); write_gic_rmask(BIT(i)); } diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 8160cc8b677d..960e49a64e7a 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -14,8 +14,6 @@ #define GIC_MAX_INTRS 256 /* Constants */ -#define GIC_POL_POS 1 -#define GIC_POL_NEG 0 #define GIC_TRIG_EDGE 1 #define GIC_TRIG_LEVEL 0 #define GIC_TRIG_DUAL_ENABLE 1 @@ -52,9 +50,6 @@ }) #define GIC_INTR_BIT(intr) ((intr) % (mips_cm_is64 ? 64 : 32)) -/* Polarity : Reset Value is always 0 */ -#define GIC_SH_SET_POLARITY_OFS 0x0100 - /* Triggering : Reset Value is always 0 */ #define GIC_SH_SET_TRIGGER_OFS 0x0180 -- cgit v1.2.3 From 471aa962a6acc19a990e20fa3846db40e62120cc Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:20 -0700 Subject: irqchip: mips-gic: Remove gic_set_trigger() Remove the gic_set_trigger() function in favour of using the new change_gic_trig() accessor function which provides equivalent functionality. 
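Taken together with the previous patch, configuring a shared interrupt's sense becomes a pair of accessor calls; a sketch of two of the gic_set_type() cases from the diff below:

	/* a falling-edge interrupt: negative polarity, edge triggered */
	change_gic_pol(irq, GIC_POL_FALLING_EDGE);
	change_gic_trig(irq, GIC_TRIG_EDGE);

	/* an active-high level interrupt */
	change_gic_pol(irq, GIC_POL_ACTIVE_HIGH);
	change_gic_trig(irq, GIC_TRIG_LEVEL);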
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17030/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 19 ++++++------------- include/linux/irqchip/mips-gic.h | 5 ----- 2 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 987289558024..14a1682f399e 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -92,13 +92,6 @@ static inline void gic_update_bits(unsigned int reg, unsigned long mask, gic_write(reg, regval); } -static inline void gic_set_trigger(unsigned int intr, unsigned int trig) -{ - gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_TRIGGER) + - GIC_INTR_OFS(intr), 1ul << GIC_INTR_BIT(intr), - (unsigned long)trig << GIC_INTR_BIT(intr)); -} - static inline void gic_set_dual_edge(unsigned int intr, unsigned int dual) { gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_DUAL) + GIC_INTR_OFS(intr), @@ -266,32 +259,32 @@ static int gic_set_type(struct irq_data *d, unsigned int type) switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_FALLING: change_gic_pol(irq, GIC_POL_FALLING_EDGE); - gic_set_trigger(irq, GIC_TRIG_EDGE); + change_gic_trig(irq, GIC_TRIG_EDGE); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = true; break; case IRQ_TYPE_EDGE_RISING: change_gic_pol(irq, GIC_POL_RISING_EDGE); - gic_set_trigger(irq, GIC_TRIG_EDGE); + change_gic_trig(irq, GIC_TRIG_EDGE); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = true; break; case IRQ_TYPE_EDGE_BOTH: /* polarity is irrelevant in this case */ - gic_set_trigger(irq, GIC_TRIG_EDGE); + change_gic_trig(irq, GIC_TRIG_EDGE); gic_set_dual_edge(irq, GIC_TRIG_DUAL_ENABLE); is_edge = true; break; case IRQ_TYPE_LEVEL_LOW: change_gic_pol(irq, GIC_POL_ACTIVE_LOW); - gic_set_trigger(irq, GIC_TRIG_LEVEL); + change_gic_trig(irq, GIC_TRIG_LEVEL); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = false; break; case IRQ_TYPE_LEVEL_HIGH: default: change_gic_pol(irq, GIC_POL_ACTIVE_HIGH); - gic_set_trigger(irq, GIC_TRIG_LEVEL); + change_gic_trig(irq, GIC_TRIG_LEVEL); gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); is_edge = false; break; @@ -458,7 +451,7 @@ static void __init gic_basic_init(void) /* Setup defaults */ for (i = 0; i < gic_shared_intrs; i++) { change_gic_pol(i, GIC_POL_ACTIVE_HIGH); - gic_set_trigger(i, GIC_TRIG_LEVEL); + change_gic_trig(i, GIC_TRIG_LEVEL); write_gic_rmask(BIT(i)); } diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 960e49a64e7a..2e9f0c43d425 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -14,8 +14,6 @@ #define GIC_MAX_INTRS 256 /* Constants */ -#define GIC_TRIG_EDGE 1 -#define GIC_TRIG_LEVEL 0 #define GIC_TRIG_DUAL_ENABLE 1 #define GIC_TRIG_DUAL_DISABLE 0 @@ -50,9 +48,6 @@ }) #define GIC_INTR_BIT(intr) ((intr) % (mips_cm_is64 ? 64 : 32)) -/* Triggering : Reset Value is always 0 */ -#define GIC_SH_SET_TRIGGER_OFS 0x0180 - /* Dual edge triggering : Reset Value is always 0 */ #define GIC_SH_SET_DUAL_OFS 0x0200 -- cgit v1.2.3 From c26ba670cdb84e0556436be7bf68a75a1d4f4d76 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:21 -0700 Subject: irqchip: mips-gic: Remove gic_set_dual_edge() Remove the gic_set_dual_edge() function in favour of using the new change_gic_dual() accessor function which provides equivalent functionality. 
This also allows us to remove the gic_update_bits() function which gic_set_dual_edge() was the last user of, along with the GIC_INTR_OFS() & GIC_INTR_BIT() macros. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17031/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 28 +++++----------------------- include/linux/irqchip/mips-gic.h | 17 ----------------- 2 files changed, 5 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 14a1682f399e..8aae9d20b82c 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -81,24 +81,6 @@ static inline void gic_write(unsigned int reg, unsigned long val) return gic_write64(reg, (u64)val); } -static inline void gic_update_bits(unsigned int reg, unsigned long mask, - unsigned long val) -{ - unsigned long regval; - - regval = gic_read(reg); - regval &= ~mask; - regval |= val; - gic_write(reg, regval); -} - -static inline void gic_set_dual_edge(unsigned int intr, unsigned int dual) -{ - gic_update_bits(GIC_REG(SHARED, GIC_SH_SET_DUAL) + GIC_INTR_OFS(intr), - 1ul << GIC_INTR_BIT(intr), - (unsigned long)dual << GIC_INTR_BIT(intr)); -} - static inline void gic_map_to_pin(unsigned int intr, unsigned int pin) { gic_write32(GIC_REG(SHARED, GIC_SH_INTR_MAP_TO_PIN_BASE) + @@ -260,32 +242,32 @@ static int gic_set_type(struct irq_data *d, unsigned int type) case IRQ_TYPE_EDGE_FALLING: change_gic_pol(irq, GIC_POL_FALLING_EDGE); change_gic_trig(irq, GIC_TRIG_EDGE); - gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); + change_gic_dual(irq, GIC_DUAL_SINGLE); is_edge = true; break; case IRQ_TYPE_EDGE_RISING: change_gic_pol(irq, GIC_POL_RISING_EDGE); change_gic_trig(irq, GIC_TRIG_EDGE); - gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); + change_gic_dual(irq, GIC_DUAL_SINGLE); is_edge = true; break; case IRQ_TYPE_EDGE_BOTH: /* polarity is irrelevant in this case */ change_gic_trig(irq, GIC_TRIG_EDGE); - gic_set_dual_edge(irq, GIC_TRIG_DUAL_ENABLE); + change_gic_dual(irq, GIC_DUAL_DUAL); is_edge = true; break; case IRQ_TYPE_LEVEL_LOW: change_gic_pol(irq, GIC_POL_ACTIVE_LOW); change_gic_trig(irq, GIC_TRIG_LEVEL); - gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); + change_gic_dual(irq, GIC_DUAL_SINGLE); is_edge = false; break; case IRQ_TYPE_LEVEL_HIGH: default: change_gic_pol(irq, GIC_POL_ACTIVE_HIGH); change_gic_trig(irq, GIC_TRIG_LEVEL); - gic_set_dual_edge(irq, GIC_TRIG_DUAL_DISABLE); + change_gic_dual(irq, GIC_DUAL_SINGLE); is_edge = false; break; } diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 2e9f0c43d425..bd348fc9db18 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -13,10 +13,6 @@ #define GIC_MAX_INTRS 256 -/* Constants */ -#define GIC_TRIG_DUAL_ENABLE 1 -#define GIC_TRIG_DUAL_DISABLE 0 - #define MSK(n) ((1 << (n)) - 1) /* Accessors */ @@ -38,19 +34,6 @@ #define GIC_SH_REVISIONID_OFS 0x0020 -/* Convert an interrupt number to a byte offset/bit for multi-word registers */ -#define GIC_INTR_OFS(intr) ({ \ - unsigned bits = mips_cm_is64 ? 64 : 32; \ - unsigned reg_idx = (intr) / bits; \ - unsigned reg_width = bits / 8; \ - \ - reg_idx * reg_width; \ -}) -#define GIC_INTR_BIT(intr) ((intr) % (mips_cm_is64 ? 
64 : 32)) - -/* Dual edge triggering : Reset Value is always 0 */ -#define GIC_SH_SET_DUAL_OFS 0x0200 - /* Set/Clear corresponding bit in Edge Detect Register */ #define GIC_SH_WEDGE_OFS 0x0280 -- cgit v1.2.3 From d3e8cf44792d60047ba45036abfc38ab414e49e5 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:22 -0700 Subject: irqchip: mips-gic: Remove gic_map_to_pin() Remove the gic_map_to_pin() function in favour of using the new write_gic_map_pin() accessor function which isn't any more complex to use & allows us to drop a level of abstraction. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17032/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 8 +------- include/linux/irqchip/mips-gic.h | 10 ---------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 8aae9d20b82c..f1a4e5d86ca3 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -81,12 +81,6 @@ static inline void gic_write(unsigned int reg, unsigned long val) return gic_write64(reg, (u64)val); } -static inline void gic_map_to_pin(unsigned int intr, unsigned int pin) -{ - gic_write32(GIC_REG(SHARED, GIC_SH_INTR_MAP_TO_PIN_BASE) + - GIC_SH_MAP_TO_PIN(intr), GIC_MAP_TO_PIN_MSK | pin); -} - static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe) { gic_write(GIC_REG(SHARED, GIC_SH_INTR_MAP_TO_VPE_BASE) + @@ -491,7 +485,7 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, int i; spin_lock_irqsave(&gic_lock, flags); - gic_map_to_pin(intr, gic_cpu_pin); + write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); gic_map_to_vpe(intr, mips_cm_vp_id(vpe)); for (i = 0; i < min(gic_vpes, NR_CPUS); i++) clear_bit(intr, pcpu_masks[i].pcpu_mask); diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index bd348fc9db18..dea79a7a54cc 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -37,10 +37,6 @@ /* Set/Clear corresponding bit in Edge Detect Register */ #define GIC_SH_WEDGE_OFS 0x0280 -/* Maps Interrupt X to a Pin */ -#define GIC_SH_INTR_MAP_TO_PIN_BASE_OFS 0x0500 -#define GIC_SH_MAP_TO_PIN(intr) (4 * (intr)) - /* Maps Interrupt X to a VPE */ #define GIC_SH_INTR_MAP_TO_VPE_BASE_OFS 0x2000 #define GIC_SH_MAP_TO_VPE_REG_OFF(intr, vpe) \ @@ -84,12 +80,6 @@ #define GIC_SH_WEDGE_SET(intr) ((intr) | (0x1 << 31)) #define GIC_SH_WEDGE_CLR(intr) ((intr) & ~(0x1 << 31)) -#define GIC_MAP_TO_PIN_SHF 31 -#define GIC_MAP_TO_PIN_MSK (MSK(1) << GIC_MAP_TO_PIN_SHF) -#define GIC_MAP_TO_NMI_SHF 30 -#define GIC_MAP_TO_NMI_MSK (MSK(1) << GIC_MAP_TO_NMI_SHF) -#define GIC_MAP_TO_YQ_SHF 29 -#define GIC_MAP_TO_YQ_MSK (MSK(1) << GIC_MAP_TO_YQ_SHF) #define GIC_MAP_SHF 0 #define GIC_MAP_MSK (MSK(6) << GIC_MAP_SHF) -- cgit v1.2.3 From 0efe3cbf1504309bf088f74eb627d0987f67e934 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:23 -0700 Subject: irqchip: mips-gic: Remove gic_map_to_vpe() Remove the gic_map_to_vpe() function in favour of using the new write_gic_map_vp() accessor function which isn't any more complex to use & allows us to drop a level of abstraction. 
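One subtlety of the replacement, visible in the diff below: the MAP_VP register holds one bit per VP rather than a plain VP number, hence callers wrap the CM VP ID in BIT(). A sketch:

	/* Route shared interrupt intr to the VP backing cpu */
	write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu)));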
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17033/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 11 ++--------- include/linux/irqchip/mips-gic.h | 6 ------ 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index f1a4e5d86ca3..d9851cbb2a6b 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -81,13 +81,6 @@ static inline void gic_write(unsigned int reg, unsigned long val) return gic_write64(reg, (u64)val); } -static inline void gic_map_to_vpe(unsigned int intr, unsigned int vpe) -{ - gic_write(GIC_REG(SHARED, GIC_SH_INTR_MAP_TO_VPE_BASE) + - GIC_SH_MAP_TO_VPE_REG_OFF(intr, vpe), - GIC_SH_MAP_TO_VPE_REG_BIT(vpe)); -} - static bool gic_local_irq_is_routable(int intr) { u32 vpe_ctl; @@ -294,7 +287,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, spin_lock_irqsave(&gic_lock, flags); /* Re-route this IRQ */ - gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp))); + write_gic_map_vp(irq, BIT(mips_cm_vp_id(cpumask_first(&tmp)))); /* Update the pcpu_masks */ for (i = 0; i < min(gic_vpes, NR_CPUS); i++) @@ -486,7 +479,7 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, spin_lock_irqsave(&gic_lock, flags); write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); - gic_map_to_vpe(intr, mips_cm_vp_id(vpe)); + write_gic_map_vp(intr, BIT(mips_cm_vp_id(vpe))); for (i = 0; i < min(gic_vpes, NR_CPUS); i++) clear_bit(intr, pcpu_masks[i].pcpu_mask); set_bit(intr, pcpu_masks[vpe].pcpu_mask); diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index dea79a7a54cc..ad8b216b6056 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -37,12 +37,6 @@ /* Set/Clear corresponding bit in Edge Detect Register */ #define GIC_SH_WEDGE_OFS 0x0280 -/* Maps Interrupt X to a VPE */ -#define GIC_SH_INTR_MAP_TO_VPE_BASE_OFS 0x2000 -#define GIC_SH_MAP_TO_VPE_REG_OFF(intr, vpe) \ - ((32 * (intr)) + (((vpe) / 32) * 4)) -#define GIC_SH_MAP_TO_VPE_REG_BIT(vpe) (1 << ((vpe) % 32)) - /* Register Map for Local Section */ #define GIC_VPE_CTL_OFS 0x0000 #define GIC_VPE_PEND_OFS 0x0004 -- cgit v1.2.3 From 3680746abd87e733ec836025115580562b3dcb47 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:24 -0700 Subject: irqchip: mips-gic: Convert remaining shared reg access to new accessors Convert the remaining accesses to registers in the GIC shared register block to use the new accessor functions provided by asm/mips-gic.h, resulting in code which is often shorter & easier to read. 
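The conversion below also switches field extraction to the mask-and-shift idiom that goes with GENMASK-style constants: mask the register value, then shift right by the mask's least significant set bit (__ffs). A worked sketch, assuming GIC_CONFIG_NUMINTERRUPTS covers bits 23:16 of the config register:

	u32 gicconfig = read_gic_config();
	u32 numintrs;

	numintrs = gicconfig & GIC_CONFIG_NUMINTERRUPTS;	/* isolate the field */
	numintrs >>= __ffs(GIC_CONFIG_NUMINTERRUPTS);		/* shift it down to bit 0 */
	numintrs = (numintrs + 1) * 8;	/* field value N encodes (N + 1) * 8 interrupts */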
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17034/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 16 ++++++++-------- include/linux/irqchip/mips-gic.h | 20 -------------------- 2 files changed, 8 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index d9851cbb2a6b..a906284215b7 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -119,7 +119,7 @@ static void gic_send_ipi(struct irq_data *d, unsigned int cpu) { irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d)); - gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq)); + write_gic_wedge(GIC_WEDGE_RW | hwirq); } int gic_get_c0_compare_int(void) @@ -215,7 +215,7 @@ static void gic_ack_irq(struct irq_data *d) { unsigned int irq = GIC_HWIRQ_TO_SHARED(d->hwirq); - gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_CLR(irq)); + write_gic_wedge(irq); } static int gic_set_type(struct irq_data *d, unsigned int type) @@ -700,13 +700,13 @@ static void __init __gic_init(unsigned long gic_base_addr, mips_gic_base = ioremap_nocache(gic_base_addr, gic_addrspace_size); - gicconfig = gic_read(GIC_REG(SHARED, GIC_SH_CONFIG)); - gic_shared_intrs = (gicconfig & GIC_SH_CONFIG_NUMINTRS_MSK) >> - GIC_SH_CONFIG_NUMINTRS_SHF; - gic_shared_intrs = ((gic_shared_intrs + 1) * 8); + gicconfig = read_gic_config(); + gic_shared_intrs = gicconfig & GIC_CONFIG_NUMINTERRUPTS; + gic_shared_intrs >>= __ffs(GIC_CONFIG_NUMINTERRUPTS); + gic_shared_intrs = (gic_shared_intrs + 1) * 8; - gic_vpes = (gicconfig & GIC_SH_CONFIG_NUMVPES_MSK) >> - GIC_SH_CONFIG_NUMVPES_SHF; + gic_vpes = gicconfig & GIC_CONFIG_PVPS; + gic_vpes >>= __ffs(GIC_CONFIG_PVPS); gic_vpes = gic_vpes + 1; if (cpu_has_veic) { diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index ad8b216b6056..f0a60770d775 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -19,8 +19,6 @@ #define GIC_REG(segment, offset) (segment##_##SECTION_OFS + offset##_##OFS) /* GIC Address Space */ -#define SHARED_SECTION_OFS 0x0000 -#define SHARED_SECTION_SIZE 0x8000 #define VPE_LOCAL_SECTION_OFS 0x8000 #define VPE_LOCAL_SECTION_SIZE 0x4000 #define VPE_OTHER_SECTION_OFS 0xc000 @@ -28,15 +26,6 @@ #define USM_VISIBLE_SECTION_OFS 0x10000 #define USM_VISIBLE_SECTION_SIZE 0x10000 -/* Register Map for Shared Section */ - -#define GIC_SH_CONFIG_OFS 0x0000 - -#define GIC_SH_REVISIONID_OFS 0x0020 - -/* Set/Clear corresponding bit in Edge Detect Register */ -#define GIC_SH_WEDGE_OFS 0x0280 - /* Maps Interrupt X to a Pin */ #define GIC_SH_INTR_MAP_TO_PIN_BASE_OFS 0x0500 #define GIC_SH_MAP_TO_PIN(intr) (4 * (intr)) -- cgit v1.2.3 From 9da3c64589e4eae68631b1b5ed31c586be6ad923 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:25 -0700 Subject: irqchip: mips-gic: Convert local int mask access to new accessors Use the new accessor functions 
provided by asm/mips-gic.h to access masks controlling local interrupts, resulting in code which is often shorter & easier to read. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17035/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 14 +++++------ include/linux/irqchip/mips-gic.h | 52 ---------------------------------------- 2 files changed, 7 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index a906284215b7..42d7866c6cd5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -328,8 +328,8 @@ static void gic_handle_local_int(bool chained) unsigned long pending, masked; unsigned int intr, virq; - pending = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_PEND)); - masked = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_MASK)); + pending = read_gic_vl_pend(); + masked = read_gic_vl_mask(); bitmap_and(&pending, &pending, &masked, GIC_NUM_LOCAL_INTRS); @@ -347,14 +347,14 @@ static void gic_mask_local_irq(struct irq_data *d) { int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_RMASK), 1 << intr); + write_gic_vl_rmask(BIT(intr)); } static void gic_unmask_local_irq(struct irq_data *d) { int intr = GIC_HWIRQ_TO_LOCAL(d->hwirq); - gic_write32(GIC_REG(VPE_LOCAL, GIC_VPE_SMASK), 1 << intr); + write_gic_vl_smask(BIT(intr)); } static struct irq_chip gic_local_irq_controller = { @@ -373,7 +373,7 @@ static void gic_mask_local_irq_all_vpes(struct irq_data *d) for (i = 0; i < gic_vpes; i++) { gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), mips_cm_vp_id(i)); - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_RMASK), 1 << intr); + write_gic_vo_rmask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); } @@ -388,7 +388,7 @@ static void gic_unmask_local_irq_all_vpes(struct irq_data *d) for (i = 0; i < gic_vpes; i++) { gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), mips_cm_vp_id(i)); - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_SMASK), 1 << intr); + write_gic_vo_smask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); } @@ -432,7 +432,7 @@ static void __init gic_basic_init(void) for (j = 0; j < GIC_NUM_LOCAL_INTRS; j++) { if (!gic_local_irq_is_routable(j)) continue; - gic_write32(GIC_REG(VPE_OTHER, GIC_VPE_RMASK), 1 << j); + write_gic_vo_rmask(BIT(j)); } } } diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index f0a60770d775..011698962a8d 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -28,10 +28,6 @@ /* Register Map for Local Section */ #define GIC_VPE_CTL_OFS 0x0000 -#define GIC_VPE_PEND_OFS 0x0004 -#define GIC_VPE_MASK_OFS 0x0008 -#define GIC_VPE_RMASK_OFS 0x000c -#define GIC_VPE_SMASK_OFS 0x0010 #define GIC_VPE_TIMER_MAP_OFS 0x0048 #define GIC_VPE_OTHER_ADDR_OFS 0x0080 #define GIC_VPE_WD_CONFIG0_OFS 0x0090 @@ -69,54 +65,6 @@ #define GIC_VPE_CTL_EIC_MODE_SHF 0 #define GIC_VPE_CTL_EIC_MODE_MSK (MSK(1) << GIC_VPE_CTL_EIC_MODE_SHF) -/* GIC_VPE_PEND Masks */ -#define GIC_VPE_PEND_WD_SHF 0 -#define GIC_VPE_PEND_WD_MSK (MSK(1) << GIC_VPE_PEND_WD_SHF) -#define GIC_VPE_PEND_CMP_SHF 1 -#define GIC_VPE_PEND_CMP_MSK (MSK(1) << GIC_VPE_PEND_CMP_SHF) -#define GIC_VPE_PEND_TIMER_SHF 2 -#define GIC_VPE_PEND_TIMER_MSK (MSK(1) << GIC_VPE_PEND_TIMER_SHF) -#define GIC_VPE_PEND_PERFCOUNT_SHF 3 -#define GIC_VPE_PEND_PERFCOUNT_MSK (MSK(1) << GIC_VPE_PEND_PERFCOUNT_SHF) -#define GIC_VPE_PEND_SWINT0_SHF 
4 -#define GIC_VPE_PEND_SWINT0_MSK (MSK(1) << GIC_VPE_PEND_SWINT0_SHF) -#define GIC_VPE_PEND_SWINT1_SHF 5 -#define GIC_VPE_PEND_SWINT1_MSK (MSK(1) << GIC_VPE_PEND_SWINT1_SHF) -#define GIC_VPE_PEND_FDC_SHF 6 -#define GIC_VPE_PEND_FDC_MSK (MSK(1) << GIC_VPE_PEND_FDC_SHF) - -/* GIC_VPE_RMASK Masks */ -#define GIC_VPE_RMASK_WD_SHF 0 -#define GIC_VPE_RMASK_WD_MSK (MSK(1) << GIC_VPE_RMASK_WD_SHF) -#define GIC_VPE_RMASK_CMP_SHF 1 -#define GIC_VPE_RMASK_CMP_MSK (MSK(1) << GIC_VPE_RMASK_CMP_SHF) -#define GIC_VPE_RMASK_TIMER_SHF 2 -#define GIC_VPE_RMASK_TIMER_MSK (MSK(1) << GIC_VPE_RMASK_TIMER_SHF) -#define GIC_VPE_RMASK_PERFCNT_SHF 3 -#define GIC_VPE_RMASK_PERFCNT_MSK (MSK(1) << GIC_VPE_RMASK_PERFCNT_SHF) -#define GIC_VPE_RMASK_SWINT0_SHF 4 -#define GIC_VPE_RMASK_SWINT0_MSK (MSK(1) << GIC_VPE_RMASK_SWINT0_SHF) -#define GIC_VPE_RMASK_SWINT1_SHF 5 -#define GIC_VPE_RMASK_SWINT1_MSK (MSK(1) << GIC_VPE_RMASK_SWINT1_SHF) -#define GIC_VPE_RMASK_FDC_SHF 6 -#define GIC_VPE_RMASK_FDC_MSK (MSK(1) << GIC_VPE_RMASK_FDC_SHF) - -/* GIC_VPE_SMASK Masks */ -#define GIC_VPE_SMASK_WD_SHF 0 -#define GIC_VPE_SMASK_WD_MSK (MSK(1) << GIC_VPE_SMASK_WD_SHF) -#define GIC_VPE_SMASK_CMP_SHF 1 -#define GIC_VPE_SMASK_CMP_MSK (MSK(1) << GIC_VPE_SMASK_CMP_SHF) -#define GIC_VPE_SMASK_TIMER_SHF 2 -#define GIC_VPE_SMASK_TIMER_MSK (MSK(1) << GIC_VPE_SMASK_TIMER_SHF) -#define GIC_VPE_SMASK_PERFCNT_SHF 3 -#define GIC_VPE_SMASK_PERFCNT_MSK (MSK(1) << GIC_VPE_SMASK_PERFCNT_SHF) -#define GIC_VPE_SMASK_SWINT0_SHF 4 -#define GIC_VPE_SMASK_SWINT0_MSK (MSK(1) << GIC_VPE_SMASK_SWINT0_SHF) -#define GIC_VPE_SMASK_SWINT1_SHF 5 -#define GIC_VPE_SMASK_SWINT1_MSK (MSK(1) << GIC_VPE_SMASK_SWINT1_SHF) -#define GIC_VPE_SMASK_FDC_SHF 6 -#define GIC_VPE_SMASK_FDC_MSK (MSK(1) << GIC_VPE_SMASK_FDC_SHF) - /* GIC nomenclature for Core Interrupt Pins. */ #define GIC_CPU_INT0 0 /* Core Interrupt 2 */ #define GIC_CPU_INT1 1 /* . */ -- cgit v1.2.3 From 0d0cf58cd6814ed63deb67fc5f4c27ad725075b1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:26 -0700 Subject: irqchip: mips-gic: Convert remaining local reg access to new accessors Convert the remaining accesses to registers in the GIC VP-local & VP-other register blocks to use the new accessor functions provided by asm/mips-gic.h, resulting in code which is often shorter & easier to read. 
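As an example of the new style, the routability checks against the VP-local control register become direct tests of named bits (a fragment sketch; the full conversion follows in the diff):

	u32 vpe_ctl = read_gic_vl_ctl();

	/* e.g. may the CP0 timer interrupt be routed through the GIC on this VP? */
	if (vpe_ctl & GIC_VX_CTL_TIMER_ROUTABLE)
		return true;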
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17036/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 68 +++++++--------------------------------- include/linux/irqchip/mips-gic.h | 44 -------------------------- 2 files changed, 12 insertions(+), 100 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 42d7866c6cd5..ff6c2df86fe8 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -45,42 +45,6 @@ DECLARE_BITMAP(ipi_available, GIC_MAX_INTRS); static void __gic_irq_dispatch(void); -static inline u32 gic_read32(unsigned int reg) -{ - return __raw_readl(mips_gic_base + reg); -} - -static inline u64 gic_read64(unsigned int reg) -{ - return __raw_readq(mips_gic_base + reg); -} - -static inline unsigned long gic_read(unsigned int reg) -{ - if (!mips_cm_is64) - return gic_read32(reg); - else - return gic_read64(reg); -} - -static inline void gic_write32(unsigned int reg, u32 val) -{ - return __raw_writel(val, mips_gic_base + reg); -} - -static inline void gic_write64(unsigned int reg, u64 val) -{ - return __raw_writeq(val, mips_gic_base + reg); -} - -static inline void gic_write(unsigned int reg, unsigned long val) -{ - if (!mips_cm_is64) - return gic_write32(reg, (u32)val); - else - return gic_write64(reg, (u64)val); -} - static bool gic_local_irq_is_routable(int intr) { u32 vpe_ctl; @@ -89,17 +53,17 @@ static bool gic_local_irq_is_routable(int intr) if (cpu_has_veic) return true; - vpe_ctl = gic_read32(GIC_REG(VPE_LOCAL, GIC_VPE_CTL)); + vpe_ctl = read_gic_vl_ctl(); switch (intr) { case GIC_LOCAL_INT_TIMER: - return vpe_ctl & GIC_VPE_CTL_TIMER_RTBL_MSK; + return vpe_ctl & GIC_VX_CTL_TIMER_ROUTABLE; case GIC_LOCAL_INT_PERFCTR: - return vpe_ctl & GIC_VPE_CTL_PERFCNT_RTBL_MSK; + return vpe_ctl & GIC_VX_CTL_PERFCNT_ROUTABLE; case GIC_LOCAL_INT_FDC: - return vpe_ctl & GIC_VPE_CTL_FDC_RTBL_MSK; + return vpe_ctl & GIC_VX_CTL_FDC_ROUTABLE; case GIC_LOCAL_INT_SWINT0: case GIC_LOCAL_INT_SWINT1: - return vpe_ctl & GIC_VPE_CTL_SWINT_RTBL_MSK; + return vpe_ctl & GIC_VX_CTL_SWINT_ROUTABLE; default: return true; } @@ -111,8 +75,7 @@ static void gic_bind_eic_interrupt(int irq, int set) irq -= GIC_PIN_TO_VEC_OFFSET; /* Set irq to use shadow set */ - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_EIC_SHADOW_SET_BASE) + - GIC_VPE_EIC_SS(irq), set); + write_gic_vl_eic_shadow_set(irq, set); } static void gic_send_ipi(struct irq_data *d, unsigned int cpu) @@ -371,8 +334,7 @@ static void gic_mask_local_irq_all_vpes(struct irq_data *d) spin_lock_irqsave(&gic_lock, flags); for (i = 0; i < gic_vpes; i++) { - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), - mips_cm_vp_id(i)); + write_gic_vl_other(mips_cm_vp_id(i)); write_gic_vo_rmask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); @@ -386,8 +348,7 @@ static void gic_unmask_local_irq_all_vpes(struct irq_data *d) spin_lock_irqsave(&gic_lock, flags); for (i = 0; i < gic_vpes; i++) { - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), - mips_cm_vp_id(i)); + write_gic_vl_other(mips_cm_vp_id(i)); write_gic_vo_smask(BIT(intr)); } spin_unlock_irqrestore(&gic_lock, flags); @@ -427,8 +388,7 @@ static void __init gic_basic_init(void) for (i = 0; i < gic_vpes; i++) { unsigned int j; - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), - mips_cm_vp_id(i)); + write_gic_vl_other(mips_cm_vp_id(i)); for (j = 0; j < GIC_NUM_LOCAL_INTRS; j++) { if 
(!gic_local_irq_is_routable(j)) continue; @@ -712,10 +672,8 @@ static void __init __gic_init(unsigned long gic_base_addr, if (cpu_has_veic) { /* Set EIC mode for all VPEs */ for_each_present_cpu(cpu) { - gic_write(GIC_REG(VPE_LOCAL, GIC_VPE_OTHER_ADDR), - mips_cm_vp_id(cpu)); - gic_write(GIC_REG(VPE_OTHER, GIC_VPE_CTL), - GIC_VPE_CTL_EIC_MODE_MSK); + write_gic_vl_other(mips_cm_vp_id(cpu)); + write_gic_vo_ctl(GIC_VX_CTL_EIC); } /* Always use vector 1 in EIC mode */ @@ -740,9 +698,7 @@ static void __init __gic_init(unsigned long gic_base_addr, */ if (IS_ENABLED(CONFIG_MIPS_CMP) && gic_local_irq_is_routable(GIC_LOCAL_INT_TIMER)) { - timer_cpu_pin = gic_read32(GIC_REG(VPE_LOCAL, - GIC_VPE_TIMER_MAP)) & - GIC_MAP_MSK; + timer_cpu_pin = read_gic_vl_timer_map() & GIC_MAP_PIN_MAP; irq_set_chained_handler(MIPS_CPU_IRQ_BASE + GIC_CPU_PIN_OFFSET + timer_cpu_pin, diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 011698962a8d..b7a3ce1da9a7 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -13,58 +13,14 @@ #define GIC_MAX_INTRS 256 -#define MSK(n) ((1 << (n)) - 1) - -/* Accessors */ -#define GIC_REG(segment, offset) (segment##_##SECTION_OFS + offset##_##OFS) - /* GIC Address Space */ -#define VPE_LOCAL_SECTION_OFS 0x8000 -#define VPE_LOCAL_SECTION_SIZE 0x4000 -#define VPE_OTHER_SECTION_OFS 0xc000 -#define VPE_OTHER_SECTION_SIZE 0x4000 #define USM_VISIBLE_SECTION_OFS 0x10000 #define USM_VISIBLE_SECTION_SIZE 0x10000 -/* Register Map for Local Section */ -#define GIC_VPE_CTL_OFS 0x0000 -#define GIC_VPE_TIMER_MAP_OFS 0x0048 -#define GIC_VPE_OTHER_ADDR_OFS 0x0080 -#define GIC_VPE_WD_CONFIG0_OFS 0x0090 -#define GIC_VPE_WD_COUNT0_OFS 0x0094 -#define GIC_VPE_WD_INITIAL0_OFS 0x0098 - -#define GIC_VPE_EIC_SHADOW_SET_BASE_OFS 0x0100 -#define GIC_VPE_EIC_SS(intr) (4 * (intr)) - -#define GIC_VPE_EIC_VEC_BASE_OFS 0x0800 -#define GIC_VPE_EIC_VEC(intr) (4 * (intr)) - -#define GIC_VPE_TENABLE_NMI_OFS 0x1000 -#define GIC_VPE_TENABLE_YQ_OFS 0x1004 -#define GIC_VPE_TENABLE_INT_31_0_OFS 0x1080 -#define GIC_VPE_TENABLE_INT_63_32_OFS 0x1084 - /* User Mode Visible Section Register Map */ #define GIC_UMV_SH_COUNTER_31_00_OFS 0x0000 #define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 -/* Masks */ -#define GIC_MAP_SHF 0 -#define GIC_MAP_MSK (MSK(6) << GIC_MAP_SHF) - -/* GIC_VPE_CTL Masks */ -#define GIC_VPE_CTL_FDC_RTBL_SHF 4 -#define GIC_VPE_CTL_FDC_RTBL_MSK (MSK(1) << GIC_VPE_CTL_FDC_RTBL_SHF) -#define GIC_VPE_CTL_SWINT_RTBL_SHF 3 -#define GIC_VPE_CTL_SWINT_RTBL_MSK (MSK(1) << GIC_VPE_CTL_SWINT_RTBL_SHF) -#define GIC_VPE_CTL_PERFCNT_RTBL_SHF 2 -#define GIC_VPE_CTL_PERFCNT_RTBL_MSK (MSK(1) << GIC_VPE_CTL_PERFCNT_RTBL_SHF) -#define GIC_VPE_CTL_TIMER_RTBL_SHF 1 -#define GIC_VPE_CTL_TIMER_RTBL_MSK (MSK(1) << GIC_VPE_CTL_TIMER_RTBL_SHF) -#define GIC_VPE_CTL_EIC_MODE_SHF 0 -#define GIC_VPE_CTL_EIC_MODE_MSK (MSK(1) << GIC_VPE_CTL_EIC_MODE_SHF) - /* GIC nomenclature for Core Interrupt Pins. */ #define GIC_CPU_INT0 0 /* Core Interrupt 2 */ #define GIC_CPU_INT1 1 /* . */ -- cgit v1.2.3 From ba9cc4352e9c40ab3b158620a1bd9cd53295ea17 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:27 -0700 Subject: MIPS: GIC: Move GIC_LOCAL_INT_* to asm/mips-gic.h Move the definition of VP-local interrupts provided by the MIPS Global Interrupt Controller to the new asm/mips-gic.h header to be alongside the new accessor functions. Whilst at it, convert to an enum which lends itself more easily to expansion & documentation. 
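Since the enumerators keep their previous numeric values, existing users compile unchanged; a hypothetical call site inside the driver reads naturally with the enum type:

	unsigned int virq;
	enum mips_gic_local_interrupt intr = GIC_LOCAL_INT_TIMER;

	if (gic_local_irq_is_routable(intr))
		virq = irq_create_mapping(gic_irq_domain, GIC_LOCAL_TO_HWIRQ(intr));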
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17037/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mips-gic.h | 24 ++++++++++++++++++++++++ include/linux/irqchip/mips-gic.h | 10 ---------- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/mips/include/asm/mips-gic.h b/arch/mips/include/asm/mips-gic.h index 8cf4bdc1a059..27736d7f4aba 100644 --- a/arch/mips/include/asm/mips-gic.h +++ b/arch/mips/include/asm/mips-gic.h @@ -277,6 +277,30 @@ GIC_VX_ACCESSOR_RW(64, 0x0a0, compare) /* GIC_Vx_EIC_SHADOW_SET_BASE - Set shadow register set for each interrupt */ GIC_VX_ACCESSOR_RW_INTR_REG(32, 0x100, 0x4, eic_shadow_set) +/** + * enum mips_gic_local_interrupt - GIC local interrupts + * @GIC_LOCAL_INT_WD: GIC watchdog timer interrupt + * @GIC_LOCAL_INT_COMPARE: GIC count/compare interrupt + * @GIC_LOCAL_INT_TIMER: CP0 count/compare interrupt + * @GIC_LOCAL_INT_PERFCTR: Performance counter interrupt + * @GIC_LOCAL_INT_SWINT0: Software interrupt 0 + * @GIC_LOCAL_INT_SWINT1: Software interrupt 1 + * @GIC_LOCAL_INT_FDC: Fast debug channel interrupt + * @GIC_NUM_LOCAL_INTRS: The number of local interrupts + * + * Enumerates interrupts provided by the GIC that are local to a VP. + */ +enum mips_gic_local_interrupt { + GIC_LOCAL_INT_WD, + GIC_LOCAL_INT_COMPARE, + GIC_LOCAL_INT_TIMER, + GIC_LOCAL_INT_PERFCTR, + GIC_LOCAL_INT_SWINT0, + GIC_LOCAL_INT_SWINT1, + GIC_LOCAL_INT_FDC, + GIC_NUM_LOCAL_INTRS +}; + /** * mips_gic_present() - Determine whether a GIC is present * diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index b7a3ce1da9a7..9546947d1842 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -38,16 +38,6 @@ /* Mapped interrupt to pin X, then GIC will generate the vector (X+1). */ #define GIC_PIN_TO_VEC_OFFSET 1 -/* Local GIC interrupts. */ -#define GIC_LOCAL_INT_WD 0 /* GIC watchdog */ -#define GIC_LOCAL_INT_COMPARE 1 /* GIC count and compare timer */ -#define GIC_LOCAL_INT_TIMER 2 /* CPU timer interrupt */ -#define GIC_LOCAL_INT_PERFCTR 3 /* CPU performance counter */ -#define GIC_LOCAL_INT_SWINT0 4 /* CPU software interrupt 0 */ -#define GIC_LOCAL_INT_SWINT1 5 /* CPU software interrupt 1 */ -#define GIC_LOCAL_INT_FDC 6 /* CPU fast debug channel */ -#define GIC_NUM_LOCAL_INTRS 7 - /* Convert between local/shared IRQ number and GIC HW IRQ number. */ #define GIC_LOCAL_HWIRQ_BASE 0 #define GIC_LOCAL_TO_HWIRQ(x) (GIC_LOCAL_HWIRQ_BASE + (x)) -- cgit v1.2.3 From 3ee50dcbef374651056ea42d5eb543bbacb3ff41 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:28 -0700 Subject: irqchip: mips-gic: Remove GIC_CPU_INT* macros The GIC_CPU_INT* macros are never used. Remove the dead code. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17038/ Signed-off-by: Ralf Baechle --- include/linux/irqchip/mips-gic.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 9546947d1842..e93aaf529baa 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -21,14 +21,6 @@ #define GIC_UMV_SH_COUNTER_31_00_OFS 0x0000 #define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 -/* GIC nomenclature for Core Interrupt Pins. 
*/ -#define GIC_CPU_INT0 0 /* Core Interrupt 2 */ -#define GIC_CPU_INT1 1 /* . */ -#define GIC_CPU_INT2 2 /* . */ -#define GIC_CPU_INT3 3 /* . */ -#define GIC_CPU_INT4 4 /* . */ -#define GIC_CPU_INT5 5 /* Core Interrupt 7 */ - /* Add 2 to convert GIC CPU pin to core interrupt */ #define GIC_CPU_PIN_OFFSET 2 -- cgit v1.2.3 From b11d4c1f5a3ac68fa163b0daeca1f98ba328c9de Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:29 -0700 Subject: irqchip: mips-gic: Move various definitions to the driver Move the definitions of macros used to convert between hardware IRQ numbers & shared or local interrupt numbers into the irqchip driver, which is all that should ever need to care about them. Remove GIC_CPU_TO_VEC_OFFSET() in the process since it's never used. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17039/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/mips-boards/maltaint.h | 5 ----- drivers/irqchip/irq-mips-gic.c | 16 ++++++++++++++++ include/linux/irqchip/mips-gic.h | 19 ------------------- 3 files changed, 16 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/mips/include/asm/mips-boards/maltaint.h b/arch/mips/include/asm/mips-boards/maltaint.h index 987ff580466b..817698abf2eb 100644 --- a/arch/mips/include/asm/mips-boards/maltaint.h +++ b/arch/mips/include/asm/mips-boards/maltaint.h @@ -10,8 +10,6 @@ #ifndef _MIPS_MALTAINT_H #define _MIPS_MALTAINT_H -#include - /* * Interrupts 0..15 are used for Malta ISA compatible interrupts */ @@ -62,7 +60,4 @@ #define MSC01E_INT_PERFCTR 10 #define MSC01E_INT_CPUCTR 11 -/* GIC external interrupts */ -#define GIC_INT_I8259A GIC_SHARED_TO_HWIRQ(3) - #endif /* !(_MIPS_MALTAINT_H) */ diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index ff6c2df86fe8..6e303df56447 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -23,6 +23,22 @@ #include +#define GIC_MAX_INTRS 256 + +/* Add 2 to convert GIC CPU pin to core interrupt */ +#define GIC_CPU_PIN_OFFSET 2 + +/* Mapped interrupt to pin X, then GIC will generate the vector (X+1). */ +#define GIC_PIN_TO_VEC_OFFSET 1 + +/* Convert between local/shared IRQ number and GIC HW IRQ number. */ +#define GIC_LOCAL_HWIRQ_BASE 0 +#define GIC_LOCAL_TO_HWIRQ(x) (GIC_LOCAL_HWIRQ_BASE + (x)) +#define GIC_HWIRQ_TO_LOCAL(x) ((x) - GIC_LOCAL_HWIRQ_BASE) +#define GIC_SHARED_HWIRQ_BASE GIC_NUM_LOCAL_INTRS +#define GIC_SHARED_TO_HWIRQ(x) (GIC_SHARED_HWIRQ_BASE + (x)) +#define GIC_HWIRQ_TO_SHARED(x) ((x) - GIC_SHARED_HWIRQ_BASE) + unsigned int gic_present; void __iomem *mips_gic_base; diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index e93aaf529baa..da02a146b292 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -11,8 +11,6 @@ #include #include -#define GIC_MAX_INTRS 256 - /* GIC Address Space */ #define USM_VISIBLE_SECTION_OFS 0x10000 #define USM_VISIBLE_SECTION_SIZE 0x10000 @@ -21,23 +19,6 @@ #define GIC_UMV_SH_COUNTER_31_00_OFS 0x0000 #define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 -/* Add 2 to convert GIC CPU pin to core interrupt */ -#define GIC_CPU_PIN_OFFSET 2 - -/* Add 2 to convert non-EIC hardware interrupt to EIC vector number. */ -#define GIC_CPU_TO_VEC_OFFSET 2 - -/* Mapped interrupt to pin X, then GIC will generate the vector (X+1). 
*/ -#define GIC_PIN_TO_VEC_OFFSET 1 - -/* Convert between local/shared IRQ number and GIC HW IRQ number. */ -#define GIC_LOCAL_HWIRQ_BASE 0 -#define GIC_LOCAL_TO_HWIRQ(x) (GIC_LOCAL_HWIRQ_BASE + (x)) -#define GIC_HWIRQ_TO_LOCAL(x) ((x) - GIC_LOCAL_HWIRQ_BASE) -#define GIC_SHARED_HWIRQ_BASE GIC_NUM_LOCAL_INTRS -#define GIC_SHARED_TO_HWIRQ(x) (GIC_SHARED_HWIRQ_BASE + (x)) -#define GIC_HWIRQ_TO_SHARED(x) ((x) - GIC_SHARED_HWIRQ_BASE) - #ifdef CONFIG_MIPS_GIC extern unsigned int gic_present; -- cgit v1.2.3 From 84103814a2cfd3561ff00bd7317c22f40f9e0dad Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:31 -0700 Subject: irqchip: mips-gic: Remove gic_get_usm_range() The MIPS VDSO code is no longer reliant upon the irqchip driver to provide the address of the GIC's user-visible section via gic_get_usm_range(). Remove the now-dead code. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17041/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 14 -------------- include/linux/irqchip/mips-gic.h | 11 ----------- 2 files changed, 25 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 6e303df56447..3e4c79e921e3 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -46,7 +46,6 @@ struct gic_pcpu_mask { DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS); }; -static unsigned long __gic_base_addr; static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; @@ -134,17 +133,6 @@ int gic_get_c0_fdc_int(void) GIC_LOCAL_TO_HWIRQ(GIC_LOCAL_INT_FDC)); } -int gic_get_usm_range(struct resource *gic_usm_res) -{ - if (!gic_present) - return -1; - - gic_usm_res->start = __gic_base_addr + USM_VISIBLE_SECTION_OFS; - gic_usm_res->end = gic_usm_res->start + (USM_VISIBLE_SECTION_SIZE - 1); - - return 0; -} - static void gic_handle_shared_int(bool chained) { unsigned int intr, virq; @@ -672,8 +660,6 @@ static void __init __gic_init(unsigned long gic_base_addr, unsigned int gicconfig, cpu; unsigned int v[2]; - __gic_base_addr = gic_base_addr; - mips_gic_base = ioremap_nocache(gic_base_addr, gic_addrspace_size); gicconfig = read_gic_config(); diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index da02a146b292..843e1bb49767 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -11,10 +11,6 @@ #include #include -/* GIC Address Space */ -#define USM_VISIBLE_SECTION_OFS 0x10000 -#define USM_VISIBLE_SECTION_SIZE 0x10000 - /* User Mode Visible Section Register Map */ #define GIC_UMV_SH_COUNTER_31_00_OFS 0x0000 #define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 @@ -29,18 +25,11 @@ extern void gic_init(unsigned long gic_base_addr, extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -extern int gic_get_usm_range(struct resource *gic_usm_res); #else /* CONFIG_MIPS_GIC */ #define gic_present 0 -static inline int gic_get_usm_range(struct resource *gic_usm_res) -{ - /* Shouldn't be called. 
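The VDSO can derive the same window itself from the GIC base; a hedged sketch of the equivalent arithmetic, assuming the MIPS_GIC_USER_OFS & MIPS_GIC_USER_SZ constants from asm/mips-gic.h (0x10000 each, matching the USM_VISIBLE_SECTION_* values deleted below):

	/* The user-mode visible section is a fixed 64 KiB window above the GIC base */
	res->start = gic_base + MIPS_GIC_USER_OFS;
	res->end = res->start + MIPS_GIC_USER_SZ - 1;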
*/ - return -1; -} - #endif /* CONFIG_MIPS_GIC */ #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From 85eec73ce4c4322b3b442b9582f6656abb5d125f Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:33 -0700 Subject: irqchip: mips-gic: Remove gic_init() All in-tree platforms now probe the GIC driver using device tree, and as such nothing calls gic_init() any longer. Remove the dead code. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17043/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 7 ------- include/linux/irqchip/mips-gic.h | 3 --- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 462c2e509714..4ef3f53225ca 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -737,13 +737,6 @@ static void __init __gic_init(unsigned long gic_base_addr, gic_basic_init(); } -void __init gic_init(unsigned long gic_base_addr, - unsigned long gic_addrspace_size, - unsigned int cpu_vec, unsigned int irqbase) -{ - __gic_init(gic_base_addr, gic_addrspace_size, cpu_vec, irqbase, NULL); -} - static int __init gic_of_init(struct device_node *node, struct device_node *parent) { diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 843e1bb49767..63c5f87034c9 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -19,9 +19,6 @@ extern unsigned int gic_present; -extern void gic_init(unsigned long gic_base_addr, - unsigned long gic_addrspace_size, unsigned int cpu_vec, - unsigned int irqbase); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -- cgit v1.2.3 From 56d7b61dc6d4e13cef786c8e4a2bfe43e4db932d Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:35 -0700 Subject: irqchip: mips-gic: Remove gic_present Nothing uses the global gic_present variable anymore; mips_gic_present() should be used instead. Remove the dead code. 
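The replacement test is the mips_gic_present() inline helper (visible in a later diff's context), which keys off mips_gic_base being mapped rather than a separate flag; a sketch of a call site:

	if (!mips_gic_present())
		return -ENODEV;	/* no GIC: fall back to CPU interrupt handling */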
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17045/ Signed-off-by: Ralf Baechle --- drivers/irqchip/irq-mips-gic.c | 2 -- include/linux/irqchip/mips-gic.h | 6 ------ 2 files changed, 8 deletions(-) (limited to 'include') diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 4ef3f53225ca..b444bef6d3c2 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -39,7 +39,6 @@ #define GIC_SHARED_TO_HWIRQ(x) (GIC_SHARED_HWIRQ_BASE + (x)) #define GIC_HWIRQ_TO_SHARED(x) ((x) - GIC_SHARED_HWIRQ_BASE) -unsigned int gic_present; void __iomem *mips_gic_base; struct gic_pcpu_mask { @@ -781,7 +780,6 @@ static int __init gic_of_init(struct device_node *node, /* Ensure GIC region is enabled before trying to access it */ __sync(); } - gic_present = true; __gic_init(gic_base, gic_len, cpu_vec, 0, node); diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 63c5f87034c9..113010c8dd8b 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -17,16 +17,10 @@ #ifdef CONFIG_MIPS_GIC -extern unsigned int gic_present; - extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -#else /* CONFIG_MIPS_GIC */ - -#define gic_present 0 - #endif /* CONFIG_MIPS_GIC */ #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From dd0163508c07a67b28befe5af23d7ab9941ae8ca Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:36 -0700 Subject: irqchip: mips-gic: Move gic_get_c0_*_int() to asm/mips-gic.h The linux/irqchip/mips-gic.h header is now almost empty. Move the declarations of gic_get_c0_compare_int(), gic_get_c0_perfcount_int() & gic_get_c0_fdc_int() to asm/mips-gic.h in order to close in on being able to delete the former header. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17046/ Signed-off-by: Ralf Baechle --- arch/mips/generic/irq.c | 1 - arch/mips/include/asm/mips-gic.h | 30 ++++++++++++++++++++++++++++++ arch/mips/mti-malta/malta-time.c | 1 - arch/mips/pistachio/time.c | 2 +- arch/mips/ralink/irq-gic.c | 2 +- drivers/irqchip/irq-mips-gic.c | 1 - include/linux/irqchip/mips-gic.h | 8 -------- 7 files changed, 32 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/mips/generic/irq.c b/arch/mips/generic/irq.c index 2d7bf74179d5..5322d09dd51b 100644 --- a/arch/mips/generic/irq.c +++ b/arch/mips/generic/irq.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/arch/mips/include/asm/mips-gic.h b/arch/mips/include/asm/mips-gic.h index 27736d7f4aba..a2badf572632 100644 --- a/arch/mips/include/asm/mips-gic.h +++ b/arch/mips/include/asm/mips-gic.h @@ -314,4 +314,34 @@ static inline bool mips_gic_present(void) return IS_ENABLED(CONFIG_MIPS_GIC) && mips_gic_base; } +/** + * gic_get_c0_compare_int() - Return cp0 count/compare interrupt virq + * + * Determine the virq number to use for the coprocessor 0 count/compare + * interrupt, which may be routed via the GIC. + * + * Returns the virq number or a negative error number. 
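A typical caller now only needs the new header; this is roughly what arch/mips/ralink/irq-gic.c (updated below) amounts to:

	#include <asm/mips-gic.h>	/* now carries the declarations */

	int get_c0_perfcount_int(void)
	{
		return gic_get_c0_perfcount_int();
	}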
+ */ +extern int gic_get_c0_compare_int(void); + +/** + * gic_get_c0_perfcount_int() - Return performance counter interrupt virq + * + * Determine the virq number to use for CPU performance counter interrupts, + * which may be routed via the GIC. + * + * Returns the virq number or a negative error number. + */ +extern int gic_get_c0_perfcount_int(void); + +/** + * gic_get_c0_fdc_int() - Return fast debug channel interrupt virq + * + * Determine the virq number to use for fast debug channel (FDC) interrupts, + * which may be routed via the GIC. + * + * Returns the virq number or a negative error number. + */ +extern int gic_get_c0_fdc_int(void); + #endif /* __MIPS_ASM_MIPS_CPS_H__ */ diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 7d53103b085f..66c866740ff2 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/arch/mips/pistachio/time.c b/arch/mips/pistachio/time.c index 17a0f1dec05b..8a6af9b76202 100644 --- a/arch/mips/pistachio/time.c +++ b/arch/mips/pistachio/time.c @@ -12,9 +12,9 @@ #include #include #include -#include #include +#include #include unsigned int get_c0_compare_int(void) diff --git a/arch/mips/ralink/irq-gic.c b/arch/mips/ralink/irq-gic.c index 2058280450b5..bda576f2cad8 100644 --- a/arch/mips/ralink/irq-gic.c +++ b/arch/mips/ralink/irq-gic.c @@ -11,7 +11,7 @@ #include #include -#include +#include int get_c0_perfcount_int(void) { diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index b444bef6d3c2..bbf39dcfeda4 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 113010c8dd8b..277d5be03a57 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -15,12 +15,4 @@ #define GIC_UMV_SH_COUNTER_31_00_OFS 0x0000 #define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 -#ifdef CONFIG_MIPS_GIC - -extern int gic_get_c0_compare_int(void); -extern int gic_get_c0_perfcount_int(void); -extern int gic_get_c0_fdc_int(void); - -#endif /* CONFIG_MIPS_GIC */ - #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From 16ae123e89d603a69d980bd76c4bb686f219ba0e Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:37 -0700 Subject: MIPS: VDSO: Avoid use of linux/irqchip/mips-gic.h Our VDSO code makes use of macros from linux/irqchip/mips-gic.h to provide offsets to register values, but these are trivial offsets to the two 32 bit halves of a 64 bit value. Replace use of the macros with zero (ie. omit adding an offset) and the size of the low 32 bit of the value. This removes our need for linux/irqchip/mips-gic.h & prepares us for it to be removed. 
Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17047/ Signed-off-by: Ralf Baechle --- arch/mips/vdso/gettimeofday.c | 7 +++---- include/linux/irqchip/mips-gic.h | 4 ---- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/mips/vdso/gettimeofday.c b/arch/mips/vdso/gettimeofday.c index fec7835b9de7..e22b422f282c 100644 --- a/arch/mips/vdso/gettimeofday.c +++ b/arch/mips/vdso/gettimeofday.c @@ -11,7 +11,6 @@ #include "vdso.h" #include -#include #include #include @@ -125,9 +124,9 @@ static __always_inline u64 read_gic_count(const union mips_vdso_data *data) u32 hi, hi2, lo; do { - hi = __raw_readl(gic + GIC_UMV_SH_COUNTER_63_32_OFS); - lo = __raw_readl(gic + GIC_UMV_SH_COUNTER_31_00_OFS); - hi2 = __raw_readl(gic + GIC_UMV_SH_COUNTER_63_32_OFS); + hi = __raw_readl(gic + sizeof(lo)); + lo = __raw_readl(gic); + hi2 = __raw_readl(gic + sizeof(lo)); } while (hi2 != hi); return (((u64)hi) << 32) + lo; diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 277d5be03a57..6e6c9adea049 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -11,8 +11,4 @@ #include #include -/* User Mode Visible Section Register Map */ -#define GIC_UMV_SH_COUNTER_31_00_OFS 0x0000 -#define GIC_UMV_SH_COUNTER_63_32_OFS 0x0004 - #endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From 1fad12cd5ef43755d028f760fa089c87f5533451 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Sat, 12 Aug 2017 21:36:39 -0700 Subject: irqchip: mips-gic: Remove linux/irqchip/mips-gic.h The linux/irqchip/mips-gic.h header is no longer used. Remove it. Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Jason Cooper Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17049/ Signed-off-by: Ralf Baechle --- include/linux/irqchip/mips-gic.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 include/linux/irqchip/mips-gic.h (limited to 'include') diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h deleted file mode 100644 index 6e6c9adea049..000000000000 --- a/include/linux/irqchip/mips-gic.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2000, 07 MIPS Technologies, Inc. - */ -#ifndef __LINUX_IRQCHIP_MIPS_GIC_H -#define __LINUX_IRQCHIP_MIPS_GIC_H - -#include -#include - -#endif /* __LINUX_IRQCHIP_MIPS_GIC_H */ -- cgit v1.2.3 From a205425658dead19bb1b8ac00584aed98e60dde2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Aug 2017 18:34:24 +0200 Subject: mfd: twl: Move header file out of I2C realm include/linux/i2c is not for client devices. Move the header file to a more appropriate location. 
Signed-off-by: Wolfram Sang Acked-by: Greg Kroah-Hartman Acked-by: Alexandre Belloni Acked-by: Mark Brown Acked-by: Sebastian Reichel Acked-by: Jonathan Cameron Acked-by: Dmitry Torokhov Acked-by: Kishon Vijay Abraham I Acked-by: Bartlomiej Zolnierkiewicz Acked-by: Thierry Reding Acked-by: Tony Lindgren Acked-by: Daniel Thompson Acked-by: Linus Walleij Acked-by: Guenter Roeck Signed-off-by: Lee Jones --- arch/arm/mach-omap2/common.h | 2 +- arch/arm/mach-omap2/omap_twl.c | 2 +- drivers/gpio/gpio-twl4030.c | 2 +- drivers/iio/adc/twl4030-madc.c | 2 +- drivers/iio/adc/twl6030-gpadc.c | 2 +- drivers/input/keyboard/twl4030_keypad.c | 2 +- drivers/input/misc/twl4030-pwrbutton.c | 2 +- drivers/input/misc/twl4030-vibra.c | 2 +- drivers/mfd/twl-core.c | 6 +- drivers/mfd/twl4030-audio.c | 2 +- drivers/mfd/twl4030-irq.c | 2 +- drivers/mfd/twl4030-power.c | 2 +- drivers/mfd/twl6030-irq.c | 2 +- drivers/phy/ti/phy-twl4030-usb.c | 2 +- drivers/power/supply/twl4030_charger.c | 2 +- drivers/pwm/pwm-twl-led.c | 2 +- drivers/pwm/pwm-twl.c | 2 +- drivers/regulator/twl-regulator.c | 2 +- drivers/regulator/twl6030-regulator.c | 2 +- drivers/rtc/rtc-twl.c | 2 +- drivers/usb/phy/phy-twl6030-usb.c | 2 +- drivers/video/backlight/pandora_bl.c | 2 +- drivers/watchdog/twl4030_wdt.c | 2 +- include/linux/i2c/twl.h | 876 -------------------------------- include/linux/mfd/twl.h | 876 ++++++++++++++++++++++++++++++++ sound/soc/codecs/twl4030.c | 2 +- 26 files changed, 902 insertions(+), 902 deletions(-) delete mode 100644 include/linux/i2c/twl.h create mode 100644 include/linux/mfd/twl.h (limited to 'include') diff --git a/arch/arm/mach-omap2/common.h b/arch/arm/mach-omap2/common.h index 8cc6338fcb12..b5ad7fcb80ed 100644 --- a/arch/arm/mach-omap2/common.h +++ b/arch/arm/mach-omap2/common.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap2/omap_twl.c b/arch/arm/mach-omap2/omap_twl.c index 1346b3ab34a5..295124b248ae 100644 --- a/arch/arm/mach-omap2/omap_twl.c +++ b/arch/arm/mach-omap2/omap_twl.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include "soc.h" #include "voltage.h" diff --git a/drivers/gpio/gpio-twl4030.c b/drivers/gpio/gpio-twl4030.c index 24f388ed46d4..9b511df5450e 100644 --- a/drivers/gpio/gpio-twl4030.c +++ b/drivers/gpio/gpio-twl4030.c @@ -35,7 +35,7 @@ #include #include -#include +#include /* * The GPIO "subchip" supports 18 GPIOs which can be configured as diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index bd3d37fc2144..1edd99f0c5e5 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/iio/adc/twl6030-gpadc.c b/drivers/iio/adc/twl6030-gpadc.c index becbb0aef232..bc0e60b9da45 100644 --- a/drivers/iio/adc/twl6030-gpadc.c +++ b/drivers/iio/adc/twl6030-gpadc.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/input/keyboard/twl4030_keypad.c b/drivers/input/keyboard/twl4030_keypad.c index 39e72b3219d8..f9f98ef1d98e 100644 --- a/drivers/input/keyboard/twl4030_keypad.c +++ b/drivers/input/keyboard/twl4030_keypad.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/input/misc/twl4030-pwrbutton.c b/drivers/input/misc/twl4030-pwrbutton.c index 1c13005b228f..b307cca17022 100644 --- a/drivers/input/misc/twl4030-pwrbutton.c +++ 
b/drivers/input/misc/twl4030-pwrbutton.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #define PWR_PWRON_IRQ (1 << 0) diff --git a/drivers/input/misc/twl4030-vibra.c b/drivers/input/misc/twl4030-vibra.c index caa5a62c42fb..6c51d404874b 100644 --- a/drivers/input/misc/twl4030-vibra.c +++ b/drivers/input/misc/twl4030-vibra.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c index c64615dca2bd..2a09dde4ca6e 100644 --- a/drivers/mfd/twl-core.c +++ b/drivers/mfd/twl-core.c @@ -44,7 +44,7 @@ #include #include -#include +#include /* Register descriptions for audio */ #include @@ -173,7 +173,7 @@ static struct twl_private *twl_priv; static struct twl_mapping twl4030_map[] = { /* * NOTE: don't change this table without updating the - * defines for TWL4030_MODULE_* + * defines for TWL4030_MODULE_* * so they continue to match the order in this table. */ @@ -344,7 +344,7 @@ static const struct regmap_config twl4030_regmap_config[4] = { static struct twl_mapping twl6030_map[] = { /* * NOTE: don't change this table without updating the - * defines for TWL4030_MODULE_* + * defines for TWL4030_MODULE_* * so they continue to match the order in this table. */ diff --git a/drivers/mfd/twl4030-audio.c b/drivers/mfd/twl4030-audio.c index 0a1606480023..da16bf45fab4 100644 --- a/drivers/mfd/twl4030-audio.c +++ b/drivers/mfd/twl4030-audio.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c index 378c02d43bf7..b16c16f194fd 100644 --- a/drivers/mfd/twl4030-irq.c +++ b/drivers/mfd/twl4030-irq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include "twl-core.h" diff --git a/drivers/mfd/twl4030-power.c b/drivers/mfd/twl4030-power.c index f4b2c29d77e3..6b36932263ba 100644 --- a/drivers/mfd/twl4030-power.c +++ b/drivers/mfd/twl4030-power.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/mfd/twl6030-irq.c b/drivers/mfd/twl6030-irq.c index 53574508a613..e3ec8dfa9f1e 100644 --- a/drivers/mfd/twl6030-irq.c +++ b/drivers/mfd/twl6030-irq.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/phy/ti/phy-twl4030-usb.c b/drivers/phy/ti/phy-twl4030-usb.c index 2990b3965460..28f49902760d 100644 --- a/drivers/phy/ti/phy-twl4030-usb.c +++ b/drivers/phy/ti/phy-twl4030-usb.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/power/supply/twl4030_charger.c b/drivers/power/supply/twl4030_charger.c index 9dff1b4b85fc..a5915f498eea 100644 --- a/drivers/power/supply/twl4030_charger.c +++ b/drivers/power/supply/twl4030_charger.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c index 21eff991d0e3..01153622778b 100644 --- a/drivers/pwm/pwm-twl-led.c +++ b/drivers/pwm/pwm-twl-led.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c index 9de617b76680..b7a45be99815 100644 --- a/drivers/pwm/pwm-twl.c +++ b/drivers/pwm/pwm-twl.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include /* diff --git a/drivers/regulator/twl-regulator.c b/drivers/regulator/twl-regulator.c index 6c9ec84121bd..a4456db5849d 100644 --- 
a/drivers/regulator/twl-regulator.c +++ b/drivers/regulator/twl-regulator.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include /* diff --git a/drivers/regulator/twl6030-regulator.c b/drivers/regulator/twl6030-regulator.c index 56aada387887..219cbd910dbf 100644 --- a/drivers/regulator/twl6030-regulator.c +++ b/drivers/regulator/twl6030-regulator.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include struct twlreg_info { diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index c18c39212ce6..3472e79f2b17 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -31,7 +31,7 @@ #include #include -#include +#include enum twl_class { TWL_4030 = 0, diff --git a/drivers/usb/phy/phy-twl6030-usb.c b/drivers/usb/phy/phy-twl6030-usb.c index 628b600b02b1..b5dc077ed7d3 100644 --- a/drivers/usb/phy/phy-twl6030-usb.c +++ b/drivers/usb/phy/phy-twl6030-usb.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/video/backlight/pandora_bl.c b/drivers/video/backlight/pandora_bl.c index 5d8bb8b20183..a186bc677c7d 100644 --- a/drivers/video/backlight/pandora_bl.c +++ b/drivers/video/backlight/pandora_bl.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #define TWL_PWM0_ON 0x00 diff --git a/drivers/watchdog/twl4030_wdt.c b/drivers/watchdog/twl4030_wdt.c index 9bf3cc0f3961..569fe85e52da 100644 --- a/drivers/watchdog/twl4030_wdt.c +++ b/drivers/watchdog/twl4030_wdt.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #define TWL4030_WATCHDOG_CFG_REG_OFFS 0x3 diff --git a/include/linux/i2c/twl.h b/include/linux/i2c/twl.h deleted file mode 100644 index 9ad7828d9d34..000000000000 --- a/include/linux/i2c/twl.h +++ /dev/null @@ -1,876 +0,0 @@ -/* - * twl4030.h - header for TWL4030 PM and audio CODEC device - * - * Copyright (C) 2005-2006 Texas Instruments, Inc. - * - * Based on tlv320aic23.c: - * Copyright (c) by Kai Svahn - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#ifndef __TWL_H_ -#define __TWL_H_ - -#include -#include - -/* - * Using the twl4030 core we address registers using a pair - * { module id, relative register offset } - * which that core then maps to the relevant - * { i2c slave, absolute register address } - * - * The module IDs are meaningful only to the twl4030 core code, - * which uses them as array indices to look up the first register - * address each module uses within a given i2c slave. 
- */ - -/* Module IDs for similar functionalities found in twl4030/twl6030 */ -enum twl_module_ids { - TWL_MODULE_USB, - TWL_MODULE_PIH, - TWL_MODULE_MAIN_CHARGE, - TWL_MODULE_PM_MASTER, - TWL_MODULE_PM_RECEIVER, - - TWL_MODULE_RTC, - TWL_MODULE_PWM, - TWL_MODULE_LED, - TWL_MODULE_SECURED_REG, - - TWL_MODULE_LAST, -}; - -/* Modules only available in twl4030 series */ -enum twl4030_module_ids { - TWL4030_MODULE_AUDIO_VOICE = TWL_MODULE_LAST, - TWL4030_MODULE_GPIO, - TWL4030_MODULE_INTBR, - TWL4030_MODULE_TEST, - TWL4030_MODULE_KEYPAD, - - TWL4030_MODULE_MADC, - TWL4030_MODULE_INTERRUPTS, - TWL4030_MODULE_PRECHARGE, - TWL4030_MODULE_BACKUP, - TWL4030_MODULE_INT, - - TWL5031_MODULE_ACCESSORY, - TWL5031_MODULE_INTERRUPTS, - - TWL4030_MODULE_LAST, -}; - -/* Modules only available in twl6030 series */ -enum twl6030_module_ids { - TWL6030_MODULE_ID0 = TWL_MODULE_LAST, - TWL6030_MODULE_ID1, - TWL6030_MODULE_ID2, - TWL6030_MODULE_GPADC, - TWL6030_MODULE_GASGAUGE, - - TWL6030_MODULE_LAST, -}; - -/* Until the clients has been converted to use TWL_MODULE_LED */ -#define TWL4030_MODULE_LED TWL_MODULE_LED - -#define GPIO_INTR_OFFSET 0 -#define KEYPAD_INTR_OFFSET 1 -#define BCI_INTR_OFFSET 2 -#define MADC_INTR_OFFSET 3 -#define USB_INTR_OFFSET 4 -#define CHARGERFAULT_INTR_OFFSET 5 -#define BCI_PRES_INTR_OFFSET 9 -#define USB_PRES_INTR_OFFSET 10 -#define RTC_INTR_OFFSET 11 - -/* - * Offset from TWL6030_IRQ_BASE / pdata->irq_base - */ -#define PWR_INTR_OFFSET 0 -#define HOTDIE_INTR_OFFSET 12 -#define SMPSLDO_INTR_OFFSET 13 -#define BATDETECT_INTR_OFFSET 14 -#define SIMDETECT_INTR_OFFSET 15 -#define MMCDETECT_INTR_OFFSET 16 -#define GASGAUGE_INTR_OFFSET 17 -#define USBOTG_INTR_OFFSET 4 -#define CHARGER_INTR_OFFSET 2 -#define RSV_INTR_OFFSET 0 - -/* INT register offsets */ -#define REG_INT_STS_A 0x00 -#define REG_INT_STS_B 0x01 -#define REG_INT_STS_C 0x02 - -#define REG_INT_MSK_LINE_A 0x03 -#define REG_INT_MSK_LINE_B 0x04 -#define REG_INT_MSK_LINE_C 0x05 - -#define REG_INT_MSK_STS_A 0x06 -#define REG_INT_MSK_STS_B 0x07 -#define REG_INT_MSK_STS_C 0x08 - -/* MASK INT REG GROUP A */ -#define TWL6030_PWR_INT_MASK 0x07 -#define TWL6030_RTC_INT_MASK 0x18 -#define TWL6030_HOTDIE_INT_MASK 0x20 -#define TWL6030_SMPSLDOA_INT_MASK 0xC0 - -/* MASK INT REG GROUP B */ -#define TWL6030_SMPSLDOB_INT_MASK 0x01 -#define TWL6030_BATDETECT_INT_MASK 0x02 -#define TWL6030_SIMDETECT_INT_MASK 0x04 -#define TWL6030_MMCDETECT_INT_MASK 0x08 -#define TWL6030_GPADC_INT_MASK 0x60 -#define TWL6030_GASGAUGE_INT_MASK 0x80 - -/* MASK INT REG GROUP C */ -#define TWL6030_USBOTG_INT_MASK 0x0F -#define TWL6030_CHARGER_CTRL_INT_MASK 0x10 -#define TWL6030_CHARGER_FAULT_INT_MASK 0x60 - -#define TWL6030_MMCCTRL 0xEE -#define VMMC_AUTO_OFF (0x1 << 3) -#define SW_FC (0x1 << 2) -#define STS_MMC 0x1 - -#define TWL6030_CFG_INPUT_PUPD3 0xF2 -#define MMC_PU (0x1 << 3) -#define MMC_PD (0x1 << 2) - -#define TWL_SIL_TYPE(rev) ((rev) & 0x00FFFFFF) -#define TWL_SIL_REV(rev) ((rev) >> 24) -#define TWL_SIL_5030 0x09002F -#define TWL5030_REV_1_0 0x00 -#define TWL5030_REV_1_1 0x10 -#define TWL5030_REV_1_2 0x30 - -#define TWL4030_CLASS_ID 0x4030 -#define TWL6030_CLASS_ID 0x6030 -unsigned int twl_rev(void); -#define GET_TWL_REV (twl_rev()) -#define TWL_CLASS_IS(class, id) \ -static inline int twl_class_is_ ##class(void) \ -{ \ - return ((id) == (GET_TWL_REV)) ? 
1 : 0; \ -} - -TWL_CLASS_IS(4030, TWL4030_CLASS_ID) -TWL_CLASS_IS(6030, TWL6030_CLASS_ID) - -/* Set the regcache bypass for the regmap associated with the nodule */ -int twl_set_regcache_bypass(u8 mod_no, bool enable); - -/* - * Read and write several 8-bit registers at once. - */ -int twl_i2c_write(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes); -int twl_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes); - -/* - * Read and write single 8-bit registers - */ -static inline int twl_i2c_write_u8(u8 mod_no, u8 val, u8 reg) { - return twl_i2c_write(mod_no, &val, reg, 1); -} - -static inline int twl_i2c_read_u8(u8 mod_no, u8 *val, u8 reg) { - return twl_i2c_read(mod_no, val, reg, 1); -} - -static inline int twl_i2c_write_u16(u8 mod_no, u16 val, u8 reg) { - val = cpu_to_le16(val); - return twl_i2c_write(mod_no, (u8*) &val, reg, 2); -} - -static inline int twl_i2c_read_u16(u8 mod_no, u16 *val, u8 reg) { - int ret; - ret = twl_i2c_read(mod_no, (u8*) val, reg, 2); - *val = le16_to_cpu(*val); - return ret; -} - -int twl_get_type(void); -int twl_get_version(void); -int twl_get_hfclk_rate(void); - -int twl6030_interrupt_unmask(u8 bit_mask, u8 offset); -int twl6030_interrupt_mask(u8 bit_mask, u8 offset); - -/* Card detect Configuration for MMC1 Controller on OMAP4 */ -#ifdef CONFIG_TWL4030_CORE -int twl6030_mmc_card_detect_config(void); -#else -static inline int twl6030_mmc_card_detect_config(void) -{ - pr_debug("twl6030_mmc_card_detect_config not supported\n"); - return 0; -} -#endif - -/* MMC1 Controller on OMAP4 uses Phoenix irq for Card detect */ -#ifdef CONFIG_TWL4030_CORE -int twl6030_mmc_card_detect(struct device *dev, int slot); -#else -static inline int twl6030_mmc_card_detect(struct device *dev, int slot) -{ - pr_debug("Call back twl6030_mmc_card_detect not supported\n"); - return -EIO; -} -#endif -/*----------------------------------------------------------------------*/ - -/* - * NOTE: at up to 1024 registers, this is a big chip. - * - * Avoid putting register declarations in this file, instead of into - * a driver-private file, unless some of the registers in a block - * need to be shared with other drivers. One example is blocks that - * have Secondary IRQ Handler (SIH) registers. 
- */ - -#define TWL4030_SIH_CTRL_EXCLEN_MASK BIT(0) -#define TWL4030_SIH_CTRL_PENDDIS_MASK BIT(1) -#define TWL4030_SIH_CTRL_COR_MASK BIT(2) - -/*----------------------------------------------------------------------*/ - -/* - * GPIO Block Register offsets (use TWL4030_MODULE_GPIO) - */ - -#define REG_GPIODATAIN1 0x0 -#define REG_GPIODATAIN2 0x1 -#define REG_GPIODATAIN3 0x2 -#define REG_GPIODATADIR1 0x3 -#define REG_GPIODATADIR2 0x4 -#define REG_GPIODATADIR3 0x5 -#define REG_GPIODATAOUT1 0x6 -#define REG_GPIODATAOUT2 0x7 -#define REG_GPIODATAOUT3 0x8 -#define REG_CLEARGPIODATAOUT1 0x9 -#define REG_CLEARGPIODATAOUT2 0xA -#define REG_CLEARGPIODATAOUT3 0xB -#define REG_SETGPIODATAOUT1 0xC -#define REG_SETGPIODATAOUT2 0xD -#define REG_SETGPIODATAOUT3 0xE -#define REG_GPIO_DEBEN1 0xF -#define REG_GPIO_DEBEN2 0x10 -#define REG_GPIO_DEBEN3 0x11 -#define REG_GPIO_CTRL 0x12 -#define REG_GPIOPUPDCTR1 0x13 -#define REG_GPIOPUPDCTR2 0x14 -#define REG_GPIOPUPDCTR3 0x15 -#define REG_GPIOPUPDCTR4 0x16 -#define REG_GPIOPUPDCTR5 0x17 -#define REG_GPIO_ISR1A 0x19 -#define REG_GPIO_ISR2A 0x1A -#define REG_GPIO_ISR3A 0x1B -#define REG_GPIO_IMR1A 0x1C -#define REG_GPIO_IMR2A 0x1D -#define REG_GPIO_IMR3A 0x1E -#define REG_GPIO_ISR1B 0x1F -#define REG_GPIO_ISR2B 0x20 -#define REG_GPIO_ISR3B 0x21 -#define REG_GPIO_IMR1B 0x22 -#define REG_GPIO_IMR2B 0x23 -#define REG_GPIO_IMR3B 0x24 -#define REG_GPIO_EDR1 0x28 -#define REG_GPIO_EDR2 0x29 -#define REG_GPIO_EDR3 0x2A -#define REG_GPIO_EDR4 0x2B -#define REG_GPIO_EDR5 0x2C -#define REG_GPIO_SIH_CTRL 0x2D - -/* Up to 18 signals are available as GPIOs, when their - * pins are not assigned to another use (such as ULPI/USB). - */ -#define TWL4030_GPIO_MAX 18 - -/*----------------------------------------------------------------------*/ - -/*Interface Bit Register (INTBR) offsets - *(Use TWL_4030_MODULE_INTBR) - */ - -#define REG_IDCODE_7_0 0x00 -#define REG_IDCODE_15_8 0x01 -#define REG_IDCODE_16_23 0x02 -#define REG_IDCODE_31_24 0x03 -#define REG_GPPUPDCTR1 0x0F -#define REG_UNLOCK_TEST_REG 0x12 - -/*I2C1 and I2C4(SR) SDA/SCL pull-up control bits */ - -#define I2C_SCL_CTRL_PU BIT(0) -#define I2C_SDA_CTRL_PU BIT(2) -#define SR_I2C_SCL_CTRL_PU BIT(4) -#define SR_I2C_SDA_CTRL_PU BIT(6) - -#define TWL_EEPROM_R_UNLOCK 0x49 - -/*----------------------------------------------------------------------*/ - -/* - * Keypad register offsets (use TWL4030_MODULE_KEYPAD) - * ... SIH/interrupt only - */ - -#define TWL4030_KEYPAD_KEYP_ISR1 0x11 -#define TWL4030_KEYPAD_KEYP_IMR1 0x12 -#define TWL4030_KEYPAD_KEYP_ISR2 0x13 -#define TWL4030_KEYPAD_KEYP_IMR2 0x14 -#define TWL4030_KEYPAD_KEYP_SIR 0x15 /* test register */ -#define TWL4030_KEYPAD_KEYP_EDR 0x16 -#define TWL4030_KEYPAD_KEYP_SIH_CTRL 0x17 - -/*----------------------------------------------------------------------*/ - -/* - * Multichannel ADC register offsets (use TWL4030_MODULE_MADC) - * ... 
SIH/interrupt only - */ - -#define TWL4030_MADC_ISR1 0x61 -#define TWL4030_MADC_IMR1 0x62 -#define TWL4030_MADC_ISR2 0x63 -#define TWL4030_MADC_IMR2 0x64 -#define TWL4030_MADC_SIR 0x65 /* test register */ -#define TWL4030_MADC_EDR 0x66 -#define TWL4030_MADC_SIH_CTRL 0x67 - -/*----------------------------------------------------------------------*/ - -/* - * Battery charger register offsets (use TWL4030_MODULE_INTERRUPTS) - */ - -#define TWL4030_INTERRUPTS_BCIISR1A 0x0 -#define TWL4030_INTERRUPTS_BCIISR2A 0x1 -#define TWL4030_INTERRUPTS_BCIIMR1A 0x2 -#define TWL4030_INTERRUPTS_BCIIMR2A 0x3 -#define TWL4030_INTERRUPTS_BCIISR1B 0x4 -#define TWL4030_INTERRUPTS_BCIISR2B 0x5 -#define TWL4030_INTERRUPTS_BCIIMR1B 0x6 -#define TWL4030_INTERRUPTS_BCIIMR2B 0x7 -#define TWL4030_INTERRUPTS_BCISIR1 0x8 /* test register */ -#define TWL4030_INTERRUPTS_BCISIR2 0x9 /* test register */ -#define TWL4030_INTERRUPTS_BCIEDR1 0xa -#define TWL4030_INTERRUPTS_BCIEDR2 0xb -#define TWL4030_INTERRUPTS_BCIEDR3 0xc -#define TWL4030_INTERRUPTS_BCISIHCTRL 0xd - -/*----------------------------------------------------------------------*/ - -/* - * Power Interrupt block register offsets (use TWL4030_MODULE_INT) - */ - -#define TWL4030_INT_PWR_ISR1 0x0 -#define TWL4030_INT_PWR_IMR1 0x1 -#define TWL4030_INT_PWR_ISR2 0x2 -#define TWL4030_INT_PWR_IMR2 0x3 -#define TWL4030_INT_PWR_SIR 0x4 /* test register */ -#define TWL4030_INT_PWR_EDR1 0x5 -#define TWL4030_INT_PWR_EDR2 0x6 -#define TWL4030_INT_PWR_SIH_CTRL 0x7 - -/*----------------------------------------------------------------------*/ - -/* - * Accessory Interrupts - */ -#define TWL5031_ACIIMR_LSB 0x05 -#define TWL5031_ACIIMR_MSB 0x06 -#define TWL5031_ACIIDR_LSB 0x07 -#define TWL5031_ACIIDR_MSB 0x08 -#define TWL5031_ACCISR1 0x0F -#define TWL5031_ACCIMR1 0x10 -#define TWL5031_ACCISR2 0x11 -#define TWL5031_ACCIMR2 0x12 -#define TWL5031_ACCSIR 0x13 -#define TWL5031_ACCEDR1 0x14 -#define TWL5031_ACCSIHCTRL 0x15 - -/*----------------------------------------------------------------------*/ - -/* - * Battery Charger Controller - */ - -#define TWL5031_INTERRUPTS_BCIISR1 0x0 -#define TWL5031_INTERRUPTS_BCIIMR1 0x1 -#define TWL5031_INTERRUPTS_BCIISR2 0x2 -#define TWL5031_INTERRUPTS_BCIIMR2 0x3 -#define TWL5031_INTERRUPTS_BCISIR 0x4 -#define TWL5031_INTERRUPTS_BCIEDR1 0x5 -#define TWL5031_INTERRUPTS_BCIEDR2 0x6 -#define TWL5031_INTERRUPTS_BCISIHCTRL 0x7 - -/*----------------------------------------------------------------------*/ - -/* - * PM Master module register offsets (use TWL4030_MODULE_PM_MASTER) - */ - -#define TWL4030_PM_MASTER_CFG_P1_TRANSITION 0x00 -#define TWL4030_PM_MASTER_CFG_P2_TRANSITION 0x01 -#define TWL4030_PM_MASTER_CFG_P3_TRANSITION 0x02 -#define TWL4030_PM_MASTER_CFG_P123_TRANSITION 0x03 -#define TWL4030_PM_MASTER_STS_BOOT 0x04 -#define TWL4030_PM_MASTER_CFG_BOOT 0x05 -#define TWL4030_PM_MASTER_SHUNDAN 0x06 -#define TWL4030_PM_MASTER_BOOT_BCI 0x07 -#define TWL4030_PM_MASTER_CFG_PWRANA1 0x08 -#define TWL4030_PM_MASTER_CFG_PWRANA2 0x09 -#define TWL4030_PM_MASTER_BACKUP_MISC_STS 0x0b -#define TWL4030_PM_MASTER_BACKUP_MISC_CFG 0x0c -#define TWL4030_PM_MASTER_BACKUP_MISC_TST 0x0d -#define TWL4030_PM_MASTER_PROTECT_KEY 0x0e -#define TWL4030_PM_MASTER_STS_HW_CONDITIONS 0x0f -#define TWL4030_PM_MASTER_P1_SW_EVENTS 0x10 -#define TWL4030_PM_MASTER_P2_SW_EVENTS 0x11 -#define TWL4030_PM_MASTER_P3_SW_EVENTS 0x12 -#define TWL4030_PM_MASTER_STS_P123_STATE 0x13 -#define TWL4030_PM_MASTER_PB_CFG 0x14 -#define TWL4030_PM_MASTER_PB_WORD_MSB 0x15 -#define TWL4030_PM_MASTER_PB_WORD_LSB 
0x16 -#define TWL4030_PM_MASTER_SEQ_ADD_W2P 0x1c -#define TWL4030_PM_MASTER_SEQ_ADD_P2A 0x1d -#define TWL4030_PM_MASTER_SEQ_ADD_A2W 0x1e -#define TWL4030_PM_MASTER_SEQ_ADD_A2S 0x1f -#define TWL4030_PM_MASTER_SEQ_ADD_S2A12 0x20 -#define TWL4030_PM_MASTER_SEQ_ADD_S2A3 0x21 -#define TWL4030_PM_MASTER_SEQ_ADD_WARM 0x22 -#define TWL4030_PM_MASTER_MEMORY_ADDRESS 0x23 -#define TWL4030_PM_MASTER_MEMORY_DATA 0x24 - -#define TWL4030_PM_MASTER_KEY_CFG1 0xc0 -#define TWL4030_PM_MASTER_KEY_CFG2 0x0c - -#define TWL4030_PM_MASTER_KEY_TST1 0xe0 -#define TWL4030_PM_MASTER_KEY_TST2 0x0e - -#define TWL4030_PM_MASTER_GLOBAL_TST 0xb6 - -/*----------------------------------------------------------------------*/ - -/* Power bus message definitions */ - -/* The TWL4030/5030 splits its power-management resources (the various - * regulators, clock and reset lines) into 3 processor groups - P1, P2 and - * P3. These groups can then be configured to transition between sleep, wait-on - * and active states by sending messages to the power bus. See Section 5.4.2 - * Power Resources of TWL4030 TRM - */ - -/* Processor groups */ -#define DEV_GRP_NULL 0x0 -#define DEV_GRP_P1 0x1 /* P1: all OMAP devices */ -#define DEV_GRP_P2 0x2 /* P2: all Modem devices */ -#define DEV_GRP_P3 0x4 /* P3: all peripheral devices */ - -/* Resource groups */ -#define RES_GRP_RES 0x0 /* Reserved */ -#define RES_GRP_PP 0x1 /* Power providers */ -#define RES_GRP_RC 0x2 /* Reset and control */ -#define RES_GRP_PP_RC 0x3 -#define RES_GRP_PR 0x4 /* Power references */ -#define RES_GRP_PP_PR 0x5 -#define RES_GRP_RC_PR 0x6 -#define RES_GRP_ALL 0x7 /* All resource groups */ - -#define RES_TYPE2_R0 0x0 -#define RES_TYPE2_R1 0x1 -#define RES_TYPE2_R2 0x2 - -#define RES_TYPE_R0 0x0 -#define RES_TYPE_ALL 0x7 - -/* Resource states */ -#define RES_STATE_WRST 0xF -#define RES_STATE_ACTIVE 0xE -#define RES_STATE_SLEEP 0x8 -#define RES_STATE_OFF 0x0 - -/* Power resources */ - -/* Power providers */ -#define RES_VAUX1 1 -#define RES_VAUX2 2 -#define RES_VAUX3 3 -#define RES_VAUX4 4 -#define RES_VMMC1 5 -#define RES_VMMC2 6 -#define RES_VPLL1 7 -#define RES_VPLL2 8 -#define RES_VSIM 9 -#define RES_VDAC 10 -#define RES_VINTANA1 11 -#define RES_VINTANA2 12 -#define RES_VINTDIG 13 -#define RES_VIO 14 -#define RES_VDD1 15 -#define RES_VDD2 16 -#define RES_VUSB_1V5 17 -#define RES_VUSB_1V8 18 -#define RES_VUSB_3V1 19 -#define RES_VUSBCP 20 -#define RES_REGEN 21 -/* Reset and control */ -#define RES_NRES_PWRON 22 -#define RES_CLKEN 23 -#define RES_SYSEN 24 -#define RES_HFCLKOUT 25 -#define RES_32KCLKOUT 26 -#define RES_RESET 27 -/* Power Reference */ -#define RES_MAIN_REF 28 - -#define TOTAL_RESOURCES 28 -/* - * Power Bus Message Format ... these can be sent individually by Linux, - * but are usually part of downloaded scripts that are run when various - * power events are triggered. 
- * - * Broadcast Message (16 Bits): - * DEV_GRP[15:13] MT[12] RES_GRP[11:9] RES_TYPE2[8:7] RES_TYPE[6:4] - * RES_STATE[3:0] - * - * Singular Message (16 Bits): - * DEV_GRP[15:13] MT[12] RES_ID[11:4] RES_STATE[3:0] - */ - -#define MSG_BROADCAST(devgrp, grp, type, type2, state) \ - ( (devgrp) << 13 | 1 << 12 | (grp) << 9 | (type2) << 7 \ - | (type) << 4 | (state)) - -#define MSG_SINGULAR(devgrp, id, state) \ - ((devgrp) << 13 | 0 << 12 | (id) << 4 | (state)) - -#define MSG_BROADCAST_ALL(devgrp, state) \ - ((devgrp) << 5 | (state)) - -#define MSG_BROADCAST_REF MSG_BROADCAST_ALL -#define MSG_BROADCAST_PROV MSG_BROADCAST_ALL -#define MSG_BROADCAST__CLK_RST MSG_BROADCAST_ALL -/*----------------------------------------------------------------------*/ - -struct twl4030_clock_init_data { - bool ck32k_lowpwr_enable; -}; - -struct twl4030_bci_platform_data { - int *battery_tmp_tbl; - unsigned int tblsize; - int bb_uvolt; /* voltage to charge backup battery */ - int bb_uamp; /* current for backup battery charging */ -}; - -/* TWL4030_GPIO_MAX (18) GPIOs, with interrupts */ -struct twl4030_gpio_platform_data { - /* package the two LED signals as output-only GPIOs? */ - bool use_leds; - - /* gpio-n should control VMMC(n+1) if BIT(n) in mmc_cd is set */ - u8 mmc_cd; - - /* if BIT(N) is set, or VMMC(n+1) is linked, debounce GPIO-N */ - u32 debounce; - - /* For gpio-N, bit (1 << N) in "pullups" is set if that pullup - * should be enabled. Else, if that bit is set in "pulldowns", - * that pulldown is enabled. Don't waste power by letting any - * digital inputs float... - */ - u32 pullups; - u32 pulldowns; - - int (*setup)(struct device *dev, - unsigned gpio, unsigned ngpio); - int (*teardown)(struct device *dev, - unsigned gpio, unsigned ngpio); -}; - -struct twl4030_madc_platform_data { - int irq_line; -}; - -/* Boards have unique mappings of {row, col} --> keycode. - * Column and row are 8 bits each, but range only from 0..7. - * a PERSISTENT_KEY is "always on" and never reported. 
- */ -#define PERSISTENT_KEY(r, c) KEY((r), (c), KEY_RESERVED) - -struct twl4030_keypad_data { - const struct matrix_keymap_data *keymap_data; - unsigned rows; - unsigned cols; - bool rep; -}; - -enum twl4030_usb_mode { - T2_USB_MODE_ULPI = 1, - T2_USB_MODE_CEA2011_3PIN = 2, -}; - -struct twl4030_usb_data { - enum twl4030_usb_mode usb_mode; - unsigned long features; - - int (*phy_init)(struct device *dev); - int (*phy_exit)(struct device *dev); - /* Power on/off the PHY */ - int (*phy_power)(struct device *dev, int iD, int on); - /* enable/disable phy clocks */ - int (*phy_set_clock)(struct device *dev, int on); - /* suspend/resume of phy */ - int (*phy_suspend)(struct device *dev, int suspend); -}; - -struct twl4030_ins { - u16 pmb_message; - u8 delay; -}; - -struct twl4030_script { - struct twl4030_ins *script; - unsigned size; - u8 flags; -#define TWL4030_WRST_SCRIPT (1<<0) -#define TWL4030_WAKEUP12_SCRIPT (1<<1) -#define TWL4030_WAKEUP3_SCRIPT (1<<2) -#define TWL4030_SLEEP_SCRIPT (1<<3) -}; - -struct twl4030_resconfig { - u8 resource; - u8 devgroup; /* Processor group that Power resource belongs to */ - u8 type; /* Power resource addressed, 6 / broadcast message */ - u8 type2; /* Power resource addressed, 3 / broadcast message */ - u8 remap_off; /* off state remapping */ - u8 remap_sleep; /* sleep state remapping */ -}; - -struct twl4030_power_data { - struct twl4030_script **scripts; - unsigned num; - struct twl4030_resconfig *resource_config; - struct twl4030_resconfig *board_config; -#define TWL4030_RESCONFIG_UNDEF ((u8)-1) - bool use_poweroff; /* Board is wired for TWL poweroff */ - bool ac_charger_quirk; /* Disable AC charger on board */ -}; - -extern int twl4030_remove_script(u8 flags); -extern void twl4030_power_off(void); - -struct twl4030_codec_data { - unsigned int digimic_delay; /* in ms */ - unsigned int ramp_delay_value; - unsigned int offset_cncl_path; - unsigned int hs_extmute:1; - int hs_extmute_gpio; -}; - -struct twl4030_vibra_data { - unsigned int coexist; -}; - -struct twl4030_audio_data { - unsigned int audio_mclk; - struct twl4030_codec_data *codec; - struct twl4030_vibra_data *vibra; - - /* twl6040 */ - int audpwron_gpio; /* audio power-on gpio */ - int naudint_irq; /* audio interrupt */ - unsigned int irq_base; -}; - -struct twl4030_platform_data { - struct twl4030_clock_init_data *clock; - struct twl4030_bci_platform_data *bci; - struct twl4030_gpio_platform_data *gpio; - struct twl4030_madc_platform_data *madc; - struct twl4030_keypad_data *keypad; - struct twl4030_usb_data *usb; - struct twl4030_power_data *power; - struct twl4030_audio_data *audio; - - /* Common LDO regulators for TWL4030/TWL6030 */ - struct regulator_init_data *vdac; - struct regulator_init_data *vaux1; - struct regulator_init_data *vaux2; - struct regulator_init_data *vaux3; - struct regulator_init_data *vdd1; - struct regulator_init_data *vdd2; - struct regulator_init_data *vdd3; - /* TWL4030 LDO regulators */ - struct regulator_init_data *vpll1; - struct regulator_init_data *vpll2; - struct regulator_init_data *vmmc1; - struct regulator_init_data *vmmc2; - struct regulator_init_data *vsim; - struct regulator_init_data *vaux4; - struct regulator_init_data *vio; - struct regulator_init_data *vintana1; - struct regulator_init_data *vintana2; - struct regulator_init_data *vintdig; - /* TWL6030 LDO regulators */ - struct regulator_init_data *vmmc; - struct regulator_init_data *vpp; - struct regulator_init_data *vusim; - struct regulator_init_data *vana; - struct regulator_init_data *vcxio; - 
struct regulator_init_data *vusb; - struct regulator_init_data *clk32kg; - struct regulator_init_data *v1v8; - struct regulator_init_data *v2v1; - /* TWL6032 LDO regulators */ - struct regulator_init_data *ldo1; - struct regulator_init_data *ldo2; - struct regulator_init_data *ldo3; - struct regulator_init_data *ldo4; - struct regulator_init_data *ldo5; - struct regulator_init_data *ldo6; - struct regulator_init_data *ldo7; - struct regulator_init_data *ldoln; - struct regulator_init_data *ldousb; - /* TWL6032 DCDC regulators */ - struct regulator_init_data *smps3; - struct regulator_init_data *smps4; - struct regulator_init_data *vio6025; -}; - -struct twl_regulator_driver_data { - int (*set_voltage)(void *data, int target_uV); - int (*get_voltage)(void *data); - void *data; - unsigned long features; -}; -/* chip-specific feature flags, for twl_regulator_driver_data.features */ -#define TWL4030_VAUX2 BIT(0) /* pre-5030 voltage ranges */ -#define TPS_SUBSET BIT(1) /* tps659[23]0 have fewer LDOs */ -#define TWL5031 BIT(2) /* twl5031 has different registers */ -#define TWL6030_CLASS BIT(3) /* TWL6030 class */ -#define TWL6032_SUBCLASS BIT(4) /* TWL6032 has changed registers */ -#define TWL4030_ALLOW_UNSUPPORTED BIT(5) /* Some voltages are possible - * but not officially supported. - * This flag is necessary to - * enable them. - */ - -/*----------------------------------------------------------------------*/ - -int twl4030_sih_setup(struct device *dev, int module, int irq_base); - -/* Offsets to Power Registers */ -#define TWL4030_VDAC_DEV_GRP 0x3B -#define TWL4030_VDAC_DEDICATED 0x3E -#define TWL4030_VAUX1_DEV_GRP 0x17 -#define TWL4030_VAUX1_DEDICATED 0x1A -#define TWL4030_VAUX2_DEV_GRP 0x1B -#define TWL4030_VAUX2_DEDICATED 0x1E -#define TWL4030_VAUX3_DEV_GRP 0x1F -#define TWL4030_VAUX3_DEDICATED 0x22 - -static inline int twl4030charger_usb_en(int enable) { return 0; } - -/*----------------------------------------------------------------------*/ - -/* Linux-specific regulator identifiers ... for now, we only support - * the LDOs, and leave the three buck converters alone. VDD1 and VDD2 - * need to tie into hardware based voltage scaling (cpufreq etc), while - * VIO is generally fixed. 
- */ - -/* TWL4030 SMPS/LDO's */ -/* EXTERNAL dc-to-dc buck converters */ -#define TWL4030_REG_VDD1 0 -#define TWL4030_REG_VDD2 1 -#define TWL4030_REG_VIO 2 - -/* EXTERNAL LDOs */ -#define TWL4030_REG_VDAC 3 -#define TWL4030_REG_VPLL1 4 -#define TWL4030_REG_VPLL2 5 /* not on all chips */ -#define TWL4030_REG_VMMC1 6 -#define TWL4030_REG_VMMC2 7 /* not on all chips */ -#define TWL4030_REG_VSIM 8 /* not on all chips */ -#define TWL4030_REG_VAUX1 9 /* not on all chips */ -#define TWL4030_REG_VAUX2_4030 10 /* (twl4030-specific) */ -#define TWL4030_REG_VAUX2 11 /* (twl5030 and newer) */ -#define TWL4030_REG_VAUX3 12 /* not on all chips */ -#define TWL4030_REG_VAUX4 13 /* not on all chips */ - -/* INTERNAL LDOs */ -#define TWL4030_REG_VINTANA1 14 -#define TWL4030_REG_VINTANA2 15 -#define TWL4030_REG_VINTDIG 16 -#define TWL4030_REG_VUSB1V5 17 -#define TWL4030_REG_VUSB1V8 18 -#define TWL4030_REG_VUSB3V1 19 - -/* TWL6030 SMPS/LDO's */ -/* EXTERNAL dc-to-dc buck convertor controllable via SR */ -#define TWL6030_REG_VDD1 30 -#define TWL6030_REG_VDD2 31 -#define TWL6030_REG_VDD3 32 - -/* Non SR compliant dc-to-dc buck convertors */ -#define TWL6030_REG_VMEM 33 -#define TWL6030_REG_V2V1 34 -#define TWL6030_REG_V1V29 35 -#define TWL6030_REG_V1V8 36 - -/* EXTERNAL LDOs */ -#define TWL6030_REG_VAUX1_6030 37 -#define TWL6030_REG_VAUX2_6030 38 -#define TWL6030_REG_VAUX3_6030 39 -#define TWL6030_REG_VMMC 40 -#define TWL6030_REG_VPP 41 -#define TWL6030_REG_VUSIM 42 -#define TWL6030_REG_VANA 43 -#define TWL6030_REG_VCXIO 44 -#define TWL6030_REG_VDAC 45 -#define TWL6030_REG_VUSB 46 - -/* INTERNAL LDOs */ -#define TWL6030_REG_VRTC 47 -#define TWL6030_REG_CLK32KG 48 - -/* LDOs on 6025 have different names */ -#define TWL6032_REG_LDO2 49 -#define TWL6032_REG_LDO4 50 -#define TWL6032_REG_LDO3 51 -#define TWL6032_REG_LDO5 52 -#define TWL6032_REG_LDO1 53 -#define TWL6032_REG_LDO7 54 -#define TWL6032_REG_LDO6 55 -#define TWL6032_REG_LDOLN 56 -#define TWL6032_REG_LDOUSB 57 - -/* 6025 DCDC supplies */ -#define TWL6032_REG_SMPS3 58 -#define TWL6032_REG_SMPS4 59 -#define TWL6032_REG_VIO 60 - - -#endif /* End of __TWL4030_H */ diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h new file mode 100644 index 000000000000..9ad7828d9d34 --- /dev/null +++ b/include/linux/mfd/twl.h @@ -0,0 +1,876 @@ +/* + * twl4030.h - header for TWL4030 PM and audio CODEC device + * + * Copyright (C) 2005-2006 Texas Instruments, Inc. + * + * Based on tlv320aic23.c: + * Copyright (c) by Kai Svahn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef __TWL_H_ +#define __TWL_H_ + +#include +#include + +/* + * Using the twl4030 core we address registers using a pair + * { module id, relative register offset } + * which that core then maps to the relevant + * { i2c slave, absolute register address } + * + * The module IDs are meaningful only to the twl4030 core code, + * which uses them as array indices to look up the first register + * address each module uses within a given i2c slave. + */ + +/* Module IDs for similar functionalities found in twl4030/twl6030 */ +enum twl_module_ids { + TWL_MODULE_USB, + TWL_MODULE_PIH, + TWL_MODULE_MAIN_CHARGE, + TWL_MODULE_PM_MASTER, + TWL_MODULE_PM_RECEIVER, + + TWL_MODULE_RTC, + TWL_MODULE_PWM, + TWL_MODULE_LED, + TWL_MODULE_SECURED_REG, + + TWL_MODULE_LAST, +}; + +/* Modules only available in twl4030 series */ +enum twl4030_module_ids { + TWL4030_MODULE_AUDIO_VOICE = TWL_MODULE_LAST, + TWL4030_MODULE_GPIO, + TWL4030_MODULE_INTBR, + TWL4030_MODULE_TEST, + TWL4030_MODULE_KEYPAD, + + TWL4030_MODULE_MADC, + TWL4030_MODULE_INTERRUPTS, + TWL4030_MODULE_PRECHARGE, + TWL4030_MODULE_BACKUP, + TWL4030_MODULE_INT, + + TWL5031_MODULE_ACCESSORY, + TWL5031_MODULE_INTERRUPTS, + + TWL4030_MODULE_LAST, +}; + +/* Modules only available in twl6030 series */ +enum twl6030_module_ids { + TWL6030_MODULE_ID0 = TWL_MODULE_LAST, + TWL6030_MODULE_ID1, + TWL6030_MODULE_ID2, + TWL6030_MODULE_GPADC, + TWL6030_MODULE_GASGAUGE, + + TWL6030_MODULE_LAST, +}; + +/* Until the clients has been converted to use TWL_MODULE_LED */ +#define TWL4030_MODULE_LED TWL_MODULE_LED + +#define GPIO_INTR_OFFSET 0 +#define KEYPAD_INTR_OFFSET 1 +#define BCI_INTR_OFFSET 2 +#define MADC_INTR_OFFSET 3 +#define USB_INTR_OFFSET 4 +#define CHARGERFAULT_INTR_OFFSET 5 +#define BCI_PRES_INTR_OFFSET 9 +#define USB_PRES_INTR_OFFSET 10 +#define RTC_INTR_OFFSET 11 + +/* + * Offset from TWL6030_IRQ_BASE / pdata->irq_base + */ +#define PWR_INTR_OFFSET 0 +#define HOTDIE_INTR_OFFSET 12 +#define SMPSLDO_INTR_OFFSET 13 +#define BATDETECT_INTR_OFFSET 14 +#define SIMDETECT_INTR_OFFSET 15 +#define MMCDETECT_INTR_OFFSET 16 +#define GASGAUGE_INTR_OFFSET 17 +#define USBOTG_INTR_OFFSET 4 +#define CHARGER_INTR_OFFSET 2 +#define RSV_INTR_OFFSET 0 + +/* INT register offsets */ +#define REG_INT_STS_A 0x00 +#define REG_INT_STS_B 0x01 +#define REG_INT_STS_C 0x02 + +#define REG_INT_MSK_LINE_A 0x03 +#define REG_INT_MSK_LINE_B 0x04 +#define REG_INT_MSK_LINE_C 0x05 + +#define REG_INT_MSK_STS_A 0x06 +#define REG_INT_MSK_STS_B 0x07 +#define REG_INT_MSK_STS_C 0x08 + +/* MASK INT REG GROUP A */ +#define TWL6030_PWR_INT_MASK 0x07 +#define TWL6030_RTC_INT_MASK 0x18 +#define TWL6030_HOTDIE_INT_MASK 0x20 +#define TWL6030_SMPSLDOA_INT_MASK 0xC0 + +/* MASK INT REG GROUP B */ +#define TWL6030_SMPSLDOB_INT_MASK 0x01 +#define TWL6030_BATDETECT_INT_MASK 0x02 +#define TWL6030_SIMDETECT_INT_MASK 0x04 +#define TWL6030_MMCDETECT_INT_MASK 0x08 +#define TWL6030_GPADC_INT_MASK 0x60 +#define TWL6030_GASGAUGE_INT_MASK 0x80 + +/* MASK INT REG GROUP C */ +#define TWL6030_USBOTG_INT_MASK 0x0F +#define TWL6030_CHARGER_CTRL_INT_MASK 0x10 +#define TWL6030_CHARGER_FAULT_INT_MASK 0x60 + +#define TWL6030_MMCCTRL 0xEE +#define VMMC_AUTO_OFF (0x1 << 3) +#define SW_FC (0x1 << 2) +#define STS_MMC 0x1 + +#define TWL6030_CFG_INPUT_PUPD3 0xF2 +#define MMC_PU (0x1 << 3) +#define MMC_PD (0x1 << 
2) + +#define TWL_SIL_TYPE(rev) ((rev) & 0x00FFFFFF) +#define TWL_SIL_REV(rev) ((rev) >> 24) +#define TWL_SIL_5030 0x09002F +#define TWL5030_REV_1_0 0x00 +#define TWL5030_REV_1_1 0x10 +#define TWL5030_REV_1_2 0x30 + +#define TWL4030_CLASS_ID 0x4030 +#define TWL6030_CLASS_ID 0x6030 +unsigned int twl_rev(void); +#define GET_TWL_REV (twl_rev()) +#define TWL_CLASS_IS(class, id) \ +static inline int twl_class_is_ ##class(void) \ +{ \ + return ((id) == (GET_TWL_REV)) ? 1 : 0; \ +} + +TWL_CLASS_IS(4030, TWL4030_CLASS_ID) +TWL_CLASS_IS(6030, TWL6030_CLASS_ID) + +/* Set the regcache bypass for the regmap associated with the nodule */ +int twl_set_regcache_bypass(u8 mod_no, bool enable); + +/* + * Read and write several 8-bit registers at once. + */ +int twl_i2c_write(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes); +int twl_i2c_read(u8 mod_no, u8 *value, u8 reg, unsigned num_bytes); + +/* + * Read and write single 8-bit registers + */ +static inline int twl_i2c_write_u8(u8 mod_no, u8 val, u8 reg) { + return twl_i2c_write(mod_no, &val, reg, 1); +} + +static inline int twl_i2c_read_u8(u8 mod_no, u8 *val, u8 reg) { + return twl_i2c_read(mod_no, val, reg, 1); +} + +static inline int twl_i2c_write_u16(u8 mod_no, u16 val, u8 reg) { + val = cpu_to_le16(val); + return twl_i2c_write(mod_no, (u8*) &val, reg, 2); +} + +static inline int twl_i2c_read_u16(u8 mod_no, u16 *val, u8 reg) { + int ret; + ret = twl_i2c_read(mod_no, (u8*) val, reg, 2); + *val = le16_to_cpu(*val); + return ret; +} + +int twl_get_type(void); +int twl_get_version(void); +int twl_get_hfclk_rate(void); + +int twl6030_interrupt_unmask(u8 bit_mask, u8 offset); +int twl6030_interrupt_mask(u8 bit_mask, u8 offset); + +/* Card detect Configuration for MMC1 Controller on OMAP4 */ +#ifdef CONFIG_TWL4030_CORE +int twl6030_mmc_card_detect_config(void); +#else +static inline int twl6030_mmc_card_detect_config(void) +{ + pr_debug("twl6030_mmc_card_detect_config not supported\n"); + return 0; +} +#endif + +/* MMC1 Controller on OMAP4 uses Phoenix irq for Card detect */ +#ifdef CONFIG_TWL4030_CORE +int twl6030_mmc_card_detect(struct device *dev, int slot); +#else +static inline int twl6030_mmc_card_detect(struct device *dev, int slot) +{ + pr_debug("Call back twl6030_mmc_card_detect not supported\n"); + return -EIO; +} +#endif +/*----------------------------------------------------------------------*/ + +/* + * NOTE: at up to 1024 registers, this is a big chip. + * + * Avoid putting register declarations in this file, instead of into + * a driver-private file, unless some of the registers in a block + * need to be shared with other drivers. One example is blocks that + * have Secondary IRQ Handler (SIH) registers. 
+ */ + +#define TWL4030_SIH_CTRL_EXCLEN_MASK BIT(0) +#define TWL4030_SIH_CTRL_PENDDIS_MASK BIT(1) +#define TWL4030_SIH_CTRL_COR_MASK BIT(2) + +/*----------------------------------------------------------------------*/ + +/* + * GPIO Block Register offsets (use TWL4030_MODULE_GPIO) + */ + +#define REG_GPIODATAIN1 0x0 +#define REG_GPIODATAIN2 0x1 +#define REG_GPIODATAIN3 0x2 +#define REG_GPIODATADIR1 0x3 +#define REG_GPIODATADIR2 0x4 +#define REG_GPIODATADIR3 0x5 +#define REG_GPIODATAOUT1 0x6 +#define REG_GPIODATAOUT2 0x7 +#define REG_GPIODATAOUT3 0x8 +#define REG_CLEARGPIODATAOUT1 0x9 +#define REG_CLEARGPIODATAOUT2 0xA +#define REG_CLEARGPIODATAOUT3 0xB +#define REG_SETGPIODATAOUT1 0xC +#define REG_SETGPIODATAOUT2 0xD +#define REG_SETGPIODATAOUT3 0xE +#define REG_GPIO_DEBEN1 0xF +#define REG_GPIO_DEBEN2 0x10 +#define REG_GPIO_DEBEN3 0x11 +#define REG_GPIO_CTRL 0x12 +#define REG_GPIOPUPDCTR1 0x13 +#define REG_GPIOPUPDCTR2 0x14 +#define REG_GPIOPUPDCTR3 0x15 +#define REG_GPIOPUPDCTR4 0x16 +#define REG_GPIOPUPDCTR5 0x17 +#define REG_GPIO_ISR1A 0x19 +#define REG_GPIO_ISR2A 0x1A +#define REG_GPIO_ISR3A 0x1B +#define REG_GPIO_IMR1A 0x1C +#define REG_GPIO_IMR2A 0x1D +#define REG_GPIO_IMR3A 0x1E +#define REG_GPIO_ISR1B 0x1F +#define REG_GPIO_ISR2B 0x20 +#define REG_GPIO_ISR3B 0x21 +#define REG_GPIO_IMR1B 0x22 +#define REG_GPIO_IMR2B 0x23 +#define REG_GPIO_IMR3B 0x24 +#define REG_GPIO_EDR1 0x28 +#define REG_GPIO_EDR2 0x29 +#define REG_GPIO_EDR3 0x2A +#define REG_GPIO_EDR4 0x2B +#define REG_GPIO_EDR5 0x2C +#define REG_GPIO_SIH_CTRL 0x2D + +/* Up to 18 signals are available as GPIOs, when their + * pins are not assigned to another use (such as ULPI/USB). + */ +#define TWL4030_GPIO_MAX 18 + +/*----------------------------------------------------------------------*/ + +/*Interface Bit Register (INTBR) offsets + *(Use TWL_4030_MODULE_INTBR) + */ + +#define REG_IDCODE_7_0 0x00 +#define REG_IDCODE_15_8 0x01 +#define REG_IDCODE_16_23 0x02 +#define REG_IDCODE_31_24 0x03 +#define REG_GPPUPDCTR1 0x0F +#define REG_UNLOCK_TEST_REG 0x12 + +/*I2C1 and I2C4(SR) SDA/SCL pull-up control bits */ + +#define I2C_SCL_CTRL_PU BIT(0) +#define I2C_SDA_CTRL_PU BIT(2) +#define SR_I2C_SCL_CTRL_PU BIT(4) +#define SR_I2C_SDA_CTRL_PU BIT(6) + +#define TWL_EEPROM_R_UNLOCK 0x49 + +/*----------------------------------------------------------------------*/ + +/* + * Keypad register offsets (use TWL4030_MODULE_KEYPAD) + * ... SIH/interrupt only + */ + +#define TWL4030_KEYPAD_KEYP_ISR1 0x11 +#define TWL4030_KEYPAD_KEYP_IMR1 0x12 +#define TWL4030_KEYPAD_KEYP_ISR2 0x13 +#define TWL4030_KEYPAD_KEYP_IMR2 0x14 +#define TWL4030_KEYPAD_KEYP_SIR 0x15 /* test register */ +#define TWL4030_KEYPAD_KEYP_EDR 0x16 +#define TWL4030_KEYPAD_KEYP_SIH_CTRL 0x17 + +/*----------------------------------------------------------------------*/ + +/* + * Multichannel ADC register offsets (use TWL4030_MODULE_MADC) + * ... 
SIH/interrupt only + */ + +#define TWL4030_MADC_ISR1 0x61 +#define TWL4030_MADC_IMR1 0x62 +#define TWL4030_MADC_ISR2 0x63 +#define TWL4030_MADC_IMR2 0x64 +#define TWL4030_MADC_SIR 0x65 /* test register */ +#define TWL4030_MADC_EDR 0x66 +#define TWL4030_MADC_SIH_CTRL 0x67 + +/*----------------------------------------------------------------------*/ + +/* + * Battery charger register offsets (use TWL4030_MODULE_INTERRUPTS) + */ + +#define TWL4030_INTERRUPTS_BCIISR1A 0x0 +#define TWL4030_INTERRUPTS_BCIISR2A 0x1 +#define TWL4030_INTERRUPTS_BCIIMR1A 0x2 +#define TWL4030_INTERRUPTS_BCIIMR2A 0x3 +#define TWL4030_INTERRUPTS_BCIISR1B 0x4 +#define TWL4030_INTERRUPTS_BCIISR2B 0x5 +#define TWL4030_INTERRUPTS_BCIIMR1B 0x6 +#define TWL4030_INTERRUPTS_BCIIMR2B 0x7 +#define TWL4030_INTERRUPTS_BCISIR1 0x8 /* test register */ +#define TWL4030_INTERRUPTS_BCISIR2 0x9 /* test register */ +#define TWL4030_INTERRUPTS_BCIEDR1 0xa +#define TWL4030_INTERRUPTS_BCIEDR2 0xb +#define TWL4030_INTERRUPTS_BCIEDR3 0xc +#define TWL4030_INTERRUPTS_BCISIHCTRL 0xd + +/*----------------------------------------------------------------------*/ + +/* + * Power Interrupt block register offsets (use TWL4030_MODULE_INT) + */ + +#define TWL4030_INT_PWR_ISR1 0x0 +#define TWL4030_INT_PWR_IMR1 0x1 +#define TWL4030_INT_PWR_ISR2 0x2 +#define TWL4030_INT_PWR_IMR2 0x3 +#define TWL4030_INT_PWR_SIR 0x4 /* test register */ +#define TWL4030_INT_PWR_EDR1 0x5 +#define TWL4030_INT_PWR_EDR2 0x6 +#define TWL4030_INT_PWR_SIH_CTRL 0x7 + +/*----------------------------------------------------------------------*/ + +/* + * Accessory Interrupts + */ +#define TWL5031_ACIIMR_LSB 0x05 +#define TWL5031_ACIIMR_MSB 0x06 +#define TWL5031_ACIIDR_LSB 0x07 +#define TWL5031_ACIIDR_MSB 0x08 +#define TWL5031_ACCISR1 0x0F +#define TWL5031_ACCIMR1 0x10 +#define TWL5031_ACCISR2 0x11 +#define TWL5031_ACCIMR2 0x12 +#define TWL5031_ACCSIR 0x13 +#define TWL5031_ACCEDR1 0x14 +#define TWL5031_ACCSIHCTRL 0x15 + +/*----------------------------------------------------------------------*/ + +/* + * Battery Charger Controller + */ + +#define TWL5031_INTERRUPTS_BCIISR1 0x0 +#define TWL5031_INTERRUPTS_BCIIMR1 0x1 +#define TWL5031_INTERRUPTS_BCIISR2 0x2 +#define TWL5031_INTERRUPTS_BCIIMR2 0x3 +#define TWL5031_INTERRUPTS_BCISIR 0x4 +#define TWL5031_INTERRUPTS_BCIEDR1 0x5 +#define TWL5031_INTERRUPTS_BCIEDR2 0x6 +#define TWL5031_INTERRUPTS_BCISIHCTRL 0x7 + +/*----------------------------------------------------------------------*/ + +/* + * PM Master module register offsets (use TWL4030_MODULE_PM_MASTER) + */ + +#define TWL4030_PM_MASTER_CFG_P1_TRANSITION 0x00 +#define TWL4030_PM_MASTER_CFG_P2_TRANSITION 0x01 +#define TWL4030_PM_MASTER_CFG_P3_TRANSITION 0x02 +#define TWL4030_PM_MASTER_CFG_P123_TRANSITION 0x03 +#define TWL4030_PM_MASTER_STS_BOOT 0x04 +#define TWL4030_PM_MASTER_CFG_BOOT 0x05 +#define TWL4030_PM_MASTER_SHUNDAN 0x06 +#define TWL4030_PM_MASTER_BOOT_BCI 0x07 +#define TWL4030_PM_MASTER_CFG_PWRANA1 0x08 +#define TWL4030_PM_MASTER_CFG_PWRANA2 0x09 +#define TWL4030_PM_MASTER_BACKUP_MISC_STS 0x0b +#define TWL4030_PM_MASTER_BACKUP_MISC_CFG 0x0c +#define TWL4030_PM_MASTER_BACKUP_MISC_TST 0x0d +#define TWL4030_PM_MASTER_PROTECT_KEY 0x0e +#define TWL4030_PM_MASTER_STS_HW_CONDITIONS 0x0f +#define TWL4030_PM_MASTER_P1_SW_EVENTS 0x10 +#define TWL4030_PM_MASTER_P2_SW_EVENTS 0x11 +#define TWL4030_PM_MASTER_P3_SW_EVENTS 0x12 +#define TWL4030_PM_MASTER_STS_P123_STATE 0x13 +#define TWL4030_PM_MASTER_PB_CFG 0x14 +#define TWL4030_PM_MASTER_PB_WORD_MSB 0x15 +#define TWL4030_PM_MASTER_PB_WORD_LSB 
0x16 +#define TWL4030_PM_MASTER_SEQ_ADD_W2P 0x1c +#define TWL4030_PM_MASTER_SEQ_ADD_P2A 0x1d +#define TWL4030_PM_MASTER_SEQ_ADD_A2W 0x1e +#define TWL4030_PM_MASTER_SEQ_ADD_A2S 0x1f +#define TWL4030_PM_MASTER_SEQ_ADD_S2A12 0x20 +#define TWL4030_PM_MASTER_SEQ_ADD_S2A3 0x21 +#define TWL4030_PM_MASTER_SEQ_ADD_WARM 0x22 +#define TWL4030_PM_MASTER_MEMORY_ADDRESS 0x23 +#define TWL4030_PM_MASTER_MEMORY_DATA 0x24 + +#define TWL4030_PM_MASTER_KEY_CFG1 0xc0 +#define TWL4030_PM_MASTER_KEY_CFG2 0x0c + +#define TWL4030_PM_MASTER_KEY_TST1 0xe0 +#define TWL4030_PM_MASTER_KEY_TST2 0x0e + +#define TWL4030_PM_MASTER_GLOBAL_TST 0xb6 + +/*----------------------------------------------------------------------*/ + +/* Power bus message definitions */ + +/* The TWL4030/5030 splits its power-management resources (the various + * regulators, clock and reset lines) into 3 processor groups - P1, P2 and + * P3. These groups can then be configured to transition between sleep, wait-on + * and active states by sending messages to the power bus. See Section 5.4.2 + * Power Resources of TWL4030 TRM + */ + +/* Processor groups */ +#define DEV_GRP_NULL 0x0 +#define DEV_GRP_P1 0x1 /* P1: all OMAP devices */ +#define DEV_GRP_P2 0x2 /* P2: all Modem devices */ +#define DEV_GRP_P3 0x4 /* P3: all peripheral devices */ + +/* Resource groups */ +#define RES_GRP_RES 0x0 /* Reserved */ +#define RES_GRP_PP 0x1 /* Power providers */ +#define RES_GRP_RC 0x2 /* Reset and control */ +#define RES_GRP_PP_RC 0x3 +#define RES_GRP_PR 0x4 /* Power references */ +#define RES_GRP_PP_PR 0x5 +#define RES_GRP_RC_PR 0x6 +#define RES_GRP_ALL 0x7 /* All resource groups */ + +#define RES_TYPE2_R0 0x0 +#define RES_TYPE2_R1 0x1 +#define RES_TYPE2_R2 0x2 + +#define RES_TYPE_R0 0x0 +#define RES_TYPE_ALL 0x7 + +/* Resource states */ +#define RES_STATE_WRST 0xF +#define RES_STATE_ACTIVE 0xE +#define RES_STATE_SLEEP 0x8 +#define RES_STATE_OFF 0x0 + +/* Power resources */ + +/* Power providers */ +#define RES_VAUX1 1 +#define RES_VAUX2 2 +#define RES_VAUX3 3 +#define RES_VAUX4 4 +#define RES_VMMC1 5 +#define RES_VMMC2 6 +#define RES_VPLL1 7 +#define RES_VPLL2 8 +#define RES_VSIM 9 +#define RES_VDAC 10 +#define RES_VINTANA1 11 +#define RES_VINTANA2 12 +#define RES_VINTDIG 13 +#define RES_VIO 14 +#define RES_VDD1 15 +#define RES_VDD2 16 +#define RES_VUSB_1V5 17 +#define RES_VUSB_1V8 18 +#define RES_VUSB_3V1 19 +#define RES_VUSBCP 20 +#define RES_REGEN 21 +/* Reset and control */ +#define RES_NRES_PWRON 22 +#define RES_CLKEN 23 +#define RES_SYSEN 24 +#define RES_HFCLKOUT 25 +#define RES_32KCLKOUT 26 +#define RES_RESET 27 +/* Power Reference */ +#define RES_MAIN_REF 28 + +#define TOTAL_RESOURCES 28 +/* + * Power Bus Message Format ... these can be sent individually by Linux, + * but are usually part of downloaded scripts that are run when various + * power events are triggered. 
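+ *
+ * As a worked example of the singular message layout defined just below,
+ * MSG_SINGULAR(DEV_GRP_P1, RES_HFCLKOUT, RES_STATE_ACTIVE) expands to
+ * (0x1 << 13) | (0 << 12) | (25 << 4) | 0xE, i.e. 0x219E.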
+ * + * Broadcast Message (16 Bits): + * DEV_GRP[15:13] MT[12] RES_GRP[11:9] RES_TYPE2[8:7] RES_TYPE[6:4] + * RES_STATE[3:0] + * + * Singular Message (16 Bits): + * DEV_GRP[15:13] MT[12] RES_ID[11:4] RES_STATE[3:0] + */ + +#define MSG_BROADCAST(devgrp, grp, type, type2, state) \ + ( (devgrp) << 13 | 1 << 12 | (grp) << 9 | (type2) << 7 \ + | (type) << 4 | (state)) + +#define MSG_SINGULAR(devgrp, id, state) \ + ((devgrp) << 13 | 0 << 12 | (id) << 4 | (state)) + +#define MSG_BROADCAST_ALL(devgrp, state) \ + ((devgrp) << 5 | (state)) + +#define MSG_BROADCAST_REF MSG_BROADCAST_ALL +#define MSG_BROADCAST_PROV MSG_BROADCAST_ALL +#define MSG_BROADCAST__CLK_RST MSG_BROADCAST_ALL +/*----------------------------------------------------------------------*/ + +struct twl4030_clock_init_data { + bool ck32k_lowpwr_enable; +}; + +struct twl4030_bci_platform_data { + int *battery_tmp_tbl; + unsigned int tblsize; + int bb_uvolt; /* voltage to charge backup battery */ + int bb_uamp; /* current for backup battery charging */ +}; + +/* TWL4030_GPIO_MAX (18) GPIOs, with interrupts */ +struct twl4030_gpio_platform_data { + /* package the two LED signals as output-only GPIOs? */ + bool use_leds; + + /* gpio-n should control VMMC(n+1) if BIT(n) in mmc_cd is set */ + u8 mmc_cd; + + /* if BIT(N) is set, or VMMC(n+1) is linked, debounce GPIO-N */ + u32 debounce; + + /* For gpio-N, bit (1 << N) in "pullups" is set if that pullup + * should be enabled. Else, if that bit is set in "pulldowns", + * that pulldown is enabled. Don't waste power by letting any + * digital inputs float... + */ + u32 pullups; + u32 pulldowns; + + int (*setup)(struct device *dev, + unsigned gpio, unsigned ngpio); + int (*teardown)(struct device *dev, + unsigned gpio, unsigned ngpio); +}; + +struct twl4030_madc_platform_data { + int irq_line; +}; + +/* Boards have unique mappings of {row, col} --> keycode. + * Column and row are 8 bits each, but range only from 0..7. + * a PERSISTENT_KEY is "always on" and never reported. 
+ */ +#define PERSISTENT_KEY(r, c) KEY((r), (c), KEY_RESERVED) + +struct twl4030_keypad_data { + const struct matrix_keymap_data *keymap_data; + unsigned rows; + unsigned cols; + bool rep; +}; + +enum twl4030_usb_mode { + T2_USB_MODE_ULPI = 1, + T2_USB_MODE_CEA2011_3PIN = 2, +}; + +struct twl4030_usb_data { + enum twl4030_usb_mode usb_mode; + unsigned long features; + + int (*phy_init)(struct device *dev); + int (*phy_exit)(struct device *dev); + /* Power on/off the PHY */ + int (*phy_power)(struct device *dev, int iD, int on); + /* enable/disable phy clocks */ + int (*phy_set_clock)(struct device *dev, int on); + /* suspend/resume of phy */ + int (*phy_suspend)(struct device *dev, int suspend); +}; + +struct twl4030_ins { + u16 pmb_message; + u8 delay; +}; + +struct twl4030_script { + struct twl4030_ins *script; + unsigned size; + u8 flags; +#define TWL4030_WRST_SCRIPT (1<<0) +#define TWL4030_WAKEUP12_SCRIPT (1<<1) +#define TWL4030_WAKEUP3_SCRIPT (1<<2) +#define TWL4030_SLEEP_SCRIPT (1<<3) +}; + +struct twl4030_resconfig { + u8 resource; + u8 devgroup; /* Processor group that Power resource belongs to */ + u8 type; /* Power resource addressed, 6 / broadcast message */ + u8 type2; /* Power resource addressed, 3 / broadcast message */ + u8 remap_off; /* off state remapping */ + u8 remap_sleep; /* sleep state remapping */ +}; + +struct twl4030_power_data { + struct twl4030_script **scripts; + unsigned num; + struct twl4030_resconfig *resource_config; + struct twl4030_resconfig *board_config; +#define TWL4030_RESCONFIG_UNDEF ((u8)-1) + bool use_poweroff; /* Board is wired for TWL poweroff */ + bool ac_charger_quirk; /* Disable AC charger on board */ +}; + +extern int twl4030_remove_script(u8 flags); +extern void twl4030_power_off(void); + +struct twl4030_codec_data { + unsigned int digimic_delay; /* in ms */ + unsigned int ramp_delay_value; + unsigned int offset_cncl_path; + unsigned int hs_extmute:1; + int hs_extmute_gpio; +}; + +struct twl4030_vibra_data { + unsigned int coexist; +}; + +struct twl4030_audio_data { + unsigned int audio_mclk; + struct twl4030_codec_data *codec; + struct twl4030_vibra_data *vibra; + + /* twl6040 */ + int audpwron_gpio; /* audio power-on gpio */ + int naudint_irq; /* audio interrupt */ + unsigned int irq_base; +}; + +struct twl4030_platform_data { + struct twl4030_clock_init_data *clock; + struct twl4030_bci_platform_data *bci; + struct twl4030_gpio_platform_data *gpio; + struct twl4030_madc_platform_data *madc; + struct twl4030_keypad_data *keypad; + struct twl4030_usb_data *usb; + struct twl4030_power_data *power; + struct twl4030_audio_data *audio; + + /* Common LDO regulators for TWL4030/TWL6030 */ + struct regulator_init_data *vdac; + struct regulator_init_data *vaux1; + struct regulator_init_data *vaux2; + struct regulator_init_data *vaux3; + struct regulator_init_data *vdd1; + struct regulator_init_data *vdd2; + struct regulator_init_data *vdd3; + /* TWL4030 LDO regulators */ + struct regulator_init_data *vpll1; + struct regulator_init_data *vpll2; + struct regulator_init_data *vmmc1; + struct regulator_init_data *vmmc2; + struct regulator_init_data *vsim; + struct regulator_init_data *vaux4; + struct regulator_init_data *vio; + struct regulator_init_data *vintana1; + struct regulator_init_data *vintana2; + struct regulator_init_data *vintdig; + /* TWL6030 LDO regulators */ + struct regulator_init_data *vmmc; + struct regulator_init_data *vpp; + struct regulator_init_data *vusim; + struct regulator_init_data *vana; + struct regulator_init_data *vcxio; + 
struct regulator_init_data *vusb; + struct regulator_init_data *clk32kg; + struct regulator_init_data *v1v8; + struct regulator_init_data *v2v1; + /* TWL6032 LDO regulators */ + struct regulator_init_data *ldo1; + struct regulator_init_data *ldo2; + struct regulator_init_data *ldo3; + struct regulator_init_data *ldo4; + struct regulator_init_data *ldo5; + struct regulator_init_data *ldo6; + struct regulator_init_data *ldo7; + struct regulator_init_data *ldoln; + struct regulator_init_data *ldousb; + /* TWL6032 DCDC regulators */ + struct regulator_init_data *smps3; + struct regulator_init_data *smps4; + struct regulator_init_data *vio6025; +}; + +struct twl_regulator_driver_data { + int (*set_voltage)(void *data, int target_uV); + int (*get_voltage)(void *data); + void *data; + unsigned long features; +}; +/* chip-specific feature flags, for twl_regulator_driver_data.features */ +#define TWL4030_VAUX2 BIT(0) /* pre-5030 voltage ranges */ +#define TPS_SUBSET BIT(1) /* tps659[23]0 have fewer LDOs */ +#define TWL5031 BIT(2) /* twl5031 has different registers */ +#define TWL6030_CLASS BIT(3) /* TWL6030 class */ +#define TWL6032_SUBCLASS BIT(4) /* TWL6032 has changed registers */ +#define TWL4030_ALLOW_UNSUPPORTED BIT(5) /* Some voltages are possible + * but not officially supported. + * This flag is necessary to + * enable them. + */ + +/*----------------------------------------------------------------------*/ + +int twl4030_sih_setup(struct device *dev, int module, int irq_base); + +/* Offsets to Power Registers */ +#define TWL4030_VDAC_DEV_GRP 0x3B +#define TWL4030_VDAC_DEDICATED 0x3E +#define TWL4030_VAUX1_DEV_GRP 0x17 +#define TWL4030_VAUX1_DEDICATED 0x1A +#define TWL4030_VAUX2_DEV_GRP 0x1B +#define TWL4030_VAUX2_DEDICATED 0x1E +#define TWL4030_VAUX3_DEV_GRP 0x1F +#define TWL4030_VAUX3_DEDICATED 0x22 + +static inline int twl4030charger_usb_en(int enable) { return 0; } + +/*----------------------------------------------------------------------*/ + +/* Linux-specific regulator identifiers ... for now, we only support + * the LDOs, and leave the three buck converters alone. VDD1 and VDD2 + * need to tie into hardware based voltage scaling (cpufreq etc), while + * VIO is generally fixed. 
+ */ + +/* TWL4030 SMPS/LDO's */ +/* EXTERNAL dc-to-dc buck converters */ +#define TWL4030_REG_VDD1 0 +#define TWL4030_REG_VDD2 1 +#define TWL4030_REG_VIO 2 + +/* EXTERNAL LDOs */ +#define TWL4030_REG_VDAC 3 +#define TWL4030_REG_VPLL1 4 +#define TWL4030_REG_VPLL2 5 /* not on all chips */ +#define TWL4030_REG_VMMC1 6 +#define TWL4030_REG_VMMC2 7 /* not on all chips */ +#define TWL4030_REG_VSIM 8 /* not on all chips */ +#define TWL4030_REG_VAUX1 9 /* not on all chips */ +#define TWL4030_REG_VAUX2_4030 10 /* (twl4030-specific) */ +#define TWL4030_REG_VAUX2 11 /* (twl5030 and newer) */ +#define TWL4030_REG_VAUX3 12 /* not on all chips */ +#define TWL4030_REG_VAUX4 13 /* not on all chips */ + +/* INTERNAL LDOs */ +#define TWL4030_REG_VINTANA1 14 +#define TWL4030_REG_VINTANA2 15 +#define TWL4030_REG_VINTDIG 16 +#define TWL4030_REG_VUSB1V5 17 +#define TWL4030_REG_VUSB1V8 18 +#define TWL4030_REG_VUSB3V1 19 + +/* TWL6030 SMPS/LDO's */ +/* EXTERNAL dc-to-dc buck convertor controllable via SR */ +#define TWL6030_REG_VDD1 30 +#define TWL6030_REG_VDD2 31 +#define TWL6030_REG_VDD3 32 + +/* Non SR compliant dc-to-dc buck convertors */ +#define TWL6030_REG_VMEM 33 +#define TWL6030_REG_V2V1 34 +#define TWL6030_REG_V1V29 35 +#define TWL6030_REG_V1V8 36 + +/* EXTERNAL LDOs */ +#define TWL6030_REG_VAUX1_6030 37 +#define TWL6030_REG_VAUX2_6030 38 +#define TWL6030_REG_VAUX3_6030 39 +#define TWL6030_REG_VMMC 40 +#define TWL6030_REG_VPP 41 +#define TWL6030_REG_VUSIM 42 +#define TWL6030_REG_VANA 43 +#define TWL6030_REG_VCXIO 44 +#define TWL6030_REG_VDAC 45 +#define TWL6030_REG_VUSB 46 + +/* INTERNAL LDOs */ +#define TWL6030_REG_VRTC 47 +#define TWL6030_REG_CLK32KG 48 + +/* LDOs on 6025 have different names */ +#define TWL6032_REG_LDO2 49 +#define TWL6032_REG_LDO4 50 +#define TWL6032_REG_LDO3 51 +#define TWL6032_REG_LDO5 52 +#define TWL6032_REG_LDO1 53 +#define TWL6032_REG_LDO7 54 +#define TWL6032_REG_LDO6 55 +#define TWL6032_REG_LDOLN 56 +#define TWL6032_REG_LDOUSB 57 + +/* 6025 DCDC supplies */ +#define TWL6032_REG_SMPS3 58 +#define TWL6032_REG_SMPS4 59 +#define TWL6032_REG_VIO 60 + + +#endif /* End of __TWL4030_H */ diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c index a2104d68169d..bd86a2d78c5f 100644 --- a/sound/soc/codecs/twl4030.c +++ b/sound/soc/codecs/twl4030.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From e8924005b4e7964313536547d4b73406330be26d Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 28 Aug 2017 12:04:07 +0200 Subject: mfd: Add STM32 LPTimer driver The STM32 Low-Power Timer hardware block can be used for: - PWM generation - IIO trigger (in sync with PWM) - IIO quadrature encoder counter PWM and IIO timer configuration are mixed in the same registers, so we need a multifunction driver to be able to share those registers. Signed-off-by: Fabrice Gasnier Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 14 +++ drivers/mfd/Makefile | 1 + drivers/mfd/stm32-lptimer.c | 107 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/stm32-lptimer.h | 62 ++++++++++++++++++++++ 4 files changed, 184 insertions(+) create mode 100644 drivers/mfd/stm32-lptimer.c create mode 100644 include/linux/mfd/stm32-lptimer.h (limited to 'include') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 94ad2c1c3d90..fe76db945f46 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1723,6 +1723,20 @@ config MFD_STW481X in various ST Microelectronics and ST-Ericsson embedded Nomadik series.
+config MFD_STM32_LPTIMER + tristate "Support for STM32 Low-Power Timer" + depends on (ARCH_STM32 && OF) || COMPILE_TEST + select MFD_CORE + select REGMAP + select REGMAP_MMIO + help + Select this option to enable STM32 Low-Power Timer driver + used for PWM, IIO Trigger, IIO Encoder and Counter. Shared + resources are also dealt with here. + + To compile this driver as a module, choose M here: the + module will be called stm32-lptimer. + config MFD_STM32_TIMERS tristate "Support for STM32 Timers" depends on (ARCH_STM32 && OF) || COMPILE_TEST diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 080793b3fd0e..b80b1a314ca5 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -221,5 +221,6 @@ obj-$(CONFIG_MFD_MT6397) += mt6397-core.o obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_SUN4I_GPADC) += sun4i-gpadc.o +obj-$(CONFIG_MFD_STM32_LPTIMER) += stm32-lptimer.o obj-$(CONFIG_MFD_STM32_TIMERS) += stm32-timers.o obj-$(CONFIG_MFD_MXS_LRADC) += mxs-lradc.o diff --git a/drivers/mfd/stm32-lptimer.c b/drivers/mfd/stm32-lptimer.c new file mode 100644 index 000000000000..075330a25f61 --- /dev/null +++ b/drivers/mfd/stm32-lptimer.c @@ -0,0 +1,107 @@ +/* + * STM32 Low-Power Timer parent driver. + * + * Copyright (C) STMicroelectronics 2017 + * + * Author: Fabrice Gasnier + * + * Inspired by Benjamin Gaignard's stm32-timers driver + * + * License terms: GNU General Public License (GPL), version 2 + */ + +#include +#include +#include + +#define STM32_LPTIM_MAX_REGISTER 0x3fc + +static const struct regmap_config stm32_lptimer_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = sizeof(u32), + .max_register = STM32_LPTIM_MAX_REGISTER, +}; + +static int stm32_lptimer_detect_encoder(struct stm32_lptimer *ddata) +{ + u32 val; + int ret; + + /* + * Quadrature encoder mode bit can only be written and read back when + * Low-Power Timer supports it. 
+ */ + ret = regmap_update_bits(ddata->regmap, STM32_LPTIM_CFGR, + STM32_LPTIM_ENC, STM32_LPTIM_ENC); + if (ret) + return ret; + + ret = regmap_read(ddata->regmap, STM32_LPTIM_CFGR, &val); + if (ret) + return ret; + + ret = regmap_update_bits(ddata->regmap, STM32_LPTIM_CFGR, + STM32_LPTIM_ENC, 0); + if (ret) + return ret; + + ddata->has_encoder = !!(val & STM32_LPTIM_ENC); + + return 0; +} + +static int stm32_lptimer_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct stm32_lptimer *ddata; + struct resource *res; + void __iomem *mmio; + int ret; + + ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + mmio = devm_ioremap_resource(dev, res); + if (IS_ERR(mmio)) + return PTR_ERR(mmio); + + ddata->regmap = devm_regmap_init_mmio_clk(dev, "mux", mmio, + &stm32_lptimer_regmap_cfg); + if (IS_ERR(ddata->regmap)) + return PTR_ERR(ddata->regmap); + + ddata->clk = devm_clk_get(dev, NULL); + if (IS_ERR(ddata->clk)) + return PTR_ERR(ddata->clk); + + ret = stm32_lptimer_detect_encoder(ddata); + if (ret) + return ret; + + platform_set_drvdata(pdev, ddata); + + return devm_of_platform_populate(&pdev->dev); +} + +static const struct of_device_id stm32_lptimer_of_match[] = { + { .compatible = "st,stm32-lptimer", }, + {}, +}; +MODULE_DEVICE_TABLE(of, stm32_lptimer_of_match); + +static struct platform_driver stm32_lptimer_driver = { + .probe = stm32_lptimer_probe, + .driver = { + .name = "stm32-lptimer", + .of_match_table = stm32_lptimer_of_match, + }, +}; +module_platform_driver(stm32_lptimer_driver); + +MODULE_AUTHOR("Fabrice Gasnier "); +MODULE_DESCRIPTION("STMicroelectronics STM32 Low-Power Timer"); +MODULE_ALIAS("platform:stm32-lptimer"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/stm32-lptimer.h b/include/linux/mfd/stm32-lptimer.h new file mode 100644 index 000000000000..77c7cf40d9b4 --- /dev/null +++ b/include/linux/mfd/stm32-lptimer.h @@ -0,0 +1,62 @@ +/* + * STM32 Low-Power Timer parent driver. 
+ * + * Copyright (C) STMicroelectronics 2017 + * + * Author: Fabrice Gasnier + * + * Inspired by Benjamin Gaignard's stm32-timers driver + * + * License terms: GNU General Public License (GPL), version 2 + */ + +#ifndef _LINUX_STM32_LPTIMER_H_ +#define _LINUX_STM32_LPTIMER_H_ + +#include +#include + +#define STM32_LPTIM_ISR 0x00 /* Interrupt and Status Reg */ +#define STM32_LPTIM_ICR 0x04 /* Interrupt Clear Reg */ +#define STM32_LPTIM_IER 0x08 /* Interrupt Enable Reg */ +#define STM32_LPTIM_CFGR 0x0C /* Configuration Reg */ +#define STM32_LPTIM_CR 0x10 /* Control Reg */ +#define STM32_LPTIM_CMP 0x14 /* Compare Reg */ +#define STM32_LPTIM_ARR 0x18 /* Autoreload Reg */ +#define STM32_LPTIM_CNT 0x1C /* Counter Reg */ + +/* STM32_LPTIM_ISR - bit fields */ +#define STM32_LPTIM_CMPOK_ARROK GENMASK(4, 3) +#define STM32_LPTIM_ARROK BIT(4) +#define STM32_LPTIM_CMPOK BIT(3) + +/* STM32_LPTIM_ICR - bit fields */ +#define STM32_LPTIM_CMPOKCF_ARROKCF GENMASK(4, 3) + +/* STM32_LPTIM_CR - bit fields */ +#define STM32_LPTIM_CNTSTRT BIT(2) +#define STM32_LPTIM_ENABLE BIT(0) + +/* STM32_LPTIM_CFGR - bit fields */ +#define STM32_LPTIM_ENC BIT(24) +#define STM32_LPTIM_COUNTMODE BIT(23) +#define STM32_LPTIM_WAVPOL BIT(21) +#define STM32_LPTIM_PRESC GENMASK(11, 9) +#define STM32_LPTIM_CKPOL GENMASK(2, 1) + +/* STM32_LPTIM_ARR */ +#define STM32_LPTIM_MAX_ARR 0xFFFF + +/** + * struct stm32_lptimer - STM32 Low-Power Timer data assigned by parent device + * @clk: clock reference for this instance + * @regmap: register map reference for this instance + * @has_encoder: indicates this Low-Power Timer supports encoder mode + */ +struct stm32_lptimer { + struct clk *clk; + struct regmap *regmap; + bool has_encoder; +}; + +#endif -- cgit v1.2.3 From b01ced2b504b2592af6703533c62cb9d1cdc1c6c Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 28 Aug 2017 12:04:11 +0200 Subject: iio: trigger: Add STM32 LPTimer trigger driver Add support for LPTIMx_OUT triggers that can be found on some STM32 devices. These triggers can then be used by the ADC or DAC. Typical usage is to configure the LPTimer as a PWM output (via pwm-stm32-lp) and have synchronised analog conversions with these triggers. Signed-off-by: Fabrice Gasnier Reviewed-by: Jonathan Cameron Signed-off-by: Lee Jones --- drivers/iio/trigger/Kconfig | 11 +++ drivers/iio/trigger/Makefile | 1 + drivers/iio/trigger/stm32-lptimer-trigger.c | 118 ++++++++++++++++++++++ include/linux/iio/timer/stm32-lptim-trigger.h | 27 ++++++ 4 files changed, 157 insertions(+) create mode 100644 drivers/iio/trigger/stm32-lptimer-trigger.c create mode 100644 include/linux/iio/timer/stm32-lptim-trigger.h (limited to 'include') diff --git a/drivers/iio/trigger/Kconfig b/drivers/iio/trigger/Kconfig index e4d4e63434db..a633d2c8e805 100644 --- a/drivers/iio/trigger/Kconfig +++ b/drivers/iio/trigger/Kconfig @@ -24,6 +24,17 @@ config IIO_INTERRUPT_TRIGGER To compile this driver as a module, choose M here: the module will be called iio-trig-interrupt. +config IIO_STM32_LPTIMER_TRIGGER + tristate "STM32 Low-Power Timer Trigger" + depends on MFD_STM32_LPTIMER || COMPILE_TEST + help + Select this option to enable STM32 Low-Power Timer Trigger. + This can be used as trigger source for STM32 internal ADC + and/or DAC. + + To compile this driver as a module, choose M here: the + module will be called stm32-lptimer-trigger.
+ config IIO_STM32_TIMER_TRIGGER tristate "STM32 Timer Trigger" depends on (ARCH_STM32 && OF && MFD_STM32_TIMERS) || COMPILE_TEST diff --git a/drivers/iio/trigger/Makefile b/drivers/iio/trigger/Makefile index 5c4ecd380653..0a72a2a76cb2 100644 --- a/drivers/iio/trigger/Makefile +++ b/drivers/iio/trigger/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_IIO_HRTIMER_TRIGGER) += iio-trig-hrtimer.o obj-$(CONFIG_IIO_INTERRUPT_TRIGGER) += iio-trig-interrupt.o +obj-$(CONFIG_IIO_STM32_LPTIMER_TRIGGER) += stm32-lptimer-trigger.o obj-$(CONFIG_IIO_STM32_TIMER_TRIGGER) += stm32-timer-trigger.o obj-$(CONFIG_IIO_SYSFS_TRIGGER) += iio-trig-sysfs.o obj-$(CONFIG_IIO_TIGHTLOOP_TRIGGER) += iio-trig-loop.o diff --git a/drivers/iio/trigger/stm32-lptimer-trigger.c b/drivers/iio/trigger/stm32-lptimer-trigger.c new file mode 100644 index 000000000000..241eae6a4306 --- /dev/null +++ b/drivers/iio/trigger/stm32-lptimer-trigger.c @@ -0,0 +1,118 @@ +/* + * STM32 Low-Power Timer Trigger driver + * + * Copyright (C) STMicroelectronics 2017 + * + * Author: Fabrice Gasnier . + * + * License terms: GNU General Public License (GPL), version 2 + * + * Inspired by Benjamin Gaignard's stm32-timer-trigger driver + */ + +#include +#include +#include +#include + +/* List Low-Power Timer triggers */ +static const char * const stm32_lptim_triggers[] = { + LPTIM1_OUT, + LPTIM2_OUT, + LPTIM3_OUT, +}; + +struct stm32_lptim_trigger { + struct device *dev; + const char *trg; +}; + +static int stm32_lptim_validate_device(struct iio_trigger *trig, + struct iio_dev *indio_dev) +{ + if (indio_dev->modes & INDIO_HARDWARE_TRIGGERED) + return 0; + + return -EINVAL; +} + +static const struct iio_trigger_ops stm32_lptim_trigger_ops = { + .owner = THIS_MODULE, + .validate_device = stm32_lptim_validate_device, +}; + +/** + * is_stm32_lptim_trigger + * @trig: trigger to be checked + * + * return true if the trigger is a valid STM32 IIO Low-Power Timer Trigger + * either return false + */ +bool is_stm32_lptim_trigger(struct iio_trigger *trig) +{ + return (trig->ops == &stm32_lptim_trigger_ops); +} +EXPORT_SYMBOL(is_stm32_lptim_trigger); + +static int stm32_lptim_setup_trig(struct stm32_lptim_trigger *priv) +{ + struct iio_trigger *trig; + + trig = devm_iio_trigger_alloc(priv->dev, "%s", priv->trg); + if (!trig) + return -ENOMEM; + + trig->dev.parent = priv->dev->parent; + trig->ops = &stm32_lptim_trigger_ops; + iio_trigger_set_drvdata(trig, priv); + + return devm_iio_trigger_register(priv->dev, trig); +} + +static int stm32_lptim_trigger_probe(struct platform_device *pdev) +{ + struct stm32_lptim_trigger *priv; + u32 index; + int ret; + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (of_property_read_u32(pdev->dev.of_node, "reg", &index)) + return -EINVAL; + + if (index >= ARRAY_SIZE(stm32_lptim_triggers)) + return -EINVAL; + + priv->dev = &pdev->dev; + priv->trg = stm32_lptim_triggers[index]; + + ret = stm32_lptim_setup_trig(priv); + if (ret) + return ret; + + platform_set_drvdata(pdev, priv); + + return 0; +} + +static const struct of_device_id stm32_lptim_trig_of_match[] = { + { .compatible = "st,stm32-lptimer-trigger", }, + {}, +}; +MODULE_DEVICE_TABLE(of, stm32_lptim_trig_of_match); + +static struct platform_driver stm32_lptim_trigger_driver = { + .probe = stm32_lptim_trigger_probe, + .driver = { + .name = "stm32-lptimer-trigger", + .of_match_table = stm32_lptim_trig_of_match, + }, +}; +module_platform_driver(stm32_lptim_trigger_driver); + +MODULE_AUTHOR("Fabrice Gasnier "); 
+MODULE_ALIAS("platform:stm32-lptimer-trigger"); +MODULE_DESCRIPTION("STMicroelectronics STM32 LPTIM trigger driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/iio/timer/stm32-lptim-trigger.h b/include/linux/iio/timer/stm32-lptim-trigger.h new file mode 100644 index 000000000000..34d59bfdce2d --- /dev/null +++ b/include/linux/iio/timer/stm32-lptim-trigger.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) STMicroelectronics 2017 + * + * Author: Fabrice Gasnier + * + * License terms: GNU General Public License (GPL), version 2 + */ + +#ifndef _STM32_LPTIM_TRIGGER_H_ +#define _STM32_LPTIM_TRIGGER_H_ + +#include +#include + +#define LPTIM1_OUT "lptim1_out" +#define LPTIM2_OUT "lptim2_out" +#define LPTIM3_OUT "lptim3_out" + +#if IS_ENABLED(CONFIG_IIO_STM32_LPTIMER_TRIGGER) +bool is_stm32_lptim_trigger(struct iio_trigger *trig); +#else +static inline bool is_stm32_lptim_trigger(struct iio_trigger *trig) +{ + return false; +} +#endif +#endif -- cgit v1.2.3 From 2335ba704f32b855651d0cd15dd9b271ec565fb6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 3 Sep 2017 23:55:59 +0200 Subject: netlink: add NLM_F_NONREC flag for deletion requests In the last NFWS in Faro, Portugal, we discussed that netlink lacks the semantics to request non-recursive deletions, i.e. do not delete an object if child objects still hang from the parent object that the user requests to be deleted. We need this new flag to solve a problem for the iptables-compat backward compatibility utility, which runs iptables commands using the existing nf_tables netlink interface. Specifically, custom chains in iptables cannot be deleted if there are rules in them; nf_tables, however, allows removing any chain that is populated with content. To sort out this asymmetry, iptables-compat userspace sets this new NLM_F_NONREC flag to obtain the same semantics that iptables provides. This new flag should only be used for deletion requests. Note this new flag value overlaps with the existing: * NLM_F_ROOT for get requests. * NLM_F_REPLACE for new requests. However, those flags should never be used in deletion requests.
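For illustration, a minimal userspace sketch of a non-recursive deletion request follows; the helper name and the message type argument are hypothetical, and only the NLM_F_NONREC usage reflects this patch:

#include <stdint.h>
#include <string.h>
#include <linux/netlink.h>

#ifndef NLM_F_NONREC
#define NLM_F_NONREC 0x100	/* value introduced by this patch */
#endif

/* Hypothetical helper: mark a deletion request as non-recursive, so the
 * kernel rejects it instead of implicitly removing child objects (e.g.
 * the rules held by an nf_tables chain). */
static void init_nonrec_del_request(struct nlmsghdr *nlh, uint16_t del_msg_type)
{
	memset(nlh, 0, sizeof(*nlh));
	nlh->nlmsg_len = NLMSG_LENGTH(0);	/* caller appends the payload */
	nlh->nlmsg_type = del_msg_type;		/* e.g. an nf_tables DELCHAIN request */
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_NONREC;
}

iptables-compat would combine this with its usual nfnetlink batch handling; that plumbing is elided here.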
Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f4fc9c9e123d..e8af60a7c56d 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -69,6 +69,9 @@ struct nlmsghdr { #define NLM_F_CREATE 0x400 /* Create, if it does not exist */ #define NLM_F_APPEND 0x800 /* Add to end of list */ +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + /* Flags for ACK message */ #define NLM_F_CAPPED 0x100 /* request was capped */ #define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ -- cgit v1.2.3 From 1985296a3ccd5d89d2ec2ca17b1fa1a225f8ecd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Sep 2017 12:07:24 -0400 Subject: fix the __user misannotations in asm-generic get_user/put_user Signed-off-by: Al Viro --- include/asm-generic/uaccess.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index 723e81a6c162..2e51f6e7b3c2 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -75,10 +75,10 @@ static inline int __access_ok(unsigned long addr, unsigned long size) #define put_user(x, ptr) \ ({ \ - void *__p = (ptr); \ + void __user *__p = (ptr); \ might_fault(); \ access_ok(VERIFY_WRITE, __p, sizeof(*ptr)) ? \ - __put_user((x), ((__typeof__(*(ptr)) *)__p)) : \ + __put_user((x), ((__typeof__(*(ptr)) __user *)__p)) : \ -EFAULT; \ }) @@ -137,10 +137,10 @@ extern int __put_user_bad(void) __attribute__((noreturn)); #define get_user(x, ptr) \ ({ \ - const void *__p = (ptr); \ + const void __user *__p = (ptr); \ might_fault(); \ access_ok(VERIFY_READ, __p, sizeof(*ptr)) ? \ - __get_user((x), (__typeof__(*(ptr)) *)__p) : \ + __get_user((x), (__typeof__(*(ptr)) __user *)__p) :\ ((x) = (__typeof__(*(ptr)))0,-EFAULT); \ }) -- cgit v1.2.3 From 126534141b45d9d1b205fbe3f2321200074b76fd Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 20 Aug 2017 00:18:19 +0200 Subject: MIPS: lantiq: Add a GPHY driver which uses the RCU syscon-mfd Compared to the old xrx200_phy_fw driver, the new version has multiple enhancements. The names of the firmware files do not have to be added to all .dts files anymore - one now configures the GPHY mode (FE or GE) instead. Each GPHY can now also boot separate firmware (thus mixing of GE and FE GPHYs is now possible). The new implementation is based on the RCU syscon-mfd and uses the reset_controller framework instead of raw RCU register reads/writes.
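As a hedged illustration of the new per-instance mode selection, a device tree sketch for a second GPHY booting Fast Ethernet firmware is shown below; the node address matches the 1f203068.gphy clock added in sysctrl.c further down, while the reset specifiers are assumptions rather than values taken from this patch:

#include <dt-bindings/mips/lantiq_rcu_gphy.h>

	gphy1: gphy@68 {
		compatible = "lantiq,xrx200a2x-gphy";
		reg = <0x68 0x4>;
		resets = <&reset0 29 28>;	/* assumed bit positions */
		reset-names = "gphy";
		clocks = <&pmu0 XRX200_PMU_GATE_GPHY>;
		/* boot FE firmware instead of the GE default */
		lantiq,gphy-mode = <GPHY_MODE_FE>;
	};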
Signed-off-by: Martin Blumenstingl Signed-off-by: Hauke Mehrtens Reviewed-by: Andy Shevchenko Acked-by: Rob Herring Cc: john@phrozen.org Cc: p.zabel@pengutronix.de Cc: kishon@ti.com Cc: mark.rutland@arm.com Cc: linux-mips@linux-mips.org Cc: linux-mtd@lists.infradead.org Cc: linux-watchdog@vger.kernel.org Cc: devicetree@vger.kernel.org Cc: linux-spi@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17128/ Signed-off-by: Ralf Baechle --- .../devicetree/bindings/mips/lantiq/rcu-gphy.txt | 36 +++ arch/mips/lantiq/xway/sysctrl.c | 6 +- drivers/soc/lantiq/Makefile | 1 + drivers/soc/lantiq/gphy.c | 260 +++++++++++++++++++++ include/dt-bindings/mips/lantiq_rcu_gphy.h | 15 ++ 5 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 Documentation/devicetree/bindings/mips/lantiq/rcu-gphy.txt create mode 100644 drivers/soc/lantiq/gphy.c create mode 100644 include/dt-bindings/mips/lantiq_rcu_gphy.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/mips/lantiq/rcu-gphy.txt b/Documentation/devicetree/bindings/mips/lantiq/rcu-gphy.txt new file mode 100644 index 000000000000..a0c19bd1ce66 --- /dev/null +++ b/Documentation/devicetree/bindings/mips/lantiq/rcu-gphy.txt @@ -0,0 +1,36 @@ +Lantiq XWAY SoC GPHY binding +============================ + +This binding describes a software-defined ethernet PHY, provided by the RCU +module on newer Lantiq XWAY SoCs (xRX200 and newer). + +------------------------------------------------------------------------------- +Required properties: +- compatible : Should be one of + "lantiq,xrx200a1x-gphy" + "lantiq,xrx200a2x-gphy" + "lantiq,xrx300-gphy" + "lantiq,xrx330-gphy" +- reg : Address of the GPHY FW load address register +- resets : Must reference the RCU GPHY reset bit +- reset-names : One entry, value must be "gphy" or optional "gphy2" +- clocks : A reference to the (PMU) GPHY clock gate + +Optional properties: +- lantiq,gphy-mode : GPHY_MODE_GE (default) or GPHY_MODE_FE as defined in + + + +------------------------------------------------------------------------------- +Example for the GPHYs on the xRX200 SoCs: + +#include + gphy0: gphy@20 { + compatible = "lantiq,xrx200a2x-gphy"; + reg = <0x20 0x4>; + + resets = <&reset0 31 30>, <&reset1 7 7>; + reset-names = "gphy", "gphy2"; + clocks = <&pmu0 XRX200_PMU_GATE_GPHY>; + lantiq,gphy-mode = ; + }; diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index 706639a343bc..87eab4d288e5 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -518,7 +518,8 @@ void __init ltq_soc_init(void) clkdev_add_pmu("1e108000.eth", NULL, 0, 0, PMU_SWITCH | PMU_PPE_DP | PMU_PPE_TC); clkdev_add_pmu("1da00000.usif", "NULL", 1, 0, PMU_USIF); - clkdev_add_pmu("1f203000.rcu", "gphy", 1, 0, PMU_GPHY); + clkdev_add_pmu("1f203020.gphy", NULL, 1, 0, PMU_GPHY); + clkdev_add_pmu("1f203068.gphy", NULL, 1, 0, PMU_GPHY); clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU); clkdev_add_pmu("1e116000.mei", "afe", 1, 2, PMU_ANALOG_DSL_AFE); clkdev_add_pmu("1e116000.mei", "dfe", 1, 0, PMU_DFE); @@ -541,7 +542,8 @@ void __init ltq_soc_init(void) PMU_SWITCH | PMU_PPE_DPLUS | PMU_PPE_DPLUM | PMU_PPE_EMA | PMU_PPE_TC | PMU_PPE_SLL01 | PMU_PPE_QSB | PMU_PPE_TOP); - clkdev_add_pmu("1f203000.rcu", "gphy", 0, 0, PMU_GPHY); + clkdev_add_pmu("1f203020.gphy", NULL, 0, 0, PMU_GPHY); + clkdev_add_pmu("1f203068.gphy", NULL, 0, 0, PMU_GPHY); clkdev_add_pmu("1e103000.sdio", NULL, 1, 0, PMU_SDIO); clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU);
clkdev_add_pmu("1e116000.mei", "dfe", 1, 0, PMU_DFE); diff --git a/drivers/soc/lantiq/Makefile b/drivers/soc/lantiq/Makefile index 35aa86bd1023..be9e866d53e5 100644 --- a/drivers/soc/lantiq/Makefile +++ b/drivers/soc/lantiq/Makefile @@ -1 +1,2 @@ obj-y += fpi-bus.o +obj-$(CONFIG_XRX200_PHY_FW) += gphy.o diff --git a/drivers/soc/lantiq/gphy.c b/drivers/soc/lantiq/gphy.c new file mode 100644 index 000000000000..8d8659463b3e --- /dev/null +++ b/drivers/soc/lantiq/gphy.c @@ -0,0 +1,260 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2012 John Crispin + * Copyright (C) 2016 Martin Blumenstingl + * Copyright (C) 2017 Hauke Mehrtens + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define XRX200_GPHY_FW_ALIGN (16 * 1024) + +struct xway_gphy_priv { + struct clk *gphy_clk_gate; + struct reset_control *gphy_reset; + struct reset_control *gphy_reset2; + struct notifier_block gphy_reboot_nb; + void __iomem *membase; + char *fw_name; +}; + +struct xway_gphy_match_data { + char *fe_firmware_name; + char *ge_firmware_name; +}; + +static const struct xway_gphy_match_data xrx200a1x_gphy_data = { + .fe_firmware_name = "lantiq/xrx200_phy22f_a14.bin", + .ge_firmware_name = "lantiq/xrx200_phy11g_a14.bin", +}; + +static const struct xway_gphy_match_data xrx200a2x_gphy_data = { + .fe_firmware_name = "lantiq/xrx200_phy22f_a22.bin", + .ge_firmware_name = "lantiq/xrx200_phy11g_a22.bin", +}; + +static const struct xway_gphy_match_data xrx300_gphy_data = { + .fe_firmware_name = "lantiq/xrx300_phy22f_a21.bin", + .ge_firmware_name = "lantiq/xrx300_phy11g_a21.bin", +}; + +static const struct of_device_id xway_gphy_match[] = { + { .compatible = "lantiq,xrx200a1x-gphy", .data = &xrx200a1x_gphy_data }, + { .compatible = "lantiq,xrx200a2x-gphy", .data = &xrx200a2x_gphy_data }, + { .compatible = "lantiq,xrx300-gphy", .data = &xrx300_gphy_data }, + { .compatible = "lantiq,xrx330-gphy", .data = &xrx300_gphy_data }, + {}, +}; +MODULE_DEVICE_TABLE(of, xway_gphy_match); + +static struct xway_gphy_priv *to_xway_gphy_priv(struct notifier_block *nb) +{ + return container_of(nb, struct xway_gphy_priv, gphy_reboot_nb); +} + +static int xway_gphy_reboot_notify(struct notifier_block *reboot_nb, + unsigned long code, void *unused) +{ + struct xway_gphy_priv *priv = to_xway_gphy_priv(reboot_nb); + + if (priv) { + reset_control_assert(priv->gphy_reset); + reset_control_assert(priv->gphy_reset2); + } + + return NOTIFY_DONE; +} + +static int xway_gphy_load(struct device *dev, struct xway_gphy_priv *priv, + dma_addr_t *dev_addr) +{ + const struct firmware *fw; + void *fw_addr; + dma_addr_t dma_addr; + size_t size; + int ret; + + ret = request_firmware(&fw, priv->fw_name, dev); + if (ret) { + dev_err(dev, "failed to load firmware: %s, error: %i\n", + priv->fw_name, ret); + return ret; + } + + /* + * GPHY cores need the firmware code in a persistent and contiguous + * memory area with a 16 kB boundary aligned start address. 
+ */ + size = fw->size + XRX200_GPHY_FW_ALIGN; + + fw_addr = dmam_alloc_coherent(dev, size, &dma_addr, GFP_KERNEL); + if (fw_addr) { + fw_addr = PTR_ALIGN(fw_addr, XRX200_GPHY_FW_ALIGN); + *dev_addr = ALIGN(dma_addr, XRX200_GPHY_FW_ALIGN); + memcpy(fw_addr, fw->data, fw->size); + } else { + dev_err(dev, "failed to alloc firmware memory\n"); + ret = -ENOMEM; + } + + release_firmware(fw); + + return ret; +} + +static int xway_gphy_of_probe(struct platform_device *pdev, + struct xway_gphy_priv *priv) +{ + struct device *dev = &pdev->dev; + const struct xway_gphy_match_data *gphy_fw_name_cfg; + u32 gphy_mode; + int ret; + struct resource *res_gphy; + + gphy_fw_name_cfg = of_device_get_match_data(dev); + + priv->gphy_clk_gate = devm_clk_get(dev, NULL); + if (IS_ERR(priv->gphy_clk_gate)) { + dev_err(dev, "Failed to lookup gate clock\n"); + return PTR_ERR(priv->gphy_clk_gate); + } + + res_gphy = platform_get_resource(pdev, IORESOURCE_MEM, 0); + priv->membase = devm_ioremap_resource(dev, res_gphy); + if (IS_ERR(priv->membase)) + return PTR_ERR(priv->membase); + + priv->gphy_reset = devm_reset_control_get(dev, "gphy"); + if (IS_ERR(priv->gphy_reset)) { + if (PTR_ERR(priv->gphy_reset) != -EPROBE_DEFER) + dev_err(dev, "Failed to lookup gphy reset\n"); + return PTR_ERR(priv->gphy_reset); + } + + priv->gphy_reset2 = devm_reset_control_get_optional(dev, "gphy2"); + if (IS_ERR(priv->gphy_reset2)) + return PTR_ERR(priv->gphy_reset2); + + ret = device_property_read_u32(dev, "lantiq,gphy-mode", &gphy_mode); + /* Default to GE mode */ + if (ret) + gphy_mode = GPHY_MODE_GE; + + switch (gphy_mode) { + case GPHY_MODE_FE: + priv->fw_name = gphy_fw_name_cfg->fe_firmware_name; + break; + case GPHY_MODE_GE: + priv->fw_name = gphy_fw_name_cfg->ge_firmware_name; + break; + default: + dev_err(dev, "Unknown GPHY mode %d\n", gphy_mode); + return -EINVAL; + } + + return 0; +} + +static int xway_gphy_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct xway_gphy_priv *priv; + dma_addr_t fw_addr = 0; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + ret = xway_gphy_of_probe(pdev, priv); + if (ret) + return ret; + + ret = clk_prepare_enable(priv->gphy_clk_gate); + if (ret) + return ret; + + ret = xway_gphy_load(dev, priv, &fw_addr); + if (ret) { + clk_disable_unprepare(priv->gphy_clk_gate); + return ret; + } + + reset_control_assert(priv->gphy_reset); + reset_control_assert(priv->gphy_reset2); + + iowrite32be(fw_addr, priv->membase); + + reset_control_deassert(priv->gphy_reset); + reset_control_deassert(priv->gphy_reset2); + + /* assert the gphy reset because it can hang after a reboot: */ + priv->gphy_reboot_nb.notifier_call = xway_gphy_reboot_notify; + priv->gphy_reboot_nb.priority = -1; + + ret = register_reboot_notifier(&priv->gphy_reboot_nb); + if (ret) + dev_warn(dev, "Failed to register reboot notifier\n"); + + platform_set_drvdata(pdev, priv); + + return ret; +} + +static int xway_gphy_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct xway_gphy_priv *priv = platform_get_drvdata(pdev); + int ret; + + reset_control_assert(priv->gphy_reset); + reset_control_assert(priv->gphy_reset2); + + iowrite32be(0, priv->membase); + + clk_disable_unprepare(priv->gphy_clk_gate); + + ret = unregister_reboot_notifier(&priv->gphy_reboot_nb); + if (ret) + dev_warn(dev, "Failed to unregister reboot notifier\n"); + + return 0; +} + +static struct platform_driver xway_gphy_driver = { + .probe = xway_gphy_probe, + .remove = 
xway_gphy_remove, + .driver = { + .name = "xway-rcu-gphy", + .of_match_table = xway_gphy_match, + }, +}; + +module_platform_driver(xway_gphy_driver); + +MODULE_FIRMWARE("lantiq/xrx300_phy11g_a21.bin"); +MODULE_FIRMWARE("lantiq/xrx300_phy22f_a21.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy11g_a14.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy11g_a22.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy22f_a14.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy22f_a22.bin"); +MODULE_AUTHOR("Martin Blumenstingl "); +MODULE_DESCRIPTION("Lantiq XWAY GPHY Firmware Loader"); +MODULE_LICENSE("GPL"); diff --git a/include/dt-bindings/mips/lantiq_rcu_gphy.h b/include/dt-bindings/mips/lantiq_rcu_gphy.h new file mode 100644 index 000000000000..fa1a63773342 --- /dev/null +++ b/include/dt-bindings/mips/lantiq_rcu_gphy.h @@ -0,0 +1,15 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * Copyright (C) 2016 Martin Blumenstingl + * Copyright (C) 2017 Hauke Mehrtens + */ +#ifndef _DT_BINDINGS_MIPS_LANTIQ_RCU_GPHY_H +#define _DT_BINDINGS_MIPS_LANTIQ_RCU_GPHY_H + +#define GPHY_MODE_GE 1 +#define GPHY_MODE_FE 2 + +#endif /* _DT_BINDINGS_MIPS_LANTIQ_RCU_GPHY_H */ -- cgit v1.2.3 From 495e642939114478a5237a7d91661ba93b76f15a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 4 Sep 2017 21:42:22 +0200 Subject: vfs: add flags to d_real() Add a separate flags argument (in addition to the open flags) to control the behavior of d_real(). Signed-off-by: Miklos Szeredi --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- fs/open.c | 4 ++-- fs/overlayfs/super.c | 4 ++-- include/linux/dcache.h | 11 ++++++----- include/linux/fs.h | 2 +- 6 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fe25787ff6d4..75d2d57e2c44 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -22,7 +22,7 @@ prototypes: struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int); + unsigned int, unsigned int); locking rules: rename_lock ->d_lock may block rcu-walk diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 73e7d91f03dc..7f20c1bdfb67 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -990,7 +990,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int); + unsigned int, unsigned int); }; d_revalidate: called when the VFS needs to revalidate a dentry. This diff --git a/fs/open.c b/fs/open.c index 35bb784763a4..6d5c9a9b8c8d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -96,7 +96,7 @@ long vfs_truncate(const struct path *path, loff_t length) * write access on the upper inode, not on the overlay inode. For * non-overlay filesystems d_real() is an identity function. 
*/ - upperdentry = d_real(path->dentry, NULL, O_WRONLY); + upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0); error = PTR_ERR(upperdentry); if (IS_ERR(upperdentry)) goto mnt_drop_write_and_out; @@ -857,7 +857,7 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags); + struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c0c02cca776b..19e89ce39017 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -70,7 +70,7 @@ static int ovl_check_append_only(struct inode *inode, int flag) static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags) + unsigned int open_flags, unsigned int flags) { struct dentry *real; int err; @@ -102,7 +102,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, goto bug; /* Handle recursion */ - real = d_real(real, inode, open_flags); + real = d_real(real, inode, open_flags, 0); if (!inode || inode == d_inode(real)) return real; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index aae1cdb76851..fd0721e520f4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -147,7 +147,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int); + unsigned int, unsigned int); } ____cacheline_aligned; /* @@ -566,7 +566,8 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) - * @flags: open flags to control copy-up behavior + * @open_flags: open flags to control copy-up behavior + * @flags: flags to control what is returned by this function * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. 
@@ -575,10 +576,10 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode, - unsigned int flags) + unsigned int open_flags, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode, flags); + return dentry->d_op->d_real(dentry, inode, open_flags, flags); else return dentry; } @@ -593,7 +594,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0)); + return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0, 0)); } struct name_snapshot { diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..ee1db83c39cb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1233,7 +1233,7 @@ static inline struct inode *file_inode(const struct file *f) static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file), 0); + return d_real(file->f_path.dentry, file_inode(file), 0, 0); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) -- cgit v1.2.3 From 91f9943e1c7b6638f27312d03fe71fcc67b23571 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Aug 2017 16:13:20 +0200 Subject: fs: support RWF_NOWAIT for buffered reads This is based on the old idea and code from Milosz Tanski. With the aio nowait code it becomes mostly trivial now. Buffered writes continue to return -EOPNOTSUPP if RWF_NOWAIT is passed. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Al Viro --- fs/aio.c | 6 ------ fs/btrfs/file.c | 6 +++++- fs/ext4/file.c | 6 +++--- fs/xfs/xfs_file.c | 11 +++++++++-- include/linux/fs.h | 6 +++--- 5 files changed, 20 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/aio.c b/fs/aio.c index dcad3a66748c..d93daa076726 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1593,12 +1593,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - if ((req->common.ki_flags & IOCB_NOWAIT) && - !(req->common.ki_flags & IOCB_DIRECT)) { - ret = -EOPNOTSUPP; - goto out_put_req; - } - ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { pr_debug("EFAULT: aio_key\n"); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9e75d8a39aac..e62dd55b4079 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1886,6 +1886,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t oldsize; int clean_page = 0; + if (!(iocb->ki_flags & IOCB_DIRECT) && + (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; @@ -3105,7 +3109,7 @@ out: static int btrfs_file_open(struct inode *inode, struct file *filp) { - filp->f_mode |= FMODE_AIO_NOWAIT; + filp->f_mode |= FMODE_NOWAIT; return generic_file_open(inode, filp); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d7cf0cc9b87..f83521337b8f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -223,6 +223,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif + if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) @@ -448,9 +450,7 @@ static int ext4_file_open(struct inode * inode, 
struct file * filp) return ret; } - /* Set the flags to support nowait AIO */ - filp->f_mode |= FMODE_AIO_NOWAIT; - + filp->f_mode |= FMODE_NOWAIT; return dquot_file_open(inode, filp); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..1a09104b3eb0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -259,7 +259,11 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = generic_file_read_iter(iocb, to); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -636,6 +640,9 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + write_retry: iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); @@ -912,7 +919,7 @@ xfs_file_open( return -EFBIG; if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_AIO_NOWAIT; + file->f_mode |= FMODE_NOWAIT; return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index cbfe127bccf8..94582c379dac 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -146,8 +146,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) -/* File is capable of returning -EAGAIN if AIO will block */ -#define FMODE_AIO_NOWAIT ((__force fmode_t)0x8000000) +/* File is capable of returning -EAGAIN if I/O will block */ +#define FMODE_NOWAIT ((__force fmode_t)0x8000000) /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector @@ -3149,7 +3149,7 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags) return -EOPNOTSUPP; if (flags & RWF_NOWAIT) { - if (!(ki->ki_filp->f_mode & FMODE_AIO_NOWAIT)) + if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; ki->ki_flags |= IOCB_NOWAIT; } -- cgit v1.2.3 From bdd1d2d3d251c65b74ac4493e08db18971c09240 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:13 +0200 Subject: fs: fix kernel_read prototype Use proper ssize_t and size_t types for the return value and count argument, move the offset last and make it an in/out argument like all other read/write helpers, and make the buf argument a void pointer to get rid of lots of casts in the callers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/mips/kernel/elf.c | 12 ++++------- arch/x86/ia32/ia32_aout.c | 4 ++-- drivers/media/pci/cx25821/cx25821-audio-upstream.c | 13 +++++------- drivers/mtd/nand/nandsim.c | 2 +- fs/binfmt_aout.c | 3 ++- fs/binfmt_elf.c | 23 ++++++++++++---------- fs/binfmt_elf_fdpic.c | 17 ++++++++-------- fs/binfmt_flat.c | 18 +++++------------ fs/binfmt_misc.c | 5 ++++- fs/coda/dir.c | 5 +++-- fs/ecryptfs/read_write.c | 2 +- fs/exec.c | 7 +++---- fs/read_write.c | 8 +++----- include/linux/fs.h | 2 +- kernel/sysctl_binary.c | 12 +++++++---- net/9p/trans_fd.c | 4 +++- security/keys/big_key.c | 3 ++- 17 files changed, 69 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/arch/mips/kernel/elf.c b/arch/mips/kernel/elf.c index 5c429d70e17f..0828d6d963b7 100644 --- a/arch/mips/kernel/elf.c +++ b/arch/mips/kernel/elf.c @@ -87,6 +87,7 @@ int arch_elf_pt_proc(void *_ehdr, void *_phdr, struct file *elf, bool elf32; u32 flags; int ret; + loff_t pos; elf32 = ehdr->e32.e_ident[EI_CLASS] == ELFCLASS32; flags = elf32 ? 
ehdr->e32.e_flags : ehdr->e64.e_flags; @@ -108,21 +109,16 @@ int arch_elf_pt_proc(void *_ehdr, void *_phdr, struct file *elf, if (phdr32->p_filesz < sizeof(abiflags)) return -EINVAL; - - ret = kernel_read(elf, phdr32->p_offset, - (char *)&abiflags, - sizeof(abiflags)); + pos = phdr32->p_offset; } else { if (phdr64->p_type != PT_MIPS_ABIFLAGS) return 0; if (phdr64->p_filesz < sizeof(abiflags)) return -EINVAL; - - ret = kernel_read(elf, phdr64->p_offset, - (char *)&abiflags, - sizeof(abiflags)); + pos = phdr64->p_offset; } + ret = kernel_read(elf, &abiflags, sizeof(abiflags), &pos); if (ret < 0) return ret; if (ret != sizeof(abiflags)) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 8d0879f1d42c..8e02b30cf08e 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -407,10 +407,10 @@ static int load_aout_library(struct file *file) unsigned long bss, start_addr, len, error; int retval; struct exec ex; - + loff_t pos = 0; retval = -ENOEXEC; - error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + error = kernel_read(file, &ex, sizeof(ex), &pos); if (error != sizeof(ex)) goto out; diff --git a/drivers/media/pci/cx25821/cx25821-audio-upstream.c b/drivers/media/pci/cx25821/cx25821-audio-upstream.c index b94eb1c0023d..ada26d4acfb4 100644 --- a/drivers/media/pci/cx25821/cx25821-audio-upstream.c +++ b/drivers/media/pci/cx25821/cx25821-audio-upstream.c @@ -277,7 +277,7 @@ static int cx25821_get_audio_data(struct cx25821_dev *dev, p = (char *)dev->_audiodata_buf_virt_addr + frame_offset; for (i = 0; i < dev->_audio_lines_count; i++) { - int n = kernel_read(file, file_offset, mybuf, AUDIO_LINE_SIZE); + int n = kernel_read(file, mybuf, AUDIO_LINE_SIZE, &file_offset); if (n < AUDIO_LINE_SIZE) { pr_info("Done: exit %s() since no more bytes to read from Audio file\n", __func__); @@ -290,7 +290,6 @@ static int cx25821_get_audio_data(struct cx25821_dev *dev, memcpy(p, mybuf, n); p += n; } - file_offset += n; } dev->_audioframe_count++; fput(file); @@ -318,7 +317,7 @@ static int cx25821_openfile_audio(struct cx25821_dev *dev, { char *p = (void *)dev->_audiodata_buf_virt_addr; struct file *file; - loff_t offset; + loff_t file_offset = 0; int i, j; file = filp_open(dev->_audiofilename, O_RDONLY | O_LARGEFILE, 0); @@ -328,11 +327,11 @@ static int cx25821_openfile_audio(struct cx25821_dev *dev, return PTR_ERR(file); } - for (j = 0, offset = 0; j < NUM_AUDIO_FRAMES; j++) { + for (j = 0; j < NUM_AUDIO_FRAMES; j++) { for (i = 0; i < dev->_audio_lines_count; i++) { char buf[AUDIO_LINE_SIZE]; - int n = kernel_read(file, offset, buf, - AUDIO_LINE_SIZE); + loff_t offset = file_offset; + int n = kernel_read(file, buf, AUDIO_LINE_SIZE, &file_offset); if (n < AUDIO_LINE_SIZE) { pr_info("Done: exit %s() since no more bytes to read from Audio file\n", @@ -344,8 +343,6 @@ static int cx25821_openfile_audio(struct cx25821_dev *dev, if (p) memcpy(p + offset, buf, n); - - offset += n; } dev->_audioframe_count++; } diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index e4211c3cc49b..a8089656879a 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -1379,7 +1379,7 @@ static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_ if (err) return err; noreclaim_flag = memalloc_noreclaim_save(); - tx = kernel_read(file, pos, buf, count); + tx = kernel_read(file, buf, count, &pos); memalloc_noreclaim_restore(noreclaim_flag); put_pages(ns); return tx; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 9be82c4e14a4..ce1824f47ba6 
100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -341,11 +341,12 @@ static int load_aout_library(struct file *file) unsigned long error; int retval; struct exec ex; + loff_t pos = 0; inode = file_inode(file); retval = -ENOEXEC; - error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + error = kernel_read(file, &ex, sizeof(ex), &pos); if (error != sizeof(ex)) goto out; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 6466153f2bf0..2f928b87c90e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -409,6 +409,7 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, { struct elf_phdr *elf_phdata = NULL; int retval, size, err = -1; + loff_t pos = elf_ex->e_phoff; /* * If the size of this structure has changed, then punt, since @@ -432,8 +433,7 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, goto out; /* Read in the program headers */ - retval = kernel_read(elf_file, elf_ex->e_phoff, - (char *)elf_phdata, size); + retval = kernel_read(elf_file, elf_phdata, size, &pos); if (retval != size) { err = (retval < 0) ? retval : -EIO; goto out; @@ -698,6 +698,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + loff_t pos; loc = kmalloc(sizeof(*loc), GFP_KERNEL); if (!loc) { @@ -750,9 +751,9 @@ static int load_elf_binary(struct linux_binprm *bprm) if (!elf_interpreter) goto out_free_ph; - retval = kernel_read(bprm->file, elf_ppnt->p_offset, - elf_interpreter, - elf_ppnt->p_filesz); + pos = elf_ppnt->p_offset; + retval = kernel_read(bprm->file, elf_interpreter, + elf_ppnt->p_filesz, &pos); if (retval != elf_ppnt->p_filesz) { if (retval >= 0) retval = -EIO; @@ -776,9 +777,9 @@ static int load_elf_binary(struct linux_binprm *bprm) would_dump(bprm, interpreter); /* Get the exec headers */ - retval = kernel_read(interpreter, 0, - (void *)&loc->interp_elf_ex, - sizeof(loc->interp_elf_ex)); + pos = 0; + retval = kernel_read(interpreter, &loc->interp_elf_ex, + sizeof(loc->interp_elf_ex), &pos); if (retval != sizeof(loc->interp_elf_ex)) { if (retval >= 0) retval = -EIO; @@ -1175,9 +1176,10 @@ static int load_elf_library(struct file *file) unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; + loff_t pos = 0; error = -ENOEXEC; - retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex)); + retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos); if (retval != sizeof(elf_ex)) goto out; @@ -1201,7 +1203,8 @@ static int load_elf_library(struct file *file) eppnt = elf_phdata; error = -ENOEXEC; - retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j); + pos = elf_ex.e_phoff; + retval = kernel_read(file, eppnt, j, &pos); if (retval != j) goto out_free_ph; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index cf93a4fad012..b4ebfe203a68 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -145,6 +145,7 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct elf32_phdr *phdr; unsigned long size; int retval, loop; + loff_t pos = params->hdr.e_phoff; if (params->hdr.e_phentsize != sizeof(struct elf_phdr)) return -ENOMEM; @@ -156,8 +157,7 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, if (!params->phdrs) return -ENOMEM; - retval = kernel_read(file, params->hdr.e_phoff, - (char *) params->phdrs, size); + retval = kernel_read(file, params->phdrs, size, &pos); if (unlikely(retval != size)) return retval < 0 ? 
retval : -ENOEXEC; @@ -199,6 +199,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) char *interpreter_name = NULL; int executable_stack; int retval, i; + loff_t pos; kdebug("____ LOAD %d ____", current->pid); @@ -246,10 +247,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) if (!interpreter_name) goto error; - retval = kernel_read(bprm->file, - phdr->p_offset, - interpreter_name, - phdr->p_filesz); + pos = phdr->p_offset; + retval = kernel_read(bprm->file, interpreter_name, + phdr->p_filesz, &pos); if (unlikely(retval != phdr->p_filesz)) { if (retval >= 0) retval = -ENOEXEC; @@ -277,8 +277,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) */ would_dump(bprm, interpreter); - retval = kernel_read(interpreter, 0, bprm->buf, - BINPRM_BUF_SIZE); + pos = 0; + retval = kernel_read(interpreter, bprm->buf, + BINPRM_BUF_SIZE, &pos); if (unlikely(retval != BINPRM_BUF_SIZE)) { if (retval >= 0) retval = -ENOEXEC; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1e6860b6f46..afb7e9d521d2 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -176,19 +176,14 @@ static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start #define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ #define RESERVED 0xC0 /* bit 6,7: reserved */ -static int decompress_exec( - struct linux_binprm *bprm, - unsigned long offset, - char *dst, - long len, - int fd) +static int decompress_exec(struct linux_binprm *bprm, loff_t fpos, char *dst, + long len, int fd) { unsigned char *buf; z_stream strm; - loff_t fpos; int ret, retval; - pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len); + pr_debug("decompress_exec(offset=%llx,buf=%p,len=%lx)\n", fpos, dst, len); memset(&strm, 0, sizeof(strm)); strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); @@ -204,13 +199,11 @@ static int decompress_exec( } /* Read in first chunk of data and parse gzip header. */ - fpos = offset; - ret = kernel_read(bprm->file, offset, buf, LBUFSIZE); + ret = kernel_read(bprm->file, buf, LBUFSIZE, &fpos); strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; - fpos += ret; retval = -ENOEXEC; @@ -276,7 +269,7 @@ static int decompress_exec( } while ((ret = zlib_inflate(&strm, Z_NO_FLUSH)) == Z_OK) { - ret = kernel_read(bprm->file, fpos, buf, LBUFSIZE); + ret = kernel_read(bprm->file, buf, LBUFSIZE, &fpos); if (ret <= 0) break; len -= ret; @@ -284,7 +277,6 @@ static int decompress_exec( strm.next_in = buf; strm.avail_in = ret; strm.total_in = 0; - fpos += ret; } if (ret < 0) { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f4718098ac31..ce7181ea60fa 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -218,12 +218,15 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->file = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) { + loff_t pos = 0; + /* * No need to call prepare_binprm(), it's already been * done. bprm->buf is stale, update from interp_file. 
*/ memset(bprm->buf, 0, BINPRM_BUF_SIZE); - retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + retval = kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, + &pos); } else retval = prepare_binprm(bprm); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index c0474ac6cbf2..274ab5586dd0 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -368,9 +368,10 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) goto out; while (1) { + loff_t pos = ctx->pos - 2; + /* read entries from the directory file */ - ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir, - sizeof(*vdir)); + ret = kernel_read(host_file, vdir, sizeof(*vdir), &pos); if (ret < 0) { pr_err("%s: read dir %s failed %d\n", __func__, coda_f2s(&cii->c_fid), ret); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 039e627194a9..d8af0e99bfaf 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -237,7 +237,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; if (!lower_file) return -EIO; - return kernel_read(lower_file, offset, data, size); + return kernel_read(lower_file, data, size, &offset); } /** diff --git a/fs/exec.c b/fs/exec.c index 8adcc5eaa175..15fb4d56cc43 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -922,8 +922,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, pos = 0; while (pos < i_size) { - bytes = kernel_read(file, pos, (char *)(*buf) + pos, - i_size - pos); + bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); if (bytes < 0) { ret = bytes; goto out; @@ -931,7 +930,6 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (bytes == 0) break; - pos += bytes; } if (pos != i_size) { @@ -1524,6 +1522,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) int prepare_binprm(struct linux_binprm *bprm) { int retval; + loff_t pos = 0; bprm_fill_uid(bprm); @@ -1534,7 +1533,7 @@ int prepare_binprm(struct linux_binprm *bprm) bprm->cred_prepared = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); - return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } EXPORT_SYMBOL(prepare_binprm); diff --git a/fs/read_write.c b/fs/read_write.c index 1ea862bc7efd..9cf1de855b7a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -415,17 +415,15 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, } EXPORT_SYMBOL(__vfs_read); -int kernel_read(struct file *file, loff_t offset, char *addr, - unsigned long count) +ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; - loff_t pos = offset; - int result; + ssize_t result; old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - result = vfs_read(file, (void __user *)addr, count, &pos); + result = vfs_read(file, (void __user *)buf, count, pos); set_fs(old_fs); return result; } diff --git a/include/linux/fs.h b/include/linux/fs.h index cbfe127bccf8..2ba8f38a4d63 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2772,13 +2772,13 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id) return kernel_read_file_str[id]; } -extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern int kernel_read_file(struct file *, void **, loff_t *, loff_t, enum kernel_read_file_id); extern int kernel_read_file_from_path(char *, void **, loff_t *, loff_t, enum kernel_read_file_id); extern int 
kernel_read_file_from_fd(int, void **, loff_t *, loff_t, enum kernel_read_file_id); +extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t); extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern struct file * open_exec(const char *); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 02e1859f2ca8..243fa1c28b4a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -986,8 +986,9 @@ static ssize_t bin_intvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1057,8 +1058,9 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = oldlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; - result = kernel_read(file, 0, buffer, BUFSZ - 1); + result = kernel_read(file, buffer, BUFSZ - 1, &pos); if (result < 0) goto out_kfree; @@ -1120,8 +1122,9 @@ static ssize_t bin_uuid(struct file *file, if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; uuid_t uuid; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; @@ -1154,8 +1157,9 @@ static ssize_t bin_dn_node_address(struct file *file, char buf[15], *nodep; unsigned long area, node; __le16 dnaddr; + loff_t pos = 0; - result = kernel_read(file, 0, buf, sizeof(buf) - 1); + result = kernel_read(file, buf, sizeof(buf) - 1, &pos); if (result < 0) goto out; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index ddfa86648f95..f12815777beb 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -272,6 +272,7 @@ static int p9_fd_read(struct p9_client *client, void *v, int len) { int ret; struct p9_trans_fd *ts = NULL; + loff_t pos; if (client && client->status != Disconnected) ts = client->trans; @@ -282,7 +283,8 @@ static int p9_fd_read(struct p9_client *client, void *v, int len) if (!(ts->rd->f_flags & O_NONBLOCK)) p9_debug(P9_DEBUG_ERROR, "blocking read ...\n"); - ret = kernel_read(ts->rd, ts->rd->f_pos, v, len); + pos = ts->rd->f_pos; + ret = kernel_read(ts->rd, v, len, &pos); if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) client->status = Disconnected; return ret; diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 835c1ab30d01..9f4c86cade8e 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -295,6 +295,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) u8 *data; u8 *enckey = (u8 *)key->payload.data[big_key_data]; size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + loff_t pos = 0; data = kmalloc(enclen, GFP_KERNEL); if (!data) @@ -307,7 +308,7 @@ long big_key_read(const struct key *key, char __user *buffer, size_t buflen) } /* read file to kernel and decrypt */ - ret = kernel_read(file, 0, data, enclen); + ret = kernel_read(file, data, enclen, &pos); if (ret >= 0 && ret != enclen) { ret = -EIO; goto err_fput; -- cgit v1.2.3 From e13ec939e96b13e664bb6cee361cc976a0ee621a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:14 +0200 Subject: fs: fix kernel_write prototype Make the position an in/out argument like all the other read/write helpers and make the buf argument a void pointer.
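As a minimal sketch of the new calling convention (kernel_write_all below is a hypothetical helper, not part of this patch): since the position is now passed by reference and updated on return, a caller can resume after a short write without recomputing the offset, and the void pointer removes the casts:

static int kernel_write_all(struct file *file, const void *buf,
			    size_t count, loff_t *pos)
{
	while (count) {
		/* a successful kernel_write() advances *pos by ret */
		ssize_t ret = kernel_write(file, buf, count, pos);

		if (ret < 0)
			return ret;
		if (ret == 0)
			return -EIO;
		buf += ret;	/* void * arithmetic, a GNU C extension the kernel uses */
		count -= ret;
	}
	return 0;
}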
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- drivers/mtd/nand/nandsim.c | 2 +- drivers/target/target_core_alua.c | 3 ++- drivers/target/target_core_file.c | 2 +- drivers/target/target_core_pr.c | 3 ++- fs/ecryptfs/read_write.c | 2 +- fs/read_write.c | 6 +++--- include/linux/fs.h | 2 +- kernel/sysctl_binary.c | 9 ++++++--- security/keys/big_key.c | 3 ++- 9 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index a8089656879a..3300a77667fb 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -1395,7 +1395,7 @@ static ssize_t write_file(struct nandsim *ns, struct file *file, void *buf, size if (err) return err; noreclaim_flag = memalloc_noreclaim_save(); - tx = kernel_write(file, buf, count, pos); + tx = kernel_write(file, buf, count, &pos); memalloc_noreclaim_restore(noreclaim_flag); put_pages(ns); return tx; diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index a91b7c25ffd4..928127642574 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -896,13 +896,14 @@ static int core_alua_write_tpg_metadata( u32 md_buf_len) { struct file *file = filp_open(path, O_RDWR | O_CREAT | O_TRUNC, 0600); + loff_t pos = 0; int ret; if (IS_ERR(file)) { pr_err("filp_open(%s) for ALUA metadata failed\n", path); return -ENODEV; } - ret = kernel_write(file, md_buf, md_buf_len, 0); + ret = kernel_write(file, md_buf, md_buf_len, &pos); if (ret < 0) pr_err("Error writing ALUA metadata file: %s\n", path); fput(file); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 24cf11d9e50a..c629817a8854 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -443,7 +443,7 @@ fd_do_prot_fill(struct se_device *se_dev, sector_t lba, sector_t nolb, for (prot = 0; prot < prot_length;) { sector_t len = min_t(sector_t, bufsize, prot_length - prot); - ssize_t ret = kernel_write(prot_fd, buf, len, pos + prot); + ssize_t ret = kernel_write(prot_fd, buf, len, &pos); if (ret != len) { pr_err("vfs_write to prot file failed: %zd\n", ret); diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 6d5def64db61..dd2cd8048582 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -1974,6 +1974,7 @@ static int __core_scsi3_write_aptpl_to_file( char path[512]; u32 pr_aptpl_buf_len; int ret; + loff_t pos = 0; memset(path, 0, 512); @@ -1993,7 +1994,7 @@ static int __core_scsi3_write_aptpl_to_file( pr_aptpl_buf_len = (strlen(buf) + 1); /* Add extra for NULL */ - ret = kernel_write(file, buf, pr_aptpl_buf_len, 0); + ret = kernel_write(file, buf, pr_aptpl_buf_len, &pos); if (ret < 0) pr_debug("Error writing APTPL metadata file: %s\n", path); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index d8af0e99bfaf..c596e7c03424 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -47,7 +47,7 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; if (!lower_file) return -EIO; - rc = kernel_write(lower_file, data, size, offset); + rc = kernel_write(lower_file, data, size, &offset); mark_inode_dirty_sync(ecryptfs_inode); return rc; } diff --git a/fs/read_write.c b/fs/read_write.c index 9cf1de855b7a..9f3aeb101d73 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -512,8 +512,8 @@ ssize_t __kernel_write(struct file *file, 
const char *buf, size_t count, loff_t } EXPORT_SYMBOL(__kernel_write); -ssize_t kernel_write(struct file *file, const char *buf, size_t count, - loff_t pos) +ssize_t kernel_write(struct file *file, const void *buf, size_t count, + loff_t *pos) { mm_segment_t old_fs; ssize_t res; @@ -521,7 +521,7 @@ ssize_t kernel_write(struct file *file, const char *buf, size_t count, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_write(file, (__force const char __user *)buf, count, &pos); + res = vfs_write(file, (__force const char __user *)buf, count, pos); set_fs(old_fs); return res; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2ba8f38a4d63..0e7d3da8a307 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2779,7 +2779,7 @@ extern int kernel_read_file_from_path(char *, void **, loff_t *, loff_t, extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t, enum kernel_read_file_id); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); -extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t); +extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern struct file * open_exec(const char *); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 243fa1c28b4a..58ea8c03662e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1017,6 +1017,7 @@ static ssize_t bin_intvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1030,7 +1031,7 @@ static ssize_t bin_intvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1089,6 +1090,7 @@ static ssize_t bin_ulongvec(struct file *file, size_t length = newlen / sizeof(*vec); char *str, *end; int i; + loff_t pos = 0; str = buffer; end = str + BUFSZ; @@ -1102,7 +1104,7 @@ static ssize_t bin_ulongvec(struct file *file, str += scnprintf(str, end - str, "%lu\t", value); } - result = kernel_write(file, buffer, str - buffer, 0); + result = kernel_write(file, buffer, str - buffer, &pos); if (result < 0) goto out_kfree; } @@ -1192,6 +1194,7 @@ static ssize_t bin_dn_node_address(struct file *file, __le16 dnaddr; char buf[15]; int len; + loff_t pos = 0; result = -EINVAL; if (newlen != sizeof(dnaddr)) @@ -1205,7 +1208,7 @@ static ssize_t bin_dn_node_address(struct file *file, le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); - result = kernel_write(file, buf, len, 0); + result = kernel_write(file, buf, len, &pos); if (result < 0) goto out; } diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 9f4c86cade8e..6acb00f6f22c 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -147,6 +147,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) * File content is stored encrypted with randomly generated key. 
*/ size_t enclen = ALIGN(datalen, crypto_skcipher_blocksize(big_key_skcipher)); + loff_t pos = 0; /* prepare aligned data to encrypt */ data = kmalloc(enclen, GFP_KERNEL); @@ -179,7 +180,7 @@ int big_key_preparse(struct key_preparsed_payload *prep) goto err_enckey; } - written = kernel_write(file, data, enclen, 0); + written = kernel_write(file, data, enclen, &pos); if (written != enclen) { ret = written; if (written >= 0) -- cgit v1.2.3 From 73e18f7c0b3e1432353cdd86672c27cace7e6a7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:15 +0200 Subject: fs: make the buf argument to __kernel_write a void pointer This matches kernel_read and kernel_write and avoids any need for casts in the callers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/read_write.c | 2 +- include/linux/fs.h | 2 +- kernel/acct.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/read_write.c b/fs/read_write.c index 9f3aeb101d73..7415f94511e5 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -487,7 +487,7 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, } EXPORT_SYMBOL(__vfs_write); -ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; const char __user *p; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e7d3da8a307..9ab7e2bf7dd1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2780,7 +2780,7 @@ extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t, enum kernel_read_file_id); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); -extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); +extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ diff --git a/kernel/acct.c b/kernel/acct.c index 5b1284370367..5e72af29ab73 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -516,7 +516,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) if (file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + __kernel_write(file, &ac, sizeof(acct_t), &pos); file_end_write(file); } out: -- cgit v1.2.3 From eb031849d52e61d24ba54e9d27553189ff328174 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:23 +0200 Subject: fs: unexport __vfs_read/__vfs_write No modular users left, and any new ones should use kernel_read/write or iov_iter variants instead. 
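A hedged sketch of the suggested conversion (legacy_read is a hypothetical example, not a caller touched by this patch): a module that open-coded the set_fs() dance around __vfs_read collapses into one kernel_read() call, which performs the address-limit switch internally:

/* before: only possible while __vfs_read was exported */
static ssize_t legacy_read(struct file *file, void *buf, size_t len, loff_t *pos)
{
	mm_segment_t old_fs = get_fs();
	ssize_t ret;

	set_fs(get_ds());
	/* cast is valid only because of the set_fs() above */
	ret = __vfs_read(file, (void __user *)buf, len, pos);
	set_fs(old_fs);
	return ret;
}

/* after: the helper hides the set_fs()/cast */
ret = kernel_read(file, buf, len, &pos);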
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/read_write.c | 2 -- include/linux/fs.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/fs/read_write.c b/fs/read_write.c index 7415f94511e5..49450c642298 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -413,7 +413,6 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, else return -EINVAL; } -EXPORT_SYMBOL(__vfs_read); ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { @@ -485,7 +484,6 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, else return -EINVAL; } -EXPORT_SYMBOL(__vfs_write); ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ab7e2bf7dd1..f24b821cee8d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1754,7 +1754,6 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, struct iovec **ret_pointer); extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *); -extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, -- cgit v1.2.3 From 9725d4cef62229b4ec4c912e0db0761e7d400650 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Sep 2017 17:39:25 +0200 Subject: fs: unexport vfs_readv and vfs_writev We've got no modular users left, and any potential modular user is better off with iov_iter based variants. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/read_write.c | 4 +--- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/fs/read_write.c b/fs/read_write.c index f5cfce243cef..cbdccaf032c6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -982,9 +982,8 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, return ret; } -EXPORT_SYMBOL(vfs_readv); -ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, +static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, int flags) { struct iovec iovstack[UIO_FASTIOV]; @@ -1001,7 +1000,6 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, } return ret; } -EXPORT_SYMBOL(vfs_writev); static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, int flags) diff --git a/include/linux/fs.h b/include/linux/fs.h index f24b821cee8d..355cf02830a0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1758,8 +1758,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, unsigned long, loff_t *, int); -extern ssize_t vfs_writev(struct file *, const struct iovec __user *, - unsigned long, loff_t *, int); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, -- cgit v1.2.3 From ec58871fb9c50429d6b5570066a7166da8faf086 Mon Sep 17 00:00:00 2001 From: Guodong Xu Date: Thu, 20 Jul 2017 15:32:42 +0800 Subject: mfd: hi6421-pmic: Add support for HiSilicon Hi6421v530 Add support for HiSilicon
Hi6421v530 PMIC. Hi6421v530 communicates with main SoC via memory-mapped I/O. Hi6421v530 and Hi6421 are PMIC chips from the same vendor, HiSilicon, but at different revisions. They share the same memory-mapped I/O design. They differ in integrated devices, such as regulator details, LDO voltage points. Signed-off-by: Guodong Xu Signed-off-by: Wang Xiaoyin Acked-by: Arnd Bergmann Signed-off-by: Lee Jones --- drivers/mfd/hi6421-pmic-core.c | 70 ++++++++++++++++++++++++++++++----------- include/linux/mfd/hi6421-pmic.h | 5 +++ 2 files changed, 57 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/mfd/hi6421-pmic-core.c b/drivers/mfd/hi6421-pmic-core.c index b1139d4b0048..6fb7ba272e09 100644 --- a/drivers/mfd/hi6421-pmic-core.c +++ b/drivers/mfd/hi6421-pmic-core.c @@ -1,9 +1,9 @@ /* - * Device driver for Hi6421 IC + * Device driver for Hi6421 PMIC * * Copyright (c) <2011-2014> HiSilicon Technologies Co., Ltd. * http://www.hisilicon.com - * Copyright (c) <2013-2014> Linaro Ltd. + * Copyright (c) <2013-2017> Linaro Ltd. * http://www.linaro.org * * Author: Guodong Xu @@ -16,16 +16,20 @@ #include #include #include +#include #include -#include +#include #include #include -#include static const struct mfd_cell hi6421_devs[] = { { .name = "hi6421-regulator", }, }; +static const struct mfd_cell hi6421v530_devs[] = { + { .name = "hi6421v530-regulator", }, +}; + static const struct regmap_config hi6421_regmap_config = { .reg_bits = 32, .reg_stride = 4, @@ -33,12 +37,33 @@ static const struct regmap_config hi6421_regmap_config = { .max_register = HI6421_REG_TO_BUS_ADDR(HI6421_REG_MAX), }; +static const struct of_device_id of_hi6421_pmic_match[] = { + { + .compatible = "hisilicon,hi6421-pmic", + .data = (void *)HI6421 + }, + { + .compatible = "hisilicon,hi6421v530-pmic", + .data = (void *)HI6421_V530 + }, + { }, +}; +MODULE_DEVICE_TABLE(of, of_hi6421_pmic_match); + static int hi6421_pmic_probe(struct platform_device *pdev) { struct hi6421_pmic *pmic; struct resource *res; + const struct of_device_id *id; + const struct mfd_cell *subdevs; + enum hi6421_type type; void __iomem *base; - int ret; + int n_subdevs, ret; + + id = of_match_device(of_hi6421_pmic_match, &pdev->dev); + if (!id) + return -EINVAL; + type = (enum hi6421_type)id->data; pmic = devm_kzalloc(&pdev->dev, sizeof(*pmic), GFP_KERNEL); if (!pmic) @@ -57,18 +82,33 @@ static int hi6421_pmic_probe(struct platform_device *pdev) return PTR_ERR(pmic->regmap); } - /* set over-current protection debounce 8ms */ - regmap_update_bits(pmic->regmap, HI6421_OCP_DEB_CTRL_REG, + platform_set_drvdata(pdev, pmic); + + switch (type) { + case HI6421: + /* set over-current protection debounce 8ms */ + regmap_update_bits(pmic->regmap, HI6421_OCP_DEB_CTRL_REG, (HI6421_OCP_DEB_SEL_MASK | HI6421_OCP_EN_DEBOUNCE_MASK | HI6421_OCP_AUTO_STOP_MASK), (HI6421_OCP_DEB_SEL_8MS | HI6421_OCP_EN_DEBOUNCE_ENABLE)); - platform_set_drvdata(pdev, pmic); + subdevs = hi6421_devs; + n_subdevs = ARRAY_SIZE(hi6421_devs); + break; + case HI6421_V530: + subdevs = hi6421v530_devs; + n_subdevs = ARRAY_SIZE(hi6421v530_devs); + break; + default: + dev_err(&pdev->dev, "Unknown device type %d\n", + (unsigned int)type); + return -EINVAL; + } - ret = devm_mfd_add_devices(&pdev->dev, 0, hi6421_devs, - ARRAY_SIZE(hi6421_devs), NULL, 0, NULL); + ret = devm_mfd_add_devices(&pdev->dev, PLATFORM_DEVID_NONE, + subdevs, n_subdevs, NULL, 0, NULL); if (ret) { dev_err(&pdev->dev, "Failed to add child devices: %d\n", ret); return ret; @@ -77,16 +117,10 @@ static int 
hi6421_pmic_probe(struct platform_device *pdev) return 0; } -static const struct of_device_id of_hi6421_pmic_match_tbl[] = { - { .compatible = "hisilicon,hi6421-pmic", }, - { }, -}; -MODULE_DEVICE_TABLE(of, of_hi6421_pmic_match_tbl); - static struct platform_driver hi6421_pmic_driver = { .driver = { - .name = "hi6421_pmic", - .of_match_table = of_hi6421_pmic_match_tbl, + .name = "hi6421_pmic", + .of_match_table = of_hi6421_pmic_match, }, .probe = hi6421_pmic_probe, }; diff --git a/include/linux/mfd/hi6421-pmic.h b/include/linux/mfd/hi6421-pmic.h index 587273e35acf..2580c08db7b1 100644 --- a/include/linux/mfd/hi6421-pmic.h +++ b/include/linux/mfd/hi6421-pmic.h @@ -38,4 +38,9 @@ struct hi6421_pmic { struct regmap *regmap; }; +enum hi6421_type { + HI6421 = 0, + HI6421_V530, +}; + #endif /* __HI6421_PMIC_H */ -- cgit v1.2.3 From d3ea212720948acff862b4c842d5b464ad338841 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Mon, 17 Jul 2017 22:45:12 +0200 Subject: mfd: Add ROHM BD9571MWV-M MFD PMIC driver Add the MFD part of the ROHM BD9571MWV-M PMIC driver and MAINTAINERS entry. The MFD part only specifies the regmap bits for the PMIC and binds the subdevs together. Signed-off-by: Marek Vasut Signed-off-by: Lee Jones --- MAINTAINERS | 11 ++ drivers/mfd/Kconfig | 14 +++ drivers/mfd/Makefile | 1 + drivers/mfd/bd9571mwv.c | 230 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/bd9571mwv.h | 115 +++++++++++++++++++++ 5 files changed, 371 insertions(+) create mode 100644 drivers/mfd/bd9571mwv.c create mode 100644 include/linux/mfd/bd9571mwv.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 1c3feffb1c1c..0876e3e2b8e9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11280,6 +11280,17 @@ L: linux-serial@vger.kernel.org S: Odd Fixes F: drivers/tty/serial/rp2.* +ROHM MULTIFUNCTION BD9571MWV-M PMIC DEVICE DRIVERS +M: Marek Vasut +L: linux-kernel@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Supported +F: drivers/mfd/bd9571mwv.c +F: drivers/regulator/bd9571mwv-regulator.c +F: drivers/gpio/gpio-bd9571mwv.c +F: include/linux/mfd/bd9571mwv.h +F: Documentation/devicetree/bindings/mfd/bd9571mwv.txt + ROSE NETWORK LAYER M: Ralf Baechle L: linux-hams@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 42fb90ae3345..79e4ec6f19a8 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -133,6 +133,20 @@ config MFD_BCM590XX help Support for the BCM590xx PMUs from Broadcom +config MFD_BD9571MWV + tristate "ROHM BD9571MWV PMIC" + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + depends on I2C + help + Support for the ROHM BD9571MWV PMIC, which contains single + voltage regulator, voltage sampling units, GPIO block and + watchdog block. + + This driver can also be built as a module. If so, the module + will be called bd9571mwv. 
+ config MFD_AC100 tristate "X-Powers AC100" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index b80b1a314ca5..21e19a5f3f3c 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_MFD_ACT8945A) += act8945a.o obj-$(CONFIG_MFD_SM501) += sm501.o obj-$(CONFIG_MFD_ASIC3) += asic3.o tmio_core.o obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o +obj-$(CONFIG_MFD_BD9571MWV) += bd9571mwv.o cros_ec_core-objs := cros_ec.o cros_ec_core-$(CONFIG_ACPI) += cros_ec_acpi_gpe.o obj-$(CONFIG_MFD_CROS_EC) += cros_ec_core.o diff --git a/drivers/mfd/bd9571mwv.c b/drivers/mfd/bd9571mwv.c new file mode 100644 index 000000000000..64e088dfe7b0 --- /dev/null +++ b/drivers/mfd/bd9571mwv.c @@ -0,0 +1,230 @@ +/* + * ROHM BD9571MWV-M MFD driver + * + * Copyright (C) 2017 Marek Vasut + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether expressed or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. + * + * Based on the TPS65086 driver + */ + +#include +#include +#include +#include + +#include + +static const struct mfd_cell bd9571mwv_cells[] = { + { .name = "bd9571mwv-regulator", }, + { .name = "bd9571mwv-gpio", }, +}; + +static const struct regmap_range bd9571mwv_readable_yes_ranges[] = { + regmap_reg_range(BD9571MWV_VENDOR_CODE, BD9571MWV_PRODUCT_REVISION), + regmap_reg_range(BD9571MWV_AVS_SET_MONI, BD9571MWV_AVS_DVFS_VID(3)), + regmap_reg_range(BD9571MWV_VD18_VID, BD9571MWV_VD33_VID), + regmap_reg_range(BD9571MWV_DVFS_VINIT, BD9571MWV_DVFS_VINIT), + regmap_reg_range(BD9571MWV_DVFS_SETVMAX, BD9571MWV_DVFS_MONIVDAC), + regmap_reg_range(BD9571MWV_GPIO_IN, BD9571MWV_GPIO_IN), + regmap_reg_range(BD9571MWV_GPIO_INT, BD9571MWV_GPIO_INTMASK), + regmap_reg_range(BD9571MWV_INT_INTREQ, BD9571MWV_INT_INTMASK), +}; + +static const struct regmap_access_table bd9571mwv_readable_table = { + .yes_ranges = bd9571mwv_readable_yes_ranges, + .n_yes_ranges = ARRAY_SIZE(bd9571mwv_readable_yes_ranges), +}; + +static const struct regmap_range bd9571mwv_writable_yes_ranges[] = { + regmap_reg_range(BD9571MWV_AVS_VD09_VID(0), BD9571MWV_AVS_VD09_VID(3)), + regmap_reg_range(BD9571MWV_DVFS_SETVID, BD9571MWV_DVFS_SETVID), + regmap_reg_range(BD9571MWV_GPIO_DIR, BD9571MWV_GPIO_OUT), + regmap_reg_range(BD9571MWV_GPIO_INT_SET, BD9571MWV_GPIO_INTMASK), + regmap_reg_range(BD9571MWV_INT_INTREQ, BD9571MWV_INT_INTMASK), +}; + +static const struct regmap_access_table bd9571mwv_writable_table = { + .yes_ranges = bd9571mwv_writable_yes_ranges, + .n_yes_ranges = ARRAY_SIZE(bd9571mwv_writable_yes_ranges), +}; + +static const struct regmap_range bd9571mwv_volatile_yes_ranges[] = { + regmap_reg_range(BD9571MWV_GPIO_IN, BD9571MWV_GPIO_IN), + regmap_reg_range(BD9571MWV_GPIO_INT, BD9571MWV_GPIO_INT), + regmap_reg_range(BD9571MWV_INT_INTREQ, BD9571MWV_INT_INTREQ), +}; + +static const struct regmap_access_table bd9571mwv_volatile_table = { + .yes_ranges = bd9571mwv_volatile_yes_ranges, + .n_yes_ranges = ARRAY_SIZE(bd9571mwv_volatile_yes_ranges), +}; + +static const struct regmap_config bd9571mwv_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .cache_type = REGCACHE_RBTREE, + .rd_table = &bd9571mwv_readable_table, + .wr_table = &bd9571mwv_writable_table, + 
.volatile_table = &bd9571mwv_volatile_table, + .max_register = 0xff, +}; + +static const struct regmap_irq bd9571mwv_irqs[] = { + REGMAP_IRQ_REG(BD9571MWV_IRQ_MD1, 0, + BD9571MWV_INT_INTREQ_MD1_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_MD2_E1, 0, + BD9571MWV_INT_INTREQ_MD2_E1_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_MD2_E2, 0, + BD9571MWV_INT_INTREQ_MD2_E2_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_PROT_ERR, 0, + BD9571MWV_INT_INTREQ_PROT_ERR_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_GP, 0, + BD9571MWV_INT_INTREQ_GP_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_128H_OF, 0, + BD9571MWV_INT_INTREQ_128H_OF_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_WDT_OF, 0, + BD9571MWV_INT_INTREQ_WDT_OF_INT), + REGMAP_IRQ_REG(BD9571MWV_IRQ_BKUP_TRG, 0, + BD9571MWV_INT_INTREQ_BKUP_TRG_INT), +}; + +static struct regmap_irq_chip bd9571mwv_irq_chip = { + .name = "bd9571mwv", + .status_base = BD9571MWV_INT_INTREQ, + .mask_base = BD9571MWV_INT_INTMASK, + .ack_base = BD9571MWV_INT_INTREQ, + .init_ack_masked = true, + .num_regs = 1, + .irqs = bd9571mwv_irqs, + .num_irqs = ARRAY_SIZE(bd9571mwv_irqs), +}; + +static int bd9571mwv_identify(struct bd9571mwv *bd) +{ + struct device *dev = bd->dev; + unsigned int value; + int ret; + + ret = regmap_read(bd->regmap, BD9571MWV_VENDOR_CODE, &value); + if (ret) { + dev_err(dev, "Failed to read vendor code register (ret=%i)\n", + ret); + return ret; + } + + if (value != BD9571MWV_VENDOR_CODE_VAL) { + dev_err(dev, "Invalid vendor code ID %02x (expected %02x)\n", + value, BD9571MWV_VENDOR_CODE_VAL); + return -EINVAL; + } + + ret = regmap_read(bd->regmap, BD9571MWV_PRODUCT_CODE, &value); + if (ret) { + dev_err(dev, "Failed to read product code register (ret=%i)\n", + ret); + return ret; + } + + if (value != BD9571MWV_PRODUCT_CODE_VAL) { + dev_err(dev, "Invalid product code ID %02x (expected %02x)\n", + value, BD9571MWV_PRODUCT_CODE_VAL); + return -EINVAL; + } + + ret = regmap_read(bd->regmap, BD9571MWV_PRODUCT_REVISION, &value); + if (ret) { + dev_err(dev, "Failed to read revision register (ret=%i)\n", + ret); + return ret; + } + + dev_info(dev, "Device: BD9571MWV rev. 
%d\n", value & 0xff); + + return 0; +} + +static int bd9571mwv_probe(struct i2c_client *client, + const struct i2c_device_id *ids) +{ + struct bd9571mwv *bd; + int ret; + + bd = devm_kzalloc(&client->dev, sizeof(*bd), GFP_KERNEL); + if (!bd) + return -ENOMEM; + + i2c_set_clientdata(client, bd); + bd->dev = &client->dev; + bd->irq = client->irq; + + bd->regmap = devm_regmap_init_i2c(client, &bd9571mwv_regmap_config); + if (IS_ERR(bd->regmap)) { + dev_err(bd->dev, "Failed to initialize register map\n"); + return PTR_ERR(bd->regmap); + } + + ret = bd9571mwv_identify(bd); + if (ret) + return ret; + + ret = regmap_add_irq_chip(bd->regmap, bd->irq, IRQF_ONESHOT, 0, + &bd9571mwv_irq_chip, &bd->irq_data); + if (ret) { + dev_err(bd->dev, "Failed to register IRQ chip\n"); + return ret; + } + + ret = mfd_add_devices(bd->dev, PLATFORM_DEVID_AUTO, bd9571mwv_cells, + ARRAY_SIZE(bd9571mwv_cells), NULL, 0, + regmap_irq_get_domain(bd->irq_data)); + if (ret) { + regmap_del_irq_chip(bd->irq, bd->irq_data); + return ret; + } + + return 0; +} + +static int bd9571mwv_remove(struct i2c_client *client) +{ + struct bd9571mwv *bd = i2c_get_clientdata(client); + + regmap_del_irq_chip(bd->irq, bd->irq_data); + + return 0; +} + +static const struct of_device_id bd9571mwv_of_match_table[] = { + { .compatible = "rohm,bd9571mwv", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, bd9571mwv_of_match_table); + +static const struct i2c_device_id bd9571mwv_id_table[] = { + { "bd9571mwv", 0 }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, bd9571mwv_id_table); + +static struct i2c_driver bd9571mwv_driver = { + .driver = { + .name = "bd9571mwv", + .of_match_table = bd9571mwv_of_match_table, + }, + .probe = bd9571mwv_probe, + .remove = bd9571mwv_remove, + .id_table = bd9571mwv_id_table, +}; +module_i2c_driver(bd9571mwv_driver); + +MODULE_AUTHOR("Marek Vasut "); +MODULE_DESCRIPTION("BD9571MWV PMIC Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/bd9571mwv.h b/include/linux/mfd/bd9571mwv.h new file mode 100644 index 000000000000..f0708ba4cbba --- /dev/null +++ b/include/linux/mfd/bd9571mwv.h @@ -0,0 +1,115 @@ +/* + * ROHM BD9571MWV-M driver + * + * Copyright (C) 2017 Marek Vasut + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether expressed or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. 
+ * + * Based on the TPS65086 driver + */ + +#ifndef __LINUX_MFD_BD9571MWV_H +#define __LINUX_MFD_BD9571MWV_H + +#include +#include + +/* List of registers for BD9571MWV */ +#define BD9571MWV_VENDOR_CODE 0x00 +#define BD9571MWV_VENDOR_CODE_VAL 0xdb +#define BD9571MWV_PRODUCT_CODE 0x01 +#define BD9571MWV_PRODUCT_CODE_VAL 0x60 +#define BD9571MWV_PRODUCT_REVISION 0x02 + +#define BD9571MWV_I2C_FUSA_MODE 0x10 +#define BD9571MWV_I2C_MD2_E1_BIT_1 0x11 +#define BD9571MWV_I2C_MD2_E1_BIT_2 0x12 + +#define BD9571MWV_BKUP_MODE_CNT 0x20 +#define BD9571MWV_BKUP_MODE_STATUS 0x21 +#define BD9571MWV_BKUP_RECOVERY_CNT 0x22 +#define BD9571MWV_BKUP_CTRL_TIM_CNT 0x23 +#define BD9571MWV_WAITBKUP_WDT_CNT 0x24 +#define BD9571MWV_128H_TIM_CNT 0x26 +#define BD9571MWV_QLLM_CNT 0x27 + +#define BD9571MWV_AVS_SET_MONI 0x31 +#define BD9571MWV_AVS_SET_MONI_MASK 0x3 +#define BD9571MWV_AVS_VD09_VID(n) (0x32 + (n)) +#define BD9571MWV_AVS_DVFS_VID(n) (0x36 + (n)) + +#define BD9571MWV_VD18_VID 0x42 +#define BD9571MWV_VD25_VID 0x43 +#define BD9571MWV_VD33_VID 0x44 + +#define BD9571MWV_DVFS_VINIT 0x50 +#define BD9571MWV_DVFS_SETVMAX 0x52 +#define BD9571MWV_DVFS_BOOSTVID 0x53 +#define BD9571MWV_DVFS_SETVID 0x54 +#define BD9571MWV_DVFS_MONIVDAC 0x55 +#define BD9571MWV_DVFS_PGD_CNT 0x56 + +#define BD9571MWV_GPIO_DIR 0x60 +#define BD9571MWV_GPIO_OUT 0x61 +#define BD9571MWV_GPIO_IN 0x62 +#define BD9571MWV_GPIO_DEB 0x63 +#define BD9571MWV_GPIO_INT_SET 0x64 +#define BD9571MWV_GPIO_INT 0x65 +#define BD9571MWV_GPIO_INTMASK 0x66 + +#define BD9571MWV_REG_KEEP(n) (0x70 + (n)) + +#define BD9571MWV_PMIC_INTERNAL_STATUS 0x80 +#define BD9571MWV_PROT_ERROR_STATUS0 0x81 +#define BD9571MWV_PROT_ERROR_STATUS1 0x82 +#define BD9571MWV_PROT_ERROR_STATUS2 0x83 +#define BD9571MWV_PROT_ERROR_STATUS3 0x84 +#define BD9571MWV_PROT_ERROR_STATUS4 0x85 + +#define BD9571MWV_INT_INTREQ 0x90 +#define BD9571MWV_INT_INTREQ_MD1_INT BIT(0) +#define BD9571MWV_INT_INTREQ_MD2_E1_INT BIT(1) +#define BD9571MWV_INT_INTREQ_MD2_E2_INT BIT(2) +#define BD9571MWV_INT_INTREQ_PROT_ERR_INT BIT(3) +#define BD9571MWV_INT_INTREQ_GP_INT BIT(4) +#define BD9571MWV_INT_INTREQ_128H_OF_INT BIT(5) +#define BD9571MWV_INT_INTREQ_WDT_OF_INT BIT(6) +#define BD9571MWV_INT_INTREQ_BKUP_TRG_INT BIT(7) +#define BD9571MWV_INT_INTMASK 0x91 + +#define BD9571MWV_ACCESS_KEY 0xff + +/* Define the BD9571MWV IRQ numbers */ +enum bd9571mwv_irqs { + BD9571MWV_IRQ_MD1, + BD9571MWV_IRQ_MD2_E1, + BD9571MWV_IRQ_MD2_E2, + BD9571MWV_IRQ_PROT_ERR, + BD9571MWV_IRQ_GP, + BD9571MWV_IRQ_128H_OF, + BD9571MWV_IRQ_WDT_OF, + BD9571MWV_IRQ_BKUP_TRG, +}; + +/** + * struct bd9571mwv - state holder for the bd9571mwv driver + * + * Device data may be used to access the BD9571MWV chip + */ +struct bd9571mwv { + struct device *dev; + struct regmap *regmap; + + /* IRQ Data */ + int irq; + struct regmap_irq_chip_data *irq_data; +}; + +#endif /* __LINUX_MFD_BD9571MWV_H */ -- cgit v1.2.3 From 7303733a6ca2a68b210ebdc09cace8b0ffe8b179 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Wed, 26 Jul 2017 16:28:26 +0800 Subject: mfd: axp20x: Add support for AXP813 PMIC The X-Powers AXP813 PMIC is normally used with Allwinner's A83T SoC. It has the same range of functions as other X-Powers PMICs, such as DC-DC buck converter and linear regulator outputs, AC-IN and VBUS power supplies, power button trigger, GPIOs, ADCs, and a battery charger. Note that the IRQ table given in the datasheet is incorrect: in IRQ enable/status registers 1, there are separate IRQs for ACIN and VBUS, instead of bits [7:5] being the same as bits [4:2]. 
So it shares the same IRQs as the AXP803, rather than the AXP288. This patch adds basic mfd support for it, with only the power button enabled. Signed-off-by: Chen-Yu Tsai Acked-by: Maxime Ripard Signed-off-by: Lee Jones --- drivers/mfd/axp20x-rsb.c | 1 + drivers/mfd/axp20x.c | 22 ++++++++++++++++++++++ include/linux/mfd/axp20x.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+) (limited to 'include') diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c index fd5c7267b136..7ddbd9e8dd03 100644 --- a/drivers/mfd/axp20x-rsb.c +++ b/drivers/mfd/axp20x-rsb.c @@ -64,6 +64,7 @@ static const struct of_device_id axp20x_rsb_of_match[] = { { .compatible = "x-powers,axp803", .data = (void *)AXP803_ID }, { .compatible = "x-powers,axp806", .data = (void *)AXP806_ID }, { .compatible = "x-powers,axp809", .data = (void *)AXP809_ID }, + { .compatible = "x-powers,axp813", .data = (void *)AXP813_ID }, { }, }; MODULE_DEVICE_TABLE(of, axp20x_rsb_of_match); diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index 917b6ddc4f15..ec4271ca065c 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -44,6 +44,7 @@ static const char * const axp20x_model_names[] = { "AXP803", "AXP806", "AXP809", + "AXP813", }; static const struct regmap_range axp152_writeable_ranges[] = { @@ -870,6 +871,14 @@ static struct mfd_cell axp809_cells[] = { }, }; +static struct mfd_cell axp813_cells[] = { + { + .name = "axp20x-pek", + .num_resources = ARRAY_SIZE(axp803_pek_resources), + .resources = axp803_pek_resources, + } +}; + static struct axp20x_dev *axp20x_pm_power_off; static void axp20x_power_off(void) { @@ -956,6 +965,19 @@ int axp20x_match_device(struct axp20x_dev *axp20x) axp20x->regmap_cfg = &axp22x_regmap_config; axp20x->regmap_irq_chip = &axp809_regmap_irq_chip; break; + case AXP813_ID: + axp20x->nr_cells = ARRAY_SIZE(axp813_cells); + axp20x->cells = axp813_cells; + axp20x->regmap_cfg = &axp288_regmap_config; + /* + * The IRQ table given in the datasheet is incorrect. + * In IRQ enable/status registers 1, there are separate + * IRQs for ACIN and VBUS, instead of bits [7:5] being + * the same as bits [4:2]. So it shares the same IRQs + * as the AXP803, rather than the AXP288. + */ + axp20x->regmap_irq_chip = &axp803_regmap_irq_chip; + break; default: dev_err(dev, "unsupported AXP20X ID %lu\n", axp20x->variant); return -EINVAL; diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 965b027e31b3..e9c908c4fba8 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -23,6 +23,7 @@ enum axp20x_variants { AXP803_ID, AXP806_ID, AXP809_ID, + AXP813_ID, NR_AXP20X_VARIANTS, }; @@ -387,6 +388,34 @@ enum { AXP803_REG_ID_MAX, }; +enum { + AXP813_DCDC1 = 0, + AXP813_DCDC2, + AXP813_DCDC3, + AXP813_DCDC4, + AXP813_DCDC5, + AXP813_DCDC6, + AXP813_DCDC7, + AXP813_ALDO1, + AXP813_ALDO2, + AXP813_ALDO3, + AXP813_DLDO1, + AXP813_DLDO2, + AXP813_DLDO3, + AXP813_DLDO4, + AXP813_ELDO1, + AXP813_ELDO2, + AXP813_ELDO3, + AXP813_FLDO1, + AXP813_FLDO2, + AXP813_FLDO3, + AXP813_RTC_LDO, + AXP813_LDO_IO0, + AXP813_LDO_IO1, + AXP813_SW, + AXP813_REG_ID_MAX, +}; + /* IRQs */ enum { AXP152_IRQ_LDO0IN_CONNECT = 1, -- cgit v1.2.3 From b0f3ab20e76499db12b0bbadb5737d9870f10418 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Tue, 18 Jul 2017 15:22:19 +0200 Subject: mfd: syscon: atmel-smc: Add helper to retrieve register layout For HSMC controller, the register layout depends on the device i.e. the offset of setup, pulse, cycle, mode and timings registers is not the same. 
A helper is added to provide the correct register layout. Fixes: fe9d7cb22ef3 ("mfd: syscon: atmel-smc: Add new helpers to ease SMC regs manipulation") Suggested-by: Boris Brezillon Signed-off-by: Ludovic Desroches Acked-by: Boris Brezillon Acked-by: Nicolas Ferre Acked-by: Alexandre Belloni Signed-off-by: Lee Jones --- drivers/memory/atmel-ebi.c | 13 +++++-- drivers/mfd/atmel-smc.c | 67 +++++++++++++++++++++++++------- drivers/mtd/nand/atmel/nand-controller.c | 10 +++-- include/linux/mfd/syscon/atmel-smc.h | 32 ++++++++++----- 4 files changed, 92 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/memory/atmel-ebi.c b/drivers/memory/atmel-ebi.c index ebf69ff48ae2..c00a7c7f460a 100644 --- a/drivers/memory/atmel-ebi.c +++ b/drivers/memory/atmel-ebi.c @@ -51,6 +51,7 @@ struct atmel_ebi { struct { struct regmap *regmap; struct clk *clk; + const struct atmel_hsmc_reg_layout *layout; } smc; struct device *dev; @@ -84,8 +85,8 @@ static void at91sam9_ebi_get_config(struct atmel_ebi_dev *ebid, static void sama5_ebi_get_config(struct atmel_ebi_dev *ebid, struct atmel_ebi_dev_config *conf) { - atmel_hsmc_cs_conf_get(ebid->ebi->smc.regmap, conf->cs, - &conf->smcconf); + atmel_hsmc_cs_conf_get(ebid->ebi->smc.regmap, ebid->ebi->smc.layout, + conf->cs, &conf->smcconf); } static const struct atmel_smc_timing_xlate timings_xlate_table[] = { @@ -287,8 +288,8 @@ static void at91sam9_ebi_apply_config(struct atmel_ebi_dev *ebid, static void sama5_ebi_apply_config(struct atmel_ebi_dev *ebid, struct atmel_ebi_dev_config *conf) { - atmel_hsmc_cs_conf_apply(ebid->ebi->smc.regmap, conf->cs, - &conf->smcconf); + atmel_hsmc_cs_conf_apply(ebid->ebi->smc.regmap, ebid->ebi->smc.layout, + conf->cs, &conf->smcconf); } static int atmel_ebi_dev_setup(struct atmel_ebi *ebi, struct device_node *np, @@ -527,6 +528,10 @@ static int atmel_ebi_probe(struct platform_device *pdev) if (IS_ERR(ebi->smc.regmap)) return PTR_ERR(ebi->smc.regmap); + ebi->smc.layout = atmel_hsmc_get_reg_layout(smc_np); + if (IS_ERR(ebi->smc.layout)) + return PTR_ERR(ebi->smc.layout); + ebi->smc.clk = of_clk_get(smc_np, 0); if (IS_ERR(ebi->smc.clk)) { if (PTR_ERR(ebi->smc.clk) != -ENOENT) diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c index 20cc0ea470fa..7d77948567d7 100644 --- a/drivers/mfd/atmel-smc.c +++ b/drivers/mfd/atmel-smc.c @@ -258,19 +258,21 @@ EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_apply); * atmel_hsmc_cs_conf_apply - apply an SMC CS conf * @regmap: the HSMC regmap * @cs: the CS id + * @layout: the layout of registers + @conf the SMC CS conf to apply * * Applies an SMC CS configuration. * Only valid on post-sama5 SoCs.
*/ -void atmel_hsmc_cs_conf_apply(struct regmap *regmap, int cs, - const struct atmel_smc_cs_conf *conf) +void atmel_hsmc_cs_conf_apply(struct regmap *regmap, + const struct atmel_hsmc_reg_layout *layout, + int cs, const struct atmel_smc_cs_conf *conf) { - regmap_write(regmap, ATMEL_HSMC_SETUP(cs), conf->setup); - regmap_write(regmap, ATMEL_HSMC_PULSE(cs), conf->pulse); - regmap_write(regmap, ATMEL_HSMC_CYCLE(cs), conf->cycle); - regmap_write(regmap, ATMEL_HSMC_TIMINGS(cs), conf->timings); - regmap_write(regmap, ATMEL_HSMC_MODE(cs), conf->mode); + regmap_write(regmap, ATMEL_HSMC_SETUP(layout, cs), conf->setup); + regmap_write(regmap, ATMEL_HSMC_PULSE(layout, cs), conf->pulse); + regmap_write(regmap, ATMEL_HSMC_CYCLE(layout, cs), conf->cycle); + regmap_write(regmap, ATMEL_HSMC_TIMINGS(layout, cs), conf->timings); + regmap_write(regmap, ATMEL_HSMC_MODE(layout, cs), conf->mode); } EXPORT_SYMBOL_GPL(atmel_hsmc_cs_conf_apply); @@ -297,18 +299,55 @@ EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_get); * atmel_hsmc_cs_conf_get - retrieve the current SMC CS conf * @regmap: the HSMC regmap * @cs: the CS id + * @layout: the layout of registers * @conf: the SMC CS conf object to store the current conf * * Retrieve the SMC CS configuration. * Only valid on post-sama5 SoCs. */ -void atmel_hsmc_cs_conf_get(struct regmap *regmap, int cs, - struct atmel_smc_cs_conf *conf) +void atmel_hsmc_cs_conf_get(struct regmap *regmap, + const struct atmel_hsmc_reg_layout *layout, + int cs, struct atmel_smc_cs_conf *conf) { - regmap_read(regmap, ATMEL_HSMC_SETUP(cs), &conf->setup); - regmap_read(regmap, ATMEL_HSMC_PULSE(cs), &conf->pulse); - regmap_read(regmap, ATMEL_HSMC_CYCLE(cs), &conf->cycle); - regmap_read(regmap, ATMEL_HSMC_TIMINGS(cs), &conf->timings); - regmap_read(regmap, ATMEL_HSMC_MODE(cs), &conf->mode); + regmap_read(regmap, ATMEL_HSMC_SETUP(layout, cs), &conf->setup); + regmap_read(regmap, ATMEL_HSMC_PULSE(layout, cs), &conf->pulse); + regmap_read(regmap, ATMEL_HSMC_CYCLE(layout, cs), &conf->cycle); + regmap_read(regmap, ATMEL_HSMC_TIMINGS(layout, cs), &conf->timings); + regmap_read(regmap, ATMEL_HSMC_MODE(layout, cs), &conf->mode); } EXPORT_SYMBOL_GPL(atmel_hsmc_cs_conf_get); + +static const struct atmel_hsmc_reg_layout sama5d3_reg_layout = { + .timing_regs_offset = 0x600, +}; + +static const struct atmel_hsmc_reg_layout sama5d2_reg_layout = { + .timing_regs_offset = 0x700, +}; + +static const struct of_device_id atmel_smc_ids[] = { + { .compatible = "atmel,at91sam9260-smc", .data = NULL }, + { .compatible = "atmel,sama5d3-smc", .data = &sama5d3_reg_layout }, + { .compatible = "atmel,sama5d2-smc", .data = &sama5d2_reg_layout }, + { /* sentinel */ }, +}; + +/** + * atmel_hsmc_get_reg_layout - retrieve the layout of HSMC registers + * @np: the HSMC regmap + * + * Retrieve the layout of HSMC registers. + * + * Returns NULL in case of SMC, a struct atmel_hsmc_reg_layout pointer + * in HSMC case, otherwise ERR_PTR(-EINVAL). + */ +const struct atmel_hsmc_reg_layout * +atmel_hsmc_get_reg_layout(struct device_node *np) +{ + const struct of_device_id *match; + + match = of_match_node(atmel_smc_ids, np); + + return match ? 
match->data : ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL_GPL(atmel_hsmc_get_reg_layout); diff --git a/drivers/mtd/nand/atmel/nand-controller.c b/drivers/mtd/nand/atmel/nand-controller.c index ceec21bd30c4..1913ce18fb1c 100644 --- a/drivers/mtd/nand/atmel/nand-controller.c +++ b/drivers/mtd/nand/atmel/nand-controller.c @@ -247,6 +247,7 @@ struct atmel_hsmc_nand_controller { void __iomem *virt; dma_addr_t dma; } sram; + const struct atmel_hsmc_reg_layout *hsmc_layout; struct regmap *io; struct atmel_nfc_op op; struct completion complete; @@ -1442,12 +1443,12 @@ static int atmel_hsmc_nand_setup_data_interface(struct atmel_nand *nand, int csline, const struct nand_data_interface *conf) { - struct atmel_nand_controller *nc; + struct atmel_hsmc_nand_controller *nc; struct atmel_smc_cs_conf smcconf; struct atmel_nand_cs *cs; int ret; - nc = to_nand_controller(nand->base.controller); + nc = to_hsmc_nand_controller(nand->base.controller); ret = atmel_smc_nand_prepare_smcconf(nand, conf, &smcconf); if (ret) @@ -1462,7 +1463,8 @@ static int atmel_hsmc_nand_setup_data_interface(struct atmel_nand *nand, if (cs->rb.type == ATMEL_NAND_NATIVE_RB) cs->smcconf.timings |= ATMEL_HSMC_TIMINGS_RBNSEL(cs->rb.id); - atmel_hsmc_cs_conf_apply(nc->smc, cs->id, &cs->smcconf); + atmel_hsmc_cs_conf_apply(nc->base.smc, nc->hsmc_layout, cs->id, + &cs->smcconf); return 0; } @@ -2177,6 +2179,8 @@ atmel_hsmc_nand_controller_init(struct atmel_hsmc_nand_controller *nc) return -EINVAL; } + nc->hsmc_layout = atmel_hsmc_get_reg_layout(np); + nc->irq = of_irq_get(np, 0); of_node_put(np); if (nc->irq < 0) { diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h index afa266169800..7a367f34b66a 100644 --- a/include/linux/mfd/syscon/atmel-smc.h +++ b/include/linux/mfd/syscon/atmel-smc.h @@ -15,21 +15,26 @@ #define _LINUX_MFD_SYSCON_ATMEL_SMC_H_ #include +#include #include #define ATMEL_SMC_SETUP(cs) (((cs) * 0x10)) -#define ATMEL_HSMC_SETUP(cs) (0x600 + ((cs) * 0x14)) +#define ATMEL_HSMC_SETUP(layout, cs) \ + ((layout)->timing_regs_offset + ((cs) * 0x14)) #define ATMEL_SMC_PULSE(cs) (((cs) * 0x10) + 0x4) -#define ATMEL_HSMC_PULSE(cs) (0x600 + ((cs) * 0x14) + 0x4) +#define ATMEL_HSMC_PULSE(layout, cs) \ + ((layout)->timing_regs_offset + ((cs) * 0x14) + 0x4) #define ATMEL_SMC_CYCLE(cs) (((cs) * 0x10) + 0x8) -#define ATMEL_HSMC_CYCLE(cs) (0x600 + ((cs) * 0x14) + 0x8) +#define ATMEL_HSMC_CYCLE(layout, cs) \ + ((layout)->timing_regs_offset + ((cs) * 0x14) + 0x8) #define ATMEL_SMC_NWE_SHIFT 0 #define ATMEL_SMC_NCS_WR_SHIFT 8 #define ATMEL_SMC_NRD_SHIFT 16 #define ATMEL_SMC_NCS_RD_SHIFT 24 #define ATMEL_SMC_MODE(cs) (((cs) * 0x10) + 0xc) -#define ATMEL_HSMC_MODE(cs) (0x600 + ((cs) * 0x14) + 0x10) +#define ATMEL_HSMC_MODE(layout, cs) \ + ((layout)->timing_regs_offset + ((cs) * 0x14) + 0x10) #define ATMEL_SMC_MODE_READMODE_MASK BIT(0) #define ATMEL_SMC_MODE_READMODE_NCS (0 << 0) #define ATMEL_SMC_MODE_READMODE_NRD (1 << 0) @@ -59,7 +64,8 @@ #define ATMEL_SMC_MODE_PS_16 (2 << 28) #define ATMEL_SMC_MODE_PS_32 (3 << 28) -#define ATMEL_HSMC_TIMINGS(cs) (0x600 + ((cs) * 0x14) + 0xc) +#define ATMEL_HSMC_TIMINGS(layout, cs) \ + ((layout)->timing_regs_offset + ((cs) * 0x14) + 0xc) #define ATMEL_HSMC_TIMINGS_OCMS BIT(12) #define ATMEL_HSMC_TIMINGS_RBNSEL(x) ((x) << 28) #define ATMEL_HSMC_TIMINGS_NFSEL BIT(31) @@ -69,6 +75,10 @@ #define ATMEL_HSMC_TIMINGS_TRR_SHIFT 16 #define ATMEL_HSMC_TIMINGS_TWB_SHIFT 24 +struct atmel_hsmc_reg_layout { + unsigned int timing_regs_offset; +}; + /** * struct atmel_smc_cs_conf - SMC CS config 
as described in the datasheet. * @setup: NCS/NWE/NRD setup timings (not applicable to at91rm9200) @@ -98,11 +108,15 @@ int atmel_smc_cs_conf_set_cycle(struct atmel_smc_cs_conf *conf, unsigned int shift, unsigned int ncycles); void atmel_smc_cs_conf_apply(struct regmap *regmap, int cs, const struct atmel_smc_cs_conf *conf); -void atmel_hsmc_cs_conf_apply(struct regmap *regmap, int cs, - const struct atmel_smc_cs_conf *conf); +void atmel_hsmc_cs_conf_apply(struct regmap *regmap, + const struct atmel_hsmc_reg_layout *reglayout, + int cs, const struct atmel_smc_cs_conf *conf); void atmel_smc_cs_conf_get(struct regmap *regmap, int cs, struct atmel_smc_cs_conf *conf); -void atmel_hsmc_cs_conf_get(struct regmap *regmap, int cs, - struct atmel_smc_cs_conf *conf); +void atmel_hsmc_cs_conf_get(struct regmap *regmap, + const struct atmel_hsmc_reg_layout *reglayout, + int cs, struct atmel_smc_cs_conf *conf); +const struct atmel_hsmc_reg_layout * +atmel_hsmc_get_reg_layout(struct device_node *np); #endif /* _LINUX_MFD_SYSCON_ATMEL_SMC_H_ */ -- cgit v1.2.3 From 9bbf6a15ce19dd947b7fa6ad4095931ab3682da8 Mon Sep 17 00:00:00 2001 From: Rajmohan Mani Date: Fri, 28 Jul 2017 17:30:24 -0700 Subject: mfd: Add support for TPS68470 device The TPS68470 device is an advanced power management unit that powers a Compact Camera Module (CCM), generates clocks for image sensors, drives a dual LED for Flash and incorporates two LED drivers for general purpose indicators. This patch adds support for TPS68470 mfd device. Signed-off-by: Rajmohan Mani Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 18 ++++++++ drivers/mfd/Makefile | 1 + drivers/mfd/tps68470.c | 106 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/tps68470.h | 97 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+) create mode 100644 drivers/mfd/tps68470.c create mode 100644 include/linux/mfd/tps68470.h (limited to 'include') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 79e4ec6f19a8..4f57725a02ea 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1352,6 +1352,24 @@ config MFD_TPS65217 This driver can also be built as a module. If so, the module will be called tps65217. +config MFD_TPS68470 + bool "TI TPS68470 Power Management / LED chips" + depends on ACPI && I2C=y + select MFD_CORE + select REGMAP_I2C + select I2C_DESIGNWARE_PLATFORM + help + If you say yes here you get support for the TPS68470 series of + Power Management / LED chips. + + These include voltage regulators, LEDs and other features + that are often used in portable devices. + + This option is a bool as it provides an ACPI operation + region, which must be available before any of the devices + using this are probed. This option also configures the + designware-i2c driver to be built-in, for the same reason. 
+ config MFD_TI_LP873X tristate "TI LP873X Power Management IC" depends on I2C diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 21e19a5f3f3c..c3d0a1b39bb6 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_MFD_TPS65910) += tps65910.o obj-$(CONFIG_MFD_TPS65912) += tps65912-core.o obj-$(CONFIG_MFD_TPS65912_I2C) += tps65912-i2c.o obj-$(CONFIG_MFD_TPS65912_SPI) += tps65912-spi.o +obj-$(CONFIG_MFD_TPS68470) += tps68470.o obj-$(CONFIG_MFD_TPS80031) += tps80031.o obj-$(CONFIG_MENELAUS) += menelaus.o diff --git a/drivers/mfd/tps68470.c b/drivers/mfd/tps68470.c new file mode 100644 index 000000000000..189efaea054c --- /dev/null +++ b/drivers/mfd/tps68470.c @@ -0,0 +1,106 @@ +/* + * TPS68470 chip Parent driver + * + * Copyright (C) 2017 Intel Corporation + * + * Authors: + * Rajmohan Mani + * Tianshu Qiu + * Jian Xu Zheng + * Yuning Pu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell tps68470s[] = { + { .name = "tps68470-gpio" }, + { .name = "tps68470_pmic_opregion" }, +}; + +static const struct regmap_config tps68470_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = TPS68470_REG_MAX, +}; + +static int tps68470_chip_init(struct device *dev, struct regmap *regmap) +{ + unsigned int version; + int ret; + + /* Force software reset */ + ret = regmap_write(regmap, TPS68470_REG_RESET, TPS68470_REG_RESET_MASK); + if (ret) + return ret; + + ret = regmap_read(regmap, TPS68470_REG_REVID, &version); + if (ret) { + dev_err(dev, "Failed to read revision register: %d\n", ret); + return ret; + } + + dev_info(dev, "TPS68470 REVID: 0x%x\n", version); + + return 0; +} + +static int tps68470_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct regmap *regmap; + int ret; + + regmap = devm_regmap_init_i2c(client, &tps68470_regmap_config); + if (IS_ERR(regmap)) { + dev_err(dev, "devm_regmap_init_i2c Error %ld\n", + PTR_ERR(regmap)); + return PTR_ERR(regmap); + } + + i2c_set_clientdata(client, regmap); + + ret = tps68470_chip_init(dev, regmap); + if (ret < 0) { + dev_err(dev, "TPS68470 Init Error %d\n", ret); + return ret; + } + + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, tps68470s, + ARRAY_SIZE(tps68470s), NULL, 0, NULL); + if (ret < 0) { + dev_err(dev, "devm_mfd_add_devices failed: %d\n", ret); + return ret; + } + + return 0; +} + +static const struct acpi_device_id tps68470_acpi_ids[] = { + {"INT3472"}, + {}, +}; +MODULE_DEVICE_TABLE(acpi, tps68470_acpi_ids); + +static struct i2c_driver tps68470_driver = { + .driver = { + .name = "tps68470", + .acpi_match_table = tps68470_acpi_ids, + }, + .probe_new = tps68470_probe, +}; +builtin_i2c_driver(tps68470_driver); diff --git a/include/linux/mfd/tps68470.h b/include/linux/mfd/tps68470.h new file mode 100644 index 000000000000..44f9d9f647ed --- /dev/null +++ b/include/linux/mfd/tps68470.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 Intel Corporation + * + * Functions to access TPS68470 power management chip. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_MFD_TPS68470_H +#define __LINUX_MFD_TPS68470_H + +/* Register addresses */ +#define TPS68470_REG_POSTDIV2 0x06 +#define TPS68470_REG_BOOSTDIV 0x07 +#define TPS68470_REG_BUCKDIV 0x08 +#define TPS68470_REG_PLLSWR 0x09 +#define TPS68470_REG_XTALDIV 0x0A +#define TPS68470_REG_PLLDIV 0x0B +#define TPS68470_REG_POSTDIV 0x0C +#define TPS68470_REG_PLLCTL 0x0D +#define TPS68470_REG_PLLCTL2 0x0E +#define TPS68470_REG_CLKCFG1 0x0F +#define TPS68470_REG_CLKCFG2 0x10 +#define TPS68470_REG_GPCTL0A 0x14 +#define TPS68470_REG_GPCTL0B 0x15 +#define TPS68470_REG_GPCTL1A 0x16 +#define TPS68470_REG_GPCTL1B 0x17 +#define TPS68470_REG_GPCTL2A 0x18 +#define TPS68470_REG_GPCTL2B 0x19 +#define TPS68470_REG_GPCTL3A 0x1A +#define TPS68470_REG_GPCTL3B 0x1B +#define TPS68470_REG_GPCTL4A 0x1C +#define TPS68470_REG_GPCTL4B 0x1D +#define TPS68470_REG_GPCTL5A 0x1E +#define TPS68470_REG_GPCTL5B 0x1F +#define TPS68470_REG_GPCTL6A 0x20 +#define TPS68470_REG_GPCTL6B 0x21 +#define TPS68470_REG_SGPO 0x22 +#define TPS68470_REG_GPDI 0x26 +#define TPS68470_REG_GPDO 0x27 +#define TPS68470_REG_VCMVAL 0x3C +#define TPS68470_REG_VAUX1VAL 0x3D +#define TPS68470_REG_VAUX2VAL 0x3E +#define TPS68470_REG_VIOVAL 0x3F +#define TPS68470_REG_VSIOVAL 0x40 +#define TPS68470_REG_VAVAL 0x41 +#define TPS68470_REG_VDVAL 0x42 +#define TPS68470_REG_S_I2C_CTL 0x43 +#define TPS68470_REG_VCMCTL 0x44 +#define TPS68470_REG_VAUX1CTL 0x45 +#define TPS68470_REG_VAUX2CTL 0x46 +#define TPS68470_REG_VACTL 0x47 +#define TPS68470_REG_VDCTL 0x48 +#define TPS68470_REG_RESET 0x50 +#define TPS68470_REG_REVID 0xFF + +#define TPS68470_REG_MAX TPS68470_REG_REVID + +/* Register field definitions */ + +#define TPS68470_REG_RESET_MASK GENMASK(7, 0) +#define TPS68470_VAVAL_AVOLT_MASK GENMASK(6, 0) + +#define TPS68470_VDVAL_DVOLT_MASK GENMASK(5, 0) +#define TPS68470_VCMVAL_VCVOLT_MASK GENMASK(6, 0) +#define TPS68470_VIOVAL_IOVOLT_MASK GENMASK(6, 0) +#define TPS68470_VSIOVAL_IOVOLT_MASK GENMASK(6, 0) +#define TPS68470_VAUX1VAL_AUX1VOLT_MASK GENMASK(6, 0) +#define TPS68470_VAUX2VAL_AUX2VOLT_MASK GENMASK(6, 0) + +#define TPS68470_VACTL_EN_MASK GENMASK(0, 0) +#define TPS68470_VDCTL_EN_MASK GENMASK(0, 0) +#define TPS68470_VCMCTL_EN_MASK GENMASK(0, 0) +#define TPS68470_S_I2C_CTL_EN_MASK GENMASK(1, 0) +#define TPS68470_VAUX1CTL_EN_MASK GENMASK(0, 0) +#define TPS68470_VAUX2CTL_EN_MASK GENMASK(0, 0) +#define TPS68470_PLL_EN_MASK GENMASK(0, 0) + +#define TPS68470_CLKCFG1_MODE_A_MASK GENMASK(1, 0) +#define TPS68470_CLKCFG1_MODE_B_MASK GENMASK(3, 2) + +#define TPS68470_GPIO_CTL_REG_A(x) (TPS68470_REG_GPCTL0A + (x) * 2) +#define TPS68470_GPIO_CTL_REG_B(x) (TPS68470_REG_GPCTL0B + (x) * 2) +#define TPS68470_GPIO_MODE_MASK GENMASK(1, 0) +#define TPS68470_GPIO_MODE_IN 0 +#define TPS68470_GPIO_MODE_IN_PULLUP 1 +#define TPS68470_GPIO_MODE_OUT_CMOS 2 +#define TPS68470_GPIO_MODE_OUT_ODRAIN 3 + +#endif /* __LINUX_MFD_TPS68470_H */ -- cgit v1.2.3 From 53168215909281a09d3afc6fb51a9d4f81f74d39 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:30 +0200 Subject: mac80211: fix VLAN handling with TXQs With 
TXQs, the AP_VLAN interfaces are resolved to their owner AP interface when enqueuing the frame, which makes sense since the frame really goes out on that as far as the driver is concerned. However, this introduces a problem: frames to be encrypted with a VLAN-specific GTK will now be encrypted with the AP GTK, since the information about which virtual interface to use to select the key is taken from the TXQ. Fix this by preserving info->control.vif and using that in the dequeue function. This now requires doing the driver-mapping in the dequeue as well. Since there's no way to filter the frames that are sitting on a TXQ, drop all frames, which may affect other interfaces, when an AP_VLAN is removed. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 ++------------- net/mac80211/iface.c | 17 +++++++++++++++-- net/mac80211/tx.c | 36 +++++++++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f8149ca192b4..885690fa39c8 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -919,21 +919,10 @@ struct ieee80211_tx_info { unsigned long jiffies; }; /* NB: vif can be NULL for injected frames */ - union { - /* NB: vif can be NULL for injected frames */ - struct ieee80211_vif *vif; - - /* When packets are enqueued on txq it's easy - * to re-construct the vif pointer. There's no - * more space in tx_info so it can be used to - * store the necessary enqueue time for packet - * sojourn time computation. - */ - codel_time_t enqueue_time; - }; + struct ieee80211_vif *vif; struct ieee80211_key_conf *hw_key; u32 flags; - /* 4 bytes free */ + codel_time_t enqueue_time; } control; struct { u64 cookie; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9228ac73c429..44399322f356 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -792,6 +792,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { + struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; unsigned long flags; @@ -937,6 +938,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + txq_sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -1007,8 +1011,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + if (txq_sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + + /* + * FIXME FIXME + * + * We really shouldn't purge the *entire* txqi since that + * contains frames for the other AP_VLANs (and possibly + * the AP itself) as well, but there's no API in FQ now + * to be able to filter. 
+ */ spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8858f4f185e9..94826680cf2b 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1276,11 +1276,6 @@ static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } -static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) -{ - IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; -} - static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; @@ -3414,6 +3409,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; + struct ieee80211_vif *vif; spin_lock_bh(&fq->lock); @@ -3430,8 +3426,6 @@ begin: if (!skb) goto out; - ieee80211_set_skb_vif(skb, txqi); - hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3488,6 +3482,34 @@ begin: } } + switch (tx.sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + vif = &tx.sdata->vif; + break; + } + tx.sdata = rcu_dereference(local->monitor_sdata); + if (tx.sdata) { + vif = &tx.sdata->vif; + info->hw_queue = + vif->hw_queue[skb_get_queue_mapping(skb)]; + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } else { + vif = NULL; + } + break; + case NL80211_IFTYPE_AP_VLAN: + tx.sdata = container_of(tx.sdata->bss, + struct ieee80211_sub_if_data, u.ap); + /* fall through */ + default: + vif = &tx.sdata->vif; + break; + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; out: spin_unlock_bh(&fq->lock); -- cgit v1.2.3 From c93022a72f01f8e53d6e1bc2a8d2c2824c2f36bc Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Sep 2017 05:43:39 -0400 Subject: media: ca.h: split typedefs from structs Using typedefs inside the Kernel is against CodingStyle, and there's no good usage here. Just like we did at frontend.h, at commit 0df289a209e0 ("[media] dvb: Get rid of typedev usage for enums"), let's keep those typedefs only to provide userspace backward compatibility. No functional changes. 
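As a purely editorial illustration (not part of the patch): because the typedefs are kept for userspace under #if !defined(__KERNEL__), legacy sources still compile unchanged, while new code can use the struct tags directly. A minimal sketch of a CA_GET_CAP call site using the struct spelling — the device node path is an assumption and varies per adapter:

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/dvb/ca.h>

int main(void)
{
	/* hypothetical device node; adapter and CA device numbers vary */
	int fd = open("/dev/dvb/adapter0/ca0", O_RDONLY);
	struct ca_caps caps;	/* was the ca_caps_t typedef */

	if (fd < 0)
		return 1;
	if (ioctl(fd, CA_GET_CAP, &caps) == 0)
		printf("%u CA slot(s), %u descrambler(s)\n",
		       caps.slot_num, caps.descr_num);
	return 0;
}

Either spelling resolves to the same layout, so the ioctl ABI is unchanged.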
Signed-off-by: Mauro Carvalho Chehab --- drivers/media/pci/ttpci/av7110.h | 2 +- drivers/media/pci/ttpci/av7110_ca.c | 12 ++++----- include/uapi/linux/dvb/ca.h | 51 +++++++++++++++++++++++-------------- 3 files changed, 39 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/media/pci/ttpci/av7110.h b/drivers/media/pci/ttpci/av7110.h index 824c1e262fbb..347827925c14 100644 --- a/drivers/media/pci/ttpci/av7110.h +++ b/drivers/media/pci/ttpci/av7110.h @@ -177,7 +177,7 @@ struct av7110 { /* CA */ - ca_slot_info_t ci_slot[2]; + struct ca_slot_info ci_slot[2]; enum av7110_video_mode vidmode; struct dmxdev dmxdev; diff --git a/drivers/media/pci/ttpci/av7110_ca.c b/drivers/media/pci/ttpci/av7110_ca.c index f64723aea56b..1fe49171d823 100644 --- a/drivers/media/pci/ttpci/av7110_ca.c +++ b/drivers/media/pci/ttpci/av7110_ca.c @@ -119,7 +119,7 @@ static void ci_ll_release(struct dvb_ringbuffer *cirbuf, struct dvb_ringbuffer * } static int ci_ll_reset(struct dvb_ringbuffer *cibuf, struct file *file, - int slots, ca_slot_info_t *slot) + int slots, struct ca_slot_info *slot) { int i; int len = 0; @@ -264,7 +264,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg) break; case CA_GET_CAP: { - ca_caps_t cap; + struct ca_caps cap; cap.slot_num = 2; cap.slot_type = (FW_CI_LL_SUPPORT(av7110->arm_app) ? @@ -277,7 +277,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg) case CA_GET_SLOT_INFO: { - ca_slot_info_t *info=(ca_slot_info_t *)parg; + struct ca_slot_info *info=(struct ca_slot_info *)parg; if (info->num < 0 || info->num > 1) { mutex_unlock(&av7110->ioctl_mutex); @@ -286,7 +286,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg) av7110->ci_slot[info->num].num = info->num; av7110->ci_slot[info->num].type = FW_CI_LL_SUPPORT(av7110->arm_app) ? 
CA_CI_LINK : CA_CI; - memcpy(info, &av7110->ci_slot[info->num], sizeof(ca_slot_info_t)); + memcpy(info, &av7110->ci_slot[info->num], sizeof(struct ca_slot_info)); break; } @@ -298,7 +298,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg) case CA_GET_DESCR_INFO: { - ca_descr_info_t info; + struct ca_descr_info info; info.num = 16; info.type = CA_ECD; @@ -308,7 +308,7 @@ static int dvb_ca_ioctl(struct file *file, unsigned int cmd, void *parg) case CA_SET_DESCR: { - ca_descr_t *descr = (ca_descr_t*) parg; + struct ca_descr *descr = (struct ca_descr*) parg; if (descr->index >= 16 || descr->parity > 1) { mutex_unlock(&av7110->ioctl_mutex); diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h index c18537f3e449..00cf24587bea 100644 --- a/include/uapi/linux/dvb/ca.h +++ b/include/uapi/linux/dvb/ca.h @@ -26,7 +26,7 @@ /* slot interface types and info */ -typedef struct ca_slot_info { +struct ca_slot_info { int num; /* slot number */ int type; /* CA interface this slot supports */ @@ -39,52 +39,65 @@ typedef struct ca_slot_info { unsigned int flags; #define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */ #define CA_CI_MODULE_READY 2 -} ca_slot_info_t; +}; /* descrambler types and info */ -typedef struct ca_descr_info { +struct ca_descr_info { unsigned int num; /* number of available descramblers (keys) */ unsigned int type; /* type of supported scrambling system */ #define CA_ECD 1 #define CA_NDS 2 #define CA_DSS 4 -} ca_descr_info_t; +}; -typedef struct ca_caps { +struct ca_caps { unsigned int slot_num; /* total number of CA card and module slots */ unsigned int slot_type; /* OR of all supported types */ unsigned int descr_num; /* total number of descrambler slots (keys) */ unsigned int descr_type; /* OR of all supported types */ -} ca_caps_t; +}; /* a message to/from a CI-CAM */ -typedef struct ca_msg { +struct ca_msg { unsigned int index; unsigned int type; unsigned int length; unsigned char msg[256]; -} ca_msg_t; +}; -typedef struct ca_descr { +struct ca_descr { unsigned int index; unsigned int parity; /* 0 == even, 1 == odd */ unsigned char cw[8]; -} ca_descr_t; +}; -typedef struct ca_pid { +struct ca_pid { unsigned int pid; int index; /* -1 == disable*/ -} ca_pid_t; +}; #define CA_RESET _IO('o', 128) -#define CA_GET_CAP _IOR('o', 129, ca_caps_t) -#define CA_GET_SLOT_INFO _IOR('o', 130, ca_slot_info_t) -#define CA_GET_DESCR_INFO _IOR('o', 131, ca_descr_info_t) -#define CA_GET_MSG _IOR('o', 132, ca_msg_t) -#define CA_SEND_MSG _IOW('o', 133, ca_msg_t) -#define CA_SET_DESCR _IOW('o', 134, ca_descr_t) -#define CA_SET_PID _IOW('o', 135, ca_pid_t) +#define CA_GET_CAP _IOR('o', 129, struct ca_caps) +#define CA_GET_SLOT_INFO _IOR('o', 130, struct ca_slot_info) +#define CA_GET_DESCR_INFO _IOR('o', 131, struct ca_descr_info) +#define CA_GET_MSG _IOR('o', 132, struct ca_msg) +#define CA_SEND_MSG _IOW('o', 133, struct ca_msg) +#define CA_SET_DESCR _IOW('o', 134, struct ca_descr) +#define CA_SET_PID _IOW('o', 135, struct ca_pid) + +#if !defined (__KERNEL__) + +/* This is needed for legacy userspace support */ +typedef struct ca_slot_info ca_slot_info_t; +typedef struct ca_descr_info ca_descr_info_t; +typedef struct ca_caps ca_caps_t; +typedef struct ca_msg ca_msg_t; +typedef struct ca_descr ca_descr_t; +typedef struct ca_pid ca_pid_t; + +#endif + #endif -- cgit v1.2.3 From 3256b36ea36525945d8575c0100752819a309aaa Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Sep 2017 06:09:14 -0400 Subject: media: dmx.h: split typedefs from structs 
Using typedefs inside the Kernel is against CodingStyle, and there's no good usage here. Just like we did at frontend.h, at commit 0df289a209e0 ("[media] dvb: Get rid of typedev usage for enums"), let's keep those typedefs only to provide userspace backward compatibility. No functional changes. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-core/dmxdev.c | 4 +-- include/uapi/linux/dvb/dmx.h | 56 ++++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c index 45e91add73ba..16b0b74c3114 100644 --- a/drivers/media/dvb-core/dmxdev.c +++ b/drivers/media/dvb-core/dmxdev.c @@ -562,7 +562,7 @@ static int dvb_dmxdev_start_feed(struct dmxdev *dmxdev, { ktime_t timeout = 0; struct dmx_pes_filter_params *para = &filter->params.pes; - dmx_output_t otype; + enum dmx_output otype; int ret; int ts_type; enum dmx_ts_pes ts_pes; @@ -787,7 +787,7 @@ static int dvb_dmxdev_filter_free(struct dmxdev *dmxdev, return 0; } -static inline void invert_mode(dmx_filter_t *filter) +static inline void invert_mode(struct dmx_filter *filter) { int i; diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h index 427e4899ed69..1bc4d6fb0f01 100644 --- a/include/uapi/linux/dvb/dmx.h +++ b/include/uapi/linux/dvb/dmx.h @@ -43,16 +43,14 @@ enum dmx_output DMX_OUT_TSDEMUX_TAP /* Like TS_TAP but retrieved from the DMX device */ }; -typedef enum dmx_output dmx_output_t; - -typedef enum dmx_input +enum dmx_input { DMX_IN_FRONTEND, /* Input from a front-end device. */ DMX_IN_DVR /* Input from the logical DVR device. */ -} dmx_input_t; +}; -typedef enum dmx_ts_pes +enum dmx_ts_pes { DMX_PES_AUDIO0, DMX_PES_VIDEO0, @@ -79,7 +77,7 @@ typedef enum dmx_ts_pes DMX_PES_PCR3, DMX_PES_OTHER -} dmx_pes_type_t; +}; #define DMX_PES_AUDIO DMX_PES_AUDIO0 #define DMX_PES_VIDEO DMX_PES_VIDEO0 @@ -88,20 +86,20 @@ typedef enum dmx_ts_pes #define DMX_PES_PCR DMX_PES_PCR0 -typedef struct dmx_filter +struct dmx_filter { __u8 filter[DMX_FILTER_SIZE]; __u8 mask[DMX_FILTER_SIZE]; __u8 mode[DMX_FILTER_SIZE]; -} dmx_filter_t; +}; struct dmx_sct_filter_params { - __u16 pid; - dmx_filter_t filter; - __u32 timeout; - __u32 flags; + __u16 pid; + struct dmx_filter filter; + __u32 timeout; + __u32 flags; #define DMX_CHECK_CRC 1 #define DMX_ONESHOT 2 #define DMX_IMMEDIATE_START 4 @@ -111,19 +109,19 @@ struct dmx_sct_filter_params struct dmx_pes_filter_params { - __u16 pid; - dmx_input_t input; - dmx_output_t output; - dmx_pes_type_t pes_type; - __u32 flags; + __u16 pid; + enum dmx_input input; + enum dmx_output output; + enum dmx_ts_pes pes_type; + __u32 flags; }; -typedef struct dmx_caps { +struct dmx_caps { __u32 caps; int num_decoders; -} dmx_caps_t; +}; -typedef enum dmx_source { +enum dmx_source { DMX_SOURCE_FRONT0 = 0, DMX_SOURCE_FRONT1, DMX_SOURCE_FRONT2, @@ -132,7 +130,7 @@ typedef enum dmx_source { DMX_SOURCE_DVR1, DMX_SOURCE_DVR2, DMX_SOURCE_DVR3 -} dmx_source_t; +}; struct dmx_stc { unsigned int num; /* input : which STC? 
0..N */ @@ -146,10 +144,22 @@ #define DMX_SET_PES_FILTER _IOW('o', 44, struct dmx_pes_filter_params) #define DMX_SET_BUFFER_SIZE _IO('o', 45) #define DMX_GET_PES_PIDS _IOR('o', 47, __u16[5]) -#define DMX_GET_CAPS _IOR('o', 48, dmx_caps_t) -#define DMX_SET_SOURCE _IOW('o', 49, dmx_source_t) +#define DMX_GET_CAPS _IOR('o', 48, struct dmx_caps) +#define DMX_SET_SOURCE _IOW('o', 49, enum dmx_source) #define DMX_GET_STC _IOWR('o', 50, struct dmx_stc) #define DMX_ADD_PID _IOW('o', 51, __u16) #define DMX_REMOVE_PID _IOW('o', 52, __u16) +#if !defined (__KERNEL__) + +/* This is needed for legacy userspace support */ +typedef enum dmx_output dmx_output_t; +typedef enum dmx_input dmx_input_t; +typedef enum dmx_ts_pes dmx_pes_type_t; +typedef struct dmx_filter dmx_filter_t; +typedef struct dmx_caps dmx_caps_t; +typedef enum dmx_source dmx_source_t; + +#endif + #endif /* _UAPI_DVBDMX_H_ */ -- cgit v1.2.3 From f35afa4f60c868d7c7811ba747133acbf39410ac Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 30 Aug 2017 07:55:47 -0400 Subject: media: dvb/frontend.h: move out a private internal structure struct dtv_cmds_h is just an ancillary struct used by dvb_frontend.c to internally store frontend commands. It doesn't belong to the userspace header, nor is it used anywhere except inside the DVB core. So, remove it from the header. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-core/dvb_frontend.c | 11 +++++++++++ include/uapi/linux/dvb/frontend.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c index 114994ca0929..2fcba1616168 100644 --- a/drivers/media/dvb-core/dvb_frontend.c +++ b/drivers/media/dvb-core/dvb_frontend.c @@ -1000,6 +1000,17 @@ static int dvb_frontend_clear_cache(struct dvb_frontend *fe) .buffer = b \ } +struct dtv_cmds_h { + char *name; /* A display name for debugging purposes */ + + __u32 cmd; /* A unique ID */ + + /* Flags */ + __u32 set:1; /* Either a set or get property */ + __u32 buffer:1; /* Does this property use the buffer? */ + __u32 reserved:30; /* Align */ +}; + static struct dtv_cmds_h dtv_cmds[DTV_MAX_COMMAND + 1] = { _DTV_CMD(DTV_TUNE, 1, 0), _DTV_CMD(DTV_CLEAR, 1, 0), diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index afc3972b0879..3a80f3d1da1c 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -384,17 +384,6 @@ enum atscmh_rs_code_mode { #define NO_STREAM_ID_FILTER (~0U) #define LNA_AUTO (~0U) -struct dtv_cmds_h { - char *name; /* A display name for debugging purposes */ - - __u32 cmd; /* A unique ID */ - - /* Flags */ - __u32 set:1; /* Either a set or get property */ - __u32 buffer:1; /* Does this property use the buffer? */ - __u32 reserved:30; /* Align */ -}; - /** * Scale types for the quality parameters. * @FE_SCALE_NOT_AVAILABLE: That QoS measure is not available. That -- cgit v1.2.3 From 8220ead805b6bab4ade2839857a198e9708b07de Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 30 Aug 2017 08:12:38 -0400 Subject: media: dvb/frontend.h: document the uAPI file Most of the stuff in the Digital TV frontend header file is documented only under Documentation/. However, a few kernel-doc markups are there, several of them with parsing issues. Add the missing documentation, copying definitions from Documentation/ where they apply, and fixing some bugs.
Please notice that the DVBv3 stuff that was deprecated was deliberately left without kernel-doc comments. Instead, it was clearly tagged as such. This patch prepares to move part of the documentation from Documentation/ to kernel-doc comments. Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/frontend.h | 580 ++++++++++++++++++++++++++++++++------ 1 file changed, 498 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index 3a80f3d1da1c..16a318fc469a 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -28,13 +28,46 @@ #include <linux/types.h> -enum fe_type { - FE_QPSK, - FE_QAM, - FE_OFDM, - FE_ATSC -}; - +/** + * enum fe_caps - Frontend capabilities + * + * @FE_IS_STUPID: There's something wrong at the + * frontend, and it can't report its + * capabilities. + * @FE_CAN_INVERSION_AUTO: Can auto-detect frequency spectral + * band inversion + * @FE_CAN_FEC_1_2: Supports FEC 1/2 + * @FE_CAN_FEC_2_3: Supports FEC 2/3 + * @FE_CAN_FEC_3_4: Supports FEC 3/4 + * @FE_CAN_FEC_4_5: Supports FEC 4/5 + * @FE_CAN_FEC_5_6: Supports FEC 5/6 + * @FE_CAN_FEC_6_7: Supports FEC 6/7 + * @FE_CAN_FEC_7_8: Supports FEC 7/8 + * @FE_CAN_FEC_8_9: Supports FEC 8/9 + * @FE_CAN_FEC_AUTO: Can auto-detect FEC + * @FE_CAN_QPSK: Supports QPSK modulation + * @FE_CAN_QAM_16: Supports 16-QAM modulation + * @FE_CAN_QAM_32: Supports 32-QAM modulation + * @FE_CAN_QAM_64: Supports 64-QAM modulation + * @FE_CAN_QAM_128: Supports 128-QAM modulation + * @FE_CAN_QAM_256: Supports 256-QAM modulation + * @FE_CAN_QAM_AUTO: Can auto-detect QAM modulation + * @FE_CAN_TRANSMISSION_MODE_AUTO: Can auto-detect transmission mode + * @FE_CAN_BANDWIDTH_AUTO: Can auto-detect bandwidth + * @FE_CAN_GUARD_INTERVAL_AUTO: Can auto-detect guard interval + * @FE_CAN_HIERARCHY_AUTO: Can auto-detect hierarchy + * @FE_CAN_8VSB: Supports 8-VSB modulation + * @FE_CAN_16VSB: Supports 16-VSB modulation + * @FE_HAS_EXTENDED_CAPS: Unused + * @FE_CAN_MULTISTREAM: Supports multistream filtering + * @FE_CAN_TURBO_FEC: Supports "turbo FEC" modulation + * @FE_CAN_2G_MODULATION: Supports "2nd generation" modulation, + * e. g. DVB-S2, DVB-T2, DVB-C2 + * @FE_NEEDS_BENDING: Unused + * @FE_CAN_RECOVER: Can recover from a cable unplug + * automatically + * @FE_CAN_MUTE_TS: Can stop spurious TS data output + */ enum fe_caps { FE_IS_STUPID = 0, FE_CAN_INVERSION_AUTO = 0x1, @@ -60,15 +93,55 @@ enum fe_caps { FE_CAN_HIERARCHY_AUTO = 0x100000, FE_CAN_8VSB = 0x200000, FE_CAN_16VSB = 0x400000, - FE_HAS_EXTENDED_CAPS = 0x800000, /* We need more bitspace for newer APIs, indicate this. */ - FE_CAN_MULTISTREAM = 0x4000000, /* frontend supports multistream filtering */ - FE_CAN_TURBO_FEC = 0x8000000, /* frontend supports "turbo fec modulation" */ - FE_CAN_2G_MODULATION = 0x10000000, /* frontend supports "2nd generation modulation" (DVB-S2) */ - FE_NEEDS_BENDING = 0x20000000, /* not supported anymore, don't use (frontend requires frequency bending) */ - FE_CAN_RECOVER = 0x40000000, /* frontend can recover from a cable unplug automatically */ - FE_CAN_MUTE_TS = 0x80000000 /* frontend can stop spurious TS data output */ + FE_HAS_EXTENDED_CAPS = 0x800000, + FE_CAN_MULTISTREAM = 0x4000000, + FE_CAN_TURBO_FEC = 0x8000000, + FE_CAN_2G_MODULATION = 0x10000000, + FE_NEEDS_BENDING = 0x20000000, + FE_CAN_RECOVER = 0x40000000, + FE_CAN_MUTE_TS = 0x80000000 +}; + +/* + * DEPRECATED: Should be kept just due to backward compatibility.
+ */ +enum fe_type { + FE_QPSK, + FE_QAM, + FE_OFDM, + FE_ATSC }; +/** + * struct dvb_frontend_info - Frontend properties and capabilities + * + * @name: Name of the frontend + * @type: **DEPRECATED**. + * Should not be used on modern programs, + * as a frontend may have more than one type. + * In order to get the supported types of a given + * frontend, use :c:type:`DTV_ENUM_DELSYS` + * instead. + * @frequency_min: Minimal frequency supported by the frontend. + * @frequency_max: Maximal frequency supported by the frontend. + * @frequency_stepsize: All frequencies are multiples of this value. + * @frequency_tolerance: Frequency tolerance. + * @symbol_rate_min: Minimal symbol rate, in bauds + * (for Cable/Satellite systems). + * @symbol_rate_max: Maximal symbol rate, in bauds + * (for Cable/Satellite systems). + * @symbol_rate_tolerance: Maximal symbol rate tolerance, in ppm + * (for Cable/Satellite systems). + * @notifier_delay: **DEPRECATED**. Not used by any driver. + * @caps: Capabilities supported by the frontend, + * as specified in &enum fe_caps. + * + * .. note:: + * + * #. The frequencies are specified in Hz for Terrestrial and Cable + * systems. + * #. The frequencies are specified in kHz for Satellite systems. + */ struct dvb_frontend_info { char name[128]; enum fe_type type; /* DEPRECATED. Use DTV_ENUM_DELSYS instead */ @@ -78,53 +151,102 @@ struct dvb_frontend_info { __u32 frequency_tolerance; __u32 symbol_rate_min; __u32 symbol_rate_max; - __u32 symbol_rate_tolerance; /* ppm */ + __u32 symbol_rate_tolerance; __u32 notifier_delay; /* DEPRECATED */ enum fe_caps caps; }; - /** - * Check out the DiSEqC bus spec available on http://www.eutelsat.org/ for - * the meaning of this struct... + * struct dvb_diseqc_master_cmd - DiSEqC master command + * + * @msg: + * DiSEqC message to be sent. It contains a 3 bytes header with: + * framing + address + command, and an optional argument + * of up to 3 bytes of data. + * @msg_len: + * Length of the DiSEqC message. Valid values are 3 to 6. + * + * Check out the DiSEqC bus spec available on http://www.eutelsat.org/ for + * the possible messages that can be used. */ struct dvb_diseqc_master_cmd { - __u8 msg [6]; /* { framing, address, command, data [3] } */ - __u8 msg_len; /* valid values are 3...6 */ + __u8 msg[6]; + __u8 msg_len; }; +/** + * struct dvb_diseqc_slave_reply - DiSEqC received data + * + * @msg: + * DiSEqC message buffer to store a message received via DiSEqC. + * It contains one byte header with: framing and + * an optional argument of up to 3 bytes of data. + * @msg_len: + * Length of the DiSEqC message. Valid values are 0 to 4, + * where 0 means no message. + * @timeout: + * Return from ioctl after timeout ms with errorcode when + * no message was received. + * + * Check out the DiSEqC bus spec available on http://www.eutelsat.org/ for + * the possible messages that can be used. + */ struct dvb_diseqc_slave_reply { - __u8 msg [4]; /* { framing, data [3] } */ - __u8 msg_len; /* valid values are 0...4, 0 means no msg */ - int timeout; /* return from ioctl after timeout ms with */ -}; /* errorcode when no message was received */ + __u8 msg[4]; + __u8 msg_len; + int timeout; +}; +/** + * enum fe_sec_voltage - DC Voltage used to feed the LNBf + * + * @SEC_VOLTAGE_13: Output 13V to the LNBf + * @SEC_VOLTAGE_18: Output 18V to the LNBf + * @SEC_VOLTAGE_OFF: Don't feed the LNBf with a DC voltage + */ enum fe_sec_voltage { SEC_VOLTAGE_13, SEC_VOLTAGE_18, SEC_VOLTAGE_OFF }; +/** + * enum fe_sec_tone_mode - Type of tone to be sent to the LNBf.
+ * @SEC_TONE_ON: Sends a 22kHz tone burst to the antenna. + * @SEC_TONE_OFF: Don't send a 22kHz tone to the antenna (except + * if the ``FE_DISEQC_*`` ioctls are called). + */ enum fe_sec_tone_mode { SEC_TONE_ON, SEC_TONE_OFF }; +/** + * enum fe_sec_mini_cmd - Type of mini burst to be sent + * + * @SEC_MINI_A: Sends a mini-DiSEqC 22kHz '0' Tone Burst to select + * satellite-A + * @SEC_MINI_B: Sends a mini-DiSEqC 22kHz '1' Data Burst to select + * satellite-B + */ enum fe_sec_mini_cmd { SEC_MINI_A, SEC_MINI_B }; /** - * enum fe_status - enumerates the possible frontend status - * @FE_HAS_SIGNAL: found something above the noise level - * @FE_HAS_CARRIER: found a DVB signal - * @FE_HAS_VITERBI: FEC is stable - * @FE_HAS_SYNC: found sync bytes - * @FE_HAS_LOCK: everything's working - * @FE_TIMEDOUT: no lock within the last ~2 seconds - * @FE_REINIT: frontend was reinitialized, application is recommended - * to reset DiSEqC, tone and parameters + * enum fe_status - Enumerates the possible frontend status. + * @FE_NONE: The frontend doesn't have any kind of lock. + * That's the initial frontend status + * @FE_HAS_SIGNAL: Has found something above the noise level. + * @FE_HAS_CARRIER: Has found a DVB signal. + * @FE_HAS_VITERBI: FEC inner coding (Viterbi, LDPC or other inner code) + * is stable. + * @FE_HAS_SYNC: Synchronization bytes were found. + * @FE_HAS_LOCK: DVB is locked and everything is working. + * @FE_TIMEDOUT: No lock within about the last 2 seconds. + * @FE_REINIT: Frontend was reinitialized, application is recommended + * to reset DiSEqC, tone and parameters. */ enum fe_status { FE_NONE = 0x00, @@ -137,12 +259,45 @@ enum fe_status { FE_REINIT = 0x40, }; +/** + * enum fe_spectral_inversion - Type of inversion band + * + * @INVERSION_OFF: Don't do spectral band inversion. + * @INVERSION_ON: Do spectral band inversion. + * @INVERSION_AUTO: Autodetect spectral band inversion. + * + * This parameter indicates if spectral inversion should be presumed or + * not. In the automatic setting (``INVERSION_AUTO``) the hardware will try + * to figure out the correct setting by itself. If the hardware doesn't + * support it, the DVB core will try to lock at the carrier first with + * inversion off. If it fails, it will try to enable inversion. + */ enum fe_spectral_inversion { INVERSION_OFF, INVERSION_ON, INVERSION_AUTO }; +/** + * enum fe_code_rate - Type of Forward Error Correction (FEC) + * + * @FEC_NONE: No Forward Error Correction Code + * @FEC_1_2: Forward Error Correction Code 1/2 + * @FEC_2_3: Forward Error Correction Code 2/3 + * @FEC_3_4: Forward Error Correction Code 3/4 + * @FEC_4_5: Forward Error Correction Code 4/5 + * @FEC_5_6: Forward Error Correction Code 5/6 + * @FEC_6_7: Forward Error Correction Code 6/7 + * @FEC_7_8: Forward Error Correction Code 7/8 + * @FEC_8_9: Forward Error Correction Code 8/9 + * @FEC_AUTO: Autodetect Error Correction Code + * @FEC_3_5: Forward Error Correction Code 3/5 + * @FEC_9_10: Forward Error Correction Code 9/10 + * @FEC_2_5: Forward Error Correction Code 2/5 + * + * Please note that not all FEC types are supported by a given standard.
+ */ enum fe_code_rate { FEC_NONE = 0, FEC_1_2, @@ -159,6 +314,26 @@ enum fe_code_rate { FEC_2_5, }; +/** + * enum fe_modulation - Type of modulation/constellation + * @QPSK: QPSK modulation + * @QAM_16: 16-QAM modulation + * @QAM_32: 32-QAM modulation + * @QAM_64: 64-QAM modulation + * @QAM_128: 128-QAM modulation + * @QAM_256: 256-QAM modulation + * @QAM_AUTO: Autodetect QAM modulation + * @VSB_8: 8-VSB modulation + * @VSB_16: 16-VSB modulation + * @PSK_8: 8-PSK modulation + * @APSK_16: 16-APSK modulation + * @APSK_32: 32-APSK modulation + * @DQPSK: DQPSK modulation + * @QAM_4_NR: 4-QAM-NR modulation + * + * Please note that not all modulations are supported by a given standard. + * + */ enum fe_modulation { QPSK, QAM_16, @@ -176,6 +351,32 @@ enum fe_modulation { QAM_4_NR, }; +/** + * enum fe_transmit_mode - Transmission mode + * + * @TRANSMISSION_MODE_AUTO: + * Autodetect transmission mode. The hardware will try to find the + * correct FFT-size (if capable) to fill in the missing parameters. + * @TRANSMISSION_MODE_1K: + * Transmission mode 1K + * @TRANSMISSION_MODE_2K: + * Transmission mode 2K + * @TRANSMISSION_MODE_8K: + * Transmission mode 8K + * @TRANSMISSION_MODE_4K: + * Transmission mode 4K + * @TRANSMISSION_MODE_16K: + * Transmission mode 16K + * @TRANSMISSION_MODE_32K: + * Transmission mode 32K + * @TRANSMISSION_MODE_C1: + * Single Carrier (C=1) transmission mode (DTMB only) + * @TRANSMISSION_MODE_C3780: + * Multi Carrier (C=3780) transmission mode (DTMB only) + * + * Please note that not all transmission modes are supported by a given + * standard. + */ enum fe_transmit_mode { TRANSMISSION_MODE_2K, TRANSMISSION_MODE_8K, @@ -188,6 +389,23 @@ enum fe_transmit_mode { TRANSMISSION_MODE_C3780, }; +/** + * enum fe_guard_interval - Guard interval + * + * @GUARD_INTERVAL_AUTO: Autodetect the guard interval + * @GUARD_INTERVAL_1_128: Guard interval 1/128 + * @GUARD_INTERVAL_1_32: Guard interval 1/32 + * @GUARD_INTERVAL_1_16: Guard interval 1/16 + * @GUARD_INTERVAL_1_8: Guard interval 1/8 + * @GUARD_INTERVAL_1_4: Guard interval 1/4 + * @GUARD_INTERVAL_19_128: Guard interval 19/128 + * @GUARD_INTERVAL_19_256: Guard interval 19/256 + * @GUARD_INTERVAL_PN420: PN length 420 (1/4) + * @GUARD_INTERVAL_PN595: PN length 595 (1/6) + * @GUARD_INTERVAL_PN945: PN length 945 (1/9) + * + * Please note that not all guard intervals are supported by a given standard. + */ enum fe_guard_interval { GUARD_INTERVAL_1_32, GUARD_INTERVAL_1_16, @@ -202,6 +420,16 @@ enum fe_guard_interval { GUARD_INTERVAL_PN945, }; +/** + * enum fe_hierarchy - Hierarchy + * @HIERARCHY_NONE: No hierarchy + * @HIERARCHY_AUTO: Autodetect hierarchy (if supported) + * @HIERARCHY_1: Hierarchy 1 + * @HIERARCHY_2: Hierarchy 2 + * @HIERARCHY_4: Hierarchy 4 + * + * Please note that not all hierarchy types are supported by a given standard. + */ enum fe_hierarchy { HIERARCHY_NONE, HIERARCHY_1, @@ -210,6 +438,15 @@ enum fe_hierarchy { HIERARCHY_AUTO }; +/** + * enum fe_interleaving - Interleaving + * @INTERLEAVING_NONE: No interleaving. + * @INTERLEAVING_AUTO: Auto-detect interleaving. + * @INTERLEAVING_240: Interleaving of 240 symbols. + * @INTERLEAVING_720: Interleaving of 720 symbols. + * + * Please note that, currently, only DTMB uses it. 
+ */ enum fe_interleaving { INTERLEAVING_NONE, INTERLEAVING_AUTO, @@ -217,7 +454,8 @@ INTERLEAVING_720, }; -/* S2API Commands */ +/* DVBv5 property Commands */ + #define DTV_UNDEFINED 0 #define DTV_TUNE 1 #define DTV_CLEAR 2 @@ -310,19 +548,79 @@ enum fe_interleaving { #define DTV_MAX_COMMAND DTV_STAT_TOTAL_BLOCK_COUNT +/** + * enum fe_pilot - Type of pilot tone + * + * @PILOT_ON: Pilot tones enabled + * @PILOT_OFF: Pilot tones disabled + * @PILOT_AUTO: Autodetect pilot tones + */ enum fe_pilot { PILOT_ON, PILOT_OFF, PILOT_AUTO, }; +/** + * enum fe_rolloff - Rolloff factor (also known as alpha) + * @ROLLOFF_35: Rolloff factor: 35% + * @ROLLOFF_20: Rolloff factor: 20% + * @ROLLOFF_25: Rolloff factor: 25% + * @ROLLOFF_AUTO: Auto-detect the rolloff factor. + * + * .. note:: + * + * Rolloff factor of 35% is implied on DVB-S. On DVB-S2, it is the default. + */ enum fe_rolloff { - ROLLOFF_35, /* Implied value in DVB-S, default for DVB-S2 */ + ROLLOFF_35, ROLLOFF_20, ROLLOFF_25, ROLLOFF_AUTO, }; +/** + * enum fe_delivery_system - Type of the delivery system + * + * @SYS_UNDEFINED: + * Undefined standard. Generally, indicates an error + * @SYS_DVBC_ANNEX_A: + * Cable TV: DVB-C following ITU-T J.83 Annex A spec + * @SYS_DVBC_ANNEX_B: + * Cable TV: DVB-C following ITU-T J.83 Annex B spec (ClearQAM) + * @SYS_DVBC_ANNEX_C: + * Cable TV: DVB-C following ITU-T J.83 Annex C spec + * @SYS_ISDBC: + * Cable TV: ISDB-C (no drivers yet) + * @SYS_DVBT: + * Terrestrial TV: DVB-T + * @SYS_DVBT2: + * Terrestrial TV: DVB-T2 + * @SYS_ISDBT: + * Terrestrial TV: ISDB-T + * @SYS_ATSC: + * Terrestrial TV: ATSC + * @SYS_ATSCMH: + * Terrestrial TV (mobile): ATSC-M/H + * @SYS_DTMB: + * Terrestrial TV: DTMB + * @SYS_DVBS: + * Satellite TV: DVB-S + * @SYS_DVBS2: + * Satellite TV: DVB-S2 + * @SYS_TURBO: + * Satellite TV: DVB-S Turbo + * @SYS_ISDBS: + * Satellite TV: ISDB-S + * @SYS_DAB: + * Digital audio: DAB (not fully supported) + * @SYS_DSS: + * Satellite TV: DSS (not fully supported) + * @SYS_CMMB: + * Terrestrial TV (mobile): CMMB (not fully supported) + * @SYS_DVBH: + * Terrestrial TV (mobile): DVB-H (standard deprecated) + */ enum fe_delivery_system { SYS_UNDEFINED, SYS_DVBC_ANNEX_A, @@ -345,35 +643,85 @@ enum fe_delivery_system { SYS_DVBC_ANNEX_C, }; -/* backward compatibility */ +/* backward compatibility definitions for delivery systems */ #define SYS_DVBC_ANNEX_AC SYS_DVBC_ANNEX_A -#define SYS_DMBTH SYS_DTMB /* DMB-TH is legacy name, use DTMB instead */ +#define SYS_DMBTH SYS_DTMB /* DMB-TH is legacy name, use DTMB */ -/* ATSC-MH */ +/* ATSC-MH specific parameters */ +/** + * enum atscmh_sccc_block_mode - Type of Series Concatenated Convolutional + * Code Block Mode. + * + * @ATSCMH_SCCC_BLK_SEP: + * Separate SCCC: the SCCC outer code mode shall be set independently + * for each Group Region (A, B, C, D) + * @ATSCMH_SCCC_BLK_COMB: + * Combined SCCC: all four Regions shall have the same SCCC outer + * code mode. + * @ATSCMH_SCCC_BLK_RES: + * Reserved. Shouldn't be used. + */ enum atscmh_sccc_block_mode { ATSCMH_SCCC_BLK_SEP = 0, ATSCMH_SCCC_BLK_COMB = 1, ATSCMH_SCCC_BLK_RES = 2, }; +/** + * enum atscmh_sccc_code_mode - Type of Series Concatenated Convolutional + * Code Rate. + * + * @ATSCMH_SCCC_CODE_HLF: + * The outer code rate of a SCCC Block is 1/2 rate. + * @ATSCMH_SCCC_CODE_QTR: + * The outer code rate of a SCCC Block is 1/4 rate. + * @ATSCMH_SCCC_CODE_RES: + * Reserved. Should not be used.
+ */ enum atscmh_sccc_code_mode { ATSCMH_SCCC_CODE_HLF = 0, ATSCMH_SCCC_CODE_QTR = 1, ATSCMH_SCCC_CODE_RES = 2, }; +/** + * enum atscmh_rs_frame_ensemble - Reed Solomon(RS) frame ensemble. + * + * @ATSCMH_RSFRAME_ENS_PRI: Primary Ensemble. + * @ATSCMH_RSFRAME_ENS_SEC: Secondary Ensemble. + */ enum atscmh_rs_frame_ensemble { ATSCMH_RSFRAME_ENS_PRI = 0, ATSCMH_RSFRAME_ENS_SEC = 1, }; +/** + * enum atscmh_rs_frame_mode - Reed Solomon (RS) frame mode. + * + * @ATSCMH_RSFRAME_PRI_ONLY: + * Single Frame: There is only a primary RS Frame for all Group + * Regions. + * @ATSCMH_RSFRAME_PRI_SEC: + * Dual Frame: There are two separate RS Frames: Primary RS Frame for + * Group Region A and B and Secondary RS Frame for Group Region C and + * D. + * @ATSCMH_RSFRAME_RES: + * Reserved. Shouldn't be used. + */ enum atscmh_rs_frame_mode { ATSCMH_RSFRAME_PRI_ONLY = 0, ATSCMH_RSFRAME_PRI_SEC = 1, ATSCMH_RSFRAME_RES = 2, }; +/** + * enum atscmh_rs_code_mode + * @ATSCMH_RSCODE_211_187: Reed Solomon code (211,187). + * @ATSCMH_RSCODE_223_187: Reed Solomon code (223,187). + * @ATSCMH_RSCODE_235_187: Reed Solomon code (235,187). + * @ATSCMH_RSCODE_RES: Reserved. Shouldn't be used. + */ enum atscmh_rs_code_mode { ATSCMH_RSCODE_211_187 = 0, ATSCMH_RSCODE_223_187 = 1, @@ -385,16 +733,17 @@ enum atscmh_rs_code_mode { #define LNA_AUTO (~0U) /** - * Scale types for the quality parameters. + * enum fecap_scale_params - scale types for the quality parameters. + * * @FE_SCALE_NOT_AVAILABLE: That QoS measure is not available. That * could indicate a temporary or a permanent * condition. * @FE_SCALE_DECIBEL: The scale is measured in 0.001 dB steps, typically - * used on signal measures. + * used on signal measures. * @FE_SCALE_RELATIVE: The scale is a relative percentual measure, - * ranging from 0 (0%) to 0xffff (100%). + * ranging from 0 (0%) to 0xffff (100%). * @FE_SCALE_COUNTER: The scale counts the occurrence of an event, like - * bit error, block error, lapsed time. + * bit error, block error, lapsed time. */ enum fecap_scale_params { FE_SCALE_NOT_AVAILABLE = 0, @@ -406,24 +755,38 @@ enum fecap_scale_params { /** * struct dtv_stats - Used for reading a DTV status property * - * @value: value of the measure. Should range from 0 to 0xffff; * @scale: Filled with enum fecap_scale_params - the scale * in usage for that parameter * + * The ``{unnamed_union}`` may have either one of the values below: + * + * %svalue + * integer value of the measure, for %FE_SCALE_DECIBEL, + * used for dB measures. The unit is 0.001 dB. + * + * %uvalue + * unsigned integer value of the measure, used when @scale is + * either %FE_SCALE_RELATIVE or %FE_SCALE_COUNTER. + * * For most delivery systems, this will return a single value for each * parameter. + * * It should be noticed, however, that new OFDM delivery systems like * ISDB can use different modulation types for each group of carriers. * On such standards, up to 8 groups of statistics can be provided, one * for each carrier group (called "layer" on ISDB). + * * In order to be consistent with other delivery systems, the first * value refers to the entire set of carriers ("global"). - * dtv_status:scale should use the value FE_SCALE_NOT_AVAILABLE when + * + * @scale should use the value %FE_SCALE_NOT_AVAILABLE when * the value for the entire group of carriers or from one specific layer * is not provided by the hardware. - * st.len should be filled with the latest filled status + 1. 
* - * In other words, for ISDB, those values should be filled like: + * @len should be filled with the latest filled status + 1. + * + * In other words, for ISDB, those values should be filled like:: + * * u.st.stat.svalue[0] = global statistics; * u.st.stat.scale[0] = FE_SCALE_DECIBEL; * u.st.stat.value[1] = layer A statistics; @@ -445,11 +808,39 @@ struct dtv_stats { #define MAX_DTV_STATS 4 +/** + * struct dtv_fe_stats - store Digital TV frontend statistics + * + * @len: length of the statistics - if zero, stats is disabled. + * @stat: array with digital TV statistics. + * + * On most standards, @len can either be 0 or 1. However, for ISDB, each + * layer is modulated separately. So, each layer may have its own set + * of statistics. If so, stat[0] carries a global value for the property. + * Indexes 1 to 3 mean layers A to C. + */ struct dtv_fe_stats { __u8 len; struct dtv_stats stat[MAX_DTV_STATS]; } __attribute__ ((packed)); +/** + * struct dtv_property - store one frontend command and its value + * + * @cmd: Digital TV command. + * @reserved: Not used. + * @u: Union with the values for the command. + * @result: Result of the command set (currently unused). + * + * The @u union may have either one of the values below: + * + * %data + * an unsigned 32-bit number. + * %st + * a &struct dtv_fe_stats array of statistics. + * %buffer + * a buffer of up to 32 characters (currently unused). + */ struct dtv_property { __u32 cmd; __u32 reserved[3]; @@ -469,17 +860,70 @@ struct dtv_property { /* num of properties cannot exceed DTV_IOCTL_MAX_MSGS per ioctl */ #define DTV_IOCTL_MAX_MSGS 64 +/** + * struct dtv_properties - a set of command/value pairs. + * + * @num: number of commands stored in the struct. + * @props: a pointer to &struct dtv_property. + */ struct dtv_properties { __u32 num; struct dtv_property *props; }; +/* + * When set, this flag will disable any zigzagging or other "normal" tuning + * behavior. Additionally, there will be no automatic monitoring of the lock + * status, and hence no frontend events will be generated. If a frontend device + * is closed, this flag will be automatically turned off when the device is + * reopened read-write.
+ */ +#define FE_TUNE_MODE_ONESHOT 0x01 + +/* Digital TV Frontend API calls */ + +#define FE_GET_INFO _IOR('o', 61, struct dvb_frontend_info) + +#define FE_DISEQC_RESET_OVERLOAD _IO('o', 62) +#define FE_DISEQC_SEND_MASTER_CMD _IOW('o', 63, struct dvb_diseqc_master_cmd) +#define FE_DISEQC_RECV_SLAVE_REPLY _IOR('o', 64, struct dvb_diseqc_slave_reply) +#define FE_DISEQC_SEND_BURST _IO('o', 65) /* fe_sec_mini_cmd_t */ + +#define FE_SET_TONE _IO('o', 66) /* fe_sec_tone_mode_t */ +#define FE_SET_VOLTAGE _IO('o', 67) /* fe_sec_voltage_t */ +#define FE_ENABLE_HIGH_LNB_VOLTAGE _IO('o', 68) /* int */ + +#define FE_READ_STATUS _IOR('o', 69, fe_status_t) +#define FE_READ_BER _IOR('o', 70, __u32) +#define FE_READ_SIGNAL_STRENGTH _IOR('o', 71, __u16) +#define FE_READ_SNR _IOR('o', 72, __u16) +#define FE_READ_UNCORRECTED_BLOCKS _IOR('o', 73, __u32) + +#define FE_SET_FRONTEND_TUNE_MODE _IO('o', 81) /* unsigned int */ +#define FE_GET_EVENT _IOR('o', 78, struct dvb_frontend_event) + +#define FE_DISHNETWORK_SEND_LEGACY_CMD _IO('o', 80) /* unsigned int */ + +#define FE_SET_PROPERTY _IOW('o', 82, struct dtv_properties) +#define FE_GET_PROPERTY _IOR('o', 83, struct dtv_properties) + #if defined(__DVB_CORE__) || !defined (__KERNEL__) /* - * DEPRECATED: The DVBv3 ioctls, structs and enums should not be used on - * newer programs, as it doesn't support the second generation of digital - * TV standards, nor supports newer delivery systems. + * DEPRECATED: Everything below is deprecated in favor of DVBv5 API + * + * The DVBv3 only ioctls, structs and enums should not be used on + * newer programs, as it doesn't support the second generation of + * digital TV standards, nor supports newer delivery systems. + * They also don't support modern frontends with usually support multiple + * delivery systems. + * + * Drivers shouldn't use them. + * + * New applications should use DVBv5 delivery system instead + */ + +/* */ enum fe_bandwidth { @@ -492,7 +936,7 @@ enum fe_bandwidth { BANDWIDTH_1_712_MHZ, }; -/* This is needed for legacy userspace support */ +/* This is kept for legacy userspace support */ typedef enum fe_sec_voltage fe_sec_voltage_t; typedef enum fe_caps fe_caps_t; typedef enum fe_type fe_type_t; @@ -510,6 +954,8 @@ typedef enum fe_pilot fe_pilot_t; typedef enum fe_rolloff fe_rolloff_t; typedef enum fe_delivery_system fe_delivery_system_t; +/* DVBv3 structs */ + struct dvb_qpsk_parameters { __u32 symbol_rate; /* symbol rate in Symbols per second */ fe_code_rate_t fec_inner; /* forward error correction (see above) */ @@ -551,42 +997,12 @@ struct dvb_frontend_event { fe_status_t status; struct dvb_frontend_parameters parameters; }; -#endif - -#define FE_SET_PROPERTY _IOW('o', 82, struct dtv_properties) -#define FE_GET_PROPERTY _IOR('o', 83, struct dtv_properties) - -/** - * When set, this flag will disable any zigzagging or other "normal" tuning - * behaviour. Additionally, there will be no automatic monitoring of the lock - * status, and hence no frontend events will be generated. If a frontend device - * is closed, this flag will be automatically turned off when the device is - * reopened read-write. 
- */ -#define FE_TUNE_MODE_ONESHOT 0x01 -#define FE_GET_INFO _IOR('o', 61, struct dvb_frontend_info) - -#define FE_DISEQC_RESET_OVERLOAD _IO('o', 62) -#define FE_DISEQC_SEND_MASTER_CMD _IOW('o', 63, struct dvb_diseqc_master_cmd) -#define FE_DISEQC_RECV_SLAVE_REPLY _IOR('o', 64, struct dvb_diseqc_slave_reply) -#define FE_DISEQC_SEND_BURST _IO('o', 65) /* fe_sec_mini_cmd_t */ - -#define FE_SET_TONE _IO('o', 66) /* fe_sec_tone_mode_t */ -#define FE_SET_VOLTAGE _IO('o', 67) /* fe_sec_voltage_t */ -#define FE_ENABLE_HIGH_LNB_VOLTAGE _IO('o', 68) /* int */ - -#define FE_READ_STATUS _IOR('o', 69, fe_status_t) -#define FE_READ_BER _IOR('o', 70, __u32) -#define FE_READ_SIGNAL_STRENGTH _IOR('o', 71, __u16) -#define FE_READ_SNR _IOR('o', 72, __u16) -#define FE_READ_UNCORRECTED_BLOCKS _IOR('o', 73, __u32) +/* DVBv3 API calls */ #define FE_SET_FRONTEND _IOW('o', 76, struct dvb_frontend_parameters) #define FE_GET_FRONTEND _IOR('o', 77, struct dvb_frontend_parameters) -#define FE_SET_FRONTEND_TUNE_MODE _IO('o', 81) /* unsigned int */ -#define FE_GET_EVENT _IOR('o', 78, struct dvb_frontend_event) -#define FE_DISHNETWORK_SEND_LEGACY_CMD _IO('o', 80) /* unsigned int */ +#endif #endif /*_DVBFRONTEND_H_*/ -- cgit v1.2.3 From 9d5e27cbc117671959a9f625e51c754f5a0666e3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 30 Aug 2017 13:45:20 -0400 Subject: media: dvb frontend docs: use kernel-doc documentation Now that frontend.h contains most documentation for the frontend, remove the duplicated information from Documentation/ and use the kernel-doc auto-generated one instead. That should simplify maintainership of DVB frontend uAPI, as most of the documentation will stick with the header file. Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/frontend.h.rst.exceptions | 185 +- Documentation/media/uapi/dvb/dtv-fe-stats.rst | 17 - Documentation/media/uapi/dvb/dtv-properties.rst | 15 - Documentation/media/uapi/dvb/dtv-property.rst | 31 - Documentation/media/uapi/dvb/dtv-stats.rst | 18 - Documentation/media/uapi/dvb/dvbproperty-006.rst | 12 - Documentation/media/uapi/dvb/dvbproperty.rst | 28 +- .../media/uapi/dvb/fe-diseqc-recv-slave-reply.rst | 40 +- .../media/uapi/dvb/fe-diseqc-send-burst.rst | 31 +- .../media/uapi/dvb/fe-diseqc-send-master-cmd.rst | 29 +- Documentation/media/uapi/dvb/fe-get-info.rst | 370 +--- Documentation/media/uapi/dvb/fe-get-property.rst | 2 +- Documentation/media/uapi/dvb/fe-read-status.rst | 83 - Documentation/media/uapi/dvb/fe-set-tone.rst | 30 - .../media/uapi/dvb/fe_property_parameters.rst | 1847 +++++--------------- Documentation/media/uapi/dvb/frontend-header.rst | 4 + include/uapi/linux/dvb/frontend.h | 8 +- 17 files changed, 602 insertions(+), 2148 deletions(-) delete mode 100644 Documentation/media/uapi/dvb/dtv-fe-stats.rst delete mode 100644 Documentation/media/uapi/dvb/dtv-properties.rst delete mode 100644 Documentation/media/uapi/dvb/dtv-property.rst delete mode 100644 Documentation/media/uapi/dvb/dtv-stats.rst delete mode 100644 Documentation/media/uapi/dvb/dvbproperty-006.rst create mode 100644 Documentation/media/uapi/dvb/frontend-header.rst (limited to 'include') diff --git a/Documentation/media/frontend.h.rst.exceptions b/Documentation/media/frontend.h.rst.exceptions index 7656770f1936..f7c4df620a52 100644 --- a/Documentation/media/frontend.h.rst.exceptions +++ b/Documentation/media/frontend.h.rst.exceptions @@ -25,19 +25,9 @@ ignore define DTV_MAX_COMMAND ignore define MAX_DTV_STATS ignore define DTV_IOCTL_MAX_MSGS -# Stats enum is documented 
altogether -replace enum fecap_scale_params :ref:`frontend-stat-properties` -replace symbol FE_SCALE_COUNTER frontend-stat-properties -replace symbol FE_SCALE_DECIBEL frontend-stat-properties -replace symbol FE_SCALE_NOT_AVAILABLE frontend-stat-properties -replace symbol FE_SCALE_RELATIVE frontend-stat-properties - # the same reference is used for both get and set ioctls replace ioctl FE_SET_PROPERTY :c:type:`FE_GET_PROPERTY` -# Ignore struct used only internally at Kernel -ignore struct dtv_cmds_h - # Typedefs that use the enum reference replace typedef fe_sec_voltage_t :c:type:`fe_sec_voltage` @@ -45,3 +35,178 @@ replace typedef fe_sec_voltage_t :c:type:`fe_sec_voltage` replace define FE_TUNE_MODE_ONESHOT :c:func:`FE_SET_FRONTEND_TUNE_MODE` replace define LNA_AUTO dtv-lna replace define NO_STREAM_ID_FILTER dtv-stream-id + +# Those enums are defined at the frontend.h header, and not externally + +ignore symbol FE_IS_STUPID +ignore symbol FE_CAN_INVERSION_AUTO +ignore symbol FE_CAN_FEC_1_2 +ignore symbol FE_CAN_FEC_2_3 +ignore symbol FE_CAN_FEC_3_4 +ignore symbol FE_CAN_FEC_4_5 +ignore symbol FE_CAN_FEC_5_6 +ignore symbol FE_CAN_FEC_6_7 +ignore symbol FE_CAN_FEC_7_8 +ignore symbol FE_CAN_FEC_8_9 +ignore symbol FE_CAN_FEC_AUTO +ignore symbol FE_CAN_QPSK +ignore symbol FE_CAN_QAM_16 +ignore symbol FE_CAN_QAM_32 +ignore symbol FE_CAN_QAM_64 +ignore symbol FE_CAN_QAM_128 +ignore symbol FE_CAN_QAM_256 +ignore symbol FE_CAN_QAM_AUTO +ignore symbol FE_CAN_TRANSMISSION_MODE_AUTO +ignore symbol FE_CAN_BANDWIDTH_AUTO +ignore symbol FE_CAN_GUARD_INTERVAL_AUTO +ignore symbol FE_CAN_HIERARCHY_AUTO +ignore symbol FE_CAN_8VSB +ignore symbol FE_CAN_16VSB +ignore symbol FE_HAS_EXTENDED_CAPS +ignore symbol FE_CAN_MULTISTREAM +ignore symbol FE_CAN_TURBO_FEC +ignore symbol FE_CAN_2G_MODULATION +ignore symbol FE_NEEDS_BENDING +ignore symbol FE_CAN_RECOVER +ignore symbol FE_CAN_MUTE_TS + +ignore symbol QPSK +ignore symbol QAM_16 +ignore symbol QAM_32 +ignore symbol QAM_64 +ignore symbol QAM_128 +ignore symbol QAM_256 +ignore symbol QAM_AUTO +ignore symbol VSB_8 +ignore symbol VSB_16 +ignore symbol PSK_8 +ignore symbol APSK_16 +ignore symbol APSK_32 +ignore symbol DQPSK +ignore symbol QAM_4_NR + +ignore symbol SEC_VOLTAGE_13 +ignore symbol SEC_VOLTAGE_18 +ignore symbol SEC_VOLTAGE_OFF + +ignore symbol SEC_TONE_ON +ignore symbol SEC_TONE_OFF + +ignore symbol SEC_MINI_A +ignore symbol SEC_MINI_B + +ignore symbol FE_NONE +ignore symbol FE_HAS_SIGNAL +ignore symbol FE_HAS_CARRIER +ignore symbol FE_HAS_VITERBI +ignore symbol FE_HAS_SYNC +ignore symbol FE_HAS_LOCK +ignore symbol FE_REINIT +ignore symbol FE_TIMEDOUT + +ignore symbol FEC_NONE +ignore symbol FEC_1_2 +ignore symbol FEC_2_3 +ignore symbol FEC_3_4 +ignore symbol FEC_4_5 +ignore symbol FEC_5_6 +ignore symbol FEC_6_7 +ignore symbol FEC_7_8 +ignore symbol FEC_8_9 +ignore symbol FEC_AUTO +ignore symbol FEC_3_5 +ignore symbol FEC_9_10 +ignore symbol FEC_2_5 + +ignore symbol TRANSMISSION_MODE_AUTO +ignore symbol TRANSMISSION_MODE_1K +ignore symbol TRANSMISSION_MODE_2K +ignore symbol TRANSMISSION_MODE_8K +ignore symbol TRANSMISSION_MODE_4K +ignore symbol TRANSMISSION_MODE_16K +ignore symbol TRANSMISSION_MODE_32K +ignore symbol TRANSMISSION_MODE_C1 +ignore symbol TRANSMISSION_MODE_C3780 +ignore symbol TRANSMISSION_MODE_2K +ignore symbol TRANSMISSION_MODE_8K + +ignore symbol GUARD_INTERVAL_AUTO +ignore symbol GUARD_INTERVAL_1_128 +ignore symbol GUARD_INTERVAL_1_32 +ignore symbol GUARD_INTERVAL_1_16 +ignore symbol GUARD_INTERVAL_1_8 +ignore symbol GUARD_INTERVAL_1_4 
+ignore symbol GUARD_INTERVAL_19_128 +ignore symbol GUARD_INTERVAL_19_256 +ignore symbol GUARD_INTERVAL_PN420 +ignore symbol GUARD_INTERVAL_PN595 +ignore symbol GUARD_INTERVAL_PN945 + +ignore symbol HIERARCHY_NONE +ignore symbol HIERARCHY_AUTO +ignore symbol HIERARCHY_1 +ignore symbol HIERARCHY_2 +ignore symbol HIERARCHY_4 + +ignore symbol INTERLEAVING_NONE +ignore symbol INTERLEAVING_AUTO +ignore symbol INTERLEAVING_240 +ignore symbol INTERLEAVING_720 + +ignore symbol PILOT_ON +ignore symbol PILOT_OFF +ignore symbol PILOT_AUTO + +ignore symbol ROLLOFF_35 +ignore symbol ROLLOFF_20 +ignore symbol ROLLOFF_25 +ignore symbol ROLLOFF_AUTO + +ignore symbol INVERSION_ON +ignore symbol INVERSION_OFF +ignore symbol INVERSION_AUTO + +ignore symbol SYS_UNDEFINED +ignore symbol SYS_DVBC_ANNEX_A +ignore symbol SYS_DVBC_ANNEX_B +ignore symbol SYS_DVBC_ANNEX_C +ignore symbol SYS_ISDBC +ignore symbol SYS_DVBT +ignore symbol SYS_DVBT2 +ignore symbol SYS_ISDBT +ignore symbol SYS_ATSC +ignore symbol SYS_ATSCMH +ignore symbol SYS_DTMB +ignore symbol SYS_DVBS +ignore symbol SYS_DVBS2 +ignore symbol SYS_TURBO +ignore symbol SYS_ISDBS +ignore symbol SYS_DAB +ignore symbol SYS_DSS +ignore symbol SYS_CMMB +ignore symbol SYS_DVBH + +ignore symbol ATSCMH_SCCC_BLK_SEP +ignore symbol ATSCMH_SCCC_BLK_COMB +ignore symbol ATSCMH_SCCC_BLK_RES + +ignore symbol ATSCMH_SCCC_CODE_HLF +ignore symbol ATSCMH_SCCC_CODE_QTR +ignore symbol ATSCMH_SCCC_CODE_RES + +ignore symbol ATSCMH_RSFRAME_ENS_PRI +ignore symbol ATSCMH_RSFRAME_ENS_SEC + +ignore symbol ATSCMH_RSFRAME_PRI_ONLY +ignore symbol ATSCMH_RSFRAME_PRI_SEC +ignore symbol ATSCMH_RSFRAME_RES + +ignore symbol ATSCMH_RSCODE_211_187 +ignore symbol ATSCMH_RSCODE_223_187 +ignore symbol ATSCMH_RSCODE_235_187 +ignore symbol ATSCMH_RSCODE_RES + +ignore symbol FE_SCALE_NOT_AVAILABLE +ignore symbol FE_SCALE_DECIBEL +ignore symbol FE_SCALE_RELATIVE +ignore symbol FE_SCALE_COUNTER diff --git a/Documentation/media/uapi/dvb/dtv-fe-stats.rst b/Documentation/media/uapi/dvb/dtv-fe-stats.rst deleted file mode 100644 index e8a02a1f138d..000000000000 --- a/Documentation/media/uapi/dvb/dtv-fe-stats.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. c:type:: dtv_fe_stats - -******************* -struct dtv_fe_stats -******************* - - -.. code-block:: c - - #define MAX_DTV_STATS 4 - - struct dtv_fe_stats { - __u8 len; - struct dtv_stats stat[MAX_DTV_STATS]; - } __packed; diff --git a/Documentation/media/uapi/dvb/dtv-properties.rst b/Documentation/media/uapi/dvb/dtv-properties.rst deleted file mode 100644 index 48c4e834ad11..000000000000 --- a/Documentation/media/uapi/dvb/dtv-properties.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. c:type:: dtv_properties - -********************* -struct dtv_properties -********************* - - -.. code-block:: c - - struct dtv_properties { - __u32 num; - struct dtv_property *props; - }; diff --git a/Documentation/media/uapi/dvb/dtv-property.rst b/Documentation/media/uapi/dvb/dtv-property.rst deleted file mode 100644 index 3ddc3474b00e..000000000000 --- a/Documentation/media/uapi/dvb/dtv-property.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. c:type:: dtv_property - -******************* -struct dtv_property -******************* - - -.. 
code-block:: c - - /* Reserved fields should be set to 0 */ - - struct dtv_property { - __u32 cmd; - __u32 reserved[3]; - union { - __u32 data; - struct dtv_fe_stats st; - struct { - __u8 data[32]; - __u32 len; - __u32 reserved1[3]; - void *reserved2; - } buffer; - } u; - int result; - } __attribute__ ((packed)); - - /* num of properties cannot exceed DTV_IOCTL_MAX_MSGS per ioctl */ - #define DTV_IOCTL_MAX_MSGS 64 diff --git a/Documentation/media/uapi/dvb/dtv-stats.rst b/Documentation/media/uapi/dvb/dtv-stats.rst deleted file mode 100644 index 35239e72bf74..000000000000 --- a/Documentation/media/uapi/dvb/dtv-stats.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. c:type:: dtv_stats - -**************** -struct dtv_stats -**************** - - -.. code-block:: c - - struct dtv_stats { - __u8 scale; /* enum fecap_scale_params type */ - union { - __u64 uvalue; /* for counters and relative scales */ - __s64 svalue; /* for 1/1000 dB measures */ - }; - } __packed; diff --git a/Documentation/media/uapi/dvb/dvbproperty-006.rst b/Documentation/media/uapi/dvb/dvbproperty-006.rst deleted file mode 100644 index 3343a0f306fe..000000000000 --- a/Documentation/media/uapi/dvb/dvbproperty-006.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -************** -Property types -************** - -On :ref:`FE_GET_PROPERTY and FE_SET_PROPERTY `, -the actual action is determined by the dtv_property cmd/data pairs. -With one single ioctl, is possible to get/set up to 64 properties. The -actual meaning of each property is described on the next sections. - -The available frontend property types are shown on the next section. diff --git a/Documentation/media/uapi/dvb/dvbproperty.rst b/Documentation/media/uapi/dvb/dvbproperty.rst index 843f1d70aff0..c40943be5925 100644 --- a/Documentation/media/uapi/dvb/dvbproperty.rst +++ b/Documentation/media/uapi/dvb/dvbproperty.rst @@ -2,8 +2,9 @@ .. _frontend-properties: -DVB Frontend properties -======================= +************** +Property types +************** Tuning into a Digital TV physical channel and starting decoding it requires changing a set of parameters, in order to control the tuner, @@ -20,10 +21,15 @@ enough to group the structs that would be required for those new standards. Also, extending it would break userspace. So, the legacy union/struct based approach was deprecated, in favor -of a properties set approach. +of a properties set approach. With this approach, :ref:`FE_GET_PROPERTY and FE_SET_PROPERTY ` are used +to set up the frontend and read its status. + +The actual action is determined by a set of dtv_property cmd/data pairs. +With one single ioctl, it is possible to get/set up to 64 properties. This section describes the new and recommended way to set the frontend, -with suppports all digital TV delivery systems. +which supports all digital TV delivery systems. .. note:: @@ -63,12 +69,9 @@ Mbauds, those properties should be sent to The code that would do the above is shown in :ref:`dtv-prop-example`. -.. _dtv-prop-example: - -Example: Setting digital TV frontend properties -=============================================== - .. code-block:: c + :caption: Example: Setting digital TV frontend properties + :name: dtv-prop-example #include #include @@ -112,17 +115,12 @@ Example: Setting digital TV frontend properties provides methods for usual operations like program scanning and to read/write channel descriptor files. - ..
toctree:: :maxdepth: 1 - dtv-stats - dtv-fe-stats - dtv-property - dtv-properties - dvbproperty-006 fe_property_parameters frontend-stat-properties frontend-property-terrestrial-systems frontend-property-cable-systems frontend-property-satellite-systems + frontend-header diff --git a/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst b/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst index 302db2857f90..473855584d7f 100644 --- a/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst +++ b/Documentation/media/uapi/dvb/fe-diseqc-recv-slave-reply.rst @@ -26,8 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` - pointer to struct - :c:type:`dvb_diseqc_slave_reply` + pointer to struct :c:type:`dvb_diseqc_slave_reply`. Description =========== Receives reply from a DiSEqC 2.0 command. -.. c:type:: dvb_diseqc_slave_reply - -.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| - -.. flat-table:: struct dvb_diseqc_slave_reply - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - - - .. row 1 - - - uint8_t - - - msg[4] - - - DiSEqC message (framing, data[3]) - - - .. row 2 - - - uint8_t - - - msg_len - - - Length of the DiSEqC message. Valid values are 0 to 4, where 0 - means no msg - - - .. row 3 - - - int - - - timeout - - - Return from ioctl after timeout ms with errorcode when no message - was received - +The received message is stored in the buffer pointed to by ``argp``. Return Value ============ diff --git a/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst b/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst index e962f6ec5aaf..54d35517e784 100644 --- a/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst +++ b/Documentation/media/uapi/dvb/fe-diseqc-send-burst.rst @@ -26,7 +26,7 @@ Arguments File descriptor returned by :ref:`open() `. ``tone`` - an integer enumered value described at :c:type:`fe_sec_mini_cmd` + An integer enumerated value described at :c:type:`fe_sec_mini_cmd`. Description =========== @@ -39,35 +39,6 @@ read/write permissions. It provides support for what's specified at `Digital Satellite Equipment Control (DiSEqC) - Simple "ToneBurst" Detection Circuit specification. `__ -.. c:type:: fe_sec_mini_cmd - -.. flat-table:: enum fe_sec_mini_cmd - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _SEC-MINI-A: - - ``SEC_MINI_A`` - - - Sends a mini-DiSEqC 22kHz '0' Tone Burst to select satellite-A - - - .. row 3 - - - .. _SEC-MINI-B: - - ``SEC_MINI_B`` - - - Sends a mini-DiSEqC 22kHz '1' Data Burst to select satellite-B - Return Value ============ diff --git a/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst b/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst index bbcab3df39b5..7392d6747ad6 100644 --- a/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst +++ b/Documentation/media/uapi/dvb/fe-diseqc-send-master-cmd.rst @@ -33,34 +33,7 @@ Arguments Description =========== -Sends a DiSEqC command to the antenna subsystem. - - -.. c:type:: dvb_diseqc_master_cmd - -.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| - -.. flat-table:: struct dvb_diseqc_master_cmd - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - - - .. row 1 - - - uint8_t - - - msg[6] - - - DiSEqC message (framing, address, command, data[3]) - - - .. row 2 - - - uint8_t - - - msg_len - - - Length of the DiSEqC message. Valid values are 3 to 6 +Sends the DiSEqC command pointed to by ``argp`` to the antenna subsystem.
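A short usage sketch may help here. It is an editorial illustration rather than part of this documentation change, and both the device path and the DiSEqC 1.0 port-select bytes are assumptions about a typical committed-switch setup:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/dvb/frontend.h>

/* Select input 1 of a DiSEqC 1.0 committed switch: framing 0xe0,
 * address 0x10, command 0x38 ("write N0"), one data byte. */
static int diseqc_select_first_port(int fe_fd)
{
	struct dvb_diseqc_master_cmd cmd = {
		.msg = { 0xe0, 0x10, 0x38, 0xf0 },
		.msg_len = 4,
	};

	return ioctl(fe_fd, FE_DISEQC_SEND_MASTER_CMD, &cmd);
}

int main(void)
{
	/* hypothetical frontend node; DiSEqC ioctls need read/write mode */
	int fd = open("/dev/dvb/adapter0/frontend0", O_RDWR);

	if (fd < 0)
		return 1;
	return diseqc_select_first_port(fd) ? 1 : 0;
}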
Return Value ============ diff --git a/Documentation/media/uapi/dvb/fe-get-info.rst b/Documentation/media/uapi/dvb/fe-get-info.rst index e3d64b251f61..30dde8a791f3 100644 --- a/Documentation/media/uapi/dvb/fe-get-info.rst +++ b/Documentation/media/uapi/dvb/fe-get-info.rst @@ -40,112 +40,6 @@ takes a pointer to dvb_frontend_info which is filled by the driver. When the driver is not compatible with this specification the ioctl returns an error. -.. c:type:: dvb_frontend_info - -.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}| - -.. flat-table:: struct dvb_frontend_info - :header-rows: 0 - :stub-columns: 0 - :widths: 1 1 2 - - - - .. row 1 - - - char - - - name[128] - - - Name of the frontend - - - .. row 2 - - - fe_type_t - - - type - - - **DEPRECATED**. DVBv3 type. Should not be used on modern programs, - as a frontend may have more than one type. So, the DVBv5 API - should be used instead to enumerate and select the frontend type. - - - .. row 3 - - - uint32_t - - - frequency_min - - - Minimal frequency supported by the frontend - - - .. row 4 - - - uint32_t - - - frequency_max - - - Maximal frequency supported by the frontend - - - .. row 5 - - - uint32_t - - - frequency_stepsize - - - Frequency step - all frequencies are multiple of this value - - - .. row 6 - - - uint32_t - - - frequency_tolerance - - - Tolerance of the frequency - - - .. row 7 - - - uint32_t - - - symbol_rate_min - - - Minimal symbol rate (for Cable/Satellite systems), in bauds - - - .. row 8 - - - uint32_t - - - symbol_rate_max - - - Maximal symbol rate (for Cable/Satellite systems), in bauds - - - .. row 9 - - - uint32_t - - - symbol_rate_tolerance - - - Maximal symbol rate tolerance, in ppm - - - .. row 10 - - - uint32_t - - - notifier_delay - - - **DEPRECATED**. Not used by any driver. - - - .. row 11 - - - enum :c:type:`fe_caps` - - - caps - - - Capabilities supported by the frontend - - -.. note:: - - The frequencies are specified in Hz for Terrestrial and Cable - systems. They're specified in kHz for Satellite systems - frontend capabilities ===================== @@ -153,269 +47,7 @@ frontend capabilities Capabilities describe what a frontend can do. Some capabilities are supported only on some specific frontend types. -.. c:type:: fe_caps - -.. tabularcolumns:: |p{6.5cm}|p{11.0cm}| - -.. flat-table:: enum fe_caps - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _FE-IS-STUPID: - - ``FE_IS_STUPID`` - - - There's something wrong at the frontend, and it can't report its - capabilities - - - .. row 3 - - - .. _FE-CAN-INVERSION-AUTO: - - ``FE_CAN_INVERSION_AUTO`` - - - The frontend is capable of auto-detecting inversion - - - .. row 4 - - - .. _FE-CAN-FEC-1-2: - - ``FE_CAN_FEC_1_2`` - - - The frontend supports FEC 1/2 - - - .. row 5 - - - .. _FE-CAN-FEC-2-3: - - ``FE_CAN_FEC_2_3`` - - - The frontend supports FEC 2/3 - - - .. row 6 - - - .. _FE-CAN-FEC-3-4: - - ``FE_CAN_FEC_3_4`` - - - The frontend supports FEC 3/4 - - - .. row 7 - - - .. _FE-CAN-FEC-4-5: - - ``FE_CAN_FEC_4_5`` - - - The frontend supports FEC 4/5 - - - .. row 8 - - - .. _FE-CAN-FEC-5-6: - - ``FE_CAN_FEC_5_6`` - - - The frontend supports FEC 5/6 - - - .. row 9 - - - .. _FE-CAN-FEC-6-7: - - ``FE_CAN_FEC_6_7`` - - - The frontend supports FEC 6/7 - - - .. row 10 - - - .. _FE-CAN-FEC-7-8: - - ``FE_CAN_FEC_7_8`` - - - The frontend supports FEC 7/8 - - - .. row 11 - - - .. _FE-CAN-FEC-8-9: - - ``FE_CAN_FEC_8_9`` - - - The frontend supports FEC 8/9 - - - .. row 12 - - - .. 
_FE-CAN-FEC-AUTO: - - ``FE_CAN_FEC_AUTO`` - - - The frontend can autodetect FEC. - - - .. row 13 - - - .. _FE-CAN-QPSK: - - ``FE_CAN_QPSK`` - - - The frontend supports QPSK modulation - - - .. row 14 - - - .. _FE-CAN-QAM-16: - - ``FE_CAN_QAM_16`` - - - The frontend supports 16-QAM modulation - - - .. row 15 - - - .. _FE-CAN-QAM-32: - - ``FE_CAN_QAM_32`` - - - The frontend supports 32-QAM modulation - - - .. row 16 - - - .. _FE-CAN-QAM-64: - - ``FE_CAN_QAM_64`` - - - The frontend supports 64-QAM modulation - - - .. row 17 - - - .. _FE-CAN-QAM-128: - - ``FE_CAN_QAM_128`` - - - The frontend supports 128-QAM modulation - - - .. row 18 - - - .. _FE-CAN-QAM-256: - - ``FE_CAN_QAM_256`` - - - The frontend supports 256-QAM modulation - - - .. row 19 - - - .. _FE-CAN-QAM-AUTO: - - ``FE_CAN_QAM_AUTO`` - - - The frontend can autodetect modulation - - - .. row 20 - - - .. _FE-CAN-TRANSMISSION-MODE-AUTO: - - ``FE_CAN_TRANSMISSION_MODE_AUTO`` - - - The frontend can autodetect the transmission mode - - - .. row 21 - - - .. _FE-CAN-BANDWIDTH-AUTO: - - ``FE_CAN_BANDWIDTH_AUTO`` - - - The frontend can autodetect the bandwidth - - - .. row 22 - - - .. _FE-CAN-GUARD-INTERVAL-AUTO: - - ``FE_CAN_GUARD_INTERVAL_AUTO`` - - - The frontend can autodetect the guard interval - - - .. row 23 - - - .. _FE-CAN-HIERARCHY-AUTO: - - ``FE_CAN_HIERARCHY_AUTO`` - - - The frontend can autodetect hierarch - - - .. row 24 - - - .. _FE-CAN-8VSB: - - ``FE_CAN_8VSB`` - - - The frontend supports 8-VSB modulation - - - .. row 25 - - - .. _FE-CAN-16VSB: - - ``FE_CAN_16VSB`` - - - The frontend supports 16-VSB modulation - - - .. row 26 - - - .. _FE-HAS-EXTENDED-CAPS: - - ``FE_HAS_EXTENDED_CAPS`` - - - Currently, unused - - - .. row 27 - - - .. _FE-CAN-MULTISTREAM: - - ``FE_CAN_MULTISTREAM`` - - - The frontend supports multistream filtering - - - .. row 28 - - - .. _FE-CAN-TURBO-FEC: - - ``FE_CAN_TURBO_FEC`` - - - The frontend supports turbo FEC modulation - - - .. row 29 - - - .. _FE-CAN-2G-MODULATION: - - ``FE_CAN_2G_MODULATION`` - - - The frontend supports "2nd generation modulation" (DVB-S2/T2)> - - - .. row 30 - - - .. _FE-NEEDS-BENDING: - - ``FE_NEEDS_BENDING`` - - - Not supported anymore, don't use it - - - .. row 31 - - - .. _FE-CAN-RECOVER: - - ``FE_CAN_RECOVER`` - - - The frontend can recover from a cable unplug automatically - - - .. row 32 - - - .. _FE-CAN-MUTE-TS: - - ``FE_CAN_MUTE_TS`` - - - The frontend can stop spurious TS data output +The frontend capabilities are described at :c:type:`fe_caps`. Return Value diff --git a/Documentation/media/uapi/dvb/fe-get-property.rst b/Documentation/media/uapi/dvb/fe-get-property.rst index 015d4db597b5..b22e37c4a787 100644 --- a/Documentation/media/uapi/dvb/fe-get-property.rst +++ b/Documentation/media/uapi/dvb/fe-get-property.rst @@ -29,7 +29,7 @@ Arguments File descriptor returned by :ref:`open() `. ``argp`` - pointer to struct :c:type:`dtv_properties` + Pointer to struct :c:type:`dtv_properties`. Description diff --git a/Documentation/media/uapi/dvb/fe-read-status.rst b/Documentation/media/uapi/dvb/fe-read-status.rst index 76b93e386552..21b2db3591fd 100644 --- a/Documentation/media/uapi/dvb/fe-read-status.rst +++ b/Documentation/media/uapi/dvb/fe-read-status.rst @@ -52,89 +52,6 @@ The fe_status parameter is used to indicate the current state and/or state changes of the frontend hardware. It is produced using the enum :c:type:`fe_status` values on a bitmask -.. c:type:: fe_status - -.. tabularcolumns:: |p{3.5cm}|p{14.0cm}| - -.. _fe-status: - -.. 
flat-table:: enum fe_status - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _FE-NONE: - - ``FE_NONE`` - - - The frontend doesn't have any kind of lock. That's the initial frontend status - - - .. row 3 - - - .. _FE-HAS-SIGNAL: - - ``FE_HAS_SIGNAL`` - - - The frontend has found something above the noise level - - - .. row 4 - - - .. _FE-HAS-CARRIER: - - ``FE_HAS_CARRIER`` - - - The frontend has found a DVB signal - - - .. row 5 - - - .. _FE-HAS-VITERBI: - - ``FE_HAS_VITERBI`` - - - The frontend FEC inner coding (Viterbi, LDPC or other inner code) - is stable - - - .. row 6 - - - .. _FE-HAS-SYNC: - - ``FE_HAS_SYNC`` - - - Synchronization bytes was found - - - .. row 7 - - - .. _FE-HAS-LOCK: - - ``FE_HAS_LOCK`` - - - The DVB were locked and everything is working - - - .. row 8 - - - .. _FE-TIMEDOUT: - - ``FE_TIMEDOUT`` - - - no lock within the last about 2 seconds - - - .. row 9 - - - .. _FE-REINIT: - - ``FE_REINIT`` - - - The frontend was reinitialized, application is recommended to - reset DiSEqC, tone and parameters - Return Value ============ diff --git a/Documentation/media/uapi/dvb/fe-set-tone.rst b/Documentation/media/uapi/dvb/fe-set-tone.rst index 84e4da3fd4c9..d0950acbbe64 100644 --- a/Documentation/media/uapi/dvb/fe-set-tone.rst +++ b/Documentation/media/uapi/dvb/fe-set-tone.rst @@ -45,36 +45,6 @@ this is done using the DiSEqC ioctls. capability of selecting the band. So, it is recommended that applications would change to SEC_TONE_OFF when the device is not used. -.. c:type:: fe_sec_tone_mode - -.. flat-table:: enum fe_sec_tone_mode - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _SEC-TONE-ON: - - ``SEC_TONE_ON`` - - - Sends a 22kHz tone burst to the antenna - - - .. row 3 - - - .. _SEC-TONE-OFF: - - ``SEC_TONE_OFF`` - - - Don't send a 22kHz tone to the antenna (except if the - FE_DISEQC_* ioctls are called) - Return Value ============ diff --git a/Documentation/media/uapi/dvb/fe_property_parameters.rst b/Documentation/media/uapi/dvb/fe_property_parameters.rst index 7bb7559c4500..c6eb74f59b00 100644 --- a/Documentation/media/uapi/dvb/fe_property_parameters.rst +++ b/Documentation/media/uapi/dvb/fe_property_parameters.rst @@ -6,6 +6,11 @@ Digital TV property parameters ****************************** +There are several different Digital TV parameters that can be used by +:ref:`FE_SET_PROPERTY and FE_GET_PROPERTY ioctls`. +This section describes each of them. Please notice, however, that only +a subset of them are needed to setup a frontend. + .. _DTV-UNDEFINED: @@ -67,144 +72,36 @@ DTV_MODULATION ============== Specifies the frontend modulation type for delivery systems that -supports more than one modulation type. The modulation can be one of the -types defined by enum :c:type:`fe_modulation`. - - -.. c:type:: fe_modulation - -Modulation property -------------------- - -Most of the digital TV standards currently offers more than one possible -modulation (sometimes called as "constellation" on some standards). This -enum contains the values used by the Kernel. Please note that not all -modulations are supported by a given standard. - - -.. flat-table:: enum fe_modulation - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _QPSK: - - ``QPSK`` - - - QPSK modulation - - - .. row 3 - - - .. _QAM-16: - - ``QAM_16`` - - - 16-QAM modulation - - - .. row 4 - - - .. _QAM-32: - - ``QAM_32`` - - - 32-QAM modulation - - - .. 
row 5 - - - .. _QAM-64: - - ``QAM_64`` - - - 64-QAM modulation - - - .. row 6 - - - .. _QAM-128: - - ``QAM_128`` - - - 128-QAM modulation - - - .. row 7 - - - .. _QAM-256: - - ``QAM_256`` - - - 256-QAM modulation - - - .. row 8 - - - .. _QAM-AUTO: - - ``QAM_AUTO`` - - - Autodetect QAM modulation - - - .. row 9 - - - .. _VSB-8: - - ``VSB_8`` - - - 8-VSB modulation - - - .. row 10 - - - .. _VSB-16: - - ``VSB_16`` - - - 16-VSB modulation - - - .. row 11 - - - .. _PSK-8: - - ``PSK_8`` - - - 8-PSK modulation - - - .. row 12 - - - .. _APSK-16: - - ``APSK_16`` - - - 16-APSK modulation - - - .. row 13 - - - .. _APSK-32: +supports more multiple modulations. + +The modulation can be one of the types defined by enum :c:type:`fe_modulation`. + +Most of the digital TV standards offers more than one possible +modulation type. + +The table below presents a summary of the types of modulation types +supported by each delivery system, as currently defined by specs. + +======================= ======================================================= +Standard Modulation types +======================= ======================================================= +ATSC (version 1) 8-VSB and 16-VSB. +DMTB 4-QAM, 16-QAM, 32-QAM, 64-QAM and 4-QAM-NR. +DVB-C Annex A/C 16-QAM, 32-QAM, 64-QAM and 256-QAM. +DVB-C Annex B 64-QAM. +DVB-T QPSK, 16-QAM and 64-QAM. +DVB-T2 QPSK, 16-QAM, 64-QAM and 256-QAM. +DVB-S No need to set. It supports only QPSK. +DVB-S2 QPSK, 8-PSK, 16-APSK and 32-APSK. +ISDB-T QPSK, DQPSK, 16-QAM and 64-QAM. +ISDB-S 8-PSK, QPSK and BPSK. +======================= ======================================================= - ``APSK_32`` - - - 32-APSK modulation - - - .. row 14 - - - .. _DQPSK: - - ``DQPSK`` - - - DQPSK modulation - - - .. row 15 - - - .. _QAM-4-NR: - - ``QAM_4_NR`` - - - 4-QAM-NR modulation +.. note:: + Please notice that some of the above modulation types may not be + defined currently at the Kernel. The reason is simple: no driver + needed such definition yet. .. _DTV-BANDWIDTH-HZ: @@ -249,54 +146,7 @@ DTV_INVERSION Specifies if the frontend should do spectral inversion or not. -.. c:type:: fe_spectral_inversion - -enum fe_modulation: Frontend spectral inversion ------------------------------------------------ - -This parameter indicates if spectral inversion should be presumed or -not. In the automatic setting (``INVERSION_AUTO``) the hardware will try -to figure out the correct setting by itself. If the hardware doesn't -support, the DVB core will try to lock at the carrier first with -inversion off. If it fails, it will try to enable inversion. - - -.. flat-table:: enum fe_modulation - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _INVERSION-OFF: - - ``INVERSION_OFF`` - - - Don't do spectral band inversion. - - - .. row 3 - - - .. _INVERSION-ON: - - ``INVERSION_ON`` - - - Do spectral band inversion. - - - .. row 4 - - - .. _INVERSION-AUTO: - - ``INVERSION_AUTO`` - - - Autodetect spectral band inversion. - - +The acceptable values are defined by :c:type:`fe_spectral_inversion`. .. _DTV-DISEQC-MASTER: @@ -320,1525 +170,671 @@ standards. DTV_INNER_FEC ============= -Used cable/satellite transmissions. The acceptable values are: +Used cable/satellite transmissions. -.. c:type:: fe_code_rate +The acceptable values are defined by :c:type:`fe_code_rate`. -enum fe_code_rate: type of the Forward Error Correction. --------------------------------------------------------- -.. 
flat-table:: enum fe_code_rate - :header-rows: 1 - :stub-columns: 0 +.. _DTV-VOLTAGE: +DTV_VOLTAGE +=========== - - .. row 1 +The voltage is usually used with non-DiSEqC capable LNBs to switch the +polarzation (horizontal/vertical). When using DiSEqC epuipment this +voltage has to be switched consistently to the DiSEqC commands as +described in the DiSEqC spec. - - ID +The acceptable values are defined by :c:type:`fe_sec_voltage`. - - Description - - .. row 2 +.. _DTV-TONE: - - .. _FEC-NONE: +DTV_TONE +======== - ``FEC_NONE`` +Currently not used. - - No Forward Error Correction Code - - .. row 3 +.. _DTV-PILOT: - - .. _FEC-AUTO: +DTV_PILOT +========= - ``FEC_AUTO`` +Sets DVB-S2 pilot. - - Autodetect Error Correction Code +The acceptable values are defined by :c:type:`fe_pilot`. - - .. row 4 - - .. _FEC-1-2: +.. _DTV-ROLLOFF: - ``FEC_1_2`` +DTV_ROLLOFF +=========== - - Forward Error Correction Code 1/2 +Sets DVB-S2 rolloff - - .. row 5 +The acceptable values are defined by :c:type:`fe_rolloff`. - - .. _FEC-2-3: - ``FEC_2_3`` +.. _DTV-DISEQC-SLAVE-REPLY: - - Forward Error Correction Code 2/3 +DTV_DISEQC_SLAVE_REPLY +====================== - - .. row 6 +Currently not implemented. - - .. _FEC-3-4: - ``FEC_3_4`` +.. _DTV-FE-CAPABILITY-COUNT: - - Forward Error Correction Code 3/4 +DTV_FE_CAPABILITY_COUNT +======================= - - .. row 7 +Currently not implemented. - - .. _FEC-4-5: - ``FEC_4_5`` +.. _DTV-FE-CAPABILITY: - - Forward Error Correction Code 4/5 +DTV_FE_CAPABILITY +================= - - .. row 8 +Currently not implemented. - - .. _FEC-5-6: - ``FEC_5_6`` +.. _DTV-DELIVERY-SYSTEM: - - Forward Error Correction Code 5/6 +DTV_DELIVERY_SYSTEM +=================== - - .. row 9 +Specifies the type of Delivery system. - - .. _FEC-6-7: +The acceptable values are defined by :c:type:`fe_delivery_system`. - ``FEC_6_7`` - - Forward Error Correction Code 6/7 +.. _DTV-ISDBT-PARTIAL-RECEPTION: - - .. row 10 +DTV_ISDBT_PARTIAL_RECEPTION +=========================== - - .. _FEC-7-8: +If ``DTV_ISDBT_SOUND_BROADCASTING`` is '0' this bit-field represents +whether the channel is in partial reception mode or not. - ``FEC_7_8`` +If '1' ``DTV_ISDBT_LAYERA_*`` values are assigned to the center segment +and ``DTV_ISDBT_LAYERA_SEGMENT_COUNT`` has to be '1'. - - Forward Error Correction Code 7/8 +If in addition ``DTV_ISDBT_SOUND_BROADCASTING`` is '1' +``DTV_ISDBT_PARTIAL_RECEPTION`` represents whether this ISDB-Tsb channel +is consisting of one segment and layer or three segments and two layers. - - .. row 11 +Possible values: 0, 1, -1 (AUTO) - - .. _FEC-8-9: - ``FEC_8_9`` +.. _DTV-ISDBT-SOUND-BROADCASTING: - - Forward Error Correction Code 8/9 +DTV_ISDBT_SOUND_BROADCASTING +============================ - - .. row 12 +This field represents whether the other DTV_ISDBT_*-parameters are +referring to an ISDB-T and an ISDB-Tsb channel. (See also +``DTV_ISDBT_PARTIAL_RECEPTION``). - - .. _FEC-9-10: +Possible values: 0, 1, -1 (AUTO) - ``FEC_9_10`` - - Forward Error Correction Code 9/10 +.. _DTV-ISDBT-SB-SUBCHANNEL-ID: - - .. row 13 +DTV_ISDBT_SB_SUBCHANNEL_ID +========================== - - .. _FEC-2-5: +This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'. - ``FEC_2_5`` +(Note of the author: This might not be the correct description of the +``SUBCHANNEL-ID`` in all details, but it is my understanding of the +technical background needed to program a device) - - Forward Error Correction Code 2/5 +An ISDB-Tsb channel (1 or 3 segments) can be broadcasted alone or in a +set of connected ISDB-Tsb channels. 
In this set of channels every +channel can be received independently. The number of connected ISDB-Tsb +segment can vary, e.g. depending on the frequency spectrum bandwidth +available. - - .. row 14 +Example: Assume 8 ISDB-Tsb connected segments are broadcasted. The +broadcaster has several possibilities to put those channels in the air: +Assuming a normal 13-segment ISDB-T spectrum he can align the 8 segments +from position 1-8 to 5-13 or anything in between. - - .. _FEC-3-5: +The underlying layer of segments are subchannels: each segment is +consisting of several subchannels with a predefined IDs. A sub-channel +is used to help the demodulator to synchronize on the channel. - ``FEC_3_5`` +An ISDB-T channel is always centered over all sub-channels. As for the +example above, in ISDB-Tsb it is no longer as simple as that. - - Forward Error Correction Code 3/5 +``The DTV_ISDBT_SB_SUBCHANNEL_ID`` parameter is used to give the +sub-channel ID of the segment to be demodulated. +Possible values: 0 .. 41, -1 (AUTO) -.. _DTV-VOLTAGE: +.. _DTV-ISDBT-SB-SEGMENT-IDX: -DTV_VOLTAGE -=========== +DTV_ISDBT_SB_SEGMENT_IDX +======================== -The voltage is usually used with non-DiSEqC capable LNBs to switch the -polarzation (horizontal/vertical). When using DiSEqC epuipment this -voltage has to be switched consistently to the DiSEqC commands as -described in the DiSEqC spec. +This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'. +``DTV_ISDBT_SB_SEGMENT_IDX`` gives the index of the segment to be +demodulated for an ISDB-Tsb channel where several of them are +transmitted in the connected manner. -.. c:type:: fe_sec_voltage +Possible values: 0 .. ``DTV_ISDBT_SB_SEGMENT_COUNT`` - 1 -.. flat-table:: enum fe_sec_voltage - :header-rows: 1 - :stub-columns: 0 +Note: This value cannot be determined by an automatic channel search. - - .. row 1 +.. _DTV-ISDBT-SB-SEGMENT-COUNT: - - ID +DTV_ISDBT_SB_SEGMENT_COUNT +========================== - - Description +This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'. - - .. row 2 +``DTV_ISDBT_SB_SEGMENT_COUNT`` gives the total count of connected +ISDB-Tsb channels. - - .. _SEC-VOLTAGE-13: +Possible values: 1 .. 13 - ``SEC_VOLTAGE_13`` +Note: This value cannot be determined by an automatic channel search. - - Set DC voltage level to 13V - - .. row 3 +.. _isdb-hierq-layers: - - .. _SEC-VOLTAGE-18: +DTV-ISDBT-LAYER[A-C] parameters +=============================== - ``SEC_VOLTAGE_18`` +ISDB-T channels can be coded hierarchically. As opposed to DVB-T in +ISDB-T hierarchical layers can be decoded simultaneously. For that +reason a ISDB-T demodulator has 3 Viterbi and 3 Reed-Solomon decoders. - - Set DC voltage level to 18V +ISDB-T has 3 hierarchical layers which each can use a part of the +available segments. The total number of segments over all layers has to +13 in ISDB-T. - - .. row 4 +There are 3 parameter sets, for Layers A, B and C. - - .. _SEC-VOLTAGE-OFF: - ``SEC_VOLTAGE_OFF`` +.. _DTV-ISDBT-LAYER-ENABLED: - - Don't send any voltage to the antenna +DTV_ISDBT_LAYER_ENABLED +----------------------- +Hierarchical reception in ISDB-T is achieved by enabling or disabling +layers in the decoding process. Setting all bits of +``DTV_ISDBT_LAYER_ENABLED`` to '1' forces all layers (if applicable) to +be demodulated. This is the default. +If the channel is in the partial reception mode +(``DTV_ISDBT_PARTIAL_RECEPTION`` = 1) the central segment can be decoded +independently of the other 12 segments. 
In that mode layer A has to have +a ``SEGMENT_COUNT`` of 1. -.. _DTV-TONE: +In ISDB-Tsb only layer A is used, it can be 1 or 3 in ISDB-Tsb according +to ``DTV_ISDBT_PARTIAL_RECEPTION``. ``SEGMENT_COUNT`` must be filled +accordingly. -DTV_TONE -======== +Only the values of the first 3 bits are used. Other bits will be silently ignored: -Currently not used. +``DTV_ISDBT_LAYER_ENABLED`` bit 0: layer A enabled +``DTV_ISDBT_LAYER_ENABLED`` bit 1: layer B enabled -.. _DTV-PILOT: +``DTV_ISDBT_LAYER_ENABLED`` bit 2: layer C enabled -DTV_PILOT -========= +``DTV_ISDBT_LAYER_ENABLED`` bits 3-31: unused -Sets DVB-S2 pilot +.. _DTV-ISDBT-LAYER-FEC: -.. c:type:: fe_pilot +DTV_ISDBT_LAYER[A-C]_FEC +------------------------ -fe_pilot type -------------- +The Forward Error Correction mechanism used by a given ISDB Layer, as +defined by :c:type:`fe_code_rate`. -.. flat-table:: enum fe_pilot - :header-rows: 1 - :stub-columns: 0 +Possible values are: ``FEC_AUTO``, ``FEC_1_2``, ``FEC_2_3``, ``FEC_3_4``, +``FEC_5_6``, ``FEC_7_8`` - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _PILOT-ON: - - ``PILOT_ON`` - - - Pilot tones enabled - - - .. row 3 - - - .. _PILOT-OFF: - - ``PILOT_OFF`` - - - Pilot tones disabled - - - .. row 4 - - - .. _PILOT-AUTO: - - ``PILOT_AUTO`` - - - Autodetect pilot tones - - - -.. _DTV-ROLLOFF: - -DTV_ROLLOFF -=========== - -Sets DVB-S2 rolloff - - -.. c:type:: fe_rolloff - -fe_rolloff type ---------------- - - -.. flat-table:: enum fe_rolloff - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _ROLLOFF-35: - - ``ROLLOFF_35`` - - - Roloff factor: α=35% - - - .. row 3 - - - .. _ROLLOFF-20: - - ``ROLLOFF_20`` - - - Roloff factor: α=20% - - - .. row 4 - - - .. _ROLLOFF-25: - - ``ROLLOFF_25`` - - - Roloff factor: α=25% - - - .. row 5 - - - .. _ROLLOFF-AUTO: - - ``ROLLOFF_AUTO`` - - - Auto-detect the roloff factor. - - - -.. _DTV-DISEQC-SLAVE-REPLY: - -DTV_DISEQC_SLAVE_REPLY -====================== - -Currently not implemented. - - -.. _DTV-FE-CAPABILITY-COUNT: - -DTV_FE_CAPABILITY_COUNT -======================= - -Currently not implemented. - - -.. _DTV-FE-CAPABILITY: - -DTV_FE_CAPABILITY -================= - -Currently not implemented. - - -.. _DTV-DELIVERY-SYSTEM: - -DTV_DELIVERY_SYSTEM -=================== - -Specifies the type of Delivery system - - -.. c:type:: fe_delivery_system - -fe_delivery_system type ------------------------ - -Possible values: - - -.. flat-table:: enum fe_delivery_system - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _SYS-UNDEFINED: - - ``SYS_UNDEFINED`` - - - Undefined standard. Generally, indicates an error - - - .. row 3 - - - .. _SYS-DVBC-ANNEX-A: - - ``SYS_DVBC_ANNEX_A`` - - - Cable TV: DVB-C following ITU-T J.83 Annex A spec - - - .. row 4 - - - .. _SYS-DVBC-ANNEX-B: - - ``SYS_DVBC_ANNEX_B`` - - - Cable TV: DVB-C following ITU-T J.83 Annex B spec (ClearQAM) - - - .. row 5 - - - .. _SYS-DVBC-ANNEX-C: - - ``SYS_DVBC_ANNEX_C`` - - - Cable TV: DVB-C following ITU-T J.83 Annex C spec - - - .. row 6 - - - .. _SYS-ISDBC: - - ``SYS_ISDBC`` - - - Cable TV: ISDB-C (no drivers yet) - - - .. row 7 - - - .. _SYS-DVBT: - - ``SYS_DVBT`` - - - Terrestral TV: DVB-T - - - .. row 8 - - - .. _SYS-DVBT2: - - ``SYS_DVBT2`` - - - Terrestral TV: DVB-T2 - - - .. row 9 - - - .. _SYS-ISDBT: - - ``SYS_ISDBT`` - - - Terrestral TV: ISDB-T - - - .. row 10 - - - .. _SYS-ATSC: - - ``SYS_ATSC`` - - - Terrestral TV: ATSC - - - .. row 11 - - - .. 
_SYS-ATSCMH: - - ``SYS_ATSCMH`` - - - Terrestral TV (mobile): ATSC-M/H - - - .. row 12 - - - .. _SYS-DTMB: - - ``SYS_DTMB`` - - - Terrestrial TV: DTMB - - - .. row 13 - - - .. _SYS-DVBS: - - ``SYS_DVBS`` - - - Satellite TV: DVB-S - - - .. row 14 - - - .. _SYS-DVBS2: - - ``SYS_DVBS2`` - - - Satellite TV: DVB-S2 - - - .. row 15 - - - .. _SYS-TURBO: - - ``SYS_TURBO`` - - - Satellite TV: DVB-S Turbo - - - .. row 16 - - - .. _SYS-ISDBS: - - ``SYS_ISDBS`` - - - Satellite TV: ISDB-S - - - .. row 17 - - - .. _SYS-DAB: - - ``SYS_DAB`` - - - Digital audio: DAB (not fully supported) - - - .. row 18 - - - .. _SYS-DSS: - - ``SYS_DSS`` - - - Satellite TV:"DSS (not fully supported) - - - .. row 19 - - - .. _SYS-CMMB: - - ``SYS_CMMB`` - - - Terrestral TV (mobile):CMMB (not fully supported) - - - .. row 20 - - - .. _SYS-DVBH: - - ``SYS_DVBH`` - - - Terrestral TV (mobile): DVB-H (standard deprecated) - - - -.. _DTV-ISDBT-PARTIAL-RECEPTION: - -DTV_ISDBT_PARTIAL_RECEPTION -=========================== - -If ``DTV_ISDBT_SOUND_BROADCASTING`` is '0' this bit-field represents -whether the channel is in partial reception mode or not. - -If '1' ``DTV_ISDBT_LAYERA_*`` values are assigned to the center segment -and ``DTV_ISDBT_LAYERA_SEGMENT_COUNT`` has to be '1'. - -If in addition ``DTV_ISDBT_SOUND_BROADCASTING`` is '1' -``DTV_ISDBT_PARTIAL_RECEPTION`` represents whether this ISDB-Tsb channel -is consisting of one segment and layer or three segments and two layers. - -Possible values: 0, 1, -1 (AUTO) - - -.. _DTV-ISDBT-SOUND-BROADCASTING: - -DTV_ISDBT_SOUND_BROADCASTING -============================ - -This field represents whether the other DTV_ISDBT_*-parameters are -referring to an ISDB-T and an ISDB-Tsb channel. (See also -``DTV_ISDBT_PARTIAL_RECEPTION``). - -Possible values: 0, 1, -1 (AUTO) - - -.. _DTV-ISDBT-SB-SUBCHANNEL-ID: - -DTV_ISDBT_SB_SUBCHANNEL_ID -========================== - -This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'. - -(Note of the author: This might not be the correct description of the -``SUBCHANNEL-ID`` in all details, but it is my understanding of the -technical background needed to program a device) - -An ISDB-Tsb channel (1 or 3 segments) can be broadcasted alone or in a -set of connected ISDB-Tsb channels. In this set of channels every -channel can be received independently. The number of connected ISDB-Tsb -segment can vary, e.g. depending on the frequency spectrum bandwidth -available. - -Example: Assume 8 ISDB-Tsb connected segments are broadcasted. The -broadcaster has several possibilities to put those channels in the air: -Assuming a normal 13-segment ISDB-T spectrum he can align the 8 segments -from position 1-8 to 5-13 or anything in between. - -The underlying layer of segments are subchannels: each segment is -consisting of several subchannels with a predefined IDs. A sub-channel -is used to help the demodulator to synchronize on the channel. - -An ISDB-T channel is always centered over all sub-channels. As for the -example above, in ISDB-Tsb it is no longer as simple as that. - -``The DTV_ISDBT_SB_SUBCHANNEL_ID`` parameter is used to give the -sub-channel ID of the segment to be demodulated. - -Possible values: 0 .. 41, -1 (AUTO) - - -.. _DTV-ISDBT-SB-SEGMENT-IDX: - -DTV_ISDBT_SB_SEGMENT_IDX -======================== - -This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'. - -``DTV_ISDBT_SB_SEGMENT_IDX`` gives the index of the segment to be -demodulated for an ISDB-Tsb channel where several of them are -transmitted in the connected manner. 
- -Possible values: 0 .. ``DTV_ISDBT_SB_SEGMENT_COUNT`` - 1 - -Note: This value cannot be determined by an automatic channel search. - - -.. _DTV-ISDBT-SB-SEGMENT-COUNT: - -DTV_ISDBT_SB_SEGMENT_COUNT -========================== - -This field only applies if ``DTV_ISDBT_SOUND_BROADCASTING`` is '1'. - -``DTV_ISDBT_SB_SEGMENT_COUNT`` gives the total count of connected -ISDB-Tsb channels. - -Possible values: 1 .. 13 - -Note: This value cannot be determined by an automatic channel search. - - -.. _isdb-hierq-layers: - -DTV-ISDBT-LAYER[A-C] parameters -=============================== - -ISDB-T channels can be coded hierarchically. As opposed to DVB-T in -ISDB-T hierarchical layers can be decoded simultaneously. For that -reason a ISDB-T demodulator has 3 Viterbi and 3 Reed-Solomon decoders. - -ISDB-T has 3 hierarchical layers which each can use a part of the -available segments. The total number of segments over all layers has to -13 in ISDB-T. - -There are 3 parameter sets, for Layers A, B and C. - - -.. _DTV-ISDBT-LAYER-ENABLED: - -DTV_ISDBT_LAYER_ENABLED ------------------------ - -Hierarchical reception in ISDB-T is achieved by enabling or disabling -layers in the decoding process. Setting all bits of -``DTV_ISDBT_LAYER_ENABLED`` to '1' forces all layers (if applicable) to -be demodulated. This is the default. - -If the channel is in the partial reception mode -(``DTV_ISDBT_PARTIAL_RECEPTION`` = 1) the central segment can be decoded -independently of the other 12 segments. In that mode layer A has to have -a ``SEGMENT_COUNT`` of 1. - -In ISDB-Tsb only layer A is used, it can be 1 or 3 in ISDB-Tsb according -to ``DTV_ISDBT_PARTIAL_RECEPTION``. ``SEGMENT_COUNT`` must be filled -accordingly. - -Only the values of the first 3 bits are used. Other bits will be silently ignored: - -``DTV_ISDBT_LAYER_ENABLED`` bit 0: layer A enabled - -``DTV_ISDBT_LAYER_ENABLED`` bit 1: layer B enabled - -``DTV_ISDBT_LAYER_ENABLED`` bit 2: layer C enabled - -``DTV_ISDBT_LAYER_ENABLED`` bits 3-31: unused - - -.. _DTV-ISDBT-LAYER-FEC: - -DTV_ISDBT_LAYER[A-C]_FEC ------------------------- - -Possible values: ``FEC_AUTO``, ``FEC_1_2``, ``FEC_2_3``, ``FEC_3_4``, -``FEC_5_6``, ``FEC_7_8`` - - -.. _DTV-ISDBT-LAYER-MODULATION: - -DTV_ISDBT_LAYER[A-C]_MODULATION -------------------------------- - -Possible values: ``QAM_AUTO``, QP\ ``SK, QAM_16``, ``QAM_64``, ``DQPSK`` - -Note: If layer C is ``DQPSK`` layer B has to be ``DQPSK``. If layer B is -``DQPSK`` and ``DTV_ISDBT_PARTIAL_RECEPTION``\ =0 layer has to be -``DQPSK``. - - -.. _DTV-ISDBT-LAYER-SEGMENT-COUNT: - -DTV_ISDBT_LAYER[A-C]_SEGMENT_COUNT ----------------------------------- - -Possible values: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 (AUTO) - -Note: Truth table for ``DTV_ISDBT_SOUND_BROADCASTING`` and -``DTV_ISDBT_PARTIAL_RECEPTION`` and ``LAYER[A-C]_SEGMENT_COUNT`` - -.. _isdbt-layer_seg-cnt-table: - -.. flat-table:: Truth table for ISDB-T Sound Broadcasting - :header-rows: 0 - :stub-columns: 0 - - - - .. row 1 - - - PR - - - SB - - - Layer A width - - - Layer B width - - - Layer C width - - - total width - - - .. row 2 - - - 0 - - - 0 - - - 1 .. 13 - - - 1 .. 13 - - - 1 .. 13 - - - 13 - - - .. row 3 - - - 1 - - - 0 - - - 1 - - - 1 .. 13 - - - 1 .. 13 - - - 13 - - - .. row 4 - - - 0 - - - 1 - - - 1 - - - 0 - - - 0 - - - 1 - - - .. row 5 - - - 1 - - - 1 - - - 1 - - - 2 - - - 0 - - - 13 - - - -.. 
_DTV-ISDBT-LAYER-TIME-INTERLEAVING: - -DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING --------------------------------------- - -Valid values: 0, 1, 2, 4, -1 (AUTO) - -when DTV_ISDBT_SOUND_BROADCASTING is active, value 8 is also valid. - -Note: The real time interleaving length depends on the mode (fft-size). -The values here are referring to what can be found in the -TMCC-structure, as shown in the table below. - - -.. c:type:: isdbt_layer_interleaving_table - -.. flat-table:: ISDB-T time interleaving modes - :header-rows: 0 - :stub-columns: 0 - - - - .. row 1 - - - ``DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING`` - - - Mode 1 (2K FFT) - - - Mode 2 (4K FFT) - - - Mode 3 (8K FFT) - - - .. row 2 - - - 0 - - - 0 - - - 0 - - - 0 - - - .. row 3 - - - 1 - - - 4 - - - 2 - - - 1 - - - .. row 4 - - - 2 - - - 8 - - - 4 - - - 2 - - - .. row 5 - - - 4 - - - 16 - - - 8 - - - 4 - - - -.. _DTV-ATSCMH-FIC-VER: - -DTV_ATSCMH_FIC_VER ------------------- - -Version number of the FIC (Fast Information Channel) signaling data. - -FIC is used for relaying information to allow rapid service acquisition -by the receiver. - -Possible values: 0, 1, 2, 3, ..., 30, 31 - - -.. _DTV-ATSCMH-PARADE-ID: - -DTV_ATSCMH_PARADE_ID --------------------- - -Parade identification number - -A parade is a collection of up to eight MH groups, conveying one or two -ensembles. - -Possible values: 0, 1, 2, 3, ..., 126, 127 - - -.. _DTV-ATSCMH-NOG: - -DTV_ATSCMH_NOG --------------- - -Number of MH groups per MH subframe for a designated parade. - -Possible values: 1, 2, 3, 4, 5, 6, 7, 8 - - -.. _DTV-ATSCMH-TNOG: - -DTV_ATSCMH_TNOG ---------------- - -Total number of MH groups including all MH groups belonging to all MH -parades in one MH subframe. - -Possible values: 0, 1, 2, 3, ..., 30, 31 - - -.. _DTV-ATSCMH-SGN: - -DTV_ATSCMH_SGN --------------- - -Start group number. - -Possible values: 0, 1, 2, 3, ..., 14, 15 - - -.. _DTV-ATSCMH-PRC: - -DTV_ATSCMH_PRC --------------- - -Parade repetition cycle. - -Possible values: 1, 2, 3, 4, 5, 6, 7, 8 - - -.. _DTV-ATSCMH-RS-FRAME-MODE: - -DTV_ATSCMH_RS_FRAME_MODE ------------------------- - -Reed Solomon (RS) frame mode. - -Possible values are: - -.. tabularcolumns:: |p{5.0cm}|p{12.5cm}| - -.. c:type:: atscmh_rs_frame_mode - -.. flat-table:: enum atscmh_rs_frame_mode - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _ATSCMH-RSFRAME-PRI-ONLY: - - ``ATSCMH_RSFRAME_PRI_ONLY`` - - - Single Frame: There is only a primary RS Frame for all Group - Regions. - - - .. row 3 - - - .. _ATSCMH-RSFRAME-PRI-SEC: - - ``ATSCMH_RSFRAME_PRI_SEC`` - - - Dual Frame: There are two separate RS Frames: Primary RS Frame for - Group Region A and B and Secondary RS Frame for Group Region C and - D. - - - -.. _DTV-ATSCMH-RS-FRAME-ENSEMBLE: - -DTV_ATSCMH_RS_FRAME_ENSEMBLE ----------------------------- - -Reed Solomon(RS) frame ensemble. - -Possible values are: - - -.. c:type:: atscmh_rs_frame_ensemble - -.. flat-table:: enum atscmh_rs_frame_ensemble - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _ATSCMH-RSFRAME-ENS-PRI: - - ``ATSCMH_RSFRAME_ENS_PRI`` - - - Primary Ensemble. - - - .. row 3 - - - .. _ATSCMH-RSFRAME-ENS-SEC: - - ``AATSCMH_RSFRAME_PRI_SEC`` - - - Secondary Ensemble. - - - .. row 4 - - - .. _ATSCMH-RSFRAME-RES: - - ``AATSCMH_RSFRAME_RES`` - - - Reserved. Shouldn't be used. - - - -.. _DTV-ATSCMH-RS-CODE-MODE-PRI: - -DTV_ATSCMH_RS_CODE_MODE_PRI ---------------------------- - -Reed Solomon (RS) code mode (primary). 
- -Possible values are: - - -.. c:type:: atscmh_rs_code_mode - -.. flat-table:: enum atscmh_rs_code_mode - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _ATSCMH-RSCODE-211-187: - - ``ATSCMH_RSCODE_211_187`` - - - Reed Solomon code (211,187). - - - .. row 3 - - - .. _ATSCMH-RSCODE-223-187: - - ``ATSCMH_RSCODE_223_187`` - - - Reed Solomon code (223,187). - - - .. row 4 - - - .. _ATSCMH-RSCODE-235-187: - - ``ATSCMH_RSCODE_235_187`` - - - Reed Solomon code (235,187). - - - .. row 5 - - - .. _ATSCMH-RSCODE-RES: - - ``ATSCMH_RSCODE_RES`` - - - Reserved. Shouldn't be used. - - - -.. _DTV-ATSCMH-RS-CODE-MODE-SEC: - -DTV_ATSCMH_RS_CODE_MODE_SEC ---------------------------- - -Reed Solomon (RS) code mode (secondary). - -Possible values are the same as documented on enum -:c:type:`atscmh_rs_code_mode`: - - -.. _DTV-ATSCMH-SCCC-BLOCK-MODE: - -DTV_ATSCMH_SCCC_BLOCK_MODE --------------------------- - -Series Concatenated Convolutional Code Block Mode. - -Possible values are: - -.. tabularcolumns:: |p{4.5cm}|p{13.0cm}| - -.. c:type:: atscmh_sccc_block_mode - -.. flat-table:: enum atscmh_scc_block_mode - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _ATSCMH-SCCC-BLK-SEP: - - ``ATSCMH_SCCC_BLK_SEP`` - - - Separate SCCC: the SCCC outer code mode shall be set independently - for each Group Region (A, B, C, D) - - - .. row 3 - - - .. _ATSCMH-SCCC-BLK-COMB: - - ``ATSCMH_SCCC_BLK_COMB`` - - - Combined SCCC: all four Regions shall have the same SCCC outer - code mode. +.. _DTV-ISDBT-LAYER-MODULATION: - - .. row 4 +DTV_ISDBT_LAYER[A-C]_MODULATION +------------------------------- - - .. _ATSCMH-SCCC-BLK-RES: +The modulation used by a given ISDB Layer, as defined by +:c:type:`fe_modulation`. - ``ATSCMH_SCCC_BLK_RES`` +Possible values are: ``QAM_AUTO``, ``QPSK``, ``QAM_16``, ``QAM_64``, ``DQPSK`` - - Reserved. Shouldn't be used. +.. note:: + #. If layer C is ``DQPSK``, then layer B has to be ``DQPSK``. + #. If layer B is ``DQPSK`` and ``DTV_ISDBT_PARTIAL_RECEPTION``\ = 0, + then layer has to be ``DQPSK``. -.. _DTV-ATSCMH-SCCC-CODE-MODE-A: -DTV_ATSCMH_SCCC_CODE_MODE_A ---------------------------- +.. _DTV-ISDBT-LAYER-SEGMENT-COUNT: -Series Concatenated Convolutional Code Rate. +DTV_ISDBT_LAYER[A-C]_SEGMENT_COUNT +---------------------------------- -Possible values are: +Possible values: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 (AUTO) +Note: Truth table for ``DTV_ISDBT_SOUND_BROADCASTING`` and +``DTV_ISDBT_PARTIAL_RECEPTION`` and ``LAYER[A-C]_SEGMENT_COUNT`` -.. c:type:: atscmh_sccc_code_mode +.. _isdbt-layer_seg-cnt-table: -.. flat-table:: enum atscmh_sccc_code_mode +.. flat-table:: Truth table for ISDB-T Sound Broadcasting :header-rows: 1 :stub-columns: 0 - .. row 1 - - ID - - - Description - - - .. row 2 - - - .. _ATSCMH-SCCC-CODE-HLF: - - ``ATSCMH_SCCC_CODE_HLF`` - - - The outer code rate of a SCCC Block is 1/2 rate. - - - .. row 3 - - - .. _ATSCMH-SCCC-CODE-QTR: + - Partial Reception - ``ATSCMH_SCCC_CODE_QTR`` + - Sound Broadcasting - - The outer code rate of a SCCC Block is 1/4 rate. + - Layer A width - - .. row 4 + - Layer B width - - .. _ATSCMH-SCCC-CODE-RES: + - Layer C width - ``ATSCMH_SCCC_CODE_RES`` + - total width - - to be documented. + - .. row 2 + - 0 + - 0 -.. _DTV-ATSCMH-SCCC-CODE-MODE-B: + - 1 .. 13 -DTV_ATSCMH_SCCC_CODE_MODE_B ---------------------------- + - 1 .. 13 -Series Concatenated Convolutional Code Rate. + - 1 .. 
13 -Possible values are the same as documented on enum -:c:type:`atscmh_sccc_code_mode`. + - 13 + - .. row 3 -.. _DTV-ATSCMH-SCCC-CODE-MODE-C: + - 1 -DTV_ATSCMH_SCCC_CODE_MODE_C ---------------------------- + - 0 -Series Concatenated Convolutional Code Rate. + - 1 -Possible values are the same as documented on enum -:c:type:`atscmh_sccc_code_mode`. + - 1 .. 13 + - 1 .. 13 -.. _DTV-ATSCMH-SCCC-CODE-MODE-D: + - 13 -DTV_ATSCMH_SCCC_CODE_MODE_D ---------------------------- + - .. row 4 -Series Concatenated Convolutional Code Rate. + - 0 -Possible values are the same as documented on enum -:c:type:`atscmh_sccc_code_mode`. + - 1 + - 1 -.. _DTV-API-VERSION: + - 0 -DTV_API_VERSION -=============== + - 0 -Returns the major/minor version of the DVB API + - 1 + - .. row 5 -.. _DTV-CODE-RATE-HP: + - 1 -DTV_CODE_RATE_HP -================ + - 1 -Used on terrestrial transmissions. The acceptable values are the ones -described at :c:type:`fe_transmit_mode`. + - 1 + - 2 -.. _DTV-CODE-RATE-LP: + - 0 -DTV_CODE_RATE_LP -================ + - 13 -Used on terrestrial transmissions. The acceptable values are the ones -described at :c:type:`fe_transmit_mode`. -.. _DTV-GUARD-INTERVAL: +.. _DTV-ISDBT-LAYER-TIME-INTERLEAVING: -DTV_GUARD_INTERVAL -================== +DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING +-------------------------------------- -Possible values are: +Valid values: 0, 1, 2, 4, -1 (AUTO) +when DTV_ISDBT_SOUND_BROADCASTING is active, value 8 is also valid. -.. c:type:: fe_guard_interval +Note: The real time interleaving length depends on the mode (fft-size). +The values here are referring to what can be found in the +TMCC-structure, as shown in the table below. -Modulation guard interval -------------------------- +.. c:type:: isdbt_layer_interleaving_table -.. flat-table:: enum fe_guard_interval +.. flat-table:: ISDB-T time interleaving modes :header-rows: 1 :stub-columns: 0 - .. row 1 - - ID - - - Description - - - .. row 2 - - - .. _GUARD-INTERVAL-AUTO: - - ``GUARD_INTERVAL_AUTO`` + - ``DTV_ISDBT_LAYER[A-C]_TIME_INTERLEAVING`` - - Autodetect the guard interval + - Mode 1 (2K FFT) - - .. row 3 + - Mode 2 (4K FFT) - - .. _GUARD-INTERVAL-1-128: + - Mode 3 (8K FFT) - ``GUARD_INTERVAL_1_128`` + - .. row 2 - - Guard interval 1/128 + - 0 - - .. row 4 + - 0 - - .. _GUARD-INTERVAL-1-32: + - 0 - ``GUARD_INTERVAL_1_32`` + - 0 - - Guard interval 1/32 + - .. row 3 - - .. row 5 + - 1 - - .. _GUARD-INTERVAL-1-16: + - 4 - ``GUARD_INTERVAL_1_16`` + - 2 - - Guard interval 1/16 + - 1 - - .. row 6 + - .. row 4 - - .. _GUARD-INTERVAL-1-8: + - 2 - ``GUARD_INTERVAL_1_8`` + - 8 - - Guard interval 1/8 + - 4 - - .. row 7 + - 2 - - .. _GUARD-INTERVAL-1-4: + - .. row 5 - ``GUARD_INTERVAL_1_4`` + - 4 - - Guard interval 1/4 + - 16 - - .. row 8 + - 8 - - .. _GUARD-INTERVAL-19-128: + - 4 - ``GUARD_INTERVAL_19_128`` - - Guard interval 19/128 - - .. row 9 +.. _DTV-ATSCMH-FIC-VER: - - .. _GUARD-INTERVAL-19-256: +DTV_ATSCMH_FIC_VER +------------------ - ``GUARD_INTERVAL_19_256`` +Version number of the FIC (Fast Information Channel) signaling data. - - Guard interval 19/256 +FIC is used for relaying information to allow rapid service acquisition +by the receiver. - - .. row 10 +Possible values: 0, 1, 2, 3, ..., 30, 31 - - .. _GUARD-INTERVAL-PN420: - ``GUARD_INTERVAL_PN420`` +.. _DTV-ATSCMH-PARADE-ID: - - PN length 420 (1/4) +DTV_ATSCMH_PARADE_ID +-------------------- - - .. row 11 +Parade identification number - - .. _GUARD-INTERVAL-PN595: +A parade is a collection of up to eight MH groups, conveying one or two +ensembles. 
- ``GUARD_INTERVAL_PN595`` +Possible values: 0, 1, 2, 3, ..., 126, 127 - - PN length 595 (1/6) - - .. row 12 +.. _DTV-ATSCMH-NOG: - - .. _GUARD-INTERVAL-PN945: +DTV_ATSCMH_NOG +-------------- - ``GUARD_INTERVAL_PN945`` +Number of MH groups per MH subframe for a designated parade. - - PN length 945 (1/9) +Possible values: 1, 2, 3, 4, 5, 6, 7, 8 -Notes: +.. _DTV-ATSCMH-TNOG: -1) If ``DTV_GUARD_INTERVAL`` is set the ``GUARD_INTERVAL_AUTO`` the -hardware will try to find the correct guard interval (if capable) and -will use TMCC to fill in the missing parameters. +DTV_ATSCMH_TNOG +--------------- -2) Intervals 1/128, 19/128 and 19/256 are used only for DVB-T2 at -present +Total number of MH groups including all MH groups belonging to all MH +parades in one MH subframe. -3) DTMB specifies PN420, PN595 and PN945. +Possible values: 0, 1, 2, 3, ..., 30, 31 -.. _DTV-TRANSMISSION-MODE: +.. _DTV-ATSCMH-SGN: -DTV_TRANSMISSION_MODE -===================== +DTV_ATSCMH_SGN +-------------- -Specifies the number of carriers used by the standard. This is used only -on OFTM-based standards, e. g. DVB-T/T2, ISDB-T, DTMB +Start group number. +Possible values: 0, 1, 2, 3, ..., 14, 15 -.. c:type:: fe_transmit_mode -enum fe_transmit_mode: Number of carriers per channel ------------------------------------------------------ +.. _DTV-ATSCMH-PRC: -.. tabularcolumns:: |p{5.0cm}|p{12.5cm}| +DTV_ATSCMH_PRC +-------------- -.. flat-table:: enum fe_transmit_mode - :header-rows: 1 - :stub-columns: 0 +Parade repetition cycle. +Possible values: 1, 2, 3, 4, 5, 6, 7, 8 - - .. row 1 - - ID +.. _DTV-ATSCMH-RS-FRAME-MODE: - - Description +DTV_ATSCMH_RS_FRAME_MODE +------------------------ - - .. row 2 +Reed Solomon (RS) frame mode. - - .. _TRANSMISSION-MODE-AUTO: +The acceptable values are defined by :c:type:`atscmh_rs_frame_mode`. - ``TRANSMISSION_MODE_AUTO`` - - Autodetect transmission mode. The hardware will try to find the - correct FFT-size (if capable) to fill in the missing parameters. +.. _DTV-ATSCMH-RS-FRAME-ENSEMBLE: - - .. row 3 +DTV_ATSCMH_RS_FRAME_ENSEMBLE +---------------------------- - - .. _TRANSMISSION-MODE-1K: +Reed Solomon(RS) frame ensemble. - ``TRANSMISSION_MODE_1K`` +The acceptable values are defined by :c:type:`atscmh_rs_frame_ensemble`. - - Transmission mode 1K - - .. row 4 +.. _DTV-ATSCMH-RS-CODE-MODE-PRI: - - .. _TRANSMISSION-MODE-2K: +DTV_ATSCMH_RS_CODE_MODE_PRI +--------------------------- - ``TRANSMISSION_MODE_2K`` +Reed Solomon (RS) code mode (primary). - - Transmission mode 2K +The acceptable values are defined by :c:type:`atscmh_rs_code_mode`. - - .. row 5 - - .. _TRANSMISSION-MODE-8K: +.. _DTV-ATSCMH-RS-CODE-MODE-SEC: - ``TRANSMISSION_MODE_8K`` +DTV_ATSCMH_RS_CODE_MODE_SEC +--------------------------- - - Transmission mode 8K +Reed Solomon (RS) code mode (secondary). - - .. row 6 +The acceptable values are defined by :c:type:`atscmh_rs_code_mode`. - - .. _TRANSMISSION-MODE-4K: - ``TRANSMISSION_MODE_4K`` +.. _DTV-ATSCMH-SCCC-BLOCK-MODE: - - Transmission mode 4K +DTV_ATSCMH_SCCC_BLOCK_MODE +-------------------------- - - .. row 7 +Series Concatenated Convolutional Code Block Mode. - - .. _TRANSMISSION-MODE-16K: +The acceptable values are defined by :c:type:`atscmh_sccc_block_mode`. - ``TRANSMISSION_MODE_16K`` - - Transmission mode 16K +.. _DTV-ATSCMH-SCCC-CODE-MODE-A: - - .. row 8 +DTV_ATSCMH_SCCC_CODE_MODE_A +--------------------------- - - .. _TRANSMISSION-MODE-32K: +Series Concatenated Convolutional Code Rate. 
- ``TRANSMISSION_MODE_32K`` +The acceptable values are defined by :c:type:`atscmh_sccc_code_mode`. - - Transmission mode 32K +.. _DTV-ATSCMH-SCCC-CODE-MODE-B: - - .. row 9 +DTV_ATSCMH_SCCC_CODE_MODE_B +--------------------------- - - .. _TRANSMISSION-MODE-C1: +Series Concatenated Convolutional Code Rate. - ``TRANSMISSION_MODE_C1`` +Possible values are the same as documented on enum +:c:type:`atscmh_sccc_code_mode`. - - Single Carrier (C=1) transmission mode (DTMB) - - .. row 10 +.. _DTV-ATSCMH-SCCC-CODE-MODE-C: - - .. _TRANSMISSION-MODE-C3780: +DTV_ATSCMH_SCCC_CODE_MODE_C +--------------------------- - ``TRANSMISSION_MODE_C3780`` +Series Concatenated Convolutional Code Rate. - - Multi Carrier (C=3780) transmission mode (DTMB) +Possible values are the same as documented on enum +:c:type:`atscmh_sccc_code_mode`. -Notes: +.. _DTV-ATSCMH-SCCC-CODE-MODE-D: -1) ISDB-T supports three carrier/symbol-size: 8K, 4K, 2K. It is called -'mode' in the standard: Mode 1 is 2K, mode 2 is 4K, mode 3 is 8K +DTV_ATSCMH_SCCC_CODE_MODE_D +--------------------------- -2) If ``DTV_TRANSMISSION_MODE`` is set the ``TRANSMISSION_MODE_AUTO`` -the hardware will try to find the correct FFT-size (if capable) and will -use TMCC to fill in the missing parameters. +Series Concatenated Convolutional Code Rate. -3) DVB-T specifies 2K and 8K as valid sizes. +Possible values are the same as documented on enum +:c:type:`atscmh_sccc_code_mode`. -4) DVB-T2 specifies 1K, 2K, 4K, 8K, 16K and 32K. -5) DTMB specifies C1 and C3780. +.. _DTV-API-VERSION: +DTV_API_VERSION +=============== -.. _DTV-HIERARCHY: +Returns the major/minor version of the DVB API -DTV_HIERARCHY -============= -Frontend hierarchy +.. _DTV-CODE-RATE-HP: +DTV_CODE_RATE_HP +================ -.. c:type:: fe_hierarchy +Used on terrestrial transmissions. -Frontend hierarchy ------------------- +The acceptable values are defined by :c:type:`fe_transmit_mode`. -.. flat-table:: enum fe_hierarchy - :header-rows: 1 - :stub-columns: 0 +.. _DTV-CODE-RATE-LP: +DTV_CODE_RATE_LP +================ - - .. row 1 +Used on terrestrial transmissions. - - ID +The acceptable values are defined by :c:type:`fe_transmit_mode`. - - Description - - .. row 2 +.. _DTV-GUARD-INTERVAL: - - .. _HIERARCHY-NONE: +DTV_GUARD_INTERVAL +================== - ``HIERARCHY_NONE`` +The acceptable values are defined by :c:type:`fe_guard_interval`. - - No hierarchy +.. note:: - - .. row 3 + #. If ``DTV_GUARD_INTERVAL`` is set the ``GUARD_INTERVAL_AUTO`` the + hardware will try to find the correct guard interval (if capable) and + will use TMCC to fill in the missing parameters. + #. Intervals ``GUARD_INTERVAL_1_128``, ``GUARD_INTERVAL_19_128`` + and ``GUARD_INTERVAL_19_256`` are used only for DVB-T2 at + present. + #. Intervals ``GUARD_INTERVAL_PN420``, ``GUARD_INTERVAL_PN595`` and + ``GUARD_INTERVAL_PN945`` are used only for DMTB at the present. + On such standard, only those intervals and ``GUARD_INTERVAL_AUTO`` + are valid. - - .. _HIERARCHY-AUTO: +.. _DTV-TRANSMISSION-MODE: - ``HIERARCHY_AUTO`` +DTV_TRANSMISSION_MODE +===================== - - Autodetect hierarchy (if supported) +Specifies the FFT size (with corresponds to the approximate number of +carriers) used by the standard. This is used only on OFTM-based standards, +e. g. DVB-T/T2, ISDB-T, DTMB. - - .. row 4 +The acceptable values are defined by :c:type:`fe_transmit_mode`. - - .. _HIERARCHY-1: +.. note:: - ``HIERARCHY_1`` + #. ISDB-T supports three carrier/symbol-size: 8K, 4K, 2K. 
It is called + **mode** on such standard, and are numbered from 1 to 3: - - Hierarchy 1 + ==== ======== ======================== + Mode FFT size Transmission mode + ==== ======== ======================== + 1 2K ``TRANSMISSION_MODE_2K`` + 2 4K ``TRANSMISSION_MODE_4K`` + 3 8K ``TRANSMISSION_MODE_8K`` + ==== ======== ======================== - - .. row 5 + #. If ``DTV_TRANSMISSION_MODE`` is set the ``TRANSMISSION_MODE_AUTO`` + the hardware will try to find the correct FFT-size (if capable) and + will use TMCC to fill in the missing parameters. - - .. _HIERARCHY-2: + #. DVB-T specifies 2K and 8K as valid sizes. - ``HIERARCHY_2`` + #. DVB-T2 specifies 1K, 2K, 4K, 8K, 16K and 32K. - - Hierarchy 2 + #. DTMB specifies C1 and C3780. - - .. row 6 - - .. _HIERARCHY-4: +.. _DTV-HIERARCHY: - ``HIERARCHY_4`` +DTV_HIERARCHY +============= - - Hierarchy 4 +Frontend hierarchy. +The acceptable values are defined by :c:type:`fe_hierarchy`. .. _DTV-STREAM-ID: @@ -1884,60 +880,17 @@ with it, rather than trying to use FE_GET_INFO. In the case of a legacy frontend, the result is just the same as with FE_GET_INFO, but in a more structured format +The acceptable values are defined by :c:type:`fe_delivery_system`. + .. _DTV-INTERLEAVING: DTV_INTERLEAVING ================ -Time interleaving to be used. Currently, used only on DTMB. - - -.. c:type:: fe_interleaving - -.. flat-table:: enum fe_interleaving - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _INTERLEAVING-NONE: - - ``INTERLEAVING_NONE`` - - - No interleaving. - - - .. row 3 - - - .. _INTERLEAVING-AUTO: - - ``INTERLEAVING_AUTO`` - - - Auto-detect interleaving. - - - .. row 4 - - - .. _INTERLEAVING-240: - - ``INTERLEAVING_240`` - - - Interleaving of 240 symbols. - - - .. row 5 - - - .. _INTERLEAVING-720: - - ``INTERLEAVING_720`` - - - Interleaving of 720 symbols. +Time interleaving to be used. +The acceptable values are defined by :c:type:`fe_interleaving`. .. _DTV-LNA: diff --git a/Documentation/media/uapi/dvb/frontend-header.rst b/Documentation/media/uapi/dvb/frontend-header.rst new file mode 100644 index 000000000000..8d8433cf1e12 --- /dev/null +++ b/Documentation/media/uapi/dvb/frontend-header.rst @@ -0,0 +1,4 @@ +Frontend uAPI data types +======================== + +.. kernel-doc:: include/uapi/linux/dvb/frontend.h diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index 16a318fc469a..e7c29d0bdee4 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -562,10 +562,10 @@ enum fe_pilot { }; /** - * enum fe_rolloff - Rolloff factor (also known as alpha) - * @ROLLOFF_35: Roloff factor: 35% - * @ROLLOFF_20: Roloff factor: 20% - * @ROLLOFF_25: Roloff factor: 25% + * enum fe_rolloff - Rolloff factor + * @ROLLOFF_35: Roloff factor: α=35% + * @ROLLOFF_20: Roloff factor: α=20% + * @ROLLOFF_25: Roloff factor: α=25% * @ROLLOFF_AUTO: Auto-detect the roloff factor. * * .. note: -- cgit v1.2.3 From 791edca5685b26d4575e59f5420ba3e206f5cebb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 31 Aug 2017 12:52:45 -0400 Subject: media: dmx.h: get rid of unused DMX_KERNEL_CLIENT There's a flag defined for Digital TV demux that is not used anywhere, called DMX_KERNEL_CLIENT. Get rid of it. 
Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/dmx.h.rst.exceptions | 1 - Documentation/media/uapi/dvb/dmx_types.rst | 1 - include/uapi/linux/dvb/dmx.h | 1 - 3 files changed, 3 deletions(-) (limited to 'include') diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions index 2fdb458564ba..933ca5a61ce1 100644 --- a/Documentation/media/dmx.h.rst.exceptions +++ b/Documentation/media/dmx.h.rst.exceptions @@ -56,7 +56,6 @@ replace symbol DMX_SOURCE_DVR3 :c:type:`dmx_source` replace define DMX_CHECK_CRC :c:type:`dmx_sct_filter_params` replace define DMX_ONESHOT :c:type:`dmx_sct_filter_params` replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params` -replace define DMX_KERNEL_CLIENT :c:type:`dmx_sct_filter_params` # some typedefs should point to struct/enums replace typedef dmx_caps_t :c:type:`dmx_caps` diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst index 80dd659860d7..0f0113205c94 100644 --- a/Documentation/media/uapi/dvb/dmx_types.rst +++ b/Documentation/media/uapi/dvb/dmx_types.rst @@ -147,7 +147,6 @@ struct dmx_sct_filter_params #define DMX_CHECK_CRC 1 #define DMX_ONESHOT 2 #define DMX_IMMEDIATE_START 4 - #define DMX_KERNEL_CLIENT 0x8000 }; diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h index 1bc4d6fb0f01..1702f923d425 100644 --- a/include/uapi/linux/dvb/dmx.h +++ b/include/uapi/linux/dvb/dmx.h @@ -103,7 +103,6 @@ struct dmx_sct_filter_params #define DMX_CHECK_CRC 1 #define DMX_ONESHOT 2 #define DMX_IMMEDIATE_START 4 -#define DMX_KERNEL_CLIENT 0x8000 }; -- cgit v1.2.3 From 286fe1ca3fa1b6fcc7ce8695b7c8d681e6e1c3b7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 31 Aug 2017 14:11:34 -0400 Subject: media: dmx.h: get rid of DMX_GET_CAPS There's no driver currently using it; it is also not documented about what it would be supposed to do. So, get rid of it. Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/dmx.h.rst.exceptions | 1 - Documentation/media/uapi/dvb/dmx-get-caps.rst | 41 --------------------------- Documentation/media/uapi/dvb/dmx_fcalls.rst | 1 - Documentation/media/uapi/dvb/dmx_types.rst | 12 -------- include/uapi/linux/dvb/dmx.h | 7 ----- 5 files changed, 62 deletions(-) delete mode 100644 Documentation/media/uapi/dvb/dmx-get-caps.rst (limited to 'include') diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions index 933ca5a61ce1..5572d2dc9d0e 100644 --- a/Documentation/media/dmx.h.rst.exceptions +++ b/Documentation/media/dmx.h.rst.exceptions @@ -58,7 +58,6 @@ replace define DMX_ONESHOT :c:type:`dmx_sct_filter_params` replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params` # some typedefs should point to struct/enums -replace typedef dmx_caps_t :c:type:`dmx_caps` replace typedef dmx_filter_t :c:type:`dmx_filter` replace typedef dmx_pes_type_t :c:type:`dmx_pes_type` replace typedef dmx_input_t :c:type:`dmx_input` diff --git a/Documentation/media/uapi/dvb/dmx-get-caps.rst b/Documentation/media/uapi/dvb/dmx-get-caps.rst deleted file mode 100644 index 145fb520d779..000000000000 --- a/Documentation/media/uapi/dvb/dmx-get-caps.rst +++ /dev/null @@ -1,41 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. _DMX_GET_CAPS: - -============ -DMX_GET_CAPS -============ - -Name ----- - -DMX_GET_CAPS - - -Synopsis --------- - -.. 
c:function:: int ioctl(fd, DMX_GET_CAPS, struct dmx_caps *caps) - :name: DMX_GET_CAPS - -Arguments ---------- - -``fd`` - File descriptor returned by :c:func:`open() `. - -``caps`` - Pointer to struct :c:type:`dmx_caps` - - -Description ------------ - -.. note:: This ioctl is undocumented. Documentation is welcome. - -Return Value ------------- - -On success 0 is returned, on error -1 and the ``errno`` variable is set -appropriately. The generic error codes are described at the -:ref:`Generic Error Codes ` chapter. diff --git a/Documentation/media/uapi/dvb/dmx_fcalls.rst b/Documentation/media/uapi/dvb/dmx_fcalls.rst index 77a1554d9834..49e013d4540f 100644 --- a/Documentation/media/uapi/dvb/dmx_fcalls.rst +++ b/Documentation/media/uapi/dvb/dmx_fcalls.rst @@ -21,7 +21,6 @@ Demux Function Calls dmx-get-event dmx-get-stc dmx-get-pes-pids - dmx-get-caps dmx-set-source dmx-add-pid dmx-remove-pid diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst index 0f0113205c94..9e907b85cf16 100644 --- a/Documentation/media/uapi/dvb/dmx_types.rst +++ b/Documentation/media/uapi/dvb/dmx_types.rst @@ -199,18 +199,6 @@ struct dmx_stc }; -struct dmx_caps -=============== - -.. c:type:: dmx_caps - -.. code-block:: c - - typedef struct dmx_caps { - __u32 caps; - int num_decoders; - } dmx_caps_t; - enum dmx_source =============== diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h index 1702f923d425..db8bd00c93de 100644 --- a/include/uapi/linux/dvb/dmx.h +++ b/include/uapi/linux/dvb/dmx.h @@ -115,11 +115,6 @@ struct dmx_pes_filter_params __u32 flags; }; -struct dmx_caps { - __u32 caps; - int num_decoders; -}; - enum dmx_source { DMX_SOURCE_FRONT0 = 0, DMX_SOURCE_FRONT1, @@ -143,7 +138,6 @@ struct dmx_stc { #define DMX_SET_PES_FILTER _IOW('o', 44, struct dmx_pes_filter_params) #define DMX_SET_BUFFER_SIZE _IO('o', 45) #define DMX_GET_PES_PIDS _IOR('o', 47, __u16[5]) -#define DMX_GET_CAPS _IOR('o', 48, struct dmx_caps) #define DMX_SET_SOURCE _IOW('o', 49, enum dmx_source) #define DMX_GET_STC _IOWR('o', 50, struct dmx_stc) #define DMX_ADD_PID _IOW('o', 51, __u16) @@ -156,7 +150,6 @@ typedef enum dmx_output dmx_output_t; typedef enum dmx_input dmx_input_t; typedef enum dmx_ts_pes dmx_pes_type_t; typedef struct dmx_filter dmx_filter_t; -typedef struct dmx_caps dmx_caps_t; typedef enum dmx_source dmx_source_t; #endif -- cgit v1.2.3 From 13adefbe9e566c6db91579e4ce17f1e5193d6f2c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 31 Aug 2017 14:21:43 -0400 Subject: media: dmx.h: get rid of DMX_SET_SOURCE No driver uses this ioctl, nor it is documented anywhere. So, get rid of it. 
Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/dmx.h.rst.exceptions | 13 -------- Documentation/media/uapi/dvb/dmx-set-source.rst | 44 ------------------------- Documentation/media/uapi/dvb/dmx_fcalls.rst | 1 - Documentation/media/uapi/dvb/dmx_types.rst | 20 ----------- include/uapi/linux/dvb/dmx.h | 12 ------- 5 files changed, 90 deletions(-) delete mode 100644 Documentation/media/uapi/dvb/dmx-set-source.rst (limited to 'include') diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions index 5572d2dc9d0e..d2dac35bb84b 100644 --- a/Documentation/media/dmx.h.rst.exceptions +++ b/Documentation/media/dmx.h.rst.exceptions @@ -40,18 +40,6 @@ replace enum dmx_input :c:type:`dmx_input` replace symbol DMX_IN_FRONTEND :c:type:`dmx_input` replace symbol DMX_IN_DVR :c:type:`dmx_input` -# dmx_source_t symbols -replace enum dmx_source :c:type:`dmx_source` -replace symbol DMX_SOURCE_FRONT0 :c:type:`dmx_source` -replace symbol DMX_SOURCE_FRONT1 :c:type:`dmx_source` -replace symbol DMX_SOURCE_FRONT2 :c:type:`dmx_source` -replace symbol DMX_SOURCE_FRONT3 :c:type:`dmx_source` -replace symbol DMX_SOURCE_DVR0 :c:type:`dmx_source` -replace symbol DMX_SOURCE_DVR1 :c:type:`dmx_source` -replace symbol DMX_SOURCE_DVR2 :c:type:`dmx_source` -replace symbol DMX_SOURCE_DVR3 :c:type:`dmx_source` - - # Flags for struct dmx_sct_filter_params replace define DMX_CHECK_CRC :c:type:`dmx_sct_filter_params` replace define DMX_ONESHOT :c:type:`dmx_sct_filter_params` @@ -61,4 +49,3 @@ replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params` replace typedef dmx_filter_t :c:type:`dmx_filter` replace typedef dmx_pes_type_t :c:type:`dmx_pes_type` replace typedef dmx_input_t :c:type:`dmx_input` -replace typedef dmx_source_t :c:type:`dmx_source` diff --git a/Documentation/media/uapi/dvb/dmx-set-source.rst b/Documentation/media/uapi/dvb/dmx-set-source.rst deleted file mode 100644 index ac7f77b25e06..000000000000 --- a/Documentation/media/uapi/dvb/dmx-set-source.rst +++ /dev/null @@ -1,44 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. _DMX_SET_SOURCE: - -============== -DMX_SET_SOURCE -============== - -Name ----- - -DMX_SET_SOURCE - - -Synopsis --------- - -.. c:function:: int ioctl(fd, DMX_SET_SOURCE, struct dmx_source *src) - :name: DMX_SET_SOURCE - - -Arguments ---------- - - -``fd`` - File descriptor returned by :c:func:`open() `. - -``src`` - Undocumented. - - -Description ------------ - -.. note:: This ioctl is undocumented. Documentation is welcome. - - -Return Value ------------- - -On success 0 is returned, on error -1 and the ``errno`` variable is set -appropriately. The generic error codes are described at the -:ref:`Generic Error Codes ` chapter. diff --git a/Documentation/media/uapi/dvb/dmx_fcalls.rst b/Documentation/media/uapi/dvb/dmx_fcalls.rst index 49e013d4540f..be98d60877f2 100644 --- a/Documentation/media/uapi/dvb/dmx_fcalls.rst +++ b/Documentation/media/uapi/dvb/dmx_fcalls.rst @@ -21,6 +21,5 @@ Demux Function Calls dmx-get-event dmx-get-stc dmx-get-pes-pids - dmx-set-source dmx-add-pid dmx-remove-pid diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst index 9e907b85cf16..a205c02ccdc1 100644 --- a/Documentation/media/uapi/dvb/dmx_types.rst +++ b/Documentation/media/uapi/dvb/dmx_types.rst @@ -197,23 +197,3 @@ struct dmx_stc unsigned int base; /* output: divisor for stc to get 90 kHz clock */ __u64 stc; /* output: stc in 'base'*90 kHz units */ }; - - - -enum dmx_source -=============== - -.. 
c:type:: dmx_source - -.. code-block:: c - - typedef enum dmx_source { - DMX_SOURCE_FRONT0 = 0, - DMX_SOURCE_FRONT1, - DMX_SOURCE_FRONT2, - DMX_SOURCE_FRONT3, - DMX_SOURCE_DVR0 = 16, - DMX_SOURCE_DVR1, - DMX_SOURCE_DVR2, - DMX_SOURCE_DVR3 - } dmx_source_t; diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h index db8bd00c93de..08dc17060321 100644 --- a/include/uapi/linux/dvb/dmx.h +++ b/include/uapi/linux/dvb/dmx.h @@ -115,16 +115,6 @@ struct dmx_pes_filter_params __u32 flags; }; -enum dmx_source { - DMX_SOURCE_FRONT0 = 0, - DMX_SOURCE_FRONT1, - DMX_SOURCE_FRONT2, - DMX_SOURCE_FRONT3, - DMX_SOURCE_DVR0 = 16, - DMX_SOURCE_DVR1, - DMX_SOURCE_DVR2, - DMX_SOURCE_DVR3 -}; struct dmx_stc { unsigned int num; /* input : which STC? 0..N */ @@ -138,7 +128,6 @@ struct dmx_stc { #define DMX_SET_PES_FILTER _IOW('o', 44, struct dmx_pes_filter_params) #define DMX_SET_BUFFER_SIZE _IO('o', 45) #define DMX_GET_PES_PIDS _IOR('o', 47, __u16[5]) -#define DMX_SET_SOURCE _IOW('o', 49, enum dmx_source) #define DMX_GET_STC _IOWR('o', 50, struct dmx_stc) #define DMX_ADD_PID _IOW('o', 51, __u16) #define DMX_REMOVE_PID _IOW('o', 52, __u16) @@ -150,7 +139,6 @@ typedef enum dmx_output dmx_output_t; typedef enum dmx_input dmx_input_t; typedef enum dmx_ts_pes dmx_pes_type_t; typedef struct dmx_filter dmx_filter_t; -typedef enum dmx_source dmx_source_t; #endif -- cgit v1.2.3 From bb98e6d280e00a1180f47d3391ee0bd1f312b5f6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 31 Aug 2017 12:28:52 -0400 Subject: media: dmx.h: add kernel-doc markups and use it at Documentation/ The demux documentation is pretty poor nowadays: most of the structs and enums aren't documented at all. Add proper kernel-doc markups for them and use it. Now, the demux API data structures are fully documented :-) Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/dmx.h.rst.exceptions | 5 + Documentation/media/uapi/dvb/dmx_types.rst | 173 +---------------------------- include/uapi/linux/dvb/dmx.h | 139 ++++++++++++++++++----- 3 files changed, 120 insertions(+), 197 deletions(-) (limited to 'include') diff --git a/Documentation/media/dmx.h.rst.exceptions b/Documentation/media/dmx.h.rst.exceptions index d2dac35bb84b..629db384104a 100644 --- a/Documentation/media/dmx.h.rst.exceptions +++ b/Documentation/media/dmx.h.rst.exceptions @@ -49,3 +49,8 @@ replace define DMX_IMMEDIATE_START :c:type:`dmx_sct_filter_params` replace typedef dmx_filter_t :c:type:`dmx_filter` replace typedef dmx_pes_type_t :c:type:`dmx_pes_type` replace typedef dmx_input_t :c:type:`dmx_input` + +ignore symbol DMX_OUT_DECODER +ignore symbol DMX_OUT_TAP +ignore symbol DMX_OUT_TS_TAP +ignore symbol DMX_OUT_TSDEMUX_TAP diff --git a/Documentation/media/uapi/dvb/dmx_types.rst b/Documentation/media/uapi/dvb/dmx_types.rst index 171205ed86a4..2a023a4f516c 100644 --- a/Documentation/media/uapi/dvb/dmx_types.rst +++ b/Documentation/media/uapi/dvb/dmx_types.rst @@ -6,175 +6,4 @@ Demux Data Types **************** -Output for the demux -==================== - -.. c:type:: dmx_output - -.. tabularcolumns:: |p{5.0cm}|p{12.5cm}| - -.. flat-table:: enum dmx_output - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - .. _DMX-OUT-DECODER: - - DMX_OUT_DECODER - - - Streaming directly to decoder. - - - .. row 3 - - - .. _DMX-OUT-TAP: - - DMX_OUT_TAP - - - Output going to a memory buffer (to be retrieved via the read - command). Delivers the stream output to the demux device on which - the ioctl is called. - - - .. 
row 4 - - - .. _DMX-OUT-TS-TAP: - - DMX_OUT_TS_TAP - - - Output multiplexed into a new TS (to be retrieved by reading from - the logical DVR device). Routes output to the logical DVR device - ``/dev/dvb/adapter?/dvr?``, which delivers a TS multiplexed from - all filters for which ``DMX_OUT_TS_TAP`` was specified. - - - .. row 5 - - - .. _DMX-OUT-TSDEMUX-TAP: - - DMX_OUT_TSDEMUX_TAP - - - Like :ref:`DMX_OUT_TS_TAP ` but retrieved - from the DMX device. - - -dmx_input_t -=========== - -.. c:type:: dmx_input - -.. code-block:: c - - typedef enum - { - DMX_IN_FRONTEND, /* Input from a front-end device. */ - DMX_IN_DVR /* Input from the logical DVR device. */ - } dmx_input_t; - - -dmx_pes_type_t -============== - -.. c:type:: dmx_pes_type - - -.. code-block:: c - - typedef enum - { - DMX_PES_AUDIO0, - DMX_PES_VIDEO0, - DMX_PES_TELETEXT0, - DMX_PES_SUBTITLE0, - DMX_PES_PCR0, - - DMX_PES_AUDIO1, - DMX_PES_VIDEO1, - DMX_PES_TELETEXT1, - DMX_PES_SUBTITLE1, - DMX_PES_PCR1, - - DMX_PES_AUDIO2, - DMX_PES_VIDEO2, - DMX_PES_TELETEXT2, - DMX_PES_SUBTITLE2, - DMX_PES_PCR2, - - DMX_PES_AUDIO3, - DMX_PES_VIDEO3, - DMX_PES_TELETEXT3, - DMX_PES_SUBTITLE3, - DMX_PES_PCR3, - - DMX_PES_OTHER - } dmx_pes_type_t; - - -struct dmx_filter -================= - -.. c:type:: dmx_filter - -.. code-block:: c - - typedef struct dmx_filter - { - __u8 filter[DMX_FILTER_SIZE]; - __u8 mask[DMX_FILTER_SIZE]; - __u8 mode[DMX_FILTER_SIZE]; - } dmx_filter_t; - - -.. c:type:: dmx_sct_filter_params - -struct dmx_sct_filter_params -============================ - - -.. code-block:: c - - struct dmx_sct_filter_params - { - __u16 pid; - dmx_filter_t filter; - __u32 timeout; - __u32 flags; - #define DMX_CHECK_CRC 1 - #define DMX_ONESHOT 2 - #define DMX_IMMEDIATE_START 4 - }; - - -struct dmx_pes_filter_params -============================ - -.. c:type:: dmx_pes_filter_params - -.. code-block:: c - - struct dmx_pes_filter_params - { - __u16 pid; - dmx_input_t input; - dmx_output_t output; - dmx_pes_type_t pes_type; - __u32 flags; - }; - -struct dmx_stc -============== - -.. c:type:: dmx_stc - -.. code-block:: c - - struct dmx_stc { - unsigned int num; /* input : which STC? 0..N */ - unsigned int base; /* output: divisor for stc to get 90 kHz clock */ - __u64 stc; /* output: stc in 'base'*90 kHz units */ - }; +.. kernel-doc:: include/uapi/linux/dvb/dmx.h diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h index 08dc17060321..4e3f3a2fe83f 100644 --- a/include/uapi/linux/dvb/dmx.h +++ b/include/uapi/linux/dvb/dmx.h @@ -32,26 +32,74 @@ #define DMX_FILTER_SIZE 16 -enum dmx_output -{ - DMX_OUT_DECODER, /* Streaming directly to decoder. */ - DMX_OUT_TAP, /* Output going to a memory buffer */ - /* (to be retrieved via the read command).*/ - DMX_OUT_TS_TAP, /* Output multiplexed into a new TS */ - /* (to be retrieved by reading from the */ - /* logical DVR device). */ - DMX_OUT_TSDEMUX_TAP /* Like TS_TAP but retrieved from the DMX device */ +/** + * enum dmx_output - Output for the demux. + * + * @DMX_OUT_DECODER: + * Streaming directly to decoder. + * @DMX_OUT_TAP: + * Output going to a memory buffer (to be retrieved via the read command). + * Delivers the stream output to the demux device on which the ioctl + * is called. + * @DMX_OUT_TS_TAP: + * Output multiplexed into a new TS (to be retrieved by reading from the + * logical DVR device). Routes output to the logical DVR device + * ``/dev/dvb/adapter?/dvr?``, which delivers a TS multiplexed from all + * filters for which @DMX_OUT_TS_TAP was specified. 
+ * @DMX_OUT_TSDEMUX_TAP: + * Like @DMX_OUT_TS_TAP but retrieved from the DMX device. + */ +enum dmx_output { + DMX_OUT_DECODER, + DMX_OUT_TAP, + DMX_OUT_TS_TAP, + DMX_OUT_TSDEMUX_TAP }; -enum dmx_input -{ - DMX_IN_FRONTEND, /* Input from a front-end device. */ - DMX_IN_DVR /* Input from the logical DVR device. */ + +/** + * enum dmx_input - Input from the demux. + * + * @DMX_IN_FRONTEND: Input from a front-end device. + * @DMX_IN_DVR: Input from the logical DVR device. + */ +enum dmx_input { + DMX_IN_FRONTEND, + DMX_IN_DVR }; +/** + * enum dmx_ts_pes - type of the PES filter. + * + * @DMX_PES_AUDIO0: first audio PID. Also referred as @DMX_PES_AUDIO. + * @DMX_PES_VIDEO0: first video PID. Also referred as @DMX_PES_VIDEO. + * @DMX_PES_TELETEXT0: first teletext PID. Also referred as @DMX_PES_TELETEXT. + * @DMX_PES_SUBTITLE0: first subtitle PID. Also referred as @DMX_PES_SUBTITLE. + * @DMX_PES_PCR0: first Program Clock Reference PID. + * Also referred as @DMX_PES_PCR. + * + * @DMX_PES_AUDIO1: second audio PID. + * @DMX_PES_VIDEO1: second video PID. + * @DMX_PES_TELETEXT1: second teletext PID. + * @DMX_PES_SUBTITLE1: second subtitle PID. + * @DMX_PES_PCR1: second Program Clock Reference PID. + * + * @DMX_PES_AUDIO2: third audio PID. + * @DMX_PES_VIDEO2: third video PID. + * @DMX_PES_TELETEXT2: third teletext PID. + * @DMX_PES_SUBTITLE2: third subtitle PID. + * @DMX_PES_PCR2: third Program Clock Reference PID. + * + * @DMX_PES_AUDIO3: fourth audio PID. + * @DMX_PES_VIDEO3: fourth video PID. + * @DMX_PES_TELETEXT3: fourth teletext PID. + * @DMX_PES_SUBTITLE3: fourth subtitle PID. + * @DMX_PES_PCR3: fourth Program Clock Reference PID. + * + * @DMX_PES_OTHER: any other PID. + */ -enum dmx_ts_pes -{ +enum dmx_ts_pes { DMX_PES_AUDIO0, DMX_PES_VIDEO0, DMX_PES_TELETEXT0, @@ -86,16 +134,42 @@ enum dmx_ts_pes #define DMX_PES_PCR DMX_PES_PCR0 -struct dmx_filter -{ + +/** + * struct dmx_filter - Specifies a section header filter. + * + * @filter: bit array with bits to be matched at the section header. + * @mask: bits that are valid at the filter bit array. + * @mode: mode of match: if bit is zero, it will match if equal (positive + * match); if bit is one, it will match if the bit is negated. + * + * Note: All arrays in this struct have a size of DMX_FILTER_SIZE (16 bytes). + */ +struct dmx_filter { __u8 filter[DMX_FILTER_SIZE]; __u8 mask[DMX_FILTER_SIZE]; __u8 mode[DMX_FILTER_SIZE]; }; - -struct dmx_sct_filter_params -{ +/** + * struct dmx_sct_filter_params - Specifies a section filter. + * + * @pid: PID to be filtered. + * @filter: section header filter, as defined by &struct dmx_filter. + * @timeout: maximum time to filter, in milliseconds. + * @flags: extra flags for the section filter. + * + * Carries the configuration for a MPEG-TS section filter. + * + * The @flags can be: + * + * - %DMX_CHECK_CRC - only deliver sections where the CRC check succeeded; + * - %DMX_ONESHOT - disable the section filter after one section + * has been delivered; + * - %DMX_IMMEDIATE_START - Start filter immediately without requiring a + * :ref:`DMX_START`. + */ +struct dmx_sct_filter_params { __u16 pid; struct dmx_filter filter; __u32 timeout; @@ -105,7 +179,16 @@ struct dmx_sct_filter_params #define DMX_IMMEDIATE_START 4 }; - +/** + * struct dmx_pes_filter_params - Specifies Packetized Elementary Stream (PES) + * filter parameters. + * + * @pid: PID to be filtered. + * @input: Demux input, as specified by &enum dmx_input. + * @output: Demux output, as specified by &enum dmx_output. 
+ * @pes_type: Type of the pes filter, as specified by &enum dmx_pes_type. + * @flags: Demux PES flags. + */ struct dmx_pes_filter_params { __u16 pid; @@ -115,11 +198,17 @@ struct dmx_pes_filter_params __u32 flags; }; - +/** + * struct dmx_stc - Stores System Time Counter (STC) information. + * + * @num: input data: number of the STC, from 0 to N. + * @base: output: divisor for STC to get 90 kHz clock. + * @stc: output: stc in @base * 90 kHz units. + */ struct dmx_stc { - unsigned int num; /* input : which STC? 0..N */ - unsigned int base; /* output: divisor for stc to get 90 kHz clock */ - __u64 stc; /* output: stc in 'base'*90 kHz units */ + unsigned int num; + unsigned int base; + __u64 stc; }; #define DMX_START _IO('o', 41) -- cgit v1.2.3 From 833ff5e7feda1a042b83e82208cef3d212ca0ef1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Sep 2017 07:41:49 -0400 Subject: media: ca.h: get rid of CA_SET_PID This ioctl seems to be some attempt to support a feature at the bt8xx dst_ca driver. Yet, as said there, it "needs more work". Right now, the code there is just a boilerplate. At the end of the day, no driver uses this ioctl, nor it is documented anywhere (except for "needs more work"). So, get rid of it. Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/ca.h.rst.exceptions | 1 - Documentation/media/dvb-drivers/ci.rst | 1 - Documentation/media/uapi/dvb/ca-set-pid.rst | 60 ---------------------- Documentation/media/uapi/dvb/ca_data_types.rst | 14 ----- Documentation/media/uapi/dvb/ca_function_calls.rst | 1 - drivers/media/pci/bt8xx/dst_ca.c | 16 ------ include/uapi/linux/dvb/ca.h | 7 --- 7 files changed, 100 deletions(-) delete mode 100644 Documentation/media/uapi/dvb/ca-set-pid.rst (limited to 'include') diff --git a/Documentation/media/ca.h.rst.exceptions b/Documentation/media/ca.h.rst.exceptions index d7c9fed8c004..553559cc6ad7 100644 --- a/Documentation/media/ca.h.rst.exceptions +++ b/Documentation/media/ca.h.rst.exceptions @@ -16,7 +16,6 @@ replace define CA_NDS :c:type:`ca_descr_info` replace define CA_DSS :c:type:`ca_descr_info` # some typedefs should point to struct/enums -replace typedef ca_pid_t :c:type:`ca_pid` replace typedef ca_slot_info_t :c:type:`ca_slot_info` replace typedef ca_descr_info_t :c:type:`ca_descr_info` replace typedef ca_caps_t :c:type:`ca_caps` diff --git a/Documentation/media/dvb-drivers/ci.rst b/Documentation/media/dvb-drivers/ci.rst index 69b07e9d1816..87f3748c49b9 100644 --- a/Documentation/media/dvb-drivers/ci.rst +++ b/Documentation/media/dvb-drivers/ci.rst @@ -143,7 +143,6 @@ All these ioctls are also valid for the High level CI interface #define CA_GET_MSG _IOR('o', 132, ca_msg_t) #define CA_SEND_MSG _IOW('o', 133, ca_msg_t) #define CA_SET_DESCR _IOW('o', 134, ca_descr_t) -#define CA_SET_PID _IOW('o', 135, ca_pid_t) On querying the device, the device yields information thus: diff --git a/Documentation/media/uapi/dvb/ca-set-pid.rst b/Documentation/media/uapi/dvb/ca-set-pid.rst deleted file mode 100644 index 891c1c72ef24..000000000000 --- a/Documentation/media/uapi/dvb/ca-set-pid.rst +++ /dev/null @@ -1,60 +0,0 @@ -.. -*- coding: utf-8; mode: rst -*- - -.. _CA_SET_PID: - -========== -CA_SET_PID -========== - -Name ----- - -CA_SET_PID - - -Synopsis --------- - -.. c:function:: int ioctl(fd, CA_SET_PID, struct ca_pid *pid) - :name: CA_SET_PID - - -Arguments ---------- - -``fd`` - File descriptor returned by a previous call to :c:func:`open() `. - -``pid`` - Pointer to struct :c:type:`ca_pid`. - -.. c:type:: ca_pid - -.. 
flat-table:: struct ca_pid - :header-rows: 1 - :stub-columns: 0 - - - - - unsigned int - - pid - - Program ID - - - - - int - - index - - PID index. Use -1 to disable. - - - -Description ------------ - -.. note:: This ioctl is undocumented. Documentation is welcome. - - -Return Value ------------- - -On success 0 is returned, on error -1 and the ``errno`` variable is set -appropriately. The generic error codes are described at the -:ref:`Generic Error Codes ` chapter. diff --git a/Documentation/media/uapi/dvb/ca_data_types.rst b/Documentation/media/uapi/dvb/ca_data_types.rst index d9e27c77426c..555b5137936b 100644 --- a/Documentation/media/uapi/dvb/ca_data_types.rst +++ b/Documentation/media/uapi/dvb/ca_data_types.rst @@ -94,17 +94,3 @@ ca_descr_t unsigned int parity; unsigned char cw[8]; } ca_descr_t; - - -.. c:type:: ca_pid - -ca-pid -====== - - -.. code-block:: c - - typedef struct ca_pid { - unsigned int pid; - int index; /* -1 == disable*/ - } ca_pid_t; diff --git a/Documentation/media/uapi/dvb/ca_function_calls.rst b/Documentation/media/uapi/dvb/ca_function_calls.rst index c085a0ebbc05..87d697851e82 100644 --- a/Documentation/media/uapi/dvb/ca_function_calls.rst +++ b/Documentation/media/uapi/dvb/ca_function_calls.rst @@ -18,4 +18,3 @@ CA Function Calls ca-get-msg ca-send-msg ca-set-descr - ca-set-pid diff --git a/drivers/media/pci/bt8xx/dst_ca.c b/drivers/media/pci/bt8xx/dst_ca.c index 90f4263452d3..7db47d8bbe15 100644 --- a/drivers/media/pci/bt8xx/dst_ca.c +++ b/drivers/media/pci/bt8xx/dst_ca.c @@ -64,13 +64,6 @@ static int ca_set_slot_descr(void) return -EOPNOTSUPP; } -/* Need some more work */ -static int ca_set_pid(void) -{ - /* We could make this more graceful ? */ - return -EOPNOTSUPP; -} - static void put_command_and_length(u8 *data, int command, int length) { data[0] = (command >> 16) & 0xff; @@ -629,15 +622,6 @@ static long dst_ca_ioctl(struct file *file, unsigned int cmd, unsigned long ioct } dprintk(verbose, DST_CA_INFO, 1, " -->CA_SET_DESCR Success !"); break; - case CA_SET_PID: - dprintk(verbose, DST_CA_INFO, 1, " Setting PID"); - if ((ca_set_pid()) < 0) { - dprintk(verbose, DST_CA_ERROR, 1, " -->CA_SET_PID Failed !"); - result = -1; - goto free_mem_and_exit; - } - dprintk(verbose, DST_CA_INFO, 1, " -->CA_SET_PID Success !"); - break; default: result = -EOPNOTSUPP; } diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h index 00cf24587bea..859f6c0c4751 100644 --- a/include/uapi/linux/dvb/ca.h +++ b/include/uapi/linux/dvb/ca.h @@ -73,11 +73,6 @@ struct ca_descr { unsigned char cw[8]; }; -struct ca_pid { - unsigned int pid; - int index; /* -1 == disable*/ -}; - #define CA_RESET _IO('o', 128) #define CA_GET_CAP _IOR('o', 129, struct ca_caps) #define CA_GET_SLOT_INFO _IOR('o', 130, struct ca_slot_info) @@ -85,7 +80,6 @@ struct ca_pid { #define CA_GET_MSG _IOR('o', 132, struct ca_msg) #define CA_SEND_MSG _IOW('o', 133, struct ca_msg) #define CA_SET_DESCR _IOW('o', 134, struct ca_descr) -#define CA_SET_PID _IOW('o', 135, struct ca_pid) #if !defined (__KERNEL__) @@ -95,7 +89,6 @@ typedef struct ca_descr_info ca_descr_info_t; typedef struct ca_caps ca_caps_t; typedef struct ca_msg ca_msg_t; typedef struct ca_descr ca_descr_t; -typedef struct ca_pid ca_pid_t; #endif -- cgit v1.2.3 From fed7c4fe8bd0b131cc3f19ba2744061935cdcdb7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Sep 2017 07:48:02 -0400 Subject: media: ca.h: document most CA data types For most of the stuff there, documenting is easy, as the header file contains information. 
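The conversion itself is mechanical: each struct gains an in-place kernel-doc
block (the real markups are in the diff below), for example:

    /**
     * struct ca_descr_info - descrambler types and info.
     *
     * @num: number of available descramblers (keys).
     * @type: type of supported scrambling system.
     */

and the Documentation/ side then collapses to a single
".. kernel-doc:: include/uapi/linux/dvb/ca.h" directive.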
Yet, I was unable to document two data structs: ca_msg and ca_descr As those two structs are used by a few drivers, keep them. Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/dvb/ca_data_types.rst | 75 ++++--------------------- include/uapi/linux/dvb/ca.h | 78 ++++++++++++++++++++------ 2 files changed, 70 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/dvb/ca_data_types.rst b/Documentation/media/uapi/dvb/ca_data_types.rst index 555b5137936b..aa57dd176825 100644 --- a/Documentation/media/uapi/dvb/ca_data_types.rst +++ b/Documentation/media/uapi/dvb/ca_data_types.rst @@ -6,91 +6,36 @@ CA Data Types ************* +.. kernel-doc:: include/uapi/linux/dvb/ca.h -.. c:type:: ca_slot_info - -ca_slot_info_t -============== - - -.. code-block:: c - - typedef struct ca_slot_info { - int num; /* slot number */ - - int type; /* CA interface this slot supports */ - #define CA_CI 1 /* CI high level interface */ - #define CA_CI_LINK 2 /* CI link layer level interface */ - #define CA_CI_PHYS 4 /* CI physical layer level interface */ - #define CA_DESCR 8 /* built-in descrambler */ - #define CA_SC 128 /* simple smart card interface */ - - unsigned int flags; - #define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */ - #define CA_CI_MODULE_READY 2 - } ca_slot_info_t; - - -.. c:type:: ca_descr_info - -ca_descr_info_t -=============== - - -.. code-block:: c - - typedef struct ca_descr_info { - unsigned int num; /* number of available descramblers (keys) */ - unsigned int type; /* type of supported scrambling system */ - #define CA_ECD 1 - #define CA_NDS 2 - #define CA_DSS 4 - } ca_descr_info_t; - - -.. c:type:: ca_caps - -ca_caps_t -========= - +.. c:type:: ca_msg -.. code-block:: c +Undocumented data types +======================= - typedef struct ca_caps { - unsigned int slot_num; /* total number of CA card and module slots */ - unsigned int slot_type; /* OR of all supported types */ - unsigned int descr_num; /* total number of descrambler slots (keys) */ - unsigned int descr_type;/* OR of all supported types */ - } ca_cap_t; +.. note:: + Those data types are undocumented. Documentation is welcome. .. c:type:: ca_msg -ca_msg_t -======== - - .. code-block:: c /* a message to/from a CI-CAM */ - typedef struct ca_msg { + struct ca_msg { unsigned int index; unsigned int type; unsigned int length; unsigned char msg[256]; - } ca_msg_t; + }; .. c:type:: ca_descr -ca_descr_t -========== - - .. code-block:: c - typedef struct ca_descr { + struct ca_descr { unsigned int index; unsigned int parity; unsigned char cw[8]; - } ca_descr_t; + }; diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h index 859f6c0c4751..7ee641b4124c 100644 --- a/include/uapi/linux/dvb/ca.h +++ b/include/uapi/linux/dvb/ca.h @@ -24,39 +24,81 @@ #ifndef _DVBCA_H_ #define _DVBCA_H_ -/* slot interface types and info */ +/** + * struct ca_slot_info - CA slot interface types and info. + * + * @num: slot number. + * @type: slot type. + * @flags: flags applicable to the slot. + * + * This struct stores the CA slot information. + * + * @type can be: + * + * - %CA_CI - CI high level interface; + * - %CA_CI_LINK - CI link layer level interface; + * - %CA_CI_PHYS - CI physical layer level interface; + * - %CA_DESCR - built-in descrambler; + * - %CA_SC -simple smart card interface. + * + * @flags can be: + * + * - %CA_CI_MODULE_PRESENT - module (or card) inserted; + * - %CA_CI_MODULE_READY - module is ready for usage. 
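+ *
+ * (Editorial note, not part of the original patch: @flags is a bitmask,
+ * so a slot whose module is ready is expected to report both
+ * %CA_CI_MODULE_PRESENT and %CA_CI_MODULE_READY.)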
+ */ struct ca_slot_info { - int num; /* slot number */ - - int type; /* CA interface this slot supports */ -#define CA_CI 1 /* CI high level interface */ -#define CA_CI_LINK 2 /* CI link layer level interface */ -#define CA_CI_PHYS 4 /* CI physical layer level interface */ -#define CA_DESCR 8 /* built-in descrambler */ -#define CA_SC 128 /* simple smart card interface */ + int num; + int type; +#define CA_CI 1 +#define CA_CI_LINK 2 +#define CA_CI_PHYS 4 +#define CA_DESCR 8 +#define CA_SC 128 unsigned int flags; -#define CA_CI_MODULE_PRESENT 1 /* module (or card) inserted */ +#define CA_CI_MODULE_PRESENT 1 #define CA_CI_MODULE_READY 2 }; -/* descrambler types and info */ - +/** + * struct ca_descr_info - descrambler types and info. + * + * @num: number of available descramblers (keys). + * @type: type of supported scrambling system. + * + * Identifies the number of descramblers and their type. + * + * @type can be: + * + * - %CA_ECD - European Common Descrambler (ECD) hardware; + * - %CA_NDS - Videoguard (NDS) hardware; + * - %CA_DSS - Distributed Sample Scrambling (DSS) hardware. + */ struct ca_descr_info { - unsigned int num; /* number of available descramblers (keys) */ - unsigned int type; /* type of supported scrambling system */ + unsigned int num; + unsigned int type; #define CA_ECD 1 #define CA_NDS 2 #define CA_DSS 4 }; +/** + * struct ca_caps - CA slot interface capabilities. + * + * @slot_num: total number of CA card and module slots. + * @slot_type: bitmap with all supported types as defined at + * &struct ca_slot_info (e. g. %CA_CI, %CA_CI_LINK, etc). + * @descr_num: total number of descrambler slots (keys) + * @descr_type: bitmap with all supported types as defined at + * &struct ca_descr_info (e. g. %CA_ECD, %CA_NDS, etc). + */ struct ca_caps { - unsigned int slot_num; /* total number of CA card and module slots */ - unsigned int slot_type; /* OR of all supported types */ - unsigned int descr_num; /* total number of descrambler slots (keys) */ - unsigned int descr_type; /* OR of all supported types */ + unsigned int slot_num; + unsigned int slot_type; + unsigned int descr_num; + unsigned int descr_type; }; /* a message to/from a CI-CAM */ -- cgit v1.2.3 From 5176d6eefd5d58fbb787f96c2140cffb2e826b17 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Sep 2017 15:05:28 -0400 Subject: media: frontend.h: Avoid the term DVB when doesn't refer to a delivery system The DVB term can either refer to the subsystem or to a delivery system. Avoid it in the first case at the kernel-doc markups. Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/frontend.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index e7c29d0bdee4..fc2edb6014fe 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -239,11 +239,11 @@ enum fe_sec_mini_cmd { * @FE_NONE: The frontend doesn't have any kind of lock. * That's the initial frontend status * @FE_HAS_SIGNAL: Has found something above the noise level. - * @FE_HAS_CARRIER: Has found a DVB signal. + * @FE_HAS_CARRIER: Has found a signal. * @FE_HAS_VITERBI: FEC inner coding (Viterbi, LDPC or other inner code). * is stable. * @FE_HAS_SYNC: Synchronization bytes was found. - * @FE_HAS_LOCK: DVB were locked and everything is working. + * @FE_HAS_LOCK: Digital TV were locked and everything is working. * @FE_TIMEDOUT: Fo lock within the last about 2 seconds. 
* @FE_REINIT: Frontend was reinitialized, application is recommended * to reset DiSEqC, tone and parameters. @@ -269,7 +269,7 @@ enum fe_status { * This parameter indicates if spectral inversion should be presumed or * not. In the automatic setting (``INVERSION_AUTO``) the hardware will try * to figure out the correct setting by itself. If the hardware doesn't - * support, the DVB core will try to lock at the carrier first with + * support, the %dvb_frontend will try to lock at the carrier first with * inversion off. If it fails, it will try to enable inversion. */ enum fe_spectral_inversion { -- cgit v1.2.3 From 56d51b65bcc7a5780663abd579fb6f039616b347 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 1 Sep 2017 15:45:47 -0400 Subject: media: net.h: add kernel-doc and use it at Documentation/ As we did with frontend.h, ca.h and dmx.h, move the struct definition to net.h. That should help to keep it updated, as more stuff gets added there. Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/dvb/net-add-if.rst | 34 ----------------------------- Documentation/media/uapi/dvb/net-types.rst | 9 ++++++++ Documentation/media/uapi/dvb/net.rst | 1 + include/uapi/linux/dvb/net.h | 15 +++++++++++++ 4 files changed, 25 insertions(+), 34 deletions(-) create mode 100644 Documentation/media/uapi/dvb/net-types.rst (limited to 'include') diff --git a/Documentation/media/uapi/dvb/net-add-if.rst b/Documentation/media/uapi/dvb/net-add-if.rst index 1087efb9baa0..6749b70246c5 100644 --- a/Documentation/media/uapi/dvb/net-add-if.rst +++ b/Documentation/media/uapi/dvb/net-add-if.rst @@ -41,40 +41,6 @@ created. The struct :c:type:`dvb_net_if`::ifnum field will be filled with the number of the created interface. -.. c:type:: dvb_net_if - -.. flat-table:: struct dvb_net_if - :header-rows: 1 - :stub-columns: 0 - - - - .. row 1 - - - ID - - - Description - - - .. row 2 - - - pid - - - Packet ID (PID) of the MPEG-TS that contains data - - - .. row 3 - - - ifnum - - - number of the Digital TV interface. - - - .. row 4 - - - feedtype - - - Encapsulation type of the feed. It can be: - ``DVB_NET_FEEDTYPE_MPE`` for MPE encoding or - ``DVB_NET_FEEDTYPE_ULE`` for ULE encoding. - - Return Value ============ diff --git a/Documentation/media/uapi/dvb/net-types.rst b/Documentation/media/uapi/dvb/net-types.rst new file mode 100644 index 000000000000..e1177bdcd623 --- /dev/null +++ b/Documentation/media/uapi/dvb/net-types.rst @@ -0,0 +1,9 @@ +.. -*- coding: utf-8; mode: rst -*- + +.. _dmx_types: + +************** +Net Data Types +************** + +.. kernel-doc:: include/uapi/linux/dvb/net.h diff --git a/Documentation/media/uapi/dvb/net.rst b/Documentation/media/uapi/dvb/net.rst index fdb5301a4b1f..e0cd4e402627 100644 --- a/Documentation/media/uapi/dvb/net.rst +++ b/Documentation/media/uapi/dvb/net.rst @@ -35,6 +35,7 @@ Digital TV net Function Calls .. toctree:: :maxdepth: 1 + net-types net-add-if net-remove-if net-get-if diff --git a/include/uapi/linux/dvb/net.h b/include/uapi/linux/dvb/net.h index f451e7eb0b0b..89d805f9a5a6 100644 --- a/include/uapi/linux/dvb/net.h +++ b/include/uapi/linux/dvb/net.h @@ -26,6 +26,21 @@ #include +/** + * struct dvb_net_if - describes a DVB network interface + * + * @pid: Packet ID (PID) of the MPEG-TS that contains data + * @if_num: number of the Digital TV interface. + * @feedtype: Encapsulation type of the feed. + * + * A MPEG-TS stream may contain packet IDs with IP packages on it. + * This struct describes it, and the type of encoding. 
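+ *
+ * (Editorial note, not part of the original patch: MPE here is the DVB
+ * Multiprotocol Encapsulation of ETSI EN 301 192, and ULE is the
+ * Unidirectional Lightweight Encapsulation of RFC 4326.)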
+ * + * @feedtype can be: + * + * - %DVB_NET_FEEDTYPE_MPE for MPE encoding + * - %DVB_NET_FEEDTYPE_ULE for ULE encoding. + */ struct dvb_net_if { __u16 pid; __u16 if_num; -- cgit v1.2.3 From bd9049edc66e13e868f819c39844f60443e70817 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 3 Sep 2017 20:50:17 -0400 Subject: media: ca docs: document CA_SET_DESCR ioctl and structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The av7110 driver uses CA_SET_DESCR to store the descrambler control words at the CA descrambler slots. Document it. Thanks-to: Honza Petrouš Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/dvb/ca-set-descr.rst | 15 ++------------- include/uapi/linux/dvb/ca.h | 9 ++++++++- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/dvb/ca-set-descr.rst b/Documentation/media/uapi/dvb/ca-set-descr.rst index 9c484317d55c..a6c47205ffd8 100644 --- a/Documentation/media/uapi/dvb/ca-set-descr.rst +++ b/Documentation/media/uapi/dvb/ca-set-descr.rst @@ -28,22 +28,11 @@ Arguments ``msg`` Pointer to struct :c:type:`ca_descr`. -.. c:type:: ca_descr - -.. code-block:: c - - struct ca_descr { - unsigned int index; - unsigned int parity; - unsigned char cw[8]; - }; - - Description ----------- -.. note:: This ioctl is undocumented. Documentation is welcome. - +CA_SET_DESCR is used for feeding descrambler CA slots with descrambling +keys (refered as control words). Return Value ------------ diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h index 7ee641b4124c..c36fdb8e2733 100644 --- a/include/uapi/linux/dvb/ca.h +++ b/include/uapi/linux/dvb/ca.h @@ -109,9 +109,16 @@ struct ca_msg { unsigned char msg[256]; }; +/** + * struct ca_descr - CA descrambler control words info + * + * @index: CA Descrambler slot + * @parity: control words parity, where 0 means even and 1 means odd + * @cw: CA Descrambler control words + */ struct ca_descr { unsigned int index; - unsigned int parity; /* 0 == even, 1 == odd */ + unsigned int parity; unsigned char cw[8]; }; -- cgit v1.2.3 From 7e6854a9bfea9ed6553acd0204da5101c9a2e6a0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 4 Sep 2017 08:03:40 -0400 Subject: media: ca.h: document ca_msg and the corresponding ioctls Usually, CA messages are sent/received via reading/writing at the CA device node. However, two drivers (dst_ca and firedtv-ci) also implement it via ioctls. Apparently, on both cases, the net result is the same. Anyway, let's document it. Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/dvb/ca-get-msg.rst | 19 ++++++------------- Documentation/media/uapi/dvb/ca-send-msg.rst | 6 +++++- include/uapi/linux/dvb/ca.h | 11 ++++++++++- 3 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/Documentation/media/uapi/dvb/ca-get-msg.rst b/Documentation/media/uapi/dvb/ca-get-msg.rst index bdb116552068..ceeda623ce93 100644 --- a/Documentation/media/uapi/dvb/ca-get-msg.rst +++ b/Documentation/media/uapi/dvb/ca-get-msg.rst @@ -28,22 +28,15 @@ Arguments ``msg`` Pointer to struct :c:type:`ca_msg`. -.. c:type:: ca_msg - -.. code-block:: c - - /* a message to/from a CI-CAM */ - struct ca_msg { - unsigned int index; - unsigned int type; - unsigned int length; - unsigned char msg[256]; - }; - Description ----------- -.. note:: This ioctl is undocumented. Documentation is welcome. +Receives a message via a CI CA module. + +.. 
note:: + + Please notice that, on most drivers, this is done by reading from + the /dev/adapter?/ca? device node. Return Value diff --git a/Documentation/media/uapi/dvb/ca-send-msg.rst b/Documentation/media/uapi/dvb/ca-send-msg.rst index 644b6bda1aea..9e91287b7bbc 100644 --- a/Documentation/media/uapi/dvb/ca-send-msg.rst +++ b/Documentation/media/uapi/dvb/ca-send-msg.rst @@ -32,8 +32,12 @@ Arguments Description ----------- -.. note:: This ioctl is undocumented. Documentation is welcome. +Sends a message via a CI CA module. +.. note:: + + Please notice that, on most drivers, this is done by writing + to the /dev/adapter?/ca? device node. Return Value ------------ diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h index c36fdb8e2733..24fc38efbc2b 100644 --- a/include/uapi/linux/dvb/ca.h +++ b/include/uapi/linux/dvb/ca.h @@ -101,7 +101,16 @@ struct ca_caps { unsigned int descr_type; }; -/* a message to/from a CI-CAM */ +/** + * struct ca_msg - a message to/from a CI-CAM + * + * @index: unused + * @type: unused + * @length: length of the message + * @msg: message + * + * This struct carries a message to be send/received from a CI CA module. + */ struct ca_msg { unsigned int index; unsigned int type; -- cgit v1.2.3 From cd91304e7190b4c4802f8e413ab2214b233e0260 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Sep 2017 12:53:11 +0200 Subject: ovl: fix relatime for directories Need to treat non-regular overlayfs files the same as regular files when checking for an atime update. Add a d_real() flag to make it return the upper dentry for all file types. Reported-by: "zhangyi (F)" Signed-off-by: Miklos Szeredi --- fs/inode.c | 21 +++++++++++++++++---- fs/overlayfs/super.c | 3 +++ include/linux/dcache.h | 3 +++ 3 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/inode.c b/fs/inode.c index 50370599e371..eed15033e36b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1569,11 +1569,24 @@ EXPORT_SYMBOL(bmap); static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, bool rcu) { - if (!rcu) { - struct inode *realinode = d_real_inode(dentry); + struct dentry *upperdentry; - if (unlikely(inode != realinode) && - (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + /* + * Nothing to do if in rcu or if non-overlayfs + */ + if (rcu || likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return; + + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + + /* + * If file is on lower then we can't update atime, so no worries about + * stale mtime/ctime. 
+ */ + if (upperdentry) { + struct inode *realinode = d_inode(upperdentry); + + if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { inode->i_mtime = realinode->i_mtime; inode->i_ctime = realinode->i_ctime; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 19e89ce39017..cd49c0298ddf 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -75,6 +75,9 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct dentry *real; int err; + if (flags & D_REAL_UPPER) + return ovl_dentry_upper(dentry); + if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) return dentry; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index fd0721e520f4..ed1a7cf6923a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -562,6 +562,9 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) return upper; } +/* d_real() flags */ +#define D_REAL_UPPER 0x2 /* return upper dentry or NULL if non-upper */ + /** * d_real - Return the real dentry * @dentry: the dentry to query -- cgit v1.2.3 From e4faa09b0dae4f8f429922190e9aa99a564ff785 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 5 Sep 2017 07:02:44 -0400 Subject: media: dvb headers: make checkpatch happier Adjust dvb ca.h, dmx.h and frontend.h in order to make checkpatch happier. Now, it only complains about the typedefs, and those are there just to provide backward userspace compatibility. Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/ca.h | 2 +- include/uapi/linux/dvb/dmx.h | 5 ++--- include/uapi/linux/dvb/frontend.h | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/dvb/ca.h b/include/uapi/linux/dvb/ca.h index 24fc38efbc2b..cb150029fdff 100644 --- a/include/uapi/linux/dvb/ca.h +++ b/include/uapi/linux/dvb/ca.h @@ -139,7 +139,7 @@ struct ca_descr { #define CA_SEND_MSG _IOW('o', 133, struct ca_msg) #define CA_SET_DESCR _IOW('o', 134, struct ca_descr) -#if !defined (__KERNEL__) +#if !defined(__KERNEL__) /* This is needed for legacy userspace support */ typedef struct ca_slot_info ca_slot_info_t; diff --git a/include/uapi/linux/dvb/dmx.h b/include/uapi/linux/dvb/dmx.h index 4e3f3a2fe83f..4aa5f6a1815a 100644 --- a/include/uapi/linux/dvb/dmx.h +++ b/include/uapi/linux/dvb/dmx.h @@ -189,8 +189,7 @@ struct dmx_sct_filter_params { * @pes_type: Type of the pes filter, as specified by &enum dmx_pes_type. * @flags: Demux PES flags. 
*/ -struct dmx_pes_filter_params -{ +struct dmx_pes_filter_params { __u16 pid; enum dmx_input input; enum dmx_output output; @@ -221,7 +220,7 @@ struct dmx_stc { #define DMX_ADD_PID _IOW('o', 51, __u16) #define DMX_REMOVE_PID _IOW('o', 52, __u16) -#if !defined (__KERNEL__) +#if !defined(__KERNEL__) /* This is needed for legacy userspace support */ typedef enum dmx_output dmx_output_t; diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index fc2edb6014fe..861cacd5711f 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -907,7 +907,7 @@ struct dtv_properties { #define FE_SET_PROPERTY _IOW('o', 82, struct dtv_properties) #define FE_GET_PROPERTY _IOR('o', 83, struct dtv_properties) -#if defined(__DVB_CORE__) || !defined (__KERNEL__) +#if defined(__DVB_CORE__) || !defined(__KERNEL__) /* * DEPRECATED: Everything below is deprecated in favor of DVBv5 API @@ -982,8 +982,8 @@ struct dvb_ofdm_parameters { }; struct dvb_frontend_parameters { - __u32 frequency; /* (absolute) frequency in Hz for DVB-C/DVB-T/ATSC */ - /* intermediate frequency in kHz for DVB-S */ + __u32 frequency; /* (absolute) frequency in Hz for DVB-C/DVB-T/ATSC */ + /* intermediate frequency in kHz for DVB-S */ fe_spectral_inversion_t inversion; union { struct dvb_qpsk_parameters qpsk; /* DVB-S */ -- cgit v1.2.3 From 7760e223507b5218d0674b5e2e8dc29fea079a89 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 31 Aug 2017 16:59:06 -0700 Subject: scsi: Remove Scsi_Host.uspace_req_q A patch I wrote myself several years ago removed SCSI target support from the code under drivers/scsi. That patch removed the code that sets uspace_req_q to a non-NULL value. Hence also remove the code that depends on uspace_req_q != NULL. References: commit 066465251303 ("tgt: removal") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- drivers/scsi/hosts.c | 8 -------- include/scsi/scsi_host.h | 6 ------ 2 files changed, 14 deletions(-) (limited to 'include') diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 831a1c8b9f89..fe3a0da3ec97 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -315,8 +315,6 @@ static void scsi_host_dev_release(struct device *dev) { struct Scsi_Host *shost = dev_to_shost(dev); struct device *parent = dev->parent; - struct request_queue *q; - void *queuedata; scsi_proc_hostdir_rm(shost->hostt); @@ -326,12 +324,6 @@ static void scsi_host_dev_release(struct device *dev) kthread_stop(shost->ehandler); if (shost->work_q) destroy_workqueue(shost->work_q); - q = shost->uspace_req_q; - if (q) { - queuedata = q->queuedata; - blk_cleanup_queue(q); - kfree(queuedata); - } if (shost->shost_state == SHOST_CREATED) { /* diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index afb04811b7b9..0a804b1a4726 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -691,12 +691,6 @@ struct Scsi_Host { unsigned int prot_capabilities; unsigned char prot_guard_type; - /* - * q used for scsi_tgt msgs, async events or any other requests that - * need to be processed in userspace - */ - struct request_queue *uspace_req_q; - /* legacy crap */ unsigned long base; unsigned long io_port; -- cgit v1.2.3 From fbf1c41fc0f4d3574ac2377245efd666c1fa3075 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 3 Sep 2017 01:18:41 +0100 Subject: workqueue: Fix flag collision Commit 0a94efb5acbb ("workqueue: implicit ordered attribute should be overridable") introduced a __WQ_ORDERED_EXPLICIT flag but gave it the same value as __WQ_LEGACY. I don't believe these were intended to mean the same thing, so renumber __WQ_ORDERED_EXPLICIT. Fixes: 0a94efb5acbb ("workqueue: implicit ordered attribute should be ...") Signed-off-by: Ben Hutchings Cc: stable@vger.kernel.org # v4.13 Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index db6dc9dc0482..1c49431f3121 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -323,8 +323,8 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ - __WQ_ORDERED_EXPLICIT = 1 << 18, /* internal: alloc_ordered_workqueue() */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ + __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ -- cgit v1.2.3 From fd0c88b70060e03a2215259ed29b4b31497ad205 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:05:47 +0200 Subject: net/ncsi: fix ncsi_vlan_rx_{add,kill}_vid references We get a new link error in allmodconfig kernels after ftgmac100 started using the ncsi helpers: ERROR: "ncsi_vlan_rx_kill_vid" [drivers/net/ethernet/faraday/ftgmac100.ko] undefined! ERROR: "ncsi_vlan_rx_add_vid" [drivers/net/ethernet/faraday/ftgmac100.ko] undefined! Related to that, we get another error when CONFIG_NET_NCSI is disabled: drivers/net/ethernet/faraday/ftgmac100.c:1626:25: error: 'ncsi_vlan_rx_add_vid' undeclared here (not in a function); did you mean 'ncsi_start_dev'? drivers/net/ethernet/faraday/ftgmac100.c:1627:26: error: 'ncsi_vlan_rx_kill_vid' undeclared here (not in a function); did you mean 'ncsi_vlan_rx_add_vid'? 
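Both errors share a single root cause: ncsi_vlan_rx_add_vid() and
ncsi_vlan_rx_kill_vid() are only built when CONFIG_NET_NCSI is enabled, and
even then they were not exported, so modular callers fail to link and
!CONFIG_NET_NCSI builds see no declaration at all.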
This fixes both problems at once, using a 'static inline' stub helper for the disabled case, and exporting the functions when they are present. Fixes: 51564585d8c6 ("ftgmac100: Support NCSI VLAN filtering when available") Fixes: 21acf63013ed ("net/ncsi: Configure VLAN tag filter") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/net/ncsi.h | 10 ++++++++++ net/ncsi/ncsi-manage.c | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/ncsi.h b/include/net/ncsi.h index 1f96af46df49..fdc60ff2511d 100644 --- a/include/net/ncsi.h +++ b/include/net/ncsi.h @@ -36,6 +36,16 @@ int ncsi_start_dev(struct ncsi_dev *nd); void ncsi_stop_dev(struct ncsi_dev *nd); void ncsi_unregister_dev(struct ncsi_dev *nd); #else /* !CONFIG_NET_NCSI */ +static inline int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + return -EINVAL; +} + +static inline int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + return -EINVAL; +} + static inline struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*notifier)(struct ncsi_dev *nd)) { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 11904b3b702d..3fd3c39e6278 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1453,6 +1453,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) return found ? ncsi_process_next_channel(ndp) : 0; } +EXPORT_SYMBOL_GPL(ncsi_vlan_rx_add_vid); int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { @@ -1491,6 +1492,7 @@ int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) return found ? ncsi_process_next_channel(ndp) : 0; } +EXPORT_SYMBOL_GPL(ncsi_vlan_rx_kill_vid); struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*handler)(struct ncsi_dev *ndev)) -- cgit v1.2.3 From 2c08ab3f2504bc7ba816ce6fde051b8bd5f028e4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:31:35 +0200 Subject: soc: ti/knav_dma: include dmaengine header A header file cleanup apparently caused a build regression with one driver using the knav infrastructure: In file included from drivers/net/ethernet/ti/netcp_core.c:30:0: include/linux/soc/ti/knav_dma.h:129:30: error: field 'direction' has incomplete type enum dma_transfer_direction direction; ^~~~~~~~~ drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_txpipe_open': drivers/net/ethernet/ti/netcp_core.c:1349:21: error: 'DMA_MEM_TO_DEV' undeclared (first use in this function); did you mean 'DMA_MEMORY_MAP'? config.direction = DMA_MEM_TO_DEV; ^~~~~~~~~~~~~~ DMA_MEMORY_MAP drivers/net/ethernet/ti/netcp_core.c:1349:21: note: each undeclared identifier is reported only once for each function it appears in drivers/net/ethernet/ti/netcp_core.c: In function 'netcp_setup_navigator_resources': drivers/net/ethernet/ti/netcp_core.c:1659:22: error: 'DMA_DEV_TO_MEM' undeclared (first use in this function); did you mean 'DMA_DESC_HOST'? config.direction = DMA_DEV_TO_MEM; As the header is no longer included implicitly through netdevice.h, we should include it in the header that references the enum. Fixes: 0dd5759dbb1c ("net: remove dmaengine.h inclusion from netdevice.h") Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- include/linux/soc/ti/knav_dma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soc/ti/knav_dma.h b/include/linux/soc/ti/knav_dma.h index 2b7882666ef6..66693bc4c6ad 100644 --- a/include/linux/soc/ti/knav_dma.h +++ b/include/linux/soc/ti/knav_dma.h @@ -17,6 +17,8 @@ #ifndef __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__ #define __SOC_TI_KEYSTONE_NAVIGATOR_DMA_H__ +#include + /* * PKTDMA descriptor manipulation macros for host packet descriptor */ -- cgit v1.2.3 From 3a1214e8b06317b4e71cd3a36344df87b7858e19 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 Sep 2017 14:04:11 -0700 Subject: flow_dissector: Cleanup control flow __skb_flow_dissect is riddled with gotos that make discerning the flow, debugging, and extending the capability difficult. This patch reorganizes things so that we only perform goto's after the two main switch statements (no gotos within the cases now). It also eliminates several goto labels so that there are only two labels that can be target for goto. Reported-by: Alexander Popov Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 8 ++ net/core/flow_dissector.c | 223 ++++++++++++++++++++++++++++--------------- 2 files changed, 153 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index e2663e900b0a..fc3dce730a6b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -19,6 +19,14 @@ struct flow_dissector_key_control { #define FLOW_DIS_FIRST_FRAG BIT(1) #define FLOW_DIS_ENCAPSULATION BIT(2) +enum flow_dissect_ret { + FLOW_DISSECT_RET_OUT_GOOD, + FLOW_DISSECT_RET_OUT_BAD, + FLOW_DISSECT_RET_PROTO_AGAIN, + FLOW_DISSECT_RET_IPPROTO_AGAIN, + FLOW_DISSECT_RET_CONTINUE, +}; + /** * struct flow_dissector_key_basic: * @thoff: Transport header offset diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index e2eaa1ff948d..e0ea17d1c7fc 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -115,12 +115,6 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); -enum flow_dissect_ret { - FLOW_DISSECT_RET_OUT_GOOD, - FLOW_DISSECT_RET_OUT_BAD, - FLOW_DISSECT_RET_OUT_PROTO_AGAIN, -}; - static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -341,7 +335,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; - return FLOW_DISSECT_RET_OUT_PROTO_AGAIN; + return FLOW_DISSECT_RET_PROTO_AGAIN; } static void @@ -431,6 +425,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; + enum flow_dissect_ret fdret; bool skip_vlan = false; u8 ip_proto = 0; bool ret; @@ -482,14 +477,19 @@ bool __skb_flow_dissect(const struct sk_buff *skb, } proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; -ip: + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); - if (!iph || iph->ihl < 5) - goto out_bad; + if (!iph || iph->ihl < 5) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + nhoff += iph->ihl * 4; ip_proto = iph->protocol; @@ -509,19 +509,25 @@ ip: key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { - goto out_good; + 
fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) - goto out_good; + if (!(flags & + FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } } } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - goto out_good; + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } break; } @@ -529,10 +535,11 @@ ip: const struct ipv6hdr *iph; struct ipv6hdr _iph; -ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); - if (!iph) - goto out_bad; + if (!iph) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); @@ -561,15 +568,17 @@ ipv6: target_container); key_tags->flow_label = ntohl(flow_label); } - if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) - goto out_good; + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - goto out_good; + fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } @@ -585,12 +594,17 @@ ipv6: if (!vlan_tag_present || eth_type_vlan(skb->protocol)) { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); - if (!vlan) - goto out_bad; + if (!vlan) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); - if (skip_vlan) - goto proto_again; + if (skip_vlan) { + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } } skip_vlan = true; @@ -613,7 +627,8 @@ ipv6: } } - goto proto_again; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; } case htons(ETH_P_PPP_SES): { struct { @@ -621,18 +636,27 @@ ipv6: __be16 proto; } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); - if (!hdr) - goto out_bad; + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { case htons(PPP_IP): - goto ip; + proto = htons(ETH_P_IP); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; case htons(PPP_IPV6): - goto ipv6; + proto = htons(ETH_P_IPV6); + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; default: - goto out_bad; + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; } + break; } case htons(ETH_P_TIPC): { struct { @@ -640,8 +664,10 @@ ipv6: __be32 srcnode; } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); - if (!hdr) - goto out_bad; + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { @@ -651,56 +677,62 @@ ipv6: key_addrs->tipcaddrs.srcnode = hdr->srcnode; key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; } - goto out_good; + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): -mpls: - switch (__skb_flow_dissect_mpls(skb, flow_dissector, + fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, - nhoff, hlen)) { - case FLOW_DISSECT_RET_OUT_GOOD: - goto out_good; - case FLOW_DISSECT_RET_OUT_BAD: - default: - goto out_bad; - } + nhoff, hlen); + break; case htons(ETH_P_FCOE): - if ((hlen - nhoff) < FCOE_HEADER_LEN) - goto out_bad; + if ((hlen - nhoff) < FCOE_HEADER_LEN) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } nhoff += FCOE_HEADER_LEN; - goto out_good; + fdret = 
FLOW_DISSECT_RET_OUT_GOOD; + break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): - switch (__skb_flow_dissect_arp(skb, flow_dissector, + fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, - nhoff, hlen)) { - case FLOW_DISSECT_RET_OUT_GOOD: - goto out_good; - case FLOW_DISSECT_RET_OUT_BAD: - default: - goto out_bad; - } + nhoff, hlen); + break; + + default: + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + /* Process result of proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_PROTO_AGAIN: + goto proto_again; + case FLOW_DISSECT_RET_CONTINUE: + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + break; + case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + switch (ip_proto) { case IPPROTO_GRE: - switch (__skb_flow_dissect_gre(skb, key_control, flow_dissector, + fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, - &proto, &nhoff, &hlen, flags)) { - case FLOW_DISSECT_RET_OUT_GOOD: - goto out_good; - case FLOW_DISSECT_RET_OUT_BAD: - goto out_bad; - case FLOW_DISSECT_RET_OUT_PROTO_AGAIN: - goto proto_again; - } + &proto, &nhoff, &hlen, flags); + break; + case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { @@ -711,13 +743,16 @@ ip_proto_again: opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); - if (!opthdr) - goto out_bad; + if (!opthdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; - goto ip_proto_again; + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; @@ -728,8 +763,10 @@ ip_proto_again: fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); - if (!fh) - goto out_bad; + if (!fh) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } key_control->flags |= FLOW_DIS_IS_FRAGMENT; @@ -738,34 +775,50 @@ ip_proto_again: if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) - goto ip_proto_again; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; + break; + } } - goto out_good; + + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; } case IPPROTO_IPIP: proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; - if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) - goto out_good; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; - goto ip; case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; - if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) - goto out_good; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + - goto ipv6; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); - goto mpls; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; + default: break; } @@ -787,6 +840,20 @@ ip_proto_again: key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen); } + /* Process result of IP proto processing */ + switch (fdret) { + case FLOW_DISSECT_RET_PROTO_AGAIN: + goto proto_again; + case FLOW_DISSECT_RET_IPPROTO_AGAIN: + goto ip_proto_again; + case FLOW_DISSECT_RET_OUT_GOOD: + case FLOW_DISSECT_RET_CONTINUE: + break; + case 
FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; + } + out_good: ret = true; -- cgit v1.2.3
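The shape of that conversion is easier to see in miniature. In the sketch below (illustrative only; parse_ipv4() is a hypothetical stand-in for the real per-protocol cases, and the function boundaries are simplified), every protocol case records a verdict in fdret, and a single dispatcher after the switch owns all of the control transfers:

static bool dissect_sketch(__be16 proto)
{
	enum flow_dissect_ret fdret;

proto_again:
	fdret = FLOW_DISSECT_RET_CONTINUE;

	switch (proto) {
	case htons(ETH_P_IP):
		fdret = parse_ipv4(&proto);	/* hypothetical helper */
		break;
	default:
		fdret = FLOW_DISSECT_RET_OUT_BAD;
		break;
	}

	/* one place decides where control goes next */
	switch (fdret) {
	case FLOW_DISSECT_RET_PROTO_AGAIN:
		goto proto_again;
	case FLOW_DISSECT_RET_OUT_GOOD:
	case FLOW_DISSECT_RET_CONTINUE:
		return true;
	case FLOW_DISSECT_RET_OUT_BAD:
	default:
		return false;
	}
}

Centralizing the dispatch is also what lets helpers such as __skb_flow_dissect_mpls() and __skb_flow_dissect_arp() feed their return value straight into fdret, instead of every call site open-coding its own switch over the result.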
From 55199df6d2af55be414f40856efcb527811001bb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 3 Sep 2017 20:27:00 -0700 Subject: net: dsa: Allow switch drivers to indicate number of TX queues Let switch drivers indicate how many TX queues they support. Some switches, such as the Broadcom Starfighter 2, are designed with 8 egress queues. Future changes will allow us to leverage the queue mapping and direct the transmission towards a particular queue. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/slave.c | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 398ca8d70ccd..dd44d6ce1097 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -243,6 +243,9 @@ struct dsa_switch { /* devlink used to represent this switch device */ struct devlink *devlink; + /* Number of switch port queues */ + unsigned int num_tx_queues; + /* Dynamically allocated ports, keep last */ size_t num_ports; struct dsa_port ports[]; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 78e78a6e6833..2afa99506f8b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1259,8 +1259,12 @@ int dsa_slave_create(struct dsa_port *port, const char *name) cpu_dp = ds->dst->cpu_dp; master = cpu_dp->netdev; - slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name, - NET_NAME_UNKNOWN, ether_setup); + if (!ds->num_tx_queues) + ds->num_tx_queues = 1; + + slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name, + NET_NAME_UNKNOWN, ether_setup, + ds->num_tx_queues, 1); if (slave_dev == NULL) return -ENOMEM; -- cgit v1.2.3 From 0062818298662d0d05061949d12880146b5ebd65 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Aug 2017 15:06:14 -0400 Subject: rdma core: Add rdma_rw_mr_factor() The amount of payload per MR depends on device capabilities and the memory registration mode in use. The new rdma_rw API hides both, making it difficult for ULPs to determine how large their transport send queues need to be. Expose the MR payload information via a new API. Signed-off-by: Chuck Lever Acked-by: Doug Ledford Signed-off-by: J. Bruce Fields --- drivers/infiniband/core/rw.c | 24 ++++++++++++++++++++++++ include/rdma/rw.h | 2 ++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index dbfd854c32c9..6ca607e8e293 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -643,6 +643,30 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); +/** + * rdma_rw_mr_factor - return number of MRs required for a payload + * @device: device handling the connection + * @port_num: port number to which the connection is bound + * @maxpages: maximum payload pages per rdma_rw_ctx + * + * Returns the number of MRs the device requires to move @maxpages + * pages of payload. The returned value is used during transport creation to + * compute max_rdma_ctxts and the size of the transport's Send and + * Send Completion Queues. + */ +unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, + unsigned int maxpages) +{ + unsigned int mr_pages; + + if (rdma_rw_can_use_mr(device, port_num)) + mr_pages = rdma_rw_fr_page_list_len(device); + else + mr_pages = device->attrs.max_sge_rd; + return DIV_ROUND_UP(maxpages, mr_pages); +} +EXPORT_SYMBOL(rdma_rw_mr_factor); + void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) { u32 factor; diff --git a/include/rdma/rw.h b/include/rdma/rw.h index 377d865e506d..a3cbbc7b6417 100644 --- a/include/rdma/rw.h +++ b/include/rdma/rw.h @@ -81,6 +81,8 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr); +unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, + unsigned int maxpages); void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); void rdma_rw_cleanup_mrs(struct ib_qp *qp); -- cgit v1.2.3
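For context, this is how a ULP might translate a per-request page budget into a send queue depth using the new call. A minimal sketch only; ULP_MAX_PAGES, max_requests, and the one-extra-WR-per-request policy are illustrative assumptions, not part of the patch:

static void ulp_size_send_queue(struct ib_device *device, u8 port_num,
				struct ib_qp_init_attr *attr,
				unsigned int max_requests)
{
	unsigned int ctxts;

	/* MRs needed to move up to ULP_MAX_PAGES pages per rdma_rw_ctx */
	ctxts = rdma_rw_mr_factor(device, port_num, ULP_MAX_PAGES);

	/* one rdma_rw_ctx per in-flight request, plus one WR for the reply */
	attr->cap.max_send_wr = max_requests * (ctxts + 1);
}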
From 5482a978b962abd23f17a004e46d255d11646c20 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 4 Sep 2017 18:30:14 +0200 Subject: net: mdio-mux: add mdio_mux parameter to mdio_mux_init() mdio_mux_init() uses the parameter dev for two distinct things: 1) to have a device for all devm_ functions, and 2) to get a device_node from it. Since these are two distinct purposes, this patch adds a parameter, mux_node, dedicated to the second task. This also permits registering an mdio-mux of_node that lacks a direct owning device, for example an mdio-mux which is a subnode of a real device. Signed-off-by: Corentin Labbe Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mdio-mux-bcm-iproc.c | 2 +- drivers/net/phy/mdio-mux-gpio.c | 2 +- drivers/net/phy/mdio-mux-mmioreg.c | 3 ++- drivers/net/phy/mdio-mux.c | 7 ++++--- include/linux/mdio-mux.h | 9 +++++++++ 5 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/mdio-mux-bcm-iproc.c b/drivers/net/phy/mdio-mux-bcm-iproc.c index 0a5f62e0efcc..0831b7142df7 100644 --- a/drivers/net/phy/mdio-mux-bcm-iproc.c +++ b/drivers/net/phy/mdio-mux-bcm-iproc.c @@ -199,7 +199,7 @@ static int mdio_mux_iproc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, md); - rc = mdio_mux_init(md->dev, mdio_mux_iproc_switch_fn, + rc = mdio_mux_init(md->dev, md->dev->of_node, mdio_mux_iproc_switch_fn, &md->mux_handle, md, md->mii_bus); if (rc) { dev_info(md->dev, "mdiomux initialization failed\n"); diff --git a/drivers/net/phy/mdio-mux-gpio.c b/drivers/net/phy/mdio-mux-gpio.c index 919949960a10..082ffef0dec4 100644 --- a/drivers/net/phy/mdio-mux-gpio.c +++ b/drivers/net/phy/mdio-mux-gpio.c @@ -54,7 +54,7 @@ static int mdio_mux_gpio_probe(struct platform_device *pdev) if (IS_ERR(s->gpios)) return PTR_ERR(s->gpios); - r = mdio_mux_init(&pdev->dev, + r = mdio_mux_init(&pdev->dev, pdev->dev.of_node, mdio_mux_gpio_switch_fn, &s->mux_handle, s, NULL); if (r != 0) { diff --git a/drivers/net/phy/mdio-mux-mmioreg.c b/drivers/net/phy/mdio-mux-mmioreg.c index c3825c7da038..2573ab012f16 100644 --- a/drivers/net/phy/mdio-mux-mmioreg.c +++ b/drivers/net/phy/mdio-mux-mmioreg.c @@ -159,7 +159,8 @@ static int mdio_mux_mmioreg_probe(struct platform_device *pdev) } } - ret = mdio_mux_init(&pdev->dev, mdio_mux_mmioreg_switch_fn, + ret = mdio_mux_init(&pdev->dev, pdev->dev.of_node, + mdio_mux_mmioreg_switch_fn, &s->mux_handle, s, NULL); if (ret) { dev_err(&pdev->dev, "failed to register mdio-mux bus %pOF\n", diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c index 6f75e9f27fed..0a86f1e4c02f 100644 --- a/drivers/net/phy/mdio-mux.c +++ b/drivers/net/phy/mdio-mux.c @@ -86,6 +86,7 @@ out: static int parent_count; int mdio_mux_init(struct device *dev, + struct device_node *mux_node, int (*switch_fn)(int cur, int desired, void *data), void **mux_handle, void *data, @@ -98,11 +99,11 @@ int mdio_mux_init(struct device *dev, struct mdio_mux_parent_bus *pb; struct mdio_mux_child_bus *cb; - if (!dev->of_node) + if (!mux_node) return -ENODEV; if (!mux_bus) { - parent_bus_node = of_parse_phandle(dev->of_node, + parent_bus_node = of_parse_phandle(mux_node, "mdio-parent-bus", 0); if (!parent_bus_node) @@ -132,7 +133,7 @@ int mdio_mux_init(struct device *dev, pb->mii_bus = parent_bus; ret_val = -ENODEV; - for_each_available_child_of_node(dev->of_node, child_bus_node) { + for_each_available_child_of_node(mux_node, child_bus_node) { int v; r = of_property_read_u32(child_bus_node, "reg", &v); diff --git a/include/linux/mdio-mux.h b/include/linux/mdio-mux.h index 61f5b21b31c7..a5d58f221939 100644 --- a/include/linux/mdio-mux.h +++ b/include/linux/mdio-mux.h @@ -12,7 +12,16 @@ #include #include +/* mdio_mux_init() - Initialize an MDIO mux + * @dev The device owning the MDIO mux + * @mux_node The device node of the MDIO mux + * @switch_fn The function called to switch to the desired MDIO child bus + * @mux_handle A pointer to a (void *) used internally by mdio-mux + * @data Private data used by switch_fn() + * @mux_bus An optional parent bus (otherwise the "mdio-parent-bus" property is used) + */ int mdio_mux_init(struct device *dev, + struct device_node *mux_node, int (*switch_fn) (int cur, int desired, void *data), void **mux_handle, void *data, -- cgit v1.2.3
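The decoupling matters for callers whose mux node is not their own of_node. A hedged sketch of such a probe follows; the "mdio-mux" subnode name, my_switch_fn, and struct my_mux are illustrative assumptions, not taken from the patch:

static int my_mux_probe(struct platform_device *pdev)
{
	struct device_node *mux_node;
	struct my_mux *s;
	int ret;

	s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	/* the mux is described by a subnode, not by pdev->dev.of_node itself */
	mux_node = of_get_child_by_name(pdev->dev.of_node, "mdio-mux");
	if (!mux_node)
		return -ENODEV;

	ret = mdio_mux_init(&pdev->dev, mux_node, my_switch_fn,
			    &s->mux_handle, s, NULL);
	of_node_put(mux_node);
	return ret;
}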
From 03c6f7d64ac9c0a37cca91392ac4be8993a8f53d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 16 Aug 2017 12:47:27 +1000 Subject: NFS: remove jiffies field from access cache This field hasn't been used since commit 57b691819ee2 ("NFS: Cache access checks more aggressively"). Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ---- fs/nfs/nfs4proc.c | 1 - include/linux/nfs_fs.h | 1 - 3 files changed, 6 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3522b1249019..5ceaeb1f6fb6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2260,7 +2260,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str spin_lock(&inode->i_lock); retry = false; } - res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); @@ -2296,7 +2295,6 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; - res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; err = 0; @@ -2344,7 +2342,6 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->jiffies = set->jiffies; cache->cred = get_rpccred(set->cred); cache->mask = set->mask; @@ -2432,7 +2429,6 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE | NFS_MAY_WRITE | NFS_MAY_READ; cache.cred = cred; - cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { if (status == -ESTALE) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 08cc97488904..6c61e2b99635 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2285,7 +2285,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, mask = NFS4_ACCESS_READ; cache.cred = cred; - cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 238fdc4c46df..a0282ceaa48b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -49,7 +49,6 @@ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; - unsigned long jiffies; struct rpc_cred * cred; __u32 mask; struct rcu_head rcu_head; -- cgit v1.2.3 From 2ae409dc6a907e80f4cd32ad4482ef52441e3147 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 11 Jul 2017 16:20:05 +0800 Subject: ceph: remove unused cap_release_safety mount option Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 6 ------ fs/ceph/super.h | 1 - include/linux/ceph/libceph.h | 1 - 3 files changed, 8 deletions(-) (limited to 'include') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index aa06a8c24792..280311e36a17 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -113,7 +113,6 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, - Opt_cap_release_safety, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -152,7 +151,6 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -402,7 +400,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = 
CEPH_CAP_RELEASE_SAFETY_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); @@ -520,9 +517,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) seq_printf(m, ",caps_wanted_delay_max=%d", fsopt->caps_wanted_delay_max); - if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - fsopt->cap_release_safety); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f02a2225fe42..da036d01933e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -61,7 +61,6 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; - int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8a79587e1317..dca30ac9bd34 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -93,7 +93,6 @@ struct ceph_options { #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) /* mount state */ enum { -- cgit v1.2.3 From 4214fb158cc423ac31b841000e219855be055388 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 11 Jul 2017 18:49:44 +0800 Subject: ceph: validate correctness of some mount options Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 21 ++++++++++++++------- fs/ceph/super.h | 9 +++++++++ include/linux/ceph/libceph.h | 10 ---------- 3 files changed, 23 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index caf9801712ca..1deb8810d7c7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -243,21 +243,33 @@ static int parse_fsopt_token(char *c, void *private) fsopt->rsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rasize: - fsopt->rasize = intval; + if (intval < 0) + return -EINVAL; + fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_min = intval; break; case Opt_caps_wanted_delay_max: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; case Opt_readdir_max_entries: + if (intval < 1) + return -EINVAL; fsopt->max_readdir = intval; break; case Opt_readdir_max_bytes: + if (intval < PAGE_SIZE && intval != 0) + return -EINVAL; fsopt->max_readdir_bytes = intval; break; case Opt_congestion_kb: + if (intval < 1024) /* at least 1M */ + return -EINVAL; fsopt->congestion_kb = intval; break; case Opt_dirstat: @@ -946,12 +958,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return err; /* set ra_pages based on rasize mount option? 
*/ - if (fsc->mount_options->rasize >= PAGE_SIZE) - sb->s_bdi->ra_pages = - (fsc->mount_options->rasize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; /* set io_pages based on max osd read size */ sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index eed2a67d8e52..279a2f401cf5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -56,6 +56,15 @@ #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + struct ceph_mount_options { int flags; int sb_flags; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index dca30ac9bd34..4c846aabd9f6 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -84,16 +84,6 @@ struct ceph_options { #define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. - */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - - /* mount state */ enum { CEPH_MOUNT_MOUNTING, -- cgit v1.2.3 From 3fb99d483e614bc3834784c7a686572c7970bb92 Mon Sep 17 00:00:00 2001 From: Yanhu Cao Date: Fri, 21 Jul 2017 17:20:10 +0800 Subject: ceph: nuke startsync op startsync is a no-op, has been for years. Remove it. Link: http://tracker.ceph.com/issues/20604 Signed-off-by: Yanhu Cao Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 21 +++------------------ fs/ceph/file.c | 5 +---- include/linux/ceph/rados.h | 1 - net/ceph/osd_client.c | 5 ----- 4 files changed, 4 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 149b10063be8..825931516623 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -752,21 +752,11 @@ static int ceph_writepages_start(struct address_space *mapping, int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; - int do_sync = 0; loff_t snap_size, i_size; u64 truncate_size; u32 truncate_seq; - /* - * Include a 'sync' in the OSD request if this is a data - * integrity write (e.g., O_SYNC write or fsync()), or if our - * cap is being revoked. - */ - if ((wbc->sync_mode == WB_SYNC_ALL) || - ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) - do_sync = 1; - dout("writepages_start %p dosync=%d (mode=%s)\n", - inode, do_sync, + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? 
"ALL" : "HOLD")); @@ -936,7 +926,7 @@ get_more_pages: break; } - num_ops = 1 + do_sync; + num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); @@ -1042,7 +1032,7 @@ new_request: for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); if (offset + len != cur_offset) { - if (op_idx + do_sync + 1 == req->r_num_ops) + if (op_idx + 1 == req->r_num_ops) break; osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); @@ -1079,17 +1069,12 @@ new_request: 0, !!pool, false); osd_req_op_extent_update(req, op_idx, len); - if (do_sync) { - op_idx++; - osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); - } BUG_ON(op_idx + 1 != req->r_num_ops); pool = NULL; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; - num_ops += do_sync; locked_pages -= i; /* allocate new pages array for next request */ diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a39ff54cb372..0e8986c69639 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -800,7 +800,6 @@ static void ceph_aio_retry_work(struct work_struct *work) } req->r_ops[0] = orig_req->r_ops[0]; - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = aio_req->mtime; req->r_data_offset = req->r_ops[0].extent.offset; @@ -874,8 +873,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, - /*include a 'startsync' command*/ - write ? 2 : 1, + 1, write ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ, flags, snapc, @@ -927,7 +925,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_SIZE - 1)); - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = mtime; } diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index b8281feda9c7..01408841c9c4 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -230,7 +230,6 @@ extern const char *ceph_osd_state_name(int s); \ /* fancy write */ \ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ - f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \ \ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index dcfbdd74dfd1..e02f01f534e2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -863,8 +863,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; - case CEPH_OSD_OP_STARTSYNC: - break; case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); dst->watch.ver = cpu_to_le64(0); @@ -916,9 +914,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, * if the file was recently truncated, we include information about its * old and new size so that the object can be updated appropriately. (we * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. */ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, -- cgit v1.2.3 From 95569713afc0b53ded1bba67834e0be24529a8c9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 24 Jul 2017 17:59:39 +0800 Subject: ceph: new cap message flags indicate if there is pending capsnap These flags tell mds if there is pending capsnap explicitly. 
Without this explicit notification, the MDS can only infer whether the client has a pending capsnap, and the method it uses to do so is inefficient and error-prone. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 5 ++++- include/linux/ceph/ceph_fs.h | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7007ae2a5ad2..b675c004f6a7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1248,7 +1248,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mode = inode->i_mode; arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - arg.flags = 0; + if (list_empty(&ci->i_cap_snaps)) + arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; + else + arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; if (sync) arg.flags |= CEPH_CLIENT_CAPS_SYNC; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index edf5b04b918a..d1642a4b4c5e 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -669,7 +669,9 @@ enum { extern const char *ceph_cap_op_name(int op); /* flags field in client cap messages (version >= 10) */ -#define CEPH_CLIENT_CAPS_SYNC (0x1) +#define CEPH_CLIENT_CAPS_SYNC (1<<0) +#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1) +#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2) /* * caps message, used for capability callbacks, acks, requests, etc. -- cgit v1.2.3 From 06d74376c8af32f5b8d777a943aa4dc99165088b Mon Sep 17 00:00:00 2001 From: Douglas Fuller Date: Wed, 16 Aug 2017 10:19:27 -0400 Subject: ceph: more accurate statfs Improve accuracy of statfs reporting for Ceph filesystems comprising exactly one data pool. In this case, the Ceph monitor can now report the space usage for the single data pool instead of the global data for the entire Ceph cluster. Include support for this message in mon_client and leverage it in ceph/super.
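The caller-side contract is simple: pass the pool when exactly one data pool exists, otherwise pass CEPH_NOPOOL to keep the old cluster-wide numbers. A condensed sketch of the ceph_statfs() hunk below, with error paths trimmed (fsc is the ceph_fs_client):

	struct ceph_statfs st;
	u64 pool = CEPH_NOPOOL;
	int err;

	/* per-pool usage is only unambiguous with a single data pool */
	if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1)
		pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];

	err = ceph_monc_do_statfs(&fsc->client->monc, pool, &st);
	if (err < 0)
		return err;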
Signed-off-by: Douglas Fuller Reviewed-by: Yan, Zheng Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 9 ++++++++- include/linux/ceph/ceph_fs.h | 2 ++ include/linux/ceph/mon_client.h | 4 ++-- net/ceph/mon_client.c | 6 +++++- 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1deb8810d7c7..324d29ecbe0b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -49,9 +49,16 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) struct ceph_statfs st; u64 fsid; int err; + u64 data_pool; + + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; + } else { + data_pool = CEPH_NOPOOL; + } dout("statfs\n"); - err = ceph_monc_do_statfs(&fsc->client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); if (err < 0) return err; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d1642a4b4c5e..b422170b791a 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -167,6 +167,8 @@ struct ceph_mon_request_header { struct ceph_mon_statfs { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; + __u8 contains_data_pool; + __le64 data_pool; } __attribute__ ((packed)); struct ceph_statfs { diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index d5a3ecea578d..0fa990bf867a 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -133,8 +133,8 @@ void ceph_monc_renew_subs(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout); -extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, - struct ceph_statfs *buf); +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf); int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, u64 *newest); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 875675765531..63edc6e5f026 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -676,7 +676,8 @@ bad: /* * Do a synchronous statfs(). */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; @@ -696,6 +697,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) goto out; req->u.st = buf; + req->request->hdr.version = cpu_to_le16(2); mutex_lock(&monc->mutex); register_generic_request(req); @@ -705,6 +707,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + h->contains_data_pool = (data_pool != CEPH_NOPOOL); + h->data_pool = cpu_to_le64(data_pool); send_generic_request(monc, req); mutex_unlock(&monc->mutex); -- cgit v1.2.3 From b2770da6425406cf3f6d3fddbf9086b1db0106a1 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:35 -0700 Subject: mm: add vm_insert_mixed_mkwrite() When servicing mmap() reads from file holes the current DAX code allocates a page cache page of all zeroes and places the struct page pointer in the mapping->page_tree radix tree. This has three major drawbacks: 1) It consumes memory unnecessarily. For every 4k page that is read via a DAX mmap() over a hole, we allocate a new page cache page. 
This means that if you read 1GiB worth of pages, you end up using 1GiB of zeroed memory. 2) It is slower than using a common zero page because each page fault has more work to do. Instead of just inserting a common zero page, we have to allocate a page cache page, zero it, and then insert it. 3) The fact that we had to check for both DAX exceptional entries and for page cache pages in the radix tree made the DAX code more complex. This series solves these issues by following the lead of the DAX PMD code and using a common 4k zero page instead. This reduces memory usage and decreases latencies for some workloads, and it simplifies the DAX code, removing over 100 lines in total. This patch (of 5): To be able to use the common 4k zero page in DAX we need to have our PTE fault path look more like our PMD fault path, where a PTE entry can be marked as dirty and writable as it is first inserted rather than waiting for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call. Right now we can rely on having a dax_pfn_mkwrite() call because we can distinguish between these two cases in do_wp_page(): case 1: 4k zero page => writable DAX storage case 2: read-only DAX storage => writable DAX storage This distinction is made via vm_normal_page(). vm_normal_page() returns false for the common 4k zero page, though, just as it does for DAX ptes. Instead of special casing the DAX + 4k zero page case we will simplify our DAX PTE page fault sequence so that it matches our DAX PMD sequence, and get rid of the dax_pfn_mkwrite() helper. We will instead use dax_iomap_fault() to handle write-protection faults. This means that insert_pfn() needs to follow the lead of insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag. If 'mkwrite' is set, insert_pfn() will do the work that was previously done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path. Link: http://lkml.kernel.org/r/20170724170616.25810-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Acked-by: Kirill A. Shutemov Cc: "Darrick J. 
Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 45 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c1f6c95f3496..eb5e4bc946cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2294,6 +2294,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); +int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/mm/memory.c b/mm/memory.c index 56e48e4593cb..71c0b6f98a62 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1676,7 +1676,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1688,14 +1688,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (!pte) goto out; retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; + if (!pte_none(*pte)) { + if (mkwrite) { + /* + * For read faults on private mappings the PFN passed + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared + * mapping and we expect the PFNs to match. + */ + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + goto out_unlock; + entry = *pte; + goto out_mkwrite; + } else + goto out_unlock; + } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + +out_mkwrite: + if (mkwrite) { + entry = pte_mkyoung(entry); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } + set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? 
*/ @@ -1766,14 +1787,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); return ret; } EXPORT_SYMBOL(vm_insert_pfn_prot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; @@ -1802,10 +1824,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, pgprot); + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); +} + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, false); + } EXPORT_SYMBOL(vm_insert_mixed); +int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, true); +} +EXPORT_SYMBOL(vm_insert_mixed_mkwrite); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results -- cgit v1.2.3 From 91d25ba8a6b0d810dc844cebeedc53029118ce3e Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:43 -0700 Subject: dax: use common 4k zero page for dax mmap reads When servicing mmap() reads from file holes the current DAX code allocates a page cache page of all zeroes and places the struct page pointer in the mapping->page_tree radix tree. This has three major drawbacks: 1) It consumes memory unnecessarily. For every 4k page that is read via a DAX mmap() over a hole, we allocate a new page cache page. This means that if you read 1GiB worth of pages, you end up using 1GiB of zeroed memory. This is easily visible by looking at the overall memory consumption of the system or by looking at /proc/[pid]/smaps: 7f62e72b3000-7f63272b3000 rw-s 00000000 103:00 12 /root/dax/data Size: 1048576 kB Rss: 1048576 kB Pss: 1048576 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 1048576 kB Private_Dirty: 0 kB Referenced: 1048576 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Locked: 0 kB 2) It is slower than using a common zero page because each page fault has more work to do. Instead of just inserting a common zero page we have to allocate a page cache page, zero it, and then insert it. Here are the average latencies of dax_load_hole() as measured by ftrace on a random test box: Old method, using zeroed page cache pages: 3.4 us New method, using the common 4k zero page: 0.8 us This was the average latency over 1 GiB of sequential reads done by this simple fio script: [global] size=1G filename=/root/dax/data fallocate=none [io] rw=read ioengine=mmap 3) The fact that we had to check for both DAX exceptional entries and for page cache pages in the radix tree made the DAX code more complex. Solve these issues by following the lead of the DAX PMD code and using a common 4k zero page instead. As with the PMD code we will now insert a DAX exceptional entry into the radix tree instead of a struct page pointer which allows us to remove all the special casing in the DAX code. 
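In code terms, the new hole-read path reduces to roughly the following. This is a condensed sketch of the dax_load_hole() rewrite that appears in the fs/dax.c diff below, with tracing and error paths trimmed; the function name here is illustrative:

static int sketch_dax_load_hole(struct address_space *mapping, void *entry,
				struct vm_fault *vmf)
{
	struct page *zero_page = ZERO_PAGE(0);
	void *ret;

	/* record a RADIX_DAX_ZERO_PAGE exceptional entry, not a struct page */
	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
				       RADIX_DAX_ZERO_PAGE);
	if (IS_ERR(ret))
		return VM_FAULT_SIGBUS;

	/* map the common zero page read-only; a later write re-faults */
	vm_insert_mixed(vmf->vma, vmf->address, page_to_pfn_t(zero_page));
	return VM_FAULT_NOPAGE;
}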
Note that we do still pretty aggressively check for regular pages in the DAX radix tree, especially where we take action based on the bits set in the page. If we ever find a regular page in our radix tree now, that most likely means that someone besides DAX is inserting pages (which has happened lots of times in the past), and we want to find that out early and fail loudly. This solution also removes the extra memory consumption. Here is that same /proc/[pid]/smaps after 1GiB of reading from a hole with the new code: 7f2054a74000-7f2094a74000 rw-s 00000000 103:00 12 /root/dax/data Size: 1048576 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Locked: 0 kB Overall system memory consumption is similarly improved. Another major change is that we remove dax_pfn_mkwrite() from our fault flow, and instead rely on the page fault itself to make the PTE dirty and writable. The following description from the patch adding the vm_insert_mixed_mkwrite() call explains this a little more: "To be able to use the common 4k zero page in DAX we need to have our PTE fault path look more like our PMD fault path, where a PTE entry can be marked as dirty and writable as it is first inserted rather than waiting for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call. Right now we can rely on having a dax_pfn_mkwrite() call because we can distinguish between these two cases in do_wp_page(): case 1: 4k zero page => writable DAX storage case 2: read-only DAX storage => writable DAX storage This distinction is made via vm_normal_page(). vm_normal_page() returns false for the common 4k zero page, though, just as it does for DAX ptes. Instead of special casing the DAX + 4k zero page case we will simplify our DAX PTE page fault sequence so that it matches our DAX PMD sequence, and get rid of the dax_pfn_mkwrite() helper. We will instead use dax_iomap_fault() to handle write-protection faults. This means that insert_pfn() needs to follow the lead of insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag. If 'mkwrite' is set, insert_pfn() will do the work that was previously done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path" Link: http://lkml.kernel.org/r/20170724170616.25810-4-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/dax.txt | 5 +- fs/dax.c | 243 ++++++++++++-------------------------- fs/ext2/file.c | 25 +--- fs/ext4/file.c | 32 +---- fs/xfs/xfs_file.c | 2 +- include/linux/dax.h | 12 +- include/trace/events/fs_dax.h | 2 - 7 files changed, 86 insertions(+), 235 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index a7e6e14aeb08..3be3b266be41 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -63,9 +63,8 @@ Filesystem support consists of - implementing an mmap file operation for DAX files which sets the VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These - handlers should probably call dax_iomap_fault() (for fault and page_mkwrite - handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate - iomap operations. + handlers should probably call dax_iomap_fault() passing the appropriate + fault size and iomap operations. - calling iomap_zero_range() passing appropriate iomap operations instead of block_truncate_page() for DAX files - ensuring that there is sufficient locking between reads, writes, diff --git a/fs/dax.c b/fs/dax.c index b8882b5ce6ed..ab67ae30ccbf 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -66,7 +66,7 @@ static int dax_is_pte_entry(void *entry) static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_HZP; + return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) @@ -206,7 +206,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, for (;;) { entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || + if (!entry || + WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -241,14 +242,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, } static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) + pgoff_t index) { - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } + dax_unlock_mapping_entry(mapping, index); } /* @@ -258,7 +254,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!entry) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -266,15 +262,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, } /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked. * * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. 
This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting. * * We always favor 4k entries over 2MiB entries. There isn't a flow where we * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB @@ -301,18 +297,21 @@ restart: spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + entry = ERR_PTR(-EIO); + goto out_unlock; + } + if (entry) { if (size_flag & RADIX_DAX_PMD) { - if (!radix_tree_exceptional_entry(entry) || - dax_is_pte_entry(entry)) { + if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); entry = ERR_PTR(-EEXIST); goto out_unlock; } } else { /* trying to grab a PTE entry */ - if (radix_tree_exceptional_entry(entry) && - dax_is_pmd_entry(entry) && + if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; @@ -346,7 +345,7 @@ restart: mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); if (err) { if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } spin_lock_irq(&mapping->tree_lock); @@ -396,21 +395,6 @@ restart: spin_unlock_irq(&mapping->tree_lock); return entry; } - /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(entry)) { - struct page *page = entry; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - /* Page got truncated? Retry... */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto restart; - } - return page; - } entry = lock_slot(mapping, slot); out_unlock: spin_unlock_irq(&mapping->tree_lock); @@ -426,7 +410,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) + if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -508,47 +492,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - int error = 0; - bool hole_fill = false; void *new_entry; pgoff_t index = vmf->pgoff; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - /* Replacing hole page with block mapping? */ - if (!radix_tree_exceptional_entry(entry)) { - hole_fill = true; - /* - * Unmap the page now before we remove it from page cache below. - * The page is locked so it cannot be faulted in again. 
- */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); - if (error) - return ERR_PTR(error); - } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { - /* replacing huge zero page with PMD block mapping */ - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + /* we are replacing a zero page with block mapping */ + if (dax_is_pmd_entry(entry)) + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, + PMD_SIZE, 0); + else /* pte entry */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(sector, flags); - if (hole_fill) { - __delete_from_page_cache(entry, NULL); - /* Drop pagecache reference */ - put_page(entry); - error = __radix_tree_insert(page_tree, index, - dax_radix_order(new_entry), new_entry); - if (error) { - new_entry = ERR_PTR(error); - goto unlock; - } - mapping->nrexceptional++; - } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Only swap our new entry into the radix tree if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -565,23 +529,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, new_entry, NULL, NULL); + entry = new_entry; } + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: + spin_unlock_irq(&mapping->tree_lock); - if (hole_fill) { - radix_tree_preload_end(); - /* - * We don't need hole page anymore, it has been replaced with - * locked radix tree entry now. - */ - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(entry); - unlock_page(entry); - put_page(entry); - } - return new_entry; + return entry; } static inline unsigned long @@ -683,7 +638,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || !radix_tree_exceptional_entry(entry2)) + if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. 
We have to @@ -755,7 +710,7 @@ static int dax_writeback_one(struct block_device *bdev, trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ret; put_unlocked: @@ -830,11 +785,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void **entryp, + sector_t sector, size_t size, void *entry, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - void *entry = *entryp; void *ret, *kaddr; pgoff_t pgoff; int id, rc; @@ -855,87 +809,44 @@ static int dax_insert_mapping(struct address_space *mapping, ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); - *entryp = ret; trace_dax_insert_mapping(mapping->host, vmf, ret); - return vm_insert_mixed(vma, vaddr, pfn); -} - -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault - */ -int dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry, **slot; - pgoff_t index = vmf->pgoff; - - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (!entry || !radix_tree_exceptional_entry(entry)) { - if (entry) - put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); - trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; - } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - /* - * If we race with somebody updating the PTE and finish_mkwrite_fault() - * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry - * the fault in either case. - */ - finish_mkwrite_fault(vmf); - put_locked_mapping_entry(mapping, index, entry); - trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + if (vmf->flags & FAULT_FLAG_WRITE) + return vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + return vm_insert_mixed(vma, vaddr, pfn); } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); /* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. + * The user has performed a load from a hole in the file. Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files. Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead. */ -static int dax_load_hole(struct address_space *mapping, void **entry, +static int dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... 
*/ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } + unsigned long vaddr = vmf->address; + int ret = VM_FAULT_NOPAGE; + struct page *zero_page; + void *entry2; - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { + zero_page = ZERO_PAGE(0); + if (unlikely(!zero_page)) { ret = VM_FAULT_OOM; goto out; } -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_ZERO_PAGE); + if (IS_ERR(entry2)) { + ret = VM_FAULT_SIGBUS; + goto out; } + + vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1223,7 +1134,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, &entry, vmf->vma, vmf); + sector, PAGE_SIZE, entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1231,7 +1142,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, &entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1258,7 +1169,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff); out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; @@ -1272,7 +1183,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void **entryp) + loff_t pos, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1303,11 +1214,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, goto unlock_fallback; dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - *entryp = ret; trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1321,7 +1231,7 @@ fallback: } static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void **entryp) + void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1336,11 +1246,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, - RADIX_DAX_PMD | RADIX_DAX_HZP); + ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); if (IS_ERR(ret)) goto fallback; - *entryp = ret; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1416,10 +1325,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* - * grab_mapping_entry() 
will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. + * grab_mapping_entry() will make sure we get a 2MiB empty entry, a + * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page + * is already in the tree, for instance), it will return -EEXIST and + * we just fall back to 4k entries. */ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) @@ -1452,13 +1361,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, entry); break; default: WARN_ON_ONCE(1); @@ -1481,7 +1390,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, pgoff); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d34d32bdc944..ff3a3636a5ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - loff_t size; - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - down_read(&ei->dax_sem); - - /* check that the faulting page hasn't raced with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* @@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { * will always fail and fail back to regular faults. */ .page_mkwrite = ext2_dax_fault, - .pfn_mkwrite = ext2_dax_pfn_mkwrite, + .pfn_mkwrite = ext2_dax_fault, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d7cf0cc9b87..f28ac999dfba 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -311,41 +311,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. Thus fault gets retried and things work out as - * desired. 
- */ -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; - - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); - - return ret; -} - static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..62db8ffa83b9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1130,7 +1130,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vmf); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index df97b7af7e2c..b3518559f0da 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -91,18 +91,17 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev); /* * We use lowest available bit in exceptional entry for locking, one bit for - * the entry size (PMD) and two more to tell us if the entry is a huge zero - * page (HZP) or an empty entry that is just used for locking. In total four - * special bits. + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking. In total four special bits. * - * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and - * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. 
*/ #define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) static inline unsigned long dax_radix_sector(void *entry) @@ -153,7 +152,6 @@ static inline unsigned int dax_radix_order(void *entry) return 0; } #endif -int dax_pfn_mkwrite(struct vm_fault *vmf); static inline bool dax_mapping(struct address_space *mapping) { diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index 08bb3ed18dcc..fbc4a06f7310 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -190,8 +190,6 @@ DEFINE_EVENT(dax_pte_fault_class, name, \ DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); -DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); -DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); DEFINE_PTE_FAULT_EVENT(dax_load_hole); TRACE_EVENT(dax_insert_mapping, -- cgit v1.2.3 From d01ad197ac3b50a99ea668697acefe12e73c5fea Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:47 -0700 Subject: dax: remove DAX code from page_cache_tree_insert() Now that we no longer insert struct page pointers in DAX radix trees we can remove the special casing for DAX in page_cache_tree_insert(). This also allows us to make dax_wake_mapping_entry_waiter() local to fs/dax.c, removing it from dax.h. Link: http://lkml.kernel.org/r/20170724170616.25810-5-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- include/linux/dax.h | 2 -- mm/filemap.c | 13 ++----------- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index ab67ae30ccbf..8f70e3b59b91 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -127,7 +127,7 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * correct waitqueue where tasks might be waiting for that old 'entry' and * wake them. 
*/ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) { struct exceptional_entry_key key; diff --git a/include/linux/dax.h b/include/linux/dax.h index b3518559f0da..319e0d997446 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -123,8 +123,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all); #ifdef CONFIG_FS_DAX int __dax_zero_page_range(struct block_device *bdev, diff --git a/mm/filemap.c b/mm/filemap.c index 65b4b6e7f7bd..dad935769055 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping, return -EEXIST; mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, p, - true); - } + if (shadowp) + *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, workingset_update_node, mapping); -- cgit v1.2.3 From 527b19d0808e75fbba896beb2435c2b4d6bcd32a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:51 -0700 Subject: dax: move all DAX radix tree defs to fs/dax.c Now that we no longer insert struct page pointers in DAX radix trees the page cache code no longer needs to know anything about DAX exceptional entries. Move all the DAX exceptional entry definitions from dax.h to fs/dax.c. Link: http://lkml.kernel.org/r/20170724170616.25810-6-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/dax.h | 41 ----------------------------------------- 2 files changed, 34 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 8f70e3b59b91..c576f6181dc8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -54,6 +54,40 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking. In total four special bits. + * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. 
+ */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) + +static unsigned long dax_radix_sector(void *entry) +{ + return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | + ((unsigned long)sector << RADIX_DAX_SHIFT) | + RADIX_DAX_ENTRY_LOCK); +} + +static unsigned int dax_radix_order(void *entry) +{ + if ((unsigned long)entry & RADIX_DAX_PMD) + return PMD_SHIFT - PAGE_SHIFT; + return 0; +} + static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; diff --git a/include/linux/dax.h b/include/linux/dax.h index 319e0d997446..eb0bff6f1eab 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -89,33 +89,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); -/* - * We use lowest available bit in exceptional entry for locking, one bit for - * the entry size (PMD) and two more to tell us if the entry is a zero page or - * an empty entry that is just used for locking. In total four special bits. - * - * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE - * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem - * block allocation. - */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) - -static inline unsigned long dax_radix_sector(void *entry) -{ - return (unsigned long)entry >> RADIX_DAX_SHIFT; -} - -static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) -{ - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - ((unsigned long)sector << RADIX_DAX_SHIFT) | - RADIX_DAX_ENTRY_LOCK); -} - ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, @@ -137,20 +110,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev, } #endif -#ifdef CONFIG_FS_DAX_PMD -static inline unsigned int dax_radix_order(void *entry) -{ - if ((unsigned long)entry & RADIX_DAX_PMD) - return PMD_SHIFT - PAGE_SHIFT; - return 0; -} -#else -static inline unsigned int dax_radix_order(void *entry) -{ - return 0; -} -#endif - static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); -- cgit v1.2.3 From 2482ddec670fb83717d129012bc558777cb159f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Sep 2017 16:19:18 -0700 Subject: mm: add SLUB free list pointer obfuscation This SLUB free list pointer obfuscation code is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. 
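The scheme described in the next paragraph amounts to two XORs per freelist store and load. As a rough userspace sketch of the idea (the toy_cache struct and all names here are illustrative stand-ins, not the kernel's types):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct toy_cache {
    uintptr_t random;            /* per-cache secret, chosen at cache creation */
};

/* XOR is an involution, so one helper both encodes and decodes. */
static void *obfuscate(struct toy_cache *c, void *ptr, uintptr_t ptr_addr)
{
    return (void *)((uintptr_t)ptr ^ c->random ^ ptr_addr);
}

int main(void)
{
    struct toy_cache cache = { .random = 0xa5a5f00dUL };
    void *slot;                  /* the location holding a freelist link */
    void *next = malloc(64);     /* the real next-free object */

    /* store an obfuscated link, as set_freepointer() does below */
    slot = obfuscate(&cache, next, (uintptr_t)&slot);
    printf("stored  %p\n", slot);

    /* decode it again, as get_freepointer() does below */
    printf("decoded %p (expected %p)\n",
           obfuscate(&cache, slot, (uintptr_t)&slot), next);
    free(next);
    return 0;
}

An attacker who overwrites the stored value without knowing both the per-cache secret and the slot address ends up with a garbage pointer rather than a controlled one, which is exactly what the freelist_ptr() helper below relies on.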
This adds a per-cache random value to SLUB caches that is XORed with their freelist pointer address and value. This adds nearly zero overhead and frustrates the very common heap overflow exploitation method of overwriting freelist pointers. A recent example of the attack is written up here: http://cyseclabs.com/blog/cve-2016-6187-heap-off-by-one-exploit and there is a section dedicated to the technique in the book "A Guide to Kernel Exploitation: Attacking the Core". This is based on patches by Daniel Micay, and refactored to minimize the use of #ifdef. With 200-count cycles of "hackbench -g 20 -l 1000" I saw the following run times: before: mean 10.11882499999999999995 variance .03320378329145728642 stdev .18221905304181911048 after: mean 10.12654000000000000014 variance .04700556623115577889 stdev .21680767106160192064 The difference gets lost in the noise, but if the above is to be taken literally, using CONFIG_SLAB_FREELIST_HARDENED is 0.07% slower. Link: http://lkml.kernel.org/r/20170802180609.GA66807@beast Signed-off-by: Kees Cook Suggested-by: Daniel Micay Cc: Rik van Riel Cc: Tycho Andersen Cc: Alexander Popov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 4 ++++ init/Kconfig | 9 +++++++++ mm/slub.c | 42 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 50 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index cc0faf3a90be..0783b622311e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -115,6 +115,10 @@ struct kmem_cache { #endif #endif +#ifdef CONFIG_SLAB_FREELIST_HARDENED + unsigned long random; +#endif + #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. diff --git a/init/Kconfig b/init/Kconfig index 5f0ef850e808..78cb2461012e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM security feature reduces the predictability of the kernel slab allocator against heap overflows. +config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This option makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/slub.c b/mm/slub.c index 3e90d791dd41..6c87c2c6af24 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -238,30 +239,58 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ +/* + * Returns freelist pointer (ptr). With hardening, this is obfuscated + * with an XOR of the address where the pointer is held and a per-cache + * random number. + */ +static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, + unsigned long ptr_addr) +{ +#ifdef CONFIG_SLAB_FREELIST_HARDENED + return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); +#else + return ptr; +#endif +} + +/* Returns the freelist pointer recorded at location ptr_addr. 
*/ +static inline void *freelist_dereference(const struct kmem_cache *s, + void *ptr_addr) +{ + return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), + (unsigned long)ptr_addr); +} + static inline void *get_freepointer(struct kmem_cache *s, void *object) { - return *(void **)(object + s->offset); + return freelist_dereference(s, object + s->offset); } static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + if (object) + prefetch(freelist_dereference(s, object + s->offset)); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { + unsigned long freepointer_addr; void *p; if (!debug_pagealloc_enabled()) return get_freepointer(s, object); - probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); - return p; + freepointer_addr = (unsigned long)object + s->offset; + probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + return freelist_ptr(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { - *(void **)(object + s->offset) = fp; + unsigned long freeptr_addr = (unsigned long)object + s->offset; + + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } /* Loop over all objects in a slab */ @@ -3563,6 +3592,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); -- cgit v1.2.3 From d460acb5bdffc19b492b70b8f416c24dc03c474e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 6 Sep 2017 16:19:26 -0700 Subject: mm: track actual nr_scanned during shrink_slab() Some shrinkers may only be able to free a bunch of objects at a time, and so free more than the requested nr_to_scan in one pass. Whilst other shrinkers may find themselves even unable to scan as many objects as they counted, and so underreport. Account for the extra freed/scanned objects against the total number of objects we intend to scan, otherwise we may end up penalising the slab far more than intended. Similarly, we want to add the underperforming scan to the deferred pass so that we try harder and harder in future passes. Link: http://lkml.kernel.org/r/20170822135325.9191-1-chris@chris-wilson.co.uk Signed-off-by: Chris Wilson Cc: Michal Hocko Cc: Johannes Weiner Cc: Hillf Danton Cc: Minchan Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: Shaohua Li Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Joonas Lahtinen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 7 +++++++ mm/vmscan.c | 7 ++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 4fcacd915d45..51d189615bda 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -18,6 +18,13 @@ struct shrink_control { */ unsigned long nr_to_scan; + /* + * How many objects did scan_objects process? + * This defaults to nr_to_scan before every call, but the callee + * should track its actual progress. 
+ */ + unsigned long nr_scanned; + /* current node being shrunk (for NUMA aware shrinkers) */ int nid; diff --git a/mm/vmscan.c b/mm/vmscan.c index f957afe900ec..095817820a56 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; - count_vm_events(SLABS_SCANNED, nr_to_scan); - total_scan -= nr_to_scan; - scanned += nr_to_scan; + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; cond_resched(); } -- cgit v1.2.3 From e5e68930263377c6d4f6da0ff06f36b55d83a83f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:19:37 -0700 Subject: mm, memory_hotplug: display allowed zones in the preferred ordering Prior to commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") we used to allow changing the valid zone types of a memory block if it was adjacent to a different zone type. This fact was reflected in memoryNN/valid_zones by the ordering of printed zones. The first one was the default (echo online > memoryNN/state) and the other one could be onlined explicitly by online_{movable,kernel}. This behavior was removed by the said patch and as such the ordering was not all that important. In most cases a kernel zone would be the default anyway. The only exception is movable_node handled by "mm, memory_hotplug: support movable_node for hotpluggable nodes". Let's reintroduce this behavior because a later patch will remove the zone overlap restriction, and so users will be allowed to online kernel resp. movable blocks regardless of their placement. The original behavior will then become significant again because it would be non-trivial for users to see which zone a block will be onlined into by default. The implementation is really simple. Pull the zone selection out of move_pfn_range into a zone_for_pfn_range helper and use it in show_valid_zones to display the zone for default onlining, and then both kernel and movable zones if they are allowed. The default online zone is not duplicated. 
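As a rough userspace sketch of the resulting ordering rule (the zone names and the zone_for() stand-in are illustrative; the real code also consults allow_online_pfn_range(), omitted here):

#include <stdio.h>
#include <string.h>

/* stand-in for zone_for_pfn_range(); pretend Normal is the default zone */
static const char *zone_for(const char *online_type)
{
    return strcmp(online_type, "online_movable") == 0 ? "Movable" : "Normal";
}

int main(void)
{
    const char *default_zone = zone_for("online");    /* MMOP_ONLINE_KEEP */
    const char *types[] = { "online_kernel", "online_movable" };
    char buf[64];
    unsigned int i;

    strcpy(buf, default_zone);    /* the default is always printed first */
    for (i = 0; i < 2; i++) {
        const char *z = zone_for(types[i]);

        if (strcmp(z, default_zone) != 0) {    /* never duplicated */
            strcat(buf, " ");
            strcat(buf, z);
        }
    }
    printf("%s\n", buf);    /* e.g. "Normal Movable" */
    return 0;
}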
Link: http://lkml.kernel.org/r/20170714121233.16861-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Reza Arbab Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 33 +++++++++++++------ include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 73 ++++++++++++++++++++++++------------------ 3 files changed, 65 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c7c4e0325cdb..26383af9900c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -388,6 +388,22 @@ static ssize_t show_phys_device(struct device *dev, } #ifdef CONFIG_MEMORY_HOTREMOVE +static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn, + unsigned long nr_pages, int online_type, + struct zone *default_zone) +{ + struct zone *zone; + + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, online_type)) + return; + + zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); + if (zone != default_zone) { + strcat(buf, " "); + strcat(buf, zone->name); + } +} + static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { @@ -395,7 +411,7 @@ static ssize_t show_valid_zones(struct device *dev, unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long valid_start_pfn, valid_end_pfn; - bool append = false; + struct zone *default_zone; int nid; /* @@ -418,16 +434,13 @@ static ssize_t show_valid_zones(struct device *dev, } nid = pfn_to_nid(start_pfn); - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { - strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); - append = true; - } + default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); + strcat(buf, default_zone->name); - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { - if (append) - strcat(buf, " "); - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); - } + print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, + default_zone); + print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE, + default_zone); out: strcat(buf, "\n"); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c8a5056a5ae0..5e6e4cc36ff4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); -extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, +extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc317aac2..e342624622a1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) -{ - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *movable_zone = 
&pgdat->node_zones[ZONE_MOVABLE]; - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); - - /* - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL - * physically before ZONE_MOVABLE. All we need is they do not - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE - * though so let's stick with it for simplicity for now. - * TODO make sure we do not overlap with ZONE_DEVICE - */ - if (online_type == MMOP_ONLINE_KERNEL) { - if (zone_is_empty(movable_zone)) - return true; - return movable_zone->zone_start_pfn >= pfn + nr_pages; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(default_zone) <= pfn; - } - - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ - return online_type == MMOP_ONLINE_KEEP; -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. */ -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, +static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -872,6 +847,31 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, return &pgdat->node_zones[ZONE_NORMAL]; } +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); + + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. + * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(default_zone) <= pfn; + } + + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} + static inline bool movable_pfn_range(int nid, struct zone *default_zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -885,12 +885,8 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone, return !zone_intersects(default_zone, start_pfn, nr_pages); } -/* - * Associates the given pfn range with the given node and the zone appropriate - * for the given online type. - */ -static struct zone * __meminit move_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) +struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); @@ -909,6 +905,19 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, zone = &pgdat->node_zones[ZONE_MOVABLE]; } + return zone; +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. 
+ */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct zone *zone; + + zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); move_pfn_range_to_zone(zone, start_pfn, nr_pages); return zone; } -- cgit v1.2.3 From c9bff3eebc09be23fbc868f5e6731666d23cbea3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:13 -0700 Subject: mm, page_alloc: rip out ZONELIST_ORDER_ZONE Patch series "cleanup zonelists initialization", v1. This is aimed at cleaning up the zonelists initialization code we have, but the primary motivation was bug report [2]; that got resolved, but the usage of stop_machine is just too ugly to live. Most patches are straightforward but 3 of them need special consideration. Patch 1 removes zone ordered zonelists completely. I am CCing linux-api because this is a user visible change. As I argue in the patch description I do not think we have a strong usecase for it these days. I have kept the sysctl in place and warn into the log if somebody tries to configure zone lists ordering. If somebody has a real usecase for it we can revert this patch but I do not expect anybody will actually notice runtime differences. This patch is not strictly needed for the rest but it made patch 6 easier to implement. Patch 7 removes stop_machine from build_all_zonelists without adding any special synchronization between iterators and updater, which I _believe_ is acceptable as explained in the changelog. I hope I am not missing anything. Patch 8 then removes zonelists_mutex, which is kind of ugly as well and not really needed AFAICS, but care should be taken when double-checking my thinking. This patch (of 9): Supporting zone ordered zonelists costs us a lot of code while its usefulness is arguable, if existent at all. Mel has already made node ordering the default on 64b systems. 32b systems are still using ZONELIST_ORDER_ZONE because it is considered better to fall back to a different NUMA node rather than consume precious lowmem zones. This argument is, however, weakened by the fact that memory reclaim has been reworked to be node rather than zone oriented. This means that lowmem requests have to skip over all highmem pages on LRUs already, so zone ordering doesn't save much reclaim time. So the only advantage of zone ordering is under light memory pressure, when highmem requests do not ever hit lowmem zones and the lowmem pressure doesn't need to reclaim. Considering that 32b NUMA systems are rather suboptimal already and it is generally advisable to use a 64b kernel on such HW, I believe we should rather care about code maintainability and just get rid of ZONELIST_ORDER_ZONE altogether. Keep the sysctl in place and warn if somebody tries to set zone ordering either from the kernel command line or the sysctl. 
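After this change the accepted boot/sysctl values reduce to a single check; restated as a small standalone program (mirroring the __parse_numa_zonelist_order() hunk below, with an illustrative main() driver):

#include <stdio.h>
#include <errno.h>

static int parse_numa_zonelist_order(const char *s)
{
    /* only "[dD]efault" and "[nN]ode" remain acceptable; both mean node order */
    if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
        fprintf(stderr, "Ignoring unsupported numa_zonelist_order value: %s\n", s);
        return -EINVAL;
    }
    return 0;
}

int main(void)
{
    printf("node:    %d\n", parse_numa_zonelist_order("node"));    /* 0 */
    printf("default: %d\n", parse_numa_zonelist_order("default")); /* 0 */
    printf("zone:    %d\n", parse_numa_zonelist_order("zone"));    /* -EINVAL */
    return 0;
}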
[mhocko@suse.com: reading vm.numa_zonelist_order will never terminate] Link: http://lkml.kernel.org/r/20170721143915.14161-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Shaohua Li Cc: Toshi Kani Cc: Abdul Haleem Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/sysctl/vm.txt | 4 +- Documentation/vm/numa | 7 +- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 179 +++--------------------- 5 files changed, 28 insertions(+), 166 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6996b7727b85..86b0e8ec8ad7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2783,7 +2783,7 @@ Allowed values are enable and disable numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. - one of ['zone', 'node', 'default'] can be specified + 'node', 'default' can be specified This can be set from sysctl after boot. See Documentation/sysctl/vm.txt for details. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 48244c42ff52..9baf66a9ef4e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information. numa_zonelist_order -This sysctl is only for NUMA. +This sysctl is only for NUMA and it is deprecated. Anything but +Node order will fail! + 'where the memory is allocated from' is controlled by zonelists. (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. you may be able to read ZONE_DMA as ZONE_DMA32...) diff --git a/Documentation/vm/numa b/Documentation/vm/numa index a08f71647714..a31b85b9bb88 100644 --- a/Documentation/vm/numa +++ b/Documentation/vm/numa @@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations fall back to the same zone type on a different node, or to a different zone type on the same node. This is an important consideration because some zones, such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default zonelist order based on the sizes of the various zone types relative -to the total memory of the node and the total memory of the system. The -default zonelist order may be overridden using the numa_zonelist_order kernel -boot parameter or sysctl. [see Documentation/admin-guide/kernel-parameters.rst and -Documentation/sysctl/vm.txt] +a default Node ordered zonelist. This means it tries to fallback to other zones +from the same node before using remote nodes which are ordered by NUMA distance. By default, Linux will attempt to satisfy memory allocation requests from the node to which the CPU that executes the request is assigned. 
Specifically, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc14b8b3f6ce..bfdc37b77d88 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -896,7 +896,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, extern int numa_zonelist_order_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern char numa_zonelist_order[]; -#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ +#define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dcc8a1cf55b6..6b23df1be909 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, return nr_zones; } - -/* - * zonelist_order: - * 0 = automatic detection of better ordering. - * 1 = order by ([node] distance, -zonetype) - * 2 = order by (-zonetype, [node] distance) - * - * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create - * the same zonelist. So only NUMA can configure this param. - */ -#define ZONELIST_ORDER_DEFAULT 0 -#define ZONELIST_ORDER_NODE 1 -#define ZONELIST_ORDER_ZONE 2 - -/* zonelist order in the kernel. - * set_zonelist_order() will set this to NODE or ZONE. - */ -static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; -static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; - - #ifdef CONFIG_NUMA -/* The value user specified ....changed by config */ -static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; -/* string for sysctl */ -#define NUMA_ZONELIST_ORDER_LEN 16 -char numa_zonelist_order[16] = "default"; - -/* - * interface for configure zonelist ordering. - * command line option "numa_zonelist_order" - * = "[dD]efault - default, automatic configuration. - * = "[nN]ode - order by node locality, then by zone within node - * = "[zZ]one - order by zone, then by locality within zone - */ static int __parse_numa_zonelist_order(char *s) { - if (*s == 'd' || *s == 'D') { - user_zonelist_order = ZONELIST_ORDER_DEFAULT; - } else if (*s == 'n' || *s == 'N') { - user_zonelist_order = ZONELIST_ORDER_NODE; - } else if (*s == 'z' || *s == 'Z') { - user_zonelist_order = ZONELIST_ORDER_ZONE; - } else { - pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); + /* + * We used to support different zonlists modes but they turned + * out to be just not useful. 
Let's keep the warning in place + * if somebody still use the cmd line parameter so that we do + * not fail it silently + */ + if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { + pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - int ret; - if (!s) return 0; - ret = __parse_numa_zonelist_order(s); - if (ret == 0) - strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); - - return ret; + return __parse_numa_zonelist_order(s); } early_param("numa_zonelist_order", setup_numa_zonelist_order); +char numa_zonelist_order[] = "Node"; + /* * sysctl handler for numa_zonelist_order */ @@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - char saved_string[NUMA_ZONELIST_ORDER_LEN]; + char *str; int ret; - static DEFINE_MUTEX(zl_order_mutex); - mutex_lock(&zl_order_mutex); - if (write) { - if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { - ret = -EINVAL; - goto out; - } - strcpy(saved_string, (char *)table->data); - } - ret = proc_dostring(table, write, buffer, length, ppos); - if (ret) - goto out; - if (write) { - int oldval = user_zonelist_order; + if (!write) + return proc_dostring(table, write, buffer, length, ppos); + str = memdup_user_nul(buffer, 16); + if (IS_ERR(str)) + return PTR_ERR(str); - ret = __parse_numa_zonelist_order((char *)table->data); - if (ret) { - /* - * bogus value. restore saved string - */ - strncpy((char *)table->data, saved_string, - NUMA_ZONELIST_ORDER_LEN); - user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) { - mem_hotplug_begin(); - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - mem_hotplug_done(); - } - } -out: - mutex_unlock(&zl_order_mutex); + ret = __parse_numa_zonelist_order(str); + kfree(str); return ret; } @@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) */ static int node_order[MAX_NUMNODES]; -static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) -{ - int pos, j, node; - int zone_type; /* needs to be signed */ - struct zone *z; - struct zonelist *zonelist; - - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - pos = 0; - for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (managed_zone(z)) { - zoneref_set_zone(z, - &zonelist->_zonerefs[pos++]); - check_highest_zone(zone_type); - } - } - } - zonelist->_zonerefs[pos].zone = NULL; - zonelist->_zonerefs[pos].zone_idx = 0; -} - -#if defined(CONFIG_64BIT) -/* - * Devices that require DMA32/DMA are relatively rare and do not justify a - * penalty to every machine in case the specialised case applies. Default - * to Node-ordering on 64-bit NUMA machines - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_NODE; -} -#else -/* - * On 32-bit, the Normal zone needs to be preserved for allocations accessible - * by the kernel. If processes running on node 0 deplete the low memory zone - * then reclaim will occur more frequency increasing stalls and potentially - * be easier to OOM if a large percentage of the zone is under writeback or - * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. - * Hence, default to zone ordering on 32-bit. 
- */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_ZONE; -} -#endif /* CONFIG_64BIT */ - -static void set_zonelist_order(void) -{ - if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) - current_zonelist_order = default_zonelist_order(); - else - current_zonelist_order = user_zonelist_order; -} - static void build_zonelists(pg_data_t *pgdat) { int i, node, load; nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = node; load--; - if (order == ZONELIST_ORDER_NODE) - build_zonelists_in_node_order(pgdat, node); - else - node_order[i++] = node; /* remember order */ - } - - if (order == ZONELIST_ORDER_ZONE) { - /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, i); + build_zonelists_in_node_order(pgdat, node); } build_thisnode_zonelists(pgdat); @@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void); static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ -static void set_zonelist_order(void) -{ - current_zonelist_order = ZONELIST_ORDER_ZONE; -} - static void build_zonelists(pg_data_t *pgdat) { int node, local_node; @@ -5348,8 +5214,6 @@ build_all_zonelists_init(void) */ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { - set_zonelist_order(); - if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { @@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, - zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA -- cgit v1.2.3 From 72675e131eb418c78980c1e683c0c25a25b61221 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:24 -0700 Subject: mm, memory_hotplug: drop zone from build_all_zonelists build_all_zonelists takes a zone parameter to initialize the zone's pagesets. There is only a single user which gives a non-NULL zone parameter, and that one doesn't really need the rest of build_all_zonelists (see commit 6dcd73d7011b ("memory-hotplug: allocate zone's pcp before onlining pages")). Therefore remove setup_zone_pageset from build_all_zonelists and call it from its only user directly. This will also remove a pointless zonelists rebuilding, which is always good. 
Link: http://lkml.kernel.org/r/20170721143915.14161-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Cc: Toshi Kani Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- init/main.c | 2 +- mm/internal.h | 1 + mm/memory_hotplug.c | 10 +++++----- mm/page_alloc.c | 13 +++---------- 5 files changed, 11 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bfdc37b77d88..551f68bec2fa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -771,7 +771,7 @@ static inline bool is_dev_zone(const struct zone *zone) #include extern struct mutex zonelists_mutex; -void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); +void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags, diff --git a/init/main.c b/init/main.c index 8828fc148670..a21a1a8708a8 100644 --- a/init/main.c +++ b/init/main.c @@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void) boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); page_alloc_init(); pr_notice("Kernel command line: %s\n", boot_command_line); diff --git a/mm/internal.h b/mm/internal.h index 4ef49fc55e58..781c0d54d75a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -525,4 +525,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; } +void setup_zone_pageset(struct zone *zone); #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3e69984346da..c4df7d3c64d1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -929,7 +929,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) { need_zonelists_rebuild = 1; - build_all_zonelists(NULL, zone); + setup_zone_pageset(zone); } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, @@ -950,7 +950,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); else zone_pcp_update(zone); } @@ -1028,7 +1028,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * to access not-initialized zonelist, build here. 
*/ mutex_lock(&zonelists_mutex); - build_all_zonelists(pgdat, NULL); + build_all_zonelists(pgdat); mutex_unlock(&zonelists_mutex); /* @@ -1084,7 +1084,7 @@ int try_online_node(int nid) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); mutex_unlock(&zonelists_mutex); } @@ -1704,7 +1704,7 @@ repeat: if (!populated_zone(zone)) { zone_pcp_reset(zone); mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); mutex_unlock(&zonelists_mutex); } else zone_pcp_update(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94fb4370e000..2523d5b3b22f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5129,7 +5129,6 @@ static void build_zonelists(pg_data_t *pgdat) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -static void setup_zone_pageset(struct zone *zone); /* * Global mutex to protect against size modification of zonelists @@ -5209,20 +5208,14 @@ build_all_zonelists_init(void) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * - * __ref due to (1) call of __meminit annotated setup_zone_pageset - * [we're only called with non-NULL zone through __meminit paths] and - * (2) call of __init annotated helper build_all_zonelists_init + * __ref due to call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. */ -void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) +void __ref build_all_zonelists(pg_data_t *pgdat) { if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { -#ifdef CONFIG_MEMORY_HOTPLUG - if (zone) - setup_zone_pageset(zone); -#endif /* we have to stop all cpus to guarantee there is no user of zonelist */ stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); @@ -5496,7 +5489,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_high_and_batch(zone, pcp); } -static void __meminit setup_zone_pageset(struct zone *zone) +void __meminit setup_zone_pageset(struct zone *zone) { int cpu; zone->pageset = alloc_percpu(struct per_cpu_pageset); -- cgit v1.2.3 From b93e0f329e24f3615aa551fd9b99a75fb7c9195f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:37 -0700 Subject: mm, memory_hotplug: get rid of zonelists_mutex zonelists_mutex was introduced by commit 4eaf3f64397c ("mem-hotplug: fix potential race while building zonelist for new populated zone") to protect zonelist building from races. This is no longer needed though because both memory online and offline are fully serialized. New users have grown since then. Notably setup_per_zone_wmarks wants to prevent races between memory hotplug, khugepaged setup and manual min_free_kbytes update via sysctl (see cfd3da1e49bb ("mm: Serialize access to min_free_kbytes")). Let's add a private lock for that purpose. This will not prevent us from seeing state halfway through a memory hotplug operation, but that shouldn't be a big deal because memory hotplug will update watermarks explicitly so we will eventually get a full picture. The lock just makes sure we won't race when updating watermarks and end up with weird results. Also __build_all_zonelists manipulates global data, so add a private lock for it as well. This doesn't seem to be necessary today but it is more robust to have a lock there. 
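The pattern both hunks below follow can be sketched in userspace as a function-scoped lock whose only job is to serialize one function against itself; pthreads stand in for the kernel's spinlocks here, and the function body is a placeholder:

#include <stdio.h>
#include <pthread.h>

static void setup_watermarks(void)
{
    /* private lock: nobody outside this function can take it */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&lock);
    /* ... recompute watermarks from shared inputs ... */
    puts("watermarks updated");
    pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
    (void)arg;
    setup_watermarks();
    return NULL;
}

int main(void)
{
    pthread_t t[2];
    int i;

    for (i = 0; i < 2; i++)
        pthread_create(&t[i], NULL, worker, NULL);
    for (i = 0; i < 2; i++)
        pthread_join(t[i], NULL);
    return 0;
}

Compared to a global mutex, the lock's scope now documents exactly what it protects, and unrelated callers can no longer contend on it.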
While we are at it make sure we document that memory online/offline depends on a full serialization either via mem_hotplug_begin() or device_lock. Link: http://lkml.kernel.org/r/20170721143915.14161-9-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Cc: Toshi Kani Cc: Vlastimil Babka Cc: Haicheng Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/memory_hotplug.c | 12 ++---------- mm/page_alloc.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 551f68bec2fa..e7e92c8f4883 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -770,7 +770,6 @@ static inline bool is_dev_zone(const struct zone *zone) #include -extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2f0c7ebc7624..73bf17df6899 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -897,7 +897,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, return zone; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -926,7 +926,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ - mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) { need_zonelists_rebuild = 1; setup_zone_pageset(zone); @@ -937,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (ret) { if (need_zonelists_rebuild) zone_pcp_reset(zone); - mutex_unlock(&zonelists_mutex); goto failed_addition; } @@ -955,8 +953,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone_pcp_update(zone); } - mutex_unlock(&zonelists_mutex); - init_per_zone_wmark_min(); if (onlined_pages) { @@ -1027,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. 
*/ - mutex_lock(&zonelists_mutex); build_all_zonelists(pgdat); - mutex_unlock(&zonelists_mutex); /* * zone->managed_pages is set to an approximate value in @@ -1696,9 +1690,7 @@ repeat: if (!populated_zone(zone)) { zone_pcp_reset(zone); - mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); - mutex_unlock(&zonelists_mutex); } else zone_pcp_update(zone); @@ -1724,7 +1716,7 @@ failed_removal: return ret; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3086d0fd945..0bea94af0423 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5131,17 +5131,14 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -/* - * Global mutex to protect against size modification of zonelists - * as well as to serialize pageset setup for the new populated zone. - */ -DEFINE_MUTEX(zonelists_mutex); - static void __build_all_zonelists(void *data) { int nid; int __maybe_unused cpu; pg_data_t *self = data; + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -5173,6 +5170,8 @@ static void __build_all_zonelists(void *data) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } + + spin_unlock(&lock); } static noinline void __init @@ -5203,7 +5202,6 @@ build_all_zonelists_init(void) } /* - * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * * __ref due to call of __init annotated helper build_all_zonelists_init @@ -6939,9 +6937,11 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { - mutex_lock(&zonelists_mutex); + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); __setup_per_zone_wmarks(); - mutex_unlock(&zonelists_mutex); + spin_unlock(&lock); } /* -- cgit v1.2.3 From 26b433d0da062d6e19d75350c0171d3cf8ff560d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:15 -0700 Subject: fscache: remove unused ->now_uncached callback Patch series "Ranged pagevec lookup", v2. In this series I make pagevec_lookup() update the index (to be consistent with pagevec_lookup_tag() and also as a preparation for ranged lookups), provide a ranged variant of pagevec_lookup() and use it in places where it makes sense. This not only removes some common code but is also a measurable performance win for some use cases (see patch 4/10) where the radix tree is sparse and searching & grabbing of a page after the end of the range has measurable overhead. This patch (of 10): The callback doesn't ever get called. Remove it. 
Link: http://lkml.kernel.org/r/20170726114704.7626-2-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/caching/netfs-api.txt | 2 -- fs/9p/cache.c | 29 ----------------- fs/afs/cache.c | 43 ------------------------- fs/ceph/cache.c | 31 ------------------ fs/cifs/cache.c | 31 ------------------ fs/nfs/fscache-index.c | 40 ----------------------- include/linux/fscache.h | 9 ------ 7 files changed, 185 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt index aed6b94160b1..0eb31de3a2c1 100644 --- a/Documentation/filesystems/caching/netfs-api.txt +++ b/Documentation/filesystems/caching/netfs-api.txt @@ -151,8 +151,6 @@ To define an object, a structure of the following type should be filled out: void (*mark_pages_cached)(void *cookie_netfs_data, struct address_space *mapping, struct pagevec *cached_pvec); - - void (*now_uncached)(void *cookie_netfs_data); }; This has the following fields: diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 103ca5e1267b..64c58eb26159 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) -{ - struct v9fs_inode *v9inode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { .get_attr = v9fs_cache_inode_get_attr, .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, - .now_uncached = v9fs_cache_inode_now_uncached, }; void v9fs_cache_inode_get_cookie(struct inode *inode) diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 577763c3d88b..1fe855191261 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen); -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); struct fscache_netfs afs_cache_netfs = { .name = "afs", @@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = { .get_attr = afs_vnode_cache_get_attr, .get_aux = afs_vnode_cache_get_aux, .check_aux = afs_vnode_cache_check_aux, - .now_uncached = afs_vnode_cache_now_uncached, }; /* @@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, _leave(" = SUCCESS"); return FSCACHE_CHECKAUX_OKAY; } - -/* - * indication the cookie is no longer uncached - * - this function is called when the backing store currently caching a cookie - * is removed - * - the netfs should use this to clean up any markers indicating cached pages - * - this is mandatory for any object that may have data - */ -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) -{ - struct 
afs_vnode *vnode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - _enter("{%x,%x,%Lx}", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - /* grab a bunch of pages to clean */ - nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } - - _leave(""); -} diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 337f88673ed9..174d6e6569a8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OKAY; } -static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) -{ - struct ceph_inode_info* ci = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dout("ceph inode 0x%p now uncached", ci); - - while (1) { - nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .get_attr = ceph_fscache_inode_get_attr, .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, - .now_uncached = ceph_fscache_inode_now_uncached, }; void ceph_fscache_register_inode_cookie(struct inode *inode) diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 6c665bf4a27c..2c14020e5e1d 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct cifsInodeInfo *cifsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); - - for (;;) { - nr_pages = pagevec_lookup(&pvec, - cifsi->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = { .get_attr = cifs_fscache_inode_get_attr, .get_aux = cifs_fscache_inode_get_aux, .check_aux = cifs_fscache_inode_check_aux, - .now_uncached = cifs_fscache_inode_now_uncached, }; diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 777b055063f6..3025fe8584a0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -251,45 +251,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, 
return FSCACHE_CHECKAUX_OKAY; } -/* - * Indication from FS-Cache that the cookie is no longer cached - * - This function is called when the backing store currently caching a cookie - * is removed - * - The netfs should use this to clean up any markers indicating cached pages - * - This is mandatory for any object that may have data - */ -static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct nfs_inode *nfsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); - - for (;;) { - /* grab a bunch of pages to unmark */ - nr_pages = pagevec_lookup(&pvec, - nfsi->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - /* * Get an extra reference on a read context. * - This function can be absent if the completion function doesn't require a @@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = { .get_attr = nfs_fscache_inode_get_attr, .get_aux = nfs_fscache_inode_get_aux, .check_aux = nfs_fscache_inode_check_aux, - .now_uncached = nfs_fscache_inode_now_uncached, .get_context = nfs_fh_get_context, .put_context = nfs_fh_put_context, }; diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 115bb81912cc..f4ff47d4a893 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -143,15 +143,6 @@ struct fscache_cookie_def { void (*mark_page_cached)(void *cookie_netfs_data, struct address_space *mapping, struct page *page); - - /* indicate the cookie is no longer cached - * - this function is called when the backing store currently caching - * a cookie is removed - * - the netfs should use this to clean up any markers indicating - * cached pages - * - this is mandatory for any object that may have data - */ - void (*now_uncached)(void *cookie_netfs_data); }; /* -- cgit v1.2.3 From d72dc8a25afc71ce90ee92bdd77550e9beb85d4d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:18 -0700 Subject: mm: make pagevec_lookup() update index Make pagevec_lookup() (and underlying find_get_pages()) update index to the next page where iteration should continue. Most callers want this and also pagevec_lookup_tag() already does this. 
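For illustration, a caller loop under the new convention might look like the minimal sketch below; 'mapping' is a given struct address_space and process_page() is a hypothetical placeholder, not a function from this series. The manual index bookkeeping that callers used to do by hand disappears:

	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned i, nr;

	pagevec_init(&pvec, 0);
	/* pagevec_lookup() now advances 'index' past the last page returned */
	while ((nr = pagevec_lookup(&pvec, mapping, &index, PAGEVEC_SIZE))) {
		for (i = 0; i < nr; i++)
			process_page(pvec.pages[i]);	/* hypothetical */
		/* no more: index = pvec.pages[nr - 1]->index + 1; */
		pagevec_release(&pvec);
		cond_resched();
	}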
Link: http://lkml.kernel.org/r/20170726114704.7626-3-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 10 ++++------ fs/ext4/file.c | 4 +--- fs/ext4/inode.c | 8 ++------ fs/fscache/page.c | 5 ++--- fs/hugetlbfs/inode.c | 17 ++++++++--------- fs/nilfs2/page.c | 3 +-- fs/ramfs/file-nommu.c | 2 +- include/linux/pagemap.h | 2 +- include/linux/pagevec.h | 2 +- mm/filemap.c | 11 ++++++++--- mm/swap.c | 5 +++-- 11 files changed, 32 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..5b20893708e2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1633,13 +1633,12 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + while (index <= end && pagevec_lookup(&pvec, bd_mapping, &index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) + if (page->index > end) break; if (!page_has_buffers(page)) continue; @@ -1670,7 +1669,6 @@ unlock_page: } pagevec_release(&pvec); cond_resched(); - index++; } } EXPORT_SYMBOL(clean_bdev_aliases); @@ -3552,7 +3550,8 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unsigned want, nr_pages, i; want = min_t(unsigned, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index, + want); if (nr_pages == 0) break; @@ -3594,7 +3593,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (nr_pages < want) break; - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index < end); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f28ac999dfba..1c73b21fdc13 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -468,7 +468,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unsigned long nr_pages; num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index, (pgoff_t)num); if (nr_pages == 0) break; @@ -536,8 +536,6 @@ next: /* The no. of pages is less than our desired, we are done. 
*/ if (nr_pages < num) break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c774bdc22759..b3ce1c6d9f23 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,7 +1720,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, mapping, &index, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { @@ -1737,7 +1737,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } unlock_page(page); } - index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } } @@ -2348,7 +2347,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &start, PAGEVEC_SIZE); if (nr_pages == 0) break; @@ -2357,8 +2356,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (page->index > end) break; - /* Up to 'end' pages must be contiguous */ - BUG_ON(page->index != start); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) @@ -2403,7 +2400,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_release(&pvec); return err; } - start++; } pagevec_release(&pvec); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c8c4f79c7ce1..83018861dcd2 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; do { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next, PAGEVEC_SIZE)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } while (++next); + } while (next); _leave(""); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28d2753be094..b9678ce91e25 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -401,7 +401,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, const pgoff_t end = lend >> huge_page_shift(h); struct vm_area_struct pseudo_vma; struct pagevec pvec; - pgoff_t next; + pgoff_t next, index; int i, freed = 0; long lookup_nr = PAGEVEC_SIZE; bool truncate_op = (lend == LLONG_MAX); @@ -420,7 +420,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, /* * When no more pages are found, we are done. */ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + if (!pagevec_lookup(&pvec, mapping, &next, lookup_nr)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { @@ -432,13 +432,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * only possible in the punch hole case as end is * max page offset in the truncate case. 
*/ - next = page->index; - if (next >= end) + index = page->index; + if (index >= end) break; hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, - mapping, next, 0); + mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -455,8 +455,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_lock_write(mapping); hugetlb_vmdelete_list(&mapping->i_mmap, - next * pages_per_huge_page(h), - (next + 1) * pages_per_huge_page(h)); + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); i_mmap_unlock_write(mapping); } @@ -475,14 +475,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, - next, next + 1, 1))) + index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } unlock_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - ++next; huge_pagevec_release(&pvec); cond_resched(); } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index f11a3ad2df0c..382a36c72d72 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index, PAGEVEC_SIZE); if (!n) return; - index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2ef7ce75c062..3ac1f2387083 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 79b36f57c3ba..249b1b5964c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -353,7 +353,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b45d391b4540..c395a5bb58b2 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -28,7 +28,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages); + pgoff_t *start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages); diff --git a/mm/filemap.c b/mm/filemap.c index dad935769055..ab9011408d81 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -403,7 +403,7 @@ bool filemap_range_has_page(struct address_space *mapping, return false; pagevec_init(&pvec, 0); - if (!pagevec_lookup(&pvec, mapping, index, 1)) + if (!pagevec_lookup(&pvec, 
mapping, &index, 1)) return false; ret = (pvec.pages[0]->index <= end); pagevec_release(&pvec); @@ -1569,10 +1569,11 @@ export: * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. + * We also update @start to index the next page for the traversal. * * find_get_pages() returns the number of pages which were found. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, unsigned int nr_pages, struct page **pages) { struct radix_tree_iter iter; @@ -1583,7 +1584,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1625,6 +1626,10 @@ repeat: } rcu_read_unlock(); + + if (ret) + *start = pages[ret - 1]->index + 1; + return ret; }
diff --git a/mm/swap.c b/mm/swap.c index 60b1d2a75852..4bffd1198ce5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -957,12 +957,13 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. + * indexes. There may be holes in the indices due to not-present pages. We + * also update @start to index the next page for the traversal. * * pagevec_lookup() returns the number of pages which were found. */ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages) + pgoff_t *start, unsigned nr_pages) { pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); -- cgit v1.2.3
From b947cee4b96306037e166ff1ea5156c0ecdd7d91 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:21 -0700 Subject: mm: implement find_get_pages_range() Implement a variant of find_get_pages() that stops iterating at a given index. This can be a substantial performance gain if the mapping is sparse. See the following commit for details. Furthermore, many users of this function (through pagevec_lookup()) actually want a range lookup, and all of them currently open-code it. Also create a corresponding pagevec_lookup_range() function.
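A minimal sketch of the intended calling pattern (assuming a caller that walks pages in [index, end]; process_page() is again a hypothetical placeholder):

	struct pagevec pvec;
	unsigned i, nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup_range(&pvec, mapping, &index, end,
					  PAGEVEC_SIZE))) {
		for (i = 0; i < nr; i++)
			process_page(pvec.pages[i]);	/* hypothetical */
		pagevec_release(&pvec);
		cond_resched();
	}
	/* the loop stops once 'index' has advanced past 'end', without
	 * scanning the rest of a sparse mapping */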
Link: http://lkml.kernel.org/r/20170726114704.7626-4-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 13 +++++++++++-- mm/filemap.c | 42 ++++++++++++++++++++++++++++++------------ mm/swap.c | 22 ++++++++++++++-------- 4 files changed, 65 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 249b1b5964c7..5bbd6780f205 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); -unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, - unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages(struct address_space *mapping, + pgoff_t *start, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, + pages); +} unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index c395a5bb58b2..7df056910437 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,8 +27,17 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, pgoff_t start, unsigned nr_entries, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); -unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, unsigned nr_pages); +unsigned pagevec_lookup_range(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t *start, pgoff_t end, unsigned nr_pages); +static inline unsigned pagevec_lookup(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t *start, unsigned nr_pages) +{ + return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1, + nr_pages); +} + unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages); diff --git a/mm/filemap.c b/mm/filemap.c index ab9011408d81..129883f160a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1557,24 +1557,29 @@ export: } /** - * find_get_pages - gang pagecache lookup + * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index (inclusive) * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages() will search for and return a group of up to - * @nr_pages pages in the mapping. The pages are placed at @pages. - * find_get_pages() takes a reference against the returned pages. + * find_get_pages_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting at index @start and up to index @end + * (inclusive). The pages are placed at @pages. find_get_pages_range() takes + * a reference against the returned pages. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. 
* We also update @start to index the next page for the traversal. * - * find_get_pages() returns the number of pages which were found. + * find_get_pages_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, - unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1586,6 +1591,9 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1621,15 +1629,25 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *start = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is page at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: rcu_read_unlock(); - if (ret) - *start = pages[ret - 1]->index + 1; - return ret; } diff --git a/mm/swap.c b/mm/swap.c index 4bffd1198ce5..e06e9aa2478e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -946,29 +946,35 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) } /** - * pagevec_lookup - gang pagecache lookup + * pagevec_lookup_range - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index * @nr_pages: The maximum number of pages * - * pagevec_lookup() will search for and return a group of up to @nr_pages pages - * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting from index @start and upto index @end + * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. We * also update @start to index the next page for the traversal. * - * pagevec_lookup() returns the number of pages which were found. + * pagevec_lookup_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. 
*/ -unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, unsigned nr_pages) +unsigned pagevec_lookup_range(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *start, pgoff_t end, + unsigned nr_pages) { - pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + pvec->nr = find_get_pages_range(mapping, start, end, nr_pages, + pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup); +EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) -- cgit v1.2.3 From 397162ffa2ed1cadffe05c324c6ddc53647f9c62 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:43 -0700 Subject: mm: remove nr_pages argument from pagevec_lookup{,_range}() All users of pagevec_lookup() and pagevec_lookup_range() now pass PAGEVEC_SIZE as a desired number of pages. Just drop the argument. Link: http://lkml.kernel.org/r/20170726114704.7626-11-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 5 ++--- fs/ext4/file.c | 2 +- fs/ext4/inode.c | 5 ++--- fs/fscache/page.c | 2 +- fs/hugetlbfs/inode.c | 3 +-- fs/nilfs2/page.c | 2 +- include/linux/pagevec.h | 7 +++---- mm/swap.c | 5 ++--- 8 files changed, 13 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/buffer.c b/fs/buffer.c index 8770b58ca569..50da0e102ca0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1633,8 +1633,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (pagevec_lookup_range(&pvec, bd_mapping, &index, end, - PAGEVEC_SIZE)) { + while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { struct page *page = pvec.pages[i]; @@ -3552,7 +3551,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unsigned nr_pages, i; nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1, PAGEVEC_SIZE); + end - 1); if (nr_pages == 0) break; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 726344809721..484c1326dd6a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -468,7 +468,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unsigned long nr_pages; nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &index, end, PAGEVEC_SIZE); + &index, end); if (nr_pages == 0) break; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6103ce0430df..ce68a3483666 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,8 +1720,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end, - PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { @@ -2348,7 +2347,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &start, end, PAGEVEC_SIZE); + &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 83018861dcd2..0ad3fd3ad0b4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,7 +1178,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, 
pagevec_init(&pvec, 0); next = 0; do { - if (!pagevec_lookup(&pvec, mapping, &next, PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8931236f3ef4..7c02b3f738e1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -413,8 +413,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, /* * When no more pages are found, we are done. */ - if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1, - PAGEVEC_SIZE)) + if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 382a36c72d72..8616c46d33da 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,7 +312,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, &index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index); if (!n) return; diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7df056910437..4dcd5506f1ed 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -29,13 +29,12 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, pgoff_t end, unsigned nr_pages); + pgoff_t *start, pgoff_t end); static inline unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, unsigned nr_pages) + pgoff_t *start) { - return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1, - nr_pages); + return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/mm/swap.c b/mm/swap.c index e06e9aa2478e..62d96b8e5eb3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -967,10 +967,9 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * reached. */ unsigned pagevec_lookup_range(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *start, pgoff_t end, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *start, pgoff_t end) { - pvec->nr = find_get_pages_range(mapping, start, end, nr_pages, + pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } -- cgit v1.2.3 From 04fecbf51b3cb79a628d50b65797c4866342b8d2 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 6 Sep 2017 16:22:09 -0700 Subject: mm: memcontrol: use int for event/state parameter in several functions Several functions use an enum type as parameter for an event/state, but are called in some locations with an argument of a different enum type. Adjust the interface of these functions to reality by changing the parameter to int. This fixes a ton of enum-conversion warnings that are generated when building the kernel with clang. 
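The warning class being fixed can be reproduced with a contrived, self-contained sketch; the enum names here are stand-ins, not the kernel's real definitions:

	/* stand-ins for e.g. enum memcg_stat_item / enum node_stat_item */
	enum stat_a { STAT_A_FIRST };
	enum stat_b { STAT_B_FIRST };

	static void mod_state(enum stat_a idx, int val) { /* ... */ }

	static void caller(void)
	{
		/* clang -Wenum-conversion: implicit conversion from
		 * 'enum stat_b' to parameter of type 'enum stat_a' */
		mod_state(STAT_B_FIRST, 1);
	}

Widening the parameter to plain int, as the hunks below do, accepts values of either enum type without a cast.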
[mka@chromium.org: also change parameter type of inc/dec/mod_memcg_page_state()] Link: http://lkml.kernel.org/r/20170728213442.93823-1-mka@chromium.org Link: http://lkml.kernel.org/r/20170727211004.34435-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Doug Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 52 ++++++++++++++++++++++++++++------------------ mm/memcontrol.c | 4 +++- 2 files changed, 35 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b15a4bcfa77..69966c461d1c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { long val = 0; int cpu; @@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return val; } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) + int idx, int val) { if (!mem_cgroup_disabled()) __this_cpu_add(memcg->stat->count[idx], val); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) + int idx, int val) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->count[idx], val); @@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, * Kernel pages are an exception to this, since they'll never move. 
*/ static inline void __mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, int val) + int idx, int val) { if (page->mem_cgroup) __mod_memcg_state(page->mem_cgroup, idx, val); } static inline void mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, int val) + int idx, int val) { if (page->mem_cgroup) mod_memcg_state(page->mem_cgroup, idx, val); @@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->events[idx], count); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void count_memcg_page_event(struct page *page, - enum memcg_stat_item idx) + int idx) { if (page->mem_cgroup) count_memcg_events(page->mem_cgroup, idx, 1); @@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { return 0; } static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, + int idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, + int idx, int nr) { } static inline void __mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, + int idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, + int idx, int nr) { } @@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, } static inline void count_memcg_page_event(struct page *page, - enum memcg_stat_item idx) + int idx) { } @@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) } #endif /* CONFIG_MEMCG */ +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { __mod_memcg_state(memcg, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { __mod_memcg_state(memcg, idx, -1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { __mod_memcg_page_state(page, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { __mod_memcg_page_state(page, idx, -1); } @@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { mod_memcg_state(memcg, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { mod_memcg_state(memcg, idx, -1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { mod_memcg_page_state(page, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { mod_memcg_page_state(page, idx, -1); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2926c44519b6..3d3f7b1686ac 
100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * value, and reading all cpu value can be performance bottleneck in some * common workload, threshold and synchronization as vmstat[] should be * implemented. + * + * The parameter idx can be of type enum memcg_event_item or vm_event_item. */ static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - enum memcg_event_item event) + int event) { unsigned long val = 0; int cpu; -- cgit v1.2.3
From e07098294adfd03d582af7626752255e3d170393 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:16 -0700 Subject: mm, THP, swap: support to reclaim swap space for THP swapped out Normal swap slot reclaim can be done when the swap count reaches SWAP_HAS_CACHE. But for a swap slot which is backing a THP, all the swap slots backing that THP must be reclaimed together, because any of them may be used again when the THP is swapped out once more. So the swap slots backing one THP can be reclaimed together only when the swap count of every one of those slots has reached SWAP_HAS_CACHE. This patch implements the functions that check whether the swap count for all swap slots backing one THP has reached SWAP_HAS_CACHE, and uses them when deciding whether a swap slot can be reclaimed. To make it easier to determine whether a swap slot is backing a THP, a new swap cluster flag named CLUSTER_FLAG_HUGE is added to mark a swap cluster which is backing a THP (Transparent Huge Page). Because swapping a THP back in as a whole isn't supported yet, the CLUSTER_FLAG_HUGE flag is cleared after the THP is deleted from the swap cache (for example, once swapout has finished), so that the normal pages inside the THP can be swapped in individually. [ying.huang@intel.com: fix swap_page_trans_huge_swapped on HDD] Link: http://lkml.kernel.org/r/874ltsm0bi.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20170724051840.2309-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: "Kirill A .
Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index d83d28e53e62..964b4f1fba4a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -188,6 +188,7 @@ struct swap_cluster_info { }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from diff --git a/mm/swapfile.c b/mm/swapfile.c index c32e9b23d642..164d9624d7d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -265,6 +265,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_is_huge(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_HUGE; +} + +static inline void cluster_clear_huge(struct swap_cluster_info *info) +{ + info->flags &= ~CLUSTER_FLAG_HUGE; +} + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -846,7 +856,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) offset = idx * SWAPFILE_CLUSTER; ci = lock_cluster(si, offset); alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) @@ -1176,6 +1186,7 @@ static void swapcache_free_cluster(swp_entry_t entry) return; ci = lock_cluster(si, offset); + VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; @@ -1187,6 +1198,7 @@ static void swapcache_free_cluster(swp_entry_t entry) for (i = 0; i < SWAPFILE_CLUSTER; i++) map[i] &= ~SWAP_HAS_CACHE; } + cluster_clear_huge(ci); unlock_cluster(ci); if (free_entries == SWAPFILE_CLUSTER) { spin_lock(&si->lock); @@ -1350,6 +1362,54 @@ out: return count; } +#ifdef CONFIG_THP_SWAP +static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + unsigned long roffset = swp_offset(entry); + unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + int i; + bool ret = false; + + ci = lock_cluster_or_swap_info(si, offset); + if (!ci || !cluster_is_huge(ci)) { + if (map[roffset] != SWAP_HAS_CACHE) + ret = true; + goto unlock_out; + } + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (map[offset + i] != SWAP_HAS_CACHE) { + ret = true; + break; + } + } +unlock_out: + unlock_cluster_or_swap_info(si, ci); + return ret; +} + +static bool page_swapped(struct page *page) +{ + swp_entry_t entry; + struct swap_info_struct *si; + + if (likely(!PageTransCompound(page))) + return page_swapcount(page) != 0; + + page = compound_head(page); + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) + return swap_page_trans_huge_swapped(si, entry); + return false; +} +#else +#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) +#define page_swapped(page) (page_swapcount(page) != 0) +#endif + /* * We can 
write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content @@ -1404,7 +1464,7 @@ int try_to_free_swap(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_swapcount(page)) + if (page_swapped(page)) return 0; /* @@ -1425,6 +1485,7 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -1446,7 +1507,8 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { count = __swap_entry_free(p, entry, 1); - if (count == SWAP_HAS_CACHE) { + if (count == SWAP_HAS_CACHE && + !swap_page_trans_huge_swapped(p, entry)) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -1463,7 +1525,8 @@ int free_swap_and_cache(swp_entry_t entry) */ if (PageSwapCache(page) && !PageWriteback(page) && (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_swapcount(p, entry)) { + !swap_page_trans_huge_swapped(p, entry)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } @@ -2017,7 +2080,7 @@ int try_to_unuse(unsigned int type, bool frontswap, .sync_mode = WB_SYNC_NONE, }; - swap_writepage(page, &wbc); + swap_writepage(compound_head(page), &wbc); lock_page(page); wait_on_page_writeback(page); } @@ -2030,8 +2093,9 @@ int try_to_unuse(unsigned int type, bool frontswap, * delete, since it may not have been written out to swap yet. */ if (PageSwapCache(page) && - likely(page_private(page) == entry.val)) - delete_from_swap_cache(page); + likely(page_private(page) == entry.val) && + !page_swapped(page)) + delete_from_swap_cache(compound_head(page)); /* * So we could skip searching mms once swap count went -- cgit v1.2.3
From ba3c4ce6def4915093be80585ff69f780630f32f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:19 -0700 Subject: mm, THP, swap: make reuse_swap_page() work for THP swapped out Now that THP (Transparent Huge Page) splitting can be delayed until after the THP has been swapped out, it is possible that some page table mappings of the THP are turned into swap entries. So reuse_swap_page() needs to check the swap count in addition to the map count it checked before. This patch does that. In the huge PMD write protect fault handler, in addition to the page map count, the swap count needs to be checked too, so the page lock needs to be acquired as well when calling reuse_swap_page(), in addition to the page table lock. [ying.huang@intel.com: silence a compiler warning] Link: http://lkml.kernel.org/r/87bmnzizjy.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20170724051840.2309-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A .
Shutemov" Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 +- mm/huge_memory.c | 16 +++++++- mm/memory.c | 6 +-- mm/swapfile.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 113 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 964b4f1fba4a..7176ba780e83 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,8 +510,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_mapcount) \ - (page_trans_huge_mapcount(page, total_mapcount) == 1) +#define reuse_swap_page(page, total_map_swapcount) \ + (page_trans_huge_mapcount(page, total_map_swapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7f0b3f6db923..02dfe635c9fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (page_trans_huge_mapcount(page, NULL) == 1) { + if (!trylock_page(page)) { + get_page(page); + spin_unlock(vmf->ptl); + lock_page(page); + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + put_page(page); + } + if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; + unlock_page(page); goto out_unlock; } + unlock_page(page); get_page(page); spin_unlock(vmf->ptl); alloc: diff --git a/mm/memory.c b/mm/memory.c index 1416485e278c..3dd8bb46391b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2619,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf) * not dirty accountable. */ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { - int total_mapcount; + int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2634,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } - if (reuse_swap_page(vmf->page, &total_mapcount)) { - if (total_mapcount == 1) { + if (reuse_swap_page(vmf->page, &total_map_swapcount)) { + if (total_map_swapcount == 1) { /* * The page is all ours. 
Move it to * our anon_vma so the rmap code will diff --git a/mm/swapfile.c b/mm/swapfile.c index 164d9624d7d2..2bfbfb87123a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1405,9 +1405,89 @@ static bool page_swapped(struct page *page) return swap_page_trans_huge_swapped(si, entry); return false; } + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int i, map_swapcount, _total_mapcount, _total_swapcount; + unsigned long offset = 0; + struct swap_info_struct *si; + struct swap_cluster_info *ci = NULL; + unsigned char *map = NULL; + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; + } + + page = compound_head(page); + + _total_mapcount = _total_swapcount = map_swapcount = 0; + if (PageSwapCache(page)) { + swp_entry_t entry; + + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) { + map = si->swap_map; + offset = swp_offset(entry); + } + } + if (map) + ci = lock_cluster(si, offset); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + _total_mapcount += mapcount; + if (map) { + swapcount = swap_count(map[offset + i]); + _total_swapcount += swapcount; + } + map_swapcount = max(map_swapcount, mapcount + swapcount); + } + unlock_cluster(ci); + if (PageDoubleMap(page)) { + map_swapcount -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + map_swapcount += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + if (total_swapcount) + *total_swapcount = _total_swapcount; + + return map_swapcount; +} #else #define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) #define page_swapped(page) (page_swapcount(page) != 0) + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + mapcount = page_trans_huge_mapcount(page, total_mapcount); + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; +} #endif /* @@ -1416,23 +1496,27 @@ static bool page_swapped(struct page *page) * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. * - * NOTE: total_mapcount should not be relied upon by the caller if + * NOTE: total_map_swapcount should not be relied upon by the caller if * reuse_swap_page() returns false, but it may be always overwritten * (see the other implementation for CONFIG_SWAP=n). 
*/ -bool reuse_swap_page(struct page *page, int *total_mapcount) +bool reuse_swap_page(struct page *page, int *total_map_swapcount) { - int count; + int count, total_mapcount, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_mapcount(page, total_mapcount); - if (count <= 1 && PageSwapCache(page)) { - count += page_swapcount(page); - if (count != 1) - goto out; + count = page_trans_huge_map_swapcount(page, &total_mapcount, + &total_swapcount); + if (total_map_swapcount) + *total_map_swapcount = total_mapcount + total_swapcount; + if (count == 1 && PageSwapCache(page) && + (likely(!PageTransCompound(page)) || + /* The remaining swap count will be freed soon */ + total_swapcount == page_swapcount(page))) { if (!PageWriteback(page)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } else { @@ -1448,7 +1532,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) spin_unlock(&p->lock); } } -out: + return count <= 1; } -- cgit v1.2.3 From 225311a46411c37e20e73d99f4382f141e12f6f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:30 -0700 Subject: mm: test code to write THP to swap device as a whole To support delay splitting THP (Transparent Huge Page) after swapped out, we need to enhance swap writing code to support to write a THP as a whole. This will improve swap write IO performance. As Ming Lei pointed out, this should be based on multipage bvec support, which hasn't been merged yet. So this patch is only for testing the functionality of the other patches in the series. And will be reimplemented after multipage bvec support is merged. Link: http://lkml.kernel.org/r/20170724051840.2309-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Hugh Dickins Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Shaohua Li Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bio.h | 8 ++++++++ include/linux/page-flags.h | 4 ++-- include/linux/vm_event_item.h | 1 + mm/page_io.c | 21 ++++++++++++++++----- mm/vmstat.c | 1 + 5 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b1cf4ba0902..1f0720de8990 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -38,7 +38,15 @@ #define BIO_BUG_ON #endif +#ifdef CONFIG_THP_SWAP +#if HPAGE_PMD_NR > 256 +#define BIO_MAX_PAGES HPAGE_PMD_NR +#else #define BIO_MAX_PAGES 256 +#endif +#else +#define BIO_MAX_PAGES 256 +#endif #define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d33e3280c8ad..ba2d470d2d0a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. 
-TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) - TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) +TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) + TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 37e8d31a4632..c75024e80eed 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -85,6 +85,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, + THP_SWPOUT, #endif #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE,
diff --git a/mm/page_io.c b/mm/page_io.c index 5f61b54ee1f3..20139b90125a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -28,16 +28,18 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { + int i, nr = hpage_nr_pages(page); struct bio *bio; - bio = bio_alloc(gfp_flags, 1); + bio = bio_alloc(gfp_flags, nr); if (bio) { bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_end_io = end_io; - bio_add_page(bio, page, PAGE_SIZE, 0); - BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); + for (i = 0; i < nr; i++) + bio_add_page(bio, page + i, PAGE_SIZE, 0); + VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr); } return bio; } @@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page) return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); } +static inline void count_swpout_vm_event(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (unlikely(PageTransHuge(page))) + count_vm_event(THP_SWPOUT); +#endif + count_vm_events(PSWPOUT, hpage_nr_pages(page)); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); return 0; } @@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); submit_bio(bio);
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9a4441bbeef2..bccf426453cd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1071,6 +1071,7 @@ const char * const vmstat_text[] = { #endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", + "thp_swpout", #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", -- cgit v1.2.3
From 59807685a7e77e8c8fe5925613968841538d53d7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:34 -0700 Subject: mm, THP, swap: support splitting THP for THP swap out After adding swap-out support for THP (Transparent Huge Page), it is possible that a THP in the swap cache (partly swapped out) needs to be split. To split such a THP, the swap cluster backing it needs to be split too; that is, the CLUSTER_FLAG_HUGE flag needs to be cleared for the swap cluster. This patch implements that. Because writing a THP to swap requires that it stay a huge page for the duration of the write, the PageWriteback flag is checked before splitting.
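In outline, the resulting split path looks like the hypothetical sketch below (a simplified wrapper for illustration only; the real changes are in the split_huge_page_to_list() and swapfile.c hunks that follow):

	/* simplified sketch, not the literal kernel code */
	static int split_thp_in_swap_cache(struct page *head)
	{
		swp_entry_t entry;

		if (PageWriteback(head))
			return -EBUSY;	/* must stay huge while written out */

		/* ... split the compound page itself ... */

		if (PageSwapCache(head)) {
			entry.val = page_private(head);
			/* clear CLUSTER_FLAG_HUGE on the backing cluster */
			return split_swap_cluster(entry);
		}
		return 0;
	}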
Link: http://lkml.kernel.org/r/20170724051840.2309-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 9 +++++++++ mm/huge_memory.c | 10 +++++++++- mm/swapfile.c | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7176ba780e83..461cf107ad52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -527,6 +527,15 @@ static inline swp_entry_t get_swap_page(struct page *page) #endif /* CONFIG_SWAP */ +#ifdef CONFIG_THP_SWAP +extern int split_swap_cluster(swp_entry_t entry); +#else +static inline int split_swap_cluster(swp_entry_t entry) +{ + return 0; +} +#endif + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 02dfe635c9fe..772048c233d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2481,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); + if (PageWriteback(page)) + return -EBUSY; + if (PageAnon(head)) { /* * The caller does not necessarily hold an mmap_sem that would @@ -2558,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); __split_huge_page(page, list, flags); - ret = 0; + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + ret = split_swap_cluster(entry); + } else + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", diff --git a/mm/swapfile.c b/mm/swapfile.c index 267b1fe41844..42eff9e4e972 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1216,6 +1216,21 @@ static void swapcache_free_cluster(swp_entry_t entry) } } } + +int split_swap_cluster(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = _swap_info_get(entry); + if (!si) + return -EBUSY; + ci = lock_cluster(si, offset); + cluster_clear_huge(ci); + unlock_cluster(ci); + return 0; +} #else static inline void swapcache_free_cluster(swp_entry_t entry) { -- cgit v1.2.3 From fe490cc0fe9e6ee48cc48bb5dc463bc5f0f1428f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:52 -0700 Subject: mm, THP, swap: add THP swapping out fallback counting When swapping out THP (Transparent Huge Page), instead of swapping out the THP as a whole, sometimes we have to fallback to split the THP into normal pages before swapping, because no free swap clusters are available, or cgroup limit is exceeded, etc. To count the number of the fallback, a new VM event THP_SWPOUT_FALLBACK is added, and counted when we fallback to split the THP. Link: http://lkml.kernel.org/r/20170724051840.2309-13-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . 
Shutemov" Cc: Michal Hocko Cc: Dan Williams Cc: Jens Axboe Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/vmscan.c | 3 +++ mm/vmstat.c | 1 + 3 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c75024e80eed..e02820fc2861 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -86,6 +86,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, THP_SWPOUT, + THP_SWPOUT_FALLBACK, #endif #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, diff --git a/mm/vmscan.c b/mm/vmscan.c index 6fbf707c0ce2..13d711dd8776 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1154,6 +1154,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (split_huge_page_to_list(page, page_list)) goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_vm_event(THP_SWPOUT_FALLBACK); +#endif if (!add_to_swap(page)) goto activate_locked; } diff --git a/mm/vmstat.c b/mm/vmstat.c index bccf426453cd..e131b51654c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1072,6 +1072,7 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", "thp_swpout", + "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", -- cgit v1.2.3 From 8d10396342063c79e92c4e46215370ab7b988569 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 6 Sep 2017 16:23:02 -0700 Subject: userfaultfd: shmem: add shmem_mfill_zeropage_pte for userfaultfd support shmem_mfill_zeropage_pte is the low level routine that implements the userfaultfd UFFDIO_ZEROPAGE command. Since for shmem mappings zero pages are always allocated and accounted, the new method is a slight extension of the existing shmem_mcopy_atomic_pte. Link: http://lkml.kernel.org/r/1497939652-16528-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 6 +++++ mm/shmem.c | 62 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a7d6bd2a918f..b6c3540e07bc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); +extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr); #else #define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) +#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ + dst_addr) ({ BUG(); 0; }) #endif #endif diff --git a/mm/shmem.c b/mm/shmem.c index b7d84c4f2a5c..64bdc91187f7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2207,12 +2207,13 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) +static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2235,17 +2236,22 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out_unacct_blocks; - page_kaddr = kmap_atomic(page); - ret = copy_from_user(page_kaddr, (const void __user *)src_addr, - PAGE_SIZE); - kunmap_atomic(page_kaddr); - - /* fallback to copy_from_user outside mmap_sem */ - if (unlikely(ret)) { - *pagep = page; - shmem_inode_unacct_blocks(inode, 1); - /* don't free the page */ - return -EFAULT; + if (!zeropage) { /* mcopy_atomic */ + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + shmem_inode_unacct_blocks(inode, 1); + /* don't free the page */ + return -EFAULT; + } + } else { /* mfill_zeropage_atomic */ + clear_highpage(page); } } else { page = *pagep; @@ -2311,6 +2317,28 @@ out_unacct_blocks: goto out; } +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, false, pagep); +} + +int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct page *page = NULL; + + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, 0, true, &page); +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -- cgit v1.2.3 From a446d6f9ce545248c62ab9694d05ac6f68563706 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 6 Sep 2017 16:23:22 -0700 Subject: include/linux/fs.h: remove unneeded 
forward definition of mm_struct Link: http://lkml.kernel.org/r/20170525102927.6163-1-jlayton@redhat.com Signed-off-by: Jeff Layton Reviewed-by: Jan Kara Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index cc2e0f5a8fd1..baea880c5c03 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1270,8 +1270,6 @@ extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); -struct mm_struct; - /* * Umount options */ -- cgit v1.2.3 From e652f694598273c5d749687032d1534a30e6a3a5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:23:25 -0700 Subject: mm: hugetlb: define system call hugetlb size encodings in single file Patch series "Consolidate system call hugetlb page size encodings". These patches are the result of discussions in https://lkml.org/lkml/2017/3/8/548. The following changes are made in the patch set: 1) Put all the log2 encoded huge page size definitions in a common header file. The idea is to have a set of definitions that can be used as the basis for system call specific definitions such as MAP_HUGE_* and SHM_HUGE_*. 2) Remove MAP_HUGE_* definitions in arch specific files. All these definitions are the same. Consolidate all definitions in the primary user header file (uapi/linux/mman.h). 3) Remove SHM_HUGE_* definitions intended for user space from the kernel header file, and add them to the user (uapi/linux/shm.h) header file. Add definitions for all known huge page size encodings as in mmap. This patch (of 3): If hugetlb pages are requested in mmap or shmget system calls, a huge page size other than the default can be requested. This is accomplished by encoding the log2 of the huge page size in the upper bits of the flag argument. asm-generic and arch specific headers all define the same values for these encodings. Put common definitions in a single header file. The primary uapi header files for mmap and shm will use these definitions as a basis for definitions specific to those system calls. Link: http://lkml.kernel.org/r/1501527386-10736-2-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Matthew Wilcox Cc: Andi Kleen Cc: Michael Kerrisk Cc: Davidlohr Bueso Cc: Anshuman Khandual Cc: Aneesh Kumar K.V Cc: Andrea Arcangeli Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/hugetlb_encode.h | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/uapi/asm-generic/hugetlb_encode.h (limited to 'include') diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h new file mode 100644 index 000000000000..e4732d3c2998 --- /dev/null +++ b/include/uapi/asm-generic/hugetlb_encode.h @@ -0,0 +1,34 @@ +#ifndef _ASM_GENERIC_HUGETLB_ENCODE_H_ +#define _ASM_GENERIC_HUGETLB_ENCODE_H_ + +/* + * Several system calls take a flag to request "hugetlb" huge pages. + * Without further specification, these system calls will use the + * system's default huge page size. If a system supports multiple + * huge page sizes, the desired huge page size can be specified in + * bits [26:31] of the flag arguments. The value in these 6 bits + * will encode the log2 of the huge page size. + * + * The following definitions are associated with this huge page size + * encoding in flag arguments. 
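/*
 * A hedged illustration (not part of the patch): the encoding above is
 * just "log2 of the size, shifted left by 26". The helper name below is
 * invented; only the shift value and the log2 rule come from this header.
 */
#include <assert.h>
#include <stdint.h>

/* Encode a power-of-two huge page size into bits [26:31] of a flags word. */
static uint32_t hugetlb_encode(uint64_t size)
{
        unsigned int log2 = 0;

        while ((1ULL << log2) < size)
                log2++;
        return (uint32_t)log2 << 26;    /* HUGETLB_FLAG_ENCODE_SHIFT */
}

int main(void)
{
        /* 2MB = 1 << 21, so its encoding is 21 << 26, i.e. HUGETLB_FLAG_ENCODE_2MB. */
        assert(hugetlb_encode(2ULL * 1024 * 1024) == (21U << 26));
        /* 1GB = 1 << 30, matching HUGETLB_FLAG_ENCODE_1GB. */
        assert(hugetlb_encode(1024ULL * 1024 * 1024) == (30U << 26));
        return 0;
}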
System call specific header files + * that use this encoding should include this file. They can then + * provide definitions based on these with their own specific prefix. + * for example: + * #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT + */ + +#define HUGETLB_FLAG_ENCODE_SHIFT 26 +#define HUGETLB_FLAG_ENCODE_MASK 0x3f + +#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) + +#endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ -- cgit v1.2.3 From aafd4562dfee81a40ba21b5ea3cf5e06664bc7f6 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:23:29 -0700 Subject: mm: arch: consolidate mmap hugetlb size encodings A non-default huge page size can be encoded in the flags argument of the mmap system call. The definitions for these encodings are in arch specific header files. However, all architectures use the same values. Consolidate all the definitions in the primary user header file (uapi/linux/mman.h). Include definitions for all known huge page sizes. Use the generic encoding definitions in hugetlb_encode.h as the basis for these definitions. Link: http://lkml.kernel.org/r/1501527386-10736-3-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Davidlohr Bueso Cc: Matthew Wilcox Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 11 ----------- arch/mips/include/uapi/asm/mman.h | 11 ----------- arch/parisc/include/uapi/asm/mman.h | 11 ----------- arch/powerpc/include/uapi/asm/mman.h | 16 ---------------- arch/x86/include/uapi/asm/mman.h | 3 --- arch/xtensa/include/uapi/asm/mman.h | 11 ----------- include/uapi/asm-generic/mman-common.h | 11 ----------- include/uapi/linux/mman.h | 22 ++++++++++++++++++++++ 8 files changed, 22 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 02760f6e6ca4..13b52aad3c43 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -67,17 +67,6 @@ /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. 
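/*
 * The decode direction described in the comment above, as a standalone
 * sketch (the program is an invented illustration; the shift and mask
 * values are the common ones being consolidated by this series).
 */
#include <stdio.h>

#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK  0x3f

int main(void)
{
        unsigned long flags = 21UL << MAP_HUGE_SHIFT;   /* i.e. MAP_HUGE_2MB */
        unsigned int log2sz = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;

        if (log2sz)     /* 0 means "use the default huge page size" */
                printf("requested huge page size: %lu bytes\n", 1UL << log2sz);
        else
                printf("default huge page size\n");
        return 0;
}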
- */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 655e2fb5395b..398eebcc3541 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -94,17 +94,6 @@ /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 9a9c2fe4be50..b87fbe3f338a 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -64,17 +64,6 @@ #define MAP_FILE 0 #define MAP_VARIABLE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index ab45cc2f3101..03c06ba7464f 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -29,20 +29,4 @@ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ -/* - * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), - * encode the log2 of the huge page size. A value of zero indicates that the - * default huge page size should be used. To use a non-default huge page size, - * one of these defines can be used, or the size can be encoded by hand. Note - * that on most systems only a subset, or possibly none, of these sizes will be - * available. 
- */ -#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ -#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ -#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ -#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ -#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ - #endif /* _UAPI_ASM_POWERPC_MMAN_H */ diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 39bca7fac087..3be08f07695c 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,9 +3,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* * Take the 4 protection key bits out of the vma->vm_flags diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 24365b30aae9..8ce77a2e9bab 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -106,17 +106,6 @@ /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 8c27db0c5c08..d248f3c335b5 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -61,17 +61,6 @@ /* compatibility flags */ #define MAP_FILE 0 -/* - * When MAP_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f - #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index ade4acd3a90c..a937480d7cd3 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -2,6 +2,7 @@ #define _UAPI_LINUX_MMAN_H #include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -10,4 +11,25 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +/* + * Huge page size encoding when MAP_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. 
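/*
 * A usage sketch of the consolidated flags (assumes a libc exposing
 * MAP_HUGETLB; the fallback define mirrors 21 << 26 for 2MB pages and
 * is only needed with older headers; huge pages must be reserved first,
 * e.g. via /proc/sys/vm/nr_hugepages).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB (21 << 26)
#endif

int main(void)
{
        size_t len = 2 * 1024 * 1024;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                       -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");         /* e.g. no 2MB pages reserved */
                return 1;
        }
        munmap(p, len);
        return 0;
}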
+ */ +#define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB + #endif /* _UAPI_LINUX_MMAN_H */ -- cgit v1.2.3 From 4da243ac1cf6aeb30b7c555d56208982d66d6d33 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:23:33 -0700 Subject: mm: shm: use new hugetlb size encoding definitions Use the common definitions from hugetlb_encode.h header file for encoding hugetlb size definitions in shmget system call flags. In addition, move these definitions from the internal (kernel) to user (uapi) header file. Link: http://lkml.kernel.org/r/1501527386-10736-4-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: Matthew Wilcox Acked-by: Michal Hocko Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Davidlohr Bueso Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shm.h | 17 ----------------- include/uapi/linux/shm.h | 31 +++++++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/shm.h b/include/linux/shm.h index 0fb7061ec54c..21a5e6c43385 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -27,23 +27,6 @@ struct shmid_kernel /* private to the kernel */ /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ #define SHM_LOCKED 02000 /* segment will not be swapped */ -#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ -#define SHM_NORESERVE 010000 /* don't check for reservations */ - -/* Bits [26:31] are reserved */ - -/* - * When SHM_HUGETLB is set bits [26:31] encode the log2 of the huge page size. - * This gives us 6 bits, which is enough until someone invents 128 bit address - * spaces. - * - * Assume these are all power of twos. - * When 0 use the default page size. - */ -#define SHM_HUGE_SHIFT 26 -#define SHM_HUGE_MASK 0x3f -#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) -#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #ifdef CONFIG_SYSVIPC struct sysv_shm { diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 1fbf24ea37fd..cf23c873719d 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -3,6 +3,7 @@ #include #include +#include #ifndef __KERNEL__ #include #endif @@ -40,11 +41,37 @@ struct shmid_ds { /* Include the definition of shmid64_ds and shminfo64 */ #include -/* permission flag for shmget */ +/* + * shmget() shmflg values. + */ +/* The bottom nine bits are the same as open(2) mode flags */ #define SHM_R 0400 /* or S_IRUGO from */ #define SHM_W 0200 /* or S_IWUGO from */ +/* Bits 9 & 10 are IPC_CREAT and IPC_EXCL */ +#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#define SHM_NORESERVE 010000 /* don't check for reservations */ + +/* + * Huge page size encoding when SHM_HUGETLB is specified, and a huge page + * size other than the default is desired. 
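/*
 * The shmget() analogue, again as a hedged sketch (minimal error
 * handling; SHM_HUGE_2MB falls back to the raw encoding for older
 * headers).
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGE_2MB
#define SHM_HUGE_2MB (21 << 26)
#endif

int main(void)
{
        /* One 2MB-backed SysV segment; needs huge pages reserved by the admin. */
        int id = shmget(IPC_PRIVATE, 2 * 1024 * 1024,
                        IPC_CREAT | SHM_HUGETLB | SHM_HUGE_2MB | 0600);

        if (id < 0) {
                perror("shmget");
                return 1;
        }
        shmctl(id, IPC_RMID, NULL);     /* mark the segment for removal */
        return 0;
}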
See hugetlb_encode.h + */ +#define SHM_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define SHM_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define SHM_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define SHM_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define SHM_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB -/* mode for attach */ +/* + * shmat() shmflg values + */ #define SHM_RDONLY 010000 /* read-only access */ #define SHM_RND 020000 /* round attach address to SHMLBA boundary */ #define SHM_REMAP 040000 /* take-over region on attach */ -- cgit v1.2.3 From c41f012ade0b95b0a6e25c7150673e0554736165 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:23:36 -0700 Subject: mm: rename global_page_state to global_zone_page_state global_page_state is error prone as a recent bug report pointed out [1]. It only returns proper values for zone based counters as the enum it gets suggests. We already have global_node_page_state so let's rename global_page_state to global_zone_page_state to be more explicit here. All existing users seems to be correct: $ git grep "global_page_state(NR_" | sed 's@.*(\(NR_[A-Z_]*\)).*@\1@' | sort | uniq -c 2 NR_BOUNCE 2 NR_FREE_CMA_PAGES 11 NR_FREE_PAGES 1 NR_KERNEL_STACK_KB 1 NR_MLOCK 2 NR_PAGETABLE This patch shouldn't introduce any functional change. [1] http://lkml.kernel.org/r/201707260628.v6Q6SmaS030814@www262.sakura.ne.jp Link: http://lkml.kernel.org/r/20170801134256.5400-2-hannes@cmpxchg.org Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Tetsuo Handa Cc: Josef Bacik Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 10 +++++----- include/linux/swap.h | 4 ++-- include/linux/vmstat.h | 4 ++-- mm/mmap.c | 6 +++--- mm/nommu.c | 4 ++-- mm/page-writeback.c | 4 ++-- mm/page_alloc.c | 12 ++++++------ mm/util.c | 2 +- mm/vmstat.c | 4 ++-- 9 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 509a61668d90..cdd979724c74 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); - show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); @@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", - global_page_state(NR_KERNEL_STACK_KB)); + global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", - global_page_state(NR_PAGETABLE)); + global_zone_page_state(NR_PAGETABLE)); #ifdef CONFIG_QUICKLIST show_val_kb(m, "Quicklists: ", quicklist_total_size()); #endif @@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); show_val_kb(m, "Bounce: ", - global_page_state(NR_BOUNCE)); + global_zone_page_state(NR_BOUNCE)); 
show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); @@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); #endif hugetlb_report_meminfo(m); diff --git a/include/linux/swap.h b/include/linux/swap.h index 461cf107ad52..76f1632eea5a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -263,8 +263,8 @@ extern unsigned long totalreserve_pages; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); -/* Definition of global_page_state not available yet */ -#define nr_free_pages() global_page_state(NR_FREE_PAGES) +/* Definition of global_zone_page_state not available yet */ +#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index b3d85f30d424..97e11ab573f0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat, atomic_long_add(x, &vm_node_stat[item]); } -static inline unsigned long global_page_state(enum zone_stat_item item) +static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP @@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node, extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); #else -#define sum_zone_node_page_state(node, item) global_page_state(item) +#define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #endif /* CONFIG_NUMA */ diff --git a/mm/mmap.c b/mm/mmap.c index f19efcf75418..9800e29763f4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3514,7 +3514,7 @@ static int init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -3535,7 +3535,7 @@ static int init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; @@ -3579,7 +3579,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, break; case MEM_OFFLINE: - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); diff --git a/mm/nommu.c b/mm/nommu.c index fc184f597d59..53d5175a5c14 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 
10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bf050ab025b7..0b9c5cbe8eba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES); + x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to @@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive - * global_page_state() too often. So scale it near-sqrt to the safety margin + * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0bea94af0423..a4562c058ec4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4509,7 +4509,7 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. */ - available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will @@ -4538,7 +4538,7 @@ void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_page_state(NR_FREE_PAGES); + val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); @@ -4673,11 +4673,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_page_state(NR_FREE_PAGES), + global_zone_page_state(NR_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_zone_page_state(NR_FREE_PAGES), free_pcp, - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) diff --git a/mm/util.c b/mm/util.c index 9ecddf568fe3..34e57fae959d 100644 --- a/mm/util.c +++ b/mm/util.c @@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); + free = global_zone_page_state(NR_FREE_PAGES); free += global_node_page_state(NR_FILE_PAGES); /* diff --git a/mm/vmstat.c b/mm/vmstat.c index e131b51654c7..ba9b202e8500 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1502,7 +1502,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) @@ -1591,7 +1591,7 @@ int vmstat_refresh(struct ctl_table *table, int write, * which can equally be echo'ed to or cat'ted from (by root), 
* can be used to update the stats just before reading them. * - * Oh, and since global_page_state() etc. are so careful to hide + * Oh, and since global_zone_page_state() etc. are so careful to hide * transiently negative values, report an error here if any of * the stats is negative, so we know to go looking for imbalance. */ -- cgit v1.2.3 From 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e Mon Sep 17 00:00:00 2001 From: Prakash Sangappa Date: Wed, 6 Sep 2017 16:23:39 -0700 Subject: mm: userfaultfd: add feature to request for a signal delivery In some cases, the userfaultfd mechanism should just deliver a SIGBUS signal to the faulting process, instead of the page-fault event. Dealing with the page-fault event using a monitor thread can be an overhead in these cases. For example, applications like the database could use the signaling mechanism for robustness purposes. The database uses hugetlbfs for performance reasons. Files on the hugetlbfs filesystem are created and huge pages allocated using the fallocate() API. Pages are deallocated/freed using the fallocate() hole punching support. These files are mmapped and accessed by many processes as shared memory. The database keeps track of which offsets in the hugetlbfs file have pages allocated. Any access to a mapped address over holes in the file, which can occur due to bugs in the application, is considered invalid, and the process is expected to simply receive a SIGBUS. However, currently when a hole in the file is accessed via the mapped address, kernel/mm attempts to automatically allocate a page at page fault time, resulting in implicitly filling the hole in the file. This may not be the desired behavior for applications like the database that want to explicitly manage page allocations of hugetlbfs files. Using the userfaultfd mechanism with this support to get a signal, the database application can prevent pages from being allocated implicitly when processes access mapped addresses over holes in the file. This patch adds the UFFD_FEATURE_SIGBUS feature to the userfaultfd mechanism to request a SIGBUS signal. See the following for previous discussion about the database requirement leading to this proposal, as suggested by Andrea. http://www.spinics.net/lists/linux-mm/msg129224.html Link: http://lkml.kernel.org/r/1501552446-748335-2-git-send-email-prakash.sangappa@oracle.com Signed-off-by: Prakash Sangappa Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 +++ include/uapi/linux/userfaultfd.h | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 01a85e2660b8..5fd4d846691f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -370,6 +370,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + if (ctx->features & UFFD_FEATURE_SIGBUS) + goto out; + /* * If it's already released don't get it. 
This avoids to loop * in __get_user_pages if userfaultfd_release waits on the diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..d39d5db56771 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -23,7 +23,8 @@ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ - UFFD_FEATURE_MISSING_SHMEM) + UFFD_FEATURE_MISSING_SHMEM | \ + UFFD_FEATURE_SIGBUS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -153,6 +154,12 @@ struct uffdio_api { * UFFD_FEATURE_MISSING_SHMEM works the same as * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem * (i.e. tmpfs and other shmem based APIs). + * + * UFFD_FEATURE_SIGBUS feature means no page-fault + * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead + * a SIGBUS signal will be sent to the faulting process. + * The application process can enable this behavior by adding + * it to uffdio_api.features. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -161,6 +168,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_SIGBUS (1<<7) __u64 features; __u64 ioctls; -- cgit v1.2.3 From 9d4ac934829ac58c5109c49e6dfe677300e5e652 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Wed, 6 Sep 2017 16:23:56 -0700 Subject: userfaultfd: provide pid in userfault msg This could be useful for calculating per-vCPU downtime during postcopy live migration. A side observer, or the application itself, will be informed about the task's sleep during userfaultfd processing. The process's thread id is provided when the user requests it by setting the UFFD_FEATURE_THREAD_ID bit in uffdio_api.features. Link: http://lkml.kernel.org/r/20170802165145.22628-6-aarcange@redhat.com Signed-off-by: Alexey Perevalov Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Maxime Coquelin Cc: Mike Kravetz Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 8 ++++++-- include/uapi/linux/userfaultfd.h | 10 +++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5fd4d846691f..665bf7a930b2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg) static inline struct uffd_msg userfault_msg(unsigned long address, unsigned int flags, - unsigned long reason) + unsigned long reason, + unsigned int features) { struct uffd_msg msg; msg_init(&msg); @@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * write protect fault. 
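/*
 * How userspace opts in to these features, as a rough sketch: the
 * feature bits are negotiated once through UFFDIO_API. This assumes
 * linux/userfaultfd.h from a kernel carrying these patches; range
 * registration via UFFDIO_REGISTER is omitted for brevity.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        struct uffdio_api api = {
                .api = UFFD_API,
                /* UFFD_FEATURE_THREAD_ID could be OR'ed in here as well;
                 * the faulting thread id then arrives in the pagefault
                 * message's ptid field. */
                .features = UFFD_FEATURE_SIGBUS,
        };

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0) {
                perror("userfaultfd/UFFDIO_API");
                return 1;
        }
        /* Missing-page faults on ranges registered later now raise SIGBUS
         * in the faulting task instead of queueing an event for a monitor. */
        close(uffd);
        return 0;
}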
*/ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (features & UFFD_FEATURE_THREAD_ID) + msg.arg.pagefault.ptid = task_pid_vnr(current); return msg; } @@ -422,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index d39d5db56771..2b24c28d99a7 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -24,7 +24,8 @@ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ - UFFD_FEATURE_SIGBUS) + UFFD_FEATURE_SIGBUS | \ + UFFD_FEATURE_THREAD_ID) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -79,6 +80,7 @@ struct uffd_msg { struct { __u64 flags; __u64 address; + __u32 ptid; } pagefault; struct { @@ -158,8 +160,9 @@ struct uffdio_api { * UFFD_FEATURE_SIGBUS feature means no page-fault * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead * a SIGBUS signal will be sent to the faulting process. - * The application process can enable this behavior by adding - * it to uffdio_api.features. + * + * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will + * be returned, if feature is not requested 0 will be returned. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -169,6 +172,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) +#define UFFD_FEATURE_THREAD_ID (1<<8) __u64 features; __u64 ioctls; -- cgit v1.2.3 From a36985d31a65d5c0559fb582719e32eaf0ccec3b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:23:59 -0700 Subject: userfaultfd: provide pid in userfault msg - add feat union No ABI change, but this will make it more explicit to software that ptid is only available if requested by passing UFFD_FEATURE_THREAD_ID to UFFDIO_API. The fact it's a union will also self-document that it shouldn't be taken for granted there's a ptid there. Link: http://lkml.kernel.org/r/20170802165145.22628-7-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. 
David Alan Gilbert" Cc: Alexey Perevalov Cc: Maxime Coquelin Cc: Mike Kravetz Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 +- include/uapi/linux/userfaultfd.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 665bf7a930b2..5419e7da82ba 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -204,7 +204,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; if (features & UFFD_FEATURE_THREAD_ID) - msg.arg.pagefault.ptid = task_pid_vnr(current); + msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; } diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 2b24c28d99a7..d6d1f65cb3c3 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -80,7 +80,9 @@ struct uffd_msg { struct { __u64 flags; __u64 address; - __u32 ptid; + union { + __u32 ptid; + } feat; } pagefault; struct { -- cgit v1.2.3 From 749df87bd7bee5a79cef073f5d032ddb2b211de8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:24:16 -0700 Subject: mm/shmem: add hugetlbfs support to memfd_create() This patch came out of discussions in this e-mail thread: http://lkml.kernel.org/r/1499357846-7481-1-git-send-email-mike.kravetz%40oracle.com The Oracle JVM team is developing a new garbage collection model. This new model requires multiple mappings of the same anonymous memory. One straight forward way to accomplish this is with memfd_create. They can use the returned fd to create multiple mappings of the same memory. The JVM today has an option to use (static hugetlb) huge pages. If this option is specified, they would like to use the same garbage collection model requiring multiple mappings to the same memory. Using hugetlbfs, it is possible to explicitly mount a filesystem and specify file paths in order to get an fd that can be used for multiple mappings. However, this introduces additional system admin work and coordination. Ideally they would like to get a hugetlbfs fd without requiring explicit mounting of a filesystem. Today, mmap and shmget can make use of hugetlbfs without explicitly mounting a filesystem. The patch adds this functionality to memfd_create. Add a new flag MFD_HUGETLB to memfd_create() that will specify the file to be created resides in the hugetlbfs filesystem. This is the generic hugetlbfs filesystem not associated with any specific mount point. As with other system calls that request hugetlbfs backed pages, there is the ability to encode huge page size in the flag arguments. hugetlbfs does not support sealing operations, therefore specifying MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL. Of course, the memfd_man page would need updating if this type of functionality moves forward. Link: http://lkml.kernel.org/r/1502149672-7759-2-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "Kirill A . 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/memfd.h | 24 ++++++++++++++++++++++++ mm/shmem.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 534e364bda92..7f3a722dbd72 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -1,8 +1,32 @@ #ifndef _UAPI_LINUX_MEMFD_H #define _UAPI_LINUX_MEMFD_H +#include + /* flags for memfd_create(2) (unsigned int) */ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB #endif /* _UAPI_LINUX_MEMFD_H */ diff --git a/mm/shmem.c b/mm/shmem.c index 64bdc91187f7..47179bbe9ee7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,6 +34,7 @@ #include #include #include +#include #include /* for arch/microblaze update_mmu_cache() */ @@ -3652,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, @@ -3664,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create, char *name; long len; - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ + if (flags & MFD_ALLOW_SEALING) + return -EINVAL; + /* Allow huge page size encoding in flags. 
*/ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); @@ -3696,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create, goto err_name; } - file = shmem_file_setup(name, 0, VM_NORESERVE); + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } - info = SHMEM_I(file_inode(file)); file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_RDWR | O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) + + if (flags & MFD_ALLOW_SEALING) { + /* + * flags check at beginning of function ensures + * this is not a hugetlbfs (MFD_HUGETLB) file. + */ + info = SHMEM_I(file_inode(file)); info->seals &= ~F_SEAL_SEAL; + } fd_install(fd, file); kfree(name); -- cgit v1.2.3 From cbc65df240c104bf540af1ad58595bf1eaa5ee10 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:29 -0700 Subject: mm, swap: add swap readahead hit statistics Patch series "mm, swap: VMA based swap readahead", v4. The swap readahead is an important mechanism to reduce the swap in latency. Although pure sequential memory access pattern isn't very popular for anonymous memory, the space locality is still considered valid. In the original swap readahead implementation, the consecutive blocks in swap device are readahead based on the global space locality estimation. But the consecutive blocks in swap device just reflect the order of page reclaiming, don't necessarily reflect the access pattern in virtual memory space. And the different tasks in the system may have different access patterns, which makes the global space locality estimation incorrect. In this patchset, when page fault occurs, the virtual pages near the fault address will be readahead instead of the swap slots near the fault swap slot in swap device. This avoid to readahead the unrelated swap slots. At the same time, the swap readahead is changed to work on per-VMA from globally. So that the different access patterns of the different VMAs could be distinguished, and the different readahead policy could be applied accordingly. The original core readahead detection and scaling algorithm is reused, because it is an effect algorithm to detect the space locality. In addition to the swap readahead changes, some new sysfs interface is added to show the efficiency of the readahead algorithm and some other swap statistics. This new implementation will incur more small random read, on SSD, the improved correctness of estimation and readahead target should beat the potential increased overhead, this is also illustrated in the test results below. But on HDD, the overhead may beat the benefit, so the original implementation will be used by default. The test and result is as follow, Common test condition ===================== Test Machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM) Swap device: NVMe disk Micro-benchmark with combined access pattern ============================================ vm-scalability, sequential swap test case, 4 processes to eat 50G virtual memory space, repeat the sequential memory writing until 300 seconds. The first round writing will trigger swap out, the following rounds will trigger sequential swap in and out. 
At the same time, run the vm-scalability random swap test case in the background: 8 processes to eat 30G virtual memory space, repeat the random memory write until 300 seconds. This will trigger random swap-in in the background. This is a combined workload with sequential and random memory accesses at the same time. The results (for the sequential workload) are as follows:

                 Base         Optimized
                 ----         ---------
throughput       345413 KB/s  414029 KB/s (+19.9%)
latency.average  97.14 us     61.06 us (-37.1%)
latency.50th     2 us         1 us
latency.60th     2 us         1 us
latency.70th     98 us        2 us
latency.80th     160 us       2 us
latency.90th     260 us       217 us
latency.95th     346 us       369 us
latency.99th     1.34 ms      1.09 ms
ra_hit%          52.69%       99.98%

The original swap readahead algorithm is confused by the background random access workload, so its readahead hit rate is lower. The VMA-based readahead algorithm works much better.

Linpack
=======

The test memory size is bigger than RAM to trigger swapping.

              Base      Optimized
              ----      ---------
elapsed_time  393.49 s  329.88 s (-16.2%)
ra_hit%       86.21%    98.82%

The scores of the base and optimized kernels show no visible change. But the elapsed time is reduced and the readahead hit rate improved, so the optimized kernel runs better for the startup and tear down stages. And the absolute value of the readahead hit rate is high, which shows that the space locality is still valid in some practical workloads. This patch (of 5): The statistics for total readahead pages and total readahead hits are recorded and exported via the following sysfs interface:

/sys/kernel/mm/swap/ra_hits
/sys/kernel/mm/swap/ra_total

With them, the efficiency of the swap readahead could be measured, so that the swap readahead algorithm and parameters could be tuned accordingly. [akpm@linux-foundation.org: don't display swap stats if CONFIG_SWAP=n] Link: http://lkml.kernel.org/r/20170807054038.1843-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ mm/swap_state.c | 9 +++++++-- mm/vmstat.c | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e02820fc2861..d77bc35278b0 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -105,6 +105,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, VMACACHE_FIND_CALLS, VMACACHE_FIND_HITS, VMACACHE_FULL_FLUSHES, +#endif +#ifdef CONFIG_SWAP + SWAP_RA, + SWAP_RA_HIT, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/swap_state.c b/mm/swap_state.c index b68c93014f50..d1bdb31cab13 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -305,8 +305,10 @@ struct page * lookup_swap_cache(swp_entry_t entry) if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) + if (TestClearPageReadahead(page)) { atomic_inc(&swapin_readahead_hits); + count_vm_event(SWAP_RA_HIT); + } } INC_CACHE_INFO(find_total); @@ -516,8 +518,11 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr, false); if (!page) continue; - if (offset != entry_offset && likely(!PageTransCompound(page))) + if (offset != entry_offset && + likely(!PageTransCompound(page))) { SetPageReadahead(page); + count_vm_event(SWAP_RA); + } put_page(page); } blk_finish_plug(&plug); diff --git a/mm/vmstat.c b/mm/vmstat.c index 85f3a2e04adc..c7e4b8458023 100644 --- 
a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1098,6 +1098,10 @@ const char * const vmstat_text[] = { "vmacache_find_hits", "vmacache_full_flushes", #endif +#ifdef CONFIG_SWAP + "swap_ra", + "swap_ra_hit", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- cgit v1.2.3 From ec560175c0b6fce86994bdf036754d48122c5c87 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:36 -0700 Subject: mm, swap: VMA based swap readahead Swap readahead is an important mechanism to reduce swap-in latency. Although a pure sequential memory access pattern isn't very popular for anonymous memory, the space locality is still considered valid. In the original swap readahead implementation, the consecutive blocks in the swap device are read ahead based on the global space locality estimation. But the consecutive blocks in the swap device just reflect the order of page reclaiming, and don't necessarily reflect the access pattern in virtual memory. And the different tasks in the system may have different access patterns, which makes the global space locality estimation incorrect. In this patch, when a page fault occurs, the virtual pages near the fault address will be read ahead instead of the swap slots near the fault swap slot in the swap device. This avoids reading ahead unrelated swap slots. At the same time, the swap readahead is changed to work per-VMA instead of globally, so that the different access patterns of the different VMAs can be distinguished, and different readahead policies can be applied accordingly. The original core readahead detection and scaling algorithm is reused, because it is an effective algorithm for detecting space locality. The test setup and results are as follows.

Common test condition
=====================

Test Machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM) Swap device: NVMe disk

Micro-benchmark with combined access pattern
============================================

vm-scalability, sequential swap test case, 4 processes to eat 50G virtual memory space, repeat the sequential memory writing until 300 seconds. The first round of writing will trigger swap out, the following rounds will trigger sequential swap in and out.
And the absolute value of readahead hit rate is high, shows that the space locality is still valid in some practical workloads. Link: http://lkml.kernel.org/r/20170807054038.1843-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 1 + include/linux/swap.h | 57 ++++++++++++- mm/memory.c | 23 +++-- mm/shmem.c | 2 +- mm/swap_state.c | 215 +++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 273 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 57378c7cb5f8..f45ad815b7d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -335,6 +335,7 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + atomic_long_t swap_readahead_info; #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 76f1632eea5a..61d63379e956 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -251,6 +251,25 @@ struct swap_info_struct { struct swap_cluster_list discard_clusters; /* discard clusters list */ }; +#ifdef CONFIG_64BIT +#define SWAP_RA_ORDER_CEILING 5 +#else +/* Avoid stack overflow, because we need to save part of page table */ +#define SWAP_RA_ORDER_CEILING 3 +#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING) +#endif + +struct vma_swap_readahead { + unsigned short win; + unsigned short offset; + unsigned short nr_pte; +#ifdef CONFIG_64BIT + pte_t *ptes; +#else + pte_t ptes[SWAP_RA_PTE_CACHE_SIZE]; +#endif +}; + /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); @@ -350,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) extern struct address_space *swapper_spaces[]; +extern bool swap_vma_readahead; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ >> SWAP_ADDRESS_SPACE_SHIFT]) @@ -362,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); -extern struct page *lookup_swap_cache(swp_entry_t); +extern struct page *lookup_swap_cache(swp_entry_t entry, + struct vm_area_struct *vma, + unsigned long addr); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool do_poll); @@ -372,6 +394,17 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); +extern struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra); +extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra); + +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(swap_vma_readahead); +} + /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -466,12 +499,32 @@ static 
inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline bool swap_use_vma_readahead(void) +{ + return false; +} + +static inline struct page *swap_readahead_detect( + struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) +{ + return NULL; +} + +static inline struct page *do_swap_page_readahead( + swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) +{ + return NULL; +} + static inline int swap_writepage(struct page *p, struct writeback_control *wbc) { return 0; } -static inline struct page *lookup_swap_cache(swp_entry_t swp) +static inline struct page *lookup_swap_cache(swp_entry_t swp, + struct vm_area_struct *vma, + unsigned long addr) { return NULL; } diff --git a/mm/memory.c b/mm/memory.c index 3dd8bb46391b..e87953775e3c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2752,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page, *swapcache; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; + struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; + bool vma_readahead = swap_use_vma_readahead(); - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (vma_readahead) + page = swap_readahead_detect(vmf, &swap_ra); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { + if (page) + put_page(page); goto out; + } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { @@ -2777,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf) goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); - page = lookup_swap_cache(entry); + if (!page) + page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, + vmf->address); if (!page) { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, - vmf->address); + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte diff --git a/mm/shmem.c b/mm/shmem.c index 47179bbe9ee7..ace53a582be5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1650,7 +1650,7 @@ repeat: if (swap.val) { /* Look it up and read it in.. */ - page = lookup_swap_cache(swap); + page = lookup_swap_cache(swap, NULL, 0); if (!page) { /* Or update major stats only when swapin succeeds?? 
*/ if (fault_type) { diff --git a/mm/swap_state.c b/mm/swap_state.c index a901afe9da61..3885fef7bdf5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; +bool swap_vma_readahead = true; + +#define SWAP_RA_MAX_ORDER_DEFAULT 3 + +static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; + +#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) +#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) +#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK +#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) + +#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) +#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) +#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) + +#define SWAP_RA_VAL(addr, win, hits) \ + (((addr) & PAGE_MASK) | \ + (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ + ((hits) & SWAP_RA_HITS_MASK)) + +/* Initial readahead hits is 4 to start up with a small window */ +#define GET_SWAP_RA_VAL(vma) \ + (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) #define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) @@ -297,21 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr) * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + unsigned long ra_info; + int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page && likely(!PageTransCompound(page))) { + INC_CACHE_INFO(find_total); + if (page) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) { - atomic_inc(&swapin_readahead_hits); + if (unlikely(PageTransCompound(page))) + return page; + readahead = TestClearPageReadahead(page); + if (vma) { + ra_info = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_info); + hits = SWAP_RA_HITS(ra_info); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); + } + if (readahead) { count_vm_event(SWAP_RA_HIT); + if (!vma) + atomic_inc(&swapin_readahead_hits); } } - - INC_CACHE_INFO(find_total); return page; } @@ -426,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return retpage; } -static unsigned long swapin_nr_pages(unsigned long offset) +static unsigned int __swapin_nr_pages(unsigned long prev_offset, + unsigned long offset, + int hits, + int max_pages, + int prev_win) { - static unsigned long prev_offset; - unsigned int pages, max_pages, last_ra; - static atomic_t last_readahead_pages; - - max_pages = 1 << READ_ONCE(page_cluster); - if (max_pages <= 1) - return 1; + unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. 
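 * (Editor's note, an illustrative addition rather than part of the
 * original comment: "hits" counts readahead pages that were actually
 * faulted in since the last window update.  With no hits, the window of
 * 2 collapses to a single page unless the fault is adjacent to the
 * previous offset; with hits, the window is rounded up, clamped to
 * max_pages, and never allowed to fall below half the previous window.)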
*/ - pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get @@ -450,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset) */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; - prev_offset = offset; } else { unsigned int roundup = 4; while (roundup < pages) @@ -462,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset) pages = max_pages; /* Don't shrink readahead too fast */ - last_ra = atomic_read(&last_readahead_pages) / 2; + last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; + + return pages; +} + +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int hits, pages, max_pages; + static atomic_t last_readahead_pages; + + max_pages = 1 << READ_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + hits = atomic_xchg(&swapin_readahead_hits, 0); + pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + atomic_read(&last_readahead_pages)); + if (!hits) + prev_offset = offset; atomic_set(&last_readahead_pages, pages); return pages; @@ -570,3 +624,130 @@ void exit_swap_address_space(unsigned int type) synchronize_rcu(); kvfree(spaces); } + +static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, + unsigned long faddr, + unsigned long lpfn, + unsigned long rpfn, + unsigned long *start, + unsigned long *end) +{ + *start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + *end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); +} + +struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long swap_ra_info; + struct page *page; + swp_entry_t entry; + unsigned long faddr, pfn, fpfn; + unsigned long start, end; + pte_t *pte; + unsigned int max_win, hits, prev_win, win, left; +#ifndef CONFIG_64BIT + pte_t *tpte; +#endif + + faddr = vmf->address; + entry = pte_to_swp_entry(vmf->orig_pte); + if ((unlikely(non_swap_entry(entry)))) + return NULL; + page = lookup_swap_cache(entry, vma, faddr); + if (page) + return page; + + max_win = 1 << READ_ONCE(swap_ra_max_order); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + + fpfn = PFN_DOWN(faddr); + swap_ra_info = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); + prev_win = SWAP_RA_WIN(swap_ra_info); + hits = SWAP_RA_HITS(swap_ra_info); + swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(faddr, win, 0)); + + if (win == 1) + return NULL; + + /* Copy the PTEs because the page table may be unmapped */ + if (fpfn == pfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); + else if (pfn == fpfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, + &start, &end); + else { + left = (win - 1) / 2; + swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, + &start, &end); + } + swap_ra->nr_pte = end - start; + swap_ra->offset = fpfn - start; + pte = vmf->pte - swap_ra->offset; +#ifdef CONFIG_64BIT + swap_ra->ptes = pte; +#else + tpte = swap_ra->ptes; + for (pfn = start; pfn != end; pfn++) + *tpte++ = *pte++; +#endif + + return NULL; +} + +struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct blk_plug plug; + struct vm_area_struct *vma = vmf->vma; 
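+	/*
+	 * (Editor's note on do_swap_page_readahead: the loop below walks
+	 * the PTEs copied by swap_readahead_detect(), starts asynchronous
+	 * reads for every swapped-out entry in the window, tags all but
+	 * the faulting page with PageReadahead, and finally reads the
+	 * faulting entry itself, polling for completion only when the
+	 * window is a single page.)
+	 */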
+ struct page *page; + pte_t *pte, pentry; + swp_entry_t entry; + unsigned int i; + bool page_allocated; + + if (swap_ra->win == 1) + goto skip; + + blk_start_plug(&plug); + for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + i++, pte++) { + pentry = *pte; + if (pte_none(pentry)) + continue; + if (pte_present(pentry)) + continue; + entry = pte_to_swp_entry(pentry); + if (unlikely(non_swap_entry(entry))) + continue; + page = __read_swap_cache_async(entry, gfp_mask, vma, + vmf->address, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false); + if (i != swap_ra->offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + lru_add_drain(); +skip: + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, + swap_ra->win == 1); +} -- cgit v1.2.3 From 81a0298bdfab0203d360df7c9bf690d1d457f999 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:43 -0700 Subject: mm, swap: don't use VMA based swap readahead if HDD is used as swap

VMA based swap readahead reads ahead virtual pages that are contiguous in the virtual address space, while the original swap readahead reads ahead swap slots that are contiguous in the swap device. Although VMA based swap readahead selects more relevant slots to read ahead, it triggers more small random reads, which may heavily degrade the performance of an HDD (hard disk) used as swap and may finally outweigh the benefit. To avoid this issue, this patch disables VMA based swap readahead when an HDD is used as swap and uses the original swap readahead instead.

Link: http://lkml.kernel.org/r/20170807054038.1843-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++++++----- mm/swapfile.c | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 61d63379e956..9c4ae6f14eea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -400,16 +400,17 @@ extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf, struct vma_swap_readahead *swap_ra); -static inline bool swap_use_vma_readahead(void) -{ - return READ_ONCE(swap_vma_readahead); -} - /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + /* Swap 50% full? Release swapcache more aggressively..
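 * (Editor's note, not part of this comment in the patch: the
 * swap_use_vma_readahead() hunk above now also requires that no
 * rotational swap device is active; nr_rotate_swap is incremented at
 * swapon time and decremented at swapoff time in the mm/swapfile.c
 * hunks that follow.)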
*/ static inline bool vm_swap_full(void) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 42eff9e4e972..4f8b3e08a547 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); +atomic_t nr_rotate_swap = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -2569,6 +2571,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); + if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + atomic_dec(&nr_rotate_swap); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); @@ -3145,7 +3150,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } - } + } else + atomic_inc(&nr_rotate_swap); error = swap_cgroup_swapon(p->type, maxpages); if (error) -- cgit v1.2.3 From a2468cc9bfdff6139f59ca896671e5819ff5f94a Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 6 Sep 2017 16:24:57 -0700 Subject: swap: choose swap device according to numa node If the system has more than one swap device and swap device has the node information, we can make use of this information to decide which swap device to use in get_swap_pages() to get better performance. The current code uses a priority based list, swap_avail_list, to decide which swap device to use and if multiple swap devices share the same priority, they are used round robin. This patch changes the previous single global swap_avail_list into a per-numa-node list, i.e. for each numa node, it sees its own priority based list of available swap devices. Swap device's priority can be promoted on its matching node's swap_avail_list. The current swap device's priority is set as: user can set a >=0 value, or the system will pick one starting from -1 then downwards. The priority value in the swap_avail_list is the negated value of the swap device's due to plist being sorted from low to high. The new policy doesn't change the semantics for priority >=0 cases, the previous starting from -1 then downwards now becomes starting from -2 then downwards and -1 is reserved as the promoted value. Take 4-node EX machine as an example, suppose 4 swap devices are available, each sit on a different node: swapA on node 0 swapB on node 1 swapC on node 2 swapD on node 3 After they are all swapped on in the sequence of ABCD. 
Current behaviour: their priorities will be:

	swapA: -1
	swapB: -2
	swapC: -3
	swapD: -4

And their position in the global swap_avail_list will be:

	swapA   -> swapB   -> swapC   -> swapD
	prio:1     prio:2     prio:3     prio:4

New behaviour: their priorities will be (note that -1 is skipped):

	swapA: -2
	swapB: -3
	swapC: -4
	swapD: -5

And their positions in the 4 swap_avail_lists[nid] will be:

	swap_avail_lists[0]: /* node 0's available swap device list */
		swapA   -> swapB   -> swapC   -> swapD
		prio:1     prio:3     prio:4     prio:5
	swap_avail_lists[1]: /* node 1's available swap device list */
		swapB   -> swapA   -> swapC   -> swapD
		prio:1     prio:2     prio:4     prio:5
	swap_avail_lists[2]: /* node 2's available swap device list */
		swapC   -> swapA   -> swapB   -> swapD
		prio:1     prio:2     prio:3     prio:5
	swap_avail_lists[3]: /* node 3's available swap device list */
		swapD   -> swapA   -> swapB   -> swapC
		prio:1     prio:2     prio:3     prio:4

To see the effect of the patch, a test that starts N processes is used: each process mmaps a region of anonymous memory and then continually writes to it at random positions, triggering both swap-in and swap-out. On a 2-node Skylake EP machine with 64GiB memory, two 170GB SSD drives are used as swap devices, each attached to a different node. The results are:

	runtime=30m/processes=32/total test size=128G/each process mmap region=4G
	kernel         throughput
	vanilla        13306
	auto-binding   15169   +14%

	runtime=30m/processes=64/total test size=128G/each process mmap region=2G
	kernel         throughput
	vanilla        11885
	auto-binding   14879   +25%

[aaron.lu@intel.com: v2] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com [akpm@linux-foundation.org: use kmalloc_array()] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com Signed-off-by: Aaron Lu Cc: "Chen, Tim C" Cc: Huang Ying Cc: Andi Kleen Cc: Michal Hocko Cc: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/swap_numa.txt | 69 ++++++++++++++++++++++++ include/linux/swap.h | 2 +- mm/swapfile.c | 120 ++++++++++++++++++++++++++++++++--------- 3 files changed, 164 insertions(+), 27 deletions(-) create mode 100644 Documentation/vm/swap_numa.txt (limited to 'include') diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt new file mode 100644 index 000000000000..d5960c9124f5 --- /dev/null +++ b/Documentation/vm/swap_numa.txt @@ -0,0 +1,69 @@ +Automatically bind swap device to numa node +------------------------------------------- + +If the system has more than one swap device and swap device has the node +information, we can make use of this information to decide which swap +device to use in get_swap_pages() to get better performance. + + +How to use this feature +----------------------- + +Swap device has priority and that decides the order of it to be used. To make +use of automatically binding, there is no need to manipulate priority settings +for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and +swapB, with swapA attached to node 0 and swapB attached to node 1, are going +to be swapped on. Simply swapping them on by doing: +# swapon /dev/swapA +# swapon /dev/swapB + +Then node 0 will use the two swap devices in the order of swapA then swapB and +node 1 will use the two swap devices in the order of swapB then swapA. Note +that the order of them being swapped on doesn't matter. + +A more complex example on a 4 node machine.
Assume 6 swap devices are going to +be swapped on: swapA and swapB are attached to node 0, swapC is attached to +node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. +The way to swap them on is the same as above: +# swapon /dev/swapA +# swapon /dev/swapB +# swapon /dev/swapC +# swapon /dev/swapD +# swapon /dev/swapE +# swapon /dev/swapF + +Then node 0 will use them in the order of: +swapA/swapB -> swapC -> swapD -> swapE -> swapF +swapA and swapB will be used in a round robin mode before any other swap device. + +node 1 will use them in the order of: +swapC -> swapA -> swapB -> swapD -> swapE -> swapF + +node 2 will use them in the order of: +swapD/swapE -> swapA -> swapB -> swapC -> swapF +Similaly, swapD and swapE will be used in a round robin mode before any +other swap devices. + +node 3 will use them in the order of: +swapF -> swapA -> swapB -> swapC -> swapD -> swapE + + +Implementation details +---------------------- + +The current code uses a priority based list, swap_avail_list, to decide +which swap device to use and if multiple swap devices share the same +priority, they are used round robin. This change here replaces the single +global swap_avail_list with a per-numa-node list, i.e. for each numa node, +it sees its own priority based list of available swap devices. Swap +device's priority can be promoted on its matching node's swap_avail_list. + +The current swap device's priority is set as: user can set a >=0 value, +or the system will pick one starting from -1 then downwards. The priority +value in the swap_avail_list is the negated value of the swap device's +due to plist being sorted from low to high. The new policy doesn't change +the semantics for priority >=0 cases, the previous starting from -1 then +downwards now becomes starting from -2 then downwards and -1 is reserved +as the promoted value. So if multiple swap devices are attached to the same +node, they will all be promoted to priority -1 on that node's plist and will +be used round robin before any other swap devices. diff --git a/include/linux/swap.h b/include/linux/swap.h index 9c4ae6f14eea..8bf3487fb204 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ - struct plist_node avail_list; /* entry in swap_avail_head */ + struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f8b3e08a547..d483278ee35b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority; +static int least_priority = -1; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. 
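 * (Editor's note: after this patch a swap device sits on one availability
 * plist per NUMA node -- avail_lists[MAX_NUMNODES] -- and every one of
 * those per-node lists is still protected by this same swap_avail_lock.)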
*/ -static PLIST_HEAD(swap_avail_head); +struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -592,6 +592,21 @@ new_cluster: return found_free; } +static void __del_from_avail_list(struct swap_info_struct *p) +{ + int nid; + + for_each_node(nid) + plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); +} + +static void del_from_avail_list(struct swap_info_struct *p) +{ + spin_lock(&swap_avail_lock); + __del_from_avail_list(p); + spin_unlock(&swap_avail_lock); +} + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(si); } } +static void add_to_avail_list(struct swap_info_struct *p) +{ + int nid; + + spin_lock(&swap_avail_lock); + for_each_node(nid) { + WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); + } + spin_unlock(&swap_avail_lock); +} + static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, bool was_full = !si->highest_bit; si->highest_bit = end; - if (was_full && (si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&si->avail_list)); - if (plist_node_empty(&si->avail_list)) - plist_add(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + if (was_full && (si->flags & SWP_WRITEOK)) + add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); si->inuse_pages -= nr_entries; @@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; + int node; /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && cluster); @@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + node = numa_node_id(); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ - plist_requeue(&si->avail_list, &swap_avail_head); + plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_list)) { + if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } @@ -946,7 +968,7 @@ start_over: WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); - plist_del(&si->avail_list, &swap_avail_head); + __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } @@ -975,7 +997,7 @@ nextsi: * swap_avail_head list then try it, otherwise start over * if we have not gotten any slots. 
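 * (Editor's note: the list walked here is now the plist of the calling
 * CPU's node, selected with numa_node_id() above, so allocations prefer
 * node-local swap devices and only then fall back to the remaining ones.)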
*/ - if (plist_node_empty(&next->avail_list)) + if (plist_node_empty(&next->avail_lists[node])) goto start_over; } @@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } +static int swap_node(struct swap_info_struct *p) +{ + struct block_device *bdev; + + if (p->bdev) + bdev = p->bdev; + else + bdev = p->swap_file->f_inode->i_sb->s_bdev; + + return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; +} + static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { + int i; + if (prio >= 0) p->prio = prio; else @@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; - p->avail_list.prio = -p->prio; + for_each_node(i) { + if (p->prio >= 0) + p->avail_lists[i].prio = -p->prio; + else { + if (swap_node(p) == i) + p->avail_lists[i].prio = 1; + else + p->avail_lists[i].prio = -p->prio; + } + } p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * swap_info_struct. */ plist_add(&p->list, &swap_active_head); - spin_lock(&swap_avail_lock); - plist_add(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - spin_lock(&swap_avail_lock); - plist_del(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(p); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; + int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - si->avail_list.prio--; + for_each_node(nid) { + if (si->avail_lists[nid].prio != 1) + si->avail_lists[nid].prio--; + } } least_priority++; } @@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; unsigned int type; + int i; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) } INIT_LIST_HEAD(&p->first_swap_extent.list); plist_node_init(&p->list, 0); - plist_node_init(&p->avail_list, 0); + for_each_node(i) + plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); @@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!swap_avail_heads) + return -ENOMEM; + p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); @@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } } + +static int __init swapfile_init(void) +{ + int nid; + + swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), + GFP_KERNEL); + if (!swap_avail_heads) { + pr_emerg("Not enough memory for swap heads, swap is disabled\n"); + return -ENOMEM; + } + + for_each_node(nid) + plist_head_init(&swap_avail_heads[nid]); + + return 0; +} +subsys_initcall(swapfile_init); -- cgit v1.2.3 From 212925802454672e6cd2949a727f5e2c1377bf06 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:25:00 -0700 Subject: mm: oom: let oom_reap_task and exit_mmap run concurrently This is purely 
required because exit_aio() may block and exit_mmap() may never start, if the oom_reap_task cannot start running on a mm with mm_users == 0. At the same time if the OOM reaper doesn't wait at all for the memory of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it would generate a spurious OOM kill. If it wasn't because of the exit_aio or similar blocking functions in the last mmput, it would be enough to change the oom_reap_task() in the case it finds mm_users == 0, to wait for a timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the problem here so the concurrency of exit_mmap and oom_reap_task is apparently warranted. It's a non standard runtime, exit_mmap() runs without mmap_sem, and oom_reap_task runs with the mmap_sem for reading as usual (kind of MADV_DONTNEED). The race between the two is solved with a combination of tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP (serialized by a dummy down_write/up_write cycle on the same lines of the ksm_exit method). If the oom_reap_task() may be running concurrently during exit_mmap, exit_mmap will wait it to finish in down_write (before taking down mm structures that would make the oom_reap_task fail with use after free). If exit_mmap comes first, oom_reap_task() will skip the mm if MMF_OOM_SKIP is already set and in turn all memory is already freed and furthermore the mm data structures may already have been taken down by free_pgtables. [aarcange@redhat.com: incremental one liner] Link: http://lkml.kernel.org/r/20170726164319.GC29716@redhat.com [rientjes@google.com: remove unused mmput_async] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708141733130.50317@chino.kir.corp.google.com [aarcange@redhat.com: microoptimization] Link: http://lkml.kernel.org/r/20170817171240.GB5066@redhat.com Link: http://lkml.kernel.org/r/20170726162912.GA29716@redhat.com Fixes: 26db62f179d1 ("oom: keep mm of the killed task available") Signed-off-by: Andrea Arcangeli Signed-off-by: David Rientjes Reported-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 6 ------ kernel/fork.c | 17 ----------------- mm/mmap.c | 18 ++++++++++++++++++ mm/oom_kill.c | 15 +++++---------- 4 files changed, 23 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b0a281f9d26..3a19c253bdb1 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -#ifdef CONFIG_MMU -/* same as above but performs the slow path from the async context. 
Can - * be called from the atomic context as well - */ -extern void mmput_async(struct mm_struct *); -#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 4e5345c07344..7ed64600da6c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -922,7 +922,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -938,22 +937,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU -static void mmput_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmput(mm); -} - -void mmput_async(struct mm_struct *mm) -{ - if (atomic_dec_and_test(&mm->mm_users)) { - INIT_WORK(&mm->async_put_work, mmput_async_fn); - schedule_work(&mm->async_put_work); - } -} -#endif - /** * set_mm_exe_file - change a reference to the mm's executable file * diff --git a/mm/mmap.c b/mm/mmap.c index 52f6c6b18f40..4c5981651407 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -3001,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); + if (unlikely(tsk_is_oom_victim(current))) { + /* + * Wait for oom_reap_task() to stop working on this + * mm. Because MMF_OOM_SKIP is already set before + * calling down_read(), oom_reap_task() will not run + * on this "mm" post up_write(). + * + * tsk_is_oom_victim() cannot be set from under us + * either because current->mm is already set to NULL + * under task_lock before calling mmput and oom_mm is + * set not NULL by the OOM killer only if current->mm + * is found not NULL while holding the task_lock. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c9f3569a76c7..99736e026712 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * increase mm_users only after we know we will reap something so - * that the mmput_async is called only when we have reaped something - * and delayed __mmput doesn't matter that much + * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_OOM_SKIP must run + * under mmap_sem for reading because it serializes against the + * down_write();up_write() cycle in exit_mmap(). */ - if (!mmget_not_zero(mm)) { + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); - /* - * Drop our reference but make sure the mmput slow path is called from a - * different context because we shouldn't risk we get stuck there and - * put the oom_reaper out of the way. 
- mmput_async(mm); trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); -- cgit v1.2.3 From c79b57e462b5d2f47afa5f175cf1828f16e18612 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:25:04 -0700 Subject: mm: hugetlb: clear target sub-page last when clearing huge page

Huge pages help reduce the TLB miss rate, but they have a larger cache footprint, which can sometimes cause problems. For example, when clearing a huge page on the x86_64 platform, the cache footprint is 2M. But a Xeon E5 v3 2699 CPU has 18 cores, 36 threads, and only 45M of LLC (last level cache); that is, on average, 2.5M of LLC per core and 1.25M per thread. If cache pressure is heavy while we clear a huge page from beginning to end, the beginning of the huge page may already be evicted from the cache by the time we finish clearing the end, and the application may well access the beginning of the huge page right after it is cleared.

To help with this situation, this patch changes the order in which sub-pages are cleared. In quite a few situations we can get the address that the application will access after the huge page is cleared, for example in a page fault handler. Instead of clearing from beginning to end, we clear the sub-pages farthest from the target sub-page first and clear the target sub-page last. This keeps the target sub-page, and the sub-pages around it, most cache-hot. If we cannot know the address the application will access, the beginning of the huge page is assumed to be that address.

With this patch, throughput increases by ~28.3% in the vm-scalability anon-w-seq test case with 72 processes on a 2-socket Xeon E5 v3 2699 system (36 cores, 72 threads). The test case creates 72 processes; each mmaps a big anonymous memory area and writes to it from beginning to end, so for each process the others act as background workload generating heavy cache pressure. At the same time, the cache miss rate drops from ~33.4% to ~31.7%, the IPC (instructions per cycle) increases from 0.56 to 0.74, and the time spent in user space is reduced by ~7.9%.

Christopher Lameter suggested also clearing the bytes inside a sub-page from end to beginning, but tests show no visible performance difference, perhaps because the page size is small compared with the cache size. Thanks to Andi Kleen for proposing to use the to-be-accessed address to determine the clearing order of sub-pages. The hugetlbfs access address could be improved; that will be done in another patch.

[ying.huang@intel.com: improve readability of clear_huge_page()] Link: http://lkml.kernel.org/r/20170830051842.1397-1-ying.huang@intel.com Link: http://lkml.kernel.org/r/20170815014618.15842-1-ying.huang@intel.com Suggested-by: Andi Kleen Signed-off-by: "Huang, Ying" Acked-by: Jan Kara Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A.
Shutemov" Cc: Nadia Yvette Chambers Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/huge_memory.c | 4 ++-- mm/memory.c | 42 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb5e4bc946cc..0acea73af839 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2508,7 +2508,7 @@ enum mf_action_page_type { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, - unsigned long addr, + unsigned long addr_hint, unsigned int pages_per_huge_page); extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 772048c233d1..0b51e70e0a8b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, goto release; } - clear_huge_page(page, haddr, HPAGE_PMD_NR); + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -1305,7 +1305,7 @@ alloc: count_vm_event(THP_FAULT_ALLOC); if (!page) - clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); diff --git a/mm/memory.c b/mm/memory.c index e87953775e3c..13ee83b43878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4417,19 +4417,53 @@ static void clear_gigantic_page(struct page *page, } } void clear_huge_page(struct page *page, - unsigned long addr, unsigned int pages_per_huge_page) + unsigned long addr_hint, unsigned int pages_per_huge_page) { - int i; + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } + /* Clear sub-page to access last to keep its cache lines hot */ might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If sub-page to access in first half of huge page */ + base = 0; + l = n; + /* Clear sub-pages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } else { + /* If sub-page to access in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Clear sub-pages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } + /* + * Clear remaining sub-pages in left-right-left-right pattern + * towards the sub-page to access + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + clear_user_highpage(page + left_idx, + addr + left_idx * PAGE_SIZE); cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + right_idx, + addr + right_idx * PAGE_SIZE); } } -- cgit v1.2.3 From df3735c5b40fad8d0d28eb8ab065fe955b3347ee Mon Sep 
17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Sep 2017 16:25:11 -0700 Subject: x86,mpx: make mpx depend on x86-64 to free up VMA flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Patch series "mm,fork,security: introduce MADV_WIPEONFORK", v4.

If a child process accesses memory that was MADV_WIPEONFORK, it will get zeroes. The address ranges are still valid, they are just empty. If a child process accesses memory that was MADV_DONTFORK, it will get a segmentation fault, since those address ranges are no longer valid in the child after fork. Since MADV_DONTFORK also seems to be used to allow very large programs to fork in systems with strict memory overcommit restrictions, changing the semantics of MADV_DONTFORK might break existing programs.

The use case is libraries that store or cache information, and want to know that they need to regenerate it in the child process after fork. Examples of this would be:
- systemd/pulseaudio API checks (fail after fork) (replacing a getpid check, which is too slow without a PID cache)
- PKCS#11 API reinitialization check (mandated by specification)
- glibc's upcoming PRNG (reseed after fork)
- OpenSSL PRNG (reseed after fork)

The security benefits of a forking server having a re-initialized PRNG in every child process are pretty obvious. However, due to libraries having all kinds of internal state, and programs getting compiled with many different versions of each library, it is unreasonable to expect calling programs to re-initialize everything manually after fork. A further complication is the proliferation of clone flags, programs bypassing glibc's functions to call clone directly, and programs calling unshare, causing the glibc pthread_atfork hook to not get called. It would be better to have the kernel take care of this automatically.

The patchset also adds MADV_KEEPONFORK, to undo the effects of a prior MADV_WIPEONFORK. This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO: https://man.openbsd.org/minherit.2

This patch (of 2): MPX only seems to be available on 64-bit CPUs, starting with Skylake and Goldmont. Move VM_MPX into the 64-bit-only portion of vma->vm_flags, in order to free up a VMA flag.

Link: http://lkml.kernel.org/r/20170811212829.29186-2-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Dave Hansen Cc: Mike Kravetz Cc: Florian Weimer Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Kirill A.
Shutemov" Cc: Matthew Wilcox Cc: Colm MacCártaigh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 4 +++- include/linux/mm.h | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index acb366bf6bc1..4b278a33ccbb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1806,7 +1806,9 @@ config X86_SMAP config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n - depends on CPU_SUP_INTEL + # Note: only available in 64-bit mode due to VMA flags shortage + depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS ---help--- MPX provides hardware features that can be used in conjunction with compiler-instrumented code to check diff --git a/include/linux/mm.h b/include/linux/mm.h index 0acea73af839..9efe62032094 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -208,10 +208,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) +#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #if defined(CONFIG_X86) @@ -235,9 +237,11 @@ extern unsigned int kobjsize(const void *objp); # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif -#if defined(CONFIG_X86) +#if defined(CONFIG_X86_INTEL_MPX) /* MPX specific bounds table or bounds directory */ -# define VM_MPX VM_ARCH_2 +# define VM_MPX VM_HIGH_ARCH_BIT_4 +#else +# define VM_MPX VM_NONE #endif #ifndef VM_GROWSUP -- cgit v1.2.3 From d2cd9ede6e193dd7d88b6d27399e96229a551b19 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Sep 2017 16:25:15 -0700 Subject: mm,fork: introduce MADV_WIPEONFORK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty in the child process after fork. This differs from MADV_DONTFORK in one important way: if a child process accesses memory that was MADV_WIPEONFORK, it will get zeroes; the address ranges are still valid, they are just empty. If a child process accesses memory that was MADV_DONTFORK, it will get a segmentation fault, since those address ranges are no longer valid in the child after fork. Since MADV_DONTFORK also seems to be used to allow very large programs to fork in systems with strict memory overcommit restrictions, changing the semantics of MADV_DONTFORK might break existing programs. MADV_WIPEONFORK only works on private, anonymous VMAs.

The use case is libraries that store or cache information, and want to know that they need to regenerate it in the child process after fork. Examples of this would be:
- systemd/pulseaudio API checks (fail after fork) (replacing a getpid check, which is too slow without a PID cache)
- PKCS#11 API reinitialization check (mandated by specification)
- glibc's upcoming PRNG (reseed after fork)
- OpenSSL PRNG (reseed after fork)

The security benefits of a forking server having a re-initialized PRNG in every child process are pretty obvious. (A minimal usage sketch follows.)
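[Editor's illustration: a minimal user-space usage sketch, not part of the patch. It assumes a kernel that implements these madvise flags; the fallback #define uses the value this series assigns on most architectures, and error handling is trimmed to the essentials.]

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#ifndef MADV_WIPEONFORK
	#define MADV_WIPEONFORK 18	/* value from this series (asm-generic) */
	#endif

	int main(void)
	{
		/* Private anonymous memory: the only kind MADV_WIPEONFORK accepts. */
		char *state = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		strcpy(state, "parent PRNG state");
		madvise(state, 4096, MADV_WIPEONFORK);

		if (fork() == 0) {
			/* Child: the mapping is still valid but reads back as
			 * zeroes, so a library can detect it must reseed. */
			printf("child sees: \"%s\"\n", state);	/* prints "" */
			_exit(0);
		}
		wait(NULL);
		printf("parent sees: \"%s\"\n", state);	/* state intact */
		return 0;
	}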
However, due to libraries having all kinds of internal state, and programs getting compiled with many different versions of each library, it is unreasonable to expect calling programs to re-initialize everything manually after fork. A further complication is the proliferation of clone flags, programs bypassing glibc's functions to call clone directly, and programs calling unshare, causing the glibc pthread_atfork hook to not get called. It would be better to have the kernel take care of this automatically. The patch also adds MADV_KEEPONFORK, to undo the effects of a prior MADV_WIPEONFORK. This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO: https://man.openbsd.org/minherit.2 [akpm@linux-foundation.org: numerically order arch/parisc/include/uapi/asm/mman.h #defines] Link: http://lkml.kernel.org/r/20170811212829.29186-3-riel@redhat.com Signed-off-by: Rik van Riel Reported-by: Florian Weimer Reported-by: Colm MacCártaigh Reviewed-by: Mike Kravetz Cc: "H. Peter Anvin" Cc: "Kirill A. Shutemov" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Helge Deller Cc: Kees Cook Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Will Drewry Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 3 +++ arch/mips/include/uapi/asm/mman.h | 3 +++ arch/parisc/include/uapi/asm/mman.h | 3 +++ arch/xtensa/include/uapi/asm/mman.h | 3 +++ fs/proc/task_mmu.c | 1 + include/linux/mm.h | 2 +- include/trace/events/mmflags.h | 8 +------- include/uapi/asm-generic/mman-common.h | 3 +++ kernel/fork.c | 10 ++++++++-- mm/madvise.c | 13 +++++++++++++ 10 files changed, 39 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 13b52aad3c43..3b26cc62dadb 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -64,6 +64,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 398eebcc3541..da3216007fe0 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -91,6 +91,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index b87fbe3f338a..775b5d5e41a1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -57,6 +57,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ + #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 8ce77a2e9bab..b15b278aa314 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define 
MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b2330aedc63f..a290966f91ec 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -663,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", diff --git a/include/linux/mm.h b/include/linux/mm.h index 9efe62032094..39db8e54c5d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_ARCH_2 0x02000000 +#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8e50d01c645f..4c2e4737d7bc 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } #endif -#if defined(CONFIG_X86) -#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" } -#else -#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" } -#endif - #ifdef CONFIG_MEM_SOFT_DIRTY #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, #else @@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) {VM_NORESERVE, "noreserve" }, \ {VM_HUGETLB, "hugetlb" }, \ __VM_ARCH_SPECIFIC_1 , \ - __VM_ARCH_SPECIFIC_2 , \ + {VM_WIPEONFORK, "wipeonfork" }, \ {VM_DONTDUMP, "dontdump" }, \ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index d248f3c335b5..203268f9231e 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -58,6 +58,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/kernel/fork.c b/kernel/fork.c index 7ed64600da6c..24a4c0be80d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; - if (anon_vma_fork(tmp, mpnt)) + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. 
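+		 * (Editor's note: the child gets a fresh anon_vma via
+		 * anon_vma_prepare() below, and copy_page_range() is
+		 * skipped for this VMA further down, so the child's first
+		 * touch of any page here faults in a zero page.)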
*/ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; @@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/madvise.c b/mm/madvise.c index 4d7d1e5ddba9..eea1c733286f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma, } new_flags &= ~VM_DONTCOPY; break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) { + error = -EINVAL; + goto out; + } + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; @@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: + case MADV_WIPEONFORK: + case MADV_KEEPONFORK: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: -- cgit v1.2.3 From 50e76632339d4655859523a39249dd95ee5e93e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 11:13:38 +0200 Subject: sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs Cpusets vs. suspend-resume is _completely_ broken. And it got noticed because it now resulted in non-cpuset usage breaking too. On suspend cpuset_cpu_inactive() doesn't call into cpuset_update_active_cpus() because it doesn't want to move tasks about, there is no need, all tasks are frozen and won't run again until after we've resumed everything. But this means that when we finally do call into cpuset_update_active_cpus() after resuming the last frozen cpu in cpuset_cpu_active(), the top_cpuset will not have any difference with the cpu_active_mask and this it will not in fact do _anything_. So the cpuset configuration will not be restored. This was largely hidden because we would unconditionally create identity domains and mobile users would not in fact use cpusets much. And servers what do use cpusets tend to not suspend-resume much. An addition problem is that we'd not in fact wait for the cpuset work to finish before resuming the tasks, allowing spurious migrations outside of the specified domains. Fix the rebuild by introducing cpuset_force_rebuild() and fix the ordering with cpuset_wait_for_hotplug(). Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Tejun Heo Cc: Thomas Gleixner Fixes: deb7aa308ea2 ("cpuset: reorganize CPU / memory hotplug handling") Link: http://lkml.kernel.org/r/20170907091338.orwxrqkbfkki3c24@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 6 ++++++ kernel/cgroup/cpuset.c | 16 +++++++++++++++- kernel/power/process.c | 5 ++++- kernel/sched/core.c | 7 +++---- 4 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e74655d941b7..a1e6a33a4b03 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -51,7 +51,9 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); +extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -164,11 +166,15 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_force_rebuild(void) { } + static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_wait_for_hotplug(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039bafebb..0513ee39698b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2267,6 +2267,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2341,8 +2348,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(void) @@ -2355,6 +2364,11 @@ void cpuset_update_active_cpus(void) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..50f25cb370c6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -20,8 +20,9 @@ #include #include #include +#include -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -202,6 +203,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d2c7ff9ba98..136a76d80dbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,16 +5556,15 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. 
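 * (Editor's note: the fix below additionally calls cpuset_force_rebuild()
 * when the last frozen CPU comes back online, so the subsequent
 * cpuset_update_active_cpus() rebuilds the sched domains even though the
 * active mask now matches what top_cpuset already recorded.)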
*/ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) return; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- cgit v1.2.3 From 21d9bb4a05bac50fb4f850517af4030baecd00f6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 7 Sep 2017 11:38:37 +0200 Subject: x86/mm: Make the SME mask a u64 The SME encryption mask is for masking 64-bit pagetable entries. It being an unsigned long works fine on X86_64 but on 32-bit builds it truncates bits leading to Xen guests crashing very early. And regardless, the whole SME mask handling shouldn't have leaked into 32-bit because SME is an X86_64-only feature. So, first make the mask u64. And then, add trivial 32-bit versions of the __sme_* macros so that nothing happens there. Reported-and-tested-by: Boris Ostrovsky Tested-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Tom Lendacky Acked-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Fixes: 21729f81ce8a ("x86/mm: Provide general kernel support for memory encryption") Link: http://lkml.kernel.org/r/20170907093837.76zojtkgebwtqc74@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 4 ++-- arch/x86/mm/mem_encrypt.c | 2 +- include/linux/mem_encrypt.h | 13 +++++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 8e618fcf1f7c..6a77c63540f7 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,7 +21,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT -extern unsigned long sme_me_mask; +extern u64 sme_me_mask; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, unsigned long decrypted_kernel_vaddr, @@ -49,7 +49,7 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); #else /* !CONFIG_AMD_MEM_ENCRYPT */ -#define sme_me_mask 0UL +#define sme_me_mask 0ULL static inline void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) { } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0fbd09269757..3fcc8e01683b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -37,7 +37,7 @@ static char sme_cmdline_off[] __initdata = "off"; * reside in the .data section so as not to be zeroed out when the .bss * section is later cleared. */ -unsigned long sme_me_mask __section(.data) = 0; +u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); /* Buffer used for early in-place encryption by BSP, no locking needed */ diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 1255f09f5e42..265a9cd21cb4 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -21,7 +21,7 @@ #else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ -#define sme_me_mask 0UL +#define sme_me_mask 0ULL #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ @@ -30,18 +30,23 @@ static inline bool sme_active(void) return !!sme_me_mask; } -static inline unsigned long sme_get_me_mask(void) +static inline u64 sme_get_me_mask(void) { return sme_me_mask; } +#ifdef CONFIG_AMD_MEM_ENCRYPT /* * The __sme_set() and __sme_clr() macros are useful for adding or removing * the encryption mask from a value (e.g. when dealing with pagetable * entries).
*/ -#define __sme_set(x) ((unsigned long)(x) | sme_me_mask) -#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) +#define __sme_set(x) ((x) | sme_me_mask) +#define __sme_clr(x) ((x) & ~sme_me_mask) +#else +#define __sme_set(x) (x) +#define __sme_clr(x) (x) +#endif #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From ca2c1418efe9f7fe37aa1f355efdf4eb293673ce Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 6 Sep 2017 14:44:36 +0200 Subject: udp: drop head states only when all skb references are gone After commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") we clear the skb head state as soon as the skb carrying them is first processed. Since the same skb can be processed several times when MSG_PEEK is used, we can end up lacking the required head states, and eventually oopsing. Fix this by clearing the skb head state only when processing the last skb reference. Reported-by: Eric Dumazet Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 9 +++------ net/ipv4/udp.c | 5 ++++- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f751f3b93039..72299ef00061 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -958,7 +958,7 @@ void kfree_skb(struct sk_buff *skb); void kfree_skb_list(struct sk_buff *segs); void skb_tx_error(struct sk_buff *skb); void consume_skb(struct sk_buff *skb); -void consume_stateless_skb(struct sk_buff *skb); +void __consume_stateless_skb(struct sk_buff *skb); void __kfree_skb(struct sk_buff *skb); extern struct kmem_cache *skbuff_head_cache; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 68065d7d383f..16982de649b9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -710,14 +710,11 @@ EXPORT_SYMBOL(consume_skb); * consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * - * Works like consume_skb(), but this variant assumes that all the head - * states have been already dropped. + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped */ -void consume_stateless_skb(struct sk_buff *skb) +void __consume_stateless_skb(struct sk_buff *skb) { - if (!skb_unref(skb)) - return; - trace_consume_skb(skb); skb_release_data(skb); kfree_skbmem(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1c9e78c83c..ef29df8648e4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1397,12 +1397,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + if (!skb_unref(skb)) + return; + /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); - consume_stateless_skb(skb); + __consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); -- cgit v1.2.3 From f46640b931e588aeec5285b4a9547b354ad10cd0 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 4 Sep 2017 12:48:36 +0200 Subject: drm/atomic: Return commit in drm_crtc_commit_get for better annotation This will allow code to do x->commit = drm_crtc_commit_get(commit), making it clearer where references are used.
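As a minimal sketch of the intended idiom (distilled from the stall_checks() hunk in the diff below, no new API assumed):

    /* before: taking the reference and storing the pointer are two
     * separate statements that can drift apart */
    drm_crtc_commit_get(commit);
    stall_commit = commit;

    /* after: the getter returns its argument, so the reference and the
     * assignment read as one obviously paired operation */
    stall_commit = drm_crtc_commit_get(commit);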
Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20170904104838.23822-5-maarten.lankhorst@linux.intel.com Reviewed-by: Daniel Vetter --- drivers/gpu/drm/drm_atomic_helper.c | 3 +-- include/drm/drm_atomic.h | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 1bc32cd74d78..94ad11f76e0e 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1633,8 +1633,7 @@ static int stall_checks(struct drm_crtc *crtc, bool nonblock) return -EBUSY; } } else if (i == 1) { - stall_commit = commit; - drm_crtc_commit_get(stall_commit); + stall_commit = drm_crtc_commit_get(commit); break; } diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index f73b663c1f76..c0451e2a51af 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -252,10 +252,14 @@ void __drm_crtc_commit_free(struct kref *kref); * @commit: CRTC commit * * Increases the reference of @commit. + * + * Returns: + * The pointer to @commit, with reference increased. */ -static inline void drm_crtc_commit_get(struct drm_crtc_commit *commit) +static inline struct drm_crtc_commit *drm_crtc_commit_get(struct drm_crtc_commit *commit) { kref_get(&commit->ref); + return commit; } /** -- cgit v1.2.3 From 163bcc2c74a22c891c1906e6e343e28a70a54978 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 4 Sep 2017 17:04:56 +0200 Subject: drm/atomic: Move drm_crtc_commit to drm_crtc_state, v4. Most code only cares about the current commit or previous commit. Fortunately we already have a place to track those. Move it to drm_crtc_state where it belongs. :) The per-crtc commit_list is kept for places where we have to look deeper than the current or previous commit for checking whether to stall on unpin. This is used in drm_atomic_helper_setup_commit and intel_has_pending_fb_unpin. Changes since v1: - Update kerneldoc for drm_crtc.commit_list. (danvet) Changes since v2: - Remove drm_atomic_helper_async_check hunk. (pinchartl) Changes since v3: - Fix use-after-free in drm_atomic_helper_commit_cleanup_done().
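A minimal sketch of the access pattern this change enables (names as used in the helper diff below, not a complete function): the preceding commit becomes reachable straight from the old CRTC state, without taking commit_lock or walking commit_list:

    for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i) {
            struct drm_crtc_commit *commit = old_crtc_state->commit;

            if (!commit)
                    continue;
            /* wait on or inspect the preceding commit here */
    }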
Signed-off-by: Maarten Lankhorst Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170904150456.31049-1-maarten.lankhorst@linux.intel.com [mlankhorst: preceeding -> preceding (checkpatch)] --- drivers/gpu/drm/drm_atomic.c | 7 ---- drivers/gpu/drm/drm_atomic_helper.c | 82 ++++++++++++++++--------------------- include/drm/drm_atomic.h | 1 - include/drm/drm_crtc.h | 23 +++++++++-- 4 files changed, 54 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 1b755439f591..58df70a5aab1 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -163,13 +163,6 @@ void drm_atomic_state_default_clear(struct drm_atomic_state *state) crtc->funcs->atomic_destroy_state(crtc, state->crtcs[i].state); - if (state->crtcs[i].commit) { - kfree(state->crtcs[i].commit->event); - state->crtcs[i].commit->event = NULL; - drm_crtc_commit_put(state->crtcs[i].commit); - } - - state->crtcs[i].commit = NULL; state->crtcs[i].ptr = NULL; state->crtcs[i].state = NULL; } diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 94ad11f76e0e..075b0b386adc 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1262,12 +1262,12 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_vblanks); void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev, struct drm_atomic_state *old_state) { - struct drm_crtc_state *unused; + struct drm_crtc_state *new_crtc_state; struct drm_crtc *crtc; int i; - for_each_new_crtc_in_state(old_state, crtc, unused, i) { - struct drm_crtc_commit *commit = old_state->crtcs[i].commit; + for_each_new_crtc_in_state(old_state, crtc, new_crtc_state, i) { + struct drm_crtc_commit *commit = new_crtc_state->commit; int ret; if (!commit) @@ -1730,7 +1730,7 @@ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, kref_init(&commit->ref); commit->crtc = crtc; - state->crtcs[i].commit = commit; + new_crtc_state->commit = commit; ret = stall_checks(crtc, nonblock); if (ret) @@ -1768,22 +1768,6 @@ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, } EXPORT_SYMBOL(drm_atomic_helper_setup_commit); - -static struct drm_crtc_commit *preceeding_commit(struct drm_crtc *crtc) -{ - struct drm_crtc_commit *commit; - int i = 0; - - list_for_each_entry(commit, &crtc->commit_list, commit_entry) { - /* skip the first entry, that's the current commit */ - if (i == 1) - return commit; - i++; - } - - return NULL; -} - /** * drm_atomic_helper_wait_for_dependencies - wait for required preceeding commits * @old_state: atomic state object with old state structures @@ -1799,17 +1783,13 @@ static struct drm_crtc_commit *preceeding_commit(struct drm_crtc *crtc) void drm_atomic_helper_wait_for_dependencies(struct drm_atomic_state *old_state) { struct drm_crtc *crtc; - struct drm_crtc_state *new_crtc_state; + struct drm_crtc_state *old_crtc_state; struct drm_crtc_commit *commit; int i; long ret; - for_each_new_crtc_in_state(old_state, crtc, new_crtc_state, i) { - spin_lock(&crtc->commit_lock); - commit = preceeding_commit(crtc); - if (commit) - drm_crtc_commit_get(commit); - spin_unlock(&crtc->commit_lock); + for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i) { + commit = old_crtc_state->commit; if (!commit) continue; @@ -1827,8 +1807,6 @@ void drm_atomic_helper_wait_for_dependencies(struct drm_atomic_state *old_state) if (ret == 0) DRM_ERROR("[CRTC:%d:%s] flip_done timed out\n", crtc->base.id, 
crtc->name); - - drm_crtc_commit_put(commit); } } EXPORT_SYMBOL(drm_atomic_helper_wait_for_dependencies); @@ -1851,15 +1829,25 @@ EXPORT_SYMBOL(drm_atomic_helper_wait_for_dependencies); void drm_atomic_helper_commit_hw_done(struct drm_atomic_state *old_state) { struct drm_crtc *crtc; - struct drm_crtc_state *new_crtc_state; + struct drm_crtc_state *old_crtc_state, *new_crtc_state; struct drm_crtc_commit *commit; int i; - for_each_new_crtc_in_state(old_state, crtc, new_crtc_state, i) { - commit = old_state->crtcs[i].commit; + for_each_oldnew_crtc_in_state(old_state, crtc, old_crtc_state, new_crtc_state, i) { + commit = new_crtc_state->commit; if (!commit) continue; + /* + * copy new_crtc_state->commit to old_crtc_state->commit, + * it's unsafe to touch new_crtc_state after hw_done, + * but we still need to do so in cleanup_done(). + */ + if (old_crtc_state->commit) + drm_crtc_commit_put(old_crtc_state->commit); + + old_crtc_state->commit = drm_crtc_commit_get(commit); + /* backend must have consumed any event by now */ WARN_ON(new_crtc_state->event); complete_all(&commit->hw_done); @@ -1881,13 +1869,13 @@ EXPORT_SYMBOL(drm_atomic_helper_commit_hw_done); void drm_atomic_helper_commit_cleanup_done(struct drm_atomic_state *old_state) { struct drm_crtc *crtc; - struct drm_crtc_state *new_crtc_state; + struct drm_crtc_state *old_crtc_state; struct drm_crtc_commit *commit; int i; long ret; - for_each_new_crtc_in_state(old_state, crtc, new_crtc_state, i) { - commit = old_state->crtcs[i].commit; + for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i) { + commit = old_crtc_state->commit; if (WARN_ON(!commit)) continue; @@ -2293,20 +2281,13 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state, struct drm_private_state *old_obj_state, *new_obj_state; if (stall) { - for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { - spin_lock(&crtc->commit_lock); - commit = list_first_entry_or_null(&crtc->commit_list, - struct drm_crtc_commit, commit_entry); - if (commit) - drm_crtc_commit_get(commit); - spin_unlock(&crtc->commit_lock); + for_each_old_crtc_in_state(state, crtc, old_crtc_state, i) { + commit = old_crtc_state->commit; if (!commit) continue; ret = wait_for_completion_interruptible(&commit->hw_done); - drm_crtc_commit_put(commit); - if (ret) return ret; } @@ -2331,13 +2312,13 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state, state->crtcs[i].state = old_crtc_state; crtc->state = new_crtc_state; - if (state->crtcs[i].commit) { + if (new_crtc_state->commit) { spin_lock(&crtc->commit_lock); - list_add(&state->crtcs[i].commit->commit_entry, + list_add(&new_crtc_state->commit->commit_entry, &crtc->commit_list); spin_unlock(&crtc->commit_lock); - state->crtcs[i].commit->event = NULL; + new_crtc_state->commit->event = NULL; } } @@ -3171,6 +3152,7 @@ void __drm_atomic_helper_crtc_duplicate_state(struct drm_crtc *crtc, state->connectors_changed = false; state->color_mgmt_changed = false; state->zpos_changed = false; + state->commit = NULL; state->event = NULL; state->pageflip_flags = 0; } @@ -3209,6 +3191,12 @@ EXPORT_SYMBOL(drm_atomic_helper_crtc_duplicate_state); */ void __drm_atomic_helper_crtc_destroy_state(struct drm_crtc_state *state) { + if (state->commit) { + kfree(state->commit->event); + state->commit->event = NULL; + drm_crtc_commit_put(state->commit); + } + drm_property_blob_put(state->mode_blob); drm_property_blob_put(state->degamma_lut); drm_property_blob_put(state->ctm); diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 
c0451e2a51af..a80a8dadef00 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -144,7 +144,6 @@ struct __drm_planes_state { struct __drm_crtcs_state { struct drm_crtc *ptr; struct drm_crtc_state *state, *old_state, *new_state; - struct drm_crtc_commit *commit; s32 __user *out_fence_ptr; unsigned last_vblank_count; }; diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 1a642020e306..901f5c054a2c 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -253,6 +253,15 @@ struct drm_crtc_state { */ struct drm_pending_vblank_event *event; + /** + * @commit: + * + * This tracks how the commit for this update proceeds through the + * various phases. This is never cleared, except when we destroy the + * state, so that subsequent commits can synchronize with previous ones. + */ + struct drm_crtc_commit *commit; + struct drm_atomic_state *state; }; @@ -808,10 +817,16 @@ struct drm_crtc { * @commit_list: * * List of &drm_crtc_commit structures tracking pending commits. - * Protected by @commit_lock. This list doesn't hold its own full - * reference, but burrows it from the ongoing commit. Commit entries - * must be removed from this list once the commit is fully completed, - * but before it's correspoding &drm_atomic_state gets destroyed. + * Protected by @commit_lock. This list holds its own full reference, + * as does the ongoing commit. + * + * Note that the commit for a state change is also tracked in + * &drm_crtc_state.commit. For accessing the immediately preceding + * commit in an atomic update it is recommended to just use that + * pointer in the old CRTC state, since accessing that doesn't need + * any locking or list-walking. @commit_list should only be used to + * stall for framebuffer cleanup that's signalled through + * &drm_crtc_commit.cleanup_done. */ struct list_head commit_list; -- cgit v1.2.3 From 21a01abbe32a3cbeb903378a24e504bfd9fe0648 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 4 Sep 2017 12:48:37 +0200 Subject: drm/atomic: Fix freeing connector/plane state too early by tracking commits, v3. Currently we neatly track the crtc state, but forget to look at plane/connector state. When doing a nonblocking modeset, immediately followed by a setprop before the modeset completes, the setprop will see the modeset's new state as the old state and free it. This has to be solved by waiting for hw_done on the connector, even if it's not assigned to a crtc. When a connector is unbound we take the last crtc commit, and when it stays unbound we create a new fake crtc commit that gets signaled on hw_done for all the planes/connectors. We wait for it the same way as we do for crtc's, which will make sure we never run into a use-after-free situation. Changes since v1: - Only create a single disable commit. (danvet) - Fix leak in intel_legacy_cursor_update. Changes since v2: - Make reference counting in drm_atomic_helper_setup_commit more obvious. (pinchartl) - Call cleanup_done for fake commit. (danvet) - Add comments to drm_atomic_helper_setup_commit. (danvet, pinchartl) - Add comment to drm_atomic_helper_swap_state.
(pinchartl) Signed-off-by: Maarten Lankhorst Testcase: kms_atomic_transition.plane-use-after-nonblocking-unbind* Cc: Laurent Pinchart Link: https://patchwork.freedesktop.org/patch/msgid/20170904104838.23822-6-maarten.lankhorst@linux.intel.com Reviewed-by: Daniel Vetter --- drivers/gpu/drm/drm_atomic.c | 4 + drivers/gpu/drm/drm_atomic_helper.c | 172 +++++++++++++++++++++++++++++++++-- drivers/gpu/drm/i915/intel_display.c | 2 + include/drm/drm_atomic.h | 12 +++ include/drm/drm_connector.h | 7 ++ include/drm/drm_plane.h | 7 ++ 6 files changed, 198 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 58df70a5aab1..98f42397c529 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -192,6 +192,10 @@ void drm_atomic_state_default_clear(struct drm_atomic_state *state) } state->num_private_objs = 0; + if (state->fake_commit) { + drm_crtc_commit_put(state->fake_commit); + state->fake_commit = NULL; + } } EXPORT_SYMBOL(drm_atomic_state_default_clear); diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 8bc2f155a7e4..820adcda45b8 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1667,6 +1667,38 @@ static void release_crtc_commit(struct completion *completion) drm_crtc_commit_put(commit); } +static void init_commit(struct drm_crtc_commit *commit, struct drm_crtc *crtc) +{ + init_completion(&commit->flip_done); + init_completion(&commit->hw_done); + init_completion(&commit->cleanup_done); + INIT_LIST_HEAD(&commit->commit_entry); + kref_init(&commit->ref); + commit->crtc = crtc; +} + +static struct drm_crtc_commit * +crtc_or_fake_commit(struct drm_atomic_state *state, struct drm_crtc *crtc) +{ + if (crtc) { + struct drm_crtc_state *new_crtc_state; + + new_crtc_state = drm_atomic_get_new_crtc_state(state, crtc); + + return new_crtc_state->commit; + } + + if (!state->fake_commit) { + state->fake_commit = kzalloc(sizeof(*state->fake_commit), GFP_KERNEL); + if (!state->fake_commit) + return NULL; + + init_commit(state->fake_commit, NULL); + } + + return state->fake_commit; +} + /** * drm_atomic_helper_setup_commit - setup possibly nonblocking commit * @state: new modeset state to be committed @@ -1715,6 +1747,10 @@ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, { struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state, *new_crtc_state; + struct drm_connector *conn; + struct drm_connector_state *old_conn_state, *new_conn_state; + struct drm_plane *plane; + struct drm_plane_state *old_plane_state, *new_plane_state; struct drm_crtc_commit *commit; int i, ret; @@ -1723,12 +1759,7 @@ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, if (!commit) return -ENOMEM; - init_completion(&commit->flip_done); - init_completion(&commit->hw_done); - init_completion(&commit->cleanup_done); - INIT_LIST_HEAD(&commit->commit_entry); - kref_init(&commit->ref); - commit->crtc = crtc; + init_commit(commit, crtc); new_crtc_state->commit = commit; @@ -1764,6 +1795,42 @@ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, drm_crtc_commit_get(commit); } + for_each_oldnew_connector_in_state(state, conn, old_conn_state, new_conn_state, i) { + /* commit tracked through new_crtc_state->commit, no need to do it explicitly */ + if (new_conn_state->crtc) + continue; + + /* Userspace is not allowed to get ahead of the previous + * commit with nonblocking ones. 
*/ + if (nonblock && old_conn_state->commit && + !try_wait_for_completion(&old_conn_state->commit->flip_done)) + return -EBUSY; + + commit = crtc_or_fake_commit(state, old_conn_state->crtc); + if (!commit) + return -ENOMEM; + + new_conn_state->commit = drm_crtc_commit_get(commit); + } + + for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { + /* commit tracked through new_crtc_state->commit, no need to do it explicitly */ + if (new_plane_state->crtc) + continue; + + /* Userspace is not allowed to get ahead of the previous + * commit with nonblocking ones. */ + if (nonblock && old_plane_state->commit && + !try_wait_for_completion(&old_plane_state->commit->flip_done)) + return -EBUSY; + + commit = crtc_or_fake_commit(state, old_plane_state->crtc); + if (!commit) + return -ENOMEM; + + new_plane_state->commit = drm_crtc_commit_get(commit); + } + return 0; } EXPORT_SYMBOL(drm_atomic_helper_setup_commit); @@ -1784,6 +1851,10 @@ void drm_atomic_helper_wait_for_dependencies(struct drm_atomic_state *old_state) { struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state; + struct drm_plane *plane; + struct drm_plane_state *old_plane_state; + struct drm_connector *conn; + struct drm_connector_state *old_conn_state; struct drm_crtc_commit *commit; int i; long ret; @@ -1808,6 +1879,48 @@ void drm_atomic_helper_wait_for_dependencies(struct drm_atomic_state *old_state) DRM_ERROR("[CRTC:%d:%s] flip_done timed out\n", crtc->base.id, crtc->name); } + + for_each_old_connector_in_state(old_state, conn, old_conn_state, i) { + commit = old_conn_state->commit; + + if (!commit) + continue; + + ret = wait_for_completion_timeout(&commit->hw_done, + 10*HZ); + if (ret == 0) + DRM_ERROR("[CONNECTOR:%d:%s] hw_done timed out\n", + conn->base.id, conn->name); + + /* Currently no support for overwriting flips, hence + * stall for previous one to execute completely. */ + ret = wait_for_completion_timeout(&commit->flip_done, + 10*HZ); + if (ret == 0) + DRM_ERROR("[CONNECTOR:%d:%s] flip_done timed out\n", + conn->base.id, conn->name); + } + + for_each_old_plane_in_state(old_state, plane, old_plane_state, i) { + commit = old_plane_state->commit; + + if (!commit) + continue; + + ret = wait_for_completion_timeout(&commit->hw_done, + 10*HZ); + if (ret == 0) + DRM_ERROR("[PLANE:%d:%s] hw_done timed out\n", + plane->base.id, plane->name); + + /* Currently no support for overwriting flips, hence + * stall for previous one to execute completely. 
*/ + ret = wait_for_completion_timeout(&commit->flip_done, + 10*HZ); + if (ret == 0) + DRM_ERROR("[PLANE:%d:%s] flip_done timed out\n", + plane->base.id, plane->name); + } } EXPORT_SYMBOL(drm_atomic_helper_wait_for_dependencies); @@ -1852,6 +1965,11 @@ void drm_atomic_helper_commit_hw_done(struct drm_atomic_state *old_state) WARN_ON(new_crtc_state->event); complete_all(&commit->hw_done); } + + if (old_state->fake_commit) { + complete_all(&old_state->fake_commit->hw_done); + complete_all(&old_state->fake_commit->flip_done); + } } EXPORT_SYMBOL(drm_atomic_helper_commit_hw_done); @@ -1885,6 +2003,9 @@ void drm_atomic_helper_commit_cleanup_done(struct drm_atomic_state *old_state) list_del(&commit->commit_entry); spin_unlock(&crtc->commit_lock); } + + if (old_state->fake_commit) + complete_all(&old_state->fake_commit->cleanup_done); } EXPORT_SYMBOL(drm_atomic_helper_commit_cleanup_done); @@ -2264,6 +2385,15 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state, struct drm_private_state *old_obj_state, *new_obj_state; if (stall) { + /* + * We have to stall for hw_done here before + * drm_atomic_helper_wait_for_dependencies() because flip + * depth > 1 is not yet supported by all drivers. As long as + * obj->state is directly dereferenced anywhere in the drivers + * atomic_commit_tail function, then it's unsafe to swap state + * before drm_atomic_helper_commit_hw_done() is called. + */ + for_each_old_crtc_in_state(state, crtc, old_crtc_state, i) { commit = old_crtc_state->commit; @@ -2274,6 +2404,28 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state, if (ret) return ret; } + + for_each_old_connector_in_state(state, connector, old_conn_state, i) { + commit = old_conn_state->commit; + + if (!commit) + continue; + + ret = wait_for_completion_interruptible(&commit->hw_done); + if (ret) + return ret; + } + + for_each_old_plane_in_state(state, plane, old_plane_state, i) { + commit = old_plane_state->commit; + + if (!commit) + continue; + + ret = wait_for_completion_interruptible(&commit->hw_done); + if (ret) + return ret; + } } for_each_oldnew_connector_in_state(state, connector, old_conn_state, new_conn_state, i) { @@ -3242,6 +3394,7 @@ void __drm_atomic_helper_plane_duplicate_state(struct drm_plane *plane, drm_framebuffer_get(state->fb); state->fence = NULL; + state->commit = NULL; } EXPORT_SYMBOL(__drm_atomic_helper_plane_duplicate_state); @@ -3283,6 +3436,9 @@ void __drm_atomic_helper_plane_destroy_state(struct drm_plane_state *state) if (state->fence) dma_fence_put(state->fence); + + if (state->commit) + drm_crtc_commit_put(state->commit); } EXPORT_SYMBOL(__drm_atomic_helper_plane_destroy_state); @@ -3361,6 +3517,7 @@ __drm_atomic_helper_connector_duplicate_state(struct drm_connector *connector, memcpy(state, connector->state, sizeof(*state)); if (state->crtc) drm_connector_get(connector); + state->commit = NULL; } EXPORT_SYMBOL(__drm_atomic_helper_connector_duplicate_state); @@ -3487,6 +3644,9 @@ __drm_atomic_helper_connector_destroy_state(struct drm_connector_state *state) { if (state->crtc) drm_connector_put(state->connector); + + if (state->commit) + drm_crtc_commit_put(state->commit); } EXPORT_SYMBOL(__drm_atomic_helper_connector_destroy_state); diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 67a8b6d9fd91..2e7376f9019f 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -13616,8 +13616,10 @@ intel_legacy_cursor_update(struct drm_plane *plane, /* Swap plane state */ 
new_plane_state->fence = old_plane_state->fence; + new_plane_state->commit = old_plane_state->commit; *to_intel_plane_state(old_plane_state) = *to_intel_plane_state(new_plane_state); new_plane_state->fence = NULL; + new_plane_state->commit = NULL; new_plane_state->fb = old_fb; to_intel_plane_state(new_plane_state)->vma = old_vma; diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index a80a8dadef00..07a71daa3582 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -235,6 +235,18 @@ struct drm_atomic_state { struct drm_modeset_acquire_ctx *acquire_ctx; + /** + * @fake_commit: + * + * Used for signaling unbound planes/connectors. + * When a connector or plane is not bound to any CRTC, it's still important + * to preserve linearity to prevent the atomic states from being freed too early. + * + * This commit (if set) is not bound to any crtc, but will be completed when + * drm_atomic_helper_commit_hw_done() is called. + */ + struct drm_crtc_commit *fake_commit; + /** * @commit_work: * diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index ea8da401c93c..8837649d16e8 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -347,6 +347,13 @@ struct drm_connector_state { struct drm_atomic_state *state; + /** + * @commit: Tracks the pending commit to prevent use-after-free conditions. + * + * Is only set when @crtc is NULL. + */ + struct drm_crtc_commit *commit; + struct drm_tv_connector_state tv; /** diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index 73f90f9d057f..7d96116fd4c4 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -123,6 +123,13 @@ struct drm_plane_state { */ bool visible; + /** + * @commit: Tracks the pending commit to prevent use-after-free conditions. + * + * Is only set when @crtc is NULL. + */ + struct drm_crtc_commit *commit; + struct drm_atomic_state *state; }; -- cgit v1.2.3 From 669c9215afea4e3684ef13e54e6908e9ae34f0ae Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Mon, 4 Sep 2017 12:48:38 +0200 Subject: drm/atomic: Make async plane update checks work as intended, v2. By always keeping track of the last commit in plane_state, we know whether there is an active update on the plane or not. With that information we can reject the fast update, and force the slowpath to be used as was originally intended. We cannot use plane_state->crtc->state here, because this only mentions the most recent commit for the crtc, but not the planes that were part of it. We specifically care about what the last commit involving this plane is, which can only be tracked with a pointer in the plane state. Changes since v1: - Clean up the whole function here, instead of partially earlier. - Add mention in the commit message why we need commit in plane_state. - Swap plane->state in intel_legacy_cursor_update, instead of reassigning all variables. With this commit we know that the cursor is not part of any active commits, so this hack can be removed.
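A minimal sketch of the resulting check (lifted from the helper and i915 hunks below): the async fast path is refused whenever the last commit touching this plane has not signalled hw_done yet:

    if (old_plane_state->commit &&
        !try_wait_for_completion(&old_plane_state->commit->hw_done))
            return -EBUSY;  /* in i915: goto slow; to force the slowpath */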
Cc: Gustavo Padovan Signed-off-by: Maarten Lankhorst Reviewed-by: Gustavo Padovan Reviewed-by: Daniel Vetter #v1 Link: https://patchwork.freedesktop.org/patch/msgid/20170904104838.23822-7-maarten.lankhorst@linux.intel.com [mlankhorst: Amend commit for merge conflicts with drm-intel] --- drivers/gpu/drm/drm_atomic_helper.c | 53 ++++++++++-------------------------- drivers/gpu/drm/i915/intel_display.c | 26 +++++++++++------- include/drm/drm_plane.h | 5 ++-- 3 files changed, 34 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 820adcda45b8..b97c054f9786 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1388,35 +1388,31 @@ int drm_atomic_helper_async_check(struct drm_device *dev, { struct drm_crtc *crtc; struct drm_crtc_state *crtc_state; - struct drm_crtc_commit *commit; - struct drm_plane *__plane, *plane = NULL; - struct drm_plane_state *__plane_state, *plane_state = NULL; + struct drm_plane *plane; + struct drm_plane_state *old_plane_state, *new_plane_state; const struct drm_plane_helper_funcs *funcs; - int i, j, n_planes = 0; + int i, n_planes = 0; for_each_new_crtc_in_state(state, crtc, crtc_state, i) { if (drm_atomic_crtc_needs_modeset(crtc_state)) return -EINVAL; } - for_each_new_plane_in_state(state, __plane, __plane_state, i) { + for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) n_planes++; - plane = __plane; - plane_state = __plane_state; - } /* FIXME: we support only single plane updates for now */ - if (!plane || n_planes != 1) + if (n_planes != 1) return -EINVAL; - if (!plane_state->crtc) + if (!new_plane_state->crtc) return -EINVAL; funcs = plane->helper_private; if (!funcs->atomic_async_update) return -EINVAL; - if (plane_state->fence) + if (new_plane_state->fence) return -EINVAL; /* @@ -1424,31 +1420,11 @@ int drm_atomic_helper_async_check(struct drm_device *dev, * the plane. This prevents our async update's changes from getting * overridden by a previous synchronous update's state. */ - for_each_new_crtc_in_state(state, crtc, crtc_state, i) { - if (plane->crtc != crtc) - continue; - - spin_lock(&crtc->commit_lock); - commit = list_first_entry_or_null(&crtc->commit_list, - struct drm_crtc_commit, - commit_entry); - if (!commit) { - spin_unlock(&crtc->commit_lock); - continue; - } - spin_unlock(&crtc->commit_lock); - - if (!crtc->state->state) - continue; + if (old_plane_state->commit && + !try_wait_for_completion(&old_plane_state->commit->hw_done)) + return -EBUSY; - for_each_plane_in_state(crtc->state->state, __plane, - __plane_state, j) { - if (__plane == plane) - return -EINVAL; - } - } - - return funcs->atomic_async_check(plane, plane_state); + return funcs->atomic_async_check(plane, new_plane_state); } EXPORT_SYMBOL(drm_atomic_helper_async_check); @@ -1814,9 +1790,10 @@ int drm_atomic_helper_setup_commit(struct drm_atomic_state *state, } for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { - /* commit tracked through new_crtc_state->commit, no need to do it explicitly */ - if (new_plane_state->crtc) - continue; + /* + * Unlike connectors, always track planes explicitly for + * async pageflip support. + */ /* Userspace is not allowed to get ahead of the previous * commit with nonblocking ones. 
*/ diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 2e7376f9019f..3f505682963f 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -13548,6 +13548,14 @@ intel_legacy_cursor_update(struct drm_plane *plane, goto slow; old_plane_state = plane->state; + /* + * Don't do an async update if there is an outstanding commit modifying + * the plane. This prevents our async update's changes from getting + * overridden by a previous synchronous update's state. + */ + if (old_plane_state->commit && + !try_wait_for_completion(&old_plane_state->commit->hw_done)) + goto slow; /* * If any parameters change that may affect watermarks, @@ -13609,19 +13617,12 @@ intel_legacy_cursor_update(struct drm_plane *plane, } old_fb = old_plane_state->fb; - old_vma = to_intel_plane_state(old_plane_state)->vma; i915_gem_track_fb(intel_fb_obj(old_fb), intel_fb_obj(fb), intel_plane->frontbuffer_bit); /* Swap plane state */ - new_plane_state->fence = old_plane_state->fence; - new_plane_state->commit = old_plane_state->commit; - *to_intel_plane_state(old_plane_state) = *to_intel_plane_state(new_plane_state); - new_plane_state->fence = NULL; - new_plane_state->commit = NULL; - new_plane_state->fb = old_fb; - to_intel_plane_state(new_plane_state)->vma = old_vma; + plane->state = new_plane_state; if (plane->state->visible) { trace_intel_update_plane(plane, to_intel_crtc(crtc)); @@ -13633,12 +13634,17 @@ intel_legacy_cursor_update(struct drm_plane *plane, } - intel_cleanup_plane_fb(plane, new_plane_state); + old_vma = fetch_and_zero(&to_intel_plane_state(old_plane_state)->vma); + if (old_vma) + intel_unpin_fb_vma(old_vma); out_unlock: mutex_unlock(&dev_priv->drm.struct_mutex); out_free: - intel_plane_destroy_state(plane, new_plane_state); + if (ret) + intel_plane_destroy_state(plane, new_plane_state); + else + intel_plane_destroy_state(plane, old_plane_state); return ret; slow: diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index 7d96116fd4c4..feb9941d0cdb 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -124,9 +124,10 @@ struct drm_plane_state { bool visible; /** - * @commit: Tracks the pending commit to prevent use-after-free conditions. + * @commit: Tracks the pending commit to prevent use-after-free conditions, + * and for async plane updates. - * - * Is only set when @crtc is NULL. + * May be NULL. */ struct drm_crtc_commit *commit; -- cgit v1.2.3 From 77ac3b00b13185741effd0d5e2f1f05e4bfef7dc Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 19 Jul 2017 16:39:20 +0200 Subject: drm/atomic: Remove deprecated accessor macros Now that the last users have been converted, we can finally get rid of for_each_obj_in_state; we have better macros to replace them with.
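For illustration only (a sketch, not part of this patch), a caller of one of the removed macros now has to name both states explicitly:

    /* deprecated: plane_state is the old or the new state depending on
     * whether drm_atomic_helper_swap_state() has run yet */
    for_each_plane_in_state(state, plane, plane_state, i)
            ; /* ambiguous which state this is */

    /* replacement: no ambiguity about which state is which */
    for_each_oldnew_plane_in_state(state, plane, old_plane_state,
                                   new_plane_state, i)
            ; /* old_plane_state and new_plane_state are both explicit */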
Signed-off-by: Maarten Lankhorst Cc: Daniel Vetter Cc: Jani Nikula Cc: Sean Paul Cc: David Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20170719143920.25685-8-maarten.lankhorst@linux.intel.com Reviewed-by: Daniel Vetter --- include/drm/drm_atomic.h | 75 --------------------------------------------- include/drm/drm_connector.h | 3 +- include/drm/drm_crtc.h | 8 ++--- include/drm/drm_plane.h | 8 ++--- 4 files changed, 9 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 07a71daa3582..5834580d75bc 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -569,31 +569,6 @@ int __must_check drm_atomic_nonblocking_commit(struct drm_atomic_state *state); void drm_state_dump(struct drm_device *dev, struct drm_printer *p); -/** - * for_each_connector_in_state - iterate over all connectors in an atomic update - * @__state: &struct drm_atomic_state pointer - * @connector: &struct drm_connector iteration cursor - * @connector_state: &struct drm_connector_state iteration cursor - * @__i: int iteration cursor, for macro-internal use - * - * This iterates over all connectors in an atomic update. Note that before the - * software state is committed (by calling drm_atomic_helper_swap_state(), this - * points to the new state, while afterwards it points to the old state. Due to - * this tricky confusion this macro is deprecated. - * - * FIXME: - * - * Replace all usage of this with one of the explicit iterators below and then - * remove this macro. - */ -#define for_each_connector_in_state(__state, connector, connector_state, __i) \ - for ((__i) = 0; \ - (__i) < (__state)->num_connector && \ - ((connector) = (__state)->connectors[__i].ptr, \ - (connector_state) = (__state)->connectors[__i].state, 1); \ - (__i)++) \ - for_each_if (connector) - /** * for_each_oldnew_connector_in_state - iterate over all connectors in an atomic update * @__state: &struct drm_atomic_state pointer @@ -657,31 +632,6 @@ void drm_state_dump(struct drm_device *dev, struct drm_printer *p); (__i)++) \ for_each_if (connector) -/** - * for_each_crtc_in_state - iterate over all CRTCs in an atomic update - * @__state: &struct drm_atomic_state pointer - * @crtc: &struct drm_crtc iteration cursor - * @crtc_state: &struct drm_crtc_state iteration cursor - * @__i: int iteration cursor, for macro-internal use - * - * This iterates over all CRTCs in an atomic update. Note that before the - * software state is committed (by calling drm_atomic_helper_swap_state(), this - * points to the new state, while afterwards it points to the old state. Due to - * this tricky confusion this macro is deprecated. - * - * FIXME: - * - * Replace all usage of this with one of the explicit iterators below and then - * remove this macro. 
- */ -#define for_each_crtc_in_state(__state, crtc, crtc_state, __i) \ - for ((__i) = 0; \ - (__i) < (__state)->dev->mode_config.num_crtc && \ - ((crtc) = (__state)->crtcs[__i].ptr, \ - (crtc_state) = (__state)->crtcs[__i].state, 1); \ - (__i)++) \ - for_each_if (crtc_state) - /** * for_each_oldnew_crtc_in_state - iterate over all CRTCs in an atomic update * @__state: &struct drm_atomic_state pointer @@ -741,31 +691,6 @@ void drm_state_dump(struct drm_device *dev, struct drm_printer *p); (__i)++) \ for_each_if (crtc) -/** - * for_each_plane_in_state - iterate over all planes in an atomic update - * @__state: &struct drm_atomic_state pointer - * @plane: &struct drm_plane iteration cursor - * @plane_state: &struct drm_plane_state iteration cursor - * @__i: int iteration cursor, for macro-internal use - * - * This iterates over all planes in an atomic update. Note that before the - * software state is committed (by calling drm_atomic_helper_swap_state(), this - * points to the new state, while afterwards it points to the old state. Due to - * this tricky confusion this macro is deprecated. - * - * FIXME: - * - * Replace all usage of this with one of the explicit iterators below and then - * remove this macro. - */ -#define for_each_plane_in_state(__state, plane, plane_state, __i) \ - for ((__i) = 0; \ - (__i) < (__state)->dev->mode_config.num_total_plane && \ - ((plane) = (__state)->planes[__i].ptr, \ - (plane_state) = (__state)->planes[__i].state, 1); \ - (__i)++) \ - for_each_if (plane_state) - /** * for_each_oldnew_plane_in_state - iterate over all planes in an atomic update * @__state: &struct drm_atomic_state pointer diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 8837649d16e8..b34904dc8b9b 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -895,8 +895,7 @@ struct drm_connector { * This is protected by @drm_mode_config.connection_mutex. Note that * nonblocking atomic commits access the current connector state without * taking locks. Either by going through the &struct drm_atomic_state - * pointers, see for_each_connector_in_state(), - * for_each_oldnew_connector_in_state(), + * pointers, see for_each_oldnew_connector_in_state(), * for_each_old_connector_in_state() and * for_each_new_connector_in_state(). Or through careful ordering of * atomic commit operations as implemented in the atomic helpers, see diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 901f5c054a2c..80c97210eda5 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -806,10 +806,10 @@ struct drm_crtc { * This is protected by @mutex. Note that nonblocking atomic commits * access the current CRTC state without taking locks. Either by going * through the &struct drm_atomic_state pointers, see - * for_each_crtc_in_state(), for_each_oldnew_crtc_in_state(), - * for_each_old_crtc_in_state() and for_each_new_crtc_in_state(). Or - * through careful ordering of atomic commit operations as implemented - * in the atomic helpers, see &struct drm_crtc_commit. + * for_each_oldnew_crtc_in_state(), for_each_old_crtc_in_state() and + * for_each_new_crtc_in_state(). Or through careful ordering of atomic + * commit operations as implemented in the atomic helpers, see + * &struct drm_crtc_commit. 
*/ struct drm_crtc_state *state; diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h index feb9941d0cdb..82a217bd77f0 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -539,10 +539,10 @@ struct drm_plane { * This is protected by @mutex. Note that nonblocking atomic commits * access the current plane state without taking locks. Either by going * through the &struct drm_atomic_state pointers, see - * for_each_plane_in_state(), for_each_oldnew_plane_in_state(), - * for_each_old_plane_in_state() and for_each_new_plane_in_state(). Or - * through careful ordering of atomic commit operations as implemented - * in the atomic helpers, see &struct drm_crtc_commit. + * for_each_oldnew_plane_in_state(), for_each_old_plane_in_state() and + * for_each_new_plane_in_state(). Or through careful ordering of atomic + * commit operations as implemented in the atomic helpers, see + * &struct drm_crtc_commit. */ struct drm_plane_state *state; -- cgit v1.2.3 From e1bf1687740ce1a3598a1c5e452b852ff2190682 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:51 +0200 Subject: netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1. It was not a good idea. The custom hash table was a much better fit for this purpose. A fast lookup is not essential, in fact for most cases there is no lookup at all because the original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion. rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows; when this happens, removal requests are so expensive that deletions of a few thousand flows can take several seconds(!). The advantages that we got from rhashtable are: 1) table auto-sizing 2) multiple locks 1) would be nice to have, but it is not essential as we have at most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table. I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid. Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size.
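As a sketch of the cost argument (the same code appears verbatim in the diff below): with a plain hlist the node itself is enough to unlink an entry in O(1), whereas rhltable_remove() first has to walk the per-key chain to find the node:

    spin_lock_bh(&nf_nat_lock);
    hlist_del_rcu(&ct->nat_bysource);   /* O(1), no chain walk */
    spin_unlock_bh(&nf_nat_lock);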
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +- include/net/netfilter/nf_nat.h | 1 - net/netfilter/nf_nat_core.c | 130 ++++++++++++++--------------------- 3 files changed, 54 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index fdc9c64a1c94..8f3bd30511de 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -77,7 +76,7 @@ struct nf_conn { possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) - struct rhlist_head nat_bysource; + struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ u8 __nfct_init_offset[0]; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 05c82a1a4267..b71701302e61 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -1,6 +1,5 @@ #ifndef _NF_NAT_H #define _NF_NAT_H -#include #include #include #include diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index dc3519cc7209..f090419f5f97 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include #include +static DEFINE_SPINLOCK(nf_nat_lock); + static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_nat_conn_key { - const struct net *net; - const struct nf_conntrack_tuple *tuple; - const struct nf_conntrack_zone *zone; -}; - -static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_tuple *t; - const struct nf_conn *ct = data; + unsigned int hash; + + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - seed ^= net_hash_mix(nf_ct_net(ct)); - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), - t->dst.protonum ^ seed); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? 
(not by us) */ @@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nf_nat_conn_key *key = arg->key; - const struct nf_conn *ct = obj; - - if (!same_src(ct, key->tuple) || - !net_eq(nf_ct_net(ct), key->net) || - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) - return 1; - - return 0; -} - -static struct rhashtable_params nf_nat_bysource_params = { - .head_offset = offsetof(struct nf_conn, nat_bysource), - .obj_hashfn = nf_nat_bysource_hash, - .obj_cmpfn = nf_nat_bysource_cmp, - .nelem_hint = 256, - .min_size = 1024, -}; - /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -216,26 +194,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { + unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; - struct nf_nat_conn_key key = { - .net = net, - .tuple = tuple, - .zone = zone - }; - struct rhlist_head *hl, *h; - - hl = rhltable_lookup(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { + if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } } - return 0; } @@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ @@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - struct nf_nat_conn_key key = { - .net = nf_ct_net(ct), - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - .zone = nf_ct_zone(ct), - }; - int err; - - err = rhltable_insert_key(&nf_nat_bysource_table, - &key, - &ct->nat_bysource, - nf_nat_bysource_params); - if (err) - return NF_DROP; + unsigned int srchash; + + srchash = hash_by_src(net, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + hlist_add_head_rcu(&ct->nat_bysource, + &nf_nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); } /* It's done. */ @@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. 
*/ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -825,13 +798,16 @@ static int __init nf_nat_init(void) { int ret; - ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); - if (ret) - return ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - - rhltable_destroy(&nf_nat_bysource_table); + synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); -- cgit v1.2.3 From b5ff8161e37cef3265e186ecded23324e4dc2973 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:10:49 -0700 Subject: mm: thp: introduce separate TTU flag for thp freezing TTU_MIGRATION is used to convert pte into migration entry until thp split completes. This behavior conflicts with thp migration added in later patches, so let's introduce a new TTU flag specifically for freezing. try_to_unmap() is used both for thp split (via freeze_page()) and page migration (via __unmap_and_move()). In freeze_page(), ttu_flag given for head page is like below (assuming anonymous thp): (TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | \ TTU_MIGRATION | TTU_SPLIT_HUGE_PMD) and ttu_flag given for tail pages is: (TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | \ TTU_MIGRATION) __unmap_and_move() calls try_to_unmap() with ttu_flag: (TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS) Now I'm trying to insert a branch for thp migration at the top of try_to_unmap_one() like below static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { ... /* PMD-mapped THP migration entry */ if (!pvmw.pte && (flags & TTU_MIGRATION)) { if (!PageAnon(page)) continue; set_pmd_migration_entry(&pvmw, page); continue; } ... } so try_to_unmap() for tail pages called by thp split can go into thp migration code path (which converts *pmd* into migration entry), while the expectation is to freeze thp (which converts *pte* into migration entry). I detected this failure as a "bad page state" error in a testcase where split_huge_page() is called from queue_pages_pte_range(). Link: http://lkml.kernel.org/r/20170717193955.20207-4-zi.yan@sent.com Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Acked-by: Kirill A. Shutemov Cc: "H.
Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 3 ++- mm/huge_memory.c | 2 +- mm/rmap.c | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 43ef2c30cb0f..f8ca2e74b819 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -93,8 +93,9 @@ enum ttu_flags { TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ + TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ }; #ifdef CONFIG_MMU diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0b51e70e0a8b..8a97833ef0f1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2210,7 +2210,7 @@ static void freeze_page(struct page *page) VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) - ttu_flags |= TTU_MIGRATION; + ttu_flags |= TTU_SPLIT_FREEZE; unmap_success = try_to_unmap(page, ttu_flags); VM_BUG_ON_PAGE(!unmap_success, page); diff --git a/mm/rmap.c b/mm/rmap.c index c570f82e6827..5b26af8a7a29 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1348,7 +1348,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, - flags & TTU_MIGRATION, page); + flags & TTU_SPLIT_FREEZE, page); } /* @@ -1445,7 +1445,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { + (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; pte_t swp_pte; /* @@ -1575,7 +1575,8 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) + if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE)) + && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) -- cgit v1.2.3 From 9c670ea37947a82cb6d4df69139f7e46ed71a0ac Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 8 Sep 2017 16:10:53 -0700 Subject: mm: thp: introduce CONFIG_ARCH_ENABLE_THP_MIGRATION Introduce CONFIG_ARCH_ENABLE_THP_MIGRATION to limit thp migration functionality to x86_64, which should be safer as a first step. Link: http://lkml.kernel.org/r/20170717193955.20207-5-zi.yan@sent.com Signed-off-by: Naoya Horiguchi Signed-off-by: Zi Yan Reviewed-by: Anshuman Khandual Cc: "H. Peter Anvin" Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Kirill A.
Shutemov
Cc: Mel Gorman
Cc: Minchan Kim
Cc: Thomas Gleixner
Cc: Vlastimil Babka
Cc: Andrea Arcangeli
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/Kconfig        |  4 ++++
 include/linux/huge_mm.h | 10 ++++++++++
 mm/Kconfig              |  3 +++
 3 files changed, 17 insertions(+)
(limited to 'include')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b278a33ccbb..07d9b6c75328 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2343,6 +2343,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 	def_bool y
 	depends on X86_64 && HUGETLB_PAGE && MIGRATION

+config ARCH_ENABLE_THP_MIGRATION
+	def_bool y
+	depends on X86_64 && TRANSPARENT_HUGEPAGE
+
 menu "Power management and ACPI options"

 config ARCH_HIBERNATION_HEADER
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ee696347f928..d8f35a0865dc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -233,6 +233,11 @@ void mm_put_huge_zero_page(struct mm_struct *mm);

 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))

+static inline bool thp_migration_supported(void)
+{
+	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
+}
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -336,6 +341,11 @@ static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
 {
 	return NULL;
 }
+
+static inline bool thp_migration_supported(void)
+{
+	return false;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 0ded10a22639..9ef8c2ea92ad 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -262,6 +262,9 @@ config MIGRATION
 config ARCH_ENABLE_HUGEPAGE_MIGRATION
 	bool

+config ARCH_ENABLE_THP_MIGRATION
+	bool
+
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
-- cgit v1.2.3

From 616b8371539a6c487404c3b8fb04078016dab4ba Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Fri, 8 Sep 2017 16:10:57 -0700
Subject: mm: thp: enable thp migration in generic path

Add thp migration's core code, including conversions between a PMD entry
and a swap entry, setting a PMD migration entry, removing a PMD migration
entry, and waiting on PMD migration entries.

This patch makes it possible to support thp migration. If you fail to
allocate a destination page as a thp, you just split the source thp as
we do now, and then enter the normal page migration. If you succeed in
allocating a destination thp, you enter thp migration. Subsequent
patches actually enable thp migration for each caller of page migration
by allowing its get_new_page() callback to allocate thps.

[zi.yan@cs.rutgers.edu: fix gcc-4.9.0 -Wmissing-braces warning]
Link: http://lkml.kernel.org/r/A0ABA698-7486-46C3-B209-E95A9048B22C@cs.rutgers.edu
[akpm@linux-foundation.org: fix x86_64 allnoconfig warning]
Signed-off-by: Zi Yan
Acked-by: Kirill A. Shutemov
Cc: "H.
Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 2 + include/linux/swapops.h | 73 ++++++++++++++++++++++++++++++++-- mm/huge_memory.c | 84 ++++++++++++++++++++++++++++++++++++--- mm/migrate.c | 32 ++++++++++++++- mm/page_vma_mapped.c | 18 +++++++-- mm/pgtable-generic.c | 3 +- mm/rmap.c | 13 ++++++ 7 files changed, 212 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 46bf71c77a25..972a4698c530 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -210,7 +210,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; } ((type) << (SWP_TYPE_FIRST_BIT)) \ | ((offset) << SWP_OFFSET_FIRST_BIT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c5ff7b217ee6..82089fd88c29 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -103,7 +103,8 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) #ifdef CONFIG_MIGRATION static inline swp_entry_t make_migration_entry(struct page *page, int write) { - BUG_ON(!PageLocked(page)); + BUG_ON(!PageLocked(compound_head(page))); + return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, page_to_pfn(page)); } @@ -126,7 +127,7 @@ static inline struct page *migration_entry_to_page(swp_entry_t entry) * Any use of migration entries may only occur while the * corresponding page is locked */ - BUG_ON(!PageLocked(p)); + BUG_ON(!PageLocked(compound_head(p))); return p; } @@ -148,7 +149,11 @@ static inline int is_migration_entry(swp_entry_t swp) { return 0; } -#define migration_entry_to_page(swp) NULL +static inline struct page *migration_entry_to_page(swp_entry_t entry) +{ + return NULL; +} + static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } @@ -163,6 +168,68 @@ static inline int is_write_migration_entry(swp_entry_t entry) #endif +struct page_vma_mapped_walk; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page); + +extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, + struct page *new); + +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); + +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) +{ + swp_entry_t arch_entry; + + arch_entry = __pmd_to_swp_entry(pmd); + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) +{ + swp_entry_t arch_entry; + + arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); + return __swp_entry_to_pmd(arch_entry); +} + +static inline int is_pmd_migration_entry(pmd_t pmd) +{ + return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); +} +#else +static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ 
+ BUILD_BUG(); +} + +static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, + struct page *new) +{ + BUILD_BUG(); +} + +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } + +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) +{ + return swp_entry(0, 0); +} + +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) +{ + return __pmd(0); +} + +static inline int is_pmd_migration_entry(pmd_t pmd) +{ + return 0; +} +#endif + #ifdef CONFIG_MEMORY_FAILURE extern atomic_long_t num_poisoned_pages __read_mostly; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a97833ef0f1..937f007794dd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1684,10 +1684,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { - struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page, true); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(!PageHead(page), page); + struct page *page = NULL; + int flush_needed = 1; + + if (pmd_present(orig_pmd)) { + page = pmd_page(orig_pmd); + page_remove_rmap(page, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else if (thp_migration_supported()) { + swp_entry_t entry; + + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); + entry = pmd_to_swp_entry(orig_pmd); + page = pfn_to_page(swp_offset(entry)); + flush_needed = 0; + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + if (PageAnon(page)) { zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); @@ -1696,8 +1710,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } + spin_unlock(ptl); - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + if (flush_needed) + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); } return 1; } @@ -2745,3 +2761,61 @@ static int __init split_huge_pages_debugfs(void) } late_initcall(split_huge_pages_debugfs); #endif + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + pmd_t pmdval; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + mmu_notifier_invalidate_range_start(mm, address, + address + HPAGE_PMD_SIZE); + + flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); + pmdval = *pvmw->pmd; + pmdp_invalidate(vma, address, pvmw->pmd); + if (pmd_dirty(pmdval)) + set_page_dirty(page); + entry = make_migration_entry(page, pmd_write(pmdval)); + pmdval = swp_entry_to_pmd(entry); + set_pmd_at(mm, address, pvmw->pmd, pmdval); + page_remove_rmap(page, true); + put_page(page); + + mmu_notifier_invalidate_range_end(mm, address, + address + HPAGE_PMD_SIZE); +} + +void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + unsigned long mmun_start = address & HPAGE_PMD_MASK; + pmd_t pmde; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + entry = pmd_to_swp_entry(*pvmw->pmd); + get_page(new); + pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + + 
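+	/*
+	 * (Editor's note: the lines below re-establish the huge pmd for the
+	 * new page: cache flush and rmap first, then the pmd entry itself,
+	 * then mlock and MMU-cache maintenance.)
+	 */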
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); + page_add_anon_rmap(new, vma, mmun_start, true); + set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + update_mmu_cache_pmd(vma, address, pvmw->pmd); +} +#endif diff --git a/mm/migrate.c b/mm/migrate.c index e84eeb4e4356..bf5366a2176b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -216,6 +216,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, new = page - pvmw.page->index + linear_page_index(vma, pvmw.address); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + remove_migration_pmd(&pvmw, new); + continue; + } +#endif + get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) @@ -330,6 +339,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, __migration_entry_wait(mm, pte, ptl); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl; + struct page *page; + + ptl = pmd_lock(mm, pmd); + if (!is_pmd_migration_entry(*pmd)) + goto unlock; + page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + if (!get_page_unless_zero(page)) + goto unlock; + spin_unlock(ptl); + wait_on_page_locked(page); + put_page(page); + return; +unlock: + spin_unlock(ptl); +} +#endif + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -1088,7 +1118,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) { + if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { lock_page(page); rc = split_huge_page(page); unlock_page(page); diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 8ec6ba230bb9..3bd3008db4cb 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -138,16 +138,28 @@ restart: if (!pud_present(*pud)) return false; pvmw->pmd = pmd_offset(pud, pvmw->address); - if (pmd_trans_huge(*pvmw->pmd)) { + if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) { pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (!pmd_present(*pvmw->pmd)) - return not_found(pvmw); if (likely(pmd_trans_huge(*pvmw->pmd))) { if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); if (pmd_page(*pvmw->pmd) != page) return not_found(pvmw); return true; + } else if (!pmd_present(*pvmw->pmd)) { + if (thp_migration_supported()) { + if (!(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + + if (migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; + } + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + return not_found(pvmw); } else { /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c99d9512a45b..1175f6a24fdb 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); pmd = 
pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/rmap.c b/mm/rmap.c index 5b26af8a7a29..7dc9c02f7106 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1360,6 +1360,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte && (flags & TTU_MIGRATION)) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + + if (!PageAnon(page)) + continue; + + set_pmd_migration_entry(&pvmw, page); + continue; + } +#endif + /* * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced -- cgit v1.2.3 From 84c3fc4e9c563d8fb91cfdf5948da48fe1af34d3 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 8 Sep 2017 16:11:01 -0700 Subject: mm: thp: check pmd migration entry in common path When THP migration is being used, memory management code needs to handle pmd migration entries properly. This patch uses !pmd_present() or is_swap_pmd() (depending on whether pmd_none() needs separate code or not) to check pmd migration entries at the places where a pmd entry is present. Since pmd-related code uses split_huge_page(), split_huge_pmd(), pmd_trans_huge(), pmd_trans_unstable(), or pmd_none_or_trans_huge_or_clear_bad(), this patch: 1. adds pmd migration entry split code in split_huge_pmd(), 2. takes care of pmd migration entries whenever pmd_trans_huge() is present, 3. makes pmd_none_or_trans_huge_or_clear_bad() pmd migration entry aware. Since split_huge_page() uses split_huge_pmd() and pmd_trans_unstable() is equivalent to pmd_none_or_trans_huge_or_clear_bad(), we do not change them. Until this commit, a pmd entry should be: 1. pointing to a pte page, 2. is_swap_pmd(), 3. pmd_trans_huge(), 4. pmd_devmap(), or 5. pmd_none(). Signed-off-by: Zi Yan Cc: Kirill A. Shutemov Cc: "H. Peter Anvin" Cc: Anshuman Khandual Cc: Dave Hansen Cc: David Nellans Cc: Ingo Molnar Cc: Mel Gorman Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 32 +++++++++++++-------- include/asm-generic/pgtable.h | 18 +++++++++++- include/linux/huge_mm.h | 14 ++++++++-- mm/gup.c | 22 +++++++++++++-- mm/huge_memory.c | 65 +++++++++++++++++++++++++++++++++++++++---- mm/memcontrol.c | 5 ++++ mm/memory.c | 12 ++++++-- mm/mprotect.c | 4 +-- mm/mremap.c | 2 +- 9 files changed, 147 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a290966f91ec..8eec35af32e4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -608,7 +608,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - smaps_pmd_entry(pmd, addr, walk); + if (pmd_present(*pmd)) + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; } @@ -1012,6 +1013,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } + if (!pmd_present(*pmd)) + goto out; + page = pmd_page(*pmd); /* Clear accessed and referenced bits. 
 */
@@ -1293,27 +1297,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	if (ptl) {
 		u64 flags = 0, frame = 0;
 		pmd_t pmd = *pmdp;
+		struct page *page = NULL;

 		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
 			flags |= PM_SOFT_DIRTY;

-		/*
-		 * Currently pmd for thp is always present because thp
-		 * can not be swapped-out, migrated, or HWPOISONed
-		 * (split in such cases instead.)
-		 * This if-check is just to prepare for future implementation.
-		 */
 		if (pmd_present(pmd)) {
-			struct page *page = pmd_page(pmd);
-
-			if (page_mapcount(page) == 1)
-				flags |= PM_MMAP_EXCLUSIVE;
+			page = pmd_page(pmd);

 			flags |= PM_PRESENT;
 			if (pm->show_pfn)
 				frame = pmd_pfn(pmd) +
 					((addr & ~PMD_MASK) >> PAGE_SHIFT);
 		}
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+		else if (is_swap_pmd(pmd)) {
+			swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+			frame = swp_type(entry) |
+				(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+			flags |= PM_SWAP;
+			VM_BUG_ON(!is_pmd_migration_entry(pmd));
+			page = migration_entry_to_page(entry);
+		}
+#endif
+
+		if (page && page_mapcount(page) == 1)
+			flags |= PM_MMAP_EXCLUSIVE;

 		for (; addr != end; addr += PAGE_SIZE) {
 			pagemap_entry_t pme = make_pme(frame, flags);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 4d7bb98f4134..4f93a6d10a47 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -846,7 +846,23 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	barrier();
 #endif
-	if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
+	/*
+	 * !pmd_present() checks for pmd migration entries
+	 *
+	 * The complete check uses is_pmd_migration_entry() in linux/swapops.h
+	 * But using that requires moving current function and pmd_trans_unstable()
+	 * to linux/swapops.h to resolve dependency, which is too much code move.
+	 *
+	 * !pmd_present() is equivalent to is_pmd_migration_entry() currently,
+	 * because !pmd_present() pages can only be under migration, not swapped
+	 * out.
+	 *
+	 * pmd_none() is preserved for future condition checks on pmd migration
+	 * entries and to avoid confusion with this function name, although it is
+	 * redundant with !pmd_present().
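+	 *
+	 * (Editor's illustrative note, not in the original patch: because the
+	 * check below also returns 1 for a pmd migration entry, callers such
+	 * as pmd_trans_unstable() treat a migrating pmd as "unstable" and
+	 * retry or fall back instead of hitting pmd_clear_bad() on it.)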
+ */ + if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || + (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) return 1; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d8f35a0865dc..14bc21c2ee7f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -147,7 +147,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (pmd_trans_huge(*____pmd) \ + if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ @@ -178,12 +178,18 @@ extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); extern spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); + +static inline int is_swap_pmd(pmd_t pmd) +{ + return !pmd_none(pmd) && !pmd_present(pmd); +} + /* mmap_sem must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; @@ -299,6 +305,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } +static inline int is_swap_pmd(pmd_t pmd) +{ + return 0; +} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { diff --git a/mm/gup.c b/mm/gup.c index 33d651deeae2..76fd199aaae2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -234,6 +234,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } +retry: + if (!pmd_present(*pmd)) { + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(*pmd)); + if (is_pmd_migration_entry(*pmd)) + pmd_migration_entry_wait(mm, pmd); + goto retry; + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -247,7 +257,15 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); +retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags); @@ -424,7 +442,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pud = pud_offset(p4d, address); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) + if (!pmd_present(*pmd)) return -EFAULT; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, address); @@ -1534,7 +1552,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + if (!pmd_present(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 937f007794dd..b82585eabe85 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -928,6 +928,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; 
pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + if (unlikely(!pmd_trans_huge(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; @@ -1599,6 +1616,12 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (is_huge_zero_pmd(orig_pmd)) goto out; + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto out; + } + page = pmd_page(orig_pmd); /* * If other processes are mapping this page, we couldn't discard @@ -1810,6 +1833,25 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, preserve_write = prot_numa && pmd_write(*pmd); ret = 1; +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (is_swap_pmd(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + if (is_write_migration_entry(entry)) { + pmd_t newpmd; + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpmd = swp_entry_to_pmd(entry); + set_pmd_at(mm, addr, pmd, newpmd); + } + goto unlock; + } +#endif + /* * Avoid trapping faults against the zero page. The read-only * data is likely to be read-cached on the local CPU and @@ -1875,7 +1917,8 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { spinlock_t *ptl; ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) return ptl; spin_unlock(ptl); return NULL; @@ -1993,14 +2036,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t _pmd; - bool young, write, dirty, soft_dirty; + bool young, write, dirty, soft_dirty, pmd_migration = false; unsigned long addr; int i; VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); VM_BUG_ON_VMA(vma->vm_start > haddr, vma); VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); - VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); count_vm_event(THP_SPLIT_PMD); @@ -2025,7 +2069,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } - page = pmd_page(*pmd); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + pmd_migration = is_pmd_migration_entry(*pmd); + if (pmd_migration) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(*pmd); + page = pfn_to_page(swp_offset(entry)); + } else +#endif + page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); @@ -2044,7 +2097,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * transferred to avoid any possibility of altering * permissions across VMAs. 
*/ - if (freeze) { + if (freeze || pmd_migration) { swp_entry_t swp_entry; swp_entry = make_migration_entry(page + i, write); entry = swp_entry_to_pte(swp_entry); @@ -2143,7 +2196,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); if (PageMlocked(page)) clear_page_mlock(page); - } else if (!pmd_devmap(*pmd)) + } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6532b219b222..f1f3f5b41155 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4664,6 +4664,11 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, struct page *page = NULL; enum mc_target_type ret = MC_TARGET_NONE; + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) diff --git a/mm/memory.c b/mm/memory.c index 13ee83b43878..886033b95fd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1065,7 +1065,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, @@ -1326,7 +1327,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); @@ -3911,6 +3912,13 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pmd_t orig_pmd = *vmf.pmd; barrier(); + if (unlikely(is_swap_pmd(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + if (is_pmd_migration_entry(orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); diff --git a/mm/mprotect.c b/mm/mprotect.c index bd0f409922cb..a1bfe9545770 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -149,7 +149,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) && pmd_none_or_clear_bad(pmd)) continue; @@ -159,7 +159,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { __split_huge_pmd(vma, pmd, addr, false, NULL); } else { diff --git a/mm/mremap.c b/mm/mremap.c index 7395564daa6c..cfec004c4ff9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -223,7 +223,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (pmd_trans_huge(*old_pmd)) { + if 
(is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
 			if (extent == HPAGE_PMD_SIZE) {
 				bool moved;
 				/* See comment in move_ptes() */
-- cgit v1.2.3

From ab6e3d0939bb332d72444a532f0f72e0dfde7b7b Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Fri, 8 Sep 2017 16:11:04 -0700
Subject: mm: soft-dirty: keep soft-dirty bits over thp migration

The soft-dirty bit is designed to stay tracked across page migration.
This patch makes it work in the same manner for thp migration too.

Signed-off-by: Naoya Horiguchi
Signed-off-by: Zi Yan
Cc: "H. Peter Anvin"
Cc: Anshuman Khandual
Cc: Dave Hansen
Cc: David Nellans
Cc: Ingo Molnar
Cc: Kirill A. Shutemov
Cc: Mel Gorman
Cc: Minchan Kim
Cc: Thomas Gleixner
Cc: Vlastimil Babka
Cc: Andrea Arcangeli
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/pgtable.h | 17 +++++++++++++++
 fs/proc/task_mmu.c             | 27 ++++++++++++++-----------
 include/asm-generic/pgtable.h  | 34 +++++++++++++++++++++++++++++++++-
 include/linux/swapops.h        |  2 ++
 mm/huge_memory.c               | 27 ++++++++++++++++++++++---
 5 files changed, 92 insertions(+), 15 deletions(-)
(limited to 'include')

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index bbeae4a2bd01..5b4c44d419c5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1172,6 +1172,23 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+	return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+#endif
 #endif

 #define PKRU_AD_BIT 0x1
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8eec35af32e4..4b21c4e51ce4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -978,17 +978,22 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 {
 	pmd_t pmd = *pmdp;

-	/* See comment in change_huge_pmd() */
-	pmdp_invalidate(vma, addr, pmdp);
-	if (pmd_dirty(*pmdp))
-		pmd = pmd_mkdirty(pmd);
-	if (pmd_young(*pmdp))
-		pmd = pmd_mkyoung(pmd);
-
-	pmd = pmd_wrprotect(pmd);
-	pmd = pmd_clear_soft_dirty(pmd);
-
-	set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+	if (pmd_present(pmd)) {
+		/* See comment in change_huge_pmd() */
+		pmdp_invalidate(vma, addr, pmdp);
+		if (pmd_dirty(*pmdp))
+			pmd = pmd_mkdirty(pmd);
+		if (pmd_young(*pmdp))
+			pmd = pmd_mkyoung(pmd);
+
+		pmd = pmd_wrprotect(pmd);
+		pmd = pmd_clear_soft_dirty(pmd);
+
+		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+		pmd = pmd_swp_clear_soft_dirty(pmd);
+		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+	}
 }
 #else
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 4f93a6d10a47..8e0243036564 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -630,7 +630,24 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #define arch_start_context_switch(prev)	do {} while (0)
 #endif

-#ifndef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+	return pmd;
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+	return 0;
+}
+
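+/*
+ * (Editor's note: these are no-op stubs; without ARCH_ENABLE_THP_MIGRATION
+ * a pmd can never be a swap/migration entry, so there is no swap soft-dirty
+ * bit to set, test, or clear on it.)
+ */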
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pmd; +} +#endif +#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; @@ -675,6 +692,21 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } + +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) +{ + return pmd; +} + +static inline int pmd_swp_soft_dirty(pmd_t pmd) +{ + return 0; +} + +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pmd; +} #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 82089fd88c29..45b092aa6419 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -183,6 +183,8 @@ static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { swp_entry_t arch_entry; + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); arch_entry = __pmd_to_swp_entry(pmd); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b82585eabe85..269b5df58543 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -937,6 +937,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_write_migration_entry(entry)) { make_migration_entry_read(&entry); pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); set_pmd_at(src_mm, addr, src_pmd, pmd); } set_pmd_at(dst_mm, addr, dst_pmd, pmd); @@ -1756,6 +1758,17 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, } #endif +static pmd_t move_soft_dirty_pmd(pmd_t pmd) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (unlikely(is_pmd_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); +#endif + return pmd; +} + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1798,7 +1811,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); } - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + pmd = move_soft_dirty_pmd(pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd); if (new_ptl != old_ptl) spin_unlock(new_ptl); if (force_flush) @@ -1846,6 +1860,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ make_migration_entry_read(&entry); newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); set_pmd_at(mm, addr, pmd, newpmd); } goto unlock; @@ -2824,6 +2840,7 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, unsigned long address = pvmw->address; pmd_t pmdval; swp_entry_t entry; + pmd_t pmdswp; if (!(pvmw->pmd && !pvmw->pte)) return; @@ -2837,8 +2854,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_dirty(pmdval)) set_page_dirty(page); entry = make_migration_entry(page, pmd_write(pmdval)); - pmdval = swp_entry_to_pmd(entry); - set_pmd_at(mm, address, pvmw->pmd, pmdval); + pmdswp = swp_entry_to_pmd(entry); + if (pmd_soft_dirty(pmdval)) + pmdswp = pmd_swp_mksoft_dirty(pmdswp); + set_pmd_at(mm, address, pvmw->pmd, pmdswp); page_remove_rmap(page, true); put_page(page); @@ -2861,6 +2880,8 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); get_page(new); pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + if 
(pmd_swp_soft_dirty(*pvmw->pmd))
+		pmde = pmd_mksoft_dirty(pmde);
 	if (is_write_migration_entry(entry))
 		pmde = maybe_pmd_mkwrite(pmde, vma);
-- cgit v1.2.3

From 8135d8926c08e553e39b0b040c6d01f0daef0676 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Fri, 8 Sep 2017 16:11:15 -0700
Subject: mm: memory_hotplug: memory hotremove supports thp migration

This patch enables thp migration for memory hotremove.

Link: http://lkml.kernel.org/r/20170717193955.20207-11-zi.yan@sent.com
Signed-off-by: Naoya Horiguchi
Signed-off-by: Zi Yan
Cc: "H. Peter Anvin"
Cc: Anshuman Khandual
Cc: Dave Hansen
Cc: David Nellans
Cc: Ingo Molnar
Cc: Kirill A. Shutemov
Cc: Mel Gorman
Cc: Minchan Kim
Cc: Thomas Gleixner
Cc: Vlastimil Babka
Cc: Andrea Arcangeli
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/migrate.h | 15 ++++++++++++++-
 mm/memory_hotplug.c     |  4 +++-
 2 files changed, 17 insertions(+), 2 deletions(-)
(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3e0d405dc842..ce15989521a1 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -35,15 +35,28 @@ static inline struct page *new_page_nodemask(struct page *page,
 		int preferred_nid, nodemask_t *nodemask)
 {
 	gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL;
+	unsigned int order = 0;
+	struct page *new_page = NULL;

 	if (PageHuge(page))
 		return alloc_huge_page_nodemask(page_hstate(compound_head(page)),
 				preferred_nid, nodemask);

+	if (thp_migration_supported() && PageTransHuge(page)) {
+		order = HPAGE_PMD_ORDER;
+		gfp_mask |= GFP_TRANSHUGE;
+	}
+
 	if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
 		gfp_mask |= __GFP_HIGHMEM;

-	return __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask);
+	new_page = __alloc_pages_nodemask(gfp_mask, order,
+				preferred_nid, nodemask);
+
+	if (new_page && PageTransHuge(page))
+		prep_transhuge_page(new_page);
+
+	return new_page;
 }

 #ifdef CONFIG_MIGRATION
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 73bf17df6899..1f92fb84770d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1380,7 +1380,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			if (isolate_huge_page(page, &source))
 				move_pages -= 1 << compound_order(head);
 			continue;
-		}
+		} else if (thp_migration_supported() && PageTransHuge(page))
+			pfn = page_to_pfn(compound_head(page))
+				+ hpage_nr_pages(page) - 1;

 		if (!get_page_unless_zero(page))
 			continue;
-- cgit v1.2.3

From 133ff0eac95b7dc6edf89dc51bd139a0630bbae7 Mon Sep 17 00:00:00 2001
From: Jérôme Glisse
Date: Fri, 8 Sep 2017 16:11:23 -0700
Subject: mm/hmm: heterogeneous memory management (HMM for short)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HMM provides 3 separate types of functionality:
    - Mirroring: synchronize CPU page table and device page table
    - Device memory: allocating struct page for device memory
    - Migration: migrating regular memory to device memory

This patch introduces some helpers and definitions common to all three
types of functionality.

Link: http://lkml.kernel.org/r/20170817000548.32038-3-jglisse@redhat.com
Signed-off-by: Jérôme Glisse
Signed-off-by: Evgeny Baskakov
Signed-off-by: John Hubbard
Signed-off-by: Mark Hairgrove
Signed-off-by: Sherry Cheung
Signed-off-by: Subhash Gutti
Cc: Aneesh Kumar
Cc: Balbir Singh
Cc: Benjamin Herrenschmidt
Cc: Dan Williams
Cc: David Nellans
Cc: Johannes Weiner
Cc: Kirill A. Shutemov
Cc: Michal Hocko
Cc: Paul E.
McKenney
Cc: Ross Zwisler
Cc: Vladimir Davydov
Cc: Bob Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hmm.h      | 152 +++++++++++++++++++++++++++++++++++++++
 include/linux/mm_types.h |   6 ++
 kernel/fork.c            |   3 +
 mm/Kconfig               |  13 ++++
 mm/Makefile              |   2 +-
 mm/hmm.c                 |  74 +++++++++++++++++++++++
 6 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c
(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index 000000000000..5d83fec6dfdd
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse
+ */
+/*
+ * Heterogeneous Memory Management (HMM)
+ *
+ * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and
+ * what it is for. Here we focus on the HMM API description, with some
+ * explanation of the underlying implementation.
+ *
+ * Short description: HMM provides a set of helpers to share a virtual address
+ * space between CPU and a device, so that the device can access any valid
+ * address of the process (while still obeying memory protection). HMM also
+ * provides helpers to migrate process memory to device memory, and back. Each
+ * set of functionality (address space mirroring, and migration to and from
+ * device memory) can be used independently of the other.
+ *
+ *
+ * HMM address space mirroring API:
+ *
+ * Use HMM address space mirroring if you want to mirror a range of the CPU
+ * page table of a process into a device page table. Here, "mirror" means
+ * "keep synchronized". Prerequisites: the device must provide the ability to
+ * write-protect its page tables (at PAGE_SIZE granularity), and must be able
+ * to recover from the resulting potential page faults.
+ *
+ * HMM guarantees that at any point in time, a given virtual address points to
+ * either the same memory in both CPU and device page tables (that is: CPU and
+ * device page tables each point to the same pages), or that one page table
+ * (CPU or device) points to no entry, while the other still points to the old
+ * page for the address. The latter case happens when the CPU page table
+ * update happens first, and then the update is mirrored over to the device
+ * page table. This does not cause any issue, because the CPU page table
+ * cannot start pointing to a new page until the device page table is
+ * invalidated.
+ *
+ * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
+ * updates to each device driver that has registered a mirror. It also
+ * provides some API calls to help with taking a snapshot of the CPU page
+ * table, and to synchronize with any updates that might happen concurrently.
+ *
+ *
+ * HMM migration to and from device memory:
+ *
+ * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
+ * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
This provides a struct page for each page + * of the device memory, and allows the device driver to manage its memory + * using those struct pages. Having struct pages for device memory makes + * migration easier. Because that memory is not addressable by the CPU it must + * never be pinned to the device; in other words, any CPU page fault can always + * cause the device memory to be migrated (copied/moved) back to regular memory. + * + * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that + * allows use of a device DMA engine to perform the copy operation between + * regular system memory and device memory. + */ +#ifndef LINUX_HMM_H +#define LINUX_HMM_H + +#include + +#if IS_ENABLED(CONFIG_HMM) + + +/* + * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page + * + * Flags: + * HMM_PFN_VALID: pfn is valid + * HMM_PFN_WRITE: CPU page table has write permission set + */ +typedef unsigned long hmm_pfn_t; + +#define HMM_PFN_VALID (1 << 0) +#define HMM_PFN_WRITE (1 << 1) +#define HMM_PFN_SHIFT 2 + +/* + * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t + * @pfn: hmm_pfn_t to convert to struct page + * Returns: struct page pointer if pfn is a valid hmm_pfn_t, NULL otherwise + * + * If the hmm_pfn_t is valid (ie valid flag set) then return the struct page + * matching the pfn value stored in the hmm_pfn_t. Otherwise return NULL. + */ +static inline struct page *hmm_pfn_t_to_page(hmm_pfn_t pfn) +{ + if (!(pfn & HMM_PFN_VALID)) + return NULL; + return pfn_to_page(pfn >> HMM_PFN_SHIFT); +} + +/* + * hmm_pfn_t_to_pfn() - return pfn value store in a hmm_pfn_t + * @pfn: hmm_pfn_t to extract pfn from + * Returns: pfn value if hmm_pfn_t is valid, -1UL otherwise + */ +static inline unsigned long hmm_pfn_t_to_pfn(hmm_pfn_t pfn) +{ + if (!(pfn & HMM_PFN_VALID)) + return -1UL; + return (pfn >> HMM_PFN_SHIFT); +} + +/* + * hmm_pfn_t_from_page() - create a valid hmm_pfn_t value from struct page + * @page: struct page pointer for which to create the hmm_pfn_t + * Returns: valid hmm_pfn_t for the page + */ +static inline hmm_pfn_t hmm_pfn_t_from_page(struct page *page) +{ + return (page_to_pfn(page) << HMM_PFN_SHIFT) | HMM_PFN_VALID; +} + +/* + * hmm_pfn_t_from_pfn() - create a valid hmm_pfn_t value from pfn + * @pfn: pfn value for which to create the hmm_pfn_t + * Returns: valid hmm_pfn_t for the pfn + */ +static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) +{ + return (pfn << HMM_PFN_SHIFT) | HMM_PFN_VALID; +} + + +/* Below are for HMM internal use only! Not to be used by device driver! */ +void hmm_mm_destroy(struct mm_struct *mm); + +static inline void hmm_mm_init(struct mm_struct *mm) +{ + mm->hmm = NULL; +} + +#else /* IS_ENABLED(CONFIG_HMM) */ + +/* Below are for HMM internal use only! Not to be used by device driver! 
*/ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} + +#endif /* IS_ENABLED(CONFIG_HMM) */ +#endif /* LINUX_HMM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f45ad815b7d7..46f4ecf5479a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,6 +23,7 @@ struct address_space; struct mem_cgroup; +struct hmm; /* * Each physical page in the system has a struct page associated with @@ -503,6 +504,11 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; + +#if IS_ENABLED(CONFIG_HMM) + /* HMM needs to track a few things per mm */ + struct hmm *hmm; +#endif } __randomize_layout; extern struct mm_struct init_mm; diff --git a/kernel/fork.c b/kernel/fork.c index 24a4c0be80d5..2ccbbbfcb7b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/mm/Kconfig b/mm/Kconfig index 9ef8c2ea92ad..037fa26d16a2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -692,6 +692,19 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM + bool + default y + depends on (X86_64 || PPC64) + depends on ZONE_DEVICE + depends on MMU && 64BIT + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + +config HMM + bool + config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index 411bd24d4a7c..1cde2a8bed97 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o hmm.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000000..de032ff9e576 --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,74 @@ +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. + */ +#include +#include +#include +#include + + +#ifdef CONFIG_HMM +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + */ +struct hmm { + struct mm_struct *mm; +}; + +/* + * hmm_register - register HMM against an mm (HMM internal) + * + * @mm: mm struct to attach to + * + * This is not intended to be used directly by device drivers. 
It allocates an + * HMM struct if mm does not have one, and initializes it. + */ +static struct hmm *hmm_register(struct mm_struct *mm) +{ + if (!mm->hmm) { + struct hmm *hmm = NULL; + + hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + if (!hmm) + return NULL; + hmm->mm = mm; + + spin_lock(&mm->page_table_lock); + if (!mm->hmm) + mm->hmm = hmm; + else + kfree(hmm); + spin_unlock(&mm->page_table_lock); + } + + /* + * The hmm struct can only be freed once the mm_struct goes away, + * hence we should always have pre-allocated an new hmm struct + * above. + */ + return mm->hmm; +} + +void hmm_mm_destroy(struct mm_struct *mm) +{ + kfree(mm->hmm); +} +#endif /* CONFIG_HMM */ -- cgit v1.2.3 From c0b124054f9e42eb6da545a10fe9122a7d7c3f72 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:27 -0700 Subject: mm/hmm/mirror: mirror process address space on device with HMM helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a heterogeneous memory management (HMM) process address space mirroring. In a nutshell this provide an API to mirror process address space on a device. This boils down to keeping CPU and device page table synchronize (we assume that both device and CPU are cache coherent like PCIe device can be). This patch provide a simple API for device driver to achieve address space mirroring thus avoiding each device driver to grow its own CPU page table walker and its own CPU page table synchronization mechanism. This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more hardware in the future. [jglisse@redhat.com: fix hmm for "mmu_notifier kill invalidate_page callback"] Link: http://lkml.kernel.org/r/20170830231955.GD9445@redhat.com Link: http://lkml.kernel.org/r/20170817000548.32038-4-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 110 +++++++++++++++++++++++++++++++++++++ mm/Kconfig | 12 +++++ mm/hmm.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 260 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 5d83fec6dfdd..76310726ee9c 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -72,6 +72,7 @@ #if IS_ENABLED(CONFIG_HMM) +struct hmm; /* * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page @@ -134,6 +135,115 @@ static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long pfn) } +#if IS_ENABLED(CONFIG_HMM_MIRROR) +/* + * Mirroring: how to synchronize device page table with CPU page table. + * + * A device driver that is participating in HMM mirroring must always + * synchronize with CPU page table updates. For this, device drivers can either + * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device + * drivers can decide to register one mirror per device per process, or just + * one mirror per process for a group of devices. The pattern is: + * + * int device_bind_address_space(..., struct mm_struct *mm, ...) 
+ * { + * struct device_address_space *das; + * + * // Device driver specific initialization, and allocation of das + * // which contains an hmm_mirror struct as one of its fields. + * ... + * + * ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops); + * if (ret) { + * // Cleanup on error + * return ret; + * } + * + * // Other device driver specific initialization + * ... + * } + * + * Once an hmm_mirror is registered for an address space, the device driver + * will get callbacks through sync_cpu_device_pagetables() operation (see + * hmm_mirror_ops struct). + * + * Device driver must not free the struct containing the hmm_mirror struct + * before calling hmm_mirror_unregister(). The expected usage is to do that when + * the device driver is unbinding from an address space. + * + * + * void device_unbind_address_space(struct device_address_space *das) + * { + * // Device driver specific cleanup + * ... + * + * hmm_mirror_unregister(&das->mirror); + * + * // Other device driver specific cleanup, and now das can be freed + * ... + * } + */ + +struct hmm_mirror; + +/* + * enum hmm_update_type - type of update + * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) + */ +enum hmm_update_type { + HMM_UPDATE_INVALIDATE, +}; + +/* + * struct hmm_mirror_ops - HMM mirror device operations callback + * + * @update: callback to update range on a device + */ +struct hmm_mirror_ops { + /* sync_cpu_device_pagetables() - synchronize page tables + * + * @mirror: pointer to struct hmm_mirror + * @update_type: type of update that occurred to the CPU page table + * @start: virtual start address of the range to update + * @end: virtual end address of the range to update + * + * This callback ultimately originates from mmu_notifiers when the CPU + * page table is updated. The device driver must update its page table + * in response to this callback. The update argument tells what action + * to perform. + * + * The device driver must not return from this callback until the device + * page tables are completely updated (TLBs flushed, etc); this is a + * synchronous call. + */ + void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, + enum hmm_update_type update_type, + unsigned long start, + unsigned long end); +}; + +/* + * struct hmm_mirror - mirror struct for a device driver + * + * @hmm: pointer to struct hmm (which is unique per mm_struct) + * @ops: device driver callback for HMM mirror operations + * @list: for list of mirrors of a given mm + * + * Each address space (mm_struct) being mirrored by a device must register one + * instance of an hmm_mirror struct with HMM. HMM will track the list of all + * mirrors for each mm_struct. + */ +struct hmm_mirror { + struct hmm *hmm; + const struct hmm_mirror_ops *ops; + struct list_head list; +}; + +int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); +void hmm_mirror_unregister(struct hmm_mirror *mirror); +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ + + /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/Kconfig b/mm/Kconfig index 037fa26d16a2..254db99f263d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,6 +705,18 @@ config ARCH_HAS_HMM config HMM bool +config HMM_MIRROR + bool "HMM mirror CPU page table into a device page table" + depends on ARCH_HAS_HMM + select MMU_NOTIFIER + select HMM + help + Select HMM_MIRROR if you want to mirror range of the CPU page table of a + process into a device page table. 
Here, mirror means "keep synchronized".
+	  Prerequisites: the device must provide the ability to write-protect its
+	  page tables (at PAGE_SIZE granularity), and must be able to recover from
+	  the resulting potential page faults.
+
 config FRAME_VECTOR
 	bool
diff --git a/mm/hmm.c b/mm/hmm.c
index de032ff9e576..d37daf9edcd3 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -21,16 +21,27 @@
 #include
 #include
 #include
+#include

 #ifdef CONFIG_HMM
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
+
 /*
  * struct hmm - HMM per mm struct
  *
  * @mm: mm struct this HMM struct is bound to
+ * @sequence: we track updates to the CPU page table with a sequence number
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
  */
 struct hmm {
 	struct mm_struct	*mm;
+	atomic_t		sequence;
+	struct list_head	mirrors;
+	struct mmu_notifier	mmu_notifier;
+	struct rw_semaphore	mirrors_sem;
 };

 /*
@@ -43,27 +54,48 @@ struct hmm {
  */
 static struct hmm *hmm_register(struct mm_struct *mm)
 {
-	if (!mm->hmm) {
-		struct hmm *hmm = NULL;
-
-		hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
-		if (!hmm)
-			return NULL;
-		hmm->mm = mm;
-
-		spin_lock(&mm->page_table_lock);
-		if (!mm->hmm)
-			mm->hmm = hmm;
-		else
-			kfree(hmm);
-		spin_unlock(&mm->page_table_lock);
-	}
+	struct hmm *hmm = READ_ONCE(mm->hmm);
+	bool cleanup = false;

 	/*
 	 * The hmm struct can only be freed once the mm_struct goes away,
 	 * hence we should always have pre-allocated a new hmm struct
 	 * above.
 	 */
+	if (hmm)
+		return hmm;
+
+	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+	if (!hmm)
+		return NULL;
+	INIT_LIST_HEAD(&hmm->mirrors);
+	init_rwsem(&hmm->mirrors_sem);
+	atomic_set(&hmm->sequence, 0);
+	hmm->mmu_notifier.ops = NULL;
+	hmm->mm = mm;
+
+	/*
+	 * We should only get here if we hold the mmap_sem in write mode,
+	 * i.e. on registration of the first mirror through
+	 * hmm_mirror_register().
+	 */
+	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
+	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
+		kfree(hmm);
+		return NULL;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	if (!mm->hmm)
+		mm->hmm = hmm;
+	else
+		cleanup = true;
+	spin_unlock(&mm->page_table_lock);
+
+	if (cleanup) {
+		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
+		kfree(hmm);
+	}
+
 	return mm->hmm;
 }

@@ -72,3 +104,94 @@ void hmm_mm_destroy(struct mm_struct *mm)
 {
 	kfree(mm->hmm);
 }
 #endif /* CONFIG_HMM */
+
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+static void hmm_invalidate_range(struct hmm *hmm,
+				 enum hmm_update_type action,
+				 unsigned long start,
+				 unsigned long end)
+{
+	struct hmm_mirror *mirror;
+
+	down_read(&hmm->mirrors_sem);
+	list_for_each_entry(mirror, &hmm->mirrors, list)
+		mirror->ops->sync_cpu_device_pagetables(mirror, action,
+							start, end);
+	up_read(&hmm->mirrors_sem);
+}
+
+static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start,
+				       unsigned long end)
+{
+	struct hmm *hmm = mm->hmm;
+
+	VM_BUG_ON(!hmm);
+
+	atomic_inc(&hmm->sequence);
+}
+
+static void hmm_invalidate_range_end(struct mmu_notifier *mn,
+				     struct mm_struct *mm,
+				     unsigned long start,
+				     unsigned long end)
+{
+	struct hmm *hmm = mm->hmm;
+
+	VM_BUG_ON(!hmm);
+
+	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+}
+
+static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
+	.invalidate_range_start	= hmm_invalidate_range_start,
+	.invalidate_range_end	= hmm_invalidate_range_end,
+};
+
+/*
+ * hmm_mirror_register() - register a mirror against an mm
+ *
@mirror: new mirror struct to register
+ * @mm: mm to register against
+ *
+ * To start mirroring a process address space, the device driver must register
+ * an HMM mirror struct.
+ *
+ * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
+ */
+int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
+{
+	/* Sanity check */
+	if (!mm || !mirror || !mirror->ops)
+		return -EINVAL;
+
+	mirror->hmm = hmm_register(mm);
+	if (!mirror->hmm)
+		return -ENOMEM;
+
+	down_write(&mirror->hmm->mirrors_sem);
+	list_add(&mirror->list, &mirror->hmm->mirrors);
+	up_write(&mirror->hmm->mirrors_sem);
+
+	return 0;
+}
+EXPORT_SYMBOL(hmm_mirror_register);
+
+/*
+ * hmm_mirror_unregister() - unregister a mirror
+ *
+ * @mirror: mirror struct to unregister
+ *
+ * Stop mirroring a process address space, and clean up.
+ */
+void hmm_mirror_unregister(struct hmm_mirror *mirror)
+{
+	struct hmm *hmm = mirror->hmm;
+
+	down_write(&hmm->mirrors_sem);
+	list_del(&mirror->list);
+	up_write(&hmm->mirrors_sem);
+}
+EXPORT_SYMBOL(hmm_mirror_unregister);
+#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-- cgit v1.2.3

From da4c3c735ea4dcc2a0b0ff0bd4803c336361b6f5 Mon Sep 17 00:00:00 2001
From: Jérôme Glisse
Date: Fri, 8 Sep 2017 16:11:31 -0700
Subject: mm/hmm/mirror: helper to snapshot CPU page table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This does not use the existing page table walker because we want to share
the same code with our page fault handler.

Link: http://lkml.kernel.org/r/20170817000548.32038-5-jglisse@redhat.com
Signed-off-by: Jérôme Glisse
Signed-off-by: Evgeny Baskakov
Signed-off-by: John Hubbard
Signed-off-by: Mark Hairgrove
Signed-off-by: Sherry Cheung
Signed-off-by: Subhash Gutti
Cc: Aneesh Kumar
Cc: Balbir Singh
Cc: Benjamin Herrenschmidt
Cc: Dan Williams
Cc: David Nellans
Cc: Johannes Weiner
Cc: Kirill A. Shutemov
Cc: Michal Hocko
Cc: Paul E. McKenney
Cc: Ross Zwisler
Cc: Vladimir Davydov
Cc: Bob Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hmm.h |  55 +++++++++-
 mm/hmm.c            | 285 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 338 insertions(+), 2 deletions(-)
(limited to 'include')

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 76310726ee9c..62899c9829c9 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -79,13 +79,26 @@ struct hmm;
  *
  * Flags:
  * HMM_PFN_VALID: pfn is valid
+ * HMM_PFN_READ: CPU page table has read permission set
  * HMM_PFN_WRITE: CPU page table has write permission set
+ * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
+ * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
+ * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
+ *   result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ *   be mirrored by a device, because the entry will never have HMM_PFN_VALID
+ *   set and the pfn value is undefined.
+ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
  */
 typedef unsigned long hmm_pfn_t;

 #define HMM_PFN_VALID (1 << 0)
-#define HMM_PFN_WRITE (1 << 1)
-#define HMM_PFN_SHIFT 2
+#define HMM_PFN_READ (1 << 1)
+#define HMM_PFN_WRITE (1 << 2)
+#define HMM_PFN_ERROR (1 << 3)
+#define HMM_PFN_EMPTY (1 << 4)
+#define HMM_PFN_SPECIAL (1 << 5)
+#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
+#define HMM_PFN_SHIFT 7

 /*
  * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
@@ -241,6 +254,44 @@ struct hmm_mirror {
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+
+
+/*
+ * struct hmm_range - track invalidation lock on virtual address range
+ *
+ * @list: all range locks are on a list
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of pfns (big enough for the range)
+ * @valid: pfns array did not change since it has been filled by an HMM
+ *   function
+ */
+struct hmm_range {
+	struct list_head list;
+	unsigned long start;
+	unsigned long end;
+	hmm_pfn_t *pfns;
+	bool valid;
+};
+
+/*
+ * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
+ * driver lock that serializes device page table updates, then call
+ * hmm_vma_range_done(), to check if the snapshot is still valid. The same
+ * device driver page table update lock must also be used in the
+ * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
+ * table invalidation serializes on it.
+ *
+ * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
+ * hmm_vma_get_pfns() WITHOUT ERROR !
+ *
+ * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
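A sketch of that serialization rule in driver code; device_lock stands in for the driver's page table update lock (the same lock the sync_cpu_device_pagetables() callback must take), and the my_* helpers are hypothetical:

static DEFINE_MUTEX(device_lock);	/* also taken by the sync callback */

/* hypothetical driver internals, assumed to exist elsewhere */
void my_build_device_update(hmm_pfn_t *pfns);
void my_commit_device_update(hmm_pfn_t *pfns);

static int my_snapshot_and_commit(struct vm_area_struct *vma,
				  struct hmm_range *range,
				  unsigned long start, unsigned long end,
				  hmm_pfn_t *pfns)
{
	int ret;

again:
	ret = hmm_vma_get_pfns(vma, range, start, end, pfns);
	if (ret)
		return ret;

	my_build_device_update(pfns);

	mutex_lock(&device_lock);
	if (!hmm_vma_range_done(vma, range)) {
		/* CPU page table changed under us, snapshot is stale */
		mutex_unlock(&device_lock);
		goto again;
	}
	my_commit_device_update(pfns);
	mutex_unlock(&device_lock);
	return 0;
}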
+ */ +int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns); +bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/hmm.c b/mm/hmm.c index d37daf9edcd3..172984848d51 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -19,8 +19,12 @@ */ #include #include +#include +#include #include #include +#include +#include #include @@ -31,14 +35,18 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * struct hmm - HMM per mm struct * * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list * @sequence: we track updates to the CPU page table with a sequence number + * @ranges: list of range being snapshotted * @mirrors: list of mirrors for this mm * @mmu_notifier: mmu notifier to track updates to CPU page table * @mirrors_sem: read/write semaphore protecting the mirrors list */ struct hmm { struct mm_struct *mm; + spinlock_t lock; atomic_t sequence; + struct list_head ranges; struct list_head mirrors; struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; @@ -72,6 +80,8 @@ static struct hmm *hmm_register(struct mm_struct *mm) init_rwsem(&hmm->mirrors_sem); atomic_set(&hmm->sequence, 0); hmm->mmu_notifier.ops = NULL; + INIT_LIST_HEAD(&hmm->ranges); + spin_lock_init(&hmm->lock); hmm->mm = mm; /* @@ -112,6 +122,22 @@ static void hmm_invalidate_range(struct hmm *hmm, unsigned long end) { struct hmm_mirror *mirror; + struct hmm_range *range; + + spin_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { + unsigned long addr, idx, npages; + + if (end < range->start || start >= range->end) + continue; + + range->valid = false; + addr = max(start, range->start); + idx = (addr - range->start) >> PAGE_SHIFT; + npages = (min(range->end, end) - addr) >> PAGE_SHIFT; + memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); + } + spin_unlock(&hmm->lock); down_read(&hmm->mirrors_sem); list_for_each_entry(mirror, &hmm->mirrors, list) @@ -194,4 +220,263 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) up_write(&hmm->mirrors_sem); } EXPORT_SYMBOL(hmm_mirror_unregister); + +static void hmm_pfns_special(hmm_pfn_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = HMM_PFN_SPECIAL; +} + +static int hmm_pfns_bad(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = HMM_PFN_ERROR; + + return 0; +} + +static int hmm_vma_walk_hole(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = HMM_PFN_EMPTY; + + return 0; +} + +static int hmm_vma_walk_clear(unsigned long addr, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + hmm_pfn_t *pfns = range->pfns; + unsigned long i; + + i = (addr - range->start) >> PAGE_SHIFT; + for (; addr < end; addr += PAGE_SIZE, i++) + pfns[i] = 0; + + return 0; +} + +static int hmm_vma_walk_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_range *range = walk->private; + struct vm_area_struct *vma = walk->vma; + 
hmm_pfn_t *pfns = range->pfns; + unsigned long addr = start, i; + hmm_pfn_t flag; + pte_t *ptep; + + i = (addr - range->start) >> PAGE_SHIFT; + flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; + +again: + if (pmd_none(*pmdp)) + return hmm_vma_walk_hole(start, end, walk); + + if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + return hmm_pfns_bad(start, end, walk); + + if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { + unsigned long pfn; + pmd_t pmd; + + /* + * No need to take pmd_lock here, even if some other threads + * is splitting the huge pmd we will get that event through + * mmu_notifier callback. + * + * So just read pmd value and check again its a transparent + * huge or device mapping one and compute corresponding pfn + * values. + */ + pmd = pmd_read_atomic(pmdp); + barrier(); + if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + goto again; + if (pmd_protnone(pmd)) + return hmm_vma_walk_clear(start, end, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; + return 0; + } + + if (pmd_bad(*pmdp)) + return hmm_pfns_bad(start, end, walk); + + ptep = pte_offset_map(pmdp, addr); + for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + pte_t pte = *ptep; + + pfns[i] = 0; + + if (pte_none(pte) || !pte_present(pte)) { + pfns[i] = HMM_PFN_EMPTY; + continue; + } + + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; + pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + } + pte_unmap(ptep - 1); + + return 0; +} + +/* + * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses + * @vma: virtual memory area containing the virtual address range + * @range: used to track snapshot validity + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @entries: array of hmm_pfn_t: provided by the caller, filled in by function + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See hmm_vma_range_done() for further + * information. + * + * The range struct is initialized here. It tracks the CPU page table, but only + * if the function returns success (0), in which case the caller must then call + * hmm_vma_range_done() to stop CPU page table update tracking on this range. + * + * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS + * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! + */ +int hmm_vma_get_pfns(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns) +{ + struct mm_walk mm_walk; + struct hmm *hmm; + + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(pfns, start, end); + return -EINVAL; + } + + /* Sanity check, this really should not happen ! */ + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + hmm = hmm_register(vma->vm_mm); + if (!hmm) + return -ENOMEM; + /* Caller must have registered a mirror, via hmm_mirror_register() ! 
*/ + if (!hmm->mmu_notifier.ops) + return -EINVAL; + + /* Initialize range to track CPU page table update */ + range->start = start; + range->pfns = pfns; + range->end = end; + spin_lock(&hmm->lock); + range->valid = true; + list_add_rcu(&range->list, &hmm->ranges); + spin_unlock(&hmm->lock); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.private = range; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + walk_page_range(start, end, &mm_walk); + + return 0; +} +EXPORT_SYMBOL(hmm_vma_get_pfns); + +/* + * hmm_vma_range_done() - stop tracking change to CPU page table over a range + * @vma: virtual memory area containing the virtual address range + * @range: range being tracked + * Returns: false if range data has been invalidated, true otherwise + * + * Range struct is used to track updates to the CPU page table after a call to + * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done + * using the data, or wants to lock updates to the data it got from those + * functions, it must call the hmm_vma_range_done() function, which will then + * stop tracking CPU page table updates. + * + * Note that device driver must still implement general CPU page table update + * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using + * the mmu_notifier API directly. + * + * CPU page table update tracking done through hmm_range is only temporary and + * to be used while trying to duplicate CPU page table contents for a range of + * virtual addresses. + * + * There are two ways to use this : + * again: + * hmm_vma_get_pfns(vma, range, start, end, pfns); + * trans = device_build_page_table_update_transaction(pfns); + * device_page_table_lock(); + * if (!hmm_vma_range_done(vma, range)) { + * device_page_table_unlock(); + * goto again; + * } + * device_commit_transaction(trans); + * device_page_table_unlock(); + * + * Or: + * hmm_vma_get_pfns(vma, range, start, end, pfns); + * device_page_table_lock(); + * hmm_vma_range_done(vma, range); + * device_update_page_table(pfns); + * device_page_table_unlock(); + */ +bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +{ + unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; + struct hmm *hmm; + + if (range->end <= range->start) { + BUG(); + return false; + } + + hmm = hmm_register(vma->vm_mm); + if (!hmm) { + memset(range->pfns, 0, sizeof(*range->pfns) * npages); + return false; + } + + spin_lock(&hmm->lock); + list_del_rcu(&range->list); + spin_unlock(&hmm->lock); + + return range->valid; +} +EXPORT_SYMBOL(hmm_vma_range_done); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3 From 74eee180b935fcb9b83a56dd7648fb75caf38f0e Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:35 -0700 Subject: mm/hmm/mirror: device page fault handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handles page fault on behalf of device driver, unlike handle_mm_fault() it does not trigger migration back to system memory for device memory. 
Link: http://lkml.kernel.org/r/20170817000548.32038-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 27 ++++++ mm/hmm.c | 256 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 271 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 62899c9829c9..61a6535fe438 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -292,6 +292,33 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, unsigned long end, hmm_pfn_t *pfns); bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range); + + +/* + * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will + * not migrate any device memory back to system memory. The hmm_pfn_t array will + * be updated with the fault result and current snapshot of the CPU page table + * for the range. + * + * The mmap_sem must be taken in read mode before entering and it might be + * dropped by the function if the block argument is false. In that case, the + * function returns -EAGAIN. + * + * Return value does not reflect if the fault was successful for every single + * address or not. Therefore, the caller must to inspect the hmm_pfn_t array to + * determine fault status for each address. + * + * Trying to fault inside an invalid vma will result in -EINVAL. + * + * See the function description in mm/hmm.c for further documentation. + */ +int hmm_vma_fault(struct vm_area_struct *vma, + struct hmm_range *range, + unsigned long start, + unsigned long end, + hmm_pfn_t *pfns, + bool write, + bool block); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/hmm.c b/mm/hmm.c index 172984848d51..f6c745b9a25a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -221,6 +221,36 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) } EXPORT_SYMBOL(hmm_mirror_unregister); +struct hmm_vma_walk { + struct hmm_range *range; + unsigned long last; + bool fault; + bool block; + bool write; +}; + +static int hmm_vma_do_fault(struct mm_walk *walk, + unsigned long addr, + hmm_pfn_t *pfn) +{ + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct vm_area_struct *vma = walk->vma; + int r; + + flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; + flags |= hmm_vma_walk->write ? 
FAULT_FLAG_WRITE : 0;
+	r = handle_mm_fault(vma, addr, flags);
+	if (r & VM_FAULT_RETRY)
+		return -EBUSY;
+	if (r & VM_FAULT_ERROR) {
+		*pfn = HMM_PFN_ERROR;
+		return -EFAULT;
+	}
+
+	return -EAGAIN;
+}
+
 static void hmm_pfns_special(hmm_pfn_t *pfns,
 			     unsigned long addr,
 			     unsigned long end)
@@ -244,34 +274,62 @@ static int hmm_pfns_bad(unsigned long addr,
 	return 0;
 }

+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+			   unsigned long addr,
+			   unsigned long end)
+{
+	for (; addr < end; addr += PAGE_SIZE, pfns++)
+		*pfns = 0;
+}
+
 static int hmm_vma_walk_hole(unsigned long addr,
 			     unsigned long end,
 			     struct mm_walk *walk)
 {
-	struct hmm_range *range = walk->private;
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
 	hmm_pfn_t *pfns = range->pfns;
 	unsigned long i;

+	hmm_vma_walk->last = addr;
 	i = (addr - range->start) >> PAGE_SHIFT;
-	for (; addr < end; addr += PAGE_SIZE, i++)
+	for (; addr < end; addr += PAGE_SIZE, i++) {
 		pfns[i] = HMM_PFN_EMPTY;
+		if (hmm_vma_walk->fault) {
+			int ret;

-	return 0;
+			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+			if (ret != -EAGAIN)
+				return ret;
+		}
+	}
+
+	return hmm_vma_walk->fault ? -EAGAIN : 0;
 }

 static int hmm_vma_walk_clear(unsigned long addr,
 			      unsigned long end,
 			      struct mm_walk *walk)
 {
-	struct hmm_range *range = walk->private;
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
 	hmm_pfn_t *pfns = range->pfns;
 	unsigned long i;

+	hmm_vma_walk->last = addr;
 	i = (addr - range->start) >> PAGE_SHIFT;
-	for (; addr < end; addr += PAGE_SIZE, i++)
+	for (; addr < end; addr += PAGE_SIZE, i++) {
 		pfns[i] = 0;
+		if (hmm_vma_walk->fault) {
+			int ret;

-	return 0;
+			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+			if (ret != -EAGAIN)
+				return ret;
+		}
+	}
+
+	return hmm_vma_walk->fault ? -EAGAIN : 0;
 }

 static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -279,15 +337,18 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			    unsigned long end,
 			    struct mm_walk *walk)
 {
-	struct hmm_range *range = walk->private;
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
 	struct vm_area_struct *vma = walk->vma;
 	hmm_pfn_t *pfns = range->pfns;
 	unsigned long addr = start, i;
+	bool write_fault;
 	hmm_pfn_t flag;
 	pte_t *ptep;

 	i = (addr - range->start) >> PAGE_SHIFT;
 	flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
+	write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;

 again:
 	if (pmd_none(*pmdp))
@@ -316,6 +377,9 @@ again:
 		if (pmd_protnone(pmd))
 			return hmm_vma_walk_clear(start, end, walk);

+		if (write_fault && !pmd_write(pmd))
+			return hmm_vma_walk_clear(start, end, walk);
+
 		pfn = pmd_pfn(pmd) + pte_index(addr);
 		flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
 		for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
@@ -332,13 +396,55 @@ again:

 		pfns[i] = 0;

-		if (pte_none(pte) || !pte_present(pte)) {
+		if (pte_none(pte)) {
 			pfns[i] = HMM_PFN_EMPTY;
+			if (hmm_vma_walk->fault)
+				goto fault;
 			continue;
 		}

+		if (!pte_present(pte)) {
+			swp_entry_t entry = pte_to_swp_entry(pte);
+
+			if (!non_swap_entry(entry)) {
+				if (hmm_vma_walk->fault)
+					goto fault;
+				continue;
+			}
+
+			/*
+			 * This is a special swap entry, ignore migration, use
+			 * device and report anything else as error.
+ */ + if (is_migration_entry(entry)) { + if (hmm_vma_walk->fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + continue; + } else { + /* Report error for everything else */ + pfns[i] = HMM_PFN_ERROR; + } + continue; + } + + if (write_fault && !pte_write(pte)) + goto fault; + pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; + continue; + +fault: + pte_unmap(ptep); + /* Fault all pages in range */ + return hmm_vma_walk_clear(start, end, walk); } pte_unmap(ptep - 1); @@ -371,6 +477,7 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, unsigned long end, hmm_pfn_t *pfns) { + struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; @@ -402,9 +509,12 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, list_add_rcu(&range->list, &hmm->ranges); spin_unlock(&hmm->lock); + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + mm_walk.vma = vma; mm_walk.mm = vma->vm_mm; - mm_walk.private = range; mm_walk.pte_entry = NULL; mm_walk.test_walk = NULL; mm_walk.hugetlb_entry = NULL; @@ -412,7 +522,6 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; walk_page_range(start, end, &mm_walk); - return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); @@ -439,7 +548,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); * if (!hmm_vma_range_done(vma, range)) { @@ -450,7 +559,7 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); + * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); * device_page_table_lock(); * hmm_vma_range_done(vma, range); * device_update_page_table(pfns); @@ -479,4 +588,127 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) return range->valid; } EXPORT_SYMBOL(hmm_vma_range_done); + +/* + * hmm_vma_fault() - try to fault some address in a virtual address range + * @vma: virtual memory area containing the virtual address range + * @range: use to track pfns array content validity + * @start: fault range virtual start address (inclusive) + * @end: fault range virtual end address (exclusive) + * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted + * @write: is it a write fault + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * + * This is similar to a regular CPU page fault except that it will not trigger + * any memory migration if the memory being faulted is not accessible by CPUs. + * + * On error, for one virtual address in the range, the function will set the + * hmm_pfn_t error flag for the corresponding pfn entry. 
+ *
+ * Expected use pattern:
+ *   retry:
+ *     down_read(&mm->mmap_sem);
+ *     // Find the vma and address the device wants to fault, and initialize
+ *     // the hmm_pfn_t array accordingly
+ *     ret = hmm_vma_fault(vma, range, start, end, pfns, write, block);
+ *     switch (ret) {
+ *     case -EAGAIN:
+ *         hmm_vma_range_done(vma, range);
+ *         // You might want to rate limit or yield to play nicely; you may
+ *         // also commit any valid pfn in the array, assuming you got true
+ *         // back from hmm_vma_range_done()
+ *         goto retry;
+ *     case 0:
+ *         break;
+ *     default:
+ *         // Handle error !
+ *         up_read(&mm->mmap_sem);
+ *         return;
+ *     }
+ *     // Take the device driver lock that serializes device page table updates
+ *     driver_lock_device_page_table_update();
+ *     hmm_vma_range_done(vma, range);
+ *     // Commit pfns we got from hmm_vma_fault()
+ *     driver_unlock_device_page_table_update();
+ *     up_read(&mm->mmap_sem);
+ *
+ * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
+ * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
+ *
+ * YOU HAVE BEEN WARNED !
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+		  struct hmm_range *range,
+		  unsigned long start,
+		  unsigned long end,
+		  hmm_pfn_t *pfns,
+		  bool write,
+		  bool block)
+{
+	struct hmm_vma_walk hmm_vma_walk;
+	struct mm_walk mm_walk;
+	struct hmm *hmm;
+	int ret;
+
+	/* Sanity check, this really should not happen ! */
+	if (start < vma->vm_start || start >= vma->vm_end)
+		return -EINVAL;
+	if (end < vma->vm_start || end > vma->vm_end)
+		return -EINVAL;
+
+	hmm = hmm_register(vma->vm_mm);
+	if (!hmm) {
+		hmm_pfns_clear(pfns, start, end);
+		return -ENOMEM;
+	}
+	/* Caller must have registered a mirror using hmm_mirror_register() */
+	if (!hmm->mmu_notifier.ops)
+		return -EINVAL;
+
+	/* Initialize range to track CPU page table update */
+	range->start = start;
+	range->pfns = pfns;
+	range->end = end;
+	spin_lock(&hmm->lock);
+	range->valid = true;
+	list_add_rcu(&range->list, &hmm->ranges);
+	spin_unlock(&hmm->lock);
+
+	/* FIXME support hugetlb fs */
+	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
+		hmm_pfns_special(pfns, start, end);
+		return 0;
+	}
+
+	hmm_vma_walk.fault = true;
+	hmm_vma_walk.write = write;
+	hmm_vma_walk.block = block;
+	hmm_vma_walk.range = range;
+	mm_walk.private = &hmm_vma_walk;
+	hmm_vma_walk.last = range->start;
+
+	mm_walk.vma = vma;
+	mm_walk.mm = vma->vm_mm;
+	mm_walk.pte_entry = NULL;
+	mm_walk.test_walk = NULL;
+	mm_walk.hugetlb_entry = NULL;
+	mm_walk.pmd_entry = hmm_vma_walk_pmd;
+	mm_walk.pte_hole = hmm_vma_walk_hole;
+
+	do {
+		ret = walk_page_range(start, end, &mm_walk);
+		start = hmm_vma_walk.last;
+	} while (ret == -EAGAIN);
+
+	if (ret) {
+		unsigned long i;
+
+		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+		hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
+		hmm_vma_range_done(vma, range);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(hmm_vma_fault);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-- cgit v1.2.3

From 3072e413e305e353cd4654f8a57d953b66e85bf3 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Fri, 8 Sep 2017 16:11:39 -0700
Subject: mm/memory_hotplug: introduce add_pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are new users of memory hotplug emerging. Some of them require a
different subset of arch_add_memory. There are some which only require
allocation of struct pages without mapping those pages to the kernel
address space. We currently have __add_pages for that purpose.
But this is rather lowlevel and not very suitable for the code outside of the memory hotplug. E.g. x86_64 wants to update max_pfn which should be done by the caller. Introduce add_pages() which should care about those details if they are needed. Each architecture should define its implementation and select CONFIG_ARCH_HAS_ADD_PAGES. All others use the currently existing __add_pages. Link: http://lkml.kernel.org/r/20170817000548.32038-7-jglisse@redhat.com Signed-off-by: Michal Hocko Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Aneesh Kumar Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 4 ++++ arch/x86/mm/init_64.c | 22 +++++++++++++++------- include/linux/memory_hotplug.h | 11 +++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07d9b6c75328..a3e6e6136a47 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2323,6 +2323,10 @@ source "kernel/livepatch/Kconfig" endmenu +config ARCH_HAS_ADD_PAGES + def_bool y + depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 136422d7d539..048fbe8fc274 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -761,7 +761,7 @@ void __init paging_init(void) * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need * updating. */ -static void update_end_of_memory_vars(u64 start, u64 size) +static void update_end_of_memory_vars(u64 start, u64 size) { unsigned long end_pfn = PFN_UP(start + size); @@ -772,22 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start, size); + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); return ret; } + +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + init_memory_mapping(start, start + size); + + return add_pages(nid, start_pfn, nr_pages, want_memblock); +} EXPORT_SYMBOL_GPL(arch_add_memory); #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5e6e4cc36ff4..0995e1a2b458 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -133,6 +133,17 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); +#ifndef CONFIG_ARCH_HAS_ADD_PAGES +static inline int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock) +{ + return __add_pages(nid, start_pfn, nr_pages, want_memblock); +} +#else /* 
ARCH_HAS_ADD_PAGES */
+int add_pages(int nid, unsigned long start_pfn,
+	      unsigned long nr_pages, bool want_memblock);
+#endif /* ARCH_HAS_ADD_PAGES */
+
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
 #else
-- cgit v1.2.3

From 5042db43cc26f51eed51c56192e2c2317e44315f Mon Sep 17 00:00:00 2001
From: Jérôme Glisse
Date: Fri, 8 Sep 2017 16:11:43 -0700
Subject: mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HMM (heterogeneous memory management) needs struct page to support
migration from system main memory to device memory. The reasons for HMM,
and for migration to device memory, are explained in the HMM core patch.

This patch deals with device memory that is unaddressable (i.e., the CPU
cannot access it). Hence we do not want those struct pages to be managed
like regular memory. That is why we extend ZONE_DEVICE to support
different types of memory.

A persistent memory type is defined for the existing users of ZONE_DEVICE,
and a new device unaddressable type is added for the unaddressable memory
type. There is a clear separation between what is expected from each
memory type; existing users of ZONE_DEVICE are unaffected by the new
requirements and by the new use of the unaddressable type. All
type-specific code paths are protected by a test against the memory type.

Because the memory is unaddressable, we use a new special swap type for
when a page is migrated to device memory (this reduces the maximum number
of swap files).

The two main additions to ZONE_DEVICE, besides the memory type, are two
callbacks. The first one, page_free(), is called whenever the page
refcount reaches 1 (which means the page is free, as a ZONE_DEVICE page
never reaches a refcount of 0). This allows the device driver to manage
its memory and the associated struct pages.

The second callback, page_fault(), happens when there is a CPU access to
an address that is backed by a device page (which is unaddressable by the
CPU). This callback is responsible for migrating the page back to system
main memory. The device driver cannot block migration back to system
memory; HMM makes sure that such pages cannot be pinned in device memory.

If the device is in some error condition and cannot migrate memory back,
then a CPU page fault to device memory should end with SIGBUS.

[arnd@arndb.de: fix warning]
Link: http://lkml.kernel.org/r/20170823133213.712917-1-arnd@arndb.de
Link: http://lkml.kernel.org/r/20170817000548.32038-8-jglisse@redhat.com
Signed-off-by: Jérôme Glisse
Signed-off-by: Arnd Bergmann
Acked-by: Dan Williams
Cc: Ross Zwisler
Cc: Aneesh Kumar
Cc: Balbir Singh
Cc: Benjamin Herrenschmidt
Cc: David Nellans
Cc: Evgeny Baskakov
Cc: Johannes Weiner
Cc: John Hubbard
Cc: Kirill A. Shutemov
Cc: Mark Hairgrove
Cc: Michal Hocko
Cc: Paul E.
McKenney Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 +++++ include/linux/ioport.h | 1 + include/linux/memremap.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 12 ++++++++ include/linux/swap.h | 24 ++++++++++++++-- include/linux/swapops.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ kernel/memremap.c | 34 ++++++++++++++++++++++ mm/Kconfig | 11 +++++++- mm/memory.c | 61 ++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 10 +++++-- mm/mprotect.c | 14 ++++++++++ 11 files changed, 309 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4b21c4e51ce4..90ab657f8e56 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -549,6 +549,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = find_get_entry(vma->vm_file->f_mapping, @@ -713,6 +715,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } if (page) { int mapcount = page_mapcount(page); @@ -1276,6 +1280,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); + + if (is_device_private_entry(entry)) + page = device_private_entry_to_page(entry); } if (page && !PageAnon(page)) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6230064d7f95..3a4f69137bc2 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -130,6 +130,7 @@ enum { IORES_DESC_ACPI_NV_STORAGE = 3, IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, + IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, }; /* helpers to define resources */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 93416196ba64..8e164ec9eed0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,6 +4,8 @@ #include #include +#include + struct resource; struct device; @@ -35,18 +37,89 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) } #endif +/* + * Specialize ZONE_DEVICE memory into multiple types each having differents + * usage. + * + * MEMORY_DEVICE_HOST: + * Persistent device memory (pmem): struct page might be allocated in different + * memory and architecture might want to perform special actions. It is similar + * to regular memory, in that the CPU can access it transparently. However, + * it is likely to have different bandwidth and latency than regular memory. + * See Documentation/nvdimm/nvdimm.txt for more information. + * + * MEMORY_DEVICE_PRIVATE: + * Device memory that is not directly addressable by the CPU: CPU can neither + * read nor write private memory. In this case, we do still have struct pages + * backing the device memory. Doing so simplifies the implementation, but it is + * important to remember that there are certain points at which the struct page + * must be treated as an opaque object, rather than a "normal" struct page. 
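As a minimal illustration of how the type field is meant to be consumed at run time; the test below mirrors the is_device_private_page() helper this commit adds to include/linux/mm.h, while the surrounding function name is hypothetical:

static bool my_cpu_cannot_touch(struct page *page)
{
	/* MEMORY_DEVICE_PRIVATE pages must never be accessed by the CPU */
	return is_zone_device_page(page) &&
	       page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}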
+ * + * A more complete discussion of unaddressable memory may be found in + * include/linux/hmm.h and Documentation/vm/hmm.txt. + */ +enum memory_type { + MEMORY_DEVICE_HOST = 0, + MEMORY_DEVICE_PRIVATE, +}; + +/* + * For MEMORY_DEVICE_PRIVATE we use ZONE_DEVICE and extend it with two + * callbacks: + * page_fault() + * page_free() + * + * Additional notes about MEMORY_DEVICE_PRIVATE may be found in + * include/linux/hmm.h and Documentation/vm/hmm.txt. There is also a brief + * explanation in include/linux/memory_hotplug.h. + * + * The page_fault() callback must migrate page back, from device memory to + * system memory, so that the CPU can access it. This might fail for various + * reasons (device issues, device have been unplugged, ...). When such error + * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and + * set the CPU page table entry to "poisoned". + * + * Note that because memory cgroup charges are transferred to the device memory, + * this should never fail due to memory restrictions. However, allocation + * of a regular system page might still fail because we are out of memory. If + * that happens, the page_fault() callback must return VM_FAULT_OOM. + * + * The page_fault() callback can also try to migrate back multiple pages in one + * chunk, as an optimization. It must, however, prioritize the faulting address + * over all the others. + * + * + * The page_free() callback is called once the page refcount reaches 1 + * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. + * This allows the device driver to implement its own memory management.) + */ +typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp); +typedef void (*dev_page_free_t)(struct page *page, void *data); + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @page_fault: callback when CPU fault on an unaddressable device page + * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping * @dev: host device of the mapping for debug + * @data: private data pointer for page_free() + * @type: memory type: see MEMORY_* in memory_hotplug.h */ struct dev_pagemap { + dev_page_fault_t page_fault; + dev_page_free_t page_free; struct vmem_altmap *altmap; const struct resource *res; struct percpu_ref *ref; struct device *dev; + void *data; + enum memory_type type; }; #ifdef CONFIG_ZONE_DEVICE diff --git a/include/linux/mm.h b/include/linux/mm.h index 39db8e54c5d5..a74c4e954352 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,11 +792,23 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } + +static inline bool is_device_private_page(const struct page *page) +{ + /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */ + return ((page_zonenum(page) == ZONE_DEVICE) && + (page->pgmap->type == MEMORY_DEVICE_PRIVATE)); +} #else static inline bool is_zone_device_page(const struct page *page) { return false; } + +static inline bool is_device_private_page(const struct page *page) +{ + return false; +} #endif static inline void get_page(struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8bf3487fb204..8a807292037f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -50,6 +50,23 @@ 
static inline int current_is_kswapd(void) * actions on faults. */ +/* + * Unaddressable device memory support. See include/linux/hmm.h and + * Documentation/vm/hmm.txt. Short description is we need struct pages for + * device memory that is unaddressable (inaccessible) by CPU, so that we can + * migrate part of a process memory to device memory. + * + * When a page is migrated from CPU to device, we set the CPU page table entry + * to a special SWP_DEVICE_* entry. + */ +#ifdef CONFIG_DEVICE_PRIVATE +#define SWP_DEVICE_NUM 2 +#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM) +#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1) +#else +#define SWP_DEVICE_NUM 0 +#endif + /* * NUMA node memory migration support */ @@ -72,7 +89,8 @@ static inline int current_is_kswapd(void) #endif #define MAX_SWAPFILES \ - ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) + ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ + SWP_MIGRATION_NUM - SWP_HWPOISON_NUM) /* * Magic header for a swap area. The first part of the union is @@ -469,8 +487,8 @@ static inline void show_swap_cache_info(void) { } -#define free_swap_and_cache(swp) is_migration_entry(swp) -#define swapcache_prepare(swp) is_migration_entry(swp) +#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));}) +#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));}) static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 45b092aa6419..291c4b534658 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -100,6 +100,74 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); } +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +static inline swp_entry_t make_device_private_entry(struct page *page, bool write) +{ + return swp_entry(write ? 
SWP_DEVICE_WRITE : SWP_DEVICE_READ, + page_to_pfn(page)); +} + +static inline bool is_device_private_entry(swp_entry_t entry) +{ + int type = swp_type(entry); + return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; +} + +static inline void make_device_private_entry_read(swp_entry_t *entry) +{ + *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry)); +} + +static inline bool is_write_device_private_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); +} + +static inline struct page *device_private_entry_to_page(swp_entry_t entry) +{ + return pfn_to_page(swp_offset(entry)); +} + +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp); +#else /* CONFIG_DEVICE_PRIVATE */ +static inline swp_entry_t make_device_private_entry(struct page *page, bool write) +{ + return swp_entry(0, 0); +} + +static inline void make_device_private_entry_read(swp_entry_t *entry) +{ +} + +static inline bool is_device_private_entry(swp_entry_t entry) +{ + return false; +} + +static inline bool is_write_device_private_entry(swp_entry_t entry) +{ + return false; +} + +static inline struct page *device_private_entry_to_page(swp_entry_t entry) +{ + return NULL; +} + +static inline int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + return VM_FAULT_SIGBUS; +} +#endif /* CONFIG_DEVICE_PRIVATE */ + #ifdef CONFIG_MIGRATION static inline swp_entry_t make_migration_entry(struct page *page, int write) { diff --git a/kernel/memremap.c b/kernel/memremap.c index 066e73c2fcc9..f1d1e0dfe8b4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -219,6 +221,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ pgoff += 1UL << order, order = order_at((res), pgoff)) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + struct page *page = device_private_entry_to_page(entry); + + /* + * The page_fault() callback must migrate page back to system memory + * so that CPU can access it. This might fail for various reasons + * (device issue, device was unsafely unplugged, ...). When such + * error conditions happen, the callback must return VM_FAULT_SIGBUS. + * + * Note that because memory cgroup charges are accounted to the device + * memory, this should never fail because of memory restrictions (but + * allocation of regular system page might still fail because we are + * out of memory). 
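A sketch of a dev_pagemap page_fault() callback honoring the contract described here; the signature follows the dev_page_fault_t typedef above, my_driver_migrate_to_ram() is a hypothetical driver helper, and returning 0 on success is an assumption of this sketch:

/* hypothetical driver internal, assumed to exist elsewhere */
int my_driver_migrate_to_ram(struct vm_area_struct *vma, unsigned long addr,
			     const struct page *page, pmd_t *pmdp);

static int my_pgmap_fault(struct vm_area_struct *vma, unsigned long addr,
			  const struct page *page, unsigned int flags,
			  pmd_t *pmdp)
{
	/*
	 * Migrate at least the faulting page back to system memory. On
	 * failure (device gone, out of system memory handled separately)
	 * return VM_FAULT_SIGBUS so the CPU page table entry is poisoned,
	 * as required above.
	 */
	if (my_driver_migrate_to_ram(vma, addr, page, pmdp))
		return VM_FAULT_SIGBUS;
	return 0;
}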
+ * + * There is a more in-depth description of what that callback can and + * cannot do, in include/linux/memremap.h + */ + return page->pgmap->page_fault(vma, addr, page, flags, pmdp); +} +EXPORT_SYMBOL(device_private_entry_fault); +#endif /* CONFIG_DEVICE_PRIVATE */ + static void pgmap_radix_release(struct resource *res) { unsigned long pgoff, order; @@ -356,6 +386,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, } pgmap->ref = ref; pgmap->res = &page_map->res; + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_fault = NULL; + pgmap->page_free = NULL; + pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; diff --git a/mm/Kconfig b/mm/Kconfig index 254db99f263d..ec27855db133 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -676,7 +676,7 @@ config ARCH_HAS_ZONE_DEVICE bool config ZONE_DEVICE - bool "Device memory (pmem, etc...) hotplug support" + bool "Device memory (pmem, HMM, etc...) hotplug support" depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP @@ -717,6 +717,15 @@ config HMM_MIRROR page tables (at PAGE_SIZE granularity), and must be able to recover from the resulting potential page faults. +config DEVICE_PRIVATE + bool "Unaddressable device memory (GPU memory, ...)" + depends on ARCH_HAS_HMM + + help + Allows creation of struct pages to represent unaddressable device + memory; i.e., memory that is only accessible from the device (or + group of devices). You likely also want to select HMM_MIRROR. + config FRAME_VECTOR bool diff --git a/mm/memory.c b/mm/memory.c index 886033b95fd2..079eeac0b009 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -956,6 +957,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } + } else if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); + + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + page_dup_rmap(page, false); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). + */ + if (is_write_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + make_device_private_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1274,6 +1304,29 @@ again: } continue; } + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry) && is_device_private_entry(entry)) { + struct page *page = device_private_entry_to_page(entry); + + if (unlikely(details && details->check_mapping)) { + /* + * unmap_shared_mapping_pages() wants to + * invalidate cache without truncating: + * unmap shared but keep private pages. + */ + if (details->check_mapping != + page_rmapping(page)) + continue; + } + + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); + put_page(page); + continue; + } + /* If details->check_mapping, we leave swap entries. 
*/ if (unlikely(details)) continue; @@ -2776,6 +2829,14 @@ int do_swap_page(struct vm_fault *vmf) if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); + } else if (is_device_private_entry(entry)) { + /* + * For un-addressable device memory we call the pgmap + * fault handler callback. The callback must migrate + * the page back to some CPU accessible page. + */ + ret = device_private_entry_fault(vma, vmf->address, entry, + vmf->flags, vmf->pmd); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f92fb84770d..e882cb6da994 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -99,7 +99,7 @@ void mem_hotplug_done(void) /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { - struct resource *res; + struct resource *res, *conflict; res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); @@ -108,7 +108,13 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->start = start; res->end = start + size - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) { + conflict = request_resource_conflict(&iomem_resource, res); + if (conflict) { + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_debug("Device unaddressable memory block " + "memory hotplug at %#010llx !\n", + (unsigned long long)start); + } pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); return ERR_PTR(-EEXIST); diff --git a/mm/mprotect.c b/mm/mprotect.c index a1bfe9545770..6d3e2f082290 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -125,6 +125,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pages++; } + + if (is_write_device_private_entry(entry)) { + pte_t newpte; + + /* + * We do not preserve soft-dirtiness. See + * copy_one_pte() for explanation. + */ + make_device_private_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + set_pte_at(mm, addr, pte, newpte); + + pages++; + } } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From 7b2d55d2c8961ae9d456d3133f4ae2f0fbd3e14f Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:11:46 -0700 Subject: mm/ZONE_DEVICE: special case put_page() for device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A ZONE_DEVICE page that reach a refcount of 1 is free ie no longer have any user. For device private pages this is important to catch and thus we need to special case put_page() for this. Link: http://lkml.kernel.org/r/20170817000548.32038-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Kirill A. Shutemov Cc: Dan Williams Cc: Ross Zwisler Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 13 +++++++++++++ include/linux/mm.h | 31 ++++++++++++++++++++++--------- kernel/memremap.c | 25 ++++++++++++++++++++++++- mm/hmm.c | 8 ++++++++ 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8e164ec9eed0..8aa6b82679e2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -126,6 +126,14 @@ struct dev_pagemap { void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); + +static inline bool is_zone_device_page(const struct page *page); + +static inline bool is_device_private_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; +} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -144,6 +152,11 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { return NULL; } + +static inline bool is_device_private_page(const struct page *page) +{ + return false; +} #endif /** diff --git a/include/linux/mm.h b/include/linux/mm.h index a74c4e954352..eccdab4bb44a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -23,6 +23,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -792,25 +793,25 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } - -static inline bool is_device_private_page(const struct page *page) -{ - /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */ - return ((page_zonenum(page) == ZONE_DEVICE) && - (page->pgmap->type == MEMORY_DEVICE_PRIVATE)); -} #else static inline bool is_zone_device_page(const struct page *page) { return false; } +#endif -static inline bool is_device_private_page(const struct page *page) +#ifdef CONFIG_DEVICE_PRIVATE +void put_zone_device_private_page(struct page *page); +#else +static inline void put_zone_device_private_page(struct page *page) { - return false; } #endif +static inline bool is_device_private_page(const struct page *page); + +DECLARE_STATIC_KEY_FALSE(device_private_key); + static inline void get_page(struct page *page) { page = compound_head(page); @@ -826,6 +827,18 @@ static inline void put_page(struct page *page) { page = compound_head(page); + /* + * For private device pages we need to catch refcount transition from + * 2 to 1, when refcount reach one it means the private device page is + * free and we need to inform the device driver through callback. See + * include/linux/memremap.h and HMM for details. + */ + if (static_branch_unlikely(&device_private_key) && + unlikely(is_device_private_page(page))) { + put_zone_device_private_page(page); + return; + } + if (put_page_testzero(page)) __put_page(page); } diff --git a/kernel/memremap.c b/kernel/memremap.c index f1d1e0dfe8b4..1403cf16fa61 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,7 +11,6 @@ * General Public License for more details. */ #include -#include #include #include #include @@ -500,3 +499,27 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? 
pgmap->altmap : NULL;
 }
 #endif /* CONFIG_ZONE_DEVICE */
+
+
+#ifdef CONFIG_DEVICE_PRIVATE
+void put_zone_device_private_page(struct page *page)
+{
+	int count = page_ref_dec_return(page);
+
+	/*
+	 * If refcount is 1 then page is freed and refcount is stable as
+	 * nobody holds a reference on the page.
+	 */
+	if (count == 1) {
+		/* Clear Active bit in case of parallel mark_page_accessed */
+		__ClearPageActive(page);
+		__ClearPageWaiters(page);
+
+		page->mapping = NULL;
+
+		page->pgmap->page_free(page, page->pgmap->data);
+	} else if (!count)
+		__put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_page);
+#endif /* CONFIG_DEVICE_PRIVATE */
diff --git a/mm/hmm.c b/mm/hmm.c
index f6c745b9a25a..3c6265d4254b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -25,9 +25,17 @@
 #include
 #include
 #include
+#include
 #include

+/*
+ * Device private memory: see HMM (Documentation/vm/hmm.txt) or hmm.h
+ */
+DEFINE_STATIC_KEY_FALSE(device_private_key);
+EXPORT_SYMBOL(device_private_key);
+
+
 #ifdef CONFIG_HMM
 static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-- cgit v1.2.3

From 4ef589dc9b10cdcae75a2b2b0e9b2c5e8a92c378 Mon Sep 17 00:00:00 2001
From: Jérôme Glisse
Date: Fri, 8 Sep 2017 16:11:58 -0700
Subject: mm/hmm/devmem: device memory hotplug using ZONE_DEVICE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This introduces a simple struct and associated helpers for device drivers
to use when hotplugging unaddressable device memory as ZONE_DEVICE. It
will find an unused physical address range and trigger memory hotplug for
it, which allocates and initializes struct pages for the device memory.

Device drivers should use this helper during device initialization to
hotplug the device memory. The memory should only need to be removed once
the device is going offline (shutdown or hotremove). There should not be
any userspace API to hotplug memory, except maybe for a host device driver
to allow adding more memory to a guest device driver.

The device's memory is managed by the device driver, and HMM only provides
helpers to that effect.

Link: http://lkml.kernel.org/r/20170817000548.32038-12-jglisse@redhat.com
Signed-off-by: Jérôme Glisse
Signed-off-by: Evgeny Baskakov
Signed-off-by: John Hubbard
Signed-off-by: Mark Hairgrove
Signed-off-by: Sherry Cheung
Signed-off-by: Subhash Gutti
Signed-off-by: Balbir Singh
Cc: Aneesh Kumar
Cc: Benjamin Herrenschmidt
Cc: Dan Williams
Cc: David Nellans
Cc: Johannes Weiner
Cc: Kirill A. Shutemov
Cc: Michal Hocko
Cc: Paul E. McKenney
McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 155 +++++++++++++++ mm/hmm.c | 379 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 533 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 61a6535fe438..16f916b437cc 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -72,6 +72,11 @@ #if IS_ENABLED(CONFIG_HMM) +#include +#include +#include + + struct hmm; /* @@ -322,6 +327,156 @@ int hmm_vma_fault(struct vm_area_struct *vma, #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +struct hmm_devmem; + +struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, + unsigned long addr); + +/* + * struct hmm_devmem_ops - callbacks for ZONE_DEVICE memory events + * + * @free: called when the refcount on a page reaches 1, i.e. the page is no longer in use + * @fault: called when there is a CPU page fault on unaddressable memory + * + * Both callbacks happen from the page_free() and page_fault() callbacks of struct + * dev_pagemap, respectively. See include/linux/memremap.h for more details on + * those. + * + * The hmm_devmem_ops callbacks are just here to provide a coherent and + * unique API to device drivers; a device driver should not register its own + * page_free() or page_fault() but rely on the hmm_devmem_ops callbacks. + */ +struct hmm_devmem_ops { + /* + * free() - free a device page + * @devmem: device memory structure (see struct hmm_devmem) + * @page: pointer to struct page being freed + * + * The callback occurs whenever a device page refcount reaches 1, which + * means that no one is holding any reference on the page anymore + * (ZONE_DEVICE pages have an elevated refcount of 1 by default so + * that they are not released to the general page allocator). + * + * Note that the callback has exclusive ownership of the page (as no + * one is holding any reference). + */ + void (*free)(struct hmm_devmem *devmem, struct page *page); + /* + * fault() - CPU page fault or get user page (GUP) + * @devmem: device memory structure (see struct hmm_devmem) + * @vma: virtual memory area containing the virtual address + * @addr: virtual address that faulted or for which there is a GUP + * @page: pointer to struct page backing virtual address (unreliable) + * @flags: FAULT_FLAG_* (see include/linux/mm.h) + * @pmdp: page middle directory + * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR + * on error + * + * The callback occurs whenever there is a CPU page fault or GUP on a + * virtual address. This means that the device driver must migrate the + * page back to regular memory (CPU accessible). + * + * The device driver is free to migrate more than one page from the + * fault() callback as an optimization. However, if the device decides + * to migrate more than one page it must always prioritize the faulting + * address over the others. + * + * The struct page pointer is only given as a hint to allow quick + * lookup of internal device driver data. A concurrent migration + * might have already freed that page and the virtual address might + * no longer be backed by it, so it should not be modified by the + * callback. + * + * Note that the mmap semaphore is held in read mode at least when this + * callback occurs, hence the vma is valid upon callback entry. 
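+ * + * Illustrative note (guidance, not an API requirement): a typical + * fault() implementation migrates the faulting page back to system + * memory, for instance using the migrate_vma() helper, and returns + * VM_FAULT_MINOR on success or VM_FAULT_SIGBUS when the migration + * fails.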
+ */ + int (*fault)(struct hmm_devmem *devmem, + struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp); +}; + +/* + * struct hmm_devmem - track device memory + * + * @completion: completion object for device memory + * @pfn_first: first pfn for this resource (set by hmm_devmem_add()) + * @pfn_last: last pfn for this resource (set by hmm_devmem_add()) + * @resource: IO resource reserved for this chunk of memory + * @pagemap: device page map for that chunk + * @device: device to bind resource to + * @ops: memory operations callback + * @ref: per CPU refcount + * + * This is a helper structure for device drivers that do not wish to implement + * the gory details related to hotplugging new memory and allocating struct + * pages. + * + * Device drivers can directly use ZONE_DEVICE memory on their own if they + * wish to do so. + */ +struct hmm_devmem { + struct completion completion; + unsigned long pfn_first; + unsigned long pfn_last; + struct resource *resource; + struct device *device; + struct dev_pagemap pagemap; + const struct hmm_devmem_ops *ops; + struct percpu_ref ref; +}; + +/* + * To add (hotplug) device memory, HMM assumes that there is no real resource + * that reserves a range in the physical address space (this is intended to be + * used for unaddressable device memory). It will reserve a physical range big + * enough and allocate struct pages for it. + * + * The device driver can wrap the hmm_devmem struct inside a private device + * driver struct. The device driver must call hmm_devmem_remove() before the + * device goes away and before freeing the hmm_devmem struct memory. + */ +struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, + struct device *device, + unsigned long size); +void hmm_devmem_remove(struct hmm_devmem *devmem); + +/* + * hmm_devmem_page_set_drvdata - set per-page driver data field + * + * @page: pointer to struct page + * @data: driver data value to set + * + * Because the page cannot be on the lru, we have an unsigned long that the + * driver can use to store a per-page field. This is just a simple helper to + * do that. + */ +static inline void hmm_devmem_page_set_drvdata(struct page *page, + unsigned long data) +{ + unsigned long *drvdata = (unsigned long *)&page->pgmap; + + drvdata[1] = data; +} + +/* + * hmm_devmem_page_get_drvdata - get per-page driver data field + * + * @page: pointer to struct page + * Return: driver data value + */ +static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +{ + unsigned long *drvdata = (unsigned long *)&page->pgmap; + + return drvdata[1]; +} +#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ + + /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index 3c6265d4254b..afb51078a5cf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -23,10 +23,16 @@ #include #include #include +#include +#include #include #include +#include #include #include +#include + +#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) /* @@ -426,7 +432,15 @@ again: * This is a special swap entry, ignore migration, use * device and report anything else as error. 
*/ - if (is_migration_entry(entry)) { + if (is_device_private_entry(entry)) { + pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); + if (is_write_device_private_entry(entry)) { + pfns[i] |= HMM_PFN_WRITE; + } else if (write_fault) + goto fault; + pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; + pfns[i] |= flag; + } else if (is_migration_entry(entry)) { if (hmm_vma_walk->fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; @@ -720,3 +734,366 @@ int hmm_vma_fault(struct vm_area_struct *vma, } EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + + page = alloc_page_vma(GFP_HIGHUSER, vma, addr); + if (!page) + return NULL; + lock_page(page); + return page; +} +EXPORT_SYMBOL(hmm_vma_alloc_locked_page); + + +static void hmm_devmem_ref_release(struct percpu_ref *ref) +{ + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + complete(&devmem->completion); +} + +static void hmm_devmem_ref_exit(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_exit(ref); + devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); +} + +static void hmm_devmem_ref_kill(void *data) +{ + struct percpu_ref *ref = data; + struct hmm_devmem *devmem; + + devmem = container_of(ref, struct hmm_devmem, ref); + percpu_ref_kill(ref); + wait_for_completion(&devmem->completion); + devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); +} + +static int hmm_devmem_fault(struct vm_area_struct *vma, + unsigned long addr, + const struct page *page, + unsigned int flags, + pmd_t *pmdp) +{ + struct hmm_devmem *devmem = page->pgmap->data; + + return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); +} + +static void hmm_devmem_free(struct page *page, void *data) +{ + struct hmm_devmem *devmem = data; + + devmem->ops->free(devmem, page); +} + +static DEFINE_MUTEX(hmm_devmem_lock); +static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); + +static void hmm_devmem_radix_release(struct resource *resource) +{ + resource_size_t key, align_start, align_size, align_end; + + align_start = resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&hmm_devmem_lock); + for (key = resource->start; + key <= resource->end; + key += PA_SECTION_SIZE) + radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&hmm_devmem_lock); +} + +static void hmm_devmem_release(struct device *dev, void *data) +{ + struct hmm_devmem *devmem = data; + struct resource *resource = devmem->resource; + unsigned long start_pfn, npages; + struct zone *zone; + struct page *page; + + if (percpu_ref_tryget_live(&devmem->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(&devmem->ref); + } + + /* pages are dead and unused, undo the arch mapping */ + start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; + npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; + + page = pfn_to_page(start_pfn); + zone = page_zone(page); + + mem_hotplug_begin(); + __remove_pages(zone, start_pfn, npages); + mem_hotplug_done(); + + hmm_devmem_radix_release(resource); +} + +static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + return 
radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); } + +static int hmm_devmem_pages_create(struct hmm_devmem *devmem) +{ + resource_size_t key, align_start, align_size, align_end; + struct device *device = devmem->device; + int ret, nid, is_ram; + unsigned long pfn; + + align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); + align_size = ALIGN(devmem->resource->start + + resource_size(devmem->resource), + PA_SECTION_SIZE) - align_start; + + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE); + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "%s attempted on mixed region %pr\n", + __func__, devmem->resource); + return -ENXIO; + } + if (is_ram == REGION_INTERSECTS) + return -ENXIO; + + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.res = devmem->resource; + devmem->pagemap.page_fault = hmm_devmem_fault; + devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.dev = devmem->device; + devmem->pagemap.ref = &devmem->ref; + devmem->pagemap.data = devmem; + + mutex_lock(&hmm_devmem_lock); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { + struct hmm_devmem *dup; + + rcu_read_lock(); + dup = hmm_devmem_find(key); + rcu_read_unlock(); + if (dup) { + dev_err(device, "%s: collides with mapping for %s\n", + __func__, dev_name(dup->device)); + mutex_unlock(&hmm_devmem_lock); + ret = -EBUSY; + goto error; + } + ret = radix_tree_insert(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT, + devmem); + if (ret) { + dev_err(device, "%s: failed: %d\n", __func__, ret); + mutex_unlock(&hmm_devmem_lock); + goto error_radix; + } + } + mutex_unlock(&hmm_devmem_lock); + + nid = dev_to_node(device); + if (nid < 0) + nid = numa_mem_id(); + + mem_hotplug_begin(); + /* + * For device private memory we call add_pages() as we only need to + * allocate and initialize struct pages for the device memory. + * Moreover, the device memory is not accessible, thus we do not want + * to create a linear mapping for it as arch_add_memory() + * would do. 
+ */ + ret = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, false); + if (ret) { + mem_hotplug_done(); + goto error_add_memory; + } + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); + mem_hotplug_done(); + + for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { + struct page *page = pfn_to_page(pfn); + + page->pgmap = &devmem->pagemap; + } + return 0; + +error_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); +error_radix: + hmm_devmem_radix_release(devmem->resource); +error: + return ret; +} + +static int hmm_devmem_match(struct device *dev, void *data, void *match_data) +{ + struct hmm_devmem *devmem = data; + + return devmem->resource == match_data; +} + +static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) +{ + devres_release(devmem->device, &hmm_devmem_release, + &hmm_devmem_match, devmem->resource); +} + +/* + * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory + * + * @ops: memory event device driver callback (see struct hmm_devmem_ops) + * @device: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * Returns: pointer to the new hmm_devmem struct, or an ERR_PTR on error + * + * This function first finds an empty range of physical addresses big enough to + * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which + * in turn allocates struct pages. It does not do anything beyond that; all + * events affecting the memory will go through the various callbacks provided + * by the hmm_devmem_ops struct. + * + * The device driver should call this function during device initialization and + * is then responsible for memory management. HMM only provides helpers. + */ +struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, + struct device *device, + unsigned long size) +{ + struct hmm_devmem *devmem; + resource_size_t addr; + int ret; + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = NULL; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + size = ALIGN(size, PA_SECTION_SIZE); + addr = min((unsigned long)iomem_resource.end, + (1UL << MAX_PHYSMEM_BITS) - 1); + addr = addr - size + 1UL; + + /* + * FIXME add a new helper to quickly walk resource tree and find free + * range + * + * FIXME what about ioport_resource resource ? 
+ */ + for (; addr > size && addr >= iomem_resource.start; addr -= size) { + ret = region_intersects(addr, size, 0, IORES_DESC_NONE); + if (ret != REGION_DISJOINT) + continue; + + devmem->resource = devm_request_mem_region(device, addr, size, + dev_name(device)); + if (!devmem->resource) { + ret = -ENOMEM; + goto error_no_resource; + } + break; + } + if (!devmem->resource) { + ret = -ERANGE; + goto error_no_resource; + } + + devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_pages; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_pages: + devm_release_mem_region(device, devmem->resource->start, + resource_size(devmem->resource)); +error_no_resource: +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add); + +/* + * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) + * + * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory + * + * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf + * of the device driver. It will free the struct pages and remove the resource + * that reserved the physical address range for this device memory. + */ +void hmm_devmem_remove(struct hmm_devmem *devmem) +{ + resource_size_t start, size; + struct device *device; + + if (!devmem) + return; + + device = devmem->device; + start = devmem->resource->start; + size = resource_size(devmem->resource); + + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); + hmm_devmem_pages_remove(devmem); + + devm_release_mem_region(device, start, size); +} +EXPORT_SYMBOL(hmm_devmem_remove); +#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ -- cgit v1.2.3 From 858b54dabf4363daa3a97b9a722130a8e7cea8c9 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:02 -0700 Subject: mm/hmm/devmem: dummy HMM device for ZONE_DEVICE memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a dummy HMM device class so a device driver can use it to create an hmm_device for the sole purpose of registering device memory. It is useful to device drivers that want to manage multiple pools of physical device memory under the same struct device umbrella. Link: http://lkml.kernel.org/r/20170817000548.32038-13-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 22 ++++++++++++++- mm/hmm.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 16f916b437cc..67a03b20a2db 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -72,11 +72,11 @@ #if IS_ENABLED(CONFIG_HMM) +#include #include #include #include - struct hmm; /* @@ -474,6 +474,26 @@ static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) return drvdata[1]; } + + +/* + * struct hmm_device - fake device to hang device memory onto + * + * @device: device struct + * @minor: device minor number + */ +struct hmm_device { + struct device device; + unsigned int minor; +}; + +/* + * A device driver that wants to handle multiple devices' memory through a + * single fake device can use hmm_device to do so. This is purely a helper; + * it is not strictly needed in order to make use of any HMM functionality. + */ +struct hmm_device *hmm_device_new(void *drvdata); +void hmm_device_put(struct hmm_device *hmm_device); #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ diff --git a/mm/hmm.c b/mm/hmm.c index afb51078a5cf..c9d23ef80552 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -19,6 +19,7 @@ */ #include #include +#include #include #include #include @@ -1096,4 +1097,84 @@ void hmm_devmem_remove(struct hmm_devmem *devmem) devm_release_mem_region(device, start, size); } EXPORT_SYMBOL(hmm_devmem_remove); + +/* + * A device driver that wants to handle multiple devices' memory through a + * single fake device can use hmm_device to do so. This is purely a helper + * and it is not needed to make use of any HMM functionality. 
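+ * + * A hypothetical usage sketch (my_drvdata, my_devmem_ops and size are + * illustrative stand-ins, not part of this patch): + * + * hmm_device = hmm_device_new(my_drvdata); + * if (IS_ERR(hmm_device)) + * return PTR_ERR(hmm_device); + * devmem = hmm_devmem_add(&my_devmem_ops, &hmm_device->device, size); + * ... + * hmm_devmem_remove(devmem); + * hmm_device_put(hmm_device);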
+ */ +#define HMM_DEVICE_MAX 256 + +static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); +static DEFINE_SPINLOCK(hmm_device_lock); +static struct class *hmm_device_class; +static dev_t hmm_device_devt; + +static void hmm_device_release(struct device *device) +{ + struct hmm_device *hmm_device; + + hmm_device = container_of(device, struct hmm_device, device); + spin_lock(&hmm_device_lock); + clear_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + kfree(hmm_device); +} + +struct hmm_device *hmm_device_new(void *drvdata) +{ + struct hmm_device *hmm_device; + + hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); + if (!hmm_device) + return ERR_PTR(-ENOMEM); + + spin_lock(&hmm_device_lock); + hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); + if (hmm_device->minor >= HMM_DEVICE_MAX) { + spin_unlock(&hmm_device_lock); + kfree(hmm_device); + return ERR_PTR(-EBUSY); + } + set_bit(hmm_device->minor, hmm_device_mask); + spin_unlock(&hmm_device_lock); + + dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); + hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), + hmm_device->minor); + hmm_device->device.release = hmm_device_release; + dev_set_drvdata(&hmm_device->device, drvdata); + hmm_device->device.class = hmm_device_class; + device_initialize(&hmm_device->device); + + return hmm_device; +} +EXPORT_SYMBOL(hmm_device_new); + +void hmm_device_put(struct hmm_device *hmm_device) +{ + put_device(&hmm_device->device); +} +EXPORT_SYMBOL(hmm_device_put); + +static int __init hmm_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hmm_device_devt, 0, + HMM_DEVICE_MAX, + "hmm_device"); + if (ret) + return ret; + + hmm_device_class = class_create(THIS_MODULE, "hmm_device"); + if (IS_ERR(hmm_device_class)) { + unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); + return PTR_ERR(hmm_device_class); + } + return 0; +} + +device_initcall(hmm_init); #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ -- cgit v1.2.3 From 2916ecc0f9d435d849c98f4da50e453124c87531 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:06 -0700 Subject: mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new migration mode that allows offloading the copy to a device DMA engine. This changes the workflow of migration and not all address_space migratepage callbacks can support it. This is intended to be used by migrate_vma(), which itself is used for things like HMM (see include/linux/hmm.h). No additional per-filesystem migratepage testing is needed: this patch disables MIGRATE_SYNC_NO_COPY in all problematic migratepage() callbacks and adds a comment in each to explain why. Any callback that wishes to support this new mode needs to be aware of how the migration flow differs from the other modes. Some of these callbacks do extra locking while copying (aio, zsmalloc, balloon, ...), and for DMA to be effective you want to copy multiple pages in one DMA operation. But in the problematic cases you cannot easily hold the extra lock across multiple calls to this callback. The usual flow is: For each page { 1 - lock page 2 - call migratepage() callback 3 - (extra locking in some migratepage() callbacks) 4 - migrate page state (freeze refcount, update page cache, buffer head, ...) 
5 - copy page 6 - (unlock any extra lock of migratepage() callback) 7 - return from migratepage() callback 8 - unlock page } The new mode MIGRATE_SYNC_NO_COPY: 1 - lock multiple pages For each page { 2 - call migratepage() callback 3 - abort in all problematic migratepage() callbacks 4 - migrate page state (freeze refcount, update page cache, buffer head, ...) } // finished all calls to migratepage() callback 5 - DMA copy multiple pages 6 - unlock all the pages To support MIGRATE_SYNC_NO_COPY in the problematic cases we would need a new callback migratepages() (for instance) that deals with multiple pages in one transaction. Because the problematic cases are not important for current usage, I did not want to complicate this patchset even more for no good reason. Link: http://lkml.kernel.org/r/20170817000548.32038-14-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 8 +++++++ fs/f2fs/data.c | 5 ++++- fs/hugetlbfs/inode.c | 5 ++++- fs/ubifs/file.c | 5 ++++- include/linux/migrate.h | 5 +++++ include/linux/migrate_mode.h | 5 +++++ mm/balloon_compaction.c | 8 +++++++ mm/migrate.c | 52 ++++++++++++++++++++++++++++++++++---------- mm/zsmalloc.c | 8 +++++++ 9 files changed, 86 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/aio.c b/fs/aio.c index 8f0127526299..b5d69f28d8b1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, pgoff_t idx; int rc; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the ctx->completion_lock. That does not work with the + * migration workflow of MIGRATE_SYNC_NO_COPY. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + rc = 0; /* mapping->private_lock here protects against the kioctx teardown. 
*/ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a791aac4c5af..fb96bb71da00 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2253,7 +2253,10 @@ int f2fs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); set_page_private(newpage, page_private(page)); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7c02b3f738e1..8c6f4b8f910f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -830,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f90a466ea5db..a02aa59d1e24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); } - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ce15989521a1..7db4c812a2a6 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -72,6 +72,7 @@ extern void putback_movable_page(struct page *page); extern int migrate_prep(void); extern int migrate_prep_local(void); +extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); @@ -92,6 +93,10 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } +static inline void migrate_page_states(struct page *newpage, struct page *page) +{ +} + static inline void migrate_page_copy(struct page *newpage, struct page *page) {} diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index ebf3d89a3919..bdf66af9b937 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -6,11 +6,16 @@ * on most operations but not ->writepage as the potential stall time * is too significant * MIGRATE_SYNC will block when migrating pages + * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages + * with the CPU. Instead, page copy happens outside the migratepage() + * callback and is likely using a DMA engine. See migrate_vma() and HMM + * (mm/hmm.c) for users of this mode. */ enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, + MIGRATE_SYNC_NO_COPY, }; #endif /* MIGRATE_MODE_H_INCLUDED */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index b06d9fe23a28..68d28924ba79 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -139,6 +139,14 @@ int balloon_page_migrate(struct address_space *mapping, { struct balloon_dev_info *balloon = balloon_page_device(page); + /* + * We cannot easily support the no-copy case here, so ignore it as it + * is unlikely to be used with balloon pages. 
See include/linux/hmm.h for + * user of the MIGRATE_SYNC_NO_COPY mode. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); diff --git a/mm/migrate.c b/mm/migrate.c index 1088cef6ef8b..71de36cfb673 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -634,15 +634,10 @@ static void copy_huge_page(struct page *dst, struct page *src) /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_states(struct page *newpage, struct page *page) { int cpupid; - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - if (PageError(page)) SetPageError(newpage); if (PageReferenced(page)) @@ -696,6 +691,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) mem_cgroup_migrate(page, newpage); } +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + if (PageHuge(page) || PageTransHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); + + migrate_page_states(newpage, page); +} EXPORT_SYMBOL(migrate_page_copy); /************************************************************ @@ -721,7 +727,10 @@ int migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -771,12 +780,15 @@ int buffer_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); bh = head; do { unlock_buffer(bh); - put_bh(bh); + put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -835,8 +847,13 @@ static int fallback_migrate_page(struct address_space *mapping, { if (PageDirty(page)) { /* Only writeback pages in full synchronous migration */ - if (mode != MIGRATE_SYNC) + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: return -EBUSY; + } return writeout(mapping, page); } @@ -973,7 +990,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ - if (mode != MIGRATE_SYNC) { + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: rc = -EBUSY; goto out_unlock; } @@ -1243,8 +1264,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; if (!trylock_page(hpage)) { - if (!force || mode != MIGRATE_SYNC) + if (!force) goto out; + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: + goto out; + } lock_page(hpage); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 62457eb82330..5ad75ec4151c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1969,6 +1969,14 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, unsigned int obj_idx; int ret = -EAGAIN; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the zs lock, which does not work with + * MIGRATE_SYNC_NO_COPY workflow. 
+ */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); -- cgit v1.2.3 From 8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:09 -0700 Subject: mm/migrate: new memory migration helper for use with device memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a new memory migration helper, which migrates memory backing a range of virtual addresses of a process to different memory (which can be allocated through a special allocator). It differs from NUMA migration by working on a range of virtual addresses, and thus by doing migration in chunks that can be large enough to use a DMA engine or a special copy-offloading engine. Expected users are anyone with heterogeneous memory, where different memories have different characteristics (latency, bandwidth, ...). As an example, IBM platforms with a CAPI bus can make use of this feature to migrate between regular memory and CAPI device memory. New CPU architectures with a pool of high-performance memory not managed as a cache but presented as regular memory (while being faster and with lower latency than DDR) will also be prime users of this patch. Migration to private device memory will be useful for devices that have large pools of such memory, like GPUs; NVIDIA plans to use HMM for that. Link: http://lkml.kernel.org/r/20170817000548.32038-15-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Evgeny Baskakov Signed-off-by: John Hubbard Signed-off-by: Mark Hairgrove Signed-off-by: Sherry Cheung Signed-off-by: Subhash Gutti Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 104 ++++++++++ mm/migrate.c | 492 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 596 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7db4c812a2a6..8f73cebfc3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -156,4 +156,108 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, } #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ + +#ifdef CONFIG_MIGRATION + +#define MIGRATE_PFN_VALID (1UL << 0) +#define MIGRATE_PFN_MIGRATE (1UL << 1) +#define MIGRATE_PFN_LOCKED (1UL << 2) +#define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_ERROR (1UL << 4) +#define MIGRATE_PFN_SHIFT 5 + +static inline struct page *migrate_pfn_to_page(unsigned long mpfn) +{ + if (!(mpfn & MIGRATE_PFN_VALID)) + return NULL; + return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT); +} + +static inline unsigned long migrate_pfn(unsigned long pfn) +{ + return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; +} + +/* + * struct migrate_vma_ops - migrate operation callback + * + * @alloc_and_copy: alloc destination memory and copy source memory to it + * @finalize_and_map: allow caller to map the successfully migrated pages + * + * + * The alloc_and_copy() callback happens once all source pages have been locked, + * unmapped and checked (checked whether pinned or not). 
All pages that can be + * migrated will have an entry in the src array set with the pfn value of the + * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other + * flags might be set but should be ignored by the callback). + * + * The alloc_and_copy() callback can then allocate destination memory and copy + * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and + * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the + * callback must update each corresponding entry in the dst array with the pfn + * value of the destination page and with the MIGRATE_PFN_VALID and + * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages + * locked, via lock_page()). + * + * At this point the alloc_and_copy() callback is done and returns. + * + * Note that the callback does not have to migrate all the pages that are + * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration + * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also + * set in the src array entry). If the device driver cannot migrate a device + * page back to system memory, then it must set the corresponding dst array + * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to + * access any of the virtual addresses originally backed by this page. Because + * a SIGBUS is such a severe result for the userspace process, the device + * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an + * unrecoverable state. + * + * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES + * OR BAD THINGS WILL HAPPEN ! + * + * + * The finalize_and_map() callback happens after struct page migration from + * source to destination (destination struct pages are the struct pages for the + * memory allocated by the alloc_and_copy() callback). Migration can fail, and + * thus the finalize_and_map() allows the driver to inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table from within the finalize_and_map() + * callback because both destination and source page are still locked, and the + * mmap_sem is held in read mode (hence no one can unmap the range being + * migrated). + * + * Once callback is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) then it returns. At this point, + * the HMM core will finish up the final steps, and the migration is complete. + * + * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY + * ENTRIES OR BAD THINGS WILL HAPPEN ! 
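+ * + * As a purely illustrative sketch (my_alloc_page() and my_copy() are + * hypothetical driver helpers, not part of this API), the body of an + * alloc_and_copy() might do, for each entry: + * + * if ((src[i] & (MIGRATE_PFN_VALID | MIGRATE_PFN_MIGRATE)) != + * (MIGRATE_PFN_VALID | MIGRATE_PFN_MIGRATE)) + * continue; + * dpage = my_alloc_page(); + * my_copy(dpage, migrate_pfn_to_page(src[i])); + * lock_page(dpage); + * dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;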
+ */ +struct migrate_vma_ops { + void (*alloc_and_copy)(struct vm_area_struct *vma, + const unsigned long *src, + unsigned long *dst, + unsigned long start, + unsigned long end, + void *private); + void (*finalize_and_map)(struct vm_area_struct *vma, + const unsigned long *src, + const unsigned long *dst, + unsigned long start, + unsigned long end, + void *private); +}; + +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private); + +#endif /* CONFIG_MIGRATION */ + #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate.c b/mm/migrate.c index 71de36cfb673..991e8886093f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -428,6 +428,14 @@ int migrate_page_move_mapping(struct address_space *mapping, int expected_count = 1 + extra_count; void **pslot; + /* + * ZONE_DEVICE pages have 1 refcount always held by their device + * + * Note that DAX memory will never reach that point as it does not have + * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h). + */ + expected_count += is_zone_device_page(page); + if (!mapping) { /* Anonymous page without mapping */ if (page_count(page) != expected_count) @@ -2106,3 +2114,487 @@ out_unlock: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ + + +struct migrate_vma { + struct vm_area_struct *vma; + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; +}; + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_hole(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret || pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_hole(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn, pfn; + struct page *page; + pte_t pte; + + pte = *ptep; + pfn = pte_pfn(pte); + + if (!pte_present(pte)) { + mpfn = pfn = 0; + goto next; + } + + /* FIXME support THP */ + page = vm_normal_page(migrate->vma, addr, pte); + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = pfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that 
blocks + * any kind of migration. A side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru, + * for non-device pages (device pages are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + migrate->cpages++; + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + +next: + migrate->src[migrate->npages++] = mpfn; + } + pte_unmap_unlock(ptep - 1, ptl); + + return 0; +} + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mm_walk mm_walk; + + mm_walk.pmd_entry = migrate_vma_collect_pmd; + mm_walk.pte_entry = NULL; + mm_walk.pte_hole = migrate_vma_collect_hole; + mm_walk.hugetlb_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.vma = migrate->vma; + mm_walk.mm = migrate->vma->vm_mm; + mm_walk.private = migrate; + + walk_page_range(migrate->start, migrate->end, &mm_walk); + + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * migrate_page_move_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is a bit more complex + * to check them than regular pages, because they can be mapped with a + * pmd or with a pte (split pte mapping). + */ + if (PageCompound(page)) + return false; + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_prepare() - lock pages and isolate them from the lru + * @migrate: migrate struct containing all migration information + * + * This locks pages that have been collected by migrate_vma_collect(). Once each + * page is locked it is isolated from the lru (for non-device pages). Finally, + * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be + * migrated by concurrent kernel threads. + */ +static void migrate_vma_prepare(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + bool allow_drain = true; + unsigned long i; + + lru_add_drain(); + + for (i = 0; (i < npages) && migrate->cpages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) + continue; + + /* + * Because we are migrating several pages there can be + * a deadlock between two concurrent migrations, each + * waiting on the other's page lock. + * + * Make migrate_vma() a best-effort thing and back off + * for any page we cannot lock right away. 
+ */ + if (!trylock_page(page)) { + migrate->src[i] = 0; + migrate->cpages--; + put_page(page); + continue; + } + migrate->src[i] |= MIGRATE_PFN_LOCKED; + + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + continue; + } + + if (!migrate_vma_check_page(page)) { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + + putback_lru_page(page); + } + } +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Replace page mapping (CPU page table pte) with a special migration pte entry + * and check again if it has been pinned. Pinned pages are restored because we + * cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + try_to_unmap(page, flags); + if (page_mapped(page) || !migrate_vma_check_page(page)) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } + } + + for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + restore--; + + putback_lru_page(page); + } +} + +/* + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +static void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!page || !newpage) + continue; + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + mapping = page_mapping(page); + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } +} + +/* + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. 
+ */ +static void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) + continue; + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + migrate->cpages--; + + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + putback_lru_page(newpage); + } + } +} + +/* + * migrate_vma() - migrate a range of memory inside vma + * + * @ops: migration callback for allocating destination memory and copying + * @vma: virtual memory area containing the range to be migrated + * @start: start address of the range to migrate (inclusive) + * @end: end address of the range to migrate (exclusive) + * @src: array of hmm_pfn_t containing source pfns + * @dst: array of hmm_pfn_t containing destination pfns + * @private: pointer passed back to each of the callbacks + * Returns: 0 on success, error code otherwise + * + * This function tries to migrate a range of virtual addresses, using callbacks + * to allocate and copy memory from source to destination. First it collects all + * the pages backing each virtual address in the range, saving them in the src + * array. Then it locks those pages and unmaps them. Once the pages are locked + * and unmapped, it checks whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. It then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * At this point it calls the alloc_and_copy() callback. For documentation on + * what is expected from that callback, see struct migrate_vma_ops comments in + * include/linux/migrate.h + * + * After the alloc_and_copy() callback, this function goes over each entry in + * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then the function tries to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the struct + * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src + * array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * It then calls the finalize_and_map() callback. See comments for "struct + * migrate_vma_ops", in include/linux/migrate.h for details about + * finalize_and_map() behavior. + * + * After the finalize_and_map() callback, for successfully migrated pages, this + * function updates the CPU page table to point to new pages, otherwise it + * restores the CPU page table to point to the original source pages. + * + * The function returns 0 after the above steps, even if no pages were migrated + * (the function only returns an error if any of the arguments are invalid). + * + * Both the src and dst arrays must be big enough for (end - start) >> PAGE_SHIFT + * unsigned long entries. 
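+ * + * A minimal call might look like this (array size and names are + * illustrative only): + * + * unsigned long src[64], dst[64]; + * + * ret = migrate_vma(&my_ops, vma, start, start + 64 * PAGE_SIZE, + * src, dst, my_private);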
+ */ +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + struct migrate_vma migrate; + + /* Sanity check the arguments */ + start &= PAGE_MASK; + end &= PAGE_MASK; + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + return -EINVAL; + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end <= vma->vm_start || end > vma->vm_end) + return -EINVAL; + if (!ops || !src || !dst || start >= end) + return -EINVAL; + + memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); + migrate.src = src; + migrate.dst = dst; + migrate.start = start; + migrate.npages = 0; + migrate.cpages = 0; + migrate.end = end; + migrate.vma = vma; + + /* Collect, and try to unmap source pages */ + migrate_vma_collect(&migrate); + if (!migrate.cpages) + return 0; + + /* Lock and isolate page */ + migrate_vma_prepare(&migrate); + if (!migrate.cpages) + return 0; + + /* Unmap pages */ + migrate_vma_unmap(&migrate); + if (!migrate.cpages) + return 0; + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the callback. + * + * Note that migration can still fail for each individual page in + * migrate_vma_pages() below. + */ + ops->alloc_and_copy(vma, src, dst, start, end, private); + + /* This does the real migration of struct page */ + migrate_vma_pages(&migrate); + + ops->finalize_and_map(vma, src, dst, start, end, private); + + /* Unlock and remap pages */ + migrate_vma_finalize(&migrate); + + return 0; +} +EXPORT_SYMBOL(migrate_vma); -- cgit v1.2.3 From a5430dda8a3a1cdd532e37270e6f36436241b6e7 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:17 -0700 Subject: mm/migrate: support un-addressable ZONE_DEVICE page in migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow unmapping and restoring the special swap entries used for un-addressable ZONE_DEVICE memory. Link: http://lkml.kernel.org/r/20170817000548.32038-17-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Kirill A. Shutemov Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 10 +++- mm/migrate.c | 149 +++++++++++++++++++++++++++++++++++++++--------- mm/page_vma_mapped.c | 10 ++++ mm/rmap.c | 26 +++++++++ 4 files changed, 165 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8f73cebfc3f5..8dc8f0a3f1af 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -159,12 +159,18 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #ifdef CONFIG_MIGRATION +/* + * Watch out for PAE architectures, where unsigned long is only 32 bits wide + * and might not have enough bits to store the full physical address plus the + * flags. So far we have enough room for all our flags. 
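+ * + * For example, with MIGRATE_PFN_SHIFT == 6 a 32-bit unsigned long still + * leaves 26 bits for the pfn, i.e. 26 + 12 = 38 bits of physical address + * with 4KB pages, which is enough to cover PAE's 36-bit physical address + * space.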
+ */ #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED (1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) -#define MIGRATE_PFN_ERROR (1UL << 4) -#define MIGRATE_PFN_SHIFT 5 +#define MIGRATE_PFN_DEVICE (1UL << 4) +#define MIGRATE_PFN_ERROR (1UL << 5) +#define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) { diff --git a/mm/migrate.c b/mm/migrate.c index 652b2c642eed..77cb2fef08ea 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -237,7 +238,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - flush_dcache_page(new); + if (unlikely(is_zone_device_page(new)) && + is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else + flush_dcache_page(new); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -2205,17 +2212,40 @@ again: pte = *ptep; pfn = pte_pfn(pte); - if (!pte_present(pte)) { + if (pte_none(pte)) { mpfn = pfn = 0; goto next; } + if (!pte_present(pte)) { + mpfn = pfn = 0; + + /* + * We only care about the special page table entries of + * unaddressable device pages. Other special swap + * entries are not migratable, and we ignore regular + * swapped pages. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = device_private_entry_to_page(entry); + mpfn = migrate_pfn(page_to_pfn(page))| + MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + if (is_write_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + page = vm_normal_page(migrate->vma, addr, pte); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + /* FIXME support THP */ - page = vm_normal_page(migrate->vma, addr, pte); if (!page || !page->mapping || PageTransCompound(page)) { mpfn = pfn = 0; goto next; } + pfn = page_to_pfn(page); /* * By getting a reference on the page we pin it and that blocks @@ -2228,8 +2258,6 @@ again: */ get_page(page); migrate->cpages++; - mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; /* * Optimize for the common case where page is only mapped once @@ -2256,10 +2284,13 @@ again: */ page_remove_rmap(page, false); put_page(page); - unmapped++; + + if (pte_present(pte)) + unmapped++; } next: + migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = mpfn; } arch_leave_lazy_mmu_mode(); @@ -2329,6 +2360,28 @@ static bool migrate_vma_check_page(struct page *page) if (PageCompound(page)) return false; + /* Pages from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) { + /* + * Private pages can never be pinned as they have no valid pte + * and GUP will fail for them. Yet if there is a pending + * migration, a thread might try to wait on the pte migration + * entry and will bump the page reference count. Sadly there is + * no way to differentiate a regular pin from a migration wait. + * Hence, to avoid two racing threads trying to migrate back to + * the CPU entering an infinite loop (one stopping migration + * because the other is waiting on the pte migration entry), we + * always return true here. + * + * FIXME: the proper solution is to rework migration_entry_wait() + * so it does not need to take a reference on the page. 
+ */ + if (is_device_private_page(page)) + return true; + + /* Other ZONE_DEVICE memory type are not supported */ + return false; + } + if ((page_count(page) - extra) > page_mapcount(page)) return false; @@ -2379,24 +2432,30 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) migrate->src[i] |= MIGRATE_PFN_LOCKED; } - if (!PageLRU(page) && allow_drain) { - /* Drain CPU's pagevec */ - lru_add_drain_all(); - allow_drain = false; - } + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } - if (isolate_lru_page(page)) { - if (remap) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; - restore++; - } else { - migrate->src[i] = 0; - unlock_page(page); - migrate->cpages--; - put_page(page); + if (isolate_lru_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + } + continue; } - continue; + + /* Drop the reference we took in collect */ + put_page(page); } if (!migrate_vma_check_page(page)) { @@ -2405,14 +2464,19 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) migrate->cpages--; restore++; - get_page(page); - putback_lru_page(page); + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } } else { migrate->src[i] = 0; unlock_page(page); migrate->cpages--; - putback_lru_page(page); + if (!is_zone_device_page(page)) + putback_lru_page(page); + else + put_page(page); } } } @@ -2483,7 +2547,10 @@ restore: unlock_page(page); restore--; - putback_lru_page(page); + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); } } @@ -2514,6 +2581,26 @@ static void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); + if (is_zone_device_page(newpage)) { + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when + * migrating to un-addressable device memory. + */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else { + /* + * Other types of ZONE_DEVICE page are not + * supported. 
+ */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; @@ -2554,11 +2641,17 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) unlock_page(page); migrate->cpages--; - putback_lru_page(page); + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); if (newpage != page) { unlock_page(newpage); - putback_lru_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); } } } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 3bd3008db4cb..6a03946469a9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -48,6 +48,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) if (!is_swap_pte(*pvmw->pte)) return false; entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) return false; if (migration_entry_to_page(entry) - pvmw->page >= @@ -60,6 +61,15 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) WARN_ON_ONCE(1); #endif } else { + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(*pvmw->pte); + if (is_device_private_entry(entry) && + device_private_entry_to_page(entry) == pvmw->page) + return true; + } + if (!pte_present(*pvmw->pte)) return false; diff --git a/mm/rmap.c b/mm/rmap.c index 7dc9c02f7106..0618cd85b862 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -63,6 +63,7 @@ #include #include #include +#include #include @@ -1346,6 +1347,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; + if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && + is_zone_device_page(page) && !is_device_private_page(page)) + return true; + if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, flags & TTU_SPLIT_FREEZE, page); @@ -1403,6 +1408,27 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, address = pvmw.address; + if (IS_ENABLED(CONFIG_MIGRATION) && + (flags & TTU_MIGRATION) && + is_zone_device_page(page)) { + swp_entry_t entry; + pte_t swp_pte; + + pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, 0); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + goto discard; + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { -- cgit v1.2.3 From 8315ada7f095bfa2cae0cd1e915b95bf6226897d Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:21 -0700 Subject: mm/migrate: allow migrate_vma() to alloc new page on empty entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows callers of migrate_vma() to allocate new page for empty CPU page table entry (pte_none or back by zero page). This is only for anonymous memory and it won't allow new page to be instanced if the userfaultfd is armed. This is useful to device driver that want to migrate a range of virtual address and would rather allocate new memory than having to fault later on. 
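As an illustration of how a driver is expected to consume this, below is a
hedged sketch of an alloc_and_copy() callback that also services entries
with no backing source page (the empty-pte case). The dev_alloc_page(),
dev_clear_page() and dev_copy_page() helpers are hypothetical driver
internals, not kernel APIs, and dev_alloc_page() is assumed to hand back an
already-locked page; leaving dst[i] at zero simply skips that page.

/* Hypothetical driver helpers, not kernel APIs. */
static struct page *dev_alloc_page(void *private);
static void dev_clear_page(struct page *dpage, void *private);
static void dev_copy_page(struct page *spage, struct page *dpage,
			  void *private);

static void example_alloc_and_copy(struct vm_area_struct *vma,
				   const unsigned long *src,
				   unsigned long *dst,
				   unsigned long start,
				   unsigned long end,
				   void *private)
{
	unsigned long addr, i;

	for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		struct page *dpage;

		/* Core code already refused to migrate this entry */
		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = dev_alloc_page(private);
		if (!dpage)
			continue;	/* dst[i] stays 0: page is skipped */

		if (!spage)
			dev_clear_page(dpage, private);	/* empty CPU pte */
		else
			dev_copy_page(spage, dpage, private);

		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
	}
}
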
Link: http://lkml.kernel.org/r/20170817000548.32038-18-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 9 +++ mm/migrate.c | 205 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 205 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8dc8f0a3f1af..d4e6d12a0b40 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -218,6 +218,15 @@ static inline unsigned long migrate_pfn(unsigned long pfn) * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an * unrecoverable state. * + * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing device driver to allocate device memory for those unback virtual + * address. For this the device driver simply have to allocate device memory + * and properly set the destination entry like for regular migration. Note that + * this can still fails and thus inside the device driver must check if the + * migration was successful for those entry inside the finalize_and_map() + * callback just like for regular migration. + * * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES * OR BAD THINGS WILL HAPPEN ! * diff --git a/mm/migrate.c b/mm/migrate.c index 77cb2fef08ea..e581253ef330 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -2140,6 +2141,22 @@ static int migrate_vma_collect_hole(unsigned long start, struct migrate_vma *migrate = walk->private; unsigned long addr; + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = 0; @@ -2178,7 +2195,7 @@ again: spin_unlock(ptl); split_huge_pmd(vma, pmdp, addr); if (pmd_trans_unstable(pmdp)) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_skip(start, end, walk); } else { int ret; @@ -2186,19 +2203,22 @@ again: get_page(page); spin_unlock(ptl); if (unlikely(!trylock_page(page))) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_skip(start, end, walk); ret = split_huge_page(page); unlock_page(page); put_page(page); - if (ret || pmd_none(*pmdp)) + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) return migrate_vma_collect_hole(start, end, walk); } } if (unlikely(pmd_bad(*pmdp))) - return migrate_vma_collect_hole(start, end, walk); + return migrate_vma_collect_skip(start, end, walk); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -2213,7 +2233,9 @@ again: pfn = pte_pfn(pte); if (pte_none(pte)) { - mpfn = pfn = 0; + mpfn = 
MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; goto next; } @@ -2235,6 +2257,12 @@ again: if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } page = vm_normal_page(migrate->vma, addr, pte); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; @@ -2554,6 +2582,135 @@ restore: } } +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + unsigned long *dst) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + struct mem_cgroup *memcg; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). + */ + if (pte_alloc(mm, pmdp, addr)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + if (is_zone_device_page(page) && is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else { + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + flush = true; + } else if (!pte_none(*ptep)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + /* + * Check for usefaultfd but do not deliver the fault. Instead, + * just back off. 
+ */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); + if (!is_zone_device_page(page)) + lru_cache_add_active_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + /* * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information @@ -2566,7 +2723,10 @@ static void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; - unsigned long addr, i; + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr, i, mmu_start; + bool notified = false; for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); @@ -2574,10 +2734,27 @@ static void migrate_vma_pages(struct migrate_vma *migrate) struct address_space *mapping; int r; - if (!page || !newpage) + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + continue; + } + if (!notified) { + mmu_start = addr; + notified = true; + mmu_notifier_invalidate_range_start(mm, + mmu_start, + migrate->end); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i], + &migrate->dst[i]); continue; + } mapping = page_mapping(page); @@ -2605,6 +2782,10 @@ static void migrate_vma_pages(struct migrate_vma *migrate) if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } + + if (notified) + mmu_notifier_invalidate_range_end(mm, mmu_start, + migrate->end); } /* @@ -2627,8 +2808,14 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); struct page *page = migrate_pfn_to_page(migrate->src[i]); - if (!page) + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } continue; + } + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { if (newpage) { unlock_page(newpage); -- cgit v1.2.3 From df6ad69838fc9dcdbee0dcf2fc2c6f1113f8d609 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:24 -0700 Subject: mm/device-public-memory: device memory cache coherent with CPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform with advance system bus (like CAPI or CCIX) allow device memory to be accessible from CPU in a cache coherent fashion. Add a new type of ZONE_DEVICE to represent such memory. The use case are the same as for the un-addressable device memory but without all the corners cases. Link: http://lkml.kernel.org/r/20170817000548.32038-19-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Aneesh Kumar Cc: Paul E. 
McKenney Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: Ross Zwisler Cc: Balbir Singh Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/hmm.h | 4 ++-- include/linux/ioport.h | 1 + include/linux/memremap.h | 21 ++++++++++++++++++ include/linux/mm.h | 20 ++++++++++------- kernel/memremap.c | 8 +++---- mm/Kconfig | 11 ++++++++++ mm/gup.c | 7 ++++++ mm/hmm.c | 4 ++-- mm/madvise.c | 2 +- mm/memcontrol.c | 12 +++++----- mm/memory.c | 46 +++++++++++++++++++++++++++++++++----- mm/migrate.c | 57 ++++++++++++++++++++++++++++++++---------------- mm/swap.c | 11 ++++++++++ 14 files changed, 159 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90ab657f8e56..281880c7e694 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1267,7 +1267,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; - page = vm_normal_page(vma, addr, pte); + page = _vm_normal_page(vma, addr, pte, true); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 67a03b20a2db..6d3b0b4fed4e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, @@ -494,7 +494,7 @@ struct hmm_device { */ struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ /* Below are for HMM internal use only! Not to be used by device driver! */ diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 3a4f69137bc2..f5cf32e80041 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -131,6 +131,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, + IORES_DESC_DEVICE_PUBLIC_MEMORY = 7, }; /* helpers to define resources */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8aa6b82679e2..f8ee1c73ad2d 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.txt. + * + * MEMORY_DEVICE_PUBLIC: + * Device memory that is cache coherent from device and CPU point of view. This + * is use on platform that have an advance system bus (like CAPI or CCIX). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allow to pin such memory so that it can always be evicted. 
*/ enum memory_type { MEMORY_DEVICE_HOST = 0, MEMORY_DEVICE_PRIVATE, + MEMORY_DEVICE_PUBLIC, }; /* @@ -92,6 +100,8 @@ enum memory_type { * The page_free() callback is called once the page refcount reaches 1 * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. * This allows the device driver to implement its own memory management.) + * + * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter. */ typedef int (*dev_page_fault_t)(struct vm_area_struct *vma, unsigned long addr, @@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct page *page) return is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } + +static inline bool is_device_public_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PUBLIC; +} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -157,6 +173,11 @@ static inline bool is_device_private_page(const struct page *page) { return false; } + +static inline bool is_device_public_page(const struct page *page) +{ + return false; +} #endif /** diff --git a/include/linux/mm.h b/include/linux/mm.h index eccdab4bb44a..de66a1127db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -800,15 +800,16 @@ static inline bool is_zone_device_page(const struct page *page) } #endif -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page); +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page); #else -static inline void put_zone_device_private_page(struct page *page) +static inline void put_zone_device_private_or_public_page(struct page *page) { } -#endif +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ static inline bool is_device_private_page(const struct page *page); +static inline bool is_device_public_page(const struct page *page); DECLARE_STATIC_KEY_FALSE(device_private_key); @@ -834,8 +835,9 @@ static inline void put_page(struct page *page) * include/linux/memremap.h and HMM for details. 
*/ if (static_branch_unlikely(&device_private_key) && - unlikely(is_device_private_page(page))) { - put_zone_device_private_page(page); + unlikely(is_device_private_page(page) || + is_device_public_page(page))) { + put_zone_device_private_or_public_page(page); return; } @@ -1224,8 +1226,10 @@ struct zap_details { pgoff_t last_index; /* Highest page->index to unmap */ }; -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte); +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device); +#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false) + struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); diff --git a/kernel/memremap.c b/kernel/memremap.c index ea0e18a2a5f2..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -501,8 +501,8 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) #endif /* CONFIG_ZONE_DEVICE */ -#ifdef CONFIG_DEVICE_PRIVATE -void put_zone_device_private_page(struct page *page) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) { int count = page_ref_dec_return(page); @@ -522,5 +522,5 @@ void put_zone_device_private_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_page); -#endif /* CONFIG_DEVICE_PRIVATE */ +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/Kconfig b/mm/Kconfig index ec27855db133..7bea16697d87 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -720,12 +720,23 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ARCH_HAS_HMM + select HMM help Allows creation of struct pages to represent unaddressable device memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. +config DEVICE_PUBLIC + bool "Addressable device memory (like GPU memory)" + depends on ARCH_HAS_HMM + select HMM + + help + Allows creation of struct pages to represent addressable device + memory; i.e., memory that is accessible from both the device and + the CPU + config FRAME_VECTOR bool diff --git a/mm/gup.c b/mm/gup.c index 76fd199aaae2..b2b4d4263768 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -456,6 +456,13 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); + + /* + * This should never happen (a device public page in the gate + * area). 
+ */ + if (is_device_public_page(*page)) + goto unmap; } get_page(*page); out: diff --git a/mm/hmm.c b/mm/hmm.c index c9d23ef80552..b31d56662202 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -737,7 +737,7 @@ EXPORT_SYMBOL(hmm_vma_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, unsigned long addr) { @@ -1177,4 +1177,4 @@ static int __init hmm_init(void) } device_initcall(hmm_init); -#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */ +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/madvise.c b/mm/madvise.c index eea1c733286f..21261ff0466f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -355,7 +355,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8aa98f9bc723..126a939b600a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4623,10 +4623,11 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). For now we such page is - * charge like a regular page would be as for all intent and purposes it is - * just special memory taking the place of a regular page. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC + * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. * * See Documentations/vm/hmm.txt and include/linux/hmm.h * @@ -4657,7 +4658,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_public_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory.c b/mm/memory.c index 079eeac0b009..ad0ea1af1f44 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -818,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, #else # define HAVE_PTE_SPECIAL 0 #endif -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); @@ -830,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; - if (!is_zero_pfn(pfn)) - print_bad_pte(vma, addr, pte, NULL); + if (is_zero_pfn(pfn)) + return NULL; + + /* + * Device public pages are special pages (they are ZONE_DEVICE + * pages but different from persistent memory). They behave + * allmost like normal pages. The difference is that they are + * not on the lru and thus should never be involve with any- + * thing that involve lru manipulation (mlock, numa balancing, + * ...). 
+ * + * This is why we still want to return NULL for such page from + * vm_normal_page() so that we do not have to special case all + * call site of vm_normal_page(). + */ + if (likely(pfn < highest_memmap_pfn)) { + struct page *page = pfn_to_page(pfn); + + if (is_device_public_page(page)) { + if (with_public_device) + return page; + return NULL; + } + } + print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -1012,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; + } else if (pte_devmap(pte)) { + page = pte_page(pte); + + /* + * Cache coherent device memory behave like regular page and + * not like persistent memory page. For more informations see + * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h + */ + if (is_device_public_page(page)) { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } } out_set_pte: @@ -1267,7 +1303,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = vm_normal_page(vma, addr, ptent); + page = _vm_normal_page(vma, addr, ptent, true); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to diff --git a/mm/migrate.c b/mm/migrate.c index e581253ef330..618aeb5e9cde 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -239,10 +240,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - if (unlikely(is_zone_device_page(new)) && - is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } } else flush_dcache_page(new); @@ -437,12 +442,11 @@ int migrate_page_move_mapping(struct address_space *mapping, void **pslot; /* - * ZONE_DEVICE pages have 1 refcount always held by their device - * - * Note that DAX memory will never reach that point as it does not have - * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h). + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. */ - expected_count += is_zone_device_page(page); + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); if (!mapping) { /* Anonymous page without mapping */ @@ -2123,7 +2127,6 @@ out_unlock: #endif /* CONFIG_NUMA */ - struct migrate_vma { struct vm_area_struct *vma; unsigned long *dst; @@ -2263,7 +2266,7 @@ again: pfn = 0; goto next; } - page = vm_normal_page(migrate->vma, addr, pte); + page = _vm_normal_page(migrate->vma, addr, pte, true); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2406,10 +2409,19 @@ static bool migrate_vma_check_page(struct page *page) if (is_device_private_page(page)) return true; - /* Other ZONE_DEVICE memory type are not supported */ - return false; + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. 
+ */ + if (!is_device_public_page(page)) + return false; + extra++; } + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + if ((page_count(page) - extra) > page_mapcount(page)) return false; @@ -2647,11 +2659,18 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page) && is_device_private_page(page)) { - swp_entry_t swp_entry; - - swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); - entry = swp_entry_to_pte(swp_entry); + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -2768,7 +2787,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else { + } else if (!is_device_public_page(newpage)) { /* * Other types of ZONE_DEVICE page are not * supported. diff --git a/mm/swap.c b/mm/swap.c index 62d96b8e5eb3..9295ae960d66 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -765,6 +765,17 @@ void release_pages(struct page **pages, int nr, bool cold) if (is_huge_zero_page(page)) continue; + /* Device public page can not be huge page */ + if (is_device_public_page(page)) { + if (locked_pgdat) { + spin_unlock_irqrestore(&locked_pgdat->lru_lock, + flags); + locked_pgdat = NULL; + } + put_zone_device_private_or_public_page(page); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; -- cgit v1.2.3 From d3df0a423397c9a1ae05c3857e8c04240dd85e68 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:28 -0700 Subject: mm/hmm: add new helper to hotplug CDM memory region MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike unaddressable memory, coherent device memory has a real resource associated with it on the system (as CPU can address it). Add a new helper to hotplug such memory within the HMM framework. Link: http://lkml.kernel.org/r/20170817000548.32038-20-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Balbir Singh Cc: Aneesh Kumar Cc: Benjamin Herrenschmidt Cc: Dan Williams Cc: David Nellans Cc: Evgeny Baskakov Cc: Johannes Weiner Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Mark Hairgrove Cc: Michal Hocko Cc: Paul E. 
McKenney Cc: Ross Zwisler Cc: Sherry Cheung Cc: Subhash Gutti Cc: Vladimir Davydov Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 3 ++ mm/hmm.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 86 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 6d3b0b4fed4e..8385e75356ca 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -443,6 +443,9 @@ struct hmm_devmem { struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, struct device *device, unsigned long size); +struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, + struct device *device, + struct resource *res); void hmm_devmem_remove(struct hmm_devmem *devmem); /* diff --git a/mm/hmm.c b/mm/hmm.c index b31d56662202..bdb49b836bf2 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -839,7 +839,11 @@ static void hmm_devmem_release(struct device *dev, void *data) zone = page_zone(page); mem_hotplug_begin(); - __remove_pages(zone, start_pfn, npages); + if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) + __remove_pages(zone, start_pfn, npages); + else + arch_remove_memory(start_pfn << PAGE_SHIFT, + npages << PAGE_SHIFT); mem_hotplug_done(); hmm_devmem_radix_release(resource); @@ -875,7 +879,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) if (is_ram == REGION_INTERSECTS) return -ENXIO; - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) + devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; + else + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + devmem->pagemap.res = devmem->resource; devmem->pagemap.page_fault = hmm_devmem_fault; devmem->pagemap.page_free = hmm_devmem_free; @@ -920,9 +928,15 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) * over the device memory is un-accessible thus we do not want to * create a linear mapping for the memory like arch_add_memory() * would do. + * + * For device public memory, which is accesible by the CPU, we do + * want the linear mapping and thus use arch_add_memory(). 
*/ - ret = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, false); + if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) + ret = arch_add_memory(nid, align_start, align_size, false); + else + ret = add_pages(nid, align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, false); if (ret) { mem_hotplug_done(); goto error_add_memory; @@ -1069,6 +1083,67 @@ error_percpu_ref: } EXPORT_SYMBOL(hmm_devmem_add); +struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, + struct device *device, + struct resource *res) +{ + struct hmm_devmem *devmem; + int ret; + + if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) + return ERR_PTR(-EINVAL); + + static_branch_enable(&device_private_key); + + devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), + GFP_KERNEL, dev_to_node(device)); + if (!devmem) + return ERR_PTR(-ENOMEM); + + init_completion(&devmem->completion); + devmem->pfn_first = -1UL; + devmem->pfn_last = -1UL; + devmem->resource = res; + devmem->device = device; + devmem->ops = ops; + + ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, + 0, GFP_KERNEL); + if (ret) + goto error_percpu_ref; + + ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); + if (ret) + goto error_devm_add_action; + + + devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; + devmem->pfn_last = devmem->pfn_first + + (resource_size(devmem->resource) >> PAGE_SHIFT); + + ret = hmm_devmem_pages_create(devmem); + if (ret) + goto error_devm_add_action; + + devres_add(device, devmem); + + ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); + if (ret) { + hmm_devmem_remove(devmem); + return ERR_PTR(ret); + } + + return devmem; + +error_devm_add_action: + hmm_devmem_ref_kill(&devmem->ref); + hmm_devmem_ref_exit(&devmem->ref); +error_percpu_ref: + devres_free(devmem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(hmm_devmem_add_resource); + /* * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) * @@ -1082,6 +1157,7 @@ void hmm_devmem_remove(struct hmm_devmem *devmem) { resource_size_t start, size; struct device *device; + bool cdm = false; if (!devmem) return; @@ -1090,11 +1166,13 @@ void hmm_devmem_remove(struct hmm_devmem *devmem) start = devmem->resource->start; size = resource_size(devmem->resource); + cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; hmm_devmem_ref_kill(&devmem->ref); hmm_devmem_ref_exit(&devmem->ref); hmm_devmem_pages_remove(devmem); - devm_release_mem_region(device, start, size); + if (!cdm) + devm_release_mem_region(device, start, size); } EXPORT_SYMBOL(hmm_devmem_remove); -- cgit v1.2.3 From 6b368cd4a44ce95b33f1d31f2f932e6ae707f319 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:32 -0700 Subject: mm/hmm: avoid bloating arch that do not make use of HMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves all new code including new page migration helper behind kernel Kconfig option so that there is no codee bloat for arch or user that do not want to use HMM or any of its associated features. 
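The mechanics throughout are the usual kernel compile-out pattern: a real
declaration when the Kconfig option is enabled, and a static inline stub
otherwise, so call sites build unchanged either way. A minimal sketch of
that pattern, using a made-up CONFIG_EXAMPLE_FEATURE rather than any real
option:

#if defined(CONFIG_EXAMPLE_FEATURE)
int example_feature(struct page *page);
#else
static inline int example_feature(struct page *page)
{
	return -EINVAL;	/* feature compiled out, mirror the real error */
}
#endif /* CONFIG_EXAMPLE_FEATURE */
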
arm allyesconfig (without all the patchset, then with and this patch): text data bss dec hex filename 83721896 46511131 27582964 157815991 96814b7 ../without/vmlinux 83722364 46511131 27582964 157816459 968168b vmlinux [jglisse@redhat.com: struct hmm is only use by HMM mirror functionality] Link: http://lkml.kernel.org/r/20170825213133.27286-1-jglisse@redhat.com [sfr@canb.auug.org.au: fix build (arm multi_v7_defconfig)] Link: http://lkml.kernel.org/r/20170828181849.323ab81b@canb.auug.org.au Link: http://lkml.kernel.org/r/20170818032858.7447-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Stephen Rothwell Cc: Dan Williams Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 9 ++++++--- include/linux/memremap.h | 22 +++++++--------------- include/linux/migrate.h | 13 +++++++++++++ include/linux/mm.h | 26 +++++++++++++++++--------- mm/Kconfig | 4 ++++ mm/Makefile | 3 ++- mm/hmm.c | 7 +++---- mm/migrate.c | 2 ++ 8 files changed, 54 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 8385e75356ca..28e14345bd8d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -501,18 +501,21 @@ void hmm_device_put(struct hmm_device *hmm_device); /* Below are for HMM internal use only! Not to be used by device driver! */ +#if IS_ENABLED(CONFIG_HMM_MIRROR) void hmm_mm_destroy(struct mm_struct *mm); static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } +#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ +static inline void hmm_mm_destroy(struct mm_struct *mm) {} +static inline void hmm_mm_init(struct mm_struct *mm) {} +#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#else /* IS_ENABLED(CONFIG_HMM) */ -/* Below are for HMM internal use only! Not to be used by device driver! 
*/ +#else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} - #endif /* IS_ENABLED(CONFIG_HMM) */ #endif /* LINUX_HMM_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8ee1c73ad2d..79f8ba7c3894 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -138,18 +138,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct dev_pagemap *find_dev_pagemap(resource_size_t phys); static inline bool is_zone_device_page(const struct page *page); - -static inline bool is_device_private_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool is_device_public_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PUBLIC; -} #else static inline void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, @@ -168,17 +156,21 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { return NULL; } +#endif +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) static inline bool is_device_private_page(const struct page *page) { - return false; + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_device_public_page(const struct page *page) { - return false; + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PUBLIC; } -#endif +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn diff --git a/include/linux/migrate.h b/include/linux/migrate.h index d4e6d12a0b40..643c7ae7d7b4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -265,6 +265,7 @@ struct migrate_vma_ops { void *private); }; +#if defined(CONFIG_MIGRATE_VMA_HELPER) int migrate_vma(const struct migrate_vma_ops *ops, struct vm_area_struct *vma, unsigned long start, @@ -272,6 +273,18 @@ int migrate_vma(const struct migrate_vma_ops *ops, unsigned long *src, unsigned long *dst, void *private); +#else +static inline int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + return -EINVAL; +} +#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */ #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/mm.h b/include/linux/mm.h index de66a1127db4..5195e272fc4a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -800,18 +800,27 @@ static inline bool is_zone_device_page(const struct page *page) } #endif -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) void put_zone_device_private_or_public_page(struct page *page); -#else +DECLARE_STATIC_KEY_FALSE(device_private_key); +#define IS_HMM_ENABLED static_branch_unlikely(&device_private_key) +static inline bool is_device_private_page(const struct page *page); +static inline bool is_device_public_page(const struct page *page); +#else /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ static inline void put_zone_device_private_or_public_page(struct page *page) { } +#define IS_HMM_ENABLED 0 +static inline bool is_device_private_page(const struct page *page) +{ + return false; +} +static inline bool is_device_public_page(const struct page *page) +{ + return 
false; +} #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -static inline bool is_device_private_page(const struct page *page); -static inline bool is_device_public_page(const struct page *page); - -DECLARE_STATIC_KEY_FALSE(device_private_key); static inline void get_page(struct page *page) { @@ -834,9 +843,8 @@ static inline void put_page(struct page *page) * free and we need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (static_branch_unlikely(&device_private_key) && - unlikely(is_device_private_page(page) || - is_device_public_page(page))) { + if (IS_HMM_ENABLED && unlikely(is_device_private_page(page) || + unlikely(is_device_public_page(page)))) { put_zone_device_private_or_public_page(page); return; } diff --git a/mm/Kconfig b/mm/Kconfig index 7bea16697d87..9c4bdddd80c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -702,8 +702,12 @@ config ARCH_HAS_HMM depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP +config MIGRATE_VMA_HELPER + bool + config HMM bool + select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" diff --git a/mm/Makefile b/mm/Makefile index 1cde2a8bed97..e3ac3aeb533b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o swap_slots.o \ interval_tree.o list_lru.o workingset.o \ - debug.o hmm.o $(mmu-y) + debug.o $(mmu-y) obj-y += init-mm.o @@ -104,3 +104,4 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o +obj-$(CONFIG_HMM) += hmm.o diff --git a/mm/hmm.c b/mm/hmm.c index bdb49b836bf2..a88a847bccba 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -35,15 +35,16 @@ #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) - +#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) /* * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h */ DEFINE_STATIC_KEY_FALSE(device_private_key); EXPORT_SYMBOL(device_private_key); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -#ifdef CONFIG_HMM +#if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; /* @@ -128,9 +129,7 @@ void hmm_mm_destroy(struct mm_struct *mm) { kfree(mm->hmm); } -#endif /* CONFIG_HMM */ -#if IS_ENABLED(CONFIG_HMM_MIRROR) static void hmm_invalidate_range(struct hmm *hmm, enum hmm_update_type action, unsigned long start, diff --git a/mm/migrate.c b/mm/migrate.c index 618aeb5e9cde..6954c1435833 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2127,6 +2127,7 @@ out_unlock: #endif /* CONFIG_NUMA */ +#if defined(CONFIG_MIGRATE_VMA_HELPER) struct migrate_vma { struct vm_area_struct *vma; unsigned long *dst; @@ -2980,3 +2981,4 @@ int migrate_vma(const struct migrate_vma_ops *ops, return 0; } EXPORT_SYMBOL(migrate_vma); +#endif /* defined(MIGRATE_VMA_HELPER) */ -- cgit v1.2.3 From de540a9763efcd5b6339158ac2e5932fb3e691b9 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Fri, 8 Sep 2017 16:12:35 -0700 Subject: mm/hmm: fix build when HMM is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combinatorial Kconfig is painfull. Withi this patch all below combination build. 
1) 2) CONFIG_HMM_MIRROR=y 3) CONFIG_DEVICE_PRIVATE=y 4) CONFIG_DEVICE_PUBLIC=y 5) CONFIG_HMM_MIRROR=y CONFIG_DEVICE_PUBLIC=y 6) CONFIG_HMM_MIRROR=y CONFIG_DEVICE_PRIVATE=y 7) CONFIG_DEVICE_PRIVATE=y CONFIG_DEVICE_PUBLIC=y 8) CONFIG_HMM_MIRROR=y CONFIG_DEVICE_PRIVATE=y CONFIG_DEVICE_PUBLIC=y Link: http://lkml.kernel.org/r/20170826002149.20919-1-jglisse@redhat.com Reported-by: Randy Dunlap Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 28e14345bd8d..96e69979f84d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -498,7 +498,7 @@ struct hmm_device { struct hmm_device *hmm_device_new(void *drvdata); void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ - +#endif /* IS_ENABLED(CONFIG_HMM) */ /* Below are for HMM internal use only! Not to be used by device driver! */ #if IS_ENABLED(CONFIG_HMM_MIRROR) @@ -517,5 +517,4 @@ static inline void hmm_mm_init(struct mm_struct *mm) {} #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM) */ #endif /* LINUX_HMM_H */ -- cgit v1.2.3 From 3a321d2a3dde812142e06ab5c2f062ed860182a5 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Fri, 8 Sep 2017 16:12:48 -0700 Subject: mm: change the call sites of numa statistics items Patch series "Separate NUMA statistics from zone statistics", v2. Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. This significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (pointed out by Dave Hansen). A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017-JesperBrouer.pdf To mitigate this overhead, this patchset separates NUMA statistics from zone statistics framework, and update NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter (suggested by Ying Huang). The rationality is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 31.3% drop of CPU cycles(537-->369, see below) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Meanwhile, this patchset keeps the same style of virtual memory statistics with little end-user-visible effects (only move the numa stats to show behind zone page stats, see the first patch for details). I did an experiment of single page allocation and reclaim concurrently using Jesper's page_bench03 benchmark on a 2-Socket Broadwell-based server (88 processors with 126G memory) with different size of threshold of pcp counter. 
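What the threshold tunes is the standard per-cpu differential scheme: each
CPU bumps a small local s8 counter and folds it into the shared atomic only
once the threshold is crossed, so a larger threshold means cheaper updates
at the cost of staler reads. A simplified sketch of the idea, with
illustrative names and without the overstep refinement the kernel code
uses:

struct example_stat {
	atomic_long_t global;	/* shared; updates bounce cache lines */
	s8 __percpu *diff;	/* per-cpu differential, cheap to bump */
};

static inline void example_inc(struct example_stat *s, s8 threshold)
{
	s8 v = this_cpu_inc_return(*s->diff);

	if (unlikely(v > threshold)) {
		/* Fold the local count into the shared global counter */
		atomic_long_add(v, &s->global);
		this_cpu_write(*s->diff, 0);
	}
}
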
Benchmark provided by Jesper D Brouer(increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cycles Throughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default 256 468 412397590 512 428 450550704 4096 399 482520943 20000 394 489009617 30000 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics This patch (of 3): In this patch, NUMA statistics is separated from zone statistics framework, all the call sites of NUMA stats are changed to use numa-stats-specific functions, it does not have any functionality change except that the number of NUMA stats is shown behind zone page stats when users *read* the zone info. E.g. cat /proc/zoneinfo ***Base*** ***With this patch*** nr_free_pages 3976 nr_free_pages 3976 nr_zone_inactive_anon 0 nr_zone_inactive_anon 0 nr_zone_active_anon 0 nr_zone_active_anon 0 nr_zone_inactive_file 0 nr_zone_inactive_file 0 nr_zone_active_file 0 nr_zone_active_file 0 nr_zone_unevictable 0 nr_zone_unevictable 0 nr_zone_write_pending 0 nr_zone_write_pending 0 nr_mlock 0 nr_mlock 0 nr_page_table_pages 0 nr_page_table_pages 0 nr_kernel_stack 0 nr_kernel_stack 0 nr_bounce 0 nr_bounce 0 nr_zspages 0 nr_zspages 0 numa_hit 0 *nr_free_cma 0* numa_miss 0 numa_hit 0 numa_foreign 0 numa_miss 0 numa_interleave 0 numa_foreign 0 numa_local 0 numa_interleave 0 numa_other 0 numa_local 0 *nr_free_cma 0* numa_other 0 ... ... vm stats threshold: 10 vm stats threshold: 10 ... ... The next patch updates the numa stats counter size and threshold. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1503568801-21305-2-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Reported-by: Jesper Dangaard Brouer Acked-by: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Christopher Lameter Cc: Dave Hansen Cc: Andi Kleen Cc: Ying Huang Cc: Aaron Lu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 22 ++++--- include/linux/mmzone.h | 25 +++++--- include/linux/vmstat.h | 29 ++++++++- mm/page_alloc.c | 10 +-- mm/vmstat.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 220 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/base/node.c b/drivers/base/node.c index d8dc83017d8d..3855902f2c5b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_node_page_state(dev->id, NUMA_HIT), - sum_zone_node_page_state(dev->id, NUMA_MISS), - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_node_page_state(dev->id, NUMA_LOCAL), - sum_zone_node_page_state(dev->id, NUMA_OTHER)); + sum_zone_numa_state(dev->id, NUMA_HIT), + sum_zone_numa_state(dev->id, NUMA_MISS), + sum_zone_numa_state(dev->id, NUMA_FOREIGN), + sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_numa_state(dev->id, NUMA_LOCAL), + sum_zone_numa_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) 
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + sum_zone_numa_state(nid, i)); +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + + NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e7e92c8f4883..e65d91c02e30 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,20 @@ struct zone_padding { #define ZONE_PADDING(name) #endif +#ifdef CONFIG_NUMA +enum numa_stat_item { + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ + NUMA_LOCAL, /* allocation from local node */ + NUMA_OTHER, /* allocation from other node */ + NR_VM_NUMA_STAT_ITEMS +}; +#else +#define NR_VM_NUMA_STAT_ITEMS 0 +#endif + enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, @@ -131,14 +145,6 @@ enum zone_stat_item { NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ -#endif -#ifdef CONFIG_NUMA - NUMA_HIT, /* allocated in intended node */ - NUMA_MISS, /* allocated in non intended node */ - NUMA_FOREIGN, /* was intended here, hit elsewhere */ - NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ - NUMA_LOCAL, /* allocation from local node */ - NUMA_OTHER, /* allocation from other node */ #endif NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; @@ -276,6 +282,8 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; + s8 numa_stat_threshold; + s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; @@ -496,6 +504,7 @@ struct zone { ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; + atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 97e11ab573f0..9ac82e29948f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -107,8 +107,33 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with per cpu differentials. 
*/ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; +extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; +#ifdef CONFIG_NUMA +static inline void zone_numa_state_add(long x, struct zone *zone, + enum numa_stat_item item) +{ + atomic_long_add(x, &zone->vm_numa_stat[item]); + atomic_long_add(x, &vm_numa_stat[item]); +} + +static inline unsigned long global_numa_state(enum numa_stat_item item) +{ + long x = atomic_long_read(&vm_numa_stat[item]); + + return x; +} + +static inline unsigned long zone_numa_state(struct zone *zone, + enum numa_stat_item item) +{ + long x = atomic_long_read(&zone->vm_numa_stat[item]); + + return x; +} +#endif /* CONFIG_NUMA */ + static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { @@ -194,8 +219,10 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, #ifdef CONFIG_NUMA +extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, - enum zone_stat_item item); + enum zone_stat_item item); +extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); #else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9add06fe768..45583cd8dd56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order) static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA - enum zone_stat_item local_stat = NUMA_LOCAL; + enum numa_stat_item local_stat = NUMA_LOCAL; if (z->node != numa_node_id()) local_stat = NUMA_OTHER; if (z->node == preferred_zone->node) - __inc_zone_state(z, NUMA_HIT); + __inc_numa_state(z, NUMA_HIT); else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); + __inc_numa_state(z, NUMA_MISS); + __inc_numa_state(preferred_zone, NUMA_FOREIGN); } - __inc_zone_state(z, local_stat); + __inc_numa_state(z, local_stat); #endif } diff --git a/mm/vmstat.c b/mm/vmstat.c index c7e4b8458023..daea02833e2e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -87,8 +87,10 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -192,7 +194,10 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; - +#ifdef CONFIG_NUMA + per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold + = threshold; +#endif /* Base nodestat threshold on the largest populated zone. */ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -226,9 +231,14 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; +#ifdef CONFIG_NUMA + per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold + = threshold; +#endif + } } } @@ -604,6 +614,32 @@ EXPORT_SYMBOL(dec_node_page_state); * Fold a differential into the global counters. 
* Returns the number of counters updated. */ +#ifdef CONFIG_NUMA +static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) +{ + int i; + int changes = 0; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (numa_diff[i]) { + atomic_long_add(numa_diff[i], &vm_numa_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); + changes++; + } + return changes; +} +#else static int fold_diff(int *zone_diff, int *node_diff) { int i; @@ -622,6 +658,7 @@ static int fold_diff(int *zone_diff, int *node_diff) } return changes; } +#endif /* CONFIG_NUMA */ /* * Update the zone counters for the current cpu. @@ -645,6 +682,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; @@ -666,6 +706,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } #ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); + if (v) { + + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + __this_cpu_write(p->expire, 3); + } + } + if (do_pagesets) { cond_resched(); /* @@ -712,7 +764,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } +#ifdef CONFIG_NUMA + changes += fold_diff(global_zone_diff, global_numa_diff, + global_node_diff); +#else changes += fold_diff(global_zone_diff, global_node_diff); +#endif return changes; } @@ -727,6 +784,9 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; +#ifdef CONFIG_NUMA + int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; +#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { @@ -743,6 +803,18 @@ void cpu_vm_stats_fold(int cpu) atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (p->vm_numa_stat_diff[i]) { + int v; + + v = p->vm_numa_stat_diff[i]; + p->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + global_numa_diff[i] += v; + } +#endif } for_each_online_pgdat(pgdat) { @@ -761,7 +833,11 @@ void cpu_vm_stats_fold(int cpu) } } +#ifdef CONFIG_NUMA + fold_diff(global_zone_diff, global_numa_diff, global_node_diff); +#else fold_diff(global_zone_diff, global_node_diff); +#endif } /* @@ -779,10 +855,38 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]); } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + if (pset->vm_numa_stat_diff[i]) { + int v = pset->vm_numa_stat_diff[i]; + + pset->vm_numa_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_numa_stat[i]); + atomic_long_add(v, &vm_numa_stat[i]); + } +#endif } #endif #ifdef CONFIG_NUMA +void __inc_numa_state(struct zone *zone, + enum numa_stat_item item) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_numa_stat_diff + item; + s8 v, t; + + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->numa_stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; + + zone_numa_state_add(v + overstep, zone, 
item); + __this_cpu_write(*p, -overstep); + } +} + /* * Determine the per node value of a stat item. This function * is called frequently in a NUMA machine, so try to be as @@ -801,6 +905,19 @@ unsigned long sum_zone_node_page_state(int node, return count; } +unsigned long sum_zone_numa_state(int node, + enum numa_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_numa_state(zones + i, item); + + return count; +} + /* * Determine the per node value of a stat item. */ @@ -937,6 +1054,9 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif + "nr_free_cma", + + /* enum numa_stat_item counters */ #ifdef CONFIG_NUMA "numa_hit", "numa_miss", @@ -945,7 +1065,6 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif - "nr_free_cma", /* Node-based counters */ "nr_inactive_anon", @@ -1106,7 +1225,6 @@ const char * const vmstat_text[] = { }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ - #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ defined(CONFIG_PROC_FS) static void *frag_start(struct seq_file *m, loff_t *pos) @@ -1384,7 +1502,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { seq_printf(m, "\n %-12s %lu", - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + + NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); } } @@ -1421,6 +1540,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n %-12s %lu", vmstat_text[i], zone_page_state(zone, i)); +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], + zone_numa_state(zone, i)); +#endif + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1497,6 +1623,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) + NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); @@ -1512,6 +1639,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + v[i] = global_numa_state(i); + v += NR_VM_NUMA_STAT_ITEMS; +#endif + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) v[i] = global_node_page_state(i); v += NR_VM_NODE_STAT_ITEMS; @@ -1613,6 +1746,16 @@ int vmstat_refresh(struct ctl_table *table, int write, err = -EINVAL; } } +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_numa_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val); + err = -EINVAL; + } + } +#endif if (err) return err; if (write) @@ -1654,13 +1797,19 @@ static bool need_update(int cpu) struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); +#ifdef CONFIG_NUMA + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1); +#endif /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. 
*/ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) return true; - +#ifdef CONFIG_NUMA + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + return true; +#endif } return false; } -- cgit v1.2.3 From 1d90ca897cb05cf38bd62f36756d219e02913b7d Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Fri, 8 Sep 2017 16:12:52 -0700 Subject: mm: update NUMA counter threshold size There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) being updated in parallel in multi-threaded page allocation (suggested by Dave Hansen). This patch updates the NUMA counter threshold to a fixed size of U16_MAX - 2, as a small threshold greatly increases the update frequency of the global counter from the local per cpu counter (suggested by Ying Huang). The rationale is that these statistics counters don't affect the kernel's decisions, unlike other VM counters, so it's not a problem to use a large threshold. With this patchset, we see a 31.3% drop of CPU cycles (537 --> 369) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Brouer (increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

Threshold   CPU cycles     Throughput (88 threads)
    32         799         241760478
    64         640         301628829
   125         537         358906028   <==> system by default (base)
   256         468         412397590
   512         428         450550704
  4096         399         482520943
 20000         394         489009617
 30000         395         488017817
 65533         369 (-31.3%)   521661345 (+45.3%)   <==> with this patchset
   N/A         342 (-36.3%)   562900157 (+56.8%)   <==> disable zone_statistics

Link: http://lkml.kernel.org/r/1503568801-21305-3-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Mel Gorman Cc: Aaron Lu Cc: Andi Kleen Cc: Christopher Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 +-- mm/vmstat.c | 28 ++++++++++------------------ 2 files changed, 11 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e65d91c02e30..356a814e7c8e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -282,8 +282,7 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; diff --git a/mm/vmstat.c b/mm/vmstat.c index daea02833e2e..153d8129c155 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif + /* Base nodestat threshold on the largest populated zone.
*/ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -874,16 +868,14 @@ void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->numa_stat_threshold); - if (unlikely(v > t)) { - s8 overstep = t >> 1; - zone_numa_state_add(v + overstep, zone, item); - __this_cpu_write(*p, -overstep); + if (unlikely(v > NUMA_STATS_THRESHOLD)) { + zone_numa_state_add(v, zone, item); + __this_cpu_write(*p, 0); } } @@ -1798,7 +1790,7 @@ static bool need_update(int cpu) BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); #ifdef CONFIG_NUMA - BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1); + BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif /* * The fast way of checking if there are any vmstat diffs. -- cgit v1.2.3 From 638032224ed762a29baca1fc37f1168efc2554ae Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Fri, 8 Sep 2017 16:12:55 -0700 Subject: mm: consider the number in local CPUs when reading NUMA stats To avoid deviation, the per cpu number of NUMA stats in vm_numa_stat_diff[] is included when a user *reads* the NUMA stats. Since NUMA stats are not read frequently by users, and the kernel does not need them to make decisions, it is not a problem to make the readers more expensive. Link: http://lkml.kernel.org/r/1503568801-21305-4-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Reported-by: Jesper Dangaard Brouer Acked-by: Mel Gorman Cc: Aaron Lu Cc: Andi Kleen Cc: Christopher Lameter Cc: Dave Hansen Cc: Johannes Weiner Cc: Michal Hocko Cc: Tim Chen Cc: Ying Huang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 6 +++++- mm/vmstat.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9ac82e29948f..ade7cb5f1359 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum numa_stat_item item) return x; } -static inline unsigned long zone_numa_state(struct zone *zone, +static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum numa_stat_item item) { long x = atomic_long_read(&zone->vm_numa_stat[item]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 153d8129c155..4bb13e72ac97 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -897,6 +897,10 @@ unsigned long sum_zone_node_page_state(int node, return count; } +/* + * Determine the per node value of a numa stat item. To avoid deviation, + * the per cpu stat number in vm_numa_stat_diff[] is also included.
+ */ unsigned long sum_zone_numa_state(int node, enum numa_stat_item item) { @@ -905,7 +909,7 @@ unsigned long sum_zone_numa_state(int node, unsigned long count = 0; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state(zones + i, item); + count += zone_numa_state_snapshot(zones + i, item); return count; } @@ -1536,7 +1540,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - zone_numa_state(zone, i)); + zone_numa_state_snapshot(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1792,6 +1796,7 @@ static bool need_update(int cpu) #ifdef CONFIG_NUMA BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif + /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. -- cgit v1.2.3 From 855d97657d4d335970622f0d95ca4966d3f77ee7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:13:38 -0700 Subject: proc: uninline proc_create() Save some code from ~320 invocations all clearing last argument. add/remove: 3/0 grow/shrink: 0/158 up/down: 45/-702 (-657) function old new delta proc_create - 17 +17 __ksymtab_proc_create - 16 +16 __kstrtab_proc_create - 12 +12 yam_init_driver 301 298 -3 ... cifs_proc_init 249 228 -21 via_fb_pci_probe 2304 2280 -24 Link: http://lkml.kernel.org/r/20170819094702.GA27864@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 8 ++++++++ include/linux/proc_fs.h | 8 +------- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e3cda0b5968f..85566abe6f83 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -499,6 +499,14 @@ out: } EXPORT_SYMBOL(proc_create_data); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops) +{ + return proc_create_data(name, mode, parent, proc_fops, NULL); +} +EXPORT_SYMBOL(proc_create); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 2d2bf592d9db..76124dd4e36d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -28,13 +28,7 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t, const struct file_operations *, void *); -static inline struct proc_dir_entry *proc_create( - const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops) -{ - return proc_create_data(name, mode, parent, proc_fops, NULL); -} - +struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); extern void *PDE_DATA(const struct inode *); -- cgit v1.2.3 From 604df322363e5770735df85368f83cac4a955a24 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 8 Sep 2017 16:13:45 -0700 Subject: linux/kernel.h: move DIV_ROUND_DOWN_ULL() macro This macro is useful to avoid link error on 32-bit systems. We have the same definition in two drivers, so move it to include/linux/kernel.h While we are here, refactor DIV_ROUND_UP_ULL() by using DIV_ROUND_DOWN_ULL(). 
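For reference, a minimal userspace sketch of the two macros' rounding semantics (illustrative only: a plain 64-bit '/' stands in for the kernel's do_div(), which exists so 32-bit builds avoid pulling in libgcc's 64-bit division helper):

	#include <stdint.h>
	#include <stdio.h>

	/* Userspace stand-ins; the kernel versions use do_div() instead of '/'. */
	#define DIV_ROUND_DOWN_ULL(ll, d) ((uint64_t)(ll) / (d))
	#define DIV_ROUND_UP_ULL(ll, d) DIV_ROUND_DOWN_ULL((uint64_t)(ll) + (d) - 1, (d))

	int main(void)
	{
		/* 10 / 4 rounds down to 2 and up to 3. */
		printf("%llu %llu\n",
		       (unsigned long long)DIV_ROUND_DOWN_ULL(10, 4),
		       (unsigned long long)DIV_ROUND_UP_ULL(10, 4));
		return 0;
	}

Defining the round-up variant in terms of the round-down one, as the patch does, keeps a single do_div() call site for both directions.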
Link: http://lkml.kernel.org/r/1500945156-12907-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Acked-by: Mark Brown Cc: Cyrille Pitchen Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Liam Girdwood Cc: Boris Brezillon Cc: Marek Vasut Cc: Brian Norris Cc: Richard Weinberger Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/nand/denali.c | 3 --- include/linux/kernel.h | 7 +++++-- sound/soc/codecs/pcm512x.c | 3 --- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/denali.c b/drivers/mtd/nand/denali.c index d723be352148..3087b0ba7b7f 100644 --- a/drivers/mtd/nand/denali.c +++ b/drivers/mtd/nand/denali.c @@ -980,9 +980,6 @@ static int denali_erase(struct mtd_info *mtd, int page) return irq_status & INTR__ERASE_COMP ? 0 : NAND_STATUS_FAIL; } -#define DIV_ROUND_DOWN_ULL(ll, d) \ - ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) - static int denali_setup_data_interface(struct mtd_info *mtd, int chipnr, const struct nand_data_interface *conf) { diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 6607225d0ea4..0ad4c3044cf9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -78,8 +78,11 @@ #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP -#define DIV_ROUND_UP_ULL(ll,d) \ - ({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; }) + +#define DIV_ROUND_DOWN_ULL(ll, d) \ + ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) + +#define DIV_ROUND_UP_ULL(ll, d) DIV_ROUND_DOWN_ULL((ll) + (d) - 1, (d)) #if BITS_PER_LONG == 32 # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP_ULL(ll, d) diff --git a/sound/soc/codecs/pcm512x.c b/sound/soc/codecs/pcm512x.c index f1005a31c709..68feae262476 100644 --- a/sound/soc/codecs/pcm512x.c +++ b/sound/soc/codecs/pcm512x.c @@ -30,9 +30,6 @@ #include "pcm512x.h" -#define DIV_ROUND_DOWN_ULL(ll, d) \ - ({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; }) - #define PCM512x_NUM_SUPPLIES 3 static const char * const pcm512x_supply_names[PCM512x_NUM_SUPPLIES] = { "AVDD", -- cgit v1.2.3 From 3b3c4babd898715926d24ae10aa64778ace33aae Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 8 Sep 2017 16:13:48 -0700 Subject: lib/string.c: add multibyte memset functions Patch series "Multibyte memset variations", v4. A relatively common idiom we're missing is a function to fill an area of memory with a pattern which is larger than a single byte. I first noticed this with a zram patch which wanted to fill a page with an 'unsigned long' value. There turn out to be quite a few places in the kernel which can benefit from using an optimised function rather than a loop; sometimes text size, sometimes speed, and sometimes both. The optimised PowerPC version (not included here) improves performance by about 30% on POWER8 on just the raw memset_l(). Most of the extra lines of code come from the three testcases I added. This patch (of 8): memset16(), memset32() and memset64() are like memset(), but allow the caller to fill the destination with a value larger than a single byte. memset_l() and memset_p() allow the caller to use unsigned long and pointer values respectively. Link: http://lkml.kernel.org/r/20170720184539.31609-2-willy@infradead.org Signed-off-by: Matthew Wilcox Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: "Martin K. 
Petersen" Cc: David Miller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Minchan Kim Cc: Ralf Baechle Cc: Richard Henderson Cc: Russell King Cc: Sam Ravnborg Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 30 +++++++++++++++++++++++ lib/string.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index a467e617eeb0..c8bdafffd2f0 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -99,6 +99,36 @@ extern __kernel_size_t strcspn(const char *,const char *); #ifndef __HAVE_ARCH_MEMSET extern void * memset(void *,int,__kernel_size_t); #endif + +#ifndef __HAVE_ARCH_MEMSET16 +extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); +#endif + +#ifndef __HAVE_ARCH_MEMSET32 +extern void *memset32(uint32_t *, uint32_t, __kernel_size_t); +#endif + +#ifndef __HAVE_ARCH_MEMSET64 +extern void *memset64(uint64_t *, uint64_t, __kernel_size_t); +#endif + +static inline void *memset_l(unsigned long *p, unsigned long v, + __kernel_size_t n) +{ + if (BITS_PER_LONG == 32) + return memset32((uint32_t *)p, v, n); + else + return memset64((uint64_t *)p, v, n); +} + +static inline void *memset_p(void **p, void *v, __kernel_size_t n) +{ + if (BITS_PER_LONG == 32) + return memset32((uint32_t *)p, (uintptr_t)v, n); + else + return memset64((uint64_t *)p, (uintptr_t)v, n); +} + #ifndef __HAVE_ARCH_MEMCPY extern void * memcpy(void *,const void *,__kernel_size_t); #endif diff --git a/lib/string.c b/lib/string.c index ebbb99c775bd..198148bb61fd 100644 --- a/lib/string.c +++ b/lib/string.c @@ -723,6 +723,72 @@ void memzero_explicit(void *s, size_t count) } EXPORT_SYMBOL(memzero_explicit); +#ifndef __HAVE_ARCH_MEMSET16 +/** + * memset16() - Fill a memory area with a uint16_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint16_t instead + * of a byte. Remember that @count is the number of uint16_ts to + * store, not the number of bytes. + */ +void *memset16(uint16_t *s, uint16_t v, size_t count) +{ + uint16_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset16); +#endif + +#ifndef __HAVE_ARCH_MEMSET32 +/** + * memset32() - Fill a memory area with a uint32_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint32_t instead + * of a byte. Remember that @count is the number of uint32_ts to + * store, not the number of bytes. + */ +void *memset32(uint32_t *s, uint32_t v, size_t count) +{ + uint32_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset32); +#endif + +#ifndef __HAVE_ARCH_MEMSET64 +/** + * memset64() - Fill a memory area with a uint64_t + * @s: Pointer to the start of the area. + * @v: The value to fill the area with + * @count: The number of values to store + * + * Differs from memset() in that it fills with a uint64_t instead + * of a byte. Remember that @count is the number of uint64_ts to + * store, not the number of bytes. 
+ */ +void *memset64(uint64_t *s, uint64_t v, size_t count) +{ + uint64_t *xs = s; + + while (count--) + *xs++ = v; + return s; +} +EXPORT_SYMBOL(memset64); +#endif + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another -- cgit v1.2.3 From ac036f9570a2d318b7d8dbbdbf0e269d7cc68cef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 8 Sep 2017 16:14:15 -0700 Subject: vga: optimise console scrolling Where possible, call memset16(), memmove() or memcpy() instead of using open-coded loops. I don't like the calling convention that uses a byte count instead of a count of u16s, but it's a little late to change that. Reduces code size of fbcon.o by almost 400 bytes on my laptop build. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170720184539.31609-9-willy@infradead.org Signed-off-by: Matthew Wilcox Cc: Ralf Baechle Cc: David Miller Cc: Sam Ravnborg Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Minchan Kim Cc: Richard Henderson Cc: Russell King Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/vga.h | 7 +++++++ arch/powerpc/include/asm/vga.h | 8 ++++++++ arch/sparc/include/asm/vga.h | 25 +++++++++++++++++++++++++ include/linux/vt_buffer.h | 13 +++++++++++++ 4 files changed, 53 insertions(+) (limited to 'include') diff --git a/arch/mips/include/asm/vga.h b/arch/mips/include/asm/vga.h index f82c83749a08..975ff51f80c4 100644 --- a/arch/mips/include/asm/vga.h +++ b/arch/mips/include/asm/vga.h @@ -6,6 +6,7 @@ #ifndef _ASM_VGA_H #define _ASM_VGA_H +#include #include #include @@ -40,9 +41,15 @@ static inline u16 scr_readw(volatile const u16 *addr) return le16_to_cpu(*addr); } +static inline void scr_memsetw(u16 *s, u16 v, unsigned int count) +{ + memset16(s, cpu_to_le16(v), count / 2); +} + #define scr_memcpyw(d, s, c) memcpy(d, s, c) #define scr_memmovew(d, s, c) memmove(d, s, c) #define VT_BUF_HAVE_MEMCPYW #define VT_BUF_HAVE_MEMMOVEW +#define VT_BUF_HAVE_MEMSETW #endif /* _ASM_VGA_H */ diff --git a/arch/powerpc/include/asm/vga.h b/arch/powerpc/include/asm/vga.h index ab3acd2f2786..7a7b541b7493 100644 --- a/arch/powerpc/include/asm/vga.h +++ b/arch/powerpc/include/asm/vga.h @@ -33,8 +33,16 @@ static inline u16 scr_readw(volatile const u16 *addr) return le16_to_cpu(*addr); } +#define VT_BUF_HAVE_MEMSETW +static inline void scr_memsetw(u16 *s, u16 v, unsigned int n) +{ + memset16(s, cpu_to_le16(v), n / 2); +} + #define VT_BUF_HAVE_MEMCPYW +#define VT_BUF_HAVE_MEMMOVEW #define scr_memcpyw memcpy +#define scr_memmovew memmove #endif /* !CONFIG_VGA_CONSOLE && !CONFIG_MDA_CONSOLE */ diff --git a/arch/sparc/include/asm/vga.h b/arch/sparc/include/asm/vga.h index ec0e9967d93d..f54e8b6fb197 100644 --- a/arch/sparc/include/asm/vga.h +++ b/arch/sparc/include/asm/vga.h @@ -8,9 +8,13 @@ #define _LINUX_ASM_VGA_H_ #include +#include #include #define VT_BUF_HAVE_RW +#define VT_BUF_HAVE_MEMSETW +#define VT_BUF_HAVE_MEMCPYW +#define VT_BUF_HAVE_MEMMOVEW #undef scr_writew #undef scr_readw @@ -29,6 +33,27 @@ static inline u16 scr_readw(const u16 *addr) return *addr; } +static inline void scr_memsetw(u16 *p, u16 v, unsigned int n) +{ + BUG_ON((long) p >= 0); + + memset16(p, cpu_to_le16(v), n / 2); +} + +static inline void scr_memcpyw(u16 *d, u16 *s, unsigned int n) +{ + BUG_ON((long) d >= 0); + + memcpy(d, s, n); +} + +static inline void scr_memmovew(u16 *d, u16 *s, unsigned int n) +{ + 
BUG_ON((long) d >= 0); + + memmove(d, s, n); +} + #define VGA_MAP_MEM(x,s) (x) #endif diff --git a/include/linux/vt_buffer.h b/include/linux/vt_buffer.h index f38c10ba3ff5..30b6e0d2a942 100644 --- a/include/linux/vt_buffer.h +++ b/include/linux/vt_buffer.h @@ -13,6 +13,7 @@ #ifndef _LINUX_VT_BUFFER_H_ #define _LINUX_VT_BUFFER_H_ +#include #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_MDA_CONSOLE) #include @@ -26,24 +27,33 @@ #ifndef VT_BUF_HAVE_MEMSETW static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) { +#ifdef VT_BUF_HAVE_RW count /= 2; while (count--) scr_writew(c, s++); +#else + memset16(s, c, count / 2); +#endif } #endif #ifndef VT_BUF_HAVE_MEMCPYW static inline void scr_memcpyw(u16 *d, const u16 *s, unsigned int count) { +#ifdef VT_BUF_HAVE_RW count /= 2; while (count--) scr_writew(scr_readw(s++), d++); +#else + memcpy(d, s, count); +#endif } #endif #ifndef VT_BUF_HAVE_MEMMOVEW static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) { +#ifdef VT_BUF_HAVE_RW if (d < s) scr_memcpyw(d, s, count); else { @@ -53,6 +63,9 @@ static inline void scr_memmovew(u16 *d, const u16 *s, unsigned int count) while (count--) scr_writew(scr_readw(--s), --d); } +#else + memmove(d, s, count); +#endif } #endif -- cgit v1.2.3 From 9b130ad5bb8255ee8534d92d67e12b2a4887eacb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:14:18 -0700 Subject: treewide: make "nr_cpu_ids" unsigned First, the number of CPUs can't be a negative number. Second, different signedness leads to suboptimal code in the following cases: 1) kmalloc(nr_cpu_ids * sizeof(X)); "int" has to be sign extended to size_t. 2) while (loff_t *pos < nr_cpu_ids) MOVSXD is 1 byte longer than the same MOV. Other cases exist as well. Basically the compiler is told that nr_cpu_ids can't be negative, which can't be deduced if it is "int". Code savings on allyesconfig kernel: -3KB add/remove: 0/0 grow/shrink: 25/264 up/down: 261/-3631 (-3370) function old new delta coretemp_cpu_online 450 512 +62 rcu_init_one 1234 1272 +38 pci_device_probe 374 399 +25 ...
pgdat_reclaimable_pages 628 556 -72 select_fallback_rq 446 369 -77 task_numa_find_cpu 1923 1807 -116 Link: http://lkml.kernel.org/r/20170819114959.GA30580@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/smp.c | 2 +- arch/powerpc/kernel/paca.c | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/sysdev/xive/native.c | 4 ++-- arch/tile/kernel/setup.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- drivers/base/cpu.c | 4 ++-- drivers/scsi/scsi_debug.c | 2 +- include/linux/cpumask.h | 6 +++--- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/topology.c | 2 +- kernel/smp.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- mm/slub.c | 2 +- 17 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ffe089942ac4..9f7195a5773e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -690,7 +690,7 @@ void __init smp_init_cpus(void) acpi_parse_gic_cpu_interface, 0); if (cpu_count > nr_cpu_ids) - pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n", + pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n", cpu_count, nr_cpu_ids); if (!bootcpu_valid) { diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 70f073d6c3b2..2ff2b8a19f71 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -224,7 +224,7 @@ void __init allocate_pacas(void) paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); memset(paca, 0, paca_size); - printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", + printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n", paca_size, nr_cpu_ids, paca); allocate_lppacas(nr_cpu_ids, limit); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 7de73589d8e2..0ac741fae90e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -551,7 +551,7 @@ void __init smp_setup_cpu_maps(void) if (maxcpus > nr_cpu_ids) { printk(KERN_WARNING "Partition configured for %d cpus, " - "operating system maximum is %d.\n", + "operating system maximum is %u.\n", maxcpus, nr_cpu_ids); maxcpus = nr_cpu_ids; } else diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 44f3a25ca630..ebc244b08d67 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -511,13 +511,13 @@ static bool xive_parse_provisioning(struct device_node *np) static void xive_native_setup_pools(void) { /* Allocate a pool big enough */ - pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids); + pr_debug("XIVE: Allocating VP block for pool size %u\n", nr_cpu_ids); xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n"); - pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n", + pr_debug("XIVE: Pool VPs allocated at 0x%x for %u max CPUs\n", xive_pool_vps, nr_cpu_ids); } diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index 443a70bccc1c..6becb96c60a0 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -1200,7 +1200,7 @@ static void __init validate_hv(void) * We use a struct cpumask for this, so it must be big enough. 
*/ if ((smp_height * smp_width) > nr_cpu_ids) - early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %d\n", + early_panic("Hypervisor %d x %d grid too big for Linux NR_CPUS %u\n", smp_height, smp_width, nr_cpu_ids); #endif diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 7834f73efbf1..8315e2f517a7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2097,7 +2097,7 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " "Processor %d/0x%x and the rest are ignored.\n", nr_cpu_ids, nr_logical_cpuids, apicid); return -EINVAL; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 6e8fcb6f7e1e..28dafed6c682 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -168,7 +168,7 @@ void __init setup_per_cpu_areas(void) unsigned long delta; int rc; - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 54b9e89d4d6b..cd6622c3204e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1461,7 +1461,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 2c3b359b3536..321cd7b4d817 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -256,9 +256,9 @@ static ssize_t print_cpus_offline(struct device *dev, buf[n++] = ','; if (nr_cpu_ids == total_cpus-1) - n += snprintf(&buf[n], len - n, "%d", nr_cpu_ids); + n += snprintf(&buf[n], len - n, "%u", nr_cpu_ids); else - n += snprintf(&buf[n], len - n, "%d-%d", + n += snprintf(&buf[n], len - n, "%u-%d", nr_cpu_ids, total_cpus-1); } diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 77a0335eb757..09ba494f8896 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -5465,7 +5465,7 @@ static int sdebug_driver_probe(struct device * dev) return error; } if (submit_queues > nr_cpu_ids) { - pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%d\n", + pr_warn("%s: trim submit_queues (was %d) to nr_cpu_ids=%u\n", my_name, submit_queues, nr_cpu_ids); submit_queues = nr_cpu_ids; } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4bf4479a3a80..68c5a8290275 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -32,15 +32,15 @@ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 -#define nr_cpu_ids 1 +#define nr_cpu_ids 1U #else -extern int nr_cpu_ids; +extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. 
*/ -#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) +#define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/mm/slub.c b/mm/slub.c index ddb04576b342..d39a5d3834b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4232,7 +4232,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); -- cgit v1.2.3 From e9ef073a0796e46c24f037237291efe56fc28ad9 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 8 Sep 2017 16:14:29 -0700 Subject: include: warn for inconsistent endian config definition We 
have seen some generic code use the config parameter CONFIG_CPU_BIG_ENDIAN to decide the endianness. Here are a few examples: include/asm-generic/qrwlock.h drivers/of/base.c drivers/of/fdt.c drivers/tty/serial/earlycon.c drivers/tty/serial/serial_core.c Display a warning if CPU_BIG_ENDIAN is not defined on a big endian architecture, and also warn if it is defined on little endian architectures. Here is our original discussion https://lkml.org/lkml/2017/5/24/620 Link: http://lkml.kernel.org/r/1499358861-179979-4-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Suggested-by: Arnd Bergmann Acked-by: Geert Uytterhoeven Cc: "James E.J. Bottomley" Cc: Alexander Viro Cc: David S. Miller Cc: Greg KH Cc: Helge Deller Cc: Ingo Molnar Cc: Jonas Bonn Cc: Max Filippov Cc: Michael Ellerman (powerpc) Cc: Michal Simek Cc: Peter Zijlstra Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/byteorder/big_endian.h | 4 ++++ include/linux/byteorder/little_endian.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h index 392041475c72..ffd215988392 100644 --- a/include/linux/byteorder/big_endian.h +++ b/include/linux/byteorder/big_endian.h @@ -3,5 +3,9 @@ #include +#ifndef CONFIG_CPU_BIG_ENDIAN +#warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN +#endif + #include #endif /* _LINUX_BYTEORDER_BIG_ENDIAN_H */ diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h index 08057377aa23..ba910bb9aad0 100644 --- a/include/linux/byteorder/little_endian.h +++ b/include/linux/byteorder/little_endian.h @@ -3,5 +3,9 @@ #include +#ifdef CONFIG_CPU_BIG_ENDIAN +#warning inconsistent configuration, CONFIG_CPU_BIG_ENDIAN is set +#endif + #include #endif /* _LINUX_BYTEORDER_LITTLE_ENDIAN_H */ -- cgit v1.2.3 From c32ee3d9abd284b4fcaacc250b101f93829c7bae Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 8 Sep 2017 16:14:33 -0700 Subject: bitops: avoid integer overflow in GENMASK(_ULL) GENMASK(_ULL) performs a left-shift of ~0UL(L), which technically results in an integer overflow. clang raises a warning if the overflow occurs in a preprocessor expression. Clear the low-order bits through a subtraction instead of the left-shift to avoid the overflow. (akpm: no change in .text size in my testing) Link: http://lkml.kernel.org/r/20170803212020.24939-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a83c822c35c2..8fbe259b197c 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -19,10 +19,11 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
*/ #define GENMASK(h, l) \ - (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) #define GENMASK_ULL(h, l) \ - (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + (((~0ULL) - (1ULL << (l)) + 1) & \ + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); -- cgit v1.2.3 From cd9e61ed1eebbcd5dfad59475d41ec58d9b64b6a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:14:36 -0700 Subject: rbtree: cache leftmost node internally Patch series "rbtree: Cache leftmost node internally", v4. A series extending rbtrees to internally cache the leftmost node such that we can have fast overlap check optimization for all interval tree users[1]. The benefits of this series are that: (i) Unify users that do internal leftmost node caching. (ii) Optimize all interval tree users. (iii) Convert at least two new users (epoll and procfs) to the new interface. This patch (of 16): Red-black tree semantics imply that nodes with smaller or greater (or equal for duplicates) keys always sit to the left and right, respectively. For the kernel this is extremely evident when considering our rb_first() semantics. Enabling lookups for the smallest node in the tree in O(1) can save a good chunk of cycles in not having to walk down the tree each time. To this end there are a few core users that explicitly do this, such as the scheduler and rtmutexes. There is also the desire for interval trees to have this optimization allowing faster overlap checking. This patch introduces a new 'struct rb_root_cached' which is just the root with a cached pointer to the leftmost node. The reason why the regular rb_root was not extended instead of adding a new structure was that this allows the user to have the choice between memory footprint and actual tree performance. The new wrappers on top of the regular rb_root calls are: - rb_first_cached(cached_root) -- which is a fast replacement for rb_first. - rb_insert_color_cached(node, cached_root, new) - rb_erase_cached(node, cached_root) In addition, augmented cached interfaces are also added for basic insertion and deletion operations; which becomes important for the interval tree changes. With the exception of the inserts, which add a bool for updating the new leftmost, the interfaces are kept the same. To this end, porting rb users to the cached version becomes really trivial, and keeping current rbtree semantics for users that don't care about the optimization requires zero overhead.
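For illustration, a minimal sketch of the new cached interface with a hypothetical 'struct mynode' keyed by an unsigned long (only the helpers introduced by this patch are assumed):

	struct mynode {
		struct rb_node node;
		unsigned long key;
	};

	static struct rb_root_cached mytree = RB_ROOT_CACHED;

	static void mynode_insert(struct mynode *new)
	{
		struct rb_node **link = &mytree.rb_root.rb_node;
		struct rb_node *parent = NULL;
		bool leftmost = true;

		while (*link) {
			struct mynode *entry = rb_entry(*link, struct mynode, node);

			parent = *link;
			if (new->key < entry->key) {
				link = &parent->rb_left;
			} else {
				/* Descending right means 'new' cannot be the leftmost. */
				link = &parent->rb_right;
				leftmost = false;
			}
		}

		rb_link_node(&new->node, parent, link);
		rb_insert_color_cached(&new->node, &mytree, leftmost);
	}

After this, rb_first_cached(&mytree) returns the smallest node as a plain pointer fetch, and rb_erase_cached(&new->node, &mytree) keeps the cached leftmost coherent on removal.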
Link: http://lkml.kernel.org/r/20170719014603.19029-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Jan Kara Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rbtree.txt | 33 +++++++++++++++++++++++++++++++++ include/linux/rbtree.h | 21 +++++++++++++++++++++ include/linux/rbtree_augmented.h | 33 ++++++++++++++++++++++++++++++--- lib/rbtree.c | 34 +++++++++++++++++++++++++++++----- 4 files changed, 113 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt index b8a8c70b0188..c42a21b99046 100644 --- a/Documentation/rbtree.txt +++ b/Documentation/rbtree.txt @@ -193,6 +193,39 @@ Example:: for (node = rb_first(&mytree); node; node = rb_next(node)) printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); +Cached rbtrees +-------------- + +Computing the leftmost (smallest) node is quite a common task for binary +search trees, such as for traversals or users relying on a the particular +order for their own logic. To this end, users can use 'struct rb_root_cached' +to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding +potentially expensive tree iterations. This is done at negligible runtime +overhead for maintanence; albeit larger memory footprint. + +Similar to the rb_root structure, cached rbtrees are initialized to be +empty via: + + struct rb_root_cached mytree = RB_ROOT_CACHED; + +Cached rbtree is simply a regular rb_root with an extra pointer to cache the +leftmost node. This allows rb_root_cached to exist wherever rb_root does, +which permits augmented trees to be supported as well as only a few extra +interfaces: + + struct rb_node *rb_first_cached(struct rb_root_cached *tree); + void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); + void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); + +Both insert and erase calls have their respective counterpart of augmented +trees: + + void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, + bool, struct rb_augment_callbacks *); + void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, + struct rb_augment_callbacks *); + + Support for Augmented rbtrees ----------------------------- diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e585018498d5..d574361943ea 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -44,10 +44,25 @@ struct rb_root { struct rb_node *rb_node; }; +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. 
+ */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) @@ -69,6 +84,12 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +extern void rb_insert_color_cached(struct rb_node *, + struct rb_root_cached *, bool); +extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 9702b6e183bc..6bfd2b581f75 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -41,7 +41,9 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, +extern void __rb_insert_augmented(struct rb_node *node, + struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. @@ -57,7 +59,16 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, augment->rotate); + __rb_insert_augmented(node, root, false, NULL, augment->rotate); +} + +static inline void +rb_insert_augmented_cached(struct rb_node *node, + struct rb_root_cached *root, bool newleft, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, &root->rb_root, + newleft, &root->rb_leftmost, augment->rotate); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -150,6 +161,7 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, + struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; @@ -157,6 +169,9 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, struct rb_node *parent, *rebalance; unsigned long pc; + if (leftmost && node == *leftmost) + *leftmost = rb_next(node); + if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) 
@@ -256,9 +271,21 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, + NULL, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } +static __always_inline void +rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, + augment); + if (rebalance) + __rb_erase_color(rebalance, &root->rb_root, augment->rotate); +} + #endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/lib/rbtree.c b/lib/rbtree.c index 4ba2828a67c0..d102d9d2ffaa 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -95,10 +95,14 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + if (newleft) + *leftmost = node; + while (true) { /* * Loop invariant: node is red @@ -434,19 +438,38 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, dummy_rotate); + __rb_insert(node, root, false, NULL, dummy_rotate); } EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, + NULL, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } EXPORT_SYMBOL(rb_erase); +void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, bool leftmost) +{ + __rb_insert(node, &root->rb_root, leftmost, + &root->rb_leftmost, dummy_rotate); +} +EXPORT_SYMBOL(rb_insert_color_cached); + +void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); +} +EXPORT_SYMBOL(rb_erase_cached); + /* * Augmented rbtree manipulation functions. * @@ -455,9 +478,10 @@ EXPORT_SYMBOL(rb_erase); */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, augment_rotate); + __rb_insert(node, root, newleft, leftmost, augment_rotate); } EXPORT_SYMBOL(__rb_insert_augmented); @@ -502,7 +526,7 @@ struct rb_node *rb_next(const struct rb_node *node) * as we can. */ if (node->rb_right) { - node = node->rb_right; + node = node->rb_right; while (node->rb_left) node=node->rb_left; return (struct rb_node *)node; @@ -534,7 +558,7 @@ struct rb_node *rb_prev(const struct rb_node *node) * as we can. 
*/ if (node->rb_left) { - node = node->rb_left; + node = node->rb_left; while (node->rb_right) node=node->rb_right; return (struct rb_node *)node; -- cgit v1.2.3 From a23ba907d5e65d6aeea3e59c82fda9cd206a7aad Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:01 -0700 Subject: locking/rtmutex: replace top-waiter and pi_waiters leftmost caching ... with the generic rbtree flavor instead. No changes in semantics whatsoever. Link: http://lkml.kernel.org/r/20170719014603.19029-10-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 5 ++--- include/linux/rtmutex.h | 11 +++++------ include/linux/sched.h | 3 +-- kernel/fork.c | 3 +-- kernel/locking/rtmutex-debug.c | 2 +- kernel/locking/rtmutex.c | 35 +++++++++++------------------------ kernel/locking/rtmutex_common.h | 12 ++++++------ 7 files changed, 27 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 0e849715e5be..3c07ace5b431 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -175,9 +175,8 @@ extern struct cred init_cred; #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = RB_ROOT, \ - .pi_top_task = NULL, \ - .pi_waiters_leftmost = NULL, + .pi_waiters = RB_ROOT_CACHED, \ + .pi_top_task = NULL, #else # define INIT_RT_MUTEXES(tsk) #endif diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 44fd002f7cd5..53fcbe9de7fd 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -22,18 +22,17 @@ extern int max_lock_depth; /* for sysctl */ * The rt_mutex structure * * @wait_lock: spinlock to protect the structure - * @waiters: rbtree root to enqueue waiters in priority order - * @waiters_leftmost: top waiter + * @waiters: rbtree root to enqueue waiters in priority order; + * caches top-waiter (leftmost node). 
* @owner: the mutex owner */ struct rt_mutex { raw_spinlock_t wait_lock; - struct rb_root waiters; - struct rb_node *waiters_leftmost; + struct rb_root_cached waiters; struct task_struct *owner; #ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - const char *name, *file; + const char *name, *file; int line; void *magic; #endif @@ -84,7 +83,7 @@ do { \ #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ - , .waiters = RB_ROOT \ + , .waiters = RB_ROOT_CACHED \ , .owner = NULL \ __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} diff --git a/include/linux/sched.h b/include/linux/sched.h index 68b38335d33c..92fb8dd5a9e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -812,8 +812,7 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ - struct rb_root pi_waiters; - struct rb_node *pi_waiters_leftmost; + struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ diff --git a/kernel/fork.c b/kernel/fork.c index 2ccbbbfcb7b8..6f1b0af00bda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1462,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter 
*waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else -- cgit v1.2.3 From f808c13fd3738948e10196496959871130612b61 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:08 -0700 Subject: lib/interval_tree: fast overlap detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow interval trees to quickly check for overlaps to avoid unnecessary tree lookups in interval_tree_iter_first(). As of this patch, all interval tree flavors will require using a 'rb_root_cached' such that we can have the leftmost node easily available. While most users will make use of this feature, those with special functions (in addition to the generic insert, delete, search calls) will avoid using the cached option as they can do funky things with insertions -- for example, vma_interval_tree_insert_after().
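To make the converted API concrete, here is a minimal sketch of what a caller looks like after this patch; the names my_tree, my_add_range and my_range_busy are hypothetical, and only struct rb_root_cached, RB_ROOT_CACHED and the interval_tree calls come from this series:

	#include <linux/interval_tree.h>

	/* Callers now declare a cached root; RB_ROOT_CACHED initializes
	 * both the underlying rb_root and the rb_leftmost pointer. */
	static struct rb_root_cached my_tree = RB_ROOT_CACHED;

	static void my_add_range(struct interval_tree_node *node)
	{
		/* insertion maintains rb_leftmost, which iter_first()
		 * consults for the O(1) non-overlap fastpath */
		interval_tree_insert(node, &my_tree);
	}

	static bool my_range_busy(unsigned long start, unsigned long last)
	{
		/* both endpoints are inclusive, as elsewhere in the API */
		return interval_tree_iter_first(&my_tree, start, last) != NULL;
	}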
[jglisse@redhat.com: fix deadlock from typo vm_lock_anon_vma()] Link: http://lkml.kernel.org/r/20170808225719.20723-1-jglisse@redhat.com Link: http://lkml.kernel.org/r/20170719014603.19029-12-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Signed-off-by: Jérôme Glisse Acked-by: Christian König Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Ledford Acked-by: Michael S. Tsirkin Cc: David Airlie Cc: Jason Wang Cc: Christian Benvenuti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 7 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/drm_mm.c | 19 +++++---- drivers/gpu/drm/drm_vma_manager.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 6 +-- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 8 ++-- drivers/gpu/drm/radeon/radeon_vm.c | 7 ++-- drivers/infiniband/core/umem_rbtree.c | 4 +- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/hw/hfi1/mmu_rb.c | 10 ++--- drivers/infiniband/hw/usnic/usnic_uiom.c | 6 +-- drivers/infiniband/hw/usnic/usnic_uiom.h | 2 +- .../infiniband/hw/usnic/usnic_uiom_interval_tree.c | 15 +++---- .../infiniband/hw/usnic/usnic_uiom_interval_tree.h | 12 +++--- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 2 +- fs/hugetlbfs/inode.c | 6 +-- fs/inode.c | 2 +- include/drm/drm_mm.h | 2 +- include/linux/fs.h | 4 +- include/linux/interval_tree.h | 8 ++-- include/linux/interval_tree_generic.h | 46 +++++++++++++++++----- include/linux/mm.h | 17 ++++---- include/linux/rmap.h | 4 +- include/rdma/ib_umem_odp.h | 11 ++++-- include/rdma/ib_verbs.h | 2 +- lib/interval_tree_test.c | 4 +- mm/interval_tree.c | 10 ++--- mm/memory.c | 4 +- mm/mmap.c | 10 ++--- mm/rmap.c | 4 +- 33 files changed, 145 insertions(+), 105 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index e1cde6b80027..3b0f2ec6eec7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -51,7 +51,7 @@ struct amdgpu_mn { /* objects protected by lock */ struct mutex lock; - struct rb_root objects; + struct rb_root_cached objects; }; struct amdgpu_mn_node { @@ -76,8 +76,8 @@ static void amdgpu_mn_destroy(struct work_struct *work) mutex_lock(&adev->mn_lock); mutex_lock(&rmn->lock); hash_del(&rmn->node); - rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, - it.rb) { + rbtree_postorder_for_each_entry_safe(node, next_node, + &rmn->objects.rb_root, it.rb) { list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { bo->mn = NULL; list_del_init(&bo->mn_list); @@ -221,7 +221,7 @@ static struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev) rmn->mm = mm; rmn->mn.ops = &amdgpu_mn_ops; mutex_init(&rmn->lock); - rmn->objects = RB_ROOT; + rmn->objects = RB_ROOT_CACHED; r = __mmu_notifier_register(&rmn->mn, mm); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 6b1343e5541d..b9a5a77eedaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2475,7 +2475,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, u64 flags; uint64_t init_pde_value = 0; - vm->va = RB_ROOT; + vm->va = RB_ROOT_CACHED; vm->client_id = atomic64_inc_return(&adev->vm_manager.client_counter); for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) vm->reserved_vmid[i] = NULL; @@ -2596,10 +2596,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm) amd_sched_entity_fini(vm->entity.sched, &vm->entity); - if (!RB_EMPTY_ROOT(&vm->va)) { + if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { dev_err(adev->dev, "still active bo inside vm\n"); } - rbtree_postorder_for_each_entry_safe(mapping, tmp, &vm->va, rb) { + rbtree_postorder_for_each_entry_safe(mapping, tmp, + &vm->va.rb_root, rb) { list_del(&mapping->list); amdgpu_vm_it_remove(mapping, &vm->va); kfree(mapping); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index ba6691b58ee7..6716355403ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -118,7 +118,7 @@ struct amdgpu_vm_pt { struct amdgpu_vm { /* tree of virtual addresses mapped */ - struct rb_root va; + struct rb_root_cached va; /* protecting invalidated */ spinlock_t status_lock; diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index f794089d30ac..61a1c8ea74bc 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -169,7 +169,7 @@ INTERVAL_TREE_DEFINE(struct drm_mm_node, rb, struct drm_mm_node * __drm_mm_interval_first(const struct drm_mm *mm, u64 start, u64 last) { - return drm_mm_interval_tree_iter_first((struct rb_root *)&mm->interval_tree, + return drm_mm_interval_tree_iter_first((struct rb_root_cached *)&mm->interval_tree, start, last) ?: (struct drm_mm_node *)&mm->head_node; } EXPORT_SYMBOL(__drm_mm_interval_first); @@ -180,6 +180,7 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, struct drm_mm *mm = hole_node->mm; struct rb_node **link, *rb; struct drm_mm_node *parent; + bool leftmost = true; node->__subtree_last = LAST(node); @@ -196,9 +197,10 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, rb = &hole_node->rb; link = &hole_node->rb.rb_right; + leftmost = false; } else { rb = NULL; - link = &mm->interval_tree.rb_node; + link = &mm->interval_tree.rb_root.rb_node; } while (*link) { @@ -208,14 +210,15 @@ static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, parent->__subtree_last = node->__subtree_last; if (node->start < parent->start) link = &parent->rb.rb_left; - else + else { link = &parent->rb.rb_right; + leftmost = true; + } } rb_link_node(&node->rb, rb, link); - rb_insert_augmented(&node->rb, - &mm->interval_tree, - &drm_mm_interval_tree_augment); + rb_insert_augmented_cached(&node->rb, &mm->interval_tree, leftmost, + &drm_mm_interval_tree_augment); } #define RB_INSERT(root, member, expr) do { \ @@ -577,7 +580,7 @@ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) *new = *old; list_replace(&old->node_list, &new->node_list); - rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree); + rb_replace_node(&old->rb, &new->rb, &old->mm->interval_tree.rb_root); if (drm_mm_hole_follows(old)) { list_replace(&old->hole_stack, &new->hole_stack); @@ -863,7 +866,7 @@ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size) mm->color_adjust = NULL; INIT_LIST_HEAD(&mm->hole_stack); - mm->interval_tree = RB_ROOT; + mm->interval_tree = RB_ROOT_CACHED; mm->holes_size = RB_ROOT; mm->holes_addr = RB_ROOT; diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index d9100b565198..28f1226576f8 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -147,7 +147,7 @@ struct drm_vma_offset_node *drm_vma_offset_lookup_locked(struct drm_vma_offset_m struct rb_node *iter; unsigned long offset; - iter = mgr->vm_addr_space_mm.interval_tree.rb_node; + iter = 
mgr->vm_addr_space_mm.interval_tree.rb_root.rb_node; best = NULL; while (likely(iter)) { diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index f152a38d7079..23fd18bd1b56 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -49,7 +49,7 @@ struct i915_mmu_notifier { spinlock_t lock; struct hlist_node node; struct mmu_notifier mn; - struct rb_root objects; + struct rb_root_cached objects; struct workqueue_struct *wq; }; @@ -123,7 +123,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, struct interval_tree_node *it; LIST_HEAD(cancelled); - if (RB_EMPTY_ROOT(&mn->objects)) + if (RB_EMPTY_ROOT(&mn->objects.rb_root)) return; /* interval ranges are inclusive, but invalidate range is exclusive */ @@ -172,7 +172,7 @@ i915_mmu_notifier_create(struct mm_struct *mm) spin_lock_init(&mn->lock); mn->mn.ops = &i915_gem_userptr_notifier; - mn->objects = RB_ROOT; + mn->objects = RB_ROOT_CACHED; mn->wq = alloc_workqueue("i915-userptr-release", WQ_UNBOUND, 0); if (mn->wq == NULL) { kfree(mn); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index ec63bc5e9de7..8cbaeec090c9 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -924,7 +924,7 @@ struct radeon_vm_id { struct radeon_vm { struct mutex mutex; - struct rb_root va; + struct rb_root_cached va; /* protecting invalidated and freed */ spinlock_t status_lock; diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index 896f2cf51e4e..1d62288b7ee3 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -50,7 +50,7 @@ struct radeon_mn { /* objects protected by lock */ struct mutex lock; - struct rb_root objects; + struct rb_root_cached objects; }; struct radeon_mn_node { @@ -75,8 +75,8 @@ static void radeon_mn_destroy(struct work_struct *work) mutex_lock(&rdev->mn_lock); mutex_lock(&rmn->lock); hash_del(&rmn->node); - rbtree_postorder_for_each_entry_safe(node, next_node, &rmn->objects, - it.rb) { + rbtree_postorder_for_each_entry_safe(node, next_node, + &rmn->objects.rb_root, it.rb) { interval_tree_remove(&node->it, &rmn->objects); list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { @@ -205,7 +205,7 @@ static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev) rmn->mm = mm; rmn->mn.ops = &radeon_mn_ops; mutex_init(&rmn->lock); - rmn->objects = RB_ROOT; + rmn->objects = RB_ROOT_CACHED; r = __mmu_notifier_register(&rmn->mn, mm); if (r) diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index 5e82b408d522..e5c0e635e371 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -1185,7 +1185,7 @@ int radeon_vm_init(struct radeon_device *rdev, struct radeon_vm *vm) vm->ids[i].last_id_use = NULL; } mutex_init(&vm->mutex); - vm->va = RB_ROOT; + vm->va = RB_ROOT_CACHED; spin_lock_init(&vm->status_lock); INIT_LIST_HEAD(&vm->invalidated); INIT_LIST_HEAD(&vm->freed); @@ -1232,10 +1232,11 @@ void radeon_vm_fini(struct radeon_device *rdev, struct radeon_vm *vm) struct radeon_bo_va *bo_va, *tmp; int i, r; - if (!RB_EMPTY_ROOT(&vm->va)) { + if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { dev_err(rdev->dev, "still active bo inside vm\n"); } - rbtree_postorder_for_each_entry_safe(bo_va, tmp, &vm->va, it.rb) { + rbtree_postorder_for_each_entry_safe(bo_va, tmp, + &vm->va.rb_root, it.rb) { interval_tree_remove(&bo_va->it, &vm->va); r = 
radeon_bo_reserve(bo_va->bo, false); if (!r) { diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c index d176597b4d78..fc801920e341 100644 --- a/drivers/infiniband/core/umem_rbtree.c +++ b/drivers/infiniband/core/umem_rbtree.c @@ -72,7 +72,7 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, /* @last is not a part of the interval. See comment for function * node_last. */ -int rbt_ib_umem_for_each_in_range(struct rb_root *root, +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 last, umem_call_back cb, void *cookie) @@ -95,7 +95,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root *root, } EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root, +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) { struct umem_odp_node *node; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e0cb99860934..4ab30d832ac5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -118,7 +118,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->closing = 0; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT; + ucontext->umem_tree = RB_ROOT_CACHED; init_rwsem(&ucontext->umem_rwsem); ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 2f0d285dc278..175002c046ed 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -54,7 +54,7 @@ struct mmu_rb_handler { struct mmu_notifier mn; - struct rb_root root; + struct rb_root_cached root; void *ops_arg; spinlock_t lock; /* protect the RB tree */ struct mmu_rb_ops *ops; @@ -108,7 +108,7 @@ int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, if (!handlr) return -ENOMEM; - handlr->root = RB_ROOT; + handlr->root = RB_ROOT_CACHED; handlr->ops = ops; handlr->ops_arg = ops_arg; INIT_HLIST_NODE(&handlr->mn.hlist); @@ -149,9 +149,9 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) INIT_LIST_HEAD(&del_list); spin_lock_irqsave(&handler->lock, flags); - while ((node = rb_first(&handler->root))) { + while ((node = rb_first_cached(&handler->root))) { rbnode = rb_entry(node, struct mmu_rb_node, node); - rb_erase(node, &handler->root); + rb_erase_cached(node, &handler->root); /* move from LRU list to delete list */ list_move(&rbnode->list, &del_list); } @@ -300,7 +300,7 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); - struct rb_root *root = &handler->root; + struct rb_root_cached *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; bool added = false; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index c49db7c33979..4381c0a9a873 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -227,7 +227,7 @@ static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd, vpn_last = vpn_start + npages - 1; spin_lock(&pd->lock); - usnic_uiom_remove_interval(&pd->rb_root, vpn_start, + usnic_uiom_remove_interval(&pd->root, vpn_start, vpn_last, &rm_intervals); usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd); @@ -379,7 +379,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, 
err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last, (writable) ? IOMMU_WRITE : 0, IOMMU_WRITE, - &pd->rb_root, + &pd->root, &sorted_diff_intervals); if (err) { usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n", @@ -395,7 +395,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, } - err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last, + err = usnic_uiom_insert_interval(&pd->root, vpn_start, vpn_last, (writable) ? IOMMU_WRITE : 0); if (err) { usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n", diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h index 45ca7c1613a7..431efe4143f4 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -55,7 +55,7 @@ struct usnic_uiom_dev { struct usnic_uiom_pd { struct iommu_domain *domain; spinlock_t lock; - struct rb_root rb_root; + struct rb_root_cached root; struct list_head devs; int dev_cnt; }; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c index 42b4b4c4e452..d399523206c7 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c @@ -100,9 +100,9 @@ static int interval_cmp(void *priv, struct list_head *a, struct list_head *b) } static void -find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, - unsigned long last, - struct list_head *list) +find_intervals_intersection_sorted(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *list) { struct usnic_uiom_interval_node *node; @@ -118,7 +118,7 @@ find_intervals_intersection_sorted(struct rb_root *root, unsigned long start, int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, int flags, int flag_mask, - struct rb_root *root, + struct rb_root_cached *root, struct list_head *diff_set) { struct usnic_uiom_interval_node *interval, *tmp; @@ -175,7 +175,7 @@ void usnic_uiom_put_interval_set(struct list_head *intervals) kfree(interval); } -int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start, +int usnic_uiom_insert_interval(struct rb_root_cached *root, unsigned long start, unsigned long last, int flags) { struct usnic_uiom_interval_node *interval, *tmp; @@ -246,8 +246,9 @@ err_out: return err; } -void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start, - unsigned long last, struct list_head *removed) +void usnic_uiom_remove_interval(struct rb_root_cached *root, + unsigned long start, unsigned long last, + struct list_head *removed) { struct usnic_uiom_interval_node *interval; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h index c0b0b876ab90..1d7fc3226bca 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h @@ -48,12 +48,12 @@ struct usnic_uiom_interval_node { extern void usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node, - struct rb_root *root); + struct rb_root_cached *root); extern void usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node, - struct rb_root *root); + struct rb_root_cached *root); extern struct usnic_uiom_interval_node * -usnic_uiom_interval_tree_iter_first(struct rb_root *root, +usnic_uiom_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); 
extern struct usnic_uiom_interval_node * @@ -63,7 +63,7 @@ usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node, * Inserts {start...last} into {root}. If there are overlaps, * nodes will be broken up and merged */ -int usnic_uiom_insert_interval(struct rb_root *root, +int usnic_uiom_insert_interval(struct rb_root_cached *root, unsigned long start, unsigned long last, int flags); /* @@ -71,7 +71,7 @@ int usnic_uiom_insert_interval(struct rb_root *root, * 'removed.' The caller is responsibile for freeing memory of nodes in * 'removed.' */ -void usnic_uiom_remove_interval(struct rb_root *root, +void usnic_uiom_remove_interval(struct rb_root_cached *root, unsigned long start, unsigned long last, struct list_head *removed); /* @@ -81,7 +81,7 @@ void usnic_uiom_remove_interval(struct rb_root *root, int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last, int flags, int flag_mask, - struct rb_root *root, + struct rb_root_cached *root, struct list_head *diff_set); /* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */ void usnic_uiom_put_interval_set(struct list_head *intervals); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9cb3f722dce1..d6dbb28245e6 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1271,7 +1271,7 @@ static struct vhost_umem *vhost_umem_alloc(void) if (!umem) return NULL; - umem->umem_tree = RB_ROOT; + umem->umem_tree = RB_ROOT_CACHED; umem->numem = 0; INIT_LIST_HEAD(&umem->umem_list); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index bb7c29b8b9fc..d59a9cc65f9d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -71,7 +71,7 @@ struct vhost_umem_node { }; struct vhost_umem { - struct rb_root umem_tree; + struct rb_root_cached umem_tree; struct list_head umem_list; int numem; }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c6f4b8f910f..59073e9f01a4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; @@ -498,7 +498,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); @@ -523,7 +523,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); diff --git a/fs/inode.c b/fs/inode.c index 6a1626e0edaf..210054157a49 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -353,7 +353,7 @@ void address_space_init_once(struct address_space *mapping) init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - mapping->i_mmap = RB_ROOT; + mapping->i_mmap = RB_ROOT_CACHED; } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index 49b292e98fec..8d10fc97801c 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -172,7 +172,7 @@ struct drm_mm { * 
according to the (increasing) start address of the memory node. */ struct drm_mm_node head_node; /* Keep an interval_tree for fast lookup of drm_mm_nodes by address. */ - struct rb_root interval_tree; + struct rb_root_cached interval_tree; struct rb_root holes_size; struct rb_root holes_addr; diff --git a/include/linux/fs.h b/include/linux/fs.h index 509434aaf5a4..6111976848ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -392,7 +392,7 @@ struct address_space { struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ - struct rb_root i_mmap; /* tree of private and shared mappings */ + struct rb_root_cached i_mmap; /* tree of private and shared mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -487,7 +487,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap); + return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 724556aa3c95..202ee1283f4b 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -11,13 +11,15 @@ struct interval_tree_node { }; extern void -interval_tree_insert(struct interval_tree_node *node, struct rb_root *root); +interval_tree_insert(struct interval_tree_node *node, + struct rb_root_cached *root); extern void -interval_tree_remove(struct interval_tree_node *node, struct rb_root *root); +interval_tree_remove(struct interval_tree_node *node, + struct rb_root_cached *root); extern struct interval_tree_node * -interval_tree_iter_first(struct rb_root *root, +interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); extern struct interval_tree_node * diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 58370e1862ad..f096423c8cbd 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -65,11 +65,13 @@ RB_DECLARE_CALLBACKS(static, ITPREFIX ## _augment, ITSTRUCT, ITRB, \ \ /* Insert / remove interval nodes from the tree */ \ \ -ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ +ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, \ + struct rb_root_cached *root) \ { \ - struct rb_node **link = &root->rb_node, *rb_parent = NULL; \ + struct rb_node **link = &root->rb_root.rb_node, *rb_parent = NULL; \ ITTYPE start = ITSTART(node), last = ITLAST(node); \ ITSTRUCT *parent; \ + bool leftmost = true; \ \ while (*link) { \ rb_parent = *link; \ @@ -78,18 +80,22 @@ ITSTATIC void ITPREFIX ## _insert(ITSTRUCT *node, struct rb_root *root) \ parent->ITSUBTREE = last; \ if (start < ITSTART(parent)) \ link = &parent->ITRB.rb_left; \ - else \ + else { \ link = &parent->ITRB.rb_right; \ + leftmost = false; \ + } \ } \ \ node->ITSUBTREE = last; \ rb_link_node(&node->ITRB, rb_parent, link); \ - rb_insert_augmented(&node->ITRB, root, &ITPREFIX ## _augment); \ + rb_insert_augmented_cached(&node->ITRB, root, \ + leftmost, &ITPREFIX ## _augment); \ } \ \ -ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, struct rb_root *root) \ +ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ + struct rb_root_cached *root) \ { \ - rb_erase_augmented(&node->ITRB, root, &ITPREFIX 
## _augment); \ + rb_erase_augmented_cached(&node->ITRB, root, &ITPREFIX ## _augment); \ } \ \ /* \ @@ -140,15 +146,35 @@ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ } \ \ ITSTATIC ITSTRUCT * \ -ITPREFIX ## _iter_first(struct rb_root *root, ITTYPE start, ITTYPE last) \ +ITPREFIX ## _iter_first(struct rb_root_cached *root, \ + ITTYPE start, ITTYPE last) \ { \ - ITSTRUCT *node; \ + ITSTRUCT *node, *leftmost; \ \ - if (!root->rb_node) \ + if (!root->rb_root.rb_node) \ return NULL; \ - node = rb_entry(root->rb_node, ITSTRUCT, ITRB); \ + \ + /* \ + * Fastpath range intersection/overlap between A: [a0, a1] and \ + * B: [b0, b1] is given by: \ + * \ + * a0 <= b1 && b0 <= a1 \ + * \ + * ... where A holds the lock range and B holds the smallest \ + * 'start' and largest 'last' in the tree. For the later, we \ + * rely on the root node, which by augmented interval tree \ + * property, holds the largest value in its last-in-subtree. \ + * This allows mitigating some of the tree walk overhead for \ + * for non-intersecting ranges, maintained and consulted in O(1). \ + */ \ + node = rb_entry(root->rb_root.rb_node, ITSTRUCT, ITRB); \ if (node->ITSUBTREE < start) \ return NULL; \ + \ + leftmost = rb_entry(root->rb_leftmost, ITSTRUCT, ITRB); \ + if (ITSTART(leftmost) > last) \ + return NULL; \ + \ return ITPREFIX ## _subtree_search(node, start, last); \ } \ \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 5195e272fc4a..f8c10d336e42 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2034,13 +2034,13 @@ extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, - struct rb_root *root); + struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root); + struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, - struct rb_root *root); -struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); @@ -2050,11 +2050,12 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root); + struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root); -struct anon_vma_chain *anon_vma_interval_tree_iter_first( - struct rb_root *root, unsigned long start, unsigned long last); + struct rb_root_cached *root); +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, + unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB diff --git a/include/linux/rmap.h b/include/linux/rmap.h index f8ca2e74b819..733d3d8181e2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -55,7 +55,9 @@ struct anon_vma { * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). 
*/ - struct rb_root rb_root; /* Interval tree of private "related" vmas */ + + /* Interval tree of private "related" vmas */ + struct rb_root_cached rb_root; }; /* diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index fb67554aabd6..5eb7f5bc8248 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -111,22 +111,25 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bound); -void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); -void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_insert(struct umem_odp_node *node, + struct rb_root_cached *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, + struct rb_root_cached *root); typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, void *cookie); /* * Call the callback on each ib_umem in the range. Returns the logical or of * the return values of the functions called. */ -int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, +int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, + u64 start, u64 end, umem_call_back cb, void *cookie); /* * Find first region intersecting with address range. * Return NULL if not found */ -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root, +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e6df68048517..bdb1279a415b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1457,7 +1457,7 @@ struct ib_ucontext { struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root umem_tree; + struct rb_root_cached umem_tree; /* * Protects .umem_rbroot and tree, as well as odp_mrs_count and * mmu notifiers registration. 
diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c index df495fe81421..0e343fd29570 100644 --- a/lib/interval_tree_test.c +++ b/lib/interval_tree_test.c @@ -19,14 +19,14 @@ __param(bool, search_all, false, "Searches will iterate all nodes in the tree"); __param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint"); -static struct rb_root root = RB_ROOT; +static struct rb_root_cached root = RB_ROOT_CACHED; static struct interval_tree_node *nodes = NULL; static u32 *queries = NULL; static struct rnd_state rnd; static inline unsigned long -search(struct rb_root *root, unsigned long start, unsigned long last) +search(struct rb_root_cached *root, unsigned long start, unsigned long last) { struct interval_tree_node *node; unsigned long results = 0; diff --git a/mm/interval_tree.c b/mm/interval_tree.c index f2c2492681bf..b47664358796 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -28,7 +28,7 @@ INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root) + struct rb_root_cached *root) { struct rb_node **link; struct vm_area_struct *parent; @@ -55,7 +55,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, node->shared.rb_subtree_last = last; rb_link_node(&node->shared.rb, &parent->shared.rb, link); - rb_insert_augmented(&node->shared.rb, root, + rb_insert_augmented(&node->shared.rb, &root->rb_root, &vma_interval_tree_augment); } @@ -74,7 +74,7 @@ INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, static inline, __anon_vma_interval_tree) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { #ifdef CONFIG_DEBUG_VM_RB node->cached_vma_start = avc_start_pgoff(node); @@ -84,13 +84,13 @@ void anon_vma_interval_tree_insert(struct anon_vma_chain *node, } void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root) + struct rb_root_cached *root) { __anon_vma_interval_tree_remove(node, root); } struct anon_vma_chain * -anon_vma_interval_tree_iter_first(struct rb_root *root, +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_first(root, first, last); diff --git a/mm/memory.c b/mm/memory.c index 0bbc1d612a63..ec4e15494901 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2761,7 +2761,7 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } -static inline void unmap_mapping_range_tree(struct rb_root *root, +static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; @@ -2825,7 +2825,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } diff --git a/mm/mmap.c b/mm/mmap.c index 4c5981651407..680506faceae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -685,7 +685,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; struct address_space *mapping = NULL; - 
struct rb_root *root = NULL; + struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; bool start_changed = false, end_changed = false; @@ -3340,7 +3340,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. @@ -3356,7 +3356,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } @@ -3458,7 +3458,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -3472,7 +3472,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->root->rb_root.rb_node)) + &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } diff --git a/mm/rmap.c b/mm/rmap.c index 0618cd85b862..b874c4761e84 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -391,7 +391,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ - if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { + if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->degree--; continue; } @@ -425,7 +425,7 @@ static void anon_vma_ctor(void *data) init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); - anon_vma->rb_root = RB_ROOT; + anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) -- cgit v1.2.3 From f2686bb48618718c7141f117e2d607594e959bfc Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:12 -0700 Subject: lib/interval-tree: correct comment wrt generic flavor interval_tree.h _is_ the generic flavor. Link: http://lkml.kernel.org/r/20170719014603.19029-13-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interval_tree_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index f096423c8cbd..1f97ce26cccc 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -33,7 +33,7 @@ * ITSTATIC: 'static' or empty * ITPREFIX: prefix to use for the inline tree definitions * - * Note - before using this, please consider if non-generic version + * Note - before using this, please consider if generic version * (interval_tree.h) would work for you... 
*/ -- cgit v1.2.3 From 60ef690018b262ddcd0d51edf10e40710deb9c9f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Fri, 8 Sep 2017 16:15:41 -0700 Subject: bitmap: introduce BITMAP_FROM_U64() The macro is the compile-time analogue of bitmap_from_u64() with the same purpose: convert the 64-bit number to the properly ordered pair of 32-bit parts, suitable for filling the bitmap in 32-bit BE environment. Use it to make test_bitmap_parselist() correct for 32-bit BE ABIs. Tested on BE mips/qemu. [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/20170810172916.24144-1-ynorov@caviumnetworks.com Signed-off-by: Yury Norov Cc: Noam Camus Cc: Rasmus Villemoes Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 32 ++++++++++++++++++++++++++++++++ lib/test_bitmap.c | 47 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 64 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 5797ca6fdfe2..700cf5f67118 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -360,6 +360,38 @@ static inline int bitmap_parse(const char *buf, unsigned int buflen, return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); } +/* + * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. + * + * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit + * integers in 32-bit environment, and 64-bit integers in 64-bit one. + * + * There are four combinations of endianness and length of the word in linux + * ABIs: LE64, BE64, LE32 and BE32. + * + * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in + * bitmaps and therefore don't require any special handling. + * + * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory + * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the + * other hand is represented as an array of 32-bit words and the position of + * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that + * word. For example, bit #42 is located at 10th position of 2nd word. + * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit + * values in memory as it usually does. But for BE we need to swap hi and lo + * words manually. + * + * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and + * lo parts of u64. For LE32 it does nothing, and for BE environment it swaps + * hi and lo words, as is expected by bitmap. + */ +#if __BITS_PER_LONG == 64 +#define BITMAP_FROM_U64(n) (n) +#else +#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \ + ((unsigned long) ((u64)(n) >> 32)) +#endif + /* * bitmap_from_u64 - Check and swap words within u64. 
* @mask: source bitmap diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 431c97fb1aa0..599c6713f2a2 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -175,24 +175,41 @@ struct test_bitmap_parselist{ const int flags; }; -static const unsigned long exp[] = {1, 2, 0x0000ffff, 0xffff0000, 0x55555555, - 0xaaaaaaaa, 0x11111111, 0x22222222, 0xffffffff, - 0xfffffffe, 0x3333333311111111, 0xffffffff77777777}; -static const unsigned long exp2[] = {0x3333333311111111, 0xffffffff77777777}; +static const unsigned long exp[] __initconst = { + BITMAP_FROM_U64(1), + BITMAP_FROM_U64(2), + BITMAP_FROM_U64(0x0000ffff), + BITMAP_FROM_U64(0xffff0000), + BITMAP_FROM_U64(0x55555555), + BITMAP_FROM_U64(0xaaaaaaaa), + BITMAP_FROM_U64(0x11111111), + BITMAP_FROM_U64(0x22222222), + BITMAP_FROM_U64(0xffffffff), + BITMAP_FROM_U64(0xfffffffe), + BITMAP_FROM_U64(0x3333333311111111), + BITMAP_FROM_U64(0xffffffff77777777) +}; + +static const unsigned long exp2[] __initconst = { + BITMAP_FROM_U64(0x3333333311111111), + BITMAP_FROM_U64(0xffffffff77777777) +}; static const struct test_bitmap_parselist parselist_tests[] __initconst = { +#define step (sizeof(u64) / sizeof(unsigned long)) + {0, "0", &exp[0], 8, 0}, - {0, "1", &exp[1], 8, 0}, - {0, "0-15", &exp[2], 32, 0}, - {0, "16-31", &exp[3], 32, 0}, - {0, "0-31:1/2", &exp[4], 32, 0}, - {0, "1-31:1/2", &exp[5], 32, 0}, - {0, "0-31:1/4", &exp[6], 32, 0}, - {0, "1-31:1/4", &exp[7], 32, 0}, - {0, "0-31:4/4", &exp[8], 32, 0}, - {0, "1-31:4/4", &exp[9], 32, 0}, - {0, "0-31:1/4,32-63:2/4", &exp[10], 64, 0}, - {0, "0-31:3/4,32-63:4/4", &exp[11], 64, 0}, + {0, "1", &exp[1 * step], 8, 0}, + {0, "0-15", &exp[2 * step], 32, 0}, + {0, "16-31", &exp[3 * step], 32, 0}, + {0, "0-31:1/2", &exp[4 * step], 32, 0}, + {0, "1-31:1/2", &exp[5 * step], 32, 0}, + {0, "0-31:1/4", &exp[6 * step], 32, 0}, + {0, "1-31:1/4", &exp[7 * step], 32, 0}, + {0, "0-31:4/4", &exp[8 * step], 32, 0}, + {0, "1-31:4/4", &exp[9 * step], 32, 0}, + {0, "0-31:1/4,32-63:2/4", &exp[10 * step], 64, 0}, + {0, "0-31:3/4,32-63:4/4", &exp[11 * step], 64, 0}, {0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4", exp2, 128, 0}, -- cgit v1.2.3 From 895a60728f83684e738cce34d7d4e64631505ae4 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 8 Sep 2017 16:15:45 -0700 Subject: lib/rhashtable: fix comment on locks_mul default value As of commit 4cf0b354d92 ("rhashtable: avoid large lock-array allocations"), the default value for the locks multiplier was reduced from 128 to 32. Update the header file to reflect this. 
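As a hedged illustration of where this knob sits, the sketch below defines a table whose parameters leave locks_mul at zero, which selects the per-CPU default that is now 32; the my_obj, my_params and my_table names are hypothetical:

	#include <linux/rhashtable.h>

	struct my_obj {
		u32 key;
		struct rhash_head node;
	};

	static const struct rhashtable_params my_params = {
		.key_len	= sizeof(u32),
		.key_offset	= offsetof(struct my_obj, key),
		.head_offset	= offsetof(struct my_obj, node),
		.locks_mul	= 0,	/* 0 selects the default: 32, not 128 */
	};

	static struct rhashtable my_table;

	static int my_table_init(void)
	{
		/* rhashtable_init() fills in defaults for zeroed fields */
		return rhashtable_init(&my_table, &my_params);
	}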
Link: http://lkml.kernel.org/r/20170815215401.30745-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Florian Westphal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rhashtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7d56a7ea2b2e..361c08e35dbc 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -127,7 +127,7 @@ struct rhashtable; * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) + * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) * @automatic_shrinking: Enable automatic shrinking of tables * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) -- cgit v1.2.3 From 42f46148217865a545e129612075f3d828a2c4e4 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Sep 2017 16:16:24 -0700 Subject: autofs: fix AT_NO_AUTOMOUNT not being honored The fstatat(2) and statx() calls can pass the flag AT_NO_AUTOMOUNT which is meant to clear the LOOKUP_AUTOMOUNT flag and prevent triggering of an automount by the call. But this flag is unconditionally cleared for all stat family system calls except statx(). stat family system calls have always triggered mount requests for the negative dentry case in follow_automount(), which is intended but prevents the fstatat(2) and statx() AT_NO_AUTOMOUNT case from being handled. In order to handle AT_NO_AUTOMOUNT for both system calls, the negative dentry case in follow_automount() needs to be changed to return ENOENT when the LOOKUP_AUTOMOUNT flag is clear (and the other required flags are clear). AFAICT this change doesn't have any noticeable side effects and may, in some use cases (although I didn't see it in testing), prevent unnecessary callbacks to the automount daemon. It's also possible that a stat family call has been made with a path that is in the process of being mounted by some other process. But stat family calls should return the automount state of the path as it is "now", so it shouldn't wait for mount completion. This is the same semantic as the positive dentry case already handled. Link: http://lkml.kernel.org/r/150216641255.11652.4204561328197919771.stgit@pluto.themaw.net Fixes: deccf497d804a4c5fca ("Make stat/lstat/fstatat pass AT_NO_AUTOMOUNT to vfs_statx()") Signed-off-by: Ian Kent Cc: David Howells Cc: Colin Walters Cc: Ondrej Holy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 15 ++++++++++++--- include/linux/fs.h | 3 +-- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/namei.c b/fs/namei.c index ddb6a7c2b3d4..1180f9c58093 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) - return -EISDIR; + LOOKUP_OPEN | LOOKUP_CREATE | + LOOKUP_AUTOMOUNT))) { + /* Positive dentry that isn't meant to trigger an + * automount, EISDIR will allow it to be used, + * otherwise there's no mount here "now" so return + * ENOENT.
+ */ + if (path->dentry->d_inode) + return -EISDIR; + else + return -ENOENT; + } if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6111976848ff..2d0e6748e46e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3043,8 +3043,7 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat) static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, - stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) { -- cgit v1.2.3 From 3dd8f7c3b78b9556582fd64bf5c9986723f9dca1 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 8 Sep 2017 16:16:30 -0700 Subject: autofs: make dev ioctl version and ismountpoint user accessible Some of the autofs miscellaneous device ioctls need to be accessible to user space applications without CAP_SYS_ADMIN to get information about autofs mounts. Link: http://lkml.kernel.org/r/150216642517.11652.2338933266137331637.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Colin Walters Cc: Ondrej Holy Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/dev-ioctl.c | 12 ++++++++---- include/uapi/linux/auto_dev-ioctl.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 218a4ecc75cc..ea8b3a1cddd2 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -628,10 +628,6 @@ static int _autofs_dev_ioctl(unsigned int command, ioctl_fn fn = NULL; int err = 0; - /* only root can play with this */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); @@ -640,6 +636,14 @@ static int _autofs_dev_ioctl(unsigned int command, return -ENOTTY; } + /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD + * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD + */ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 744b3d060968..5558db8e6646 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -16,7 +16,7 @@ #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 -#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 1 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) -- cgit v1.2.3 From 1f28c5d055032e7e8ee5e48198dca7e125d0eec6 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Fri, 8 Sep 2017 16:16:34 -0700 Subject: autofs: remove unused AUTOFS_IOC_EXPIRE_DIRECT/INDIRECT These are not used by either kernel or userspace; although AUTOFS_IOC_EXPIRE_DIRECT seems to have been used by userspace around 2006-2008, it was technically just an alias of the existing ioctl AUTOFS_IOC_EXPIRE_MULTI. ioctls for autofs are already complicated enough, so these could be removed unless they are staying here only so that userspace code from a decade ago can still be compiled. Edit: raven@themaw.net Yes, this is indeed very old and anything that still uses them must be updated because it will be using broken functionality.
End edit: raven@themaw.net Link: http://lkml.kernel.org/r/150285067347.4670.11494624644273072003.stgit@pluto.themaw.net Signed-off-by: Tomohiro Kusumi Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/auto_fs4.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 7c6da423d54e..9453e9a07c9d 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -155,8 +155,6 @@ enum { }; #define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int) -#define AUTOFS_IOC_EXPIRE_INDIRECT AUTOFS_IOC_EXPIRE_MULTI -#define AUTOFS_IOC_EXPIRE_DIRECT AUTOFS_IOC_EXPIRE_MULTI #define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int) #define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int) -- cgit v1.2.3 From c1f3fa2a4fde2818623b42e3f749bd478be5dec7 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 8 Sep 2017 16:17:08 -0700 Subject: kmod: split off umh headers into its own file In the future, usermode helper users will not need to carry in all of the kmod header declarations. Since kmod.h still includes umh.h, this change has no functional effect; each umh user can be cleaned up separately over time. Link: http://lkml.kernel.org/r/20170810180618.22457-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Kees Cook Cc: Dmitry Torokhov Cc: Jessica Yu Cc: Rusty Russell Cc: Michal Marek Cc: Petr Mladek Cc: Miroslav Benes Cc: Josh Poimboeuf Cc: Guenter Roeck Cc: "Eric W. Biederman" Cc: Matt Redfearn Cc: Dan Carpenter Cc: Colin Ian King Cc: Daniel Mentz Cc: David Binderman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + include/linux/kmod.h | 60 +--------------------------------------------- include/linux/umh.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 59 deletions(-) create mode 100644 include/linux/umh.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 7aee06c1b775..ff3a349f24e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7462,6 +7462,7 @@ M: "Luis R. Rodriguez" L: linux-kernel@vger.kernel.org S: Maintained F: kernel/umh.c +F: include/linux/umh.h KERNEL VIRTUAL MACHINE (KVM) M: Paolo Bonzini diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 655082c88fd9..40c89ad4bea6 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -19,6 +19,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include #include #include #include @@ -44,63 +45,4 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; #define try_then_request_module(x, mod...)
(x) #endif - -struct cred; -struct file; - -#define UMH_NO_WAIT 0 /* don't wait at all */ -#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ -#define UMH_WAIT_PROC 2 /* wait for the process to complete */ -#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ - -struct subprocess_info { - struct work_struct work; - struct completion *complete; - const char *path; - char **argv; - char **envp; - int wait; - int retval; - int (*init)(struct subprocess_info *info, struct cred *new); - void (*cleanup)(struct subprocess_info *info); - void *data; -} __randomize_layout; - -extern int -call_usermodehelper(const char *path, char **argv, char **envp, int wait); - -extern struct subprocess_info * -call_usermodehelper_setup(const char *path, char **argv, char **envp, - gfp_t gfp_mask, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data); - -extern int -call_usermodehelper_exec(struct subprocess_info *info, int wait); - -extern struct ctl_table usermodehelper_table[]; - -enum umh_disable_depth { - UMH_ENABLED = 0, - UMH_FREEZING, - UMH_DISABLED, -}; - -extern int __usermodehelper_disable(enum umh_disable_depth depth); -extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth); - -static inline int usermodehelper_disable(void) -{ - return __usermodehelper_disable(UMH_DISABLED); -} - -static inline void usermodehelper_enable(void) -{ - __usermodehelper_set_disable_depth(UMH_ENABLED); -} - -extern int usermodehelper_read_trylock(void); -extern long usermodehelper_read_lock_wait(long timeout); -extern void usermodehelper_read_unlock(void); - #endif /* __LINUX_KMOD_H__ */ diff --git a/include/linux/umh.h b/include/linux/umh.h new file mode 100644 index 000000000000..244aff638220 --- /dev/null +++ b/include/linux/umh.h @@ -0,0 +1,69 @@ +#ifndef __LINUX_UMH_H__ +#define __LINUX_UMH_H__ + +#include +#include +#include +#include +#include +#include + +struct cred; +struct file; + +#define UMH_NO_WAIT 0 /* don't wait at all */ +#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 2 /* wait for the process to complete */ +#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ + +struct subprocess_info { + struct work_struct work; + struct completion *complete; + const char *path; + char **argv; + char **envp; + int wait; + int retval; + int (*init)(struct subprocess_info *info, struct cred *new); + void (*cleanup)(struct subprocess_info *info); + void *data; +} __randomize_layout; + +extern int +call_usermodehelper(const char *path, char **argv, char **envp, int wait); + +extern struct subprocess_info * +call_usermodehelper_setup(const char *path, char **argv, char **envp, + gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *), void *data); + +extern int +call_usermodehelper_exec(struct subprocess_info *info, int wait); + +extern struct ctl_table usermodehelper_table[]; + +enum umh_disable_depth { + UMH_ENABLED = 0, + UMH_FREEZING, + UMH_DISABLED, +}; + +extern int __usermodehelper_disable(enum umh_disable_depth depth); +extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth); + +static inline int usermodehelper_disable(void) +{ + return __usermodehelper_disable(UMH_DISABLED); +} + +static inline void usermodehelper_enable(void) +{ + __usermodehelper_set_disable_depth(UMH_ENABLED); +} + +extern int usermodehelper_read_trylock(void); +extern long 
usermodehelper_read_lock_wait(long timeout); +extern void usermodehelper_read_unlock(void); + +#endif /* __LINUX_UMH_H__ */ -- cgit v1.2.3

From f22ef333c32cc683922d7e3361a83ebc31b2ac6d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 8 Sep 2017 16:17:15 -0700 Subject: cpumask: make cpumask_next() out-of-line

Every for_each_XXX_cpu() invocation calls cpumask_next(), which is an inline function:

    static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
    {
            /* -1 is a legal arg here. */
            if (n != -1)
                    cpumask_check(n);
            return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1);
    }

However! find_next_bit() is a regular out-of-line function, which means the "nr_cpu_ids" load and increment happen at the caller, resulting in a lot of bloat.

x86_64 defconfig: add/remove: 3/0 grow/shrink: 8/373 up/down: 155/-5668 (-5513) x86_64 allyesconfig-ish: add/remove: 3/1 grow/shrink: 57/634 up/down: 3515/-28177 (-24662) !!!

Some archs redefine find_next_bit(), but that is OK: m68k is inline, but SMP is not supported there; arm is out-of-line; unicore32 is out-of-line. A function call will happen anyway, so move the load and increment into the callee.

Link: http://lkml.kernel.org/r/20170824230010.GA1593@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 15 +-------------- lib/cpumask.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 68c5a8290275..cd415b733c2a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -178,20 +178,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } -/** - * cpumask_next - get the next cpu in a cpumask - * @n: the cpu prior to the place to search (ie. return will be > @n) - * @srcp: the cpumask pointer - * - * Returns >= nr_cpu_ids if no further cpus set. - */ -static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) -{ - /* -1 is a legal arg here. */ - if (n != -1) - cpumask_check(n); - return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); -} +unsigned int cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask diff --git a/lib/cpumask.c b/lib/cpumask.c index 4731a0895760..8b1a1bd77539 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -5,6 +5,22 @@ #include #include +/** + * cpumask_next - get the next cpu in a cpumask + * @n: the cpu prior to the place to search (ie. return will be > @n) + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if no further cpus set. + */ +unsigned int cpumask_next(int n, const struct cpumask *srcp) +{ + /* -1 is a legal arg here. */ + if (n != -1) + cpumask_check(n); + return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); +} +EXPORT_SYMBOL(cpumask_next); + /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (ie. return will be > @n) -- cgit v1.2.3

From a2d818030135c293f878fbb772cf40e7a14c5acc Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Fri, 8 Sep 2017 16:17:19 -0700 Subject: drivers/pps: aesthetic tweaks to PPS-related content

A collection of aesthetic adjustments to various PPS-related files, directories and Documentation, some quite minor and just for the sake of consistency, including: * Updated example of pps device tree node (courtesy Rodolfo G.)
* "PPS-API" -> "PPS API" * "pps_source_info_s" -> "pps_source_info" * "ktimer driver" -> "pps-ktimer driver" * "ppstest /dev/pps0" -> "ppstest /dev/pps1" to match example * Add missing PPS-related entries to MAINTAINERS file * Other trivialities Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1708261048220.8106@localhost.localdomain Signed-off-by: Robert P. J. Day Acked-by: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/pps/pps-gpio.txt | 8 +++- Documentation/pps/pps.txt | 44 +++++++++++----------- MAINTAINERS | 3 ++ include/linux/pps-gpio.h | 2 +- include/linux/pps_kernel.h | 16 ++++---- include/uapi/linux/pps.h | 4 +- kernel/time/timekeeping.c | 2 +- 7 files changed, 43 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt index 40bf9c3564a5..0de23b793657 100644 --- a/Documentation/devicetree/bindings/pps/pps-gpio.txt +++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt @@ -13,8 +13,12 @@ Optional properties: Example: pps { - compatible = "pps-gpio"; - gpios = <&gpio2 6 0>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pps>; + gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>; assert-falling-edge; + + compatible = "pps-gpio"; + status = "okay"; }; diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt index 1fdbd5447216..99f5d8c4c652 100644 --- a/Documentation/pps/pps.txt +++ b/Documentation/pps/pps.txt @@ -48,12 +48,12 @@ problem: time_pps_create(). This implies that the source has a /dev/... entry. This assumption is -ok for the serial and parallel port, where you can do something +OK for the serial and parallel port, where you can do something useful besides(!) the gathering of timestamps as it is the central -task for a PPS-API. But this assumption does not work for a single +task for a PPS API. But this assumption does not work for a single purpose GPIO line. In this case even basic file-related functionality (like read() and write()) makes no sense at all and should not be a -precondition for the use of a PPS-API. +precondition for the use of a PPS API. The problem can be simply solved if you consider that a PPS source is not always connected with a GPS data source. @@ -88,13 +88,13 @@ Coding example -------------- To register a PPS source into the kernel you should define a struct -pps_source_info_s as follows: +pps_source_info as follows: static struct pps_source_info pps_ktimer_info = { .name = "ktimer", .path = "", - .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \ - PPS_ECHOASSERT | \ + .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | + PPS_ECHOASSERT | PPS_CANWAIT | PPS_TSFMT_TSPEC, .echo = pps_ktimer_echo, .owner = THIS_MODULE, @@ -108,13 +108,13 @@ initialization routine as follows: The pps_register_source() prototype is: - int pps_register_source(struct pps_source_info_s *info, int default_params) + int pps_register_source(struct pps_source_info *info, int default_params) where "info" is a pointer to a structure that describes a particular PPS source, "default_params" tells the system what the initial default parameters for the device should be (it is obvious that these parameters must be a subset of ones defined in the struct -pps_source_info_s which describe the capabilities of the driver). +pps_source_info which describe the capabilities of the driver). 
Once you have registered a new PPS source into the system you can signal an assert event (for example in the interrupt handler routine) @@ -142,8 +142,10 @@ If the SYSFS filesystem is enabled in the kernel it provides a new class: Every directory is the ID of a PPS sources defined in the system and inside you find several files: - $ ls /sys/class/pps/pps0/ - assert clear echo mode name path subsystem@ uevent + $ ls -F /sys/class/pps/pps0/ + assert dev mode path subsystem@ + clear echo name power/ uevent + Inside each "assert" and "clear" file you can find the timestamp and a sequence number: @@ -154,32 +156,32 @@ sequence number: Where before the "#" is the timestamp in seconds; after it is the sequence number. Other files are: -* echo: reports if the PPS source has an echo function or not; + * echo: reports if the PPS source has an echo function or not; -* mode: reports available PPS functioning modes; + * mode: reports available PPS functioning modes; -* name: reports the PPS source's name; + * name: reports the PPS source's name; -* path: reports the PPS source's device path, that is the device the - PPS source is connected to (if it exists). + * path: reports the PPS source's device path, that is the device the + PPS source is connected to (if it exists). Testing the PPS support ----------------------- In order to test the PPS support even without specific hardware you can use -the ktimer driver (see the client subsection in the PPS configuration menu) +the pps-ktimer driver (see the client subsection in the PPS configuration menu) and the userland tools available in your distribution's pps-tools package, -http://linuxpps.org , or https://github.com/ago/pps-tools . +http://linuxpps.org , or https://github.com/redlab-i/pps-tools. -Once you have enabled the compilation of ktimer just modprobe it (if +Once you have enabled the compilation of pps-ktimer just modprobe it (if not statically compiled): - # modprobe ktimer + # modprobe pps-ktimer and the run ppstest as follow: - $ ./ppstest /dev/pps0 + $ ./ppstest /dev/pps1 trying PPS source "/dev/pps1" found PPS source "/dev/pps1" ok, found 1 source(s), now start fetching data... @@ -187,7 +189,7 @@ and the run ppstest as follow: source 0 - assert 1186592700.388931295, sequence: 365 - clear 0.000000000, sequence: 0 source 0 - assert 1186592701.389032765, sequence: 366 - clear 0.000000000, sequence: 0 -Please, note that to compile userland programs you need the file timepps.h . +Please note that to compile userland programs, you need the file timepps.h. This is available in the pps-tools repository mentioned above. 
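As an aside for readers unfamiliar with the userland half of this: what ppstest does reduces to a handful of RFC 2783 calls from the timepps.h mentioned above. A minimal fetch loop might look like the sketch below (device path is a hypothetical example; most error handling omitted):

    #include <stdio.h>
    #include <fcntl.h>
    #include <timepps.h>

    int main(void)
    {
            pps_handle_t handle;
            pps_info_t info;
            struct timespec timeout = { 3, 0 };     /* give up after 3 s */
            int fd = open("/dev/pps0", O_RDWR);     /* assumed device node */

            if (fd < 0 || time_pps_create(fd, &handle) < 0)
                    return 1;
            /* block until the next assert edge is captured */
            while (time_pps_fetch(handle, PPS_TSFMT_TSPEC, &info, &timeout) >= 0)
                    printf("assert %lld.%09ld, sequence: %lu\n",
                           (long long)info.assert_timestamp.tv_sec,
                           info.assert_timestamp.tv_nsec,
                           (unsigned long)info.assert_sequence);
            time_pps_destroy(handle);
            return 0;
    }
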
diff --git a/MAINTAINERS b/MAINTAINERS index ff3a349f24e4..109c5d9a04c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10725,8 +10725,11 @@ W: http://wiki.enneenne.com/index.php/LinuxPPS_support L: linuxpps@ml.enneenne.com (subscribers-only) S: Maintained F: Documentation/pps/ +F: Documentation/devicetree/bindings/pps/pps-gpio.txt +F: Documentation/ABI/testing/sysfs-pps F: drivers/pps/ F: include/linux/pps*.h +F: include/uapi/linux/pps.h PPTP DRIVER M: Dmitry Kozlov diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h index 0035abe41b9a..56f35dd3d01d 100644 --- a/include/linux/pps-gpio.h +++ b/include/linux/pps-gpio.h @@ -29,4 +29,4 @@ struct pps_gpio_platform_data { const char *gpio_label; }; -#endif +#endif /* _PPS_GPIO_H */ diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 35ac903956c7..80a980cc8d95 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -22,7 +22,6 @@ #define LINUX_PPS_KERNEL_H #include - #include #include #include @@ -35,9 +34,9 @@ struct pps_device; /* The specific PPS source info */ struct pps_source_info { - char name[PPS_MAX_NAME_LEN]; /* simbolic name */ + char name[PPS_MAX_NAME_LEN]; /* symbolic name */ char path[PPS_MAX_NAME_LEN]; /* path of connected device */ - int mode; /* PPS's allowed mode */ + int mode; /* PPS allowed mode */ void (*echo)(struct pps_device *pps, int event, void *data); /* PPS echo function */ @@ -57,10 +56,10 @@ struct pps_event_time { struct pps_device { struct pps_source_info info; /* PSS source info */ - struct pps_kparams params; /* PPS's current params */ + struct pps_kparams params; /* PPS current params */ - __u32 assert_sequence; /* PPS' assert event seq # */ - __u32 clear_sequence; /* PPS' clear event seq # */ + __u32 assert_sequence; /* PPS assert event seq # */ + __u32 clear_sequence; /* PPS clear event seq # */ struct pps_ktime assert_tu; struct pps_ktime clear_tu; int current_mode; /* PPS mode at event time */ @@ -69,7 +68,7 @@ struct pps_device { wait_queue_head_t queue; /* PPS event queue */ unsigned int id; /* PPS source unique ID */ - void const *lookup_cookie; /* pps_lookup_dev only */ + void const *lookup_cookie; /* For pps_lookup_dev() only */ struct cdev cdev; struct device *dev; struct fasync_struct *async_queue; /* fasync method */ @@ -101,7 +100,7 @@ extern struct pps_device *pps_register_source( extern void pps_unregister_source(struct pps_device *pps); extern void pps_event(struct pps_device *pps, struct pps_event_time *ts, int event, void *data); -/* Look up a pps device by magic cookie */ +/* Look up a pps_device by magic cookie */ struct pps_device *pps_lookup_dev(void const *cookie); static inline void timespec_to_pps_ktime(struct pps_ktime *kt, @@ -132,4 +131,3 @@ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta } #endif /* LINUX_PPS_KERNEL_H */ - diff --git a/include/uapi/linux/pps.h b/include/uapi/linux/pps.h index c1cb3825a8bc..c29d6b791c08 100644 --- a/include/uapi/linux/pps.h +++ b/include/uapi/linux/pps.h @@ -95,8 +95,8 @@ struct pps_kparams { #define PPS_CAPTURECLEAR 0x02 /* capture clear events */ #define PPS_CAPTUREBOTH 0x03 /* capture assert and clear events */ -#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert ev. */ -#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear ev. */ +#define PPS_OFFSETASSERT 0x10 /* apply compensation for assert event */ +#define PPS_OFFSETCLEAR 0x20 /* apply compensation for clear event */ #define PPS_CANWAIT 0x100 /* can we wait for an event? 
*/ #define PPS_CANPOLL 0x200 /* bit reserved for future use */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ea4fb315719..2cafb49aa65e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -#endif +#endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure -- cgit v1.2.3 From a2e0602c36ed9fe042714694dd5a889ecd8cb556 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 8 Sep 2017 16:17:38 -0700 Subject: ipc: convert ipc_namespace.count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/1499417992-3238-2-git-send-email-elena.reshetova@intel.com Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Alexey Dobriyan Cc: Serge Hallyn Cc: Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 5 +++-- ipc/msgutil.c | 2 +- ipc/namespace.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 65327ee0936b..e81445cc7c57 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -7,6 +7,7 @@ #include #include #include +#include struct user_namespace; @@ -19,7 +20,7 @@ struct ipc_ids { }; struct ipc_namespace { - atomic_t count; + refcount_t count; struct ipc_ids ids[3]; int sem_ctls[4]; @@ -118,7 +119,7 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - atomic_inc(&ns->count); + refcount_inc(&ns->count); return ns; } diff --git a/ipc/msgutil.c b/ipc/msgutil.c index bf74eaa5c39f..84598025a6ad 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -29,7 +29,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/ipc/namespace.c b/ipc/namespace.c index b4d80f9f7246..7af6e6b883b9 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -50,7 +50,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, goto fail_free; ns->ns.ops = &ipcns_operations; - atomic_set(&ns->count, 1); + refcount_set(&ns->count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -144,7 +144,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (atomic_dec_and_lock(&ns->count, &mq_lock)) { + if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); mq_put_mnt(ns); -- cgit v1.2.3 From 9405c03ee778cd3e353e55fff6e16dfdd9609c02 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 8 Sep 2017 16:17:45 -0700 Subject: ipc: convert kern_ipc_perm.refcount from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. 
This makes it possible to avoid accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/1499417992-3238-4-git-send-email-elena.reshetova@intel.com Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Ingo Molnar Cc: Alexey Dobriyan Cc: Serge Hallyn Cc: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc.h | 3 ++- ipc/util.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index fadd579d577d..ae68980e9d48 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -4,6 +4,7 @@ #include #include #include +#include #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ @@ -22,7 +23,7 @@ struct kern_ipc_perm { void *security; struct rcu_head rcu; - atomic_t refcount; + refcount_t refcount; } ____cacheline_aligned_in_smp __randomize_layout; #endif /* _LINUX_IPC_H */ diff --git a/ipc/util.c b/ipc/util.c index 1a2cb02467ab..069bb22c9f64 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -232,7 +232,7 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) idr_preload(GFP_KERNEL); - atomic_set(&new->refcount, 1); + refcount_set(&new->refcount, 1); spin_lock_init(&new->lock); new->deleted = false; rcu_read_lock(); @@ -397,13 +397,13 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) int ipc_rcu_getref(struct kern_ipc_perm *ptr) { - return atomic_inc_not_zero(&ptr->refcount); + return refcount_inc_not_zero(&ptr->refcount); } void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)) { - if (!atomic_dec_and_test(&ptr->refcount)) + if (!refcount_dec_and_test(&ptr->refcount)) return; call_rcu(&ptr->rcu, func); -- cgit v1.2.3

From 0cfb6aee70bddbef6ec796b255f588ce0e126766 Mon Sep 17 00:00:00 2001 From: Guillaume Knispel Date: Fri, 8 Sep 2017 16:17:55 -0700 Subject: ipc: optimize semget/shmget/msgget for lots of keys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

ipc_findkey() used to scan all objects to look for the wanted key. This is slow when using a high number of keys. This change adds an rhashtable of kern_ipc_perm objects in ipc_ids, so that a lookup ceases to be O(n).
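In outline, the patch threads a standard kernel rhashtable through the ipc_ids lifecycle. Condensed from the diff below (not a complete listing), the pattern is:

    /* one-time setup in ipc_init_ids() */
    err = rhashtable_init(&ids->key_ht, &ipc_kht_params);

    /* publish the key in ipc_addid(); private keys stay out of the table */
    if (new->key != IPC_PRIVATE)
            err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode,
                                         ipc_kht_params);

    /* ipc_findkey() becomes an O(1)-on-average hash lookup */
    ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params);

    /* ipc_rmid()/ipc_set_key_private() unpublish the key again */
    rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, ipc_kht_params);
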
This change gives an 865% improvement in the reaim.jobs_per_min benchmark on a 56-thread Intel(R) Xeon(R) CPU E5-2695 v3 @ 2.30GHz with 256G memory [1].

Other (more micro) benchmark results, by the author: On an i5 laptop, the following loop executed right after a reboot took, without and with this change:

    for (int i = 0, k = 0x424242; i < KEYS; ++i)
            semget(k++, 1, IPC_CREAT | 0600);

                 total         total    max single   max single
     KEYS      without          with  call without    call with
        1          3.5           4.9 µs        3.5          4.9
       10          7.6           8.6 µs        3.7          4.7
       32         16.2          15.9 µs        4.3          5.3
      100         72.9          41.8 µs        3.7          4.7
     1000      5,630.0         502.0 µs          *            *
    10000  1,340,000.0       7,240.0 µs          *            *
    31900 17,600,000.0      22,200.0 µs          *            *

    *: unreliable measure: high variance

The duration for a lookup-only usage was obtained by the same loop once the keys are present:

                 total         total    max single   max single
     KEYS      without          with  call without    call with
        1          2.1           2.5 µs        2.1          2.5
       10          4.5           4.8 µs        2.2          2.3
       32         13.0          10.8 µs        2.3          2.8
      100         82.9          25.1 µs          *          2.3
     1000      5,780.0         217.0 µs          *            *
    10000  1,470,000.0       2,520.0 µs          *            *
    31900 17,400,000.0       7,810.0 µs          *            *

Finally, executing each semget() in a new process gave, when still summing only the durations of these syscalls:

    creation:
                 total         total
     KEYS      without          with
        1          3.7           5.0 µs
       10         32.9          36.7 µs
       32        125.0         109.0 µs
      100        523.0         353.0 µs
     1000     20,300.0       3,280.0 µs
    10000  2,470,000.0      46,700.0 µs
    31900 27,800,000.0     219,000.0 µs

    lookup-only:
                 total         total
     KEYS      without          with
        1          2.5           2.7 µs
       10         25.4          24.4 µs
       32        106.0          72.6 µs
      100        591.0         352.0 µs
     1000     22,400.0       2,250.0 µs
    10000  2,510,000.0      25,700.0 µs
    31900 28,200,000.0     115,000.0 µs

[1] http://lkml.kernel.org/r/20170814060507.GE23258@yexl-desktop Link: http://lkml.kernel.org/r/20170815194954.ck32ta2z35yuzpwp@debix Signed-off-by: Guillaume Knispel Reviewed-by: Marc Pardo Cc: Davidlohr Bueso Cc: Kees Cook Cc: Manfred Spraul Cc: Alexey Dobriyan Cc: "Eric W.
Biederman" Cc: "Peter Zijlstra (Intel)" Cc: Ingo Molnar Cc: Sebastian Andrzej Siewior Cc: Serge Hallyn Cc: Andrey Vagin Cc: Guillaume Knispel Cc: Marc Pardo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc.h | 3 ++ include/linux/ipc_namespace.h | 3 ++ ipc/msg.c | 10 +++-- ipc/namespace.c | 20 +++++++-- ipc/sem.c | 11 +++-- ipc/shm.c | 12 ++--- ipc/util.c | 100 ++++++++++++++++++++++++++++++++---------- ipc/util.h | 21 +++++---- 8 files changed, 130 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index ae68980e9d48..92a2ccff80c5 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -22,6 +23,8 @@ struct kern_ipc_perm { unsigned long seq; void *security; + struct rhash_head khtnode; + struct rcu_head rcu; refcount_t refcount; } ____cacheline_aligned_in_smp __randomize_layout; diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e81445cc7c57..83f0bf7a587d 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -8,15 +8,18 @@ #include #include #include +#include struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; + bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; int next_id; + struct rhashtable key_ht; }; struct ipc_namespace { diff --git a/ipc/msg.c b/ipc/msg.c index 2c38f10d1483..df82bc9a5531 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1011,7 +1011,7 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, } -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; @@ -1019,7 +1019,7 @@ void msg_init_ns(struct ipc_namespace *ns) atomic_set(&ns->msg_bytes, 0); atomic_set(&ns->msg_hdrs, 0); - ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return ipc_init_ids(&ns->ids[IPC_MSG_IDS]); } #ifdef CONFIG_IPC_NS @@ -1027,6 +1027,7 @@ void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); + rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); } #endif @@ -1058,11 +1059,12 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) } #endif -void __init msg_init(void) +int __init msg_init(void) { - msg_init_ns(&init_ipc_ns); + const int err = msg_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); + return err; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 7af6e6b883b9..fc850c526698 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -54,16 +54,28 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - err = mq_init_ns(ns); + err = sem_init_ns(ns); if (err) goto fail_put; + err = msg_init_ns(ns); + if (err) + goto fail_destroy_sem; + err = shm_init_ns(ns); + if (err) + goto fail_destroy_msg; - sem_init_ns(ns); - msg_init_ns(ns); - shm_init_ns(ns); + err = mq_init_ns(ns); + if (err) + goto fail_destroy_shm; return ns; +fail_destroy_shm: + shm_exit_ns(ns); +fail_destroy_msg: + msg_exit_ns(ns); +fail_destroy_sem: + sem_exit_ns(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); diff --git a/ipc/sem.c b/ipc/sem.c index 4c0cfaad560c..013c7981f3c7 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -183,14 +183,14 @@ static int sysvipc_sem_proc_show(struct 
seq_file *s, void *it); #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] -void sem_init_ns(struct ipc_namespace *ns) +int sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; - ipc_init_ids(&ns->ids[IPC_SEM_IDS]); + return ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS @@ -198,15 +198,18 @@ void sem_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &sem_ids(ns), freeary); idr_destroy(&ns->ids[IPC_SEM_IDS].ipcs_idr); + rhashtable_destroy(&ns->ids[IPC_SEM_IDS].key_ht); } #endif -void __init sem_init(void) +int __init sem_init(void) { - sem_init_ns(&init_ipc_ns); + const int err = sem_init_ns(&init_ipc_ns); + ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); + return err; } /** diff --git a/ipc/shm.c b/ipc/shm.c index 8828b4c3a190..8fc97beb5234 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -72,14 +72,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif -void shm_init_ns(struct ipc_namespace *ns) +int shm_init_ns(struct ipc_namespace *ns) { ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_rmid_forced = 0; ns->shm_tot = 0; - ipc_init_ids(&shm_ids(ns)); + return ipc_init_ids(&shm_ids(ns)); } /* @@ -95,7 +95,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ - shp->shm_perm.key = IPC_PRIVATE; + ipc_set_key_private(&shm_ids(ns), &shp->shm_perm); shm_unlock(shp); } else shm_destroy(ns, shp); @@ -106,13 +106,15 @@ void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); + rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); } #endif static int __init ipc_ns_init(void) { - shm_init_ns(&init_ipc_ns); - return 0; + const int err = shm_init_ns(&init_ipc_ns); + WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err); + return err; } pure_initcall(ipc_ns_init); diff --git a/ipc/util.c b/ipc/util.c index 069bb22c9f64..78755873cc5b 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -83,27 +83,46 @@ struct ipc_proc_iface { */ static int __init ipc_init(void) { - sem_init(); - msg_init(); + int err_sem, err_msg; + + err_sem = sem_init(); + WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem); + err_msg = msg_init(); + WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg); shm_init(); - return 0; + + return err_msg ? err_msg : err_sem; } device_initcall(ipc_init); +static const struct rhashtable_params ipc_kht_params = { + .head_offset = offsetof(struct kern_ipc_perm, khtnode), + .key_offset = offsetof(struct kern_ipc_perm, key), + .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key), + .locks_mul = 1, + .automatic_shrinking = true, +}; + /** * ipc_init_ids - initialise ipc identifiers * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below IPCMNI) then initialise the ids idr. + * below IPCMNI) then initialise the keys hashtable and ids idr. 
*/ -void ipc_init_ids(struct ipc_ids *ids) +int ipc_init_ids(struct ipc_ids *ids) { + int err; ids->in_use = 0; ids->seq = 0; ids->next_id = -1; init_rwsem(&ids->rwsem); + err = rhashtable_init(&ids->key_ht, &ipc_kht_params); + if (err) + return err; idr_init(&ids->ipcs_idr); + ids->tables_initialized = true; + return 0; } #ifdef CONFIG_PROC_FS @@ -147,28 +166,20 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * Returns the locked pointer to the ipc structure if found or NULL * otherwise. If key is found ipc points to the owning ipc structure * - * Called with ipc_ids.rwsem held. + * Called with writer ipc_ids.rwsem held. */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { - struct kern_ipc_perm *ipc; - int next_id; - int total; - - for (total = 0, next_id = 0; total < ids->in_use; next_id++) { - ipc = idr_find(&ids->ipcs_idr, next_id); - - if (ipc == NULL) - continue; + struct kern_ipc_perm *ipcp = NULL; - if (ipc->key != key) { - total++; - continue; - } + if (likely(ids->tables_initialized)) + ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, + ipc_kht_params); + if (ipcp) { rcu_read_lock(); - ipc_lock_object(ipc); - return ipc; + ipc_lock_object(ipcp); + return ipcp; } return NULL; @@ -221,13 +232,13 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) { kuid_t euid; kgid_t egid; - int id; + int id, err; int next_id = ids->next_id; if (size > IPCMNI) size = IPCMNI; - if (ids->in_use >= size) + if (!ids->tables_initialized || ids->in_use >= size) return -ENOSPC; idr_preload(GFP_KERNEL); @@ -246,6 +257,15 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int size) (next_id < 0) ? 0 : ipcid_to_idx(next_id), 0, GFP_NOWAIT); idr_preload_end(); + + if (id >= 0 && new->key != IPC_PRIVATE) { + err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, + ipc_kht_params); + if (err < 0) { + idr_remove(&ids->ipcs_idr, id); + id = err; + } + } if (id < 0) { spin_unlock(&new->lock); rcu_read_unlock(); @@ -377,6 +397,20 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, return err; } +/** + * ipc_kht_remove - remove an ipc from the key hashtable + * @ids: ipc identifier set + * @ipcp: ipc perm structure containing the key to remove + * + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held + * before this function is called, and remain locked on the exit. + */ +static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) +{ + if (ipcp->key != IPC_PRIVATE) + rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params); +} /** * ipc_rmid - remove an ipc identifier @@ -391,10 +425,25 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) int lid = ipcid_to_idx(ipcp->id); idr_remove(&ids->ipcs_idr, lid); + ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; } +/** + * ipc_set_key_private - switch the key of an existing ipc to IPC_PRIVATE + * @ids: ipc identifier set + * @ipcp: ipc perm structure containing the key to modify + * + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held + * before this function is called, and remain locked on the exit. 
+ */ +void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) +{ + ipc_kht_remove(ids, ipcp); + ipcp->key = IPC_PRIVATE; +} + int ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); @@ -485,7 +534,7 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) } /** - * ipc_obtain_object + * ipc_obtain_object_idr * @ids: ipc identifier set * @id: ipc id to look for * @@ -499,6 +548,9 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) struct kern_ipc_perm *out; int lid = ipcid_to_idx(id); + if (unlikely(!ids->tables_initialized)) + return ERR_PTR(-EINVAL); + out = idr_find(&ids->ipcs_idr, lid); if (!out) return ERR_PTR(-EINVAL); diff --git a/ipc/util.h b/ipc/util.h index c692010e6f0a..80c9f51c3f07 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -15,8 +15,8 @@ #define SEQ_MULTIPLIER (IPCMNI) -void sem_init(void); -void msg_init(void); +int sem_init(void); +int msg_init(void); void shm_init(void); struct ipc_namespace; @@ -30,17 +30,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_SYSVIPC -void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); -void shm_init_ns(struct ipc_namespace *ns); +int sem_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); +int shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else -static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } -static inline void shm_init_ns(struct ipc_namespace *ns) { } +static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } +static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; } static inline void sem_exit_ns(struct ipc_namespace *ns) { } static inline void msg_exit_ns(struct ipc_namespace *ns) { } @@ -79,7 +79,7 @@ struct ipc_ops { struct seq_file; struct ipc_ids; -void ipc_init_ids(struct ipc_ids *); +int ipc_init_ids(struct ipc_ids *); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)); @@ -104,6 +104,9 @@ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ void ipc_rmid(struct ipc_ids *, struct kern_ipc_perm *); +/* must be called with both locks acquired. */ +void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); + /* must be called with ipcp locked */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); -- cgit v1.2.3 From 5a67da2a71c64daeb456f6f3e87b5c7cecdc5ffa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:49 -0700 Subject: bpf: add support for sockmap detach programs The bpf map sockmap supports adding programs via attach commands. This patch adds the detach command to keep the API symmetric and allow users to remove previously added programs. Otherwise the user would have to delete the map and re-add it to get in this state. This also adds a series of additional tests to capture detach operation and also attaching/detaching invalid prog types. API note: socks will run (or not run) programs depending on the state of the map at the time the sock is added. We do not for example walk the map and remove programs from previously attached socks. 
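From userspace, the resulting symmetry looks roughly like the sketch below, using the bpf_prog_attach()/bpf_prog_detach() wrappers exercised by the selftests in this patch (map_fd and prog_fd are assumed to be a valid sockmap and SK_SKB program; the include path is the tools/lib/bpf layout):

    #include <bpf/bpf.h>    /* libbpf wrappers; path may vary by setup */

    /* attach a stream parser program to the sockmap */
    err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_PARSER, 0);

    /* ...and later detach it, without destroying and re-creating the map */
    if (!err)
            err = bpf_prog_detach(map_fd, BPF_SK_SKB_STREAM_PARSER);
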
Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 +++--- kernel/bpf/sockmap.c | 2 +- kernel/bpf/syscall.c | 27 ++++++++++------- tools/testing/selftests/bpf/test_maps.c | 51 ++++++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c2cb1b5c094e..8390859e79e7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -385,16 +385,16 @@ static inline void __dev_map_flush(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); -int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { return NULL; } -static inline int sock_map_attach_prog(struct bpf_map *map, - struct bpf_prog *prog, - u32 type) +static inline int sock_map_prog(struct bpf_map *map, + struct bpf_prog *prog, + u32 type) { return -EOPNOTSUPP; } diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index f6ffde9c6a68..6424ce0e4969 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -792,7 +792,7 @@ out_progs: return err; } -int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 70ad8e220343..cb17e1cd1d43 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1096,10 +1096,10 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr) +static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) { + struct bpf_prog *prog = NULL; int ufd = attr->target_fd; - struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1109,16 +1109,20 @@ static int sockmap_get_from_fd(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); + if (attach) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, + BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { + fdput(f); + return PTR_ERR(prog); + } } - err = sock_map_attach_prog(map, prog, attr->attach_type); + err = sock_map_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog); + if (prog) + bpf_prog_put(prog); return err; } @@ -1155,7 +1159,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr); + return sockmap_get_from_fd(attr, true); default: return -EINVAL; } @@ -1204,7 +1208,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); cgroup_put(cgrp); break; - + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + ret = sockmap_get_from_fd(attr, false); + break; default: return -EINVAL; } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 4acc772a28c0..fe3a443a1102 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ 
b/tools/testing/selftests/bpf/test_maps.c @@ -558,7 +558,7 @@ static void test_sockmap(int tasks, void *data) } } - /* Test attaching bad fds */ + /* Test attaching/detaching bad fds */ err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0); if (!err) { printf("Failed invalid parser prog attach\n"); @@ -571,6 +571,30 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_attach(-1, fd, __MAX_BPF_ATTACH_TYPE, 0); + if (!err) { + printf("Failed unknown prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_PARSER); + if (err) { + printf("Failed empty parser prog detach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_VERDICT); + if (err) { + printf("Failed empty verdict prog detach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(fd, __MAX_BPF_ATTACH_TYPE); + if (!err) { + printf("Detach invalid prog successful\n"); + goto out_sockmap; + } + /* Load SK_SKB program and Attach */ err = bpf_prog_load(SOCKMAP_PARSE_PROG, BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog); @@ -643,6 +667,13 @@ static void test_sockmap(int tasks, void *data) goto out_sockmap; } + err = bpf_prog_attach(verdict_prog, map_fd_rx, + __MAX_BPF_ATTACH_TYPE, 0); + if (!err) { + printf("Attached unknown bpf prog\n"); + goto out_sockmap; + } + /* Test map update elem afterwards fd lives in fd and map_fd */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY); @@ -809,6 +840,24 @@ static void test_sockmap(int tasks, void *data) assert(status == 0); } + err = bpf_prog_detach(map_fd_rx, __MAX_BPF_ATTACH_TYPE); + if (!err) { + printf("Detached an invalid prog type.\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(map_fd_rx, BPF_SK_SKB_STREAM_PARSER); + if (err) { + printf("Failed parser prog detach\n"); + goto out_sockmap; + } + + err = bpf_prog_detach(map_fd_rx, BPF_SK_SKB_STREAM_VERDICT); + if (err) { + printf("Failed parser prog detach\n"); + goto out_sockmap; + } + /* Test map close sockets */ for (i = 0; i < 6; i++) close(sfd[i]); -- cgit v1.2.3 From 9beb8bedb05c5f3a353dba62b8fa7cbbb9ec685e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 Sep 2017 01:40:35 +0200 Subject: bpf: make error reporting in bpf_warn_invalid_xdp_action more clear Differ between illegal XDP action code and just driver unsupported one to provide better feedback when we throw a one-time warning here. Reason is that with 814abfabef3c ("xdp: add bpf_redirect helper function") not all drivers support the new XDP return code yet and thus they will fall into their 'default' case when checking for return codes after program return, which then triggers a bpf_warn_invalid_xdp_action() stating that the return code is illegal, but from XDP perspective it's not. I decided not to place something like a XDP_ACT_MAX define into uapi i) given we don't have this either for all other program types, ii) future action codes could have further encoding there, which would render such define unsuitable and we wouldn't be able to rip it out again, and iii) we rarely add new action codes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ba848b761cfb..43ab5c402f98 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -766,8 +766,8 @@ struct bpf_sock { /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. Unknown return codes will result - * in packet drop. + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). */ enum xdp_action { XDP_ABORTED = 0, diff --git a/net/core/filter.c b/net/core/filter.c index 0848df2cd9bf..3a50a9b021e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3609,7 +3609,11 @@ static bool xdp_is_valid_access(int off, int size, void bpf_warn_invalid_xdp_action(u32 act) { - WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); + const u32 act_max = XDP_REDIRECT; + + WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -- cgit v1.2.3

From 0bb9c2b27f5e503e683ad27a0f0a6286f5b5da34 Mon Sep 17 00:00:00 2001 From: Dhinakaran Pandiyan Date: Wed, 6 Sep 2017 17:14:58 -0700 Subject: drm/dp/mst: Sideband message transaction to power up/down nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

The POWER_DOWN_PHY and POWER_UP_PHY sideband message transactions allow the source to request any node in an MST path, or a whole path, to be powered down or up. This allows drivers to target a specific sink in the MST topology, an improvement over just power managing the immediate downstream device. Secondly, since the request-reply protocol waits for an ACK, we can be sure that a downstream sink has enough time to respond to a power up/down request.
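For a driver, the new entry point added below is a single call. A hypothetical suspend/resume path (mgr and port assumed to be valid, referenced topology objects) might do:

    /* power down the whole path to this MST sink before suspend */
    ret = drm_dp_send_power_updown_phy(mgr, port, false);
    if (ret)
            DRM_DEBUG_KMS("MST power-down failed: %d\n", ret);

    /* ...and power it back up on resume */
    ret = drm_dp_send_power_updown_phy(mgr, port, true);
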
v2: Fix memory leak (Lyude) Cc: Lyude Cc: Ville Syrjälä Cc: Harry Wentland Signed-off-by: Dhinakaran Pandiyan Link: https://patchwork.freedesktop.org/patch/msgid/20170907001458.9399-1-dhinakaran.pandiyan@intel.com Reviewed-by: Lyude Paul Signed-off-by: Ville Syrjälä --- drivers/gpu/drm/drm_dp_mst_topology.c | 74 +++++++++++++++++++++++++++++++++++ include/drm/drm_dp_mst_helper.h | 2 + 2 files changed, 76 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index 41b492f99955..70dcfa58d3c2 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -294,6 +294,12 @@ static void drm_dp_encode_sideband_req(struct drm_dp_sideband_msg_req_body *req, memcpy(&buf[idx], req->u.i2c_write.bytes, req->u.i2c_write.num_bytes); idx += req->u.i2c_write.num_bytes; break; + + case DP_POWER_DOWN_PHY: + case DP_POWER_UP_PHY: + buf[idx] = (req->u.port_num.port_number & 0xf) << 4; + idx++; + break; } raw->cur_len = idx; } @@ -538,6 +544,21 @@ fail_len: return false; } +static bool drm_dp_sideband_parse_power_updown_phy_ack(struct drm_dp_sideband_msg_rx *raw, + struct drm_dp_sideband_msg_reply_body *repmsg) +{ + int idx = 1; + + repmsg->u.port_number.port_number = (raw->msg[idx] >> 4) & 0xf; + idx++; + if (idx > raw->curlen) { + DRM_DEBUG_KMS("power up/down phy parse length fail %d %d\n", + idx, raw->curlen); + return false; + } + return true; +} + static bool drm_dp_sideband_parse_reply(struct drm_dp_sideband_msg_rx *raw, struct drm_dp_sideband_msg_reply_body *msg) { @@ -567,6 +588,9 @@ static bool drm_dp_sideband_parse_reply(struct drm_dp_sideband_msg_rx *raw, return drm_dp_sideband_parse_enum_path_resources_ack(raw, msg); case DP_ALLOCATE_PAYLOAD: return drm_dp_sideband_parse_allocate_payload_ack(raw, msg); + case DP_POWER_DOWN_PHY: + case DP_POWER_UP_PHY: + return drm_dp_sideband_parse_power_updown_phy_ack(raw, msg); default: DRM_ERROR("Got unknown reply 0x%02x\n", msg->req_type); return false; @@ -693,6 +717,22 @@ static int build_allocate_payload(struct drm_dp_sideband_msg_tx *msg, int port_n return 0; } +static int build_power_updown_phy(struct drm_dp_sideband_msg_tx *msg, + int port_num, bool power_up) +{ + struct drm_dp_sideband_msg_req_body req; + + if (power_up) + req.req_type = DP_POWER_UP_PHY; + else + req.req_type = DP_POWER_DOWN_PHY; + + req.u.port_num.port_number = port_num; + drm_dp_encode_sideband_req(&req, msg); + msg->path_msg = true; + return 0; +} + static int drm_dp_mst_assign_payload_id(struct drm_dp_mst_topology_mgr *mgr, struct drm_dp_vcpi *vcpi) { @@ -1724,6 +1764,40 @@ fail_put: return ret; } +int drm_dp_send_power_updown_phy(struct drm_dp_mst_topology_mgr *mgr, + struct drm_dp_mst_port *port, bool power_up) +{ + struct drm_dp_sideband_msg_tx *txmsg; + int len, ret; + + port = drm_dp_get_validated_port_ref(mgr, port); + if (!port) + return -EINVAL; + + txmsg = kzalloc(sizeof(*txmsg), GFP_KERNEL); + if (!txmsg) { + drm_dp_put_port(port); + return -ENOMEM; + } + + txmsg->dst = port->parent; + len = build_power_updown_phy(txmsg, port->port_num, power_up); + drm_dp_queue_down_tx(mgr, txmsg); + + ret = drm_dp_mst_wait_tx_reply(port->parent, txmsg); + if (ret > 0) { + if (txmsg->reply.reply_type == 1) + ret = -EINVAL; + else + ret = 0; + } + kfree(txmsg); + drm_dp_put_port(port); + + return ret; +} +EXPORT_SYMBOL(drm_dp_send_power_updown_phy); + static int drm_dp_create_payload_step1(struct drm_dp_mst_topology_mgr *mgr, int id, struct drm_dp_payload *payload) diff --git 
a/include/drm/drm_dp_mst_helper.h b/include/drm/drm_dp_mst_helper.h index d55abb75f29a..7f78d26a0766 100644 --- a/include/drm/drm_dp_mst_helper.h +++ b/include/drm/drm_dp_mst_helper.h @@ -631,5 +631,7 @@ int drm_dp_atomic_find_vcpi_slots(struct drm_atomic_state *state, int drm_dp_atomic_release_vcpi_slots(struct drm_atomic_state *state, struct drm_dp_mst_topology_mgr *mgr, int slots); +int drm_dp_send_power_updown_phy(struct drm_dp_mst_topology_mgr *mgr, + struct drm_dp_mst_port *port, bool power_up); #endif -- cgit v1.2.3

From c3ca015fab6df124c933b91902f3f2a3473f9da5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 31 Aug 2017 21:47:43 -0400 Subject: dax: remove the pmem_dax_ops->flush abstraction

Commit abebfbe2f731 ("dm: add ->flush() dax operation support") is buggy. A DM device may be composed of multiple underlying devices, and all of them need to be flushed. That commit just routes the flush request to the first device and ignores the other devices.

It could be fixed by adding more complex logic to the device mapper. But there is only one implementation of the method pmem_dax_ops->flush - that is pmem_dax_flush() - and it calls arch_wb_cache_pmem(). Consequently, we don't need the pmem_dax_ops->flush abstraction at all; we can call arch_wb_cache_pmem() directly from dax_flush(), because dax_dev->ops->flush can't ever reach anything other than arch_wb_cache_pmem().

It should also be pointed out that some uses of persistent memory need to flush only a very small amount of data (such as one cache line), and going through the device mapper machinery for a single flushed cache line would be overkill.

Fix this by removing the pmem_dax_ops->flush abstraction and calling arch_wb_cache_pmem() directly from dax_flush(). Also, remove the device mapper code that forwards the flushes.
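The resulting hot path is short. Restated from the drivers/dax/super.c hunk below, dax_flush() (now without a pgoff argument) reduces to a liveness/write-cache check plus a direct cache writeback:

    #ifdef CONFIG_ARCH_HAS_PMEM_API
    void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
    {
            if (unlikely(!dax_alive(dax_dev)))
                    return;
            if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
                    return;
            /* no per-target indirection: write the cache lines back directly */
            arch_wb_cache_pmem(addr, size);
    }
    #endif
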
Fixes: abebfbe2f731 ("dm: add ->flush() dax operation support") Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Reviewed-by: Dan Williams Signed-off-by: Mike Snitzer --- drivers/dax/super.c | 21 ++++++++++++++------- drivers/md/dm-linear.c | 15 --------------- drivers/md/dm-stripe.c | 20 -------------------- drivers/md/dm.c | 19 ------------------- drivers/nvdimm/pmem.c | 7 ------- fs/dax.c | 4 ++-- include/linux/dax.h | 5 +---- include/linux/device-mapper.h | 3 --- 8 files changed, 17 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 938eb4868f7f..8b458f1b30c7 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -189,8 +189,10 @@ static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) if (!dax_dev) return 0; - if (a == &dev_attr_write_cache.attr && !dax_dev->ops->flush) +#ifndef CONFIG_ARCH_HAS_PMEM_API + if (a == &dev_attr_write_cache.attr) return 0; +#endif return a->mode; } @@ -255,18 +257,23 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, } EXPORT_SYMBOL_GPL(dax_copy_from_iter); -void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, - size_t size) +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_wb_cache_pmem(void *addr, size_t size); +void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) { - if (!dax_alive(dax_dev)) + if (unlikely(!dax_alive(dax_dev))) return; - if (!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)) + if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) return; - if (dax_dev->ops->flush) - dax_dev->ops->flush(dax_dev, pgoff, addr, size); + arch_wb_cache_pmem(addr, size); } +#else +void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) +{ +} +#endif EXPORT_SYMBOL_GPL(dax_flush); void dax_write_cache(struct dax_device *dax_dev, bool wc) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 41971a090e34..208800610af8 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -184,20 +184,6 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } -static void linear_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, - size_t size) -{ - struct linear_c *lc = ti->private; - struct block_device *bdev = lc->dev->bdev; - struct dax_device *dax_dev = lc->dev->dax_dev; - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; - - dev_sector = linear_map_sector(ti, sector); - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) - return; - dax_flush(dax_dev, pgoff, addr, size); -} - static struct target_type linear_target = { .name = "linear", .version = {1, 4, 0}, @@ -212,7 +198,6 @@ static struct target_type linear_target = { .iterate_devices = linear_iterate_devices, .direct_access = linear_dax_direct_access, .dax_copy_from_iter = linear_dax_copy_from_iter, - .dax_flush = linear_dax_flush, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a0375530b07f..1690bb299b3f 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -351,25 +351,6 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); } -static void stripe_dax_flush(struct dm_target *ti, pgoff_t pgoff, void *addr, - size_t size) -{ - sector_t dev_sector, sector = pgoff * PAGE_SECTORS; - struct stripe_c *sc = ti->private; - struct dax_device *dax_dev; - struct 
block_device *bdev; - uint32_t stripe; - - stripe_map_sector(sc, sector, &stripe, &dev_sector); - dev_sector += sc->stripe[stripe].physical_start; - dax_dev = sc->stripe[stripe].dev->dax_dev; - bdev = sc->stripe[stripe].dev->bdev; - - if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(size, PAGE_SIZE), &pgoff)) - return; - dax_flush(dax_dev, pgoff, addr, size); -} - /* * Stripe status: * @@ -491,7 +472,6 @@ static struct target_type stripe_target = { .io_hints = stripe_io_hints, .direct_access = stripe_dax_direct_access, .dax_copy_from_iter = stripe_dax_copy_from_iter, - .dax_flush = stripe_dax_flush, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d669fddd9290..825eaffc24da 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -987,24 +987,6 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } -static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, - size_t size) -{ - struct mapped_device *md = dax_get_private(dax_dev); - sector_t sector = pgoff * PAGE_SECTORS; - struct dm_target *ti; - int srcu_idx; - - ti = dm_dax_get_live_target(md, sector, &srcu_idx); - - if (!ti) - goto out; - if (ti->type->dax_flush) - ti->type->dax_flush(ti, pgoff, addr, size); - out: - dm_put_live_table(md, srcu_idx); -} - /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH. @@ -2992,7 +2974,6 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, .copy_from_iter = dm_dax_copy_from_iter, - .flush = dm_dax_flush, }; /* diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index f7099adaabc0..88c128258760 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -243,16 +243,9 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, return copy_from_iter_flushcache(addr, bytes, i); } -static void pmem_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, - void *addr, size_t size) -{ - arch_wb_cache_pmem(addr, size); -} - static const struct dax_operations pmem_dax_ops = { .direct_access = pmem_dax_direct_access, .copy_from_iter = pmem_copy_from_iter, - .flush = pmem_dax_flush, }; static const struct attribute_group *pmem_attribute_groups[] = { diff --git a/fs/dax.c b/fs/dax.c index 865d42c63e23..18d970fb0e09 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -783,7 +783,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - dax_flush(dax_dev, pgoff, kaddr, size); + dax_flush(dax_dev, kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. 
There * cannot be new dirty data in the pfn after the flush has completed as @@ -978,7 +978,7 @@ int __dax_zero_page_range(struct block_device *bdev, return rc; } memset(kaddr + offset, 0, size); - dax_flush(dax_dev, pgoff, kaddr + offset, size); + dax_flush(dax_dev, kaddr + offset, size); dax_read_unlock(id); } return 0; diff --git a/include/linux/dax.h b/include/linux/dax.h index df97b7af7e2c..0d8f35f6c53d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -19,8 +19,6 @@ struct dax_operations { /* copy_from_iter: required operation for fs-dax direct-i/o */ size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); - /* flush: optional driver-specific cache management after writes */ - void (*flush)(struct dax_device *, pgoff_t, void *, size_t); }; extern struct attribute_group dax_attribute_group; @@ -84,8 +82,7 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); -void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, - size_t size); +void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); void dax_write_cache(struct dax_device *dax_dev, bool wc); bool dax_write_cache_enabled(struct dax_device *dax_dev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3b5fdf308148..a5538433c927 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -134,8 +134,6 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); typedef size_t (*dm_dax_copy_from_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); -typedef void (*dm_dax_flush_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, - size_t size); #define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); @@ -186,7 +184,6 @@ struct target_type { dm_io_hints_fn io_hints; dm_dax_direct_access_fn direct_access; dm_dax_copy_from_iter_fn dax_copy_from_iter; - dm_dax_flush_fn dax_flush; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From f8e9ec16611baa8db77a7d46facd2ba7aa525955 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 7 Sep 2017 17:36:36 -0700 Subject: block: tolerate tracing of NULL bio __get_request() can call trace_block_getrq() with bio=NULL which causes block_get_rq::TP_fast_assign() to deref a NULL pointer and panic. 
Syzkaller fuzzer panics with linux-next (1d53d908b79d7870d89063062584eead4cf83448): kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 2983 Comm: syzkaller401111 Not tainted 4.13.0-rc7-next-20170901+ #13 task: ffff8801cf1da000 task.stack: ffff8801ce440000 RIP: 0010:perf_trace_block_get_rq+0x697/0x970 include/trace/events/block.h:384 RSP: 0018:ffff8801ce4473f0 EFLAGS: 00010246 RAX: ffff8801cf1da000 RBX: 1ffff10039c88e84 RCX: 1ffffd1ffff84d27 RDX: dffffc0000000001 RSI: 1ffff1003b643e7a RDI: ffffe8ffffc26938 RBP: ffff8801ce447530 R08: 1ffff1003b643e6c R09: ffffe8ffffc26964 R10: 0000000000000002 R11: fffff91ffff84d2d R12: ffffe8ffffc1f890 R13: ffffe8ffffc26930 R14: ffffffff85cad9e0 R15: 0000000000000000 FS: 0000000002641880(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000043e670 CR3: 00000001d1d7a000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: trace_block_getrq include/trace/events/block.h:423 [inline] __get_request block/blk-core.c:1283 [inline] get_request+0x1518/0x23b0 block/blk-core.c:1355 blk_old_get_request block/blk-core.c:1402 [inline] blk_get_request+0x1d8/0x3c0 block/blk-core.c:1427 sg_scsi_ioctl+0x117/0x750 block/scsi_ioctl.c:451 sg_ioctl+0x192d/0x2ed0 drivers/scsi/sg.c:1070 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe block_get_rq::TP_fast_assign() has multiple redundant ->dev assignments. Only one of them is NULL tolerant. Favor the NULL tolerant one. Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Greg Thelen Signed-off-by: Jens Axboe --- include/trace/events/block.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/block.h b/include/trace/events/block.h index f815aaaef755..1fd7ff1a46f7 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -397,7 +397,6 @@ DECLARE_EVENT_CLASS(block_get_rq, TP_fast_assign( __entry->dev = bio ? bio_dev(bio) : 0; - __entry->dev = bio_dev(bio); __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? 
bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, @@ -414,7 +413,7 @@ DECLARE_EVENT_CLASS(block_get_rq, /** * block_getrq - get a free request entry in queue for block IO operations * @q: queue for operations - * @bio: pending block IO operation + * @bio: pending block IO operation (can be %NULL) * @rw: low bit indicates a read (%0) or a write (%1) * * A request struct for queue @q has been allocated to handle the @@ -430,7 +429,7 @@ DEFINE_EVENT(block_get_rq, block_getrq, /** * block_sleeprq - waiting to get a free request entry in queue for block IO operation * @q: queue for operation - * @bio: pending block IO operation + * @bio: pending block IO operation (can be %NULL) * @rw: low bit indicates a read (%0) or a write (%1) * * In the case where a request struct cannot be provided for queue @q -- cgit v1.2.3 From 044a9df1a7cbb89f48fcc0e9e39997989342966b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Sep 2017 12:09:28 -0400 Subject: nvme-pci: implement the HMB entry number and size limitations Adds support for the new Host Memory Buffer Minimum Descriptor Entry Size and Host Memory Maximum Descriptors Entries fields that were added in TP 4002 HMB Enhancements. These allow the controller to advertise limits for the usable number of segments in the host memory buffer, as well as a minimum usable per-segment size. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/nvme.h | 3 +++ drivers/nvme/host/pci.c | 6 +++++- include/linux/nvme.h | 4 +++- 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8040fc14fd15..acc816b67582 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1897,6 +1897,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->cntlid = le16_to_cpu(id->cntlid); ctrl->hmpre = le32_to_cpu(id->hmpre); ctrl->hmmin = le32_to_cpu(id->hmmin); + ctrl->hmminds = le32_to_cpu(id->hmminds); + ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } kfree(id); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b8ba7c85e61b..d3f3c4447515 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -181,8 +181,11 @@ struct nvme_ctrl { u64 ps_max_latency_us; bool apst_enabled; + /* PCIe only: */ u32 hmpre; u32 hmmin; + u32 hmminds; + u16 hmmaxd; /* Fabrics only */ u16 sqsize; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5ed12fbfaad6..4a2121335f48 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1625,6 +1625,10 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, tmp = (preferred + chunk_size - 1); do_div(tmp, chunk_size); max_entries = tmp; + + if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) + max_entries = dev->ctrl.hmmaxd; + descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs), &descs_dma, GFP_KERNEL); if (!descs) @@ -1681,7 +1685,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) /* start big and work our way down */ for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= PAGE_SIZE * 2; + chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5144f9103723..87723c86f136 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -226,7 +226,9 @@
struct nvme_id_ctrl { __le16 mntmt; __le16 mxtmt; __le32 sanicap; - __u8 rsvd332[180]; + __le32 hmminds; + __le16 hmmaxd; + __u8 rsvd338[174]; __u8 sqes; __u8 cqes; __le16 maxcmd; -- cgit v1.2.3 From 1359798f9d4082eb04575efdd19512fbd9c28464 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 6 Sep 2017 14:36:57 +0200 Subject: string.h: un-fortify memcpy_and_pad The way I'd implemented the new helper memcpy_and_pad with __FORTIFY_INLINE caused compiler warnings for certain kernel configurations. This helper is only used in a single place at this time, and thus doesn't benefit much from fortification. So simplify the code by dropping fortification support for now. Fixes: 01f33c336e2d ("string.h: add memcpy_and_pad()") Signed-off-by: Martin Wilck Acked-by: Arnd Bergmann Signed-off-by: Christoph Hellwig --- include/linux/string.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index e1eeb0a8a969..54d21783e18d 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -434,20 +434,9 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) * @count: The number of bytes to copy * @pad: Character to use for padding if space is left in destination. */ -__FORTIFY_INLINE void memcpy_and_pad(void *dest, size_t dest_len, - const void *src, size_t count, int pad) +static inline void memcpy_and_pad(void *dest, size_t dest_len, + const void *src, size_t count, int pad) { - size_t dest_size = __builtin_object_size(dest, 0); - size_t src_size = __builtin_object_size(src, 0); - - if (__builtin_constant_p(dest_len) && __builtin_constant_p(count)) { - if (dest_size < dest_len && dest_size < count) - __write_overflow(); - else if (src_size < dest_len && src_size < count) - __read_overflow3(); - } - if (dest_size < dest_len) - fortify_panic(__func__); if (dest_len > count) { memcpy(dest, src, count); memset(dest + count, pad, dest_len - count); -- cgit v1.2.3 From 609320c8a22715b74b39796930c3542719f8ab62 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 7 Sep 2017 18:36:15 -0700 Subject: perf/bpf: fix a clang compilation issue clang does not support variable length arrays as structure members. It emits the following error during compilation: kernel/trace/trace_syscalls.c:568:17: error: fields must have a constant size: 'variable length array in structure' extension will never be supported unsigned long args[sys_data->nb_args]; ^ The fix is to use a fixed array length instead. Reported-by: Nick Desaulniers Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/syscalls.h | 2 ++ kernel/trace/trace_syscalls.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88951b795ee3..95606a2d556f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -200,6 +200,8 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE_MAXARGS 6 + #define SYSCALL_DEFINEx(x, sname, ...)
\ SYSCALL_METADATA(sname, x, __VA_ARGS__) \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9c4eef20301c..696afe72d3b1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -565,7 +565,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; - unsigned long args[sys_data->nb_args]; + unsigned long args[SYSCALL_DEFINE_MAXARGS]; } param; int i; -- cgit v1.2.3 From 96c5508e3012ed0984ab93821d64ac1ff3279c09 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 10 Sep 2017 09:47:02 +0200 Subject: xdp: implement xdp_redirect_map for generic XDP Using bpf_redirect_map is allowed for generic XDP programs, but the appropriate map lookup was never performed in xdp_do_generic_redirect(). Instead the map-index is directly used as the ifindex. For the xdp_redirect_map sample in SKB-mode '-S', this resulted in trying to send on ifindex 0, which isn't valid, so the SKB packets were dropped. Thus, the reported performance numbers are wrong in commit 24251c264798 ("samples/bpf: add option for native and skb mode for redirect apps") for the 'xdp_redirect_map -S' case. Before commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") it could crash the kernel. Like that commit, also check that map_owner is correct before dereferencing the map pointer. But make sure that this API misuse can be caught by a tracepoint, allowing userspace to detect misbehaving bpf_progs via tracepoints. Fixes: 6103aa96ec07 ("net: implement XDP_REDIRECT for xdp generic") Fixes: 24251c264798 ("samples/bpf: add option for native and skb mode for redirect apps") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/xdp.h | 4 ++-- net/core/filter.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 862575ac8da9..4e16c43fba10 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -138,11 +138,11 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \ - 0, map, idx); + 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \ - err, map, idx); + err, map, idx) #endif /* _TRACE_XDP_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 3a50a9b021e2..24dd33dd9f04 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2506,21 +2506,19 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); const struct bpf_prog *map_owner = ri->map_owner; struct bpf_map *map = ri->map; + struct net_device *fwd = NULL; u32 index = ri->ifindex; - struct net_device *fwd; int err; ri->ifindex = 0; ri->map = NULL; ri->map_owner = NULL; - /* This is really only caused by a deliberately crappy - * BPF program, normally we would never hit that case, - * so no need to inform someone via tracepoints either, - * just bail out.
- */ - if (unlikely(map_owner != xdp_prog)) - return -EINVAL; + if (unlikely(map_owner != xdp_prog)) { + err = -EFAULT; + map = NULL; + goto err; + } fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2576,13 +2574,27 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + const struct bpf_prog *map_owner = ri->map_owner; + struct bpf_map *map = ri->map; + struct net_device *fwd = NULL; u32 index = ri->ifindex; - struct net_device *fwd; unsigned int len; int err = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; + ri->map = NULL; + ri->map_owner = NULL; + + if (map) { + if (unlikely(map_owner != xdp_prog)) { + err = -EFAULT; + map = NULL; + goto err; + } + fwd = __dev_map_lookup_elem(map, index); + } else { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + } if (unlikely(!fwd)) { err = -EINVAL; goto err; @@ -2600,10 +2612,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, } skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); + map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) + : _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) + : _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); -- cgit v1.2.3 From d7fb60b9cafb982cb2e46a267646a8dfd4f2e5da Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Sep 2017 16:33:30 -0700 Subject: net_sched: get rid of tcfa_rcu gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), so the caller no longer needs to wait for a grace period. This patch gets rid of it. This also completely closes a race condition between the action free path and the filter chain add/remove path for the following patch, because otherwise the nested RCU callback can't be caught by rcu_barrier(). Please also see the comments in the code. Cc: Jiri Pirko Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 2 -- net/sched/act_api.c | 17 ++++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8f3d5d8b5ae0..b944e0eb93be 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -34,7 +34,6 @@ struct tc_action { struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; - struct rcu_head tcfa_rcu; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie *act_cookie; @@ -50,7 +49,6 @@ struct tc_action { #define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock -#define tcf_rcu common.tcfa_rcu /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice.
diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a306974e2fb4..fcd7dc7b807a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -53,10 +53,13 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } -static void free_tcf(struct rcu_head *head) +/* XXX: For standalone actions, we don't need a RCU grace period either, because + * actions are always connected to filters and filters are already destroyed in + * RCU callbacks, so after a RCU grace period actions are already disconnected + * from filters. Readers later can not find us. + */ +static void free_tcf(struct tc_action *p) { - struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); - free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); @@ -76,11 +79,7 @@ static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) idr_remove_ext(&idrinfo->action_idr, p->tcfa_index); spin_unlock_bh(&idrinfo->lock); gen_kill_estimator(&p->tcfa_rate_est); - /* - * gen_estimator est_timer() might access p->tcfa_lock - * or bstats, wait a RCU grace period before freeing p - */ - call_rcu(&p->tcfa_rcu, free_tcf); + free_tcf(p); } int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) @@ -259,7 +258,7 @@ void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est) { if (est) gen_kill_estimator(&a->tcfa_rate_est); - call_rcu(&a->tcfa_rcu, free_tcf); + free_tcf(a); } EXPORT_SYMBOL(tcf_idr_cleanup); -- cgit v1.2.3 From 6f8bcc744aad50d719845e4ce06a7831e96e1109 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Tue, 12 Sep 2017 15:37:44 +0200 Subject: drm/atomic: Prepare drm_modeset_lock infrastructure for interruptible waiting, v2. When we want to make drm_atomic_commit interruptible, there are a lot of places that call the lock function, which we don't have control over. Rather than trying to convert every single one, it's easier to toggle interruptible waiting per acquire_ctx. If drm_modeset_acquire_init is called with DRM_MODESET_ACQUIRE_INTERRUPTIBLE, then we will perform interruptible waits in drm_modeset_lock and drm_modeset_backoff. Changes since v1: - Fix locking example in drm_modeset_lock.c to be compatible with interruptible waiting (xexaxo) and make it default. Uninterruptible waiting shouldn't happen except in corner cases, but the example will still apply if the flag is removed. - Add drm_modeset_lock_single_interruptible() to documentation. - Fix dead link to removed drm_modeset_lock_interruptible() in drm_modeset_lock(). 
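In sketch form, a caller that opts in to interruptible waits now looks like this (a minimal example around a single hypothetical helper, example_grab_lock(); it mirrors the updated locking pattern documented in the drm_modeset_lock.c hunk below):

static int example_grab_lock(struct drm_modeset_lock *lock)
{
	struct drm_modeset_acquire_ctx ctx;
	int ret;

	drm_modeset_acquire_init(&ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE);
retry:
	ret = drm_modeset_lock(lock, &ctx);	/* may now return -ERESTARTSYS */
	if (ret == -EDEADLK) {
		ret = drm_modeset_backoff(&ctx);	/* now returns an int, too */
		if (!ret)
			goto retry;
	}
	if (!ret) {
		/* ... touch the state protected by the lock ... */
	}
	drm_modeset_drop_locks(&ctx);
	drm_modeset_acquire_fini(&ctx);
	return ret;
}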
Signed-off-by: Maarten Lankhorst Reviewed-by: Daniel Vetter #v1 Cc: Emil Velikov Link: https://patchwork.freedesktop.org/patch/msgid/20170912133749.6532-2-maarten.lankhorst@linux.intel.com Reviewed-by: Emil Velikov --- drivers/gpu/drm/drm_debugfs_crc.c | 2 +- drivers/gpu/drm/drm_modeset_lock.c | 96 +++++++++++++++++++------------------- include/drm/drm_modeset_lock.h | 12 +++-- 3 files changed, 57 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_debugfs_crc.c b/drivers/gpu/drm/drm_debugfs_crc.c index f9e26dda56d6..9dd879589a2c 100644 --- a/drivers/gpu/drm/drm_debugfs_crc.c +++ b/drivers/gpu/drm/drm_debugfs_crc.c @@ -155,7 +155,7 @@ static int crtc_crc_open(struct inode *inode, struct file *filep) int ret = 0; if (drm_drv_uses_atomic_modeset(crtc->dev)) { - ret = drm_modeset_lock_interruptible(&crtc->mutex, NULL); + ret = drm_modeset_lock_single_interruptible(&crtc->mutex); if (ret) return ret; diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index af4e906c630d..e123497da0ca 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -39,23 +39,28 @@ * * The basic usage pattern is to:: * - * drm_modeset_acquire_init(&ctx) + * drm_modeset_acquire_init(ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE) * retry: * foreach (lock in random_ordered_set_of_locks) { - * ret = drm_modeset_lock(lock, &ctx) + * ret = drm_modeset_lock(lock, ctx) * if (ret == -EDEADLK) { - * drm_modeset_backoff(&ctx); - * goto retry; + * ret = drm_modeset_backoff(ctx); + * if (!ret) + * goto retry; * } + * if (ret) + * goto out; * } * ... do stuff ... - * drm_modeset_drop_locks(&ctx); - * drm_modeset_acquire_fini(&ctx); + * out: + * drm_modeset_drop_locks(ctx); + * drm_modeset_acquire_fini(ctx); * * If all that is needed is a single modeset lock, then the &struct * drm_modeset_acquire_ctx is not needed and the locking can be simplified - * by passing a NULL instead of ctx in the drm_modeset_lock() - * call and, when done, by calling drm_modeset_unlock(). + * by passing a NULL instead of ctx in the drm_modeset_lock() call or + * calling drm_modeset_lock_single_interruptible(). To unlock afterwards + * call drm_modeset_unlock(). * * On top of these per-object locks using &ww_mutex there's also an overall * &drm_mode_config.mutex, for protecting everything else. Mostly this means @@ -178,7 +183,11 @@ EXPORT_SYMBOL(drm_warn_on_modeset_not_all_locked); /** * drm_modeset_acquire_init - initialize acquire context * @ctx: the acquire context - * @flags: for future + * @flags: 0 or %DRM_MODESET_ACQUIRE_INTERRUPTIBLE + * + * When passing %DRM_MODESET_ACQUIRE_INTERRUPTIBLE to @flags, + * all calls to drm_modeset_lock() will perform an interruptible + * wait. */ void drm_modeset_acquire_init(struct drm_modeset_acquire_ctx *ctx, uint32_t flags) @@ -186,6 +195,9 @@ void drm_modeset_acquire_init(struct drm_modeset_acquire_ctx *ctx, memset(ctx, 0, sizeof(*ctx)); ww_acquire_init(&ctx->ww_ctx, &crtc_ww_class); INIT_LIST_HEAD(&ctx->locked); + + if (flags & DRM_MODESET_ACQUIRE_INTERRUPTIBLE) + ctx->interruptible = true; } EXPORT_SYMBOL(drm_modeset_acquire_init); @@ -261,8 +273,19 @@ static inline int modeset_lock(struct drm_modeset_lock *lock, return ret; } -static int modeset_backoff(struct drm_modeset_acquire_ctx *ctx, - bool interruptible) +/** + * drm_modeset_backoff - deadlock avoidance backoff + * @ctx: the acquire context + * + * If deadlock is detected (ie. 
drm_modeset_lock() returns -EDEADLK), + * you must call this function to drop all currently held locks and + * block until the contended lock becomes available. + * + * This function returns 0 on success, or -ERESTARTSYS if this context + * is initialized with %DRM_MODESET_ACQUIRE_INTERRUPTIBLE and the + * wait has been interrupted. + */ +int drm_modeset_backoff(struct drm_modeset_acquire_ctx *ctx) { struct drm_modeset_lock *contended = ctx->contended; @@ -273,35 +296,10 @@ static int modeset_backoff(struct drm_modeset_acquire_ctx *ctx, drm_modeset_drop_locks(ctx); - return modeset_lock(contended, ctx, interruptible, true); -} - -/** - * drm_modeset_backoff - deadlock avoidance backoff - * @ctx: the acquire context - * - * If deadlock is detected (ie. drm_modeset_lock() returns -EDEADLK), - * you must call this function to drop all currently held locks and - * block until the contended lock becomes available. - */ -void drm_modeset_backoff(struct drm_modeset_acquire_ctx *ctx) -{ - modeset_backoff(ctx, false); + return modeset_lock(contended, ctx, ctx->interruptible, true); } EXPORT_SYMBOL(drm_modeset_backoff); -/** - * drm_modeset_backoff_interruptible - deadlock avoidance backoff - * @ctx: the acquire context - * - * Interruptible version of drm_modeset_backoff() - */ -int drm_modeset_backoff_interruptible(struct drm_modeset_acquire_ctx *ctx) -{ - return modeset_backoff(ctx, true); -} -EXPORT_SYMBOL(drm_modeset_backoff_interruptible); - /** * drm_modeset_lock_init - initialize lock * @lock: lock to init @@ -324,14 +322,18 @@ EXPORT_SYMBOL(drm_modeset_lock_init); * deadlock scenario has been detected and it is an error to attempt * to take any more locks without first calling drm_modeset_backoff(). * + * If the @ctx is not NULL and initialized with + * %DRM_MODESET_ACQUIRE_INTERRUPTIBLE, this function will fail with + * -ERESTARTSYS when interrupted. + * * If @ctx is NULL then the function call behaves like a normal, - * non-nesting mutex_lock() call. + * uninterruptible non-nesting mutex_lock() call. */ int drm_modeset_lock(struct drm_modeset_lock *lock, struct drm_modeset_acquire_ctx *ctx) { if (ctx) - return modeset_lock(lock, ctx, false, false); + return modeset_lock(lock, ctx, ctx->interruptible, false); ww_mutex_lock(&lock->mutex, NULL); return 0; @@ -339,21 +341,19 @@ int drm_modeset_lock(struct drm_modeset_lock *lock, EXPORT_SYMBOL(drm_modeset_lock); /** - * drm_modeset_lock_interruptible - take modeset lock + * drm_modeset_lock_single_interruptible - take a single modeset lock * @lock: lock to take - * @ctx: acquire ctx * - * Interruptible version of drm_modeset_lock() + * This function behaves as drm_modeset_lock() with a NULL context, + * but performs interruptible waits. + * + * This function returns 0 on success, or -ERESTARTSYS when interrupted. 
*/ -int drm_modeset_lock_interruptible(struct drm_modeset_lock *lock, - struct drm_modeset_acquire_ctx *ctx) +int drm_modeset_lock_single_interruptible(struct drm_modeset_lock *lock) { - if (ctx) - return modeset_lock(lock, ctx, true, false); - return ww_mutex_lock_interruptible(&lock->mutex, NULL); } -EXPORT_SYMBOL(drm_modeset_lock_interruptible); +EXPORT_SYMBOL(drm_modeset_lock_single_interruptible); /** * drm_modeset_unlock - drop modeset lock diff --git a/include/drm/drm_modeset_lock.h b/include/drm/drm_modeset_lock.h index 4b27c2bb955c..a685d1bb21f2 100644 --- a/include/drm/drm_modeset_lock.h +++ b/include/drm/drm_modeset_lock.h @@ -34,6 +34,7 @@ struct drm_modeset_lock; * @contended: used internally for -EDEADLK handling * @locked: list of held locks * @trylock_only: trylock mode used in atomic contexts/panic notifiers + * @interruptible: whether interruptible locking should be used. * * Each thread competing for a set of locks must use one acquire * ctx. And if any lock fxn returns -EDEADLK, it must backoff and @@ -59,6 +60,9 @@ struct drm_modeset_acquire_ctx { * Trylock mode, use only for panic handlers! */ bool trylock_only; + + /* Perform interruptible waits on this context. */ + bool interruptible; }; /** @@ -82,12 +86,13 @@ struct drm_modeset_lock { struct list_head head; }; +#define DRM_MODESET_ACQUIRE_INTERRUPTIBLE BIT(0) + void drm_modeset_acquire_init(struct drm_modeset_acquire_ctx *ctx, uint32_t flags); void drm_modeset_acquire_fini(struct drm_modeset_acquire_ctx *ctx); void drm_modeset_drop_locks(struct drm_modeset_acquire_ctx *ctx); -void drm_modeset_backoff(struct drm_modeset_acquire_ctx *ctx); -int drm_modeset_backoff_interruptible(struct drm_modeset_acquire_ctx *ctx); +int drm_modeset_backoff(struct drm_modeset_acquire_ctx *ctx); void drm_modeset_lock_init(struct drm_modeset_lock *lock); @@ -111,8 +116,7 @@ static inline bool drm_modeset_is_locked(struct drm_modeset_lock *lock) int drm_modeset_lock(struct drm_modeset_lock *lock, struct drm_modeset_acquire_ctx *ctx); -int drm_modeset_lock_interruptible(struct drm_modeset_lock *lock, - struct drm_modeset_acquire_ctx *ctx); +int __must_check drm_modeset_lock_single_interruptible(struct drm_modeset_lock *lock); void drm_modeset_unlock(struct drm_modeset_lock *lock); struct drm_device; -- cgit v1.2.3 From fa5f7b51fc3080c2b195fa87c7eca7c05e56f673 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Sep 2017 02:00:54 +0300 Subject: sctp: potential read out of bounds in sctp_ulpevent_type_enabled() This code causes a static checker warning because Smatch doesn't trust anything that comes from skb->data. I've reviewed this code and I do think skb->data can be controlled by the user here. The sctp_event_subscribe struct has 13 __u8 fields and we want to see if ours is non-zero. sn_type can be any value in the 0-USHRT_MAX range. We're subtracting SCTP_SN_TYPE_BASE which is 1 << 15 so we could read either before the start of the struct or after the end. This is a very old bug and it's surprising that it would go undetected for so long but my theory is that it just doesn't have a big impact so it would be hard to notice. Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- include/net/sctp/ulpevent.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 1060494ac230..b8c86ec1a8f5 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -153,8 +153,12 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event); static inline int sctp_ulpevent_type_enabled(__u16 sn_type, struct sctp_event_subscribe *mask) { + int offset = sn_type - SCTP_SN_TYPE_BASE; char *amask = (char *) mask; - return amask[sn_type - SCTP_SN_TYPE_BASE]; + + if (offset >= sizeof(struct sctp_event_subscribe)) + return 0; + return amask[offset]; } /* Given an event subscription, is this event enabled? */ -- cgit v1.2.3 From 0ee931c4e31a5efb134c76440405e9219f896e33 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 13 Sep 2017 16:28:29 -0700 Subject: mm: treewide: remove GFP_TEMPORARY allocation flag GFP_TEMPORARY was introduced by commit e12ba74d8ff3 ("Group short-lived and reclaimable kernel allocations") along with __GFP_RECLAIMABLE. Its primary motivation was to allow users to tell that an allocation is short lived and so the allocator can try to place such allocations close together and prevent long term fragmentation. As much as this sounds like a reasonable semantic, it becomes much less clear when to use the highlevel GFP_TEMPORARY allocation flag. How long is temporary? Can the context holding that memory sleep? Can it take locks? It seems there is no good answer for those questions. The current implementation of GFP_TEMPORARY is basically GFP_KERNEL | __GFP_RECLAIMABLE, which in itself is tricky because basically none of the existing callers provides a way to reclaim the allocated memory. So this is rather misleading and hard to evaluate for any benefits. I have checked some random users and none of them has added the flag with a specific justification. I suspect most of them just copied from other existing users and others just thought it might be a good idea to use it without any measuring. This suggests that GFP_TEMPORARY just motivates cargo cult usage without any reasoning. I believe that our gfp flags are quite complex already and especially those with a high-level semantic should be clearly defined to prevent confusion and abuse. Therefore I propose dropping GFP_TEMPORARY and replacing all existing users with plain GFP_KERNEL. Please note that SLAB users with shrinkers will still get the __GFP_RECLAIMABLE heuristic and so they will be placed properly for memory fragmentation prevention. I can see reasons we might want some gfp flag to reflect short-term allocations, but I propose starting from a clear semantic definition and only then adding users with proper justification. This was brought up before LSF this year by Matthew [1] and it turned out that GFP_TEMPORARY really doesn't have a clear semantic. It seems to be a heuristic without any measured advantage for most (if not all) of its current users. The follow up discussion has revealed that opinions on what counts as a temporary allocation differ a lot between developers. So rather than trying to tweak existing users into a semantic which they didn't expect, I propose to simply remove the flag and start from scratch if we really need a semantic for short-term allocations.
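For reference, the conversion is purely mechanical: the dropped flag was defined (per the include/linux/gfp.h hunk below) as

#define GFP_TEMPORARY	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | \
			 __GFP_RECLAIMABLE)

which differs from GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) only by __GFP_RECLAIMABLE, so a typical caller simply becomes, for example:

	buf = kmalloc(PATH_MAX, GFP_KERNEL);	/* was GFP_TEMPORARY */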
[1] http://lkml.kernel.org/r/20170118054945.GD18349@bombadil.infradead.org [akpm@linux-foundation.org: fix typo] [akpm@linux-foundation.org: coding-style fixes] [sfr@canb.auug.org.au: drm/i915: fix up] Link: http://lkml.kernel.org/r/20170816144703.378d4f4d@canb.auug.org.au Link: http://lkml.kernel.org/r/20170728091904.14627-1-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Stephen Rothwell Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Neil Brown Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/kernel/setup.c | 2 +- arch/arc/kernel/troubleshoot.c | 2 +- arch/powerpc/kernel/rtas.c | 4 ++-- arch/powerpc/platforms/pseries/suspend.c | 2 +- drivers/gpu/drm/drm_blend.c | 2 +- drivers/gpu/drm/drm_dp_dual_mode_helper.c | 2 +- drivers/gpu/drm/drm_scdc_helper.c | 2 +- drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 2 +- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++------ drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 4 ++-- drivers/gpu/drm/i915/i915_gpu_error.c | 6 +++--- drivers/gpu/drm/i915/selftests/i915_random.c | 2 +- drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c | 10 +++++----- drivers/gpu/drm/i915/selftests/intel_uncore.c | 2 +- drivers/gpu/drm/lib/drm_random.c | 2 +- drivers/gpu/drm/msm/msm_gem_submit.c | 2 +- drivers/gpu/drm/selftests/test-drm_mm.c | 4 ++-- drivers/misc/cxl/pci.c | 2 +- drivers/xen/gntalloc.c | 2 +- fs/coredump.c | 2 +- fs/exec.c | 4 ++-- fs/overlayfs/copy_up.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/overlayfs/namei.c | 12 ++++++------ fs/proc/base.c | 8 ++++---- fs/proc/task_mmu.c | 2 +- include/linux/gfp.h | 2 -- include/trace/events/mmflags.h | 1 - kernel/locking/test-ww_mutex.c | 2 +- kernel/trace/trace_events_filter.c | 2 +- lib/string_helpers.c | 4 ++-- mm/shmem.c | 2 +- mm/slub.c | 2 +- tools/perf/builtin-kmem.c | 1 - 36 files changed, 57 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index c4ffb441716c..877cec8f5ea2 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -510,7 +510,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) goto done; } - str = (char *)__get_free_page(GFP_TEMPORARY); + str = (char *)__get_free_page(GFP_KERNEL); if (!str) goto done; diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 7e94476f3994..7d8c1d6c2f60 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -178,7 +178,7 @@ void show_regs(struct pt_regs *regs) struct callee_regs *cregs; char *buf; - buf = (char *)__get_free_page(GFP_TEMPORARY); + buf = (char *)__get_free_page(GFP_KERNEL); if (!buf) return; diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index b8a4987f58cf..1643e9e53655 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -914,7 +914,7 @@ int rtas_online_cpus_mask(cpumask_var_t cpus) if (ret) { cpumask_var_t tmp_mask; - if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY)) + if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) return ret; /* Use tmp_mask to preserve cpus mask from first failure */ @@ -962,7 +962,7 @@ int rtas_ibm_suspend_me(u64 handle) return -EIO; } - if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL)) return -ENOMEM; atomic_set(&data.working, 0); diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index 
e76aefae2aa2..89726f07d249 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -151,7 +151,7 @@ static ssize_t store_hibernate(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL)) return -ENOMEM; stream_id = simple_strtoul(buf, NULL, 16); diff --git a/drivers/gpu/drm/drm_blend.c b/drivers/gpu/drm/drm_blend.c index db6aeec50b82..2e5e089dd912 100644 --- a/drivers/gpu/drm/drm_blend.c +++ b/drivers/gpu/drm/drm_blend.c @@ -319,7 +319,7 @@ static int drm_atomic_helper_crtc_normalize_zpos(struct drm_crtc *crtc, DRM_DEBUG_ATOMIC("[CRTC:%d:%s] calculating normalized zpos values\n", crtc->base.id, crtc->name); - states = kmalloc_array(total_planes, sizeof(*states), GFP_TEMPORARY); + states = kmalloc_array(total_planes, sizeof(*states), GFP_KERNEL); if (!states) return -ENOMEM; diff --git a/drivers/gpu/drm/drm_dp_dual_mode_helper.c b/drivers/gpu/drm/drm_dp_dual_mode_helper.c index 80e62f669321..0ef9011a1856 100644 --- a/drivers/gpu/drm/drm_dp_dual_mode_helper.c +++ b/drivers/gpu/drm/drm_dp_dual_mode_helper.c @@ -111,7 +111,7 @@ ssize_t drm_dp_dual_mode_write(struct i2c_adapter *adapter, void *data; int ret; - data = kmalloc(msg.len, GFP_TEMPORARY); + data = kmalloc(msg.len, GFP_KERNEL); if (!data) return -ENOMEM; diff --git a/drivers/gpu/drm/drm_scdc_helper.c b/drivers/gpu/drm/drm_scdc_helper.c index 7d1b0f011d33..935653eb3616 100644 --- a/drivers/gpu/drm/drm_scdc_helper.c +++ b/drivers/gpu/drm/drm_scdc_helper.c @@ -102,7 +102,7 @@ ssize_t drm_scdc_write(struct i2c_adapter *adapter, u8 offset, void *data; int err; - data = kmalloc(1 + size, GFP_TEMPORARY); + data = kmalloc(1 + size, GFP_KERNEL); if (!data) return -ENOMEM; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index a7ff2e4c00d2..026ef4e02f85 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -37,7 +37,7 @@ static struct etnaviv_gem_submit *submit_create(struct drm_device *dev, struct etnaviv_gem_submit *submit; size_t sz = size_vstruct(nr, sizeof(submit->bos[0]), sizeof(*submit)); - submit = kmalloc(sz, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY); + submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (submit) { submit->dev = dev; submit->gpu = gpu; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 57317715977f..19404c96eeb1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2540,7 +2540,7 @@ static void *i915_gem_object_map(const struct drm_i915_gem_object *obj, if (n_pages > ARRAY_SIZE(stack_pages)) { /* Too big for stack -- allocate temporary array instead */ - pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_TEMPORARY); + pages = kvmalloc_array(n_pages, sizeof(*pages), GFP_KERNEL); if (!pages) return NULL; } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 50d5e24f91a9..92437f455b43 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -293,7 +293,7 @@ static int eb_create(struct i915_execbuffer *eb) * as possible to perform the allocation and warn * if it fails. 
*/ - flags = GFP_TEMPORARY; + flags = GFP_KERNEL; if (size > 1) flags |= __GFP_NORETRY | __GFP_NOWARN; @@ -1515,7 +1515,7 @@ static int eb_copy_relocations(const struct i915_execbuffer *eb) urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr); size = nreloc * sizeof(*relocs); - relocs = kvmalloc_array(size, 1, GFP_TEMPORARY); + relocs = kvmalloc_array(size, 1, GFP_KERNEL); if (!relocs) { kvfree(relocs); err = -ENOMEM; @@ -2077,7 +2077,7 @@ get_fence_array(struct drm_i915_gem_execbuffer2 *args, return ERR_PTR(-EFAULT); fences = kvmalloc_array(args->num_cliprects, sizeof(*fences), - __GFP_NOWARN | GFP_TEMPORARY); + __GFP_NOWARN | GFP_KERNEL); if (!fences) return ERR_PTR(-ENOMEM); @@ -2463,9 +2463,9 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, /* Copy in the exec list from userland */ exec_list = kvmalloc_array(args->buffer_count, sizeof(*exec_list), - __GFP_NOWARN | GFP_TEMPORARY); + __GFP_NOWARN | GFP_KERNEL); exec2_list = kvmalloc_array(args->buffer_count + 1, sz, - __GFP_NOWARN | GFP_TEMPORARY); + __GFP_NOWARN | GFP_KERNEL); if (exec_list == NULL || exec2_list == NULL) { DRM_DEBUG("Failed to allocate exec list for %d buffers\n", args->buffer_count); @@ -2543,7 +2543,7 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data, /* Allocate an extra slot for use by the command parser */ exec2_list = kvmalloc_array(args->buffer_count + 1, sz, - __GFP_NOWARN | GFP_TEMPORARY); + __GFP_NOWARN | GFP_KERNEL); if (exec2_list == NULL) { DRM_DEBUG("Failed to allocate exec list for %d buffers\n", args->buffer_count); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 0d5a988b3867..e2410eb5d96e 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -3231,7 +3231,7 @@ intel_rotate_pages(struct intel_rotation_info *rot_info, /* Allocate a temporary list of source pages for random access. */ page_addr_list = kvmalloc_array(n_pages, sizeof(dma_addr_t), - GFP_TEMPORARY); + GFP_KERNEL); if (!page_addr_list) return ERR_PTR(ret); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 23fd18bd1b56..709efe2357ea 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -507,7 +507,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) ret = -ENOMEM; pinned = 0; - pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_TEMPORARY); + pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (pvec != NULL) { struct mm_struct *mm = obj->userptr.mm->mm; unsigned int flags = 0; @@ -643,7 +643,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) if (mm == current->mm) { pvec = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_TEMPORARY | + GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (pvec) /* defer to worker if malloc fails */ diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index ed5a1eb839ad..0c779671fe2d 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -787,16 +787,16 @@ int i915_error_state_buf_init(struct drm_i915_error_state_buf *ebuf, */ ebuf->size = count + 1 > PAGE_SIZE ? 
count + 1 : PAGE_SIZE; ebuf->buf = kmalloc(ebuf->size, - GFP_TEMPORARY | __GFP_NORETRY | __GFP_NOWARN); + GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (ebuf->buf == NULL) { ebuf->size = PAGE_SIZE; - ebuf->buf = kmalloc(ebuf->size, GFP_TEMPORARY); + ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL); } if (ebuf->buf == NULL) { ebuf->size = 128; - ebuf->buf = kmalloc(ebuf->size, GFP_TEMPORARY); + ebuf->buf = kmalloc(ebuf->size, GFP_KERNEL); } if (ebuf->buf == NULL) diff --git a/drivers/gpu/drm/i915/selftests/i915_random.c b/drivers/gpu/drm/i915/selftests/i915_random.c index d044bf9a6feb..222c511bea49 100644 --- a/drivers/gpu/drm/i915/selftests/i915_random.c +++ b/drivers/gpu/drm/i915/selftests/i915_random.c @@ -62,7 +62,7 @@ unsigned int *i915_random_order(unsigned int count, struct rnd_state *state) { unsigned int *order, i; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c index 7276194c04f7..828904b7d468 100644 --- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c @@ -117,12 +117,12 @@ static int igt_random_insert_remove(void *arg) mock_engine_reset(engine); - waiters = kvmalloc_array(count, sizeof(*waiters), GFP_TEMPORARY); + waiters = kvmalloc_array(count, sizeof(*waiters), GFP_KERNEL); if (!waiters) goto out_engines; bitmap = kcalloc(DIV_ROUND_UP(count, BITS_PER_LONG), sizeof(*bitmap), - GFP_TEMPORARY); + GFP_KERNEL); if (!bitmap) goto out_waiters; @@ -187,12 +187,12 @@ static int igt_insert_complete(void *arg) mock_engine_reset(engine); - waiters = kvmalloc_array(count, sizeof(*waiters), GFP_TEMPORARY); + waiters = kvmalloc_array(count, sizeof(*waiters), GFP_KERNEL); if (!waiters) goto out_engines; bitmap = kcalloc(DIV_ROUND_UP(count, BITS_PER_LONG), sizeof(*bitmap), - GFP_TEMPORARY); + GFP_KERNEL); if (!bitmap) goto out_waiters; @@ -368,7 +368,7 @@ static int igt_wakeup(void *arg) mock_engine_reset(engine); - waiters = kvmalloc_array(count, sizeof(*waiters), GFP_TEMPORARY); + waiters = kvmalloc_array(count, sizeof(*waiters), GFP_KERNEL); if (!waiters) goto out_engines; diff --git a/drivers/gpu/drm/i915/selftests/intel_uncore.c b/drivers/gpu/drm/i915/selftests/intel_uncore.c index 2d0fef2cfca6..3cac22eb47ce 100644 --- a/drivers/gpu/drm/i915/selftests/intel_uncore.c +++ b/drivers/gpu/drm/i915/selftests/intel_uncore.c @@ -127,7 +127,7 @@ static int intel_uncore_check_forcewake_domains(struct drm_i915_private *dev_pri return 0; valid = kzalloc(BITS_TO_LONGS(FW_RANGE) * sizeof(*valid), - GFP_TEMPORARY); + GFP_KERNEL); if (!valid) return -ENOMEM; diff --git a/drivers/gpu/drm/lib/drm_random.c b/drivers/gpu/drm/lib/drm_random.c index 7b12a68c3b54..a78c4b483e8d 100644 --- a/drivers/gpu/drm/lib/drm_random.c +++ b/drivers/gpu/drm/lib/drm_random.c @@ -28,7 +28,7 @@ unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) { unsigned int *order, i; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 8a75c0bd8a78..5d0a75d4b249 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -40,7 +40,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, if (sz > SIZE_MAX) 
return NULL; - submit = kmalloc(sz, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY); + submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!submit) return NULL; diff --git a/drivers/gpu/drm/selftests/test-drm_mm.c b/drivers/gpu/drm/selftests/test-drm_mm.c index dfdd858eda0a..86eb4c185a28 100644 --- a/drivers/gpu/drm/selftests/test-drm_mm.c +++ b/drivers/gpu/drm/selftests/test-drm_mm.c @@ -1627,7 +1627,7 @@ static int igt_topdown(void *ignored) goto err; bitmap = kzalloc(count / BITS_PER_LONG * sizeof(unsigned long), - GFP_TEMPORARY); + GFP_KERNEL); if (!bitmap) goto err_nodes; @@ -1741,7 +1741,7 @@ static int igt_bottomup(void *ignored) goto err; bitmap = kzalloc(count / BITS_PER_LONG * sizeof(unsigned long), - GFP_TEMPORARY); + GFP_KERNEL); if (!bitmap) goto err_nodes; diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index d18b3d9292fd..3ba04f371380 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1279,7 +1279,7 @@ ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf, } /* use bounce buffer for copy */ - tbuf = (void *)__get_free_page(GFP_TEMPORARY); + tbuf = (void *)__get_free_page(GFP_KERNEL); if (!tbuf) return -ENOMEM; diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 1bf55a32a4b3..3fa40c723e8e 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -294,7 +294,7 @@ static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, goto out; } - gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY); + gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_KERNEL); if (!gref_ids) { rc = -ENOMEM; goto out; diff --git a/fs/coredump.c b/fs/coredump.c index 592683711c64..0eec03696707 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -161,7 +161,7 @@ static int cn_print_exe_file(struct core_name *cn) if (!exe_file) return cn_esc_printf(cn, "%s (path unknown)", current->comm); - pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) { ret = -ENOMEM; goto put_exe_file; diff --git a/fs/exec.c b/fs/exec.c index 01a9fb9d8ac3..daa19d85c066 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1763,9 +1763,9 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->filename = filename->name; } else { if (filename->name[0] == '\0') - pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else - pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!pathbuf) { retval = -ENOMEM; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index acb6f97deb97..aad97b30d5e6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -241,7 +241,7 @@ struct ovl_fh *ovl_encode_fh(struct dentry *lower, bool is_upper) int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &lower->d_sb->s_uuid; - buf = kmalloc(buflen, GFP_TEMPORARY); + buf = kmalloc(buflen, GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 9cb0c80e5967..3309b1912241 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -833,7 +833,7 @@ static char *ovl_get_redirect(struct dentry *dentry, bool samedir) goto out; } - buf = ret = kmalloc(buflen, GFP_TEMPORARY); + buf = ret = kmalloc(buflen, GFP_KERNEL); if (!buf) goto out; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 8aef2b304b2d..c3addd1114f1 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -38,7 +38,7 @@ static int 
ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, return 0; goto fail; } - buf = kzalloc(prelen + res + strlen(post) + 1, GFP_TEMPORARY); + buf = kzalloc(prelen + res + strlen(post) + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -103,7 +103,7 @@ static struct ovl_fh *ovl_get_origin_fh(struct dentry *dentry) if (res == 0) return NULL; - fh = kzalloc(res, GFP_TEMPORARY); + fh = kzalloc(res, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); @@ -309,7 +309,7 @@ static int ovl_check_origin(struct dentry *upperdentry, BUG_ON(*ctrp); if (!*stackp) - *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY); + *stackp = kmalloc(sizeof(struct path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; @@ -418,7 +418,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_TEMPORARY); + fh = kzalloc(len, GFP_KERNEL); if (!fh) goto fail; @@ -478,7 +478,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name) return PTR_ERR(fh); err = -ENOMEM; - n = kzalloc(fh->len * 2, GFP_TEMPORARY); + n = kzalloc(fh->len * 2, GFP_KERNEL); if (n) { s = bin2hex(n, fh, fh->len); *name = (struct qstr) QSTR_INIT(n, s - n); @@ -646,7 +646,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; stack = kcalloc(ofs->numlower, sizeof(struct path), - GFP_TEMPORARY); + GFP_KERNEL); if (!stack) goto out_put_upper; } diff --git a/fs/proc/base.c b/fs/proc/base.c index e5d89a0d0b8a..ad3b0762cc3e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -232,7 +232,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) { rv = -ENOMEM; goto out_mmput; @@ -813,7 +813,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mm) return 0; - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -918,7 +918,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mm || !mm->env_end) return 0; - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -1630,7 +1630,7 @@ out: static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *tmp = (char *)__get_free_page(GFP_KERNEL); char *pathname; int len; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7b40e11ede9b..5589b4bd4b85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1474,7 +1474,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index bcfb9f7c46f5..f780718b7391 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -288,8 +288,6 @@ struct vm_area_struct; #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) -#define GFP_TEMPORARY (__GFP_RECLAIM | __GFP_IO | __GFP_FS | \ - __GFP_RECLAIMABLE) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_DMA 
__GFP_DMA #define GFP_DMA32 __GFP_DMA32 diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 4c2e4737d7bc..fec6291a6703 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -18,7 +18,6 @@ {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\ {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ {(unsigned long)GFP_USER, "GFP_USER"}, \ - {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \ {(unsigned long)GFP_KERNEL_ACCOUNT, "GFP_KERNEL_ACCOUNT"}, \ {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 39f56c870051..0e4cd64ad2c0 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -362,7 +362,7 @@ static int *get_random_order(int count) int *order; int n, r, tmp; - order = kmalloc_array(count, sizeof(*order), GFP_TEMPORARY); + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) return order; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 181e139a8057..61e7f0678d33 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -702,7 +702,7 @@ static void append_filter_err(struct filter_parse_state *ps, int pos = ps->lasterr_pos; char *buf, *pbuf; - buf = (char *)__get_free_page(GFP_TEMPORARY); + buf = (char *)__get_free_page(GFP_KERNEL); if (!buf) return; diff --git a/lib/string_helpers.c b/lib/string_helpers.c index ecaac2c0526f..29c490e5d478 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -576,7 +576,7 @@ char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp) char *buffer, *quoted; int i, res; - buffer = kmalloc(PAGE_SIZE, GFP_TEMPORARY); + buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buffer) return NULL; @@ -612,7 +612,7 @@ char *kstrdup_quotable_file(struct file *file, gfp_t gfp) return kstrdup("", gfp); /* We add 11 spaces for ' (deleted)' to be appended */ - temp = kmalloc(PATH_MAX + 11, GFP_TEMPORARY); + temp = kmalloc(PATH_MAX + 11, GFP_KERNEL); if (!temp) return kstrdup("", gfp); diff --git a/mm/shmem.c b/mm/shmem.c index ace53a582be5..07a1d22807be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3685,7 +3685,7 @@ SYSCALL_DEFINE2(memfd_create, if (len > MFD_NAME_MAX_LEN + 1) return -EINVAL; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); if (!name) return -ENOMEM; diff --git a/mm/slub.c b/mm/slub.c index d39a5d3834b3..163352c537ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4597,7 +4597,7 @@ static int list_locations(struct kmem_cache *s, char *buf, struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) { + GFP_KERNEL)) { kfree(map); return sprintf(buf, "Out of memory\n"); } diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index a1497c516d85..24ee68ecdd42 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -627,7 +627,6 @@ static const struct { { "GFP_HIGHUSER_MOVABLE", "HUM" }, { "GFP_HIGHUSER", "HU" }, { "GFP_USER", "U" }, - { "GFP_TEMPORARY", "TMP" }, { "GFP_KERNEL_ACCOUNT", "KAC" }, { "GFP_KERNEL", "K" }, { "GFP_NOFS", "NF" }, -- cgit v1.2.3 From 488e32f1981e506976d666b3772d6ccc8b726fee Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 14 Sep 2017 11:50:07 +0200 Subject: KVM: trace events: update list of exit reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Adding entries for exit reasons 23 - 27: KVM_EXIT_EPR KVM_EXIT_SYSTEM_EVENT KVM_EXIT_S390_STSI KVM_EXIT_IOAPIC_EOI KVM_EXIT_HYPERV Signed-off-by: Ladi Prosek Reviewed-by: Cornelia Huck Signed-off-by: Radim Krčmář --- include/trace/events/kvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 8ade3eb6c640..dcffedfac431 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -14,7 +14,9 @@ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ - ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH) + ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ + ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ + ERSN(HYPERV) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), -- cgit v1.2.3 From 2554db916586b228ce93e6f74a12fd7fe430a004 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 25 Aug 2017 09:13:54 -0700 Subject: sched/wait: Break up long wake list walk We encountered workloads that have very long wake up lists on large systems. A waker takes a long time to traverse the entire wake list and execute all the wake functions. We saw page wait lists that are up to 3700+ entries long in tests of large 4 and 8 socket systems. It took 0.8 sec to traverse such a list during wake up. Any other CPU that contends for the list spin lock will spin for a long time. It is a result of the numa balancing migration of hot pages that are shared by many threads. Multiple waking CPUs are queued up behind the lock, and the last one queued has to wait until all CPUs did all the wakeups. The page wait list is traversed with interrupts disabled, which caused various problems. This was the original cause that triggered the NMI watchdog timer in: https://patchwork.kernel.org/patch/9800303/ . Only extending the NMI watchdog timer there helped. This patch bookmarks the waker's scan position in the wake list and breaks up the wake up walk, to allow access to the list before the waker resumes its walk down the rest of the wait list. It lowers the interrupt and rescheduling latency. This patch also provides a performance boost when combined with the next patch, which breaks up the page wakeup list walk. We saw a 22% improvement in the will-it-scale file pread2 test on a Xeon Phi system running 256 threads. [ v2: Merged in Linus' changes to remove the bookmark_wake_function, and simplify access to flags.
] Reported-by: Kan Liang Tested-by: Kan Liang Signed-off-by: Tim Chen Signed-off-by: Linus Torvalds --- include/linux/wait.h | 1 + kernel/sched/wait.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 64 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index dc19880c02f5..78401ef02d29 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -18,6 +18,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 +#define WQ_FLAG_BOOKMARK 0x04 /* * A single wait-queue entry structure: diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index d6afed6d0752..70701ef50465 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry } EXPORT_SYMBOL(remove_wait_queue); +/* + * Scan threshold to break wait queue walk. + * This allows a waker to take a break from holding the + * wait queue lock during the wait queue walk. + */ +#define WAITQUEUE_WALK_BREAK_CNT 64 /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just @@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) +static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key, + wait_queue_entry_t *bookmark) { wait_queue_entry_t *curr, *next; + int cnt = 0; + + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { + curr = list_next_entry(bookmark, entry); - list_for_each_entry_safe(curr, next, &wq_head->head, entry) { + list_del(&bookmark->entry); + bookmark->flags = 0; + } else + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + + if (&curr->entry == &wq_head->head) + return nr_exclusive; + + list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - int ret = curr->func(curr, mode, wake_flags, key); + int ret; + + if (flags & WQ_FLAG_BOOKMARK) + continue; + + ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; + + if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && + (&next->entry != &wq_head->head)) { + bookmark->flags = WQ_FLAG_BOOKMARK; + list_add_tail(&bookmark->entry, &next->entry); + break; + } + } + return nr_exclusive; +} + +static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + unsigned long flags; + wait_queue_entry_t bookmark; + + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + spin_lock_irqsave(&wq_head->lock, flags); + nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, + wake_flags, key, &bookmark); + spin_unlock_irqrestore(&wq_head->lock, flags); } } @@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned 
int mode, void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; - - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key); } EXPORT_SYMBOL(__wake_up); @@ -104,13 +155,13 @@ EXPORT_SYMBOL(__wake_up); */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -134,7 +185,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { - unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ if (unlikely(!wq_head)) @@ -143,9 +193,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&wq_head->lock, flags); - __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&wq_head->lock, flags); + __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); -- cgit v1.2.3

From 11a19c7b099f96d00a8dec52bfbb8475e89b6745 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Fri, 25 Aug 2017 09:13:55 -0700 Subject: sched/wait: Introduce wakeup bookmark in wake_up_page_bit

Now that we have added breaks in the wait queue scan and allow a bookmark on the scan position, we put this logic in the wake_up_page_bit function. We can have very long page wait lists in large systems where multiple pages share the same wait list. We break the wake-up walk here to allow other CPUs a chance to access the list, and to avoid disabling interrupts while traversing the list for too long. This reduces the interrupt and rescheduling latency, and excessive page wait queue lock hold time.
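[ Editor's illustration, not part of the patch: the bookmark technique above reduced to a standalone C program. The list primitives mimic the kernel's list_head, wake_chunk() mirrors the bookmark handling in __wake_up_common(), and names such as WALK_BREAK_CNT and wake_chunk() are invented for this sketch; the chunk size is shrunk from the patch's 64 so the break path is exercised. ]

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal circular doubly linked list in the style of the kernel's list_head. */
struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_before(struct list_head *n, struct list_head *pos)
{
	n->prev = pos->prev;
	n->next = pos;
	pos->prev->next = n;
	pos->prev = n;
}

static void list_del_entry(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
}

struct waiter {
	struct list_head entry;	/* kept first so a list_head * casts to a waiter * */
	bool is_bookmark;	/* set while parked, like WQ_FLAG_BOOKMARK */
	int id;
};

#define WALK_BREAK_CNT 4	/* the patch uses 64; small here to show the break */

static struct list_head queue;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * One bounded pass over the queue: resume after a parked bookmark, "wake"
 * (here: print and free) up to WALK_BREAK_CNT waiters, then park the
 * bookmark again if entries remain.  Returns true while work is left, so
 * the caller drops the lock between passes and other threads can get in.
 */
static bool wake_chunk(struct waiter *bm)
{
	struct list_head *pos, *next;
	bool more = false;
	int cnt = 0;

	pthread_mutex_lock(&lock);

	if (bm->is_bookmark) {
		pos = bm->entry.next;		/* continue where we stopped */
		list_del_entry(&bm->entry);
		bm->is_bookmark = false;
	} else {
		pos = queue.next;
	}

	for (; pos != &queue; pos = next) {
		next = pos->next;
		struct waiter *w = (struct waiter *)pos;

		if (w->is_bookmark)		/* skip other walkers' bookmarks */
			continue;

		printf("waking waiter %d\n", w->id);	/* stands in for curr->func() */
		list_del_entry(pos);
		free(w);

		if (++cnt >= WALK_BREAK_CNT && next != &queue) {
			list_add_before(&bm->entry, next);	/* park before 'next' */
			bm->is_bookmark = true;
			more = true;
			break;
		}
	}

	pthread_mutex_unlock(&lock);
	return more;
}

int main(void)
{
	struct waiter bm = { .is_bookmark = false, .id = -1 };

	list_init(&queue);
	list_init(&bm.entry);

	for (int i = 0; i < 10; i++) {
		struct waiter *w = malloc(sizeof(*w));
		w->is_bookmark = false;
		w->id = i;
		list_add_before(&w->entry, &queue);	/* append at the tail */
	}

	while (wake_chunk(&bm))
		;	/* other threads could take 'lock' between passes */
	return 0;
}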
[ v2: Remove bookmark_wake_function ] Signed-off-by: Tim Chen Signed-off-by: Linus Torvalds --- include/linux/wait.h | 2 ++ kernel/sched/wait.c | 7 +++++++ mm/filemap.c | 22 +++++++++++++++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 78401ef02d29..87c4641023fb 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -185,6 +185,8 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); +void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, + unsigned int mode, void *key, wait_queue_entry_t *bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 70701ef50465..98feab7933c7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -165,6 +165,13 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo } EXPORT_SYMBOL_GPL(__wake_up_locked_key); +void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, + unsigned int mode, void *key, wait_queue_entry_t *bookmark) +{ + __wake_up_common(wq_head, mode, 1, 0, key, bookmark); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); + /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue diff --git a/mm/filemap.c b/mm/filemap.c index 9d21afd692b9..8c88e186a773 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -909,13 +909,33 @@ static void wake_up_page_bit(struct page *page, int bit_nr) wait_queue_head_t *q = page_waitqueue(page); struct wait_page_key key; unsigned long flags; + wait_queue_entry_t bookmark; key.page = page; key.bit_nr = bit_nr; key.page_match = 0; + bookmark.flags = 0; + bookmark.private = NULL; + bookmark.func = NULL; + INIT_LIST_HEAD(&bookmark.entry); + spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key(q, TASK_NORMAL, &key); + __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); + + while (bookmark.flags & WQ_FLAG_BOOKMARK) { + /* + * Take a breather from holding the lock, + * allow pages that finish wake up asynchronously + * to acquire the lock and remove themselves + * from wait queue + */ + spin_unlock_irqrestore(&q->lock, flags); + cpu_relax(); + spin_lock_irqsave(&q->lock, flags); + __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); + } + /* * It is possible for other pages to have collided on the waitqueue * hash, so in that case check for a page match. That prevents a long- -- cgit v1.2.3 From 711aab1dbb324d321e3d84368a435a78908c7bce Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 12 Sep 2017 22:45:33 -0400 Subject: vfs: constify path argument to kernel_read_file_from_path This patch constifies the path argument to kernel_read_file_from_path(). 
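[ Editor's illustration: a hedged sketch of a caller after this change, showing that the path can now come from const data without a cast. my_load_blob() and the example path are invented; the 131072 size cap and the READING_FIRMWARE id follow the sound_firmware.h hunk below, and the vfree() assumes the buffer is vmalloc()-backed, which is how kernel_read_file() allocates it in this era. ]

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

static int my_load_blob(void)
{
	void *buf = NULL;
	loff_t size = 0;
	int err;

	/* A string literal no longer needs a (char *) cast here. */
	err = kernel_read_file_from_path("/lib/firmware/example.bin", &buf,
					 &size, 131072, READING_FIRMWARE);
	if (err < 0)
		return err;

	pr_info("loaded %lld bytes\n", (long long)size);
	vfree(buf);	/* kernel_read_file() vmalloc()s the buffer */
	return 0;
}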
Signed-off-by: Mimi Zohar Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/fs.h | 2 +- sound/oss/sound_firmware.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index 69a543259aa5..ac34d9724684 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -955,7 +955,7 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file); -int kernel_read_file_from_path(char *path, void **buf, loff_t *size, +int kernel_read_file_from_path(const char *path, void **buf, loff_t *size, loff_t max_size, enum kernel_read_file_id id) { struct file *file; diff --git a/include/linux/fs.h b/include/linux/fs.h index bc475dfeb4ce..339e73742e73 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2815,7 +2815,7 @@ static inline const char *kernel_read_file_id_str(enum kernel_read_file_id id) extern int kernel_read_file(struct file *, void **, loff_t *, loff_t, enum kernel_read_file_id); -extern int kernel_read_file_from_path(char *, void **, loff_t *, loff_t, +extern int kernel_read_file_from_path(const char *, void **, loff_t *, loff_t, enum kernel_read_file_id); extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t, enum kernel_read_file_id); diff --git a/sound/oss/sound_firmware.h b/sound/oss/sound_firmware.h index da4c67e005ed..2be465277ba0 100644 --- a/sound/oss/sound_firmware.h +++ b/sound/oss/sound_firmware.h @@ -21,7 +21,7 @@ static inline int mod_firmware_load(const char *fn, char **fp) loff_t size; int err; - err = kernel_read_file_from_path((char *)fn, (void **)fp, &size, + err = kernel_read_file_from_path(fn, (void **)fp, &size, 131072, READING_FIRMWARE); if (err < 0) return 0; -- cgit v1.2.3 From 8cd641e3c7cbf86c7cbd2a17a160dd137d86c860 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:18 -0700 Subject: sched/wait: Add swq_has_sleeper() Which is the equivalent of what we have in regular waitqueues. I'm not crazy about the name, but this also helps us get both apis closer -- which iirc comes originally from the -net folks. We also duplicate the comments for the lockless swait_active(), from wait.h. Future users will make use of this interface. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- include/linux/swait.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swait.h b/include/linux/swait.h index 4a4e180d0a35..73e97a08d3d0 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name DECLARE_SWAIT_QUEUE_HEAD(name) #endif -static inline int swait_active(struct swait_queue_head *q) +/** + * swait_active -- locklessly test for waiters on the queue + * @wq: the waitqueue to test for waiters + * + * returns true if the wait list is not empty + * + * NOTE: this function is lockless and requires care, incorrect usage _will_ + * lead to sporadic and non-obvious failure. + * + * NOTE2: this function has the same above implications as regular waitqueues. 
+ * + * Use either while holding swait_queue_head::lock or when used for wakeups + * with an extra smp_mb() like: + * + * CPU0 - waker CPU1 - waiter + * + * for (;;) { + * @cond = true; prepare_to_swait(&wq_head, &wait, state); + * smp_mb(); // smp_mb() from set_current_state() + * if (swait_active(wq_head)) if (@cond) + * wake_up(wq_head); break; + * schedule(); + * } + * finish_swait(&wq_head, &wait); + * + * Because without the explicit smp_mb() it's possible for the + * swait_active() load to get hoisted over the @cond store such that we'll + * observe an empty wait list while the waiter might not observe @cond. + * This, in turn, can trigger missing wakeups. + * + * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), + * which (when the lock is uncontended) are of roughly equal cost. + */ +static inline int swait_active(struct swait_queue_head *wq) +{ + return !list_empty(&wq->task_list); +} + +/** + * swq_has_sleeper - check if there are any waiting processes + * @wq: the waitqueue to test for waiters + * + * Returns true if @wq has waiting processes + * + * Please refer to the comment for swait_active. + */ +static inline bool swq_has_sleeper(struct swait_queue_head *wq) { - return !list_empty(&q->task_list); + /* + * We need to be sure we are in sync with the list_add() + * modifications to the wait queue (task_list). + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return swait_active(wq); } extern void swake_up(struct swait_queue_head *q); -- cgit v1.2.3

From d25adbeb0cdb860fb39e09cdd025e9cfc954c5ab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 11:02:21 +0800 Subject: sctp: fix a use-after-free issue in sctp_sock_dump

Commit 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the dump") tried to fix a use-after-free issue by checking !sctp_sk(sk)->ep while holding the sock and the sock lock. But Paolo noticed that the endpoint could be destroyed in sctp_rcv without sock lock protection. It means the use-after-free issue could still be triggered when sctp_rcv puts and destroys the ep after sctp_sock_dump checks !ep, although it's pretty hard to reproduce. I could reproduce it with an mdelay in sctp_rcv combined with a long msleep in sctp_close and sctp_sock_dump.

This patch adds another param, cb_done, to sctp_for_each_transport and dumps ep->assocs while holding tsp after jumping out of the transport traversal, to avoid this issue. It also improves the sctp diag dump and makes it run faster, as there is no need to save sk into cb->args[5] and keep calling sctp_for_each_transport any more.

This patch also uses int * instead of int for the pos argument in sctp_for_each_transport, so the position increments only in sctp_for_each_transport and there is no need to keep changing cb->args[2] in sctp_sock_filter and sctp_sock_dump any more.

Fixes: 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the dump") Reported-by: Paolo Abeni Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S.
Miller --- include/net/sctp/sctp.h | 3 ++- net/sctp/sctp_diag.c | 32 +++++++++----------------------- net/sctp/socket.c | 40 +++++++++++++++++++++++++--------------- 3 files changed, 36 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 06b4f515e157..d7d8cba01469 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -127,7 +127,8 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), const union sctp_addr *laddr, const union sctp_addr *paddr, void *p); int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - struct net *net, int pos, void *p); + int (*cb_done)(struct sctp_transport *, void *), + struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index e99518e79b52..7008a992749b 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -279,9 +279,11 @@ out: return err; } -static int sctp_sock_dump(struct sock *sk, void *p) +static int sctp_sock_dump(struct sctp_transport *tsp, void *p) { + struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; @@ -289,9 +291,7 @@ static int sctp_sock_dump(struct sock *sk, void *p) int err = 0; lock_sock(sk); - if (!sctp_sk(sk)->ep) - goto release; - list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) { + list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; @@ -327,40 +327,30 @@ next: cb->args[4]++; } cb->args[1] = 0; - cb->args[2]++; cb->args[3] = 0; cb->args[4] = 0; release: release_sock(sk); - sock_put(sk); return err; } -static int sctp_get_sock(struct sctp_transport *tsp, void *p) +static int sctp_sock_filter(struct sctp_transport *tsp, void *p) { struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; - struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct sctp_association *assoc = list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ if (tsp->asoc != assoc) - goto out; + return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) - goto out; - - sock_hold(sk); - cb->args[5] = (long)sk; + return 0; return 1; - -out: - cb->args[2]++; - return 0; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) @@ -503,12 +493,8 @@ skip: if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; -next: - cb->args[5] = 0; - sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp); - - if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp)) - goto next; + sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, + net, (int *)&cb->args[2], &commp); done: cb->args[1] = cb->args[4]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1b00a1e09b93..d4730ada7f32 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4658,29 +4658,39 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - struct net *net, int pos, void *p) { + int 
(*cb_done)(struct sctp_transport *, void *), + struct net *net, int *pos, void *p) { struct rhashtable_iter hti; - void *obj; - int err; - - err = sctp_transport_walk_start(&hti); - if (err) - return err; + struct sctp_transport *tsp; + int ret; - obj = sctp_transport_get_idx(net, &hti, pos + 1); - for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) { - struct sctp_transport *transport = obj; +again: + ret = sctp_transport_walk_start(&hti); + if (ret) + return ret; - if (!sctp_transport_hold(transport)) + tsp = sctp_transport_get_idx(net, &hti, *pos + 1); + for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { + if (!sctp_transport_hold(tsp)) continue; - err = cb(transport, p); - sctp_transport_put(transport); - if (err) + ret = cb(tsp, p); + if (ret) break; + (*pos)++; + sctp_transport_put(tsp); } sctp_transport_walk_stop(&hti); - return err; + if (ret) { + if (cb_done && !cb_done(tsp, p)) { + (*pos)++; + sctp_transport_put(tsp); + goto again; + } + sctp_transport_put(tsp); + } + + return ret; } EXPORT_SYMBOL_GPL(sctp_for_each_transport); -- cgit v1.2.3 From a5ea8a6803fdec949f752d3aa36b1a88e453b929 Mon Sep 17 00:00:00 2001 From: Noralf Trønnes Date: Mon, 11 Sep 2017 18:37:45 +0200 Subject: drm/gem-fb-helper: Use debug message on gem lookup failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GEM lookup failure can easily be triggered by userspace so make it a debug message, not an error message. Also remove unnecessary inner parentheses and fix alphabetical struct declaration order. Cc: Laurent Pinchart Signed-off-by: Noralf Trønnes Reviewed-by: Daniel Vetter Reviewed-by: Laurent Pinchart Link: https://patchwork.freedesktop.org/patch/msgid/1505147865-18194-2-git-send-email-noralf@tronnes.org --- drivers/gpu/drm/drm_gem_framebuffer_helper.c | 4 ++-- include/drm/drm_gem_framebuffer_helper.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c index d54a083dc5dd..fc7e995541c9 100644 --- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c +++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c @@ -154,7 +154,7 @@ drm_gem_fb_create_with_funcs(struct drm_device *dev, struct drm_file *file, objs[i] = drm_gem_object_lookup(file, mode_cmd->handles[i]); if (!objs[i]) { - DRM_DEV_ERROR(dev->dev, "Failed to lookup GEM\n"); + DRM_DEBUG_KMS("Failed to lookup GEM object\n"); ret = -ENOENT; goto err_gem_object_put; } @@ -232,7 +232,7 @@ int drm_gem_fb_prepare_fb(struct drm_plane *plane, struct dma_buf *dma_buf; struct dma_fence *fence; - if ((plane->state->fb == state->fb) || !state->fb) + if (plane->state->fb == state->fb || !state->fb) return 0; dma_buf = drm_gem_fb_get_obj(state->fb, 0)->dma_buf; diff --git a/include/drm/drm_gem_framebuffer_helper.h b/include/drm/drm_gem_framebuffer_helper.h index db9cfa07235e..5ca7cdc3f527 100644 --- a/include/drm/drm_gem_framebuffer_helper.h +++ b/include/drm/drm_gem_framebuffer_helper.h @@ -2,8 +2,8 @@ #define __DRM_GEM_FB_HELPER_H__ struct drm_device; -struct drm_file; struct drm_fb_helper_surface_size; +struct drm_file; struct drm_framebuffer; struct drm_framebuffer_funcs; struct drm_gem_object; -- cgit v1.2.3 From bf29ed1567b67854dc13504f685c45a2ea9b2081 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 7 Sep 2017 08:30:44 -0700 Subject: syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check Use CHECK_DATA_CORRUPTION instead of BUG_ON to provide 
more flexibility on address limit failures. By default, send a SIGKILL signal to kill the current process, preventing exploitation of a bad address limit. Make the TIF_FSCHECK flag optional so ARM can use this function.

Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Pratyush Anand Cc: Dave Martin Cc: Will Drewry Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Russell King Cc: Andy Lutomirski Cc: David Howells Cc: Dave Hansen Cc: Al Viro Cc: linux-api@vger.kernel.org Cc: Yonghong Song Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-2-git-send-email-keescook@chromium.org --- include/linux/syscalls.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 95606a2d556f..a78186d826d7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -221,21 +221,25 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#ifdef TIF_FSCHECK /* * Called before coming back to user-mode. Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. */ static inline void addr_limit_user_check(void) { - +#ifdef TIF_FSCHECK if (!test_thread_flag(TIF_FSCHECK)) return; +#endif - BUG_ON(!segment_eq(get_fs(), USER_DS)); + if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS), + "Invalid address limit on user-mode return")) + force_sig(SIGKILL, current); + +#ifdef TIF_FSCHECK clear_thread_flag(TIF_FSCHECK); -} #endif +} asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); -- cgit v1.2.3

From c673fe7f0cd525629db9a87a77610fca137951e0 Mon Sep 17 00:00:00 2001 From: Dhinakaran Pandiyan Date: Wed, 13 Sep 2017 23:21:27 -0700 Subject: drm/dp: DPCD register defines for link status within ESI field

Link status is available in the ESI field on devices with DPCD r1.2 or higher. The DP spec also says "An MST upstream device shall use this field instead of the Link/Sink Device Status field registers, starting from DPCD Address 00200h."

v2: Prefixed DP_ (Jani) Rewrote comment to stay within 80 cols.
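[ Editor's illustration: one plausible use of the new defines. my_mst_link_ok() is invented; drm_dp_dpcd_read() and drm_dp_channel_eq_ok() are existing DRM helpers, and the sketch assumes (true for this era's helper) that drm_dp_channel_eq_ok() only inspects the lane and align status bytes, indices 0 to 2, which line up when the read is based at DP_LANE0_1_STATUS_ESI instead of DP_LANE0_1_STATUS. ]

#include <drm/drm_dp_helper.h>

static bool my_mst_link_ok(struct drm_dp_aux *aux, int lane_count)
{
	u8 status[DP_LINK_STATUS_SIZE] = {};

	/* 0x200c..0x200f mirror 0x202..0x205 on DPCD r1.2+ MST devices. */
	if (drm_dp_dpcd_read(aux, DP_LANE0_1_STATUS_ESI, status, 4) != 4)
		return false;

	return drm_dp_channel_eq_ok(status, lane_count);
}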
Cc: Jani Nikula Reviewed-by: Jani Nikula Signed-off-by: Dhinakaran Pandiyan Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170914062127.12047-1-dhinakaran.pandiyan@intel.com --- include/drm/drm_dp_helper.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index 2c412a15cfa1..11c39f15f1b3 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -738,6 +738,11 @@ #define DP_RECEIVER_ALPM_STATUS 0x200b /* eDP 1.4 */ # define DP_ALPM_LOCK_TIMEOUT_ERROR (1 << 0) +#define DP_LANE0_1_STATUS_ESI 0x200c /* status same as 0x202 */ +#define DP_LANE2_3_STATUS_ESI 0x200d /* status same as 0x203 */ +#define DP_LANE_ALIGN_STATUS_UPDATED_ESI 0x200e /* status same as 0x204 */ +#define DP_SINK_STATUS_ESI 0x200f /* status same as 0x205 */ + #define DP_DPRX_FEATURE_ENUMERATION_LIST 0x2210 /* DP 1.3 */ # define DP_GTC_CAP (1 << 0) /* DP 1.3 */ # define DP_SST_SPLIT_SDP_CAP (1 << 1) /* DP 1.4 */ -- cgit v1.2.3 From 74378c5c8cdaf0ce9f65e67cbd0613286f2c3bad Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 5 Sep 2017 20:16:27 +0200 Subject: driver core: Fix link to device power management documentation Correct location as of commit 2728b2d2e5be4b82 (PM / core / docs: Convert sleep states API document to reST). Fixes: 2728b2d2e5be4b82 (PM / core / docs: Convert sleep states API document to reST) Signed-off-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- include/linux/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index c6f27207dbe8..1d2607923a24 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -838,7 +838,7 @@ struct dev_links_info { * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. - * See Documentation/power/admin-guide/devices.rst for details. + * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. -- cgit v1.2.3 From 4c7124413aa759b8ea0b90cd39177e525396e662 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 18 Sep 2017 11:05:16 -0700 Subject: tcp: remove two unused functions remove tcp_may_send_now and tcp_snd_test that are no longer used Fixes: 840a3cbe8969 ("tcp: remove forward retransmit feature") Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/tcp.h | 1 - net/ipv4/tcp_output.c | 34 ---------------------------------- 2 files changed, 35 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..3bc910a9bfc6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -544,7 +544,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); -bool tcp_may_send_now(struct sock *sk); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1c839c99114c..517d737059d1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1806,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions -- cgit v1.2.3 From 0555ac4333d7da7cff83d5b06bd3679a3e1ef584 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Mon, 18 Sep 2017 16:35:32 -0600 Subject: xen, arm64: drop dummy lookup_address() This is unused, and conflicts with the definition that we'll add for XPFO. Signed-off-by: Tycho Andersen Reviewed-by: Julien Grall CC: Boris Ostrovsky CC: Juergen Gross CC: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- include/xen/arm/page.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/xen/arm/page.h b/include/xen/arm/page.h index 415dbc6e43fd..6adc2a955340 100644 --- a/include/xen/arm/page.h +++ b/include/xen/arm/page.h @@ -84,16 +84,6 @@ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) BUG(); } -/* TODO: this shouldn't be here but it is because the frontend drivers - * are using it (its rolled in headers) even though we won't hit the code path. - * So for right now just punt with this. 
- */ -static inline pte_t *lookup_address(unsigned long address, unsigned int *level) -{ - BUG(); - return NULL; -} - extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count); -- cgit v1.2.3 From ac6c35a4d8c77083525044a192cb1a8711381e94 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 18 Sep 2017 21:20:03 +0300 Subject: drm: add backwards compatibility support for drm_kms_helper.edid_firmware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add drm_kms_helper.edid_firmware module parameter with param ops hooks to set drm.edid_firmware instead, for backwards compatibility. Suggested-by: Ville Syrjälä Cc: Abdiel Janulgue Cc: Daniel Vetter Cc: Ville Syrjälä Reviewed-by: Ville Syrjälä Acked-by: Dave Airlie Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170918182003.22238-2-jani.nikula@intel.com --- drivers/gpu/drm/drm_edid_load.c | 16 ++++++++++++++++ drivers/gpu/drm/drm_kms_helper_common.c | 28 ++++++++++++++++++++++++++++ include/drm/drm_edid.h | 2 ++ 3 files changed, 46 insertions(+) (limited to 'include') diff --git a/drivers/gpu/drm/drm_edid_load.c b/drivers/gpu/drm/drm_edid_load.c index 1c0495acf341..a4915099aaa9 100644 --- a/drivers/gpu/drm/drm_edid_load.c +++ b/drivers/gpu/drm/drm_edid_load.c @@ -31,6 +31,22 @@ module_param_string(edid_firmware, edid_firmware, sizeof(edid_firmware), 0644); MODULE_PARM_DESC(edid_firmware, "Do not probe monitor, use specified EDID blob " "from built-in data or /lib/firmware instead. "); +/* Use only for backward compatibility with drm_kms_helper.edid_firmware */ +int __drm_set_edid_firmware_path(const char *path) +{ + scnprintf(edid_firmware, sizeof(edid_firmware), "%s", path); + + return 0; +} +EXPORT_SYMBOL(__drm_set_edid_firmware_path); + +/* Use only for backward compatibility with drm_kms_helper.edid_firmware */ +int __drm_get_edid_firmware_path(char *buf, size_t bufsize) +{ + return scnprintf(buf, bufsize, "%s", edid_firmware); +} +EXPORT_SYMBOL(__drm_get_edid_firmware_path); + #define GENERIC_EDIDS 6 static const char * const generic_edid_name[GENERIC_EDIDS] = { "edid/800x600.bin", diff --git a/drivers/gpu/drm/drm_kms_helper_common.c b/drivers/gpu/drm/drm_kms_helper_common.c index 6e35a56a6102..93e2b30fe1a5 100644 --- a/drivers/gpu/drm/drm_kms_helper_common.c +++ b/drivers/gpu/drm/drm_kms_helper_common.c @@ -26,6 +26,7 @@ */ #include +#include #include "drm_crtc_helper_internal.h" @@ -33,6 +34,33 @@ MODULE_AUTHOR("David Airlie, Jesse Barnes"); MODULE_DESCRIPTION("DRM KMS helper"); MODULE_LICENSE("GPL and additional rights"); +#if IS_ENABLED(CONFIG_DRM_LOAD_EDID_FIRMWARE) + +/* Backward compatibility for drm_kms_helper.edid_firmware */ +static int edid_firmware_set(const char *val, const struct kernel_param *kp) +{ + DRM_NOTE("drm_kms_firmware.edid_firmware is deprecated, please use drm.edid_firmware intead.\n"); + + return __drm_set_edid_firmware_path(val); +} + +static int edid_firmware_get(char *buffer, const struct kernel_param *kp) +{ + return __drm_get_edid_firmware_path(buffer, PAGE_SIZE); +} + +static const struct kernel_param_ops edid_firmware_ops = { + .set = edid_firmware_set, + .get = edid_firmware_get, +}; + +module_param_cb(edid_firmware, &edid_firmware_ops, NULL, 0644); +__MODULE_PARM_TYPE(edid_firmware, "charp"); +MODULE_PARM_DESC(edid_firmware, + "DEPRECATED. 
Use drm.edid_firmware module parameter instead."); + +#endif + static int __init drm_kms_helper_init(void) { int ret; diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h index 1e1908a6b1d6..6f35909b8add 100644 --- a/include/drm/drm_edid.h +++ b/include/drm/drm_edid.h @@ -341,6 +341,8 @@ int drm_av_sync_delay(struct drm_connector *connector, #ifdef CONFIG_DRM_LOAD_EDID_FIRMWARE struct edid *drm_load_edid_firmware(struct drm_connector *connector); +int __drm_set_edid_firmware_path(const char *path); +int __drm_get_edid_firmware_path(char *buf, size_t bufsize); #else static inline struct edid * drm_load_edid_firmware(struct drm_connector *connector) -- cgit v1.2.3 From aa767cfb75341a6b93742f4d7d1589ac2532c29e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Sep 2017 22:07:50 +0200 Subject: of: provide inline helper for of_find_device_by_node The ipmmu-vmsa driver fails in compile-testing on non-OF platforms: drivers/iommu/ipmmu-vmsa.o: In function `ipmmu_of_xlate': ipmmu-vmsa.c:(.text+0x740): undefined reference to `of_find_device_by_node' It would be reasonable to assume that this interface works but returns failure on non-OF builds, like it does on machines that have been booted in another way, so this adds another inline function helper. Fixes: 7b2d59611fef ("iommu/ipmmu-vmsa: Replace local utlb code with fwspec ids") Signed-off-by: Arnd Bergmann Signed-off-by: Rob Herring --- include/linux/of_platform.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index e0d1946270f3..fb908e598348 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -57,7 +57,14 @@ extern const struct of_device_id of_default_bus_match_table[]; extern struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, struct device *parent); +#ifdef CONFIG_OF extern struct platform_device *of_find_device_by_node(struct device_node *np); +#else +static inline struct platform_device *of_find_device_by_node(struct device_node *np) +{ + return NULL; +} +#endif /* Platform devices and busses creation */ extern struct platform_device *of_platform_device_create(struct device_node *np, -- cgit v1.2.3 From 9e987b70ada27554c5d176421de1d167218c49b5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 15 Sep 2017 17:35:27 -0700 Subject: ACPI / bus: Make ACPI_HANDLE() work for non-GPL code again Due to commit db3e50f3234b (device property: Get rid of struct fwnode_handle type field), ACPI_HANDLE() inadvertently became a GPL-only call. The call path that led to that was: ACPI_HANDLE() ACPI_COMPANION() to_acpi_device_node() is_acpi_device_node() acpi_device_fwnode_ops DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); ...and the new DECLARE_ACPI_FWNODE_OPS() includes EXPORT_SYMBOL_GPL, whereas previously it was a static struct. In order to avoid changing any of that, let's instead provide ever so slightly better encapsulation of those struct fwnode_operations instances. Those do not really need to be directly used in inline function calls in header files. Simply moving two small functions (is_acpi_device_node and is_acpi_data_node) out of acpi_bus.h, and into a .c file, does that. That leaves the internals of struct fwnode_operations as GPL-only (which I think was the intent all along), but un-breaks any driver code out there that relies on the ACPI subsystem's being (historically) an EXPORT_SYMBOL-usable system. 
By that, I mean, ACPI_HANDLE() and other basic ACPI calls were non-GPL-protected. Also, while I'm there, remove a tiny bit of redundancy that was missed in the earlier commit, by having is_acpi_node() use the other two routines, instead of checking fwnode directly.

Fixes: db3e50f3234b (device property: Get rid of struct fwnode_handle type field) Signed-off-by: John Hubbard Acked-by: Sakari Ailus Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 13 +++++++++++++ include/acpi/acpi_bus.h | 18 ++++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index c1c216163de3..1e3c2517a1ac 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1293,3 +1293,16 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; + +bool is_acpi_device_node(const struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && + fwnode->ops == &acpi_device_fwnode_ops; +} +EXPORT_SYMBOL(is_acpi_device_node); + +bool is_acpi_data_node(const struct fwnode_handle *fwnode) +{ + return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; +} +EXPORT_SYMBOL(is_acpi_data_node); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index dedf9d789166..fa1505292f6c 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -399,17 +399,12 @@ extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; +bool is_acpi_device_node(const struct fwnode_handle *fwnode); +bool is_acpi_data_node(const struct fwnode_handle *fwnode); + static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { - return !IS_ERR_OR_NULL(fwnode) && - (fwnode->ops == &acpi_device_fwnode_ops - || fwnode->ops == &acpi_data_fwnode_ops); -} - -static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) -{ - return !IS_ERR_OR_NULL(fwnode) && - fwnode->ops == &acpi_device_fwnode_ops; + return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ @@ -422,11 +417,6 @@ static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) NULL; \ }) -static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) -{ - return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; -} - #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ -- cgit v1.2.3

From ec9dd352d591f0c90402ec67a317c1ed4fb2e638 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Sep 2017 16:38:36 -0700 Subject: bpf: one perf event close won't free bpf program attached by another perf event

This patch fixes a bug exhibited by the following scenario: 1. fd1 = perf_event_open with attr.config = ID1 2. attach bpf program prog1 to fd1 3. fd2 = perf_event_open with attr.config = ID1 4. user program closes fd2 and prog1 is detached from the tracepoint. 5. the user program with fd1 no longer works properly, as the tracepoint produces no output any more. The issue happens at step 4. Multiple perf_event_open calls can succeed, but there is only one bpf prog pointer in the tp_event. In the current logic, any fd release for the same tp_event will free the tp_event->prog.
The fix is to free tp_event->prog only when the closing fd corresponds to the one which registered the program. Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/trace_events.h | 1 + kernel/events/core.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 7f11050746ae..2e0f22298fe9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -272,6 +272,7 @@ struct trace_event_call { int perf_refcount; struct hlist_head __percpu *perf_events; struct bpf_prog *prog; + struct perf_event *bpf_prog_owner; int (*perf_perm)(struct trace_event_call *, struct perf_event *); diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e691b75b2db..6bc21e202ae4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put(prog); } -- cgit v1.2.3 From 0551968add53777fddd18f4ffb4e3bbc1f646d79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Sep 2017 11:54:44 +0200 Subject: Revert "genirq: Restrict effective affinity to interrupts actually using it" This reverts commit 74def747bcd09692bdbf8c6a15350795b0f11ca8. The change to the helper function is only correct for the /proc/irq/ readout usage, but breaks the existing x86 usage of that function. Reported-by: Yanko Kaneti Signed-off-by: Thomas Gleixner Cc: Marc Zyngier --- include/linux/irq.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index b99a784635ff..d4728bf6a537 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -783,10 +783,7 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) static inline struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) { - if (!cpumask_empty(d->common->effective_affinity)) - return d->common->effective_affinity; - - return d->common->affinity; + return d->common->effective_affinity; } static inline void irq_data_update_effective_affinity(struct irq_data *d, const struct cpumask *m) -- cgit v1.2.3 From 19cab8872692960535aa6d12e3a295ac51d1a648 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Sep 2017 15:52:13 -0700 Subject: net: ethtool: Add back transceiver type Commit 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") deprecated the ethtool_cmd::transceiver field, which was fine in premise, except that the PHY library was actually using it to report the type of transceiver: internal or external. Use the first word of the reserved field to put this __u8 transceiver field back in. It is made read-only, and we don't expect the ETHTOOL_xLINKSETTINGS API to be doing anything with this anyway, so this is mostly for the legacy path where we do: ethtool_get_settings() -> dev->ethtool_ops->get_link_ksettings() -> convert_link_ksettings_to_legacy_settings() to have no information loss compared to the legacy get_settings API. Fixes: 3f1ac7a700d0 ("net: ethtool: add new ETHTOOL_xLINKSETTINGS API") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/uapi/linux/ethtool.h | 6 +++++- net/core/ethtool.c | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c041dae8e2c..5bd1b1de4ea0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1753,6 +1753,8 @@ enum ethtool_reset_flags { * %ethtool_link_mode_bit_indices for the link modes, and other * link features that the link partner advertised through * autonegotiation; 0 if unknown or not applicable. Read-only. + * @transceiver: Used to distinguish different possible PHY types, + * reported consistently by PHYLIB. Read-only. * * If autonegotiation is disabled, the speed and @duplex represent the * fixed link mode and are writable if the driver supports multiple @@ -1804,7 +1806,9 @@ struct ethtool_link_settings { __u8 eth_tp_mdix; __u8 eth_tp_mdix_ctrl; __s8 link_mode_masks_nwords; - __u32 reserved[8]; + __u8 transceiver; + __u8 reserved1[3]; + __u32 reserved[7]; __u32 link_mode_masks[0]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } -- cgit v1.2.3 From e8b95728f724797f958912fd9b765a695595d3a6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 1 Sep 2017 17:13:43 -0700 Subject: Input: uinput - avoid FF flush when destroying device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normally, when input device supporting force feedback effects is being destroyed, we try to "flush" currently playing effects, so that the physical device does not continue vibrating (or executing other effects). Unfortunately this does not work well for uinput as flushing of the effects deadlocks with the destroy action: - if device is being destroyed because the file descriptor is being closed, then there is noone to even service FF requests; - if device is being destroyed because userspace sent UI_DEV_DESTROY, while theoretically it could be possible to service FF requests, userspace is unlikely to do so (they'd need to make sure FF handling happens on a separate thread) even if kernel solves the issue with FF ioctls deadlocking with UI_DEV_DESTROY ioctl on udev->mutex. To avoid lockups like the one below, let's install a custom input device flush handler, and avoid trying to flush force feedback effects when we destroying the device, and instead rely on uinput to shut off the device properly. NMI watchdog: Watchdog detected hard LOCKUP on cpu 3 ... <> [] _raw_spin_lock_irqsave+0x37/0x40 [] complete+0x1d/0x50 [] uinput_request_done+0x3c/0x40 [uinput] [] uinput_request_submit.part.7+0x47/0xb0 [uinput] [] uinput_dev_erase_effect+0x5b/0x76 [uinput] [] erase_effect+0xad/0xf0 [] flush_effects+0x4d/0x90 [] input_flush_device+0x40/0x60 [] evdev_cleanup+0xac/0xc0 [] evdev_disconnect+0x2b/0x60 [] __input_unregister_device+0xac/0x150 [] input_unregister_device+0x47/0x70 [] uinput_destroy_device+0xb5/0xc0 [uinput] [] uinput_ioctl_handler.isra.9+0x65e/0x740 [uinput] [] ? do_futex+0x12b/0xad0 [] uinput_ioctl+0x18/0x20 [uinput] [] do_vfs_ioctl+0x298/0x480 [] ? 
security_file_ioctl+0x43/0x60 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x12/0x71 Reported-by: Rodrigo Rivas Costa Reported-by: Clément VUCHENER Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=193741 Signed-off-by: Dmitry Torokhov --- drivers/input/ff-core.c | 13 ++++++++++--- drivers/input/misc/uinput.c | 18 ++++++++++++++++++ include/linux/input.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/input/ff-core.c b/drivers/input/ff-core.c index 8f2042432c85..66a46c84e28f 100644 --- a/drivers/input/ff-core.c +++ b/drivers/input/ff-core.c @@ -237,9 +237,15 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) EXPORT_SYMBOL_GPL(input_ff_erase); /* - * flush_effects - erase all effects owned by a file handle + * input_ff_flush - erase all effects owned by a file handle + * @dev: input device to erase effect from + * @file: purported owner of the effects + * + * This function erases all force-feedback effects associated with + * the given owner from specified device. Note that @file may be %NULL, + * in which case all effects will be erased. */ -static int flush_effects(struct input_dev *dev, struct file *file) +int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; @@ -255,6 +261,7 @@ static int flush_effects(struct input_dev *dev, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events @@ -343,7 +350,7 @@ int input_ff_create(struct input_dev *dev, unsigned int max_effects) mutex_init(&ff->mutex); dev->ff = ff; - dev->flush = flush_effects; + dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 022be0e22eba..2cff40be8860 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -230,6 +230,18 @@ static int uinput_dev_erase_effect(struct input_dev *dev, int effect_id) return uinput_request_submit(udev, &request); } +static int uinput_dev_flush(struct input_dev *dev, struct file *file) +{ + /* + * If we are called with file == NULL that means we are tearing + * down the device, and therefore we can not handle FF erase + * requests: either we are handling UI_DEV_DESTROY (and holding + * the udev->mutex), or the file descriptor is closed and there is + * nobody on the other side anymore. + */ + return file ? input_ff_flush(dev, file) : 0; +} + static void uinput_destroy_device(struct uinput_device *udev) { const char *name, *phys; @@ -297,6 +309,12 @@ static int uinput_create_device(struct uinput_device *udev) dev->ff->playback = uinput_dev_playback; dev->ff->set_gain = uinput_dev_set_gain; dev->ff->set_autocenter = uinput_dev_set_autocenter; + /* + * The standard input_ff_flush() implementation does + * not quite work for uinput as we can't reasonably + * handle FF requests during device teardown. 
+ */ + dev->flush = uinput_dev_flush; } error = input_register_device(udev->dev); diff --git a/include/linux/input.h b/include/linux/input.h index a65e3b24fb18..fb5e23c7ed98 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -529,6 +529,7 @@ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file); int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file); +int input_ff_flush(struct input_dev *dev, struct file *file); int input_ff_create_memless(struct input_dev *dev, void *data, int (*play_effect)(struct input_dev *, void *, struct ff_effect *)); -- cgit v1.2.3 From 222d7dbd258dad4cd5241c43ef818141fad5a87a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Sep 2017 09:15:46 -0700 Subject: net: prevent dst uses after free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In linux-4.13, Wei worked hard to convert dst to a traditional refcounted model, removing GC. We now want to make sure a dst refcount can not transition from 0 back to 1. The problem here is that input path attached a not refcounted dst to an skb. Then later, because packet is forwarded and hits skb_dst_force() before exiting RCU section, we might try to take a refcount on one dst that is about to be freed, if another cpu saw 1 -> 0 transition in dst_release() and queued the dst for freeing after one RCU grace period. Lets unify skb_dst_force() and skb_dst_force_safe(), since we should always perform the complete check against dst refcount, and not assume it is not zero. Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005 [ 989.919496] skb_dst_force+0x32/0x34 [ 989.919498] __dev_queue_xmit+0x1ad/0x482 [ 989.919501] ? eth_header+0x28/0xc6 [ 989.919502] dev_queue_xmit+0xb/0xd [ 989.919504] neigh_connected_output+0x9b/0xb4 [ 989.919507] ip_finish_output2+0x234/0x294 [ 989.919509] ? ipt_do_table+0x369/0x388 [ 989.919510] ip_finish_output+0x12c/0x13f [ 989.919512] ip_output+0x53/0x87 [ 989.919513] ip_forward_finish+0x53/0x5a [ 989.919515] ip_forward+0x2cb/0x3e6 [ 989.919516] ? pskb_trim_rcsum.part.9+0x4b/0x4b [ 989.919518] ip_rcv_finish+0x2e2/0x321 [ 989.919519] ip_rcv+0x26f/0x2eb [ 989.919522] ? vlan_do_receive+0x4f/0x289 [ 989.919523] __netif_receive_skb_core+0x467/0x50b [ 989.919526] ? tcp_gro_receive+0x239/0x239 [ 989.919529] ? inet_gro_receive+0x226/0x238 [ 989.919530] __netif_receive_skb+0x4d/0x5f [ 989.919532] netif_receive_skb_internal+0x5c/0xaf [ 989.919533] napi_gro_receive+0x45/0x81 [ 989.919536] ixgbe_poll+0xc8a/0xf09 [ 989.919539] ? kmem_cache_free_bulk+0x1b6/0x1f7 [ 989.919540] net_rx_action+0xf4/0x266 [ 989.919543] __do_softirq+0xa8/0x19d [ 989.919545] irq_exit+0x5d/0x6b [ 989.919546] do_IRQ+0x9c/0xb5 [ 989.919548] common_interrupt+0x93/0x93 [ 989.919548] Similarly dst_clone() can use dst_hold() helper to have additional debugging, as a follow up to commit 44ebe79149ff ("net: add debug atomic_inc_not_zero() in dst_hold()") In net-next we will convert dst atomic_t to refcount_t for peace of mind. Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag") Signed-off-by: Eric Dumazet Cc: Wei Wang Reported-by: Paweł Staszewski Bisected-by: Paweł Staszewski Acked-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/net/dst.h | 22 ++++------------------ include/net/route.h | 2 +- include/net/sock.h | 2 +- 3 files changed, 6 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 93568bd0a352..06a6765da074 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -271,7 +271,7 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) - atomic_inc(&dst->__refcnt); + dst_hold(dst); return dst; } @@ -311,21 +311,6 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb __skb_dst_copy(nskb, oskb->_skb_refdst); } -/** - * skb_dst_force - makes sure skb dst is refcounted - * @skb: buffer - * - * If dst is not yet refcounted, let's do it - */ -static inline void skb_dst_force(struct sk_buff *skb) -{ - if (skb_dst_is_noref(skb)) { - WARN_ON(!rcu_read_lock_held()); - skb->_skb_refdst &= ~SKB_DST_NOREF; - dst_clone(skb_dst(skb)); - } -} - /** * dst_hold_safe - Take a reference on a dst if possible * @dst: pointer to dst entry @@ -339,16 +324,17 @@ static inline bool dst_hold_safe(struct dst_entry *dst) } /** - * skb_dst_force_safe - makes sure skb dst is refcounted + * skb_dst_force - makes sure skb dst is refcounted * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. */ -static inline void skb_dst_force_safe(struct sk_buff *skb) +static inline void skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); + WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; diff --git a/include/net/route.h b/include/net/route.h index 1b09a9368c68..57dfc6850d37 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -190,7 +190,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, rcu_read_lock(); err = ip_route_input_noref(skb, dst, src, tos, devin); if (!err) { - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!skb_dst(skb)) err = -EINVAL; } diff --git a/include/net/sock.h b/include/net/sock.h index 03a362568357..a6b9a8d1a6df 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -856,7 +856,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force_safe(skb); + skb_dst_force(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; -- cgit v1.2.3
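[ Editor's illustration, plain C11 rather than kernel code: the rule the last patch enforces is that a reference count must never go 0 -> 1. obj_hold_safe() mirrors what dst_hold_safe() does for dst->__refcnt: acquiring a reference is an increment-if-not-zero, so an object already committed to destruction cannot be resurrected by skb_dst_force(). All names here are invented. ]

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcnt;	/* stands in for dst->__refcnt */
};

/* Mirrors dst_hold_safe(): succeed only while the count is still nonzero. */
static bool obj_hold_safe(struct obj *o)
{
	int c = atomic_load(&o->refcnt);

	while (c != 0) {
		if (atomic_compare_exchange_weak(&o->refcnt, &c, c + 1))
			return true;	/* took a reference */
	}
	return false;	/* object is already on its way to being freed */
}

static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcnt, 1) == 1) {
		/* last reference dropped: free after a grace period, etc. */
	}
}

int main(void)
{
	struct obj o = { .refcnt = 1 };

	assert(obj_hold_safe(&o));	/* 1 -> 2 is fine */
	obj_put(&o);
	obj_put(&o);			/* count reaches zero */
	assert(!obj_hold_safe(&o));	/* 0 -> 1 is refused */
	return 0;
}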